import pyspark
from pyspark.sql import SparkSession
# Spark is already set up in the notebook environment
spark
SparkSession - hive
# If we want to set one up from scratch
spark = SparkSession.builder.appName("Spark-learning").getOrCreate()
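# The SQL below assumes the raw churn CSV was already registered as a
# temp view. A minimal sketch of that step (the file path here is a
# hypothetical placeholder); reading with header=False is what yields
# the default _c0, _c1, ... column names used in the query:
raw_df = spark.read.csv("telecom_customer_churn.csv", header=False)
raw_df.createOrReplaceTempView("telecom_customer_churn_csv")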
df = spark.sql("select _c0, _c1, _c2, _c71, _c73, _c77, _c89, _c90 from telecom_customer_churn_csv")
df.show(2)
+-------+-----+-----+------------------+----+----+----+----+
|    _c0|  _c1|  _c2|              _c71|_c73|_c77|_c89|_c90|
+-------+-----+-----+------------------+----+----+----+----+
|56.3375|  227|59.99|      MIDWEST AREA|   N|WCMB|null|   1|
| 73.085|254.5|39.99|NEW YORK CITY AREA|   N|WCMB|null|   0|
+-------+-----+-----+------------------+----+----+----+----+
only showing top 2 rows
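# The same projection in DataFrame-API form (a sketch, assuming the
# temp view registered above):
df_api = spark.table("telecom_customer_churn_csv").select(
    "_c0", "_c1", "_c2", "_c71", "_c73", "_c77", "_c89", "_c90")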
df.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c71: string (nullable = true)
 |-- _c73: string (nullable = true)
 |-- _c77: string (nullable = true)
 |-- _c89: string (nullable = true)
 |-- _c90: string (nullable = true)
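# Every column came back as string because the CSV was loaded without
# schema inference. A minimal sketch of adding types; the cast targets
# below are illustrative, based only on the sample rows shown above:
from pyspark.sql.functions import col

df_typed = (df
    .withColumn("_c0", col("_c0").cast("double"))  # numeric-looking in the sample
    .withColumn("_c90", col("_c90").cast("int")))  # 0/1 in the sample above
df_typed.printSchema()
# (Alternatively, re-read the file with spark.read.csv(..., inferSchema=True).)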