import pyspark
from pyspark.sql import SparkSession
# Spark is already set up in the notebook environment
spark
SparkSession - hive
# If we want to set one up from scratch
spark = SparkSession.builder.appName("Spark-learning").getOrCreate()
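# The SQL below assumes the raw churn CSV was already registered as a
# temp view. A minimal sketch of that step (the file path here is a
# hypothetical placeholder); reading with header=False is what yields
# the default _c0, _c1, ... column names used in the query:
raw_df = spark.read.csv("telecom_customer_churn.csv", header=False)
raw_df.createOrReplaceTempView("telecom_customer_churn_csv")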
df = spark.sql("select _c0, _c1, _c2, _c71, _c73, _c77, _c89, _c90 from telecom_customer_churn_csv")
df.show(2)
+-------+-----+-----+------------------+----+----+----+----+
|    _c0|  _c1|  _c2|              _c71|_c73|_c77|_c89|_c90|
+-------+-----+-----+------------------+----+----+----+----+
|56.3375|  227|59.99|      MIDWEST AREA|   N|WCMB|null|   1|
| 73.085|254.5|39.99|NEW YORK CITY AREA|   N|WCMB|null|   0|
+-------+-----+-----+------------------+----+----+----+----+
only showing top 2 rows
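# The same projection in DataFrame-API form (a sketch, assuming the
# temp view registered above):
df_api = spark.table("telecom_customer_churn_csv").select(
    "_c0", "_c1", "_c2", "_c71", "_c73", "_c77", "_c89", "_c90")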
df.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c71: string (nullable = true)
 |-- _c73: string (nullable = true)
 |-- _c77: string (nullable = true)
 |-- _c89: string (nullable = true)
 |-- _c90: string (nullable = true)
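# Every column came back as string because the CSV was loaded without
# schema inference. A minimal sketch of adding types; the cast targets
# below are illustrative, based only on the sample rows shown above:
from pyspark.sql.functions import col

df_typed = (df
    .withColumn("_c0", col("_c0").cast("double"))  # numeric-looking in the sample
    .withColumn("_c90", col("_c90").cast("int")))  # 0/1 in the sample above
df_typed.printSchema()
# (Alternatively, re-read the file with spark.read.csv(..., inferSchema=True).)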