from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Operations").getOrCreate()

# Load the Apple stock CSV, letting Spark infer column types from the data
df = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/appl_stock.csv",
    inferSchema=True,
    header=True,
)
df.show()
df.printSchema()

# Filtering with SQL-style string expressions
df.filter("Close < 500").show()
df.filter("Close < 500").select('Open').show()
df.filter("Close < 500").select(['Open', 'Close']).show()

# Filtering with column expressions; combine conditions with & (and), | (or),
# and ~ (not), wrapping each condition in parentheses
df.filter(df["Close"] < 200).show()
df.filter((df["Close"] < 200) & (df['Open'] > 200)).show()
df.filter((df["Close"] < 200) | (df['Open'] > 200)).show()
df.filter((df["Close"] < 200) & ~(df['Open'] < 200)).show()

df.filter(df["Low"] == 197.16).show()

# collect() returns the matching rows as a list of Row objects
result = df.filter(df["Low"] == 197.16).collect()
type(result[0])

# A Row can be converted to a dictionary or iterated over field by field
row = result[0]
row.asDict()
for item in result[0]:
    print(item)

from pyspark.sql.functions import (format_number, dayofmonth, hour, dayofyear,
                                   month, year, weekofyear, date_format)

# Extract individual date parts from the Date column
df.select(hour(df['Date'])).show()
df.select(dayofyear(df['Date'])).show()
df.select(month(df['Date'])).show()
df.select(year(df['Date'])).show()

# Add a Year column, then average every numeric column by year and keep
# only the averages of Year and Close
df.withColumn("Year", year(df['Date'])).show()
newdf = df.withColumn("Year", year(df['Date']))
newdf.groupBy("Year").mean()[['avg(Year)', 'avg(Close)']].show()

result = newdf.groupBy("Year").mean()[['avg(Year)', 'avg(Close)']]
result = result.withColumnRenamed("avg(Year)", "Year")
# show() returns None, so display the formatted result without reassigning it
result.select('Year', format_number('avg(Close)', 2).alias("Mean Close")).show()
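
# The import above also brings in dayofmonth, weekofyear, and date_format,
# which are never called in this section. A minimal sketch of how they apply
# to the same Date column follows; the 'yyyy-MM' pattern and the aliases are
# illustrative choices, not part of the original script.
df.select(
    dayofmonth(df['Date']).alias('DayOfMonth'),
    weekofyear(df['Date']).alias('WeekOfYear'),
    date_format(df['Date'], 'yyyy-MM').alias('YearMonth'),
).show()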