GroupBy and Aggregate Functions
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session
spark = SparkSession.builder.appName("groupbyagg").getOrCreate()

# Read the CSV, inferring column types and treating the first row as the header
df = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/sales_info.csv", inferSchema=True, header=True)
df.printSchema()
root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)
df.show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+
df.groupBy("Company").count().show()
+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+
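groupBy() returns a GroupedData object, and count() is only one of its aggregations; avg() (aliased as mean()) is another, averaging every numeric column per group. A quick sketch:

# avg() averages each numeric column within every group; here that is just Sales
df.groupBy("Company").avg().show()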
df.groupBy("Company").max().show()
+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+
df.groupBy("Company").min().show()
+-------+----------+
|Company|min(Sales)|
+-------+----------+
|   APPL|     130.0|
|   GOOG|     120.0|
|     FB|     350.0|
|   MSFT|     124.0|
+-------+----------+
df.groupBy("Company").sum().show()
+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+
# agg() on the whole DataFrame: the maximum Sales across all rows
df.agg({'Sales': 'max'}).show()
+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+
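agg() can also be scoped to groups via groupBy(), and the functions in pyspark.sql.functions can be passed instead of a dict when you want an alias on each aggregate. A sketch of both styles:

from pyspark.sql import functions as F

# Dict form, now per group: maximum Sales for each Company
df.groupBy("Company").agg({'Sales': 'max'}).show()

# Function form, which allows an alias on each aggregate column
df.groupBy("Company").agg(
    F.max("Sales").alias("Max Sales"),
    F.sum("Sales").alias("Total Sales")
).show()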
Functions
from pyspark.sql.functions import countDistinct, avg, stddev
df.select(countDistinct("Sales")).show()
+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+
df.select(countDistinct("Sales").alias("Distinct Sales")).show()
+--------------+
|Distinct Sales|
+--------------+
|            11|
+--------------+
df.select(avg('Sales')).show()
+-----------------+
|       avg(Sales)|
+-----------------+
|360.5833333333333|
+-----------------+
df.select(stddev("Sales")).show()
+------------------+
|stddev_samp(Sales)|
+------------------+
|250.08742410799007|
+------------------+
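Because these are ordinary column expressions, several aggregates can be combined in a single select() and computed in one pass over the data. A sketch:

# All three statistics at once, each with a readable alias
df.select(
    countDistinct("Sales").alias("Distinct Sales"),
    avg("Sales").alias("Average Sales"),
    stddev("Sales").alias("Std Dev")
).show()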
from pyspark.sql.functions import format_number
# Alias the column so it can be referenced by the short name 'std'
sales_std = df.select(stddev("Sales").alias('std'))
sales_std.show()
+------------------+
|               std|
+------------------+
|250.08742410799007|
+------------------+
sales_std.select(format_number('std',2)).show()
+---------------------+
|format_number(std, 2)|
+---------------------+
|               250.09|
+---------------------+
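format_number returns an ordinary column, so it can be aliased to replace the generated format_number(std, 2) header with something cleaner. A sketch:

# Alias the formatted column so the header stays short
sales_std.select(format_number('std', 2).alias('std')).show()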
Order By
df.orderBy("Sales").show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+
df.orderBy(df["Sales"].desc()).show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+
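orderBy() also accepts the desc() column function, and sort keys can be mixed, for example Company ascending with Sales descending inside each company. A sketch:

from pyspark.sql.functions import desc

# Same descending sort, written with the desc() function
df.orderBy(desc("Sales")).show()

# Mixed keys: Company ascending, then Sales descending within each company
df.orderBy(df["Company"], df["Sales"].desc()).show()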