from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise create one
# registered under the application name "lr_example".
spark = (
    SparkSession.builder
    .appName("lr_example")
    .getOrCreate()
)
from pyspark.ml.regression import LinearRegression
# Load the e-commerce customers CSV: the first row supplies the column names
# and Spark infers each column's type from the values.
data = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)
# Print the inferred schema to check which columns were read as numeric.
data.printSchema()
root |-- Email: string (nullable = true) |-- Address: string (nullable = true) |-- Avatar: string (nullable = true) |-- Avg Session Length: double (nullable = true) |-- Time on App: double (nullable = true) |-- Time on Website: double (nullable = true) |-- Length of Membership: double (nullable = true) |-- Yearly Amount Spent: double (nullable = true)
# Preview the first rows of the loaded DataFrame as a text table.
data.show()
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+ | Email| Address| Avatar|Avg Session Length| Time on App| Time on Website|Length of Membership|Yearly Amount Spent| +--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+ |mstephenson@ferna...|835 Frank TunnelW...| Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616| 4.0826206329529615| 587.9510539684005| | hduke@hotmail.com|4547 Archer Commo...| DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744| 2.66403418213262| 392.2049334443264| | pallen@yahoo.com|24645 Valerie Uni...| Bisque|33.000914755642675|11.330278057777512|37.110597442120856| 4.104543202376424| 487.54750486747207| |riverarebecca@gma...|1414 David Throug...| SaddleBrown| 34.30555662975554|13.717513665142507| 36.72128267790313| 3.120178782748092| 581.8523440352177| |mstephens@davidso...|14023 Rodriguez P...|MediumAquaMarine| 33.33067252364639|12.795188551078114| 37.53665330059473| 4.446308318351434| 599.4060920457634| |alvareznancy@luca...|645 Martha Park A...| FloralWhite|33.871037879341976|12.026925339755056| 34.47687762925054| 5.493507201364199| 637.102447915074| |katherine20@yahoo...|68388 Reyes Light...| DarkSlateBlue| 32.02159550138701|11.366348309710526| 36.68377615286961| 4.685017246570912| 521.5721747578274| | awatkins@yahoo.com|Unit 6538 Box 898...| Aqua|32.739142938380326| 12.35195897300293| 37.37335885854755| 4.4342734348999375| 549.9041461052942| |vchurch@walter-ma...|860 Lee KeyWest D...| Salmon| 33.98777289568564|13.386235275676436|37.534497341555735| 3.2734335777477144| 570.2004089636196| | bonnie69@lin.biz|PSC 2734, Box 525...| Brown|31.936548618448917|11.814128294972196| 37.14516822352819| 3.202806071553459| 427.1993848953282| |andrew06@peterson...|26104 Alexander G...| 
Tomato|33.992572774953736|13.338975447662113| 37.22580613162114| 2.482607770510596| 492.6060127179966| |ryanwerner@freema...|Unit 2413 Box 034...| Tomato| 33.87936082480498|11.584782999535266| 37.08792607098381| 3.71320920294043| 522.3374046069357| | knelson@gmail.com|6705 Miller Orcha...| RoyalBlue|29.532428967057943|10.961298400154098| 37.42021557502538| 4.046423164299585| 408.6403510726275| |wrightpeter@yahoo...|05302 Dunlap Ferr...| Bisque| 33.19033404372265|12.959226091609382|36.144666700041924| 3.918541839158999| 573.4158673313865| |taylormason@gmail...|7773 Powell Sprin...| DarkBlue|32.387975853153876|13.148725692056516| 36.61995708279922| 2.494543646659249| 470.4527333009554| | jstark@anderson.com|49558 Ramirez Roa...| Peru|30.737720372628182|12.636606052000127|36.213763093698624| 3.3578468423262944| 461.7807421962299| | wjennings@gmail.com|6362 Wilson Mount...| PowderBlue| 32.12538689728784|11.733861690857394| 34.8940927514398| 3.1361327164897803| 457.84769594494855| |rebecca45@hale-ba...|8982 Burton RowWi...| OliveDrab|32.338899323067196|12.013194694014402| 38.38513659413844| 2.420806160901484| 407.70454754954415| |alejandro75@hotma...|64475 Andre Club ...| Cyan|32.187812045932155| 14.7153875441565| 38.24411459434352| 1.516575580831944| 452.3156754800354| |samuel46@love-wes...|544 Alexander Hei...| LightSeaGreen| 32.61785606282345|13.989592555825254|37.190503800397956| 4.064548550437977| 605.061038804892| +--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+ only showing top 20 rows
# Fetch the first record as a Row object to inspect its untruncated values.
data.head()
Out[7]: Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)
# Print each field of the first row on its own line.
# The loop body must be indented; at column 0 it raises an IndentationError.
for item in data.head():
    print(item)
mstephenson@fernandez.com 835 Frank TunnelWrightmouth, MI 82180-9605 Violet 34.49726772511229 12.65565114916675 39.57766801952616 4.0826206329529615 587.9510539684005
Set up DataFrame for Machine Learning
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# List the column names; the numeric ones are candidates for model features.
data.columns
Out[10]: ['Email', 'Address', 'Avatar', 'Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent']
# Pack the four numeric predictor columns into one vector column named
# "features".
assembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="features",
)
# The transformed DataFrame keeps the original columns and adds "features".
output = assembler.transform(data)
# Look at the assembled vectors on their own.
output.select("features").show()
+--------------------+ | features| +--------------------+ |[34.4972677251122...| |[31.9262720263601...| |[33.0009147556426...| |[34.3055566297555...| |[33.3306725236463...| |[33.8710378793419...| |[32.0215955013870...| |[32.7391429383803...| |[33.9877728956856...| |[31.9365486184489...| |[33.9925727749537...| |[33.8793608248049...| |[29.5324289670579...| |[33.1903340437226...| |[32.3879758531538...| |[30.7377203726281...| |[32.1253868972878...| |[32.3388993230671...| |[32.1878120459321...| |[32.6178560628234...| +--------------------+ only showing top 20 rows
# Show the transformed DataFrame: original columns plus the "features" vector.
output.show()
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+ | Email| Address| Avatar|Avg Session Length| Time on App| Time on Website|Length of Membership|Yearly Amount Spent| features| +--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+ |mstephenson@ferna...|835 Frank TunnelW...| Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616| 4.0826206329529615| 587.9510539684005|[34.4972677251122...| | hduke@hotmail.com|4547 Archer Commo...| DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744| 2.66403418213262| 392.2049334443264|[31.9262720263601...| | pallen@yahoo.com|24645 Valerie Uni...| Bisque|33.000914755642675|11.330278057777512|37.110597442120856| 4.104543202376424| 487.54750486747207|[33.0009147556426...| |riverarebecca@gma...|1414 David Throug...| SaddleBrown| 34.30555662975554|13.717513665142507| 36.72128267790313| 3.120178782748092| 581.8523440352177|[34.3055566297555...| |mstephens@davidso...|14023 Rodriguez P...|MediumAquaMarine| 33.33067252364639|12.795188551078114| 37.53665330059473| 4.446308318351434| 599.4060920457634|[33.3306725236463...| |alvareznancy@luca...|645 Martha Park A...| FloralWhite|33.871037879341976|12.026925339755056| 34.47687762925054| 5.493507201364199| 637.102447915074|[33.8710378793419...| |katherine20@yahoo...|68388 Reyes Light...| DarkSlateBlue| 32.02159550138701|11.366348309710526| 36.68377615286961| 4.685017246570912| 521.5721747578274|[32.0215955013870...| | awatkins@yahoo.com|Unit 6538 Box 898...| Aqua|32.739142938380326| 12.35195897300293| 37.37335885854755| 4.4342734348999375| 549.9041461052942|[32.7391429383803...| |vchurch@walter-ma...|860 Lee KeyWest D...| Salmon| 33.98777289568564|13.386235275676436|37.534497341555735| 3.2734335777477144| 
570.2004089636196|[33.9877728956856...| | bonnie69@lin.biz|PSC 2734, Box 525...| Brown|31.936548618448917|11.814128294972196| 37.14516822352819| 3.202806071553459| 427.1993848953282|[31.9365486184489...| |andrew06@peterson...|26104 Alexander G...| Tomato|33.992572774953736|13.338975447662113| 37.22580613162114| 2.482607770510596| 492.6060127179966|[33.9925727749537...| |ryanwerner@freema...|Unit 2413 Box 034...| Tomato| 33.87936082480498|11.584782999535266| 37.08792607098381| 3.71320920294043| 522.3374046069357|[33.8793608248049...| | knelson@gmail.com|6705 Miller Orcha...| RoyalBlue|29.532428967057943|10.961298400154098| 37.42021557502538| 4.046423164299585| 408.6403510726275|[29.5324289670579...| |wrightpeter@yahoo...|05302 Dunlap Ferr...| Bisque| 33.19033404372265|12.959226091609382|36.144666700041924| 3.918541839158999| 573.4158673313865|[33.1903340437226...| |taylormason@gmail...|7773 Powell Sprin...| DarkBlue|32.387975853153876|13.148725692056516| 36.61995708279922| 2.494543646659249| 470.4527333009554|[32.3879758531538...| | jstark@anderson.com|49558 Ramirez Roa...| Peru|30.737720372628182|12.636606052000127|36.213763093698624| 3.3578468423262944| 461.7807421962299|[30.7377203726281...| | wjennings@gmail.com|6362 Wilson Mount...| PowderBlue| 32.12538689728784|11.733861690857394| 34.8940927514398| 3.1361327164897803| 457.84769594494855|[32.1253868972878...| |rebecca45@hale-ba...|8982 Burton RowWi...| OliveDrab|32.338899323067196|12.013194694014402| 38.38513659413844| 2.420806160901484| 407.70454754954415|[32.3388993230671...| |alejandro75@hotma...|64475 Andre Club ...| Cyan|32.187812045932155| 14.7153875441565| 38.24411459434352| 1.516575580831944| 452.3156754800354|[32.1878120459321...| |samuel46@love-wes...|544 Alexander Hei...| LightSeaGreen| 32.61785606282345|13.989592555825254|37.190503800397956| 4.064548550437977| 605.061038804892|[32.6178560628234...| 
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+ only showing top 20 rows
# Keep only what the model needs: the feature vector and the label column.
final_data = output.select("features", "Yearly Amount Spent")
# Split the rows roughly 70/30 into training and test sets. The fixed seed
# makes the split (and therefore the fitted coefficients and error metrics)
# repeatable across runs; without it every run samples different rows.
# NOTE(review): repeatability also assumes the input is read with the same
# partitioning each time - confirm if the cluster layout changes.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
# Summary statistics of the training split.
train_data.describe().show()
+-------+-------------------+ |summary|Yearly Amount Spent| +-------+-------------------+ | count| 337| | mean| 497.29804728982026| | stddev| 75.40555176703002| | min| 256.67058229005585| | max| 744.2218671047146| +-------+-------------------+
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show()
+-------+-------------------+ |summary|Yearly Amount Spent| +-------+-------------------+ | count| 163| | mean| 503.4820686664174| | stddev| 86.93796653209513| | min| 266.086340948469| | max| 765.5184619388373| +-------+-------------------+
# Linear regression whose target is the "Yearly Amount Spent" column.
lr = LinearRegression(labelCol="Yearly Amount Spent")
# Fit on the training split only, so the test split stays unseen.
lrModel = lr.fit(train_data)
# Report the learned weights (one per assembled feature) and the intercept.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)
Coefficients: [25.752248212757785,38.02153501848721,0.30617838938243924,61.4168452616767] Intercept: -1038.9830898047296
# Evaluate the fitted model on the held-out test split.
test_results = lrModel.evaluate(test_data)
# Show the per-row residuals on the test split
# (presumably label minus prediction - confirm against the PySpark docs).
test_results.residuals.show()
/databricks/spark/python/pyspark/sql/context.py:134: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead. warnings.warn( +-------------------+ | residuals| +-------------------+ |-5.4490893671846266| | 0.2719787663319835| | 11.4190363646768| |-17.042847592942394| | 7.088376721801524| | 4.5686319988404875| |-12.643497447590505| | -20.83164192799984| | 11.618492212560113| | 19.32851684749454| | 4.949463884778538| | 3.674630199081662| | -8.287042728285428| |-0.9510774258145602| | 18.653189983253924| |-1.0136920408320975| | -4.22305643607524| |-2.4446599228739956| | 0.7896275922671521| | 8.81866157085966| +-------------------+ only showing top 20 rows
# Keep only the feature vectors, dropping the label column.
unlabeled_data = test_data.select('features')
# Run the fitted model on the unlabeled rows.
# NOTE(review): `predictions` is not displayed or used in the visible code -
# consider showing it or removing it.
predictions = lrModel.transform(unlabeled_data)
# Report the error metrics computed on the test split.
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
RMSE: 10.27251391143641 MSE: 105.5245420606546