This project helps a marketing agency predict customer churn (customers who stop buying its service) using a machine learning classification model. The fitted model can then be applied to incoming data on future customers to predict which of them will churn, so that the company can assign those customers an account manager. The data description is as follows:
Names: Name of the latest contact at the client company
Age: Customer age
Total_Purchase: Total ads purchased
Account_Manager: Binary; 0 = no manager, 1 = account manager assigned
Years: Total years as a customer
Num_Sites: Number of websites that use the service
Onboard_date: Date that the latest contact was onboarded
Location: Client HQ address
Company: Name of the client company
Churn: Label; 1 = customer churned, 0 = customer retained
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
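This notebook runs on Databricks, where a spark session is created automatically. Outside Databricks you would build one yourself before the read below; a minimal sketch, using the SparkSession import already in place:

# Only needed outside Databricks, where no spark session exists yet.
spark = SparkSession.builder.appName("customer_churn").getOrCreate()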
data = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/customer_churn.csv",inferSchema=True, header=True)
data.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
data.describe().show()
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969|                null|                null| 0.3728852122772358|
|    min|   Aaron King|             22.0|            100.0|                 0|              1.0|               3.0|00103 Jeffrey Cre...|     Abbott-Thompson|                  0|
|    max|Zachary Walsh|             65.0|         18026.01|                 1|             9.15|              14.0|Unit 9800 Box 287...|Zuniga, Clark and...|                  1|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
data.columns
Out[5]: ['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn']
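VectorAssembler raises an error on null feature values by default, so it is worth confirming the numeric columns are complete before assembling (the describe() output above already shows a count of 900 for every column). A minimal check:

from pyspark.sql.functions import col, count, when
# Count nulls per column; every value should be 0 for this dataset.
data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns]).show()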
# Assemble the numeric columns into a single feature vector; the string and
# timestamp columns (Names, Onboard_date, Location, Company) are left out.
assembler = VectorAssembler(inputCols=['Age',
                                       'Total_Purchase',
                                       'Account_Manager',
                                       'Years',
                                       'Num_Sites'], outputCol='features')
output = assembler.transform(data)
final_data = output.select('features','churn')
train_churn,test_churn = final_data.randomSplit([0.7,0.3])
lr_churn = LogisticRegression(labelCol='churn')
fitted_churn_model = lr_churn.fit(train_churn)
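As a quick sanity check (not part of the original flow), the fitted model's learned weights can be inspected directly:

# One coefficient per assembled feature, in inputCols order.
print(fitted_churn_model.coefficients)
print(fitted_churn_model.intercept)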
training_sum = fitted_churn_model.summary
training_sum.predictions.describe().show()
+-------+------------------+------------------+
|summary|             churn|        prediction|
+-------+------------------+------------------+
|  count|               621|               621|
|   mean|0.1642512077294686|0.1143317230273752|
| stddev|0.3708020444222667| 0.318470254004318|
|    min|               0.0|               0.0|
|    max|               1.0|               1.0|
+-------+------------------+------------------+
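The training summary also exposes aggregate metrics directly; two that are useful here (assuming Spark 2.3+, where the binary logistic regression training summary gained these fields):

print(training_sum.accuracy)       # fraction of correctly classified training rows
print(training_sum.areaUnderROC)   # ROC AUC on the training data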
pred_and_labels = fitted_churn_model.evaluate(test_churn)
pred_and_labels.predictions.show()
+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.42610676267903...|[0.98818040771998...|       0.0|
|[25.0,9672.03,0.0...|    0|[4.38649810533113...|[0.98770872408802...|       0.0|
|[27.0,8628.8,1.0,...|    0|[4.99547660560702...|[0.99327701009202...|       0.0|
|[28.0,11128.95,1....|    0|[3.98417417702396...|[0.98173212040432...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.44899083414891...|[0.92048762114072...|       0.0|
|[30.0,8403.78,1.0...|    0|[5.58181352981596...|[0.99624839746366...|       0.0|
|[30.0,10183.98,1....|    0|[2.76467297450614...|[0.94073669499285...|       0.0|
|[30.0,12788.37,0....|    0|[2.71301102813230...|[0.93779004336492...|       0.0|
|[30.0,13473.35,0....|    0|[3.02144798568538...|[0.95353372376920...|       0.0|
|[31.0,8829.83,1.0...|    0|[4.18829744900902...|[0.98505465797648...|       0.0|
|[31.0,10058.87,1....|    0|[3.97785919321561...|[0.98161852124201...|       0.0|
|[31.0,11743.24,0....|    0|[6.36096097807910...|[0.99827527419186...|       0.0|
|[31.0,12264.68,1....|    0|[3.33131327531714...|[0.96548755656129...|       0.0|
|[32.0,6367.22,1.0...|    0|[3.02667356283644...|[0.95376470631089...|       0.0|
|[32.0,8011.38,0.0...|    0|[1.89658424424186...|[0.86950444082524...|       0.0|
|[32.0,8575.71,0.0...|    0|[3.69300591894818...|[0.97570775434429...|       0.0|
|[32.0,8617.98,1.0...|    1|[0.86853858529779...|[0.70444151703483...|       0.0|
|[32.0,11540.86,0....|    0|[6.70293126734090...|[0.99877419538724...|       0.0|
|[32.0,11715.72,0....|    0|[3.40964625291369...|[0.96800464819343...|       0.0|
|[32.0,12403.6,0.0...|    0|[5.42166677112224...|[0.99559967468850...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 20 rows
# Note: passing the thresholded 0/1 'prediction' column as rawPredictionCol
# computes AUC over hard labels; 'rawPrediction' is the conventional choice
# (see the sketch after the result below).
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='churn')
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc
Out[24]: 0.7804383116883116
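The AUC above was computed from the thresholded prediction column. To get the conventional ROC AUC over the model's raw scores, point the evaluator at the rawPrediction column instead; a minimal sketch (the resulting value will generally be higher than the one above):

roc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='churn')
roc_auc = roc_eval.evaluate(pred_and_labels.predictions)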
Predict on new data. The logistic regression is refit on the full labeled dataset before scoring the unlabeled new-customer file.
final_lr_model = lr_churn.fit(final_data)
new_customers = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/new_customers.csv",inferSchema=True, header=True)
new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
test_new_customers = assembler.transform(new_customers)
test_new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)
final_results = final_lr_model.transform(test_new_customers)
final_results.select('Company','prediction').show()
+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+
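Since the business goal is to assign an account manager to likely churners, a natural follow-up is to pull the flagged contacts; a minimal sketch:

# Contacts the model predicts will churn: candidates for an account manager.
final_results.filter(final_results['prediction'] == 1.0) \
             .select('Names', 'Company') \
             .show()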