This project helps a marketing agency predict customer churn (customers who stop buying its service) using a machine learning classification model. The fitted model can then be applied to incoming data on future customers to predict which of them will churn, so that the company can assign those customers an account manager. The data description is as follows:
Names: Name of the latest contact at the client company
Age: Customer age
Total_Purchase: Total ads purchased
Account_Manager: Binary; 0 = no manager, 1 = account manager assigned
Years: Total years as a customer
Num_Sites: Number of websites that use the service
Onboard_date: Date that the latest contact was onboarded
Location: Client HQ address
Company: Name of the client company
Churn: Label; 1 = customer churned, 0 = customer retained
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
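This notebook runs on Databricks, where a spark session is created automatically. Outside Databricks you would build one yourself before the read below; a minimal sketch, using the SparkSession import already in place:

# Only needed outside Databricks, where no spark session exists yet.
spark = SparkSession.builder.appName("customer_churn").getOrCreate()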
data = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/customer_churn.csv",inferSchema=True, header=True)
data.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
data.describe().show()
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969|                null|                null| 0.3728852122772358|
|    min|   Aaron King|             22.0|            100.0|                 0|              1.0|               3.0|00103 Jeffrey Cre...|     Abbott-Thompson|                  0|
|    max|Zachary Walsh|             65.0|         18026.01|                 1|             9.15|              14.0|Unit 9800 Box 287...|Zuniga, Clark and...|                  1|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
data.columns
Out[5]: ['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn']
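VectorAssembler raises an error on null feature values by default, so it is worth confirming the numeric columns are complete before assembling (the describe() output above already shows a count of 900 for every column). A minimal check:

from pyspark.sql.functions import col, count, when
# Count nulls per column; every value should be 0 for this dataset.
data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns]).show()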
# Assemble the numeric columns into a single feature vector; the string and
# timestamp columns (Names, Onboard_date, Location, Company) are left out.
assembler = VectorAssembler(inputCols=['Age',
                                       'Total_Purchase',
                                       'Account_Manager',
                                       'Years',
                                       'Num_Sites'], outputCol='features')
output = assembler.transform(data)
final_data = output.select('features','churn')
train_churn,test_churn = final_data.randomSplit([0.7,0.3])
lr_churn = LogisticRegression(labelCol='churn')
fitted_churn_model = lr_churn.fit(train_churn)
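As a quick sanity check (not part of the original flow), the fitted model's learned weights can be inspected directly:

# One coefficient per assembled feature, in inputCols order.
print(fitted_churn_model.coefficients)
print(fitted_churn_model.intercept)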
training_sum = fitted_churn_model.summary
training_sum.predictions.describe().show()
+-------+------------------+------------------+
|summary|             churn|        prediction|
+-------+------------------+------------------+
|  count|               621|               621|
|   mean|0.1642512077294686|0.1143317230273752|
| stddev|0.3708020444222667| 0.318470254004318|
|    min|               0.0|               0.0|
|    max|               1.0|               1.0|
+-------+------------------+------------------+
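The training summary also exposes aggregate metrics directly; two that are useful here (assuming Spark 2.3+, where the binary logistic regression training summary gained these fields):

print(training_sum.accuracy)       # fraction of correctly classified training rows
print(training_sum.areaUnderROC)   # ROC AUC on the training data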
pred_and_labels = fitted_churn_model.evaluate(test_churn)
pred_and_labels.predictions.show()
+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.42610676267903...|[0.98818040771998...|       0.0|
|[25.0,9672.03,0.0...|    0|[4.38649810533113...|[0.98770872408802...|       0.0|
|[27.0,8628.8,1.0,...|    0|[4.99547660560702...|[0.99327701009202...|       0.0|
|[28.0,11128.95,1....|    0|[3.98417417702396...|[0.98173212040432...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.44899083414891...|[0.92048762114072...|       0.0|
|[30.0,8403.78,1.0...|    0|[5.58181352981596...|[0.99624839746366...|       0.0|
|[30.0,10183.98,1....|    0|[2.76467297450614...|[0.94073669499285...|       0.0|
|[30.0,12788.37,0....|    0|[2.71301102813230...|[0.93779004336492...|       0.0|
|[30.0,13473.35,0....|    0|[3.02144798568538...|[0.95353372376920...|       0.0|
|[31.0,8829.83,1.0...|    0|[4.18829744900902...|[0.98505465797648...|       0.0|
|[31.0,10058.87,1....|    0|[3.97785919321561...|[0.98161852124201...|       0.0|
|[31.0,11743.24,0....|    0|[6.36096097807910...|[0.99827527419186...|       0.0|
|[31.0,12264.68,1....|    0|[3.33131327531714...|[0.96548755656129...|       0.0|
|[32.0,6367.22,1.0...|    0|[3.02667356283644...|[0.95376470631089...|       0.0|
|[32.0,8011.38,0.0...|    0|[1.89658424424186...|[0.86950444082524...|       0.0|
|[32.0,8575.71,0.0...|    0|[3.69300591894818...|[0.97570775434429...|       0.0|
|[32.0,8617.98,1.0...|    1|[0.86853858529779...|[0.70444151703483...|       0.0|
|[32.0,11540.86,0....|    0|[6.70293126734090...|[0.99877419538724...|       0.0|
|[32.0,11715.72,0....|    0|[3.40964625291369...|[0.96800464819343...|       0.0|
|[32.0,12403.6,0.0...|    0|[5.42166677112224...|[0.99559967468850...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 20 rows
# Note: passing the thresholded 0/1 'prediction' column as rawPredictionCol
# computes AUC over hard labels; 'rawPrediction' is the conventional choice
# (see the sketch after the result below).
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='churn')
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc
Out[24]: 0.7804383116883116
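The AUC above was computed from the thresholded prediction column. To get the conventional ROC AUC over the model's raw scores, point the evaluator at the rawPrediction column instead; a minimal sketch (the resulting value will generally be higher than the one above):

roc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='churn')
roc_auc = roc_eval.evaluate(pred_and_labels.predictions)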
Predict on new data. The logistic regression is refit on the full labeled dataset before scoring the unlabeled new-customer file.
final_lr_model = lr_churn.fit(final_data)
new_customers = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/new_customers.csv",inferSchema=True, header=True)
new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
test_new_customers = assembler.transform(new_customers)
test_new_customers.printSchema()
root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)
final_results = final_lr_model.transform(test_new_customers)
final_results.select('Company','prediction').show()
+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+
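Since the business goal is to assign an account manager to likely churners, a natural follow-up is to pull the flagged contacts; a minimal sketch:

# Contacts the model predicts will churn: candidates for an account manager.
final_results.filter(final_results['prediction'] == 1.0) \
             .select('Names', 'Company') \
             .show()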