In this project, we aim to predict whether a batch of dog food was spoiled. The data description is as follows:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier
# Start (or reuse) the Spark session used throughout this analysis.
spark = SparkSession.builder.appName('dogfood').getOrCreate()

# Load the dog-food CSV: the first row supplies the column names and Spark
# infers each column's type from the data instead of reading all strings.
data = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/dog_food.csv")
)

# Confirm the inferred column types before building features.
data.printSchema()
root |-- A: integer (nullable = true) |-- B: integer (nullable = true) |-- C: double (nullable = true) |-- D: integer (nullable = true) |-- Spoiled: double (nullable = true)
# Peek at the first row to sanity-check that the CSV parsed as expected.
data.head()
Out[5]: Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)
# Print per-column summary statistics (count, mean, stddev, min, max).
data.describe().show()
+-------+------------------+------------------+------------------+------------------+-------------------+ |summary| A| B| C| D| Spoiled| +-------+------------------+------------------+------------------+------------------+-------------------+ | count| 490| 490| 490| 490| 490| | mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857| | stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465| | min| 1| 1| 5.0| 1| 0.0| | max| 10| 10| 14.0| 10| 1.0| +-------+------------------+------------------+------------------+------------------+-------------------+
# List the column names: A-D are used as features below, Spoiled as the label.
data.columns
Out[8]: ['A', 'B', 'C', 'D', 'Spoiled']
# Pack the four ingredient columns into the single vector column that
# Spark ML estimators expect as input.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol="features",
)
output = assembler.transform(data)

# The schema should now carry an extra `features` vector column.
output.printSchema()

# NOTE(review): despite the `rfc` name (and the RandomForestClassifier
# import), this is a single decision tree -- confirm that is intended.
rfc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Spoiled',
)
root |-- A: integer (nullable = true) |-- B: integer (nullable = true) |-- C: double (nullable = true) |-- D: integer (nullable = true) |-- Spoiled: double (nullable = true) |-- features: vector (nullable = true)
# Keep only what the classifier consumes: the assembled feature vector
# and the label column.
final_data = output.select(['features', 'Spoiled'])

# Show the first row to verify the feature vector looks right.
final_data.head()
Out[14]: Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)
# Fit the classifier held in `rfc` (a DecisionTreeClassifier) on the
# features/label frame.
rfc_model = rfc.fit(final_data)
# Inspect how much each feature contributed to the fitted tree. Vector
# indices follow the assembler's inputCols order: 0=A, 1=B, 2=C, 3=D.
rfc_model.featureImportances
Out[16]: SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})