학습목표 : Titanic의 탑승자 정보를 통해 생존자를 예측하는 모델 만들기
# findspark.init() must run before `import pyspark`: it is what makes the
# local Spark installation importable (see the findspark README).
import findspark
findspark.init()
import pyspark
# getOrCreate() reuses the SparkContext that is already running in this
# process instead of raising "Cannot run multiple SparkContexts at once"
# when this cell is executed a second time in the same kernel.
sc = pyspark.SparkContext.getOrCreate()
from pyspark.sql.session import SparkSession
# NOTE: the wildcard import below shadows Python builtins such as `sum`;
# later statements (e.g. sum("Survived")) rely on the Spark versions.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# SQL/DataFrame entry point bound to the context above.
spark = SparkSession(sc)
# Load the Titanic training CSV (first row is the header). Columns read this
# way arrive as strings, so the numeric ones used later are cast explicitly.
csv_reader = spark.read.option("header", "true")
titanic = csv_reader.csv("/Users/ryanshin/Downloads/train.csv")

# Cast in place: withColumn on an existing name keeps the column's position.
for numeric_column in ("Survived", "Pclass", "SibSp", "Parch"):
    titanic = titanic.withColumn(numeric_column, col(numeric_column).cast("double"))

# Spark ML estimators look for the target in a column named "label" by default.
titanic = titanic.withColumn("label", col("Survived"))

# Replace missing embarkation ports with "S".
titanic = titanic.na.fill("S", "Embarked")

titanic.printSchema()
root |-- PassengerId: string (nullable = true) |-- Survived: double (nullable = true) |-- Pclass: double (nullable = true) |-- Name: string (nullable = true) |-- Sex: string (nullable = true) |-- Age: string (nullable = true) |-- SibSp: double (nullable = true) |-- Parch: double (nullable = true) |-- Ticket: string (nullable = true) |-- Fare: string (nullable = true) |-- Cabin: string (nullable = true) |-- Embarked: string (nullable = false) |-- label: double (nullable = true)
# Preview the first rows to sanity-check the casts and the added label column.
titanic.show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+ |PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|label| +-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+ | 1| 0.0| 3.0|Braund, Mr. Owen ...| male| 22| 1.0| 0.0| A/5 21171| 7.25| null| S| 0.0| | 2| 1.0| 1.0|Cumings, Mrs. Joh...|female| 38| 1.0| 0.0| PC 17599|71.2833| C85| C| 1.0| | 3| 1.0| 3.0|Heikkinen, Miss. ...|female| 26| 0.0| 0.0|STON/O2. 3101282| 7.925| null| S| 1.0| | 4| 1.0| 1.0|Futrelle, Mrs. Ja...|female| 35| 1.0| 0.0| 113803| 53.1| C123| S| 1.0| | 5| 0.0| 3.0|Allen, Mr. Willia...| male| 35| 0.0| 0.0| 373450| 8.05| null| S| 0.0| | 6| 0.0| 3.0| Moran, Mr. James| male|null| 0.0| 0.0| 330877| 8.4583| null| Q| 0.0| | 7| 0.0| 1.0|McCarthy, Mr. Tim...| male| 54| 0.0| 0.0| 17463|51.8625| E46| S| 0.0| | 8| 0.0| 3.0|Palsson, Master. ...| male| 2| 3.0| 1.0| 349909| 21.075| null| S| 0.0| | 9| 1.0| 3.0|Johnson, Mrs. Osc...|female| 27| 0.0| 2.0| 347742|11.1333| null| S| 1.0| | 10| 1.0| 2.0|Nasser, Mrs. Nich...|female| 14| 1.0| 0.0| 237736|30.0708| null| C| 1.0| | 11| 1.0| 3.0|Sandstrom, Miss. ...|female| 4| 1.0| 1.0| PP 9549| 16.7| G6| S| 1.0| | 12| 1.0| 1.0|Bonnell, Miss. El...|female| 58| 0.0| 0.0| 113783| 26.55| C103| S| 1.0| | 13| 0.0| 3.0|Saundercock, Mr. ...| male| 20| 0.0| 0.0| A/5. 2151| 8.05| null| S| 0.0| | 14| 0.0| 3.0|Andersson, Mr. An...| male| 39| 1.0| 5.0| 347082| 31.275| null| S| 0.0| | 15| 0.0| 3.0|Vestrom, Miss. Hu...|female| 14| 0.0| 0.0| 350406| 7.8542| null| S| 0.0| | 16| 1.0| 2.0|Hewlett, Mrs. (Ma...|female| 55| 0.0| 0.0| 248706| 16| null| S| 1.0| | 17| 0.0| 3.0|Rice, Master. Eugene| male| 2| 4.0| 1.0| 382652| 29.125| null| Q| 0.0| | 18| 1.0| 2.0|Williams, Mr. 
Cha...| male|null| 0.0| 0.0| 244373| 13| null| S| 1.0| | 19| 0.0| 3.0|Vander Planke, Mr...|female| 31| 1.0| 0.0| 345763| 18| null| S| 0.0| | 20| 1.0| 3.0|Masselmani, Mrs. ...|female|null| 0.0| 0.0| 2649| 7.225| null| C| 1.0| +-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+ only showing top 20 rows
# Overall picture: number of passengers, number of survivors, and their ratio.
# `count` and `sum` are the Spark column aggregates brought in by the wildcard
# import from pyspark.sql.functions, not the Python builtins.
titanic.select(count("PassengerId"), sum("Survived"), sum("Survived")/count("PassengerId")).show()
+------------------+-------------+------------------------------------+ |count(PassengerId)|sum(Survived)|(sum(Survived) / count(PassengerId))| +------------------+-------------+------------------------------------+ | 891| 342.0| 0.3838383838383838| +------------------+-------------+------------------------------------+
# Class balance of the target: row count for each Survived value.
titanic.groupBy("Survived").count().show()
+--------+-----+ |Survived|count| +--------+-----+ | 0.0| 549| | 1.0| 342| +--------+-----+
# Survival counts per Pclass value, sorted for easy side-by-side reading.
titanic.groupBy("Pclass", "Survived").count().orderBy("Pclass", "Survived").show()
+------+--------+-----+ |Pclass|Survived|count| +------+--------+-----+ | 1.0| 0.0| 80| | 1.0| 1.0| 136| | 2.0| 0.0| 97| | 2.0| 1.0| 87| | 3.0| 0.0| 372| | 3.0| 1.0| 119| +------+--------+-----+
# Survival counts per Sex value; this split motivates the gender-based baseline.
titanic.groupBy("Sex", "Survived").count().orderBy("Sex", "Survived").show()
+------+--------+-----+ | Sex|Survived|count| +------+--------+-----+ |female| 0.0| 81| |female| 1.0| 233| | male| 0.0| 468| | male| 1.0| 109| +------+--------+-----+
# Survival counts per SibSp value.
titanic.groupBy("SibSp", "Survived").count().orderBy("SibSp", "Survived").show()
+-----+--------+-----+ |SibSp|Survived|count| +-----+--------+-----+ | 0.0| 0.0| 398| | 0.0| 1.0| 210| | 1.0| 0.0| 97| | 1.0| 1.0| 112| | 2.0| 0.0| 15| | 2.0| 1.0| 13| | 3.0| 0.0| 12| | 3.0| 1.0| 4| | 4.0| 0.0| 15| | 4.0| 1.0| 3| | 5.0| 0.0| 5| | 8.0| 0.0| 7| +-----+--------+-----+
# Survival counts per Parch value.
titanic.groupBy("Parch", "Survived").count().orderBy("Parch", "Survived").show()
+-----+--------+-----+ |Parch|Survived|count| +-----+--------+-----+ | 0.0| 0.0| 445| | 0.0| 1.0| 233| | 1.0| 0.0| 53| | 1.0| 1.0| 65| | 2.0| 0.0| 40| | 2.0| 1.0| 40| | 3.0| 0.0| 2| | 3.0| 1.0| 3| | 4.0| 0.0| 4| | 5.0| 0.0| 4| | 5.0| 1.0| 1| | 6.0| 0.0| 1| +-----+--------+-----+
# Baseline 1: predict that nobody survived.
def predict1_func():
    """Return the constant prediction 0.0 (did not survive).

    Takes no arguments, so every row gets the same answer.

    Returns:
        float: always 0.0.
    """
    return 0.0
# Wrap the constant baseline as a Spark UDF that yields a double column.
predict1 = udf(predict1_func, DoubleType())
# Baseline 2: predict that every woman survived and every man died.
def predict2_func(gender):
    """Predict survival from the passenger's gender alone.

    Args:
        gender: value of the Sex column for one row.

    Returns:
        float: 1.0 when gender is exactly "female", otherwise 0.0
        (this includes "male", None, and any other value).
    """
    return 1.0 if gender == "female" else 0.0
# Wrap the gender baseline as a Spark UDF that yields a double column.
predict2 = udf(predict2_func, DoubleType())

# Apply both baseline UDFs. Each result frame has exactly two columns,
# "prediction" and "label", which is the shape the evaluator consumes.
label_column = col("Survived").cast("double").alias("label")
prediction1result = titanic.select(predict1().alias("prediction"), label_column)
prediction2result = titanic.select(predict2("Sex").alias("prediction"), label_column)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score both baselines. The "prediction" column (0.0 / 1.0) is fed to the
# evaluator as the raw score, and "label" is the ground truth.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")

# Area under the ROC curve for each baseline.
evaluator.setMetricName("areaUnderROC")
roc_baseline1 = evaluator.evaluate(prediction1result)
print("prediction1result areaUnderROC=%f" % roc_baseline1)
roc_baseline2 = evaluator.evaluate(prediction2result)
print("prediction2result areaUnderROC=%f" % roc_baseline2)

# Area under the precision-recall curve for each baseline.
evaluator.setMetricName("areaUnderPR")
pr_baseline1 = evaluator.evaluate(prediction1result)
print("prediction1result areaUnderPR=%f" % pr_baseline1)
pr_baseline2 = evaluator.evaluate(prediction2result)
print("prediction2result areaUnderPR=%f" % pr_baseline2)
prediction1result areaUnderROC=0.500000 prediction2result areaUnderROC=0.766873 prediction1result areaUnderPR=0.383838 prediction2result areaUnderPR=0.684957
from pyspark.ml.classification import *
# Logistic regression capped at 10 iterations, with regularisation strength
# 0.3 and an elastic-net mixing parameter of 0.8.
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
from pyspark.ml.feature import *
# Pack the two numeric predictors into the single vector column "features".
assembler = VectorAssembler(inputCols=["Pclass", "SibSp"], outputCol="features")
data2 = assembler.transform(titanic)
# Fit on the whole frame; the target is the "label" column added at load time.
lrModel = lr.fit(data2)
# Inspect the assembled feature vectors next to the original columns.
data2.show()
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+---------+ |PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|label| features| +-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+---------+ | 1| 0.0| 3.0|Braund, Mr. Owen ...| male| 22| 1.0| 0.0| A/5 21171| 7.25| null| S| 0.0|[3.0,1.0]| | 2| 1.0| 1.0|Cumings, Mrs. Joh...|female| 38| 1.0| 0.0| PC 17599|71.2833| C85| C| 1.0|[1.0,1.0]| | 3| 1.0| 3.0|Heikkinen, Miss. ...|female| 26| 0.0| 0.0|STON/O2. 3101282| 7.925| null| S| 1.0|[3.0,0.0]| | 4| 1.0| 1.0|Futrelle, Mrs. Ja...|female| 35| 1.0| 0.0| 113803| 53.1| C123| S| 1.0|[1.0,1.0]| | 5| 0.0| 3.0|Allen, Mr. Willia...| male| 35| 0.0| 0.0| 373450| 8.05| null| S| 0.0|[3.0,0.0]| | 6| 0.0| 3.0| Moran, Mr. James| male|null| 0.0| 0.0| 330877| 8.4583| null| Q| 0.0|[3.0,0.0]| | 7| 0.0| 1.0|McCarthy, Mr. Tim...| male| 54| 0.0| 0.0| 17463|51.8625| E46| S| 0.0|[1.0,0.0]| | 8| 0.0| 3.0|Palsson, Master. ...| male| 2| 3.0| 1.0| 349909| 21.075| null| S| 0.0|[3.0,3.0]| | 9| 1.0| 3.0|Johnson, Mrs. Osc...|female| 27| 0.0| 2.0| 347742|11.1333| null| S| 1.0|[3.0,0.0]| | 10| 1.0| 2.0|Nasser, Mrs. Nich...|female| 14| 1.0| 0.0| 237736|30.0708| null| C| 1.0|[2.0,1.0]| | 11| 1.0| 3.0|Sandstrom, Miss. ...|female| 4| 1.0| 1.0| PP 9549| 16.7| G6| S| 1.0|[3.0,1.0]| | 12| 1.0| 1.0|Bonnell, Miss. El...|female| 58| 0.0| 0.0| 113783| 26.55| C103| S| 1.0|[1.0,0.0]| | 13| 0.0| 3.0|Saundercock, Mr. ...| male| 20| 0.0| 0.0| A/5. 2151| 8.05| null| S| 0.0|[3.0,0.0]| | 14| 0.0| 3.0|Andersson, Mr. An...| male| 39| 1.0| 5.0| 347082| 31.275| null| S| 0.0|[3.0,1.0]| | 15| 0.0| 3.0|Vestrom, Miss. Hu...|female| 14| 0.0| 0.0| 350406| 7.8542| null| S| 0.0|[3.0,0.0]| | 16| 1.0| 2.0|Hewlett, Mrs. (Ma...|female| 55| 0.0| 0.0| 248706| 16| null| S| 1.0|[2.0,0.0]| | 17| 0.0| 3.0|Rice, Master. 
Eugene| male| 2| 4.0| 1.0| 382652| 29.125| null| Q| 0.0|[3.0,4.0]| | 18| 1.0| 2.0|Williams, Mr. Cha...| male|null| 0.0| 0.0| 244373| 13| null| S| 1.0|[2.0,0.0]| | 19| 0.0| 3.0|Vander Planke, Mr...|female| 31| 1.0| 0.0| 345763| 18| null| S| 0.0|[3.0,1.0]| | 20| 1.0| 3.0|Masselmani, Mrs. ...|female|null| 0.0| 0.0| 2649| 7.225| null| C| 1.0|[3.0,0.0]| +-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----+---------+ only showing top 20 rows
from IPython.display import IFrame

# Embed the notebook hosted on the Zepl viewer inline, full width.
zepl_notebook_url = 'https://www.zepl.com/viewer/notebooks/bm90ZTovL1NEUkx1cmtlci8wMDM2MGM2ZWQzZWM0NjQyYjdlMTk0YzhlZmVmMDNjOC9ub3RlLmpzb24'
IFrame(zepl_notebook_url, width='100%', height=600)