# import all necessary library
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt
# initialize Spark
spark = SparkSession.builder \
.master("local") \
.appName("Data description Spark") \
.getOrCreate()
# reading dataset to dataframe
schema = StructType([
StructField("Dress_ID", StringType(), True),
StructField("Style", StringType(), True),
StructField("Price", StringType(), True),
StructField("Rating", FloatType(), True),
StructField("Size", StringType(), True),
StructField("Season", StringType(), True),
StructField("NeckLine", StringType(), True),
StructField("SleeveLength", StringType(), True),
StructField("waiseline", StringType(), True),
StructField("Material", StringType(), True),
StructField("FabricType", StringType(), True),
StructField("Decoration", StringType(), True),
StructField("Pattern Type", StringType(), True),
StructField("Recommendation", IntegerType(), True)])
df = spark.read \
.schema(schema) \
.format("com.databricks.spark.csv") \
.option("header", "true") \
.load("Dresses_Attribute_Sales.csv")
df.show(5)
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+ | Dress_ID| Style| Price|Rating|Size|Season|NeckLine|SleeveLength|waiseline| Material|FabricType|Decoration|Pattern Type|Recommendation| +----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+ |1006032852| Sexy| Low| 4.6| M|Summer| o-neck| sleevless| empire| null| chiffon| ruffles| animal| 1| |1212192089| Casual| Low| 0.0| L|Summer| o-neck| Petal| natural| microfiber| null| ruffles| animal| 0| |1190380701|vintage| High| 0.0| L|Automn| o-neck| full| natural| polyster| null| null| print| 0| | 966005983| Brief|Average| 4.6| L|Spring| o-neck| full| natural| silk| chiffon|embroidary| print| 1| | 876339541| cute| Low| 4.5| M|Summer| o-neck| butterfly| natural|chiffonfabric| chiffon| bow| dot| 0| +----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+ only showing top 5 rows
# Nomial
df_nomial = df[["Style", "NeckLine", "Material", "Pattern Type"]]
df_nomial.show(5)
+-------+--------+-------------+------------+ | Style|NeckLine| Material|Pattern Type| +-------+--------+-------------+------------+ | Sexy| o-neck| null| animal| | Casual| o-neck| microfiber| animal| |vintage| o-neck| polyster| print| | Brief| o-neck| silk| print| | cute| o-neck|chiffonfabric| dot| +-------+--------+-------------+------------+ only showing top 5 rows
# Numbers
df_numbers = df[["Dress_ID"]]
df_numbers.show(5)
+----------+ | Dress_ID| +----------+ |1006032852| |1212192089| |1190380701| | 966005983| | 876339541| +----------+ only showing top 5 rows
# Ordinal
df_ordinal = df[["Size"]]
df_ordinal.show(5)
+----+ |Size| +----+ | M| | L| | L| | L| | M| +----+ only showing top 5 rows
# ratio
df_ratio = df[["Rating"]]
df_ratio.show(5)
+------+ |Rating| +------+ | 4.6| | 0.0| | 0.0| | 4.6| | 4.5| +------+ only showing top 5 rows
df_nomial.describe().show()
+-------+-----+--------+--------+------------+ |summary|Style|NeckLine|Material|Pattern Type| +-------+-----+--------+--------+------------+ | count| 500| 499| 499| 499| | mean| null| null| null| null| | stddev| null| null| null| null| | min|Brief| NULL| acrylic| animal| | max| work| v-neck| wool| striped| +-------+-----+--------+--------+------------+
df_numbers.describe().show()
+-------+--------------------+ |summary| Dress_ID| +-------+--------------------+ | count| 500| | mean| 9.0554168105E8| | stddev|1.7361896065394258E8| | min| 1000425584| | max| 999081623| +-------+--------------------+
df_ordinal.describe().show()
+-------+-----+ |summary| Size| +-------+-----+ | count| 500| | mean| null| | stddev| null| | min| L| | max|small| +-------+-----+
df_ratio.describe().show()
+-------+------------------+ |summary| Rating| +-------+------------------+ | count| 500| | mean|3.5285999937057495| | stddev| 2.00536405618619| | min| 0.0| | max| 5.0| +-------+------------------+
mean_rating = df_ratio.agg(F.mean(df_ratio.Rating)).first()[0]
print "Mean rating:", mean_rating
Mean rating: 3.52859999371
sqlContext.registerDataFrameAsTable(df_ratio, "df_ratio")
median_rating = sqlContext.sql("""
SELECT percentile(Rating, 0.5) AS median_rating
FROM df_ratio
""").first()["median_rating"]
print "Median rating:", median_rating
Median rating: 4.59999990463
counts = df_ratio.groupBy("Rating").count()
mode_rating = counts.join(
counts.agg(F.max('count').alias('count')),
on='count'
).limit(1).select("Rating").first()["Rating"]
print "Mode rating:", mode_rating
Mode rating: 0.0
# visualize price column
fig, ax = plt.subplots(figsize=(15, 5))
ax.axvline(mean_rating, color='red', linewidth=5)
ax.axvline(median_rating, color='green', linewidth=5)
ax.axvline(mode_rating, color='blue', linewidth=5)
# Add arrows annotating the means:
def add_arrow(label, val, align="left"):
ax.annotate(label + ': {:0.2f}'.format(val), xy=(val, 1), xytext=(15, 15),
xycoords=('data', 'axes fraction'), textcoords='offset points',
horizontalalignment=align, verticalalignment='center',
arrowprops=dict(arrowstyle='-|>', fc='black', shrinkA=0, shrinkB=0,
connectionstyle='angle,angleA=0,angleB=90,rad=10'),
)
add_arrow("Mean", mean_rating)
add_arrow("Median", median_rating)
add_arrow("Mode", mode_rating)
ax.legend(loc='upper left')
ax.margins(0.05)
bins, hist = df_ratio.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Rating Histogram")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()
/Users/hongong/virtualenv/trustingsocial/lib/python2.7/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots. warnings.warn("No labelled objects found. "
quartiles = \frac{percentile * n}{100}$$
# range of rating
min_rating = df_ratio.agg(F.min(df_ratio.Rating)).first()[0]
max_rating = df_ratio.agg(F.max(df_ratio.Rating)).first()[0]
range_rating = max_rating - min_rating
print "Min rating:", min_rating
print "Max rating:", max_rating
print "Rating range:", range_rating
Min rating: 0.0 Max rating: 5.0 Rating range: 5.0
# variance of rating
var_rating = df_ratio.agg(F.variance(df_ratio.Rating)).first()[0]
print "Rating variance:", var_rating
Rating variance: 4.02148499784
# standard deviation of rating
std_rating = df_ratio.agg(F.stddev(df_ratio.Rating)).first()[0]
print "Rating standard deviation:", std_rating
Rating standard deviation: 2.00536405619
# z-score of rating
df_z_score_rating = sqlContext.sql("SELECT (Rating - " + \
str(mean_rating) + " / " + str(std_rating) + \
") as Rating FROM df_ratio")
print "\nZ-score of rating:"
df_z_score_rating.show(5)
bins, hist = df_z_score_rating.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
# plotting
fig = plt.figure(figsize=(15, 5), dpi= 80, facecolor='w', edgecolor='k')
plt.bar(center, hist, align='center', width=width)
plt.title("Z score distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()
Z-score of rating: +-------------------+ | Rating| +-------------------+ | 2.840419152789433| |-1.7595807518431354| |-1.7595807518431354| | 2.840419152789433| | 2.7404192481568646| +-------------------+ only showing top 5 rows
n: distinct\ object\ to\ choose\ from$$ $$r: spaces\ to\ fill.$$
Biến độc lập và biến phụ thuộc (independent/dependent variable)
Bayes
Posterior = \frac{Likelihood * Prior}{Evidence}$$
# Conditional probability
# P(Style:sexy|Season:summer) = P(Style:sexy and Season:summer) / P(Season:summer)
num_items = df.select("Rating").count()
df_summer = df.select("Season").where(df["Season"] == "Summer")
p_summer = df_summer.count() * 100.0 / num_items
df_sexy_summer = df.select(df.Style, df.Season).where("Style = 'Sexy' and Season = 'Summer'")
p_sexy_summer = df_sexy_summer.count() * 100.0 / num_items
p_sexy_given_summer = p_sexy_summer * 100 / p_summer
print "P(Season:summer)", p_summer
print "P(Style:sexy and Season:summer)", p_sexy_summer
print "P(Style:sexy|Season:summer)", p_sexy_given_summer
P(Season:summer) 31.8 P(Style:sexy and Season:summer) 4.8 P(Style:sexy|Season:summer) 15.0943396226
# Bayes
# P(Style:sexy|Season:summer) = P(Season:summer|Style:sexy) * P(Style:sexy) / P(Season:summer)
num_items = df.select("Rating").count()
df_style = df.select(df.Style).where("Style = 'Sexy'")
p_sexy = df_style.count() * 100.0 / num_items
p_summer_given_sexy = p_sexy_summer * 100 / p_sexy
p_sexy_given_summer_bayes = p_summer_given_sexy * p_sexy / p_summer
print "P(Style:sexy)", p_sexy
print "P(Season:summer)", p_summer
print "P(Season:summer|Style:sexy)", p_summer_given_sexy
print "P(Style:sexy|Season:summer) Bayes:", p_sexy_given_summer_bayes
P(Style:sexy) 13.8 P(Season:summer) 31.8 P(Season:summer|Style:sexy) 34.7826086957 P(Style:sexy|Season:summer) Bayes: 15.0943396226
# Simulating Central limit theorem
num_sample = 30.0
num_loop = 1000
num_items = df.select("Rating").count()
samp_mean_ls = []
for i in range(0, num_loop):
df_rating_sample = df.select("Rating").sample(False, num_sample / num_items)
sum_of_sample = df_rating_sample.agg(F.sum(df_rating_sample.Rating)).first()[0]
x_bar = sum_of_sample * 100 / num_sample
samp_mean_ls.append(x_bar)
# ploting population
fig = plt.figure(figsize=(15, 5), dpi= 80, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
bins, hist = df_ratio.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Population distribution: Rating")
plt.xlabel("Rating")
plt.ylabel("Frequency")
# ploting sampling
plt.subplot(1, 2, 2)
hist, bins = np.histogram(samp_mean_ls)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Sampling distribution of mean: Rating")
plt.xlabel("Sample mean")
plt.show()