#!/usr/bin/env python
# coding: utf-8

# # Mô tả dữ liệu
# - Dataset: [Dresses_Attribute_Sales](https://archive.ics.uci.edu/ml/datasets/Dresses_Attribute_Sales)
# - Associated Tasks: Classification, Clustering

# In[1]:


# import all necessary library
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt


# In[2]:


# Initialize Spark: a local-mode session named "Data description Spark".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data description Spark")
    .getOrCreate()
)


# In[3]:


# Read the dataset into a DataFrame using an explicit schema.
# Every column is nullable; all are strings except Rating (float) and
# Recommendation (integer).
_column_types = [
    ("Dress_ID", StringType),
    ("Style", StringType),
    ("Price", StringType),
    ("Rating", FloatType),
    ("Size", StringType),
    ("Season", StringType),
    ("NeckLine", StringType),
    ("SleeveLength", StringType),
    ("waiseline", StringType),
    ("Material", StringType),
    ("FabricType", StringType),
    ("Decoration", StringType),
    ("Pattern Type", StringType),
    ("Recommendation", IntegerType),
]
schema = StructType([
    StructField(col_name, col_type(), True)
    for col_name, col_type in _column_types
])

# The first line of the CSV file is a header row, not data.
reader = spark.read.schema(schema).format("com.databricks.spark.csv")
df = reader.option("header", "true").load("Dresses_Attribute_Sales.csv")

df.show(5)


# # Các loại dữ liệu
# - Nominal
# - Numbers
# - Ordinal
# - Ratio

# In[4]:


# Nominal columns
df_nomial = df.select("Style", "NeckLine", "Material", "Pattern Type")
df_nomial.show(5)


# In[5]:


# Numbers columns
df_numbers = df.select("Dress_ID")
df_numbers.show(5)


# In[6]:


# Ordinal columns
df_ordinal = df.select("Size")
df_ordinal.show(5)


# In[7]:


# Ratio columns
df_ratio = df.select("Rating")
df_ratio.show(5)


# # Trung tâm dữ liệu
# - Trung bình (mean)
# $$\mu = \bar{x} = \frac{1}{n} \sum_{i=1}^n x_i = \frac{1}{n} (x_1 + ... + x_n)$$
# - Trung vị (median)
# - Mode

# In[8]:


# Print describe() summary statistics for each column group, in the order
# nominal, numbers, ordinal, ratio.
for _subset in (df_nomial, df_numbers, df_ordinal, df_ratio):
    _subset.describe().show()


# In[12]:


# Arithmetic mean of the Rating column (null ratings are ignored by F.mean).
mean_rating = df_ratio.agg(F.mean("Rating")).first()[0]
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Mean rating: {}".format(mean_rating))


# In[13]:


# Median of Rating via Spark SQL's `percentile` aggregate.
# The frame is registered under the name "df_ratio" so SQL can reference it.
# The `spark` session created at the top of the file is used because this
# script never defines a `sqlContext` global.
df_ratio.createOrReplaceTempView("df_ratio")

median_rating = spark.sql("""
    SELECT percentile(Rating, 0.5) AS median_rating
    FROM df_ratio
""").first()["median_rating"]

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Median rating: {}".format(median_rating))


# In[14]:


# Mode of Rating: the most frequent non-null value.
# Null ratings are excluded so the "missing" group can never be reported as
# the mode (a None mode would also break the plot that draws it).
counts = (
    df_ratio
    .where(F.col("Rating").isNotNull())
    .groupBy("Rating")
    .count()
)

# Sort by frequency rather than self-joining on the maximum count. The
# secondary sort on Rating makes the pick deterministic when several values
# share the top frequency (the smallest such rating wins).
top_row = counts.orderBy(F.desc("count"), F.asc("Rating")).first()
mode_rating = top_row["Rating"] if top_row is not None else None

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Mode rating: {}".format(mode_rating))


# In[15]:


# Visualize the Rating column: histogram plus mean / median / mode markers.
fig, ax = plt.subplots(figsize=(15, 5))
# Each line carries a label so that ax.legend() below has entries to show;
# without labels the legend is empty and matplotlib emits a warning.
ax.axvline(mean_rating, color='red', linewidth=5, label='Mean')
ax.axvline(median_rating, color='green', linewidth=5, label='Median')
ax.axvline(mode_rating, color='blue', linewidth=5, label='Mode')


def add_arrow(label, val, align="left"):
    """Annotate the top of the axes at x = ``val`` with ``label`` and the value.

    Draws on the module-level ``ax``.

    Args:
        label: Text shown before the value, e.g. "Mean".
        val: x position in data coordinates; also printed with two decimals.
        align: Horizontal alignment of the annotation text.
    """
    # xy mixes coordinate systems: x is in data units, y is a fraction of the
    # axes height (1 = top edge). The text sits 15 points right and above.
    ax.annotate(label + ': {:0.2f}'.format(val), xy=(val, 1), xytext=(15, 15),
            xycoords=('data', 'axes fraction'), textcoords='offset points',
            horizontalalignment=align, verticalalignment='center',
            arrowprops=dict(arrowstyle='-|>', fc='black', shrinkA=0, shrinkB=0,
                            connectionstyle='angle,angleA=0,angleB=90,rad=10'),
            )


add_arrow("Mean", mean_rating)
add_arrow("Median", median_rating)
add_arrow("Mode", mode_rating)
ax.legend(loc='upper left')
ax.margins(0.05)

# RDD.histogram(70) returns (bucket boundaries, counts) for 70 even buckets.
bins, hist = df_ratio.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
# Bars cover 70% of a bucket so neighbouring bars stay visually separated.
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Rating Histogram")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()


# # Biến đổi của dữ liệu 
# - Khoảng đoạn (range)
# $$range = maxValue - minValue$$
# 
# - Phương sai (variance)
# $$\sigma^2 = \frac{\sum_{i=1}^n (x_i - \mu)^2}{n}$$
# 
# - Độ lệch chuẩn (standard deviation)
# $$\sigma = \sqrt{\frac{\sum_{i=1}^n (x_i - \mu)^2}{n}}$$
# 
# - Z-score: biến đổi từ sample mean để thực hiện Z-test
#     * Shift trung bình mẫu về 0 bằng $X - \mu$
#     * Nén độ lệch chuẩn của mẫu ban đầu lại bằng cách chia cho $\sigma$
# $$Z = \frac{X - \mu}{\sigma}$$
# 
# - Phân vị (percentile)
# $$percentile \ of \ x = \frac{No. value \ below \ x}{n} * 100\\$$
# $$quartiles = \frac{percentile * n}{100}$$

# In[16]:


# Range of Rating = max - min.
# Both extremes come from one aggregation, so Spark runs a single job over
# the data instead of two.
min_rating, max_rating = df_ratio.agg(F.min("Rating"), F.max("Rating")).first()
range_rating = max_rating - min_rating
# Single pre-formatted arguments print the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Min rating: {}".format(min_rating))
print("Max rating: {}".format(max_rating))
print("Rating range: {}".format(range_rating))


# In[17]:


# Variance of Rating.
# NOTE: F.variance is Spark's *sample* variance (alias of var_samp, n - 1
# denominator); use F.var_pop for the population form with denominator n.
var_rating = df_ratio.agg(F.variance("Rating")).first()[0]
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Rating variance: {}".format(var_rating))


# In[18]:


# Standard deviation of Rating.
# NOTE: F.stddev is Spark's *sample* standard deviation (alias of
# stddev_samp, n - 1 denominator); use F.stddev_pop for the population form.
std_rating = df_ratio.agg(F.stddev("Rating")).first()[0]
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Rating standard deviation: {}".format(std_rating))


# In[19]:


# Z-score of Rating: (Rating - mean) / std.
# The subtraction must be parenthesised: "/" binds tighter than "-", so
# "Rating - mean / std" would compute Rating - (mean / std) instead.
# A column expression is used rather than string-built SQL, so this cell
# needs neither a registered temp table nor a `sqlContext` global.
z_score = (F.col("Rating") - mean_rating) / std_rating
df_z_score_rating = df_ratio.select(z_score.alias("Rating"))

print("\nZ-score of rating:")
df_z_score_rating.show(5)

# RDD.histogram(70) returns (bucket boundaries, counts) for 70 even buckets.
bins, hist = df_z_score_rating.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
# Bars cover 70% of a bucket so neighbouring bars stay visually separated.
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2

# plotting
fig = plt.figure(figsize=(15, 5), dpi= 80, facecolor='w', edgecolor='k')
plt.bar(center, hist, align='center', width=width)
plt.title("Z score distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()


# # Xác suất
# - Xác suất (probability)
# $$probability = \frac{event(s)}{outcome(s)}$$
# 
# - Phép đếm hoán vị (permutation)
# $$P(n, r) = \frac{n!}{(n - r)!}\\$$
# $$n: distinct\ object\ to\ choose\ from$$
# $$r: spaces\ to\ fill.$$
# 
# - Phép đếm tổ hợp (combination)
# $$C(n, r) = \frac{n!}{r!(n - r)!}$$
# 
# - Xác suất có điều kiện (conditional probability)
# $$P(B|A) = \frac{P(A \cap B)}{P(A)}$$
# 
# - Biến độc lập và biến phụ thuộc (independent/dependent variable)
#     - Independent variable: 
#         * Dress_ID
#         * Style
#         * Price
#         * Rating
#         * Size
#         * Season
#         * NeckLine
#         * SleeveLength
#         * waiseline
#         * Material
#         * FabricType
#         * Decoration
#         * Pattern Type
#     - Dependent variable: Recommendation
# 
# - Bayes
# $$P(A|B) = \frac{P(B|A) P(A)}{P(B)}\\$$
# $$Posterior = \frac{Likelihood * Prior}{Evidence}$$

# In[20]:


# Conditional probability, with every value expressed in percent:
# P(Style:sexy|Season:summer) = P(Style:sexy and Season:summer) / P(Season:summer)
# Matching is case-sensitive: only the exact values 'Sexy' and 'Summer' count.
# NOTE(review): if the data mixes spellings such as 'summer'/'Summer', those
# rows are not counted here - confirm against the CSV.
num_items = df.count()
df_summer = df.select("Season").where(df["Season"] == "Summer")
p_summer = df_summer.count() * 100.0 / num_items

df_sexy_summer = df.select(df.Style, df.Season).where("Style = 'Sexy' and Season = 'Summer'")
p_sexy_summer = df_sexy_summer.count() * 100.0 / num_items

# Ratio of two percentages, scaled by 100 so the result is again a percentage.
p_sexy_given_summer = p_sexy_summer * 100 / p_summer

# Single pre-formatted arguments print the same text under both the
# Python 2 print statement and the Python 3 print function.
print("P(Season:summer) {}".format(p_summer))
print("P(Style:sexy and Season:summer) {}".format(p_sexy_summer))
print("P(Style:sexy|Season:summer) {}".format(p_sexy_given_summer))


# In[21]:


# Bayes' rule, with every probability expressed in percent:
# P(Style:sexy|Season:summer) = P(Season:summer|Style:sexy) * P(Style:sexy) / P(Season:summer)
# p_summer and p_sexy_summer are reused from the conditional-probability cell.
num_items = df.count()
df_style = df.select(df.Style).where("Style = 'Sexy'")
p_sexy = df_style.count() * 100.0 / num_items

# Likelihood: percentage of 'Sexy' dresses whose season is 'Summer'.
p_summer_given_sexy = p_sexy_summer * 100 / p_sexy

# Posterior: percent * percent / percent leaves a percentage.
p_sexy_given_summer_bayes = p_summer_given_sexy * p_sexy / p_summer

# Single pre-formatted arguments print the same text under both the
# Python 2 print statement and the Python 3 print function.
print("P(Style:sexy) {}".format(p_sexy))
print("P(Season:summer) {}".format(p_summer))
print("P(Season:summer|Style:sexy) {}".format(p_summer_given_sexy))
print("P(Style:sexy|Season:summer) Bayes: {}".format(p_sexy_given_summer_bayes))


# # Central limit theorem
# - Dịch thành: Định lý giới hạn trung tâm.
# - Cho quần thể có phân bố bất kỳ.
# - Ta thực hiện lấy mẫu nhiều lần trên quần thể cho trước với số lượng xác định.
# - Mỗi lần lấy mẫu ta đi tính trung bình mẫu
# - Tổng hợp các trung bình mẫu này lại thành histogram.
# - Quan sát phân phối của trung bình mẫu của các mẫu ngẫu nhiên, ta thấy đây gần giống với phân phối chuẩn dù cho quần thể ban đầu có phân bố bất kỳ.

# In[25]:


# Simulating the central limit theorem: repeatedly draw a random sample of
# ratings and record the mean of each sample.
num_sample = 30.0
num_loop = 1000
samp_mean_ls = []

# Loop invariants, hoisted: the projected column (cached so the CSV is not
# re-read on each of the num_loop iterations) and the sampling fraction.
df_rating = df.select("Rating").cache()
num_items = df_rating.count()
sample_fraction = num_sample / num_items

for _ in range(num_loop):
    # sample() keeps each row independently with probability
    # `sample_fraction`, so a draw holds *about* num_sample rows, not exactly
    # that many. F.mean averages over the rows actually drawn, which is the
    # real sample mean; sum / num_sample (let alone sum * 100 / num_sample)
    # is not.
    df_rating_sample = df_rating.sample(False, sample_fraction)
    x_bar = df_rating_sample.agg(F.mean("Rating")).first()[0]

    # An empty draw (or one holding only null ratings) has no mean; skip it
    # rather than feeding None into the histogram.
    if x_bar is not None:
        samp_mean_ls.append(x_bar)


# In[26]:


# Left panel: distribution of the population (all ratings).
fig = plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
rating_values = df_ratio.select("Rating").rdd.flatMap(lambda row: row)
# RDD.histogram(70) returns (bucket boundaries, counts) for 70 even buckets.
bins, hist = (np.asarray(part) for part in rating_values.histogram(70))
# Bars cover 70% of a bucket so neighbouring bars stay visually separated.
width = (bins[1] - bins[0]) * 0.7
center = (bins[1:] + bins[:-1]) / 2

plt.bar(center, hist, width=width, align='center')
plt.title("Population distribution: Rating")
plt.xlabel("Rating")
plt.ylabel("Frequency")

# Right panel: distribution of the simulated sample means.
plt.subplot(1, 2, 2)
# np.histogram returns (counts, bin edges) - the reverse order of the RDD call.
hist, bins = np.histogram(samp_mean_ls)
width = (bins[1] - bins[0]) * 0.7
center = (bins[1:] + bins[:-1]) / 2
plt.bar(center, hist, width=width, align='center')
plt.title("Sampling distribution of mean: Rating")
plt.xlabel("Sample mean")
plt.show()