#!/usr/bin/env python
# coding: utf-8

# # Data description
# - Dataset: [Dresses_Attribute_Sales](https://archive.ics.uci.edu/ml/datasets/Dresses_Attribute_Sales)
# - Associated Tasks: Classification, Clustering

# In[1]:

# import all necessary libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt


def _column_histogram(frame, column, num_bins=70):
    """Bucket one numeric column of a Spark DataFrame for a bar plot.

    Args:
        frame: Spark DataFrame holding the column.
        column: name of the numeric column to bucket.
        num_bins: number of evenly spaced buckets passed to ``RDD.histogram``.

    Returns:
        Tuple ``(center, hist, width)``: bucket midpoints, per-bucket counts
        and the bar width (70% of one bucket), as expected by ``plt.bar``.
    """
    # RDD.histogram returns (bucket edges, counts); flatMap unwraps each
    # single-column Row into its bare value first.
    bins, hist = frame.select(column).rdd.flatMap(lambda row: row).histogram(num_bins)
    bins = np.asarray(bins)
    hist = np.asarray(hist)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    return center, hist, width


# In[2]:

# initialize Spark
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data description Spark")
    .getOrCreate()
)


# In[3]:

# reading dataset to dataframe
schema = StructType([
    StructField("Dress_ID", StringType(), True),
    StructField("Style", StringType(), True),
    StructField("Price", StringType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Size", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("NeckLine", StringType(), True),
    StructField("SleeveLength", StringType(), True),
    StructField("waiseline", StringType(), True),
    StructField("Material", StringType(), True),
    StructField("FabricType", StringType(), True),
    StructField("Decoration", StringType(), True),
    StructField("Pattern Type", StringType(), True),
    StructField("Recommendation", IntegerType(), True),
])

# "csv" is the built-in reader that ships with every SparkSession-capable
# Spark release, so the external com.databricks.spark.csv name is not needed.
df = (
    spark.read
    .schema(schema)
    .format("csv")
    .option("header", "true")
    .load("Dresses_Attribute_Sales.csv")
)
df.show(5)


# # Data types
# - Nominal
# - Numbers
# - Ordinal
# - Ratio

# In[4]:

# Nominal
df_nomial = df.select("Style", "NeckLine", "Material", "Pattern Type")
df_nomial.show(5)


# In[5]:

# Numbers
df_numbers = df.select("Dress_ID")
df_numbers.show(5)


# In[6]:

# Ordinal
df_ordinal = df.select("Size")
df_ordinal.show(5)


# In[7]:

# Ratio
df_ratio = df.select("Rating")
df_ratio.show(5)


# # Central tendency
# - Mean
# $$\mu = \bar{x} = \frac{1}{n} \sum_{i=1}^n x_i = \frac{1}{n} (x_1 + ... + x_n)$$
# - Median
# - Mode

# In[8]:

df_nomial.describe().show()


# In[9]:

df_numbers.describe().show()


# In[10]:

df_ordinal.describe().show()


# In[11]:

df_ratio.describe().show()


# In[12]:

mean_rating = df_ratio.agg(F.mean("Rating")).first()[0]
print("Mean rating: {}".format(mean_rating))


# In[13]:

# Register the view on the session created above; a bare `sqlContext` only
# exists inside the interactive pyspark shell, not in a standalone script.
df_ratio.createOrReplaceTempView("df_ratio")
median_rating = spark.sql("""
    SELECT percentile(Rating, 0.5) AS median_rating
    FROM df_ratio
""").first()["median_rating"]
print("Median rating: {}".format(median_rating))


# In[14]:

counts = df_ratio.groupBy("Rating").count()
# Most frequent rating; ties are broken by the smaller rating so that the
# result is deterministic across runs.
mode_rating = counts.orderBy(F.desc("count"), F.asc("Rating")).first()["Rating"]
print("Mode rating: {}".format(mode_rating))


# In[15]:

# visualize the rating column together with its mean, median and mode
fig, ax = plt.subplots(figsize=(15, 5))
# The labels are what ax.legend() below picks up.
ax.axvline(mean_rating, color='red', linewidth=5, label="Mean")
ax.axvline(median_rating, color='green', linewidth=5, label="Median")
ax.axvline(mode_rating, color='blue', linewidth=5, label="Mode")


def add_arrow(label, val, align="left"):
    """Annotate the module-level ``ax`` with an arrow pointing at ``val``.

    Args:
        label: text shown before the value (e.g. "Mean").
        val: x position (data coordinates) the arrow points to; it is also
            rendered with two decimals in the annotation text.
        align: horizontal alignment of the annotation text.
    """
    ax.annotate(
        label + ': {:0.2f}'.format(val),
        # x is in data coordinates, y=1 is the top edge of the axes.
        xy=(val, 1),
        xytext=(15, 15),
        xycoords=('data', 'axes fraction'),
        textcoords='offset points',
        horizontalalignment=align,
        verticalalignment='center',
        arrowprops=dict(
            arrowstyle='-|>',
            fc='black',
            shrinkA=0,
            shrinkB=0,
            connectionstyle='angle,angleA=0,angleB=90,rad=10',
        ),
    )


add_arrow("Mean", mean_rating)
add_arrow("Median", median_rating)
add_arrow("Mode", mode_rating)
ax.margins(0.05)

center, hist, width = _column_histogram(df_ratio, "Rating")
ax.bar(center, hist, align='center', width=width)
ax.legend(loc='upper left')
plt.title("Rating Histogram")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()


# # Variability of the data
# - Range
# $$range = maxValue - minValue$$
#
# - Variance
# $$\sigma^2 = \frac{\sum_{i=1}^n (x_i - \mu)^2}{n}$$
#
# - Standard deviation
# $$\sigma = \sqrt{\frac{\sum_{i=1}^n (x_i - \mu)^2}{n}}$$
#
# - Z-score: transformation of the sample mean used to perform a Z-test
#     * Shift the sample mean to 0 by taking $X - \mu$
#     * Rescale the spread of the original sample by dividing by $\sigma$
# $$Z = \frac{X - \mu}{\sigma}$$
#
# - Percentile
# $$percentile \ of \ x = \frac{No. value \ below \ x}{n} * 100\\$$
# $$quartiles = \frac{percentile * n}{100}$$

# In[16]:

# range of rating (min and max fetched in a single Spark job)
min_rating, max_rating = df_ratio.agg(F.min("Rating"), F.max("Rating")).first()
range_rating = max_rating - min_rating
print("Min rating: {}".format(min_rating))
print("Max rating: {}".format(max_rating))
print("Rating range: {}".format(range_rating))


# In[17]:

# variance of rating
# NOTE: F.variance is Spark's alias for var_samp (divides by n - 1), whereas
# the formula above is the population form (divides by n).
var_rating = df_ratio.agg(F.variance("Rating")).first()[0]
print("Rating variance: {}".format(var_rating))


# In[18]:

# standard deviation of rating
# NOTE: F.stddev is Spark's alias for stddev_samp (divides by n - 1).
std_rating = df_ratio.agg(F.stddev("Rating")).first()[0]
print("Rating standard deviation: {}".format(std_rating))


# In[19]:

# z-score of rating: (Rating - mean) / std.
# Built with column expressions so the subtraction is grouped before the
# division; a concatenated "Rating - mean / std" string would divide first.
df_z_score_rating = df_ratio.select(
    ((F.col("Rating") - mean_rating) / std_rating).alias("Rating")
)
print("\nZ-score of rating:")
df_z_score_rating.show(5)

center, hist, width = _column_histogram(df_z_score_rating, "Rating")

# plotting
fig = plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.bar(center, hist, align='center', width=width)
plt.title("Z score distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()


# # Probability
# - Probability
# $$probability = \frac{event(s)}{outcome(s)}$$
#
# - Counting permutations
# $$P(n, r) = \frac{n!}{(n - r)!}\\$$
# $$n: distinct\ object\ to\ choose\ from$$
# $$r: spaces\ to\ fill.$$
#
# - Counting combinations
# $$C(n, r) = \frac{n!}{r!(n - r)!}$$
#
# - Conditional probability
# $$P(B|A) = \frac{P(A \cap B)}{P(A)}$$
#
# - Independent and dependent variables
#     - Independent variable:
#         * Dress_ID
#         * Style
#         * Price
#         * Rating
#         * Size
#         * Season
#         * NeckLine
#         * SleeveLength
#         * waiseline
#         * Material
#         * FabricType
#         * Decoration
#         * Pattern Type
#     - Dependent variable: Recommendation
#
# - Bayes
# $$P(A|B) = \frac{P(B|A) P(A)}{P(B)}\\$$
# $$Posterior = \frac{Likelihood * Prior}{Evidence}$$

# In[20]:

# Conditional probability
# P(Style:sexy|Season:summer) = P(Style:sexy and Season:summer) / P(Season:summer)
# All probabilities below are expressed as percentages (0-100).
# NOTE(review): the filters match 'Summer' and 'Sexy' case-sensitively;
# confirm the CSV has no differently-cased variants of these values.
num_items = df.select("Rating").count()

df_summer = df.select("Season").where(F.col("Season") == "Summer")
p_summer = df_summer.count() * 100.0 / num_items

df_sexy_summer = df.select("Style", "Season").where(
    (F.col("Style") == "Sexy") & (F.col("Season") == "Summer")
)
p_sexy_summer = df_sexy_summer.count() * 100.0 / num_items

p_sexy_given_summer = p_sexy_summer * 100 / p_summer
print("P(Season:summer) {}".format(p_summer))
print("P(Style:sexy and Season:summer) {}".format(p_sexy_summer))
print("P(Style:sexy|Season:summer) {}".format(p_sexy_given_summer))


# In[21]:

# Bayes
# P(Style:sexy|Season:summer) = P(Season:summer|Style:sexy) * P(Style:sexy) / P(Season:summer)
df_style = df.select("Style").where(F.col("Style") == "Sexy")
p_sexy = df_style.count() * 100.0 / num_items
p_summer_given_sexy = p_sexy_summer * 100 / p_sexy
p_sexy_given_summer_bayes = p_summer_given_sexy * p_sexy / p_summer
print("P(Style:sexy) {}".format(p_sexy))
print("P(Season:summer) {}".format(p_summer))
print("P(Season:summer|Style:sexy) {}".format(p_summer_given_sexy))
print("P(Style:sexy|Season:summer) Bayes: {}".format(p_sexy_given_summer_bayes))


# # Central limit theorem
# - Vietnamese name: Định lý giới hạn trung tâm.
# - Start from a population with an arbitrary distribution.
# - Repeatedly draw samples of a fixed size from that population.
# - Compute the sample mean of every sample.
# - Collect those sample means into a histogram.
# - The distribution of the sample means of the random samples is close to a
#   normal distribution, whatever the distribution of the original population.

# In[25]:

# Simulating Central limit theorem
num_sample = 30.0
num_loop = 1000
# The same column is sampled num_loop times, so keep it cached instead of
# re-reading the CSV for every iteration.
df_rating = df.select("Rating").cache()
num_items = df_rating.count()
# Sampling without replacement only accepts fractions up to 1.
sample_fraction = min(1.0, num_sample / num_items)
samp_mean_ls = []
for _ in range(num_loop):
    df_rating_sample = df_rating.sample(False, sample_fraction)
    # sample() keeps each row with probability `sample_fraction`, so the
    # sample size varies around num_sample; average over the rows actually
    # drawn rather than dividing the sum by the nominal size.
    x_bar = df_rating_sample.agg(F.mean("Rating")).first()[0]
    if x_bar is not None:  # an empty sample has no mean
        samp_mean_ls.append(x_bar)


# In[26]:

# plotting population
fig = plt.figure(figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
center, hist, width = _column_histogram(df_ratio, "Rating")
plt.bar(center, hist, align='center', width=width)
plt.title("Population distribution: Rating")
plt.xlabel("Rating")
plt.ylabel("Frequency")

# plotting sampling distribution
plt.subplot(1, 2, 2)
hist, bins = np.histogram(samp_mean_ls)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Sampling distribution of mean: Rating")
plt.xlabel("Sample mean")
plt.show()