#!/usr/bin/env python
# coding: utf-8
# # 데이터 다루기 (Data Handling) - Iris Data Set
# ## 1. Iris(붓꽃) Data Set 개요
# - 참고: https://archive.ics.uci.edu/ml/datasets/Iris
# - 데이터 원본: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# - 데이터 제목: Iris Plants Database
# - 원작자 정보
# - Creator: R.A. Fisher
# - Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
# - Date: July, 1988
# - 사례 개수: 150
# - 각 붓꽃 종류당 50개씩
# - 4개의 독립 변수(Feature)
# - a: 꽃받침 길이 (sepal length in cm)
# - b: 꽃받침 넓이 (sepal width in cm)
# - c: 꽃잎 길이 (petal length in cm)
# - d: 꽃잎 넓이 (petal width in cm)
# - 3부류의 붓꽃 종류 (Image Source: wikipedia)
#
#
# Iris Setosa | Iris Versicolor | Iris Virginica |
#
#
# |
# |
# |
#
#
# - 참고 (Image Source: wikipedia)
# - Sepal(꽃받침), Petal(꽃잎)
# ## 2. 데이터 로딩하기
# In[1]:
import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')

# Location of the raw iris CSV; the file has no header row, so column
# names are supplied explicitly below.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
all_names = feature_names + ('class',)
# pandas.read_csv accepts a URL directly, so the explicit
# urllib2.urlopen() round-trip (whose handle was never closed) is
# unnecessary; read_csv manages the connection itself.
df = pd.read_csv(path, names=all_names)
# In[2]:
df

# - Basic summary statistics via DataFrame.describe()
# In[3]:
df.describe()

# In[4]:
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
df_group = df.groupby('class')['class']
# Single-argument print(...) behaves identically as a Python 2 statement
# (parenthesized expression) and as the Python 3 function call.
print(df_group.count())

# One sub-frame per species; reused throughout the EDA section below.
Iris_Se_Sub_Df = df[df['class'] == iris_names[0]]
Iris_Ve_Sub_Df = df[df['class'] == iris_names[1]]
Iris_Vi_Sub_Df = df[df['class'] == iris_names[2]]

print("")
print(Iris_Se_Sub_Df)
# ## 3. 탐색적 자료 분석 (Exploratory data analysis)
# - Sepal length (a) and Sepal width (b)
# - Sepal length (a) and Petal length (c)
# - Sepal length (a) and Petal width (d)
# - Sepal width (b) and Petal length (c)
# - Sepal width (b) and Petal width (d)
# - Petal length (c) and Petal width (d)
# In[5]:
unit_str = ' (cm)'

# (x-feature index, y-feature index, y-axis lower limit, y-axis upper limit)
# — one entry per pairwise feature combination, in the order listed above.
pair_specs = [
    (0, 1, 1.5, 5.0),
    (0, 2, 0.0, 9.0),
    (0, 3, -0.5, 3.5),
    (1, 2, 0.0, 9.0),
    (1, 3, 0.0, 3.5),
    (2, 3, 0.0, 3.5),
]

# Expand the compact spec table into the per-plot option dictionaries.
options = {}
for idx, (ix, iy, ymin, ymax) in enumerate(pair_specs):
    options[idx] = {
        'data_x': feature_names[ix],
        'data_y': feature_names[iy],
        'label_x': feature_names[ix] + unit_str,
        'label_y': feature_names[iy] + unit_str,
        'ylim_min': ymin,
        'ylim_max': ymax,
    }

fig = plt.figure(figsize=(17, 12))
# Create the 2x3 grid of axes first, then draw each panel:
# one scatter per species, colored red/blue/green.
ax = [fig.add_subplot(2, 3, k + 1) for k in range(6)]
for k in range(6):
    opt = options[k]
    se = ax[k].scatter(Iris_Se_Sub_Df[opt['data_x']], Iris_Se_Sub_Df[opt['data_y']], color='red')
    ve = ax[k].scatter(Iris_Ve_Sub_Df[opt['data_x']], Iris_Ve_Sub_Df[opt['data_y']], color='blue')
    vi = ax[k].scatter(Iris_Vi_Sub_Df[opt['data_x']], Iris_Vi_Sub_Df[opt['data_y']], color='green')
    ax[k].set_xlabel(opt['label_x'])
    ax[k].set_ylabel(opt['label_y'])
    ax[k].set_ylim([opt['ylim_min'], opt['ylim_max']])
    ax[k].legend((se, ve, vi), iris_names)
# In[6]:
# Keep only the four numeric feature columns (drop the 'class' label).
# .ix was deprecated in pandas 0.20 and removed in 1.0; .iloc is the
# positional-indexing replacement with identical semantics here.
df2 = df.iloc[:, 0:4]
df2[0:5]

# In[21]:
# scatter_matrix moved from pandas.tools.plotting to pandas.plotting in
# pandas 0.20; the fallback keeps older environments working.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
_ = scatter_matrix(df2, figsize=(9, 9), diagonal='kde')

# - KDE: Kernel Density Estimation (the four line plots on the diagonal above)
# - Shows each variable's value range and how densely values occur
#   (a smoothed version of the histogram)
# - Details: http://darkpgmr.tistory.com/147
# In[23]:
# Per-feature mean and standard deviation for each of the three species.
# NOTE: this dict was originally named `stats`, which shadowed the
# `from scipy import stats` module import at the top of the file — renamed.
feature_stats = {}
for i in range(4):
    feature_stats[i] = {
        'mean': (Iris_Se_Sub_Df[feature_names[i]].mean(),
                 Iris_Ve_Sub_Df[feature_names[i]].mean(),
                 Iris_Vi_Sub_Df[feature_names[i]].mean()),
        'std': (Iris_Se_Sub_Df[feature_names[i]].std(),
                Iris_Ve_Sub_Df[feature_names[i]].std(),
                Iris_Vi_Sub_Df[feature_names[i]].std()),
    }

ind = Series([0.5, 1.5, 2.5])  # left edge of each bar, one slot per species
width = 0.5                    # bar width in x-axis units

fig = plt.figure(figsize=(20, 5))
ay = []
for i in range(4):
    ay.append(fig.add_subplot(140 + (i + 1)))
for i in range(4):
    # Bar height = species mean; error bar = one standard deviation.
    # (Uses the `width` variable instead of repeating the 0.5 literal.)
    ay[i].bar(ind, feature_stats[i]['mean'], width, color='magenta', yerr=feature_stats[i]['std'])
    ay[i].set_xlim([0, 3.5])
    ay[i].set_ylabel('Mean of ' + feature_names[i])
    ay[i].set_xticks(ind + width / 2)
    ay[i].set_xticklabels(iris_names)

# - box plot
# - Reference: https://goo.gl/ghxcx2
# In[85]:
_ = df2.boxplot()
# ## 4. scikit 활용한 데이터 탐색
# In[9]:
from sklearn.datasets import load_iris
from sklearn import tree

# Load scikit-learn's bundled copy of the iris data (no network needed).
iris = load_iris()
# Single-argument print(...) is identical under Python 2 and Python 3.
print(type(iris))

# In[10]:
iris.keys()
# In[11]:
iris.target_names
# In[12]:
iris.feature_names
# In[13]:
print(iris.DESCR)
# In[14]:
iris.data[0:5]
# In[15]:
iris.target[0:5]
# In[16]:
iris.data[50:55]
# In[17]:
iris.target[50:55]
# ## 5. Spark을 활용한 데이터 탐색
# - 참고: https://github.com/jadianes/spark-py-notebooks/blob/master/nb7-mllib-statistics/nb7-mllib-statistics.ipynb
# In[5]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkFiles, SQLContext

# Create the SparkContext only once; re-running this cell must not try
# to start a second context.
if 'sc' not in locals():
    sc = SparkContext()

# urlretrieve moved to urllib.request in Python 3; the fallback keeps
# Python 2 working with the same function.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
_ = urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.data")

data_file = "./iris.data"
raw_data = sc.textFile(data_file)

# num of parallel cores
print(sc.defaultParallelism)
print(raw_data.count())

# The raw file ends with an empty line; drop blank records so every
# remaining line parses into a 4-float vector below.
raw_data = sc.textFile(data_file).filter(lambda x: x != '')
print(raw_data.count())
print(raw_data.take(5))
# In[60]:
def parse_raw_data(line):
    """Convert one CSV record of the iris file into a 4-element float vector.

    Only the four numeric measurement fields are kept; the trailing
    class label (and anything after it) is discarded. NumPy performs the
    string-to-float conversion via the explicit dtype.
    """
    measurements = line.split(",")[:4]
    return np.array(measurements, dtype=float)
# Parse every record into a NumPy vector of the four measurements.
vector_data = raw_data.map(parse_raw_data)
# Single-argument print(...) is identical under Python 2 and Python 3.
print(vector_data.take(5))
# - Summary statistics using Spark ML
# In[74]:
from pyspark.mllib.stat import Statistics
from math import sqrt

# Compute column summary statistics (mean/variance/min/max/...) over the RDD.
summary = Statistics.colStats(vector_data)

print("Statistics:")
for i in range(4):
    # colStats() reports the variance; take the square root for the st. dev.
    print(" Mean - {}: {}".format(feature_names[i], round(summary.mean()[i], 3)))
    print(" St. Dev - {}: {}".format(feature_names[i], round(sqrt(summary.variance()[i]), 3)))
    print(" Max value - {}: {}".format(feature_names[i], round(summary.max()[i], 3)))
    print(" Min value - {}: {}".format(feature_names[i], round(summary.min()[i], 3)))
    print(" Number of non-zero values - {}: {}".format(feature_names[i], summary.numNonzeros()[i]))
    print("")
# - Correlation
# In[75]:
from pyspark.mllib.stat import Statistics

# Spearman rank correlation between the four measurement columns.
correlation_matrix = Statistics.corr(vector_data, method="spearman")

# In[76]:
print(type(correlation_matrix))

# In[77]:
# Wrap the raw matrix in a DataFrame for aligned, labeled printing.
print(pd.DataFrame(correlation_matrix))