#!/usr/bin/env python
# coding: utf-8

# # Data Handling - Iris Data Set

# ## 1. Overview of the Iris Data Set
# - Reference: https://archive.ics.uci.edu/ml/datasets/Iris
# - Raw data: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# - Title: Iris Plants Database
# - Source information
#   - Creator: R.A. Fisher
#   - Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
#   - Date: July, 1988
# - Number of instances: 150
#   - 50 per iris species
# - 4 features (independent variables)
#   - a: sepal length in cm
#   - b: sepal width in cm
#   - c: petal length in cm
#   - d: petal width in cm
# - 3 iris species (Image Source: wikipedia)
#   - [Images: Iris Setosa | Iris Versicolor | Iris Virginica]
# - Reference (Image Source: wikipedia)
#   - [Image: location of the Sepal and Petal on the flower]

# ## 2. Loading the Data

# In[1]:

from urllib.request import urlopen

from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

get_ipython().run_line_magic('matplotlib', 'inline')

path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
raw_csv = urlopen(path)
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
all_names = feature_names + ('class',)
df = pd.read_csv(raw_csv, names=all_names)

# In[2]:

df

# - Use the data frame's describe() to get basic summary statistics

# In[3]:

df.describe()

# In[4]:

iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
df_group = df.groupby('class')['class']
print(df_group.count())

# One sub-DataFrame per species
Iris_Se_Sub_Df = df[df['class'] == iris_names[0]]
Iris_Ve_Sub_Df = df[df['class'] == iris_names[1]]
Iris_Vi_Sub_Df = df[df['class'] == iris_names[2]]
print()
print(Iris_Se_Sub_Df)

# ## 3. Exploratory Data Analysis
# - Sepal length (a) and Sepal width (b)
# - Sepal length (a) and Petal length (c)
# - Sepal length (a) and Petal width (d)
# - Sepal width (b) and Petal length (c)
# - Sepal width (b) and Petal width (d)
# - Petal length (c) and Petal width (d)

# In[5]:

unit_str = ' (cm)'

# Axis settings for the six pairwise scatter plots
options = {
    0: {'data_x': feature_names[0], 'data_y': feature_names[1],
        'label_x': feature_names[0] + unit_str, 'label_y': feature_names[1] + unit_str,
        'ylim_min': 1.5, 'ylim_max': 5.0},
    1: {'data_x': feature_names[0], 'data_y': feature_names[2],
        'label_x': feature_names[0] + unit_str, 'label_y': feature_names[2] + unit_str,
        'ylim_min': 0.0, 'ylim_max': 9.0},
    2: {'data_x': feature_names[0], 'data_y': feature_names[3],
        'label_x': feature_names[0] + unit_str, 'label_y': feature_names[3] + unit_str,
        'ylim_min': -0.5, 'ylim_max': 3.5},
    3: {'data_x': feature_names[1], 'data_y': feature_names[2],
        'label_x': feature_names[1] + unit_str, 'label_y': feature_names[2] + unit_str,
        'ylim_min': 0.0, 'ylim_max': 9.0},
    4: {'data_x': feature_names[1], 'data_y': feature_names[3],
        'label_x': feature_names[1] + unit_str, 'label_y': feature_names[3] + unit_str,
        'ylim_min': 0.0, 'ylim_max': 3.5},
    5: {'data_x': feature_names[2], 'data_y': feature_names[3],
        'label_x': feature_names[2] + unit_str, 'label_y': feature_names[3] + unit_str,
        'ylim_min': 0.0, 'ylim_max': 3.5},
}

ax = []
fig = plt.figure(figsize=(17, 12))
for i in range(6):
    ax.append(fig.add_subplot(231 + i))

for i in range(6):
    se = ax[i].scatter(Iris_Se_Sub_Df[options[i]['data_x']], Iris_Se_Sub_Df[options[i]['data_y']], color='red')
    ve = ax[i].scatter(Iris_Ve_Sub_Df[options[i]['data_x']], Iris_Ve_Sub_Df[options[i]['data_y']], color='blue')
    vi = ax[i].scatter(Iris_Vi_Sub_Df[options[i]['data_x']], Iris_Vi_Sub_Df[options[i]['data_y']], color='green')
    ax[i].set_xlabel(options[i]['label_x'])
    ax[i].set_ylabel(options[i]['label_y'])
    ax[i].set_ylim([options[i]['ylim_min'], options[i]['ylim_max']])
    ax[i].legend((se, ve, vi), iris_names)

# In[6]:

df2 = df.iloc[:, 0:4]  # feature columns only (df.ix is deprecated)
df2[0:5]

# In[7]:

from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in newer pandas

_ = scatter_matrix(df2, figsize=(9, 9), diagonal='kde')

# - KDE: Kernel Density Estimation (the four line plots on the diagonal of the grid above)
#   - Shows the range of values each variable takes and how densely it takes them
#     (a smoothed version of the histogram)
#   - Detailed explanation: http://darkpgmr.tistory.com/147
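# - To see what those diagonal curves are doing, here is a minimal sketch that
#   estimates the same kind of density for a single column with
#   scipy.stats.gaussian_kde (an illustration added here, not part of the
#   original analysis; it assumes df2 from the cell above):

# In[ ]:

from scipy.stats import gaussian_kde

values = df2['sepal length'].values
kde = gaussian_kde(values)  # bandwidth chosen automatically (Scott's rule)
grid = np.linspace(values.min(), values.max(), 200)

plt.plot(grid, kde(grid), label='KDE')  # smooth density curve
plt.hist(values, bins=15, density=True, alpha=0.3, label='histogram')  # what it smooths
plt.xlabel('sepal length (cm)')
plt.ylabel('density')
_ = plt.legend()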
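# - Note: the per-class means and standard deviations computed from the three
#   sub-DataFrames in the next cell can also be obtained with a single groupby
#   call each; a minimal equivalent sketch (assuming df from Section 2):

# In[ ]:

print(df.groupby('class').mean())  # per-class mean of each feature
print(df.groupby('class').std())   # per-class standard deviation of each feature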
# In[8]:

# Per-class mean and standard deviation for each feature
# (renamed from `stats` to avoid shadowing scipy's stats module imported above)
class_stats = {}
for i in range(4):
    class_stats[i] = {}
    class_stats[i]['mean'] = (Iris_Se_Sub_Df[feature_names[i]].mean(),
                              Iris_Ve_Sub_Df[feature_names[i]].mean(),
                              Iris_Vi_Sub_Df[feature_names[i]].mean())
    class_stats[i]['std'] = (Iris_Se_Sub_Df[feature_names[i]].std(),
                             Iris_Ve_Sub_Df[feature_names[i]].std(),
                             Iris_Vi_Sub_Df[feature_names[i]].std())

ind = Series([0.5, 1.5, 2.5])
width = 0.5
fig = plt.figure(figsize=(20, 5))
ay = []
for i in range(4):
    ay.append(fig.add_subplot(141 + i))

for i in range(4):
    ay[i].bar(ind, class_stats[i]['mean'], width, color='magenta', yerr=class_stats[i]['std'])
    ay[i].set_xlim([0, 3.5])
    ay[i].set_ylabel('Mean of ' + feature_names[i])
    ay[i].set_xticks(ind)  # bars are center-aligned in matplotlib >= 2.0
    ay[i].set_xticklabels(iris_names)

# - Box plot
#   - Reference: https://goo.gl/ghxcx2

# In[9]:

_ = df2.boxplot()

# ## 4. Exploring the Data with scikit-learn

# In[10]:

from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
print(type(iris))

# In[11]:

iris.keys()

# In[12]:

iris.target_names

# In[13]:

iris.feature_names

# In[14]:

print(iris.DESCR)

# In[15]:

iris.data[0:5]

# In[16]:

iris.target[0:5]

# In[17]:

iris.data[50:55]

# In[18]:

iris.target[50:55]

# ## 5. Exploring the Data with Spark
# - Reference: https://github.com/jadianes/spark-py-notebooks/blob/master/nb7-mllib-statistics/nb7-mllib-statistics.ipynb

# In[19]:

import findspark
findspark.init()

from pyspark import SparkContext, SparkFiles, SQLContext

if 'sc' not in locals():
    sc = SparkContext()

from urllib.request import urlretrieve

_ = urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.data")

data_file = "./iris.data"
raw_data = sc.textFile(data_file)

# Number of parallel cores
print(sc.defaultParallelism)
print(raw_data.count())

# Drop blank lines (the file ends with an empty line)
raw_data = sc.textFile(data_file).filter(lambda x: x != '')
print(raw_data.count())
print(raw_data.take(5))

# In[20]:

def parse_raw_data(line):
    """Parse one CSV line into a NumPy vector of the four features."""
    line_split = line.split(",")[0:4]
    return np.array([float(x) for x in line_split])

vector_data = raw_data.map(parse_raw_data)
print(vector_data.take(5))

# - Summary statistics using Spark MLlib

# In[21]:

from math import sqrt

from pyspark.mllib.stat import Statistics

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

print("Statistics:")
for i in range(4):
    print(" Mean - {}: {}".format(feature_names[i], round(summary.mean()[i], 3)))
    print(" St. Dev - {}: {}".format(feature_names[i], round(sqrt(summary.variance()[i]), 3)))
    print(" Max value - {}: {}".format(feature_names[i], round(summary.max()[i], 3)))
    print(" Min value - {}: {}".format(feature_names[i], round(summary.min()[i], 3)))
    print(" Number of non-zero values - {}: {}".format(feature_names[i], summary.numNonzeros()[i]))
    print()

# - Correlation

# In[22]:

correlation_matrix = Statistics.corr(vector_data, method="spearman")

# In[23]:

print(type(correlation_matrix))

# In[24]:

print(pd.DataFrame(correlation_matrix))
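# - As a sanity check, the same Spearman correlation matrix can be computed
#   directly with pandas and compared against the Spark result (a sketch
#   assuming df2 from Section 3 is still in scope):

# In[ ]:

print(df2.corr(method='spearman'))  # should match the MLlib matrix above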
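# - Section 4 imports sklearn's tree module but never uses it; the following is
#   a minimal sketch of the presumably intended next step, fitting a decision
#   tree classifier on the full data set (an illustration, not part of the
#   original notebook):

# In[ ]:

from sklearn import tree
from sklearn.datasets import load_iris

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf.fit(iris.data, iris.target)    # train on all 150 instances

print(clf.predict(iris.data[:5]))  # predicted classes for the first five rows
print(iris.target[:5])             # true classes, for comparison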