Iris Setosa | Iris Versicolor | Iris Virginica |
---|---|---|
import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from contextlib import closing

# UCI copy of the Iris data: four numeric measurements followed by the
# species label on every row.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
all_names = feature_names + ('class',)

# Column names are supplied explicitly via `names=`.
# urllib2 responses are not context managers, so closing() is used to make
# sure the HTTP connection is released once pandas has consumed the body.
with closing(urllib2.urlopen(path)) as raw_csv:
    df = pd.read_csv(raw_csv, names=all_names)
/Users/yhhan/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
df
sepal length | sepal width | petal length | petal width | class | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | Iris-setosa |
6 | 4.6 | 3.4 | 1.4 | 0.3 | Iris-setosa |
7 | 5.0 | 3.4 | 1.5 | 0.2 | Iris-setosa |
8 | 4.4 | 2.9 | 1.4 | 0.2 | Iris-setosa |
9 | 4.9 | 3.1 | 1.5 | 0.1 | Iris-setosa |
10 | 5.4 | 3.7 | 1.5 | 0.2 | Iris-setosa |
11 | 4.8 | 3.4 | 1.6 | 0.2 | Iris-setosa |
12 | 4.8 | 3.0 | 1.4 | 0.1 | Iris-setosa |
13 | 4.3 | 3.0 | 1.1 | 0.1 | Iris-setosa |
14 | 5.8 | 4.0 | 1.2 | 0.2 | Iris-setosa |
15 | 5.7 | 4.4 | 1.5 | 0.4 | Iris-setosa |
16 | 5.4 | 3.9 | 1.3 | 0.4 | Iris-setosa |
17 | 5.1 | 3.5 | 1.4 | 0.3 | Iris-setosa |
18 | 5.7 | 3.8 | 1.7 | 0.3 | Iris-setosa |
19 | 5.1 | 3.8 | 1.5 | 0.3 | Iris-setosa |
20 | 5.4 | 3.4 | 1.7 | 0.2 | Iris-setosa |
21 | 5.1 | 3.7 | 1.5 | 0.4 | Iris-setosa |
22 | 4.6 | 3.6 | 1.0 | 0.2 | Iris-setosa |
23 | 5.1 | 3.3 | 1.7 | 0.5 | Iris-setosa |
24 | 4.8 | 3.4 | 1.9 | 0.2 | Iris-setosa |
25 | 5.0 | 3.0 | 1.6 | 0.2 | Iris-setosa |
26 | 5.0 | 3.4 | 1.6 | 0.4 | Iris-setosa |
27 | 5.2 | 3.5 | 1.5 | 0.2 | Iris-setosa |
28 | 5.2 | 3.4 | 1.4 | 0.2 | Iris-setosa |
29 | 4.7 | 3.2 | 1.6 | 0.2 | Iris-setosa |
... | ... | ... | ... | ... | ... |
120 | 6.9 | 3.2 | 5.7 | 2.3 | Iris-virginica |
121 | 5.6 | 2.8 | 4.9 | 2.0 | Iris-virginica |
122 | 7.7 | 2.8 | 6.7 | 2.0 | Iris-virginica |
123 | 6.3 | 2.7 | 4.9 | 1.8 | Iris-virginica |
124 | 6.7 | 3.3 | 5.7 | 2.1 | Iris-virginica |
125 | 7.2 | 3.2 | 6.0 | 1.8 | Iris-virginica |
126 | 6.2 | 2.8 | 4.8 | 1.8 | Iris-virginica |
127 | 6.1 | 3.0 | 4.9 | 1.8 | Iris-virginica |
128 | 6.4 | 2.8 | 5.6 | 2.1 | Iris-virginica |
129 | 7.2 | 3.0 | 5.8 | 1.6 | Iris-virginica |
130 | 7.4 | 2.8 | 6.1 | 1.9 | Iris-virginica |
131 | 7.9 | 3.8 | 6.4 | 2.0 | Iris-virginica |
132 | 6.4 | 2.8 | 5.6 | 2.2 | Iris-virginica |
133 | 6.3 | 2.8 | 5.1 | 1.5 | Iris-virginica |
134 | 6.1 | 2.6 | 5.6 | 1.4 | Iris-virginica |
135 | 7.7 | 3.0 | 6.1 | 2.3 | Iris-virginica |
136 | 6.3 | 3.4 | 5.6 | 2.4 | Iris-virginica |
137 | 6.4 | 3.1 | 5.5 | 1.8 | Iris-virginica |
138 | 6.0 | 3.0 | 4.8 | 1.8 | Iris-virginica |
139 | 6.9 | 3.1 | 5.4 | 2.1 | Iris-virginica |
140 | 6.7 | 3.1 | 5.6 | 2.4 | Iris-virginica |
141 | 6.9 | 3.1 | 5.1 | 2.3 | Iris-virginica |
142 | 5.8 | 2.7 | 5.1 | 1.9 | Iris-virginica |
143 | 6.8 | 3.2 | 5.9 | 2.3 | Iris-virginica |
144 | 6.7 | 3.3 | 5.7 | 2.5 | Iris-virginica |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
df.describe()
sepal length | sepal width | petal length | petal width | |
---|---|---|---|---|
count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
mean | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
std | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
df_group = df.groupby('class')['class']
print df_group.count()
Iris_Se_Sub_Df = df[df['class'] == iris_names[0]]
Iris_Ve_Sub_Df = df[df['class'] == iris_names[1]]
Iris_Vi_Sub_Df = df[df['class'] == iris_names[2]]
print
print Iris_Se_Sub_Df
class Iris-setosa 50 Iris-versicolor 50 Iris-virginica 50 Name: class, dtype: int64 sepal length sepal width petal length petal width class 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa 5 5.4 3.9 1.7 0.4 Iris-setosa 6 4.6 3.4 1.4 0.3 Iris-setosa 7 5.0 3.4 1.5 0.2 Iris-setosa 8 4.4 2.9 1.4 0.2 Iris-setosa 9 4.9 3.1 1.5 0.1 Iris-setosa 10 5.4 3.7 1.5 0.2 Iris-setosa 11 4.8 3.4 1.6 0.2 Iris-setosa 12 4.8 3.0 1.4 0.1 Iris-setosa 13 4.3 3.0 1.1 0.1 Iris-setosa 14 5.8 4.0 1.2 0.2 Iris-setosa 15 5.7 4.4 1.5 0.4 Iris-setosa 16 5.4 3.9 1.3 0.4 Iris-setosa 17 5.1 3.5 1.4 0.3 Iris-setosa 18 5.7 3.8 1.7 0.3 Iris-setosa 19 5.1 3.8 1.5 0.3 Iris-setosa 20 5.4 3.4 1.7 0.2 Iris-setosa 21 5.1 3.7 1.5 0.4 Iris-setosa 22 4.6 3.6 1.0 0.2 Iris-setosa 23 5.1 3.3 1.7 0.5 Iris-setosa 24 4.8 3.4 1.9 0.2 Iris-setosa 25 5.0 3.0 1.6 0.2 Iris-setosa 26 5.0 3.4 1.6 0.4 Iris-setosa 27 5.2 3.5 1.5 0.2 Iris-setosa 28 5.2 3.4 1.4 0.2 Iris-setosa 29 4.7 3.2 1.6 0.2 Iris-setosa 30 4.8 3.1 1.6 0.2 Iris-setosa 31 5.4 3.4 1.5 0.4 Iris-setosa 32 5.2 4.1 1.5 0.1 Iris-setosa 33 5.5 4.2 1.4 0.2 Iris-setosa 34 4.9 3.1 1.5 0.1 Iris-setosa 35 5.0 3.2 1.2 0.2 Iris-setosa 36 5.5 3.5 1.3 0.2 Iris-setosa 37 4.9 3.1 1.5 0.1 Iris-setosa 38 4.4 3.0 1.3 0.2 Iris-setosa 39 5.1 3.4 1.5 0.2 Iris-setosa 40 5.0 3.5 1.3 0.3 Iris-setosa 41 4.5 2.3 1.3 0.3 Iris-setosa 42 4.4 3.2 1.3 0.2 Iris-setosa 43 5.0 3.5 1.6 0.6 Iris-setosa 44 5.1 3.8 1.9 0.4 Iris-setosa 45 4.8 3.0 1.4 0.3 Iris-setosa 46 5.1 3.8 1.6 0.2 Iris-setosa 47 4.6 3.2 1.4 0.2 Iris-setosa 48 5.3 3.7 1.5 0.2 Iris-setosa 49 5.0 3.3 1.4 0.2 Iris-setosa
unit_str = ' (cm)'

# Plot configuration keyed 0-5, one entry per feature pair: the columns to put
# on each axis, the axis labels (feature name plus unit) and the y-axis limits.
options = {
    panel: {
        'data_x': feature_names[x_idx],
        'data_y': feature_names[y_idx],
        'label_x': feature_names[x_idx] + unit_str,
        'label_y': feature_names[y_idx] + unit_str,
        'ylim_min': y_low,
        'ylim_max': y_high,
    }
    # (x feature index, y feature index, y-axis lower limit, y-axis upper limit)
    for panel, (x_idx, y_idx, y_low, y_high) in enumerate((
        (0, 1, 1.5, 5.0),
        (0, 2, 0.0, 9.0),
        (0, 3, -0.5, 3.5),
        (1, 2, 0.0, 9.0),
        (1, 3, 0.0, 3.5),
        (2, 3, 0.0, 3.5),
    ))
}
fig = plt.figure(figsize=(17, 12))
# 2 x 3 grid of axes, one per entry in `options`.
ax = [fig.add_subplot(2, 3, i + 1) for i in range(6)]

for i, panel in enumerate(ax):
    cfg = options[i]
    x_col = cfg['data_x']
    y_col = cfg['data_y']

    # One scatter series per species; the handles feed the legend below.
    se = panel.scatter(Iris_Se_Sub_Df[x_col], Iris_Se_Sub_Df[y_col], color='red')
    ve = panel.scatter(Iris_Ve_Sub_Df[x_col], Iris_Ve_Sub_Df[y_col], color='blue')
    vi = panel.scatter(Iris_Vi_Sub_Df[x_col], Iris_Vi_Sub_Df[y_col], color='green')

    panel.set_xlabel(cfg['label_x'])
    panel.set_ylabel(cfg['label_y'])
    panel.set_ylim([cfg['ylim_min'], cfg['ylim_max']])
    panel.legend((se, ve, vi), iris_names)
# Keep only the first four columns (the numeric features), dropping 'class'.
# .iloc replaces the deprecated .ix indexer; the selection is positional in
# both cases because the column labels are strings, not integers.
df2 = df.iloc[:, 0:4]
df2[0:5]
sepal length | sepal width | petal length | petal width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
try:
    # Current location of the plotting helpers.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only provide the original module path.
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the four features, with a kernel density
# estimate of each feature on the diagonal.
_ = scatter_matrix(df2, figsize=(9,9), diagonal='kde')
# Per-feature summary of each species:
#   iris_stats[feature index] -> {'mean': (se, ve, vi), 'std': (se, ve, vi)}
# The dict is deliberately not called `stats`, which would rebind the name
# imported by `from scipy import stats`.
species_frames = (Iris_Se_Sub_Df, Iris_Ve_Sub_Df, Iris_Vi_Sub_Df)
iris_stats = {}
for i, feature in enumerate(feature_names):
    iris_stats[i] = {
        'mean': tuple(frame[feature].mean() for frame in species_frames),
        'std': tuple(frame[feature].std() for frame in species_frames),
    }

# Left edge of each species' bar, and the bar width.
ind = Series([0.5, 1.5, 2.5])
width = 0.5

fig = plt.figure(figsize=(20, 5))
# One row of four panels, one per feature.
ay = [fig.add_subplot(1, 4, i + 1) for i in range(4)]

for i, panel in enumerate(ay):
    # align='edge' pins the bars to start at `ind`, so the ticks placed at
    # ind + width/2 below sit under the bar centres whatever the library default.
    panel.bar(ind, iris_stats[i]['mean'], width, color='magenta',
              yerr=iris_stats[i]['std'], align='edge')
    panel.set_xlim([0, 3.5])
    panel.set_ylabel('Mean of ' + feature_names[i])
    panel.set_xticks(ind + width/2)
    panel.set_xticklabels(iris_names)
# Box plot of the four feature columns. return_type is passed explicitly to
# opt in to the future default and silence the FutureWarning pandas emits
# when it is left unset.
_ = df2.boxplot(return_type='axes')
/Users/yhhan/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. if __name__ == '__main__':
from sklearn.datasets import load_iris
from sklearn import tree
# Load the Iris data via scikit-learn's load_iris() and show the type of the
# returned container.
iris = load_iris()
print type(iris)
<class 'sklearn.datasets.base.Bunch'>
iris.keys()
['target_names', 'data', 'target', 'DESCR', 'feature_names']
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='|S10')
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print iris.DESCR
Iris Plants Database Notes ----- Data Set Characteristics: :Number of Instances: 150 (50 in each of three classes) :Number of Attributes: 4 numeric, predictive attributes and the class :Attribute Information: - sepal length in cm - sepal width in cm - petal length in cm - petal width in cm - class: - Iris-Setosa - Iris-Versicolour - Iris-Virginica :Summary Statistics: ============== ==== ==== ======= ===== ==================== Min Max Mean SD Class Correlation ============== ==== ==== ======= ===== ==================== sepal length: 4.3 7.9 5.84 0.83 0.7826 sepal width: 2.0 4.4 3.05 0.43 -0.4194 petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) ============== ==== ==== ======= ===== ==================== :Missing Attribute Values: None :Class Distribution: 33.3% for each of 3 classes. :Creator: R.A. Fisher :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) :Date: July, 1988 This is a copy of UCI ML iris datasets. http://archive.ics.uci.edu/ml/datasets/Iris The famous Iris database, first used by Sir R.A Fisher This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. References ---------- - Fisher,R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950). - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System Structure and Classification Rule for Recognition in Partially Exposed Environments". 
IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. PAMI-2, No. 1, 67-71. - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions on Information Theory, May 1972, 431-433. - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II conceptual clustering system finds 3 classes in the data. - Many, many more ...
iris.data[0:5]
array([[ 5.1, 3.5, 1.4, 0.2], [ 4.9, 3. , 1.4, 0.2], [ 4.7, 3.2, 1.3, 0.2], [ 4.6, 3.1, 1.5, 0.2], [ 5. , 3.6, 1.4, 0.2]])
iris.target[0:5]
array([0, 0, 0, 0, 0])
iris.data[50:55]
array([[ 7. , 3.2, 4.7, 1.4], [ 6.4, 3.2, 4.5, 1.5], [ 6.9, 3.1, 4.9, 1.5], [ 5.5, 2.3, 4. , 1.3], [ 6.5, 2.8, 4.6, 1.5]])
iris.target[50:55]
array([1, 1, 1, 1, 1])
import findspark
findspark.init()
from pyspark import SparkContext, SparkFiles, SQLContext
if not 'sc' in locals():
sc = SparkContext()
import urllib
_ = urllib.urlretrieve ("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.data")
data_file = "./iris.data"
raw_data = sc.textFile(data_file)
# num of parallel cores
print sc.defaultParallelism
print raw_data.count()
raw_data = sc.textFile(data_file).filter(lambda x: x != '')
print raw_data.count()
print raw_data.take(5)
2 151 150 [u'5.1,3.5,1.4,0.2,Iris-setosa', u'4.9,3.0,1.4,0.2,Iris-setosa', u'4.7,3.2,1.3,0.2,Iris-setosa', u'4.6,3.1,1.5,0.2,Iris-setosa', u'5.0,3.6,1.4,0.2,Iris-setosa']
def parse_raw_data(line):
    """Convert one comma-separated record into a numeric feature vector.

    Only the first four fields are used; anything after them is ignored.

    Args:
        line: A single text line of comma-separated values.

    Returns:
        A numpy array holding the first (up to) four fields as floats.

    Raises:
        ValueError: If one of the used fields cannot be parsed as a float.
    """
    measurements = line.split(",")[:4]
    return np.array(list(map(float, measurements)))
# Turn every text record into a numeric vector (see parse_raw_data) and show
# the first five results.
vector_data = raw_data.map(parse_raw_data)
print vector_data.take(5)
[array([ 5.1, 3.5, 1.4, 0.2]), array([ 4.9, 3. , 1.4, 0.2]), array([ 4.7, 3.2, 1.3, 0.2]), array([ 4.6, 3.1, 1.5, 0.2]), array([ 5. , 3.6, 1.4, 0.2])]
from pyspark.mllib.stat import Statistics
from math import sqrt
# Compute column summary statistics.
summary = Statistics.colStats(vector_data)
print "Statistics:"
for i in range(4):
print " Mean - {}: {}".format(feature_names[i], round(summary.mean()[i],3))
print " St. Dev - {}: {}".format(feature_names[i], round(sqrt(summary.variance()[i]),3))
print " Max value - {}: {}".format(feature_names[i], round(summary.max()[i],3))
print " Min value - {}: {}".format(feature_names[i], round(summary.min()[i],3))
print " Number of non-zero values - {}: {}".format(feature_names[i], summary.numNonzeros()[i])
print
Statistics: Mean - sepal length: 5.843 St. Dev - sepal length: 0.828 Max value - sepal length: 7.9 Min value - sepal length: 4.3 Number of non-zero values - sepal length: 150.0 Mean - sepal width: 3.054 St. Dev - sepal width: 0.434 Max value - sepal width: 4.4 Min value - sepal width: 2.0 Number of non-zero values - sepal width: 150.0 Mean - petal length: 3.759 St. Dev - petal length: 1.764 Max value - petal length: 6.9 Min value - petal length: 1.0 Number of non-zero values - petal length: 150.0 Mean - petal width: 1.199 St. Dev - petal width: 0.763 Max value - petal width: 2.5 Min value - petal width: 0.1 Number of non-zero values - petal width: 150.0
from pyspark.mllib.stat import Statistics
# Pairwise correlation between the feature columns, computed with the
# Spearman method, followed by the type of the returned object.
correlation_matrix = Statistics.corr(vector_data, method="spearman")
print type(correlation_matrix)
<type 'numpy.ndarray'>
print pd.DataFrame(correlation_matrix)
0 1 2 3 0 1.000000 -0.159457 0.881386 0.834421 1 -0.159457 1.000000 -0.303421 -0.277511 2 0.881386 -0.303421 1.000000 0.936003 3 0.834421 -0.277511 0.936003 1.000000