Import packages and use SVM
Read the data with datasets
Split training and test data with train_test_split (it can split several arrays at once)
Plot the first 100 images
Create an SVC model with gamma = 0.001
Train the model
Predict and visualize the results (a minimal sketch of this whole workflow follows the outline)
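Before switching to the Kaggle CSV below, here is a minimal sketch of the workflow described in this outline, using sklearn's built-in digits dataset; the variable names are illustrative and not part of the notebook.
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Load the built-in 8x8 digit images and hold out 20% for evaluation
digits_ds = datasets.load_digits()
X_tr, X_te, y_tr, y_te = train_test_split(digits_ds.data, digits_ds.target, test_size=0.2)
# SVC with the small gamma mentioned in the outline
clf = SVC(gamma=0.001)
clf.fit(X_tr, y_tr)
print(clf.score(X_te, y_te))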
Import packages
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
#Dimensionality reduction not only saves time; more importantly, it can improve accuracy
from sklearn.decomposition import PCA
#Read the data
digits = pd.read_csv('./train.csv')
digits.head()
|   | label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 785 columns
import matplotlib.pyplot as plt
%matplotlib inline
digits.shape
(42000, 785)
import numpy as np
index = np.random.randint(0,42000,size = 1)
#For a DataFrame, fetch rows with df.loc[?]
#If ? is a scalar label, the result is a Series
#If ? is a list like [label], the result is a DataFrame
image = digits.loc[index[0]][1:].values.reshape((28,28))
plt.figure(figsize=(2,2))
plt.imshow(image,cmap = 'gray')
<matplotlib.image.AxesImage at 0x7fcbb7acc7b8>
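A quick illustration of the df.loc rule noted in the comments above (a toy DataFrame, not part of the notebook):
# A scalar label returns a Series; a list of labels returns a DataFrame
demo = DataFrame({'a': [1, 2], 'b': [3, 4]})
print(type(demo.loc[0]))    # pandas Series
print(type(demo.loc[[0]]))  # pandas DataFrame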
#42000 samples in total
digits['pixel20'].unique()
array([0])
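pixel20 never varies, and many border pixels are all zeros across the dataset. A quick check of how many columns are constant (the count is not reported in the original notebook, so treat this as a sanity check rather than a result):
# Pixel columns that take a single value across all 42000 rows carry no
# information, which is part of why dimensionality reduction helps here
constant_cols = (digits.drop('label', axis=1).nunique() == 1).sum()
print(constant_cols)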
#PCA: besides saving computation time, it can also improve accuracy
y = digits['label']
x = digits.drop('label',axis = 1)
x.head()
|   | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 784 columns
X_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.1)
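With test_size = 0.1, the 42000 rows split into 37800 training and 4200 test samples; a quick shape check (not in the original notebook) confirms this:
# Expected: (37800, 784) (4200, 784)
print(X_train.shape, x_test.shape)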
knn = KNeighborsClassifier(n_neighbors= 10 )
knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='uniform')
knn.score(x_test,y_test)
KeyboardInterrupt: the cell was interrupted by hand. knn.score calls predict, which runs kneighbors over all 4200 raw 784-dimensional test samples via joblib, and it was too slow to finish.
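One way to get a rough accuracy estimate without waiting for the full test set is to score on a small slice of it; this is an alternative not run in the original notebook:
# Score KNN on only the first 200 raw test samples for a fast, rough estimate
knn.score(x_test[:200], y_test[:200])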
#Dimensionality reduction with PCA
pca = PCA(n_components=150,whiten=True,svd_solver='randomized')
pca.fit(x)
PCA(copy=True, iterated_power='auto', n_components=150, random_state=None, svd_solver='randomized', tol=0.0, whiten=True)
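It is worth checking how much of the pixel variance the 150 components retain; the notebook does not report this number, so the line below is only a suggested check:
# Fraction of the total variance captured by the 150 PCA components
print(pca.explained_variance_ratio_.sum())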
X_train_pca = pca.transform(X_train)
x_test_pca = pca.transform(x_test)
knn.fit(X_train_pca,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='uniform')
Question: is there a gap in accuracy between training with PCA dimensionality reduction and without it?
knn.score(x_test_pca,y_test)
0.86952380952380948
svc = SVC()
svc.fit(X_train_pca,y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
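This SVC uses the default rbf kernel with gamma='auto'. If tuning is wanted (for example the gamma = 0.001 mentioned in the outline), a small grid search over the PCA features is a natural next step; the grid below is illustrative and can be slow on all 37800 training rows, so a subsample may be preferable:
from sklearn.model_selection import GridSearchCV
# Illustrative grid over C and gamma on the PCA-reduced features
param_grid = {'C': [1, 5, 10], 'gamma': [0.001, 0.005, 'auto']}
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train_pca, y_train)
print(grid.best_params_, grid.best_score_)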
import time
print(time.time())
svc_original = SVC()
svc_original.fit(X_train,y_train)
score = svc_original.score(x_test[-420:],y_test[-420:])
# Note: %d truncates the float accuracy to an integer, which is why the output below prints 0
print('Accuracy without dimensionality reduction: %d'%(score))
print(time.time())
1510196966.2637584 Accuracy without dimensionality reduction: 0 1510200943.6910596
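The two timestamps above differ by roughly 3977 seconds, i.e. fitting and scoring SVC on the raw 784-dimensional pixels took over an hour. Timing the PCA-based model the same way makes the comparison explicit (a sketch, not part of the original run):
# Fit a fresh SVC on the 150 PCA components and report the elapsed wall-clock time
start = time.time()
svc_pca = SVC()
svc_pca.fit(X_train_pca, y_train)
print('SVC fit on 150 PCA components took %.1f seconds' % (time.time() - start))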
svc.score(x_test_pca[-420:],y_test[-420:])
0.97619047619047616
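To turn this into Kaggle predictions, the same fitted PCA must be applied to the competition's unlabeled test set; the sketch below assumes a test.csv with the same 784 pixel columns sits next to train.csv, and the ImageId/Label column names follow the digit-recognizer submission format:
# Project the unlabeled test set with the already-fitted PCA and write a submission file
test = pd.read_csv('./test.csv')
test_pca = pca.transform(test)
pred = svc.predict(test_pca)
submission = DataFrame({'ImageId': np.arange(1, len(pred) + 1), 'Label': pred})
submission.to_csv('./submission.csv', index=False)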