Import packages and use SVM
Read the data with datasets
Split training and test data with train_test_split (it can split several arrays at once)
Plot the first 100 images
Create an SVC model with gamma = 0.001
Train the model
Predict and visualize the results (a minimal sketch of this whole workflow follows the outline)
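Before switching to the Kaggle CSV below, here is a minimal sketch of the workflow described in this outline, using sklearn's built-in digits dataset; the variable names are illustrative and not part of the notebook.
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Load the built-in 8x8 digit images and hold out 20% for evaluation
digits_ds = datasets.load_digits()
X_tr, X_te, y_tr, y_te = train_test_split(digits_ds.data, digits_ds.target, test_size=0.2)
# SVC with the small gamma mentioned in the outline
clf = SVC(gamma=0.001)
clf.fit(X_tr, y_tr)
print(clf.score(X_te, y_te))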
Import packages
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
#Dimensionality reduction not only saves time; more importantly, it can improve accuracy
from sklearn.decomposition import PCA
#Read the data
digits = pd.read_csv('./train.csv')
digits.head()
|   | label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 785 columns
import matplotlib.pyplot as plt
%matplotlib inline
digits.shape
(42000, 785)
import numpy as np
index = np.random.randint(0,42000,size = 1)
#For a DataFrame, fetch rows with df.loc[?]
#If ? is a scalar label, the result is a Series
#If ? is a list like [label], the result is a DataFrame
image = digits.loc[index[0]][1:].values.reshape((28,28))
plt.figure(figsize=(2,2))
plt.imshow(image,cmap = 'gray')
<matplotlib.image.AxesImage at 0x7fcbb7acc7b8>
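A quick illustration of the df.loc rule noted in the comments above (a toy DataFrame, not part of the notebook):
# A scalar label returns a Series; a list of labels returns a DataFrame
demo = DataFrame({'a': [1, 2], 'b': [3, 4]})
print(type(demo.loc[0]))    # pandas Series
print(type(demo.loc[[0]]))  # pandas DataFrame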
#42000 samples in total
digits['pixel20'].unique()
array([0])
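pixel20 never varies, and many border pixels are all zeros across the dataset. A quick check of how many columns are constant (the count is not reported in the original notebook, so treat this as a sanity check rather than a result):
# Pixel columns that take a single value across all 42000 rows carry no
# information, which is part of why dimensionality reduction helps here
constant_cols = (digits.drop('label', axis=1).nunique() == 1).sum()
print(constant_cols)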
#PCA: besides saving computation time, it can also improve accuracy
y = digits['label']
x = digits.drop('label',axis = 1)
x.head()
|   | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 784 columns
X_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.1)
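With test_size = 0.1, the 42000 rows split into 37800 training and 4200 test samples; a quick shape check (not in the original notebook) confirms this:
# Expected: (37800, 784) (4200, 784)
print(X_train.shape, x_test.shape)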
knn = KNeighborsClassifier(n_neighbors= 10 )
knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='uniform')
knn.score(x_test,y_test)
KeyboardInterrupt: the cell was interrupted by hand. knn.score calls predict, which runs kneighbors over all 4200 raw 784-dimensional test samples via joblib, and it was too slow to finish.
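One way to get a rough accuracy estimate without waiting for the full test set is to score on a small slice of it; this is an alternative not run in the original notebook:
# Score KNN on only the first 200 raw test samples for a fast, rough estimate
knn.score(x_test[:200], y_test[:200])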
#Dimensionality reduction with PCA
pca = PCA(n_components=150,whiten=True,svd_solver='randomized')
pca.fit(x)
PCA(copy=True, iterated_power='auto', n_components=150, random_state=None, svd_solver='randomized', tol=0.0, whiten=True)
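It is worth checking how much of the pixel variance the 150 components retain; the notebook does not report this number, so the line below is only a suggested check:
# Fraction of the total variance captured by the 150 PCA components
print(pca.explained_variance_ratio_.sum())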
X_train_pca = pca.transform(X_train)
x_test_pca = pca.transform(x_test)
knn.fit(X_train_pca,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights='uniform')
Question: is there a gap in accuracy between training with PCA dimensionality reduction and without it?
knn.score(x_test_pca,y_test)
0.86952380952380948
svc = SVC()
svc.fit(X_train_pca,y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
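This SVC uses the default rbf kernel with gamma='auto'. If tuning is wanted (for example the gamma = 0.001 mentioned in the outline), a small grid search over the PCA features is a natural next step; the grid below is illustrative and can be slow on all 37800 training rows, so a subsample may be preferable:
from sklearn.model_selection import GridSearchCV
# Illustrative grid over C and gamma on the PCA-reduced features
param_grid = {'C': [1, 5, 10], 'gamma': [0.001, 0.005, 'auto']}
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train_pca, y_train)
print(grid.best_params_, grid.best_score_)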
import time
print(time.time())
svc_original = SVC()
svc_original.fit(X_train,y_train)
score = svc_original.score(x_test[-420:],y_test[-420:])
# Note: %d truncates the float accuracy to an integer, which is why the output below prints 0
print('Accuracy without dimensionality reduction: %d'%(score))
print(time.time())
1510196966.2637584 Accuracy without dimensionality reduction: 0 1510200943.6910596
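The two timestamps above differ by roughly 3977 seconds, i.e. fitting and scoring SVC on the raw 784-dimensional pixels took over an hour. Timing the PCA-based model the same way makes the comparison explicit (a sketch, not part of the original run):
# Fit a fresh SVC on the 150 PCA components and report the elapsed wall-clock time
start = time.time()
svc_pca = SVC()
svc_pca.fit(X_train_pca, y_train)
print('SVC fit on 150 PCA components took %.1f seconds' % (time.time() - start))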
svc.score(x_test_pca[-420:],y_test[-420:])
0.97619047619047616
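To turn this into Kaggle predictions, the same fitted PCA must be applied to the competition's unlabeled test set; the sketch below assumes a test.csv with the same 784 pixel columns sits next to train.csv, and the ImageId/Label column names follow the digit-recognizer submission format:
# Project the unlabeled test set with the already-fitted PCA and write a submission file
test = pd.read_csv('./test.csv')
test_pca = pca.transform(test)
pred = svc.predict(test_pca)
submission = DataFrame({'ImageId': np.arange(1, len(pred) + 1), 'Label': pred})
submission.to_csv('./submission.csv', index=False)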