#!/usr/bin/env python
# coding: utf-8

# ### 本教程教你如何使用 SVM 分类 [Mnist 数据集](http://yann.lecun.com/exdb/mnist/)
# 
# MNIST 包含一个 60,000 个示例的训练集和一个 10,000 个示例的测试集。这些数字已经过尺寸归一化，并在固定大小 (28*28) 的图像中居中。
# 
# #### 1. 下载数据集
# 
# 简单的话，可以通过 Scikit-Learn 获取 MNIST。

# In[1]:


from sklearn.datasets import fetch_openml

# Download MNIST from OpenML. The dataset version is pinned so reruns fetch the
# same data, and as_frame=True guarantees pandas objects, which the later
# `.to_numpy()` and `.cat.categories` calls in this script rely on.
mnist = fetch_openml("mnist_784", version=1, as_frame=True)


# #### 2. 分析数据集
# 

# In[2]:


# List the fields of the fetched dataset; 'data' holds the features (one image
# per row) and 'target' holds the labels.
print(mnist.keys())
# Print the dataset's description text.
print(mnist.DESCR)


# 查看下 数据维度和类型

# In[3]:


# Inspect the feature shape, the label shape, and the label container type.
(mnist.data.shape, mnist.target.shape, type(mnist.target))


# 为了方便，我们将 pandas 对象转为 numpy 数组

# In[6]:


# Convert the features and labels to plain NumPy arrays via `.to_numpy()`.
x, y = (mnist[field].to_numpy() for field in ("data", "target"))
# Inspect the resulting array shapes.
(x.shape, y.shape)


# 画图显示下数据和对应的标签

# In[7]:


import matplotlib.pyplot as plt

# Draw the first 64 samples on an 8 x 8 grid of tick-free axes.
fig, axes = plt.subplots(
    nrows=8,
    ncols=8,
    figsize=(10, 10),
    subplot_kw=dict(xticks=[], yticks=[]),
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
# Each sample's label is written in green near the lower-left corner.
for i, ax in enumerate(axes.ravel()):
    ax.imshow(x[i].reshape((28, 28)), cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(y[i]), color='green', transform=ax.transAxes)


# #### 3. 数据划分

# In[12]:


from sklearn.model_selection import train_test_split
# Split into train and test sets. test_size=0.25 is scikit-learn's default,
# written out here for clarity; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
# Inspect the training set shapes.
(Xtrain.shape, ytrain.shape)


# #### 4. 定义模型对象

# In[9]:


from sklearn.svm import SVC
# RBF-kernel support vector classifier with class weights set to 'balanced'.
model = SVC(class_weight='balanced', kernel='rbf')


# #### 5. 训练模型

# In[13]:


# Train on the raw pixel features (this takes a while).
model.fit(X=Xtrain, y=ytrain)


# 将训练好的模型来跑下测试集

# In[15]:


# Predict labels for the held-out test set (this also takes a while).
ypred = model.predict(X=Xtest)


# 统计下模型分类精度

# In[16]:


from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=ytest, y_pred=ypred)


# 默认的参数看来模型不太行啊，没有达到 99%。下面画一下混淆矩阵

# In[17]:


from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix: rows are true labels, columns are predicted labels.
mat = confusion_matrix(y_true=ytest, y_pred=ypred)

# Annotate every cell with its count; the '.20g' format avoids scientific notation.
sns.heatmap(
    data=mat,
    annot=True,
    fmt='.20g',
    annot_kws=dict(fontsize=8),
    square=True,
    cbar=False,
)
plt.xlabel('predicted value')
plt.ylabel('true value')


# 预测错误的 标注为 红色，对的为 绿色

# In[18]:


# Show the first 25 test images on a 5 x 5 grid, each annotated with its
# predicted label.
fig, axes = plt.subplots(
    nrows=5,
    ncols=5,
    figsize=(28, 28),
    subplot_kw=dict(xticks=[], yticks=[]),
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)

# View each flattened test sample as a 28 x 28 image.
test_images = Xtest.reshape((-1, 28, 28))

for i, ax in enumerate(axes.ravel()):
    ax.imshow(test_images[i], cmap='binary', interpolation='nearest')
    # Red marks a misclassified sample, green a correctly classified one.
    ax.text(
        0.05,
        0.05,
        str(ypred[i]),
        color='red' if (ytest[i] != ypred[i]) else 'green',
        fontsize=30,
        transform=ax.transAxes,
    )


# 统计结果

# In[19]:


from sklearn.metrics import classification_report
# Use the target column's category values as the row names of the report.
target_names = [*mnist['target'].cat.categories]
# Print scikit-learn's classification report for the test predictions.
print(
    classification_report(
        y_true=ytest,
        y_pred=ypred,
        target_names=target_names,
    )
)


# ## PCA 降维+SVM 加速训练

# In[21]:


from sklearn.svm import SVC
from sklearn.decomposition import PCA as RandomizedPCA
from sklearn.pipeline import make_pipeline

# Reduce the 28*28 pixel features to 10 whitened principal components, then
# classify the reduced features with an RBF-kernel SVM, chained in one pipeline.
pca = RandomizedPCA(whiten=True, n_components=10, random_state=42)
svc = SVC(class_weight='balanced', kernel='rbf')
model = make_pipeline(pca, svc)


# In[22]:


# Fit the PCA + SVM pipeline. Original author's note: training is noticeably
# faster than on the raw pixels; consider timing it.
model.fit(X=Xtrain, y=ytrain)


# In[23]:


# Predict test-set labels with the fitted PCA + SVM pipeline.
ypred = model.predict(X=Xtest)


# In[24]:


from sklearn.metrics import accuracy_score
# Test accuracy of the PCA + SVM pipeline. Original author's note: it is lower
# than before, so the default SVM parameters seem to perform only so-so.
accuracy_score(y_true=ytest, y_pred=ypred)


# 其余步骤请参考上面内容...

# In[ ]: