A simple analysis and visualization of the MNIST dataset, comparing how several classification algorithms perform on the dimensionality-reduced MNIST data
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn import neighbors, datasets
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE, Isomap
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import gzip
import pickle
Visualization
Load the data
In [2]:
f = gzip.open('datasets/mnist.pkl.gz', 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
f.close()
Check the dimensions
In [3]:
training_data[0].shape
Out[3]:
(50000, 784)
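The other two splits can be checked the same way; as the code above and below uses them, each split is an `(images, labels)` pair with every 28×28 image flattened to 784 values. A minimal sanity check:
In [ ]:
# Each split is an (images, labels) pair; images are flattened 28x28 = 784 pixels.
for split_name, (images, labels) in zip(
        ['training', 'validation', 'test'],
        [training_data, validation_data, test_data]):
    print(split_name, images.shape, labels.shape)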
Show a sample
In [4]:
plt.imshow(training_data[0][0].reshape(28, 28))
plt.gcf().set_size_inches(6, 6)
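For a slightly richer look at the data, a small grid of the first few digits together with their labels can be drawn; a minimal sketch using the same arrays:
In [ ]:
# Show the first 10 training digits, with the label of each as its title.
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for idx, ax in enumerate(axes.ravel()):
    ax.imshow(training_data[0][idx].reshape(28, 28), cmap='gray')
    ax.set_title(str(training_data[1][idx]))
    ax.axis('off')
plt.show()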
PCA
In [5]:
X = training_data[0]
y = training_data[1]
pca = PCA(n_components=2)
pca.fit(X)  # PCA is unsupervised; the labels are not used
X_t = pca.transform(X)
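Two components typically capture only a modest fraction of MNIST's total variance; the fitted model's `explained_variance_ratio_` quantifies exactly how much:
In [ ]:
# Fraction of total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
print('total:', pca.explained_variance_ratio_.sum())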
Visualizing the transformation matrix
In [6]:
# Projection vector for the first component
plt.subplot(2, 1, 1)
plt.imshow(pca.components_[0, :].reshape(28, 28))
# Projection vector for the second component
plt.subplot(2, 1, 2)
plt.imshow(pca.components_[1, :].reshape(28, 28))
plt.gcf().set_size_inches(6, 12)
plt.show()
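The same two component vectors also define a (lossy) reconstruction: `inverse_transform` maps a 2-D code back to a 784-pixel image, which shows how much detail the projection discards. A minimal sketch comparing the first digit with its reconstruction:
In [ ]:
# Reconstruct the first digit from its 2-D code and compare with the original.
recon = pca.inverse_transform(X_t[:1])
plt.subplot(1, 2, 1)
plt.imshow(X[0].reshape(28, 28))
plt.title('original')
plt.subplot(1, 2, 2)
plt.imshow(recon[0].reshape(28, 28))
plt.title('2-component reconstruction')
plt.show()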
Violin plot visualization
In [7]:
stacked = np.hstack([X_t, y.reshape(-1, 1)])
df = pd.DataFrame(stacked, columns=['feat1', 'feat2', 'class'])
df.head()
Out[7]:
(first five rows of the DataFrame: feat1, feat2, class)
In [8]:
fig = plt.figure(figsize=(10, 10), dpi=100)
for column_index, column in enumerate(df.columns):
    if column == 'class':
        continue
    ax = fig.add_subplot(2, 1, column_index + 1, frameon=True)
    sns.violinplot(x='class', y=column, data=df, ax=ax)
Comparing dimensionality reduction methods
In [9]:
def plot_1D(ax, X, y, name):
    ax.scatter(X[:, 0], np.zeros(X.shape[0]), c=y, cmap=cm.rainbow, alpha=0.5)
    ax.set_title(name + " 1D")

def plot_2D(ax, X, y, name):
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm.rainbow, alpha=0.5)
    ax.set_title(name + " 2D")

def plot_3D(ax, X, y, reduction_name):
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=cm.rainbow, alpha=0.5)
    # Label each digit class at the centroid of its points
    for label in np.unique(y):
        ax.text3D(X[y == label, 0].mean(),
                  X[y == label, 1].mean() + 1.5,
                  X[y == label, 2].mean(), str(label),
                  horizontalalignment='center',
                  bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
    ax.set_title(reduction_name + " 3D")
    ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
In [10]:
D_reductions = [PCA, MDS, LDA, Isomap, LLE, TSNE]
plots = [plot_1D, plot_2D, plot_3D]
names = ['PCA', 'MDS', 'LDA', 'Isomap', 'LLE', 'TSNE']
# MNIST
n = 1
data_limits = 100  # use only the first 100 samples
fig = plt.figure(figsize=(15, 25), dpi=100)
for index, reduction in enumerate(D_reductions):
    for i in range(3):
        if reduction == LDA and i == 2:
            # leave the LDA 3D grid slot empty
            n += 1
            continue
        X = training_data[0][:data_limits]
        y = training_data[1][:data_limits]
        instance = reduction(n_components=i+1)
        X = instance.fit_transform(X, y)
        if i < 2:
            ax = fig.add_subplot(len(D_reductions), 3, n, frameon=True)
        else:
            ax = fig.add_subplot(len(D_reductions), 3, n,
                                 elev=48, azim=134, projection='3d')
        plots[i](ax, X, y, names[index])
        n += 1
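These panels are only qualitative. For a rough quantitative comparison, `trustworthiness` (exposed from `sklearn.manifold` in recent scikit-learn releases) scores how well an embedding preserves local neighborhoods; a hedged sketch over the 2-D embeddings on the same 100-sample subset, with `n_neighbors=5` chosen arbitrarily:
In [ ]:
from sklearn.manifold import trustworthiness

# Score each method's 2-D embedding: 1.0 means local neighborhoods are preserved.
X_sub = training_data[0][:data_limits]
y_sub = training_data[1][:data_limits]
for reduction, name in zip(D_reductions, names):
    emb = reduction(n_components=2).fit_transform(X_sub, y_sub)
    print('%-8s %.3f' % (name, trustworthiness(X_sub, emb, n_neighbors=5)))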
Comparing classification methods
"KNN", "Linear SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA"
In [11]:
h = .02  # step size of the decision-boundary mesh
names = ["KNN", "Linear SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA"]
classifiers = [
    KNN(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA()]
# Classify the dimensionality-reduced MNIST data
mnist_t = [stacked[:100, :-1], stacked[:100, -1]]
reduced_datasets = [mnist_t]  # renamed to avoid shadowing sklearn.datasets
figure = plt.figure(figsize=(15, 15), dpi=100)
i = 1
for ds in reduced_datasets:
    # Preprocess: standardize, then split into train/test sets
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Plot the input data first
    cmap = plt.cm.rainbow  # renamed to avoid shadowing matplotlib.cm
    ax = plt.subplot(len(reduced_datasets)*3, (len(classifiers) + 3)//3, i)
    # training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap)
    # testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(reduced_datasets)*3, (len(classifiers) + 3)//3, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Predict over the mesh to draw the decision boundary
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cmap, alpha=.3)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap,
                   alpha=0.5)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # Annotate the test accuracy in the lower-right corner
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
plt.show()
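A single 60/40 split of 100 points gives fairly noisy accuracy estimates; cross-validation averages over several splits. A minimal sketch with `cross_val_score` on the same standardized features (5 folds is an arbitrary choice, and the default stratification assumes every digit appears at least five times in the subset):
In [ ]:
from sklearn.model_selection import cross_val_score

# Mean and spread of 5-fold cross-validated accuracy for each classifier.
X_cv, y_cv = mnist_t
X_cv = StandardScaler().fit_transform(X_cv)
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_cv, y_cv, cv=5)
    print('%-13s %.2f +/- %.2f' % (name, scores.mean(), scores.std()))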