A simple analysis and visualization of the MNIST dataset, comparing how several classification algorithms perform on the dimensionality-reduced MNIST data
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn import neighbors, datasets
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE, Isomap
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import gzip
import pickle
Visualization
Load the data
In [2]:
f = gzip.open('datasets/mnist.pkl.gz', 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
f.close()
Check the dimensions
In [3]:
training_data[0].shape
Out[3]:
(50000, 784)
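The other two splits can be checked the same way; as the code above and below uses them, each split is an `(images, labels)` pair with every 28×28 image flattened to 784 values. A minimal sanity check:
In [ ]:
# Each split is an (images, labels) pair; images are flattened 28x28 = 784 pixels.
for split_name, (images, labels) in zip(
        ['training', 'validation', 'test'],
        [training_data, validation_data, test_data]):
    print(split_name, images.shape, labels.shape)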
Show a sample
In [4]:
plt.imshow(training_data[0][0].reshape(28, 28))
plt.gcf().set_size_inches(6, 6)
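For a slightly richer look at the data, a small grid of the first few digits together with their labels can be drawn; a minimal sketch using the same arrays:
In [ ]:
# Show the first 10 training digits, with the label of each as its title.
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for idx, ax in enumerate(axes.ravel()):
    ax.imshow(training_data[0][idx].reshape(28, 28), cmap='gray')
    ax.set_title(str(training_data[1][idx]))
    ax.axis('off')
plt.show()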
PCA
In [5]:
X = training_data[0]
y = training_data[1]
pca = PCA(n_components=2)
pca.fit(X)  # PCA is unsupervised; the labels are not used
X_t = pca.transform(X)
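Two components typically capture only a modest fraction of MNIST's total variance; the fitted model's `explained_variance_ratio_` quantifies exactly how much:
In [ ]:
# Fraction of total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
print('total:', pca.explained_variance_ratio_.sum())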
Visualizing the transformation matrix
In [6]:
# Projection vector for the first component
plt.subplot(2, 1, 1)
plt.imshow(pca.components_[0, :].reshape(28, 28))
# Projection vector for the second component
plt.subplot(2, 1, 2)
plt.imshow(pca.components_[1, :].reshape(28, 28))
plt.gcf().set_size_inches(6, 12)
plt.show()
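The same two component vectors also define a (lossy) reconstruction: `inverse_transform` maps a 2-D code back to a 784-pixel image, which shows how much detail the projection discards. A minimal sketch comparing the first digit with its reconstruction:
In [ ]:
# Reconstruct the first digit from its 2-D code and compare with the original.
recon = pca.inverse_transform(X_t[:1])
plt.subplot(1, 2, 1)
plt.imshow(X[0].reshape(28, 28))
plt.title('original')
plt.subplot(1, 2, 2)
plt.imshow(recon[0].reshape(28, 28))
plt.title('2-component reconstruction')
plt.show()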
Violin plot visualization
In [7]:
stacked = np.hstack([X_t, y.reshape(-1, 1)])
df = pd.DataFrame(stacked, columns=['feat1', 'feat2', 'class'])
df.head()
Out[7]:
(first five rows of the DataFrame: feat1, feat2, class)
In [8]:
fig = plt.figure(figsize=(10, 10), dpi=100)
for column_index, column in enumerate(df.columns):
    if column == 'class':
        continue
    ax = fig.add_subplot(2, 1, column_index + 1, frameon=True)
    sns.violinplot(x='class', y=column, data=df, ax=ax)
Comparing dimensionality reduction methods
In [9]:
def plot_1D(ax, X, y, name):
    ax.scatter(X[:, 0], np.zeros(X.shape[0]), c=y, cmap=cm.rainbow, alpha=0.5)
    ax.set_title(name + " 1D")

def plot_2D(ax, X, y, name):
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm.rainbow, alpha=0.5)
    ax.set_title(name + " 2D")

def plot_3D(ax, X, y, reduction_name):
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=cm.rainbow, alpha=0.5)
    # Label each digit class at the centroid of its points
    for label in np.unique(y):
        ax.text3D(X[y == label, 0].mean(),
                  X[y == label, 1].mean() + 1.5,
                  X[y == label, 2].mean(), str(label),
                  horizontalalignment='center',
                  bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
    ax.set_title(reduction_name + " 3D")
    ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
In [10]:
D_reductions = [PCA, MDS, LDA, Isomap, LLE, TSNE]
plots = [plot_1D, plot_2D, plot_3D]
names = ['PCA', 'MDS', 'LDA', 'Isomap', 'LLE', 'TSNE']
# MNIST
n = 1
data_limits = 100  # use only the first 100 samples
fig = plt.figure(figsize=(15, 25), dpi=100)
for index, reduction in enumerate(D_reductions):
    for i in range(3):
        if reduction == LDA and i == 2:
            # leave the LDA 3D grid slot empty
            n += 1
            continue
        X = training_data[0][:data_limits]
        y = training_data[1][:data_limits]
        instance = reduction(n_components=i+1)
        X = instance.fit_transform(X, y)
        if i < 2:
            ax = fig.add_subplot(len(D_reductions), 3, n, frameon=True)
        else:
            ax = fig.add_subplot(len(D_reductions), 3, n,
                                 elev=48, azim=134, projection='3d')
        plots[i](ax, X, y, names[index])
        n += 1
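These panels are only qualitative. For a rough quantitative comparison, `trustworthiness` (exposed from `sklearn.manifold` in recent scikit-learn releases) scores how well an embedding preserves local neighborhoods; a hedged sketch over the 2-D embeddings on the same 100-sample subset, with `n_neighbors=5` chosen arbitrarily:
In [ ]:
from sklearn.manifold import trustworthiness

# Score each method's 2-D embedding: 1.0 means local neighborhoods are preserved.
X_sub = training_data[0][:data_limits]
y_sub = training_data[1][:data_limits]
for reduction, name in zip(D_reductions, names):
    emb = reduction(n_components=2).fit_transform(X_sub, y_sub)
    print('%-8s %.3f' % (name, trustworthiness(X_sub, emb, n_neighbors=5)))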
Comparing classification methods
"KNN", "Linear SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA"
In [11]:
h = .02  # step size of the decision-boundary mesh
names = ["KNN", "Linear SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA"]
classifiers = [
    KNN(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA()]
# Classify the dimensionality-reduced MNIST data
mnist_t = [stacked[:100, :-1], stacked[:100, -1]]
reduced_datasets = [mnist_t]  # renamed to avoid shadowing sklearn.datasets
figure = plt.figure(figsize=(15, 15), dpi=100)
i = 1
for ds in reduced_datasets:
    # Preprocess: standardize, then split into train/test sets
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Plot the input data first
    cmap = plt.cm.rainbow  # renamed to avoid shadowing matplotlib.cm
    ax = plt.subplot(len(reduced_datasets)*3, (len(classifiers) + 3)//3, i)
    # training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap)
    # testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(reduced_datasets)*3, (len(classifiers) + 3)//3, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Predict over the mesh to draw the decision boundary
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cmap, alpha=.3)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap,
                   alpha=0.5)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # Annotate the test accuracy in the lower-right corner
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
plt.show()
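A single 60/40 split of 100 points gives fairly noisy accuracy estimates; cross-validation averages over several splits. A minimal sketch with `cross_val_score` on the same standardized features (5 folds is an arbitrary choice, and the default stratification assumes every digit appears at least five times in the subset):
In [ ]:
from sklearn.model_selection import cross_val_score

# Mean and spread of 5-fold cross-validated accuracy for each classifier.
X_cv, y_cv = mnist_t
X_cv = StandardScaler().fit_transform(X_cv)
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_cv, y_cv, cv=5)
    print('%-13s %.2f +/- %.2f' % (name, scores.mean(), scores.std()))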