Example: dimensionality reduction with SVD#

  • Overall flow

    • Prepare the iris dataset.

    • Compute the singular value decomposition (SVD) and compress the data to 3 dimensions.

    • For comparison, prepare a PCA version that keeps the first 3 principal components.

    • Train a LogisticRegression classifier. The train/test split is omitted here (a sketch of adding one follows the results).

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn import linear_model
from sklearn import decomposition

Preparing the dataset#

iris = datasets.load_iris()
X = iris.data
y = iris.target

print(X.shape)
print(X[0])
print(len(y))
print(y)
(150, 4)
[5.1 3.5 1.4 0.2]
150
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
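
For reference, the four columns of X can be inspected by name; feature_names is part of the bundle returned by load_iris:

# The four columns of X are the iris measurements, in cm
print(iris.feature_names)
# ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']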

Singular value decomposition (SVD)#

U, s, Vt = np.linalg.svd(X)
print(U.shape, s.shape, Vt.shape)
(150, 150) (4,) (4, 4)
print(U[0])
[-0.06161685  0.12961144  0.0021386   0.00163819 -0.07300798 -0.08134924
 -0.06909558 -0.07113949 -0.06195164 -0.06612256 -0.0779418  -0.06706386
 -0.06552289 -0.06185789 -0.08830269 -0.08976151 -0.08609401 -0.07666031
 -0.08279348 -0.07649756 -0.07454597 -0.07870518 -0.07197388 -0.07751697
 -0.06350528 -0.06858871 -0.07505083 -0.07437007 -0.07521512 -0.06493685
 -0.06604042 -0.08201589 -0.07386819 -0.08227844 -0.06867133 -0.07401577
 -0.08107661 -0.06901449 -0.06347898 -0.07258421 -0.07640179 -0.06508444
 -0.06416128 -0.08048951 -0.07430155 -0.07062043 -0.0727626  -0.06586452
 -0.07649708 -0.07198453 -0.09197854 -0.08823139 -0.09036906 -0.07299205
 -0.08712533 -0.07165625 -0.0873042  -0.06532195 -0.08381366 -0.07375745
 -0.06302969 -0.08388409 -0.07222819 -0.07795265 -0.08122842 -0.09086183
 -0.07599136 -0.0698583  -0.08193049 -0.07120772 -0.08509553 -0.08336609
 -0.07965387 -0.07251397 -0.08448281 -0.08907596 -0.08653833 -0.09104982
 -0.08142908 -0.07518959 -0.07060805 -0.06924548 -0.07732822 -0.0761784
 -0.07310193 -0.08568359 -0.08985201 -0.07980502 -0.0756386  -0.07367435
 -0.06672195 -0.07947999 -0.07580088 -0.06642552 -0.07342897 -0.07334836
 -0.07555598 -0.08159337 -0.07465988 -0.07640103 -0.09482259 -0.08093526
 -0.09634801 -0.08036141 -0.09141467 -0.09526824 -0.06927014 -0.08650523
 -0.0824033  -0.10766229 -0.09530278 -0.08723118 -0.09675863 -0.08254321
 -0.09402024 -0.09913198 -0.08477818 -0.10080471 -0.09688732 -0.07311009
 -0.10161079 -0.08330813 -0.09229571 -0.08798247 -0.09396497 -0.08964253
 -0.08806509 -0.08611647 -0.08911128 -0.08623509 -0.09252995 -0.10215519
 -0.09166004 -0.07830493 -0.06625347 -0.10774146 -0.09735974 -0.08367461
 -0.08585795 -0.09973069 -0.10211517 -0.1083868  -0.08093526 -0.09779369
 -0.10416003 -0.10397003 -0.08866275 -0.09343429 -0.09573864 -0.08085465]
print(s)
[95.95991387 17.76103366  3.46093093  1.88482631]
# Convert the singular values into a (rectangular) diagonal matrix
sigma = np.zeros((U.shape[1], Vt.shape[0]))
for i in range(Vt.shape[0]):
  sigma[i, i] = s[i]
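
The padding loop is needed because np.linalg.svd returns the full 150×150 U by default. As a sketch, the reduced ("economy") SVD obtained with the full_matrices=False option avoids the padding, since the factors then multiply directly:

# Reduced SVD: U_r is 150x4, so np.diag(s_r) fits without padding
U_r, s_r, Vt_r = np.linalg.svd(X, full_matrices=False)
print(U_r.shape, s_r.shape, Vt_r.shape)  # (150, 4) (4,) (4, 4)
print(np.allclose(X, U_r @ np.diag(s_r) @ Vt_r))  # True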

# Check how closely the SVD factors reconstruct X
approximation = np.dot(np.dot(U, sigma), Vt)
diff = X - approximation
print(np.linalg.norm(diff))
4.409066883547835e-14
# The reconstruction matches X within floating-point tolerance
np.allclose(X, approximation)
True
# Check the magnitudes of the singular values
plt.plot(s)
[<matplotlib.lines.Line2D at 0x7f8c1fa0f490>]
(Figure: line plot of the four singular values; the first value dominates.)
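
The plot makes clear that the first singular value is by far the largest. As a quick sketch, the cumulative share of the squared singular values (equivalently, of the squared Frobenius norm of X) shows how much the leading components capture:

# Cumulative share of the squared Frobenius norm of X
energy = np.cumsum(s**2) / np.sum(s**2)
print(energy)  # approx. [0.965 0.998 1.000 1.000]: k=3 loses almost nothing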

Keeping the first k=3 columns of U (= dimensionality reduction)#

k = 3
new_x = U[:, :k]
print(new_x.shape)
print(new_x[0])
print(X[0])
(150, 3)
[-0.06161685  0.12961144  0.0021386 ]
[5.1 3.5 1.4 0.2]
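
Note that the columns of U are orthonormal, so new_x carries the directions but not the scale of the data. Multiplying by the singular values recovers the projection of X onto the first k right singular vectors (the rows of Vt), which a quick check confirms:

# U[:, :k] scaled by the singular values equals X projected onto Vt[:k]
print(np.allclose(new_x * s[:k], X @ Vt[:k].T))  # True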

Dimensionality reduction with PCA for comparison#

pca = decomposition.PCA(n_components=3)
pca.fit(X)
new_x2 = pca.transform(X)
print(new_x2.shape)
print(new_x2[0])
(150, 3)
[-2.68412563  0.31939725 -0.02791483]
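
Unlike the plain SVD above, PCA centers the data before decomposing it: the principal-component scores are the reduced SVD of the mean-centered X, up to a sign flip per component. A minimal sketch of that relationship:

# PCA scores equal the SVD of the centered data, up to per-column signs
Xc = X - X.mean(axis=0)
Uc, sc, Vtc = np.linalg.svd(Xc, full_matrices=False)
print(np.allclose(np.abs(Uc[:, :3] * sc[:3]), np.abs(new_x2)))  # True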

Classification results#

scores = []
dataset = {"original": X, "SVD(k=3)": new_x, "PCA(n=3)": new_x2}
for label, data in dataset.items():
  # L1-regularized logistic regression (liblinear supports the L1 penalty);
  # the large intercept_scaling keeps the intercept mostly unregularized.
  clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear',
                                        tol=1e-6, max_iter=int(1e6),
                                        warm_start=True,
                                        intercept_scaling=10000.)
  clf.fit(data, y)
  # Accuracy on the training data itself (no train/test split here)
  scores.append({label: clf.score(data, y)})

print(scores)
[{'original': 0.9533333333333334}, {'SVD(k=3)': 0.8733333333333333}, {'PCA(n=3)': 0.9666666666666667}]
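
All three fits score high on the data they were trained on; the lower SVD(k=3) score plausibly reflects the unscaled, uncentered columns of U rather than a worse subspace. Since the split was omitted above, these are training accuracies. A minimal sketch of adding a held-out evaluation, assuming sklearn.model_selection.train_test_split (split parameters here are illustrative):

from sklearn.model_selection import train_test_split

# Evaluate on held-out data instead of the training set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)
clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear',
                                      tol=1e-6, max_iter=int(1e6))
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # held-out accuracy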