1 분 소요

PCA

붓꽃 데이터

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# Load the iris dataset and build a DataFrame with one column per feature,
# then append the class label as a trailing 'target' column.
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
# Preview the first three rows.
iris_df.head(3)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0

sepal_length, sepal_width 두 개의 특성으로 그린 산점도

# Scatter the two sepal measurements against each other, colouring every
# point by its class label.
plt.scatter(
    x=iris_df['sepal length (cm)'],
    y=iris_df['sepal width (cm)'],
    c=iris_df['target'],
)
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.show()

png

# Feature matrix and label vector taken straight from the loaded dataset.
X, y = iris.data, iris.target
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full feature matrix, then apply it to the same data
# to obtain the standardised features that PCA is run on.
scaler = StandardScaler().fit(X)
iris_scaled = scaler.transform(X)
from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardised features and project the data
# onto those components; the trailing expression displays the result's shape.
pca = PCA(n_components=2) # reduce the 4-dimensional features to 2 dimensions
iris_pca = pca.fit_transform(iris_scaled)
iris_pca.shape
(150, 2)
# Wrap the projected data in a DataFrame with named component columns and
# attach the class label so the points can be coloured by class later.
pca_columns = ['pca_component1', 'pca_component2']
iris_pca_df = pd.DataFrame(data=iris_pca, columns=pca_columns).assign(
    target=iris.target
)
# Preview the first three rows.
iris_pca_df.head(3)
pca_component1 pca_component2 target
0 -2.264703 0.480027 0
1 -2.080961 -0.674134 0
2 -2.364229 -0.341908 0

PCA로 차원축소된 새로운 특성으로 산점도

# Scatter the two PCA components against each other, colouring every point
# by its class label.
plt.scatter(
    x=iris_pca_df['pca_component1'],
    y=iris_pca_df['pca_component2'],
    c=iris_pca_df['target'],
)
plt.xlabel('pca component 1')
plt.ylabel('pca component 2')
plt.show()

png

pca.explained_variance_ratio_ # share of the variance explained by each PCA component
array([0.72962445, 0.22850762])
# Total share of variance retained by the kept components. Computed from the
# fitted PCA instead of retyping the printed numbers, so it stays correct if
# the data or n_components change. Rounded to 8 decimals to match the
# precision of the array display.
round(float(pca.explained_variance_ratio_.sum()), 8)
0.95813207

원본 데이터와 PCA 변환된 데이터로 모델 성능 측정

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: 3-fold cross-validated accuracy of a random forest trained on
# the original, untransformed features.
rf_clf = RandomForestClassifier(random_state=42)
scores = cross_val_score(
    estimator=rf_clf,
    X=X,
    y=y,
    scoring='accuracy',
    cv=3,
)
scores
array([0.98, 0.94, 0.98])
# Same 3-fold accuracy measurement, but on the two PCA components.
# NOTE(review): iris_pca comes from a scaler and PCA that were fitted on every
# row before cross-validation, so held-out rows influenced the transform.
# Cross-validating a Pipeline (scaler -> PCA -> classifier) on X would avoid
# that; left unchanged here so the printed scores stay valid.
rf_clf = RandomForestClassifier(random_state=42)
scores = cross_val_score(
    estimator=rf_clf,
    X=iris_pca,
    y=y,
    scoring='accuracy',
    cv=3,
)
scores
array([0.88, 0.88, 0.9 ])
# Display the shape of the PCA-transformed matrix used in the run above.
iris_pca.shape
(150, 2)

Reference

댓글남기기