
Decision Trees

from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import graphviz

1. Classification

iris = load_iris()
X = iris.data[:, 2:] # petal features only: 'petal length (cm)', 'petal width (cm)'
y = iris.target 
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)
DecisionTreeClassifier(max_depth=2, random_state=42)
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html
from sklearn import tree
res = tree.plot_tree(tree_clf,
               feature_names = iris.feature_names[2:],
               class_names = iris.target_names,
               rounded = True,
               filled = True)

(figure: the classification tree drawn with plot_tree)

export_graphviz(tree_clf,
               out_file = 'iris_tree.dot',
               feature_names = iris.feature_names[2:],
               class_names = iris.target_names,
               rounded = True,
               filled = True)
with open('iris_tree.dot') as f:
    dot_graph = f.read()
    
graphviz.Source(dot_graph)    

(figure: the same tree rendered via graphviz)
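The Source object displays inline in a notebook; to write the rendered tree to an image file instead, graphviz's render method can be used. A minimal sketch ('iris_tree' is just an output file name):

graphviz.Source(dot_graph).render('iris_tree', format='png', cleanup=True)  # writes iris_tree.png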

tree_clf.feature_importances_ # importances: petal length ≈ 56%, petal width ≈ 44%
array([0.56199095, 0.43800905])
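A decision tree also estimates class probabilities: it simply returns the class ratios of the leaf node a sample lands in. A quick check (petal length 5 cm and width 1.5 cm are arbitrary illustration values):

tree_clf.predict_proba([[5, 1.5]])  # per-class probability = class ratio in the matching leaf
tree_clf.predict([[5, 1.5]])        # the majority class of that leaf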

2. Regression (numeric prediction)

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)

m = 200
X = np.random.rand(m, 1)
# y = 4(X-0.5)**2
y = 4 * (X-0.5)**2 + np.random.randn(m, 1)/10
plt.plot(X, y, 'b.')
plt.show()

(figure: scatter plot of the noisy quadratic training data)

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X,y)
DecisionTreeRegressor(max_depth=2, random_state=42)
plt.figure(figsize=(14, 8))
res = tree.plot_tree(tree_reg,
               feature_names = ['x1'],
               rounded = True,
               filled = True,
               fontsize = 14)  # class_names only applies to classification trees, so it is omitted here

(figure: the regression tree drawn with plot_tree)

tree_reg.predict([[0.6]])
array([0.11063973])
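A tree regressor predicts the mean target value of a leaf, so its prediction is a step function of the input. A sketch that makes this visible by evaluating the fitted model over a dense grid (reusing X, y, and tree_reg from above):

x_grid = np.linspace(0, 1, 500).reshape(-1, 1)  # dense grid over the input range [0, 1]
y_grid = tree_reg.predict(x_grid)               # piecewise-constant predictions

plt.plot(X, y, 'b.', label='data')
plt.plot(x_grid, y_grid, 'r-', linewidth=2, label='tree prediction')
plt.legend()
plt.show()

With max_depth=2 the tree has at most four leaves, so the prediction curve has at most four distinct levels.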

Ensemble Learning

1. Voting Classifiers

from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import VotingClassifier

# individual classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X.shape, y.shape
((500, 2), (500,))
plt.plot(X[:, 0][y==0], X[:, 1][y==0], 'b.')
plt.plot(X[:, 0][y==1], X[:, 1][y==1], 'r.')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

(figure: the two classes of the make_moons dataset)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(375, 2) (125, 2) (375,) (125,)

1.1 Hard Voting

lr_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)


voting_clf = VotingClassifier(
                    estimators=[('lr', lr_clf), ('rf', rnd_clf), ('svm', svm_clf)],
                    voting = 'hard'
              )
from sklearn.metrics import accuracy_score
for clf in (lr_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912
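Hard voting is nothing more than a per-sample majority vote over the individual predictions, so the ensemble score can be reproduced by hand. A sketch using the three classifiers fitted in the loop above (for binary 0/1 labels, thresholding the mean of the votes is equivalent to a majority vote):

preds = np.array([clf.predict(X_test) for clf in (lr_clf, rnd_clf, svm_clf)])  # shape (3, n_test)
majority_vote = (preds.mean(axis=0) >= 0.5).astype(int)  # 2 out of 3 votes decide the class
accuracy_score(y_test, majority_vote)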

1.2 Soft Voting

lr_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)


voting_clf = VotingClassifier(
                    estimators=[('lr', lr_clf), ('rf', rnd_clf), ('svm', svm_clf)],
                    voting = 'soft'
              )
from sklearn.metrics import accuracy_score
for clf in (lr_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92
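Soft voting averages the predicted class probabilities and picks the argmax, which is why SVC needs probability=True here. The equivalent computation done by hand, as a sketch:

probas = np.mean([clf.predict_proba(X_test) for clf in (lr_clf, rnd_clf, svm_clf)], axis=0)
y_pred_soft = probas.argmax(axis=1)  # class with the highest averaged probability
accuracy_score(y_test, y_pred_soft)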

2. Bagging Ensembles

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500,
                            max_samples=100, bootstrap=True, random_state=42, n_jobs=-1)

bag_clf.fit(X_train, y_train)
BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  max_samples=100, n_estimators=500, n_jobs=-1,
                  random_state=42)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred) # accuracy
0.904
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

y_pred = tree_clf.predict(X_test)
accuracy_score(y_test, y_pred)
0.856
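The gap between 0.904 and 0.856 is easier to appreciate as decision boundaries: a single unconstrained tree carves out jagged, overfit regions, while the bagged ensemble averages them into a smoother boundary. A plotting sketch (the grid limits are chosen by eye for this dataset):

def plot_decision_boundary(clf, X, y):
    # evaluate the classifier on a grid covering the data and shade the regions
    x1, x2 = np.meshgrid(np.linspace(-1.5, 2.5, 200), np.linspace(-1.0, 1.5, 200))
    y_grid = clf.predict(np.c_[x1.ravel(), x2.ravel()]).reshape(x1.shape)
    plt.contourf(x1, x2, y_grid, alpha=0.3)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], 'b.')
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], 'r.')

plt.figure(figsize=(11, 4))
plt.subplot(121); plot_decision_boundary(tree_clf, X, y); plt.title('single tree')
plt.subplot(122); plot_decision_boundary(bag_clf, X, y); plt.title('bagging')
plt.show()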

OOB Evaluation

bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500,
                            max_samples=100, bootstrap=True, oob_score=True, random_state=42, n_jobs=-1)

bag_clf.fit(X_train, y_train)
BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  max_samples=100, n_estimators=500, n_jobs=-1, oob_score=True,
                  random_state=42)
bag_clf.oob_score_ # can stand in for cross-validation
0.9253333333333333
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred) # accuracy on the test set
0.904
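With oob_score=True, scikit-learn also keeps the per-sample out-of-bag class probabilities in the fitted attribute oob_decision_function_, which is what the OOB score is computed from:

bag_clf.oob_decision_function_.shape  # (n_train_samples, n_classes)
bag_clf.oob_decision_function_[:5]    # OOB class probabilities for the first 5 training samples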
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_leaf_nodes=16, random_state=42), n_estimators=500,
                            bootstrap=True, oob_score=True, random_state=42, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)
0.912
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)
0.912
rnd_clf.feature_importances_ # importances: x1 ≈ 42%, x2 ≈ 58%
array([0.42253629, 0.57746371])
