Machine learning 15 (캘리포니아 주택 가격 예측 baseline )

2023-03-09 10 분 소요

캘리포니아 주택 가격 예측 모델 만들기

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

1. 데이터 가져오기

housing = pd.read_csv('./datasets/housing.csv')

2. 데이터 훑어보기

housing.head()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

범주형 특성 탐색

housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

수치형 특성 탐색

housing.describe()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
count	20640.000000	20640.000000	20640.000000	20640.000000	20433.000000	20640.000000	20640.000000	20640.000000	20640.000000
mean	-119.569704	35.631861	28.639486	2635.763081	537.870553	1425.476744	499.539680	3.870671	206855.816909
std	2.003532	2.135952	12.585558	2181.615252	421.385070	1132.462122	382.329753	1.899822	115395.615874
min	-124.350000	32.540000	1.000000	2.000000	1.000000	3.000000	1.000000	0.499900	14999.000000
25%	-121.800000	33.930000	18.000000	1447.750000	296.000000	787.000000	280.000000	2.563400	119600.000000
50%	-118.490000	34.260000	29.000000	2127.000000	435.000000	1166.000000	409.000000	3.534800	179700.000000
75%	-118.010000	37.710000	37.000000	3148.000000	647.000000	1725.000000	605.000000	4.743250	264725.000000
max	-114.310000	41.950000	52.000000	39320.000000	6445.000000	35682.000000	6082.000000	15.000100	500001.000000

수치형 특성별 히스토그램

housing.hist(bins=50, figsize=(20, 15))
plt.show()

png

3. 데이터 세트 분리

훈련 데이터/ 테스트 데이터

계층적 샘플링(Stratified sampling)

bins = [0, 1.5, 3.0, 4.5, 6.0, np.inf]
labels = [1, 2, 3, 4, 5]
housing['income_cat'] = pd.cut(housing['median_income'], bins=bins, labels=labels)

housing['income_cat'].value_counts() # 도수

3    7236
2    6581
4    3639
5    2362
1     822
Name: income_cat, dtype: int64

housing['income_cat'].value_counts() / len(housing) # 상대도수

3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: income_cat, dtype: float64

from sklearn.model_selection import train_test_split
# 무작위 샘플링
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# 계층적 샘플링
strat_train_set, strat_test_set = train_test_split(housing, stratify= housing['income_cat'], test_size=0.2, random_state=42)

strat_test_set['income_cat'].value_counts() / len(strat_test_set)

3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64

데이터 되돌리기

strat_train_set = strat_train_set.drop('income_cat', axis=1)
strat_test_set = strat_test_set.drop('income_cat', axis=1)

4. 데이터 탐색

# 훈련세트만을 대상으로 데이터 탐색할 예정 (strat_test_set는 최종 예측에 사용)
housing = strat_train_set.copy()

4.1 지리적 데이터 시각화

# longitude(경도) : 동서
# latitude(위도) : 남북
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.3, grid=True)

<AxesSubplot:xlabel='longitude', ylabel='latitude'>

png

housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.3, grid=True,
             c='median_house_value', cmap='jet', colorbar=True, figsize=(10, 7), # color 를 통해서 주택가격 표시
             s= housing['population']/100, sharex=False) # size 를 통해서 상대적인 인구수를 표시

<AxesSubplot:xlabel='longitude', ylabel='latitude'>

png

지리적 데이터 분석 결과 : 해안가이면서 밀집 지역일수록 주택 가격이 높음

4.2 상관관계 조사

상관계수

# Correlation coefficients between every pair of numeric features (target included).
# Select numeric columns explicitly: `ocean_proximity` is an object column, and
# pandas >= 2.0 raises on it instead of silently dropping it from corr().
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
longitude	1.000000	-0.924478	-0.105848	0.048871	0.076598	0.108030	0.063070	-0.019583	-0.047432
latitude	-0.924478	1.000000	0.005766	-0.039184	-0.072419	-0.115222	-0.077647	-0.075205	-0.142724
housing_median_age	-0.105848	0.005766	1.000000	-0.364509	-0.325047	-0.298710	-0.306428	-0.111360	0.114110
total_rooms	0.048871	-0.039184	-0.364509	1.000000	0.929379	0.855109	0.918392	0.200087	0.135097
total_bedrooms	0.076598	-0.072419	-0.325047	0.929379	1.000000	0.876320	0.980170	-0.009740	0.047689
population	0.108030	-0.115222	-0.298710	0.855109	0.876320	1.000000	0.904637	0.002380	-0.026920
households	0.063070	-0.077647	-0.306428	0.918392	0.980170	0.904637	1.000000	0.010781	0.064506
median_income	-0.019583	-0.075205	-0.111360	0.200087	-0.009740	0.002380	0.010781	1.000000	0.687160
median_house_value	-0.047432	-0.142724	0.114110	0.135097	0.047689	-0.026920	0.064506	0.687160	1.000000

# 중간 주택가격(타깃)과 특성들 간의 상관관계 확인
corr_matrix['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

산점도

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
pd.plotting.scatter_matrix(housing[attributes], figsize=(12, 8), alpha=0.3)
plt.show()

png

# 중간 주택가격(타깃)과 중간소득의 산점도
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1, grid=True)

<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>

png

4.3 특성 조합을 실험

# 가구당 방의갯수
# 전체방에서 침실방 차지 비율
# 가구당 인구수
housing['rooms_per_households']= housing['total_rooms'] / housing['households']
housing['bedrooms_per_rooms']= housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_households']= housing['population'] / housing['households']

# Recompute the correlations with the target now that the ratio features exist.
# Numeric columns are selected explicitly because `ocean_proximity` is an object
# column and pandas >= 2.0 raises on it instead of dropping it from corr().
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

median_house_value           1.000000
median_income                0.687160
rooms_per_households         0.146285
total_rooms                  0.135097
housing_median_age           0.114110
households                   0.064506
total_bedrooms               0.047689
population_per_households   -0.021985
population                  -0.026920
longitude                   -0.047432
latitude                    -0.142724
bedrooms_per_rooms          -0.259984
Name: median_house_value, dtype: float64

5. 데이터 전처리

# strat_train_set (데이터 탐색, 데이터 전처리)
# strat_test_set (최종 예측)

# 특성(X)과 레이블(y)을 분리
housing = strat_train_set.drop('median_house_value', axis=1) # 특성 (X 데이터)
housing_label = strat_train_set['median_house_value'].copy() # 레이블 (y 데이터)

housing.shape, housing_label.shape

((16512, 9), (16512,))

5.1 데이터 전처리(1) - 결손값 처리

결손값(Null/NaN) 처리 방법

옵션1 : 해당 구역 제거
옵션2 : 전체 특성 삭제
옵션3 : 어떤 값으로 대체(0, 평균, 중간값 등)

scikit-learn의 전처리기를 이용하여 옵션3 을 처리

# <scikit-learn의 전처리기(변환기)들 예시>
# PolynomialFeatures : 다항 특성 추가
# StandardScaler : 표준화(평균 0, 분산 1)
# MinMaxScaler : 정규화(최소 0, 최대 1)
# LabelEncoder, OrdinalEncoder : 숫자로 변환
# OneHotEncoder : OneHot Encoding 
# SimpleImputer : 누락된 데이터 대체

# 함수를 이용한 전처리기
# 나만의 전처리기 

# 수치형 데이터만 준비
housing_num = housing.drop('ocean_proximity', axis=1)
housing_num.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

# 수치형 데이터만 준비(또다른 방법)
# housing_num = housing.select_dtypes(include=[np.number])
# housing_num.columns

# SimpleImputer를 결측값을 대체(옵션3) 할 수 있음
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median') # 변환기 객체 생성 (중앙값을 대체)
imputer.fit(housing_num) # 변환할 준비 (중앙값 구하기)

SimpleImputer(strategy='median')

imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

# 위에서 imputer가 구해준 중앙값과 동일
housing_num.median()

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2119.5000
total_bedrooms         433.0000
population            1164.0000
households             408.0000
median_income            3.5409
dtype: float64

X = imputer.transform(housing_num) # 변환 (중앙값으로 대체)

# transform의 결과는 numpy 배열이므로 df로 바꿔서 확인
# X_df = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
# X_df.info()

5.2 데이터 전처리(2) - 데이터 인코딩

데이터 인코딩을 하는 이유는 머신러닝에서 수치값만 기대하기 때문

housing_cat = housing[['ocean_proximity']] # 2차원 데이터프레임으로 준비

(1) 레이블 인코딩

# pandas
pd.factorize(housing['ocean_proximity'])

(array([0, 0, 1, ..., 2, 0, 3], dtype=int64),
 Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object'))

# scikit-learn 변환기
from sklearn.preprocessing import OrdinalEncoder # LabelEncoder는 1차원 데이터를 기대

ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit_transform(housing_cat)

array([[0.],
       [0.],
       [4.],
       ...,
       [1.],
       [0.],
       [3.]])

(2) 원핫 인코딩

숫자의 크기가 모델 훈련과정에서 잘못된 영향을 줄 수 있으므로 원핫 인코딩

# pandas
pd.get_dummies(housing_cat)

	ocean_proximity_<1H OCEAN	ocean_proximity_INLAND	ocean_proximity_ISLAND	ocean_proximity_NEAR BAY	ocean_proximity_NEAR OCEAN
17606	1	0	0	0	0
18632	1	0	0	0	0
14650	0	0	0	0	1
3230	0	1	0	0	0
3555	1	0	0	0	0
...	...	...	...	...	...
6563	0	1	0	0	0
12053	0	1	0	0	0
13908	0	1	0	0	0
11159	1	0	0	0	0
15775	0	0	0	1	0

16512 rows × 5 columns

# scikit-learn transformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder returns a SciPy sparse matrix by default. Densify it explicitly
# instead of passing `sparse=False`: that keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so this form runs on old and new versions.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(housing_cat).toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

onehot_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

5.3 데이터 전처리(3) - 특성 스케일링

표준화 (Z score Standardize) : 평균 0, 표준편차 1
정규화 (Min Max Scaling) : 0~1 사이로 정규화 (참고 : 특잇값에 영향을 받음)
로그 스케일링 : 데이터의 분포가 왜곡되어 있을때 주로 사용

arr = np.arange(9).reshape(3, 3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

Z_arr = (arr - arr.mean())/arr.std()
Z_arr.mean(), Z_arr.std()

(0.0, 1.0)

M_arr = (arr - arr.min())/(arr.max()-arr.min())
M_arr.min(), M_arr.max()

(0.0, 1.0)

# pandas
def minmax_normalize(arr):
    """Rescale `arr` linearly so its minimum maps to 0 and its maximum to 1.

    Uses the object's own `.min()` / `.max()` methods, so it accepts anything
    that provides them and supports arithmetic (e.g. a NumPy array or a pandas
    Series).
    """
    lowest = arr.min()
    value_range = arr.max() - lowest
    return (arr - lowest) / value_range

def zscore_standardize(arr):
    """Center `arr` on its mean and divide by its standard deviation.

    The mean and standard deviation come from the object's own `.mean()` and
    `.std()` methods, so whatever defaults those methods use apply here.
    """
    center = arr.mean()
    spread = arr.std()
    return (arr - center) / spread

# scikit-learn 변환기

# (1) 표준화
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
housing_num_std = std_scaler.fit_transform(housing_num)
housing_num_std.mean(0), housing_num_std.std(0) # 컬럼별로 확인 axis=0

(array([-4.35310702e-15,  2.28456358e-15, -4.70123509e-17,  7.58706190e-17,
                    nan, -3.70074342e-17,  2.07897868e-17, -2.07628918e-16]),
 array([ 1.,  1.,  1.,  1., nan,  1.,  1.,  1.]))

# (2) 정규화
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
housing_num_mm = min_max_scaler.fit_transform(housing_num)
housing_num_mm.min(0), housing_num_mm.max(0)

(array([ 0.,  0.,  0.,  0., nan,  0.,  0.,  0.]),
 array([ 1.,  1.,  1.,  1., nan,  1.,  1.,  1.]))

# (3) 로그 스케일링 
from sklearn.preprocessing import FunctionTransformer
log_transformer = FunctionTransformer(np.log)
log_population = log_transformer.fit_transform(housing_num['population'])

# 로그 변환전
housing_num['population'].hist(bins=50)
plt.show()

png

# 로그 변환후
log_population.hist(bins=50)
plt.show()

png

5.4 데이터 전처리(4) - 변환 파이프라인

housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

# 수치형 데이터
# (1) 누락된 데이터를 중앙값으로 대체
# (2) 표준화

from sklearn.pipeline import Pipeline
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('std_scaler', StandardScaler())
                        ])
# num_pipeline.fit_transform(housing_num) # pipeline의 중간결과를 확인하고 싶을 때

# 범주형 데이터
# (1) 원핫 인코딩
# oh_encoder = OneHotEncoder(sparse=False)
# oh_encoder.fit_transform(housing_cat)

# 수치형 파이프라인과 범주형 변환기를 한번에 연결할 파이프라인
from sklearn.compose import ColumnTransformer

num_attrib = list(housing_num.columns)
cat_attrib = ['ocean_proximity']

full_pipeline = ColumnTransformer([('num', num_pipeline, num_attrib),
                                   ('cat', OneHotEncoder(), cat_attrib)
                                  ])

housing_prepared = full_pipeline.fit_transform(housing)

housing.shape, housing_prepared.shape # 범주형 데이터의 OneHotEncoding으로 4 컬럼 추가

((16512, 9), (16512, 13))

6. 모델 선택과 훈련

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=42)
rf_reg = RandomForestRegressor(random_state=42)

# LinearRegression 교차검증
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_label, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
lin_rmse = np.sqrt(-lin_scores.mean())
lin_rmse

69274.16940918249

# DecisionTree 교차검증
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_label, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
tree_rmse = np.sqrt(-tree_scores.mean())
tree_rmse

69448.23452521549

# RandomForest 교차검증
rf_scores = cross_val_score(rf_reg, housing_prepared, housing_label, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
rf_rmse = np.sqrt(-rf_scores.mean())
rf_rmse

49640.00678301197

7. 모델 세부 튜닝

그리드 탐색

from sklearn.model_selection import GridSearchCV

rf_reg = RandomForestRegressor(random_state=42)

param_grid = {'n_estimators' : [30, 50, 100], 'max_features' : [2, 4, 6, 8]} # 3 * 4 = 12가지 조합의 파라미터로 설정된 모델 준비

grid_search = GridSearchCV(rf_reg, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1) # 3 * 4 * 5 = 60번의 학습과 검증
%time grid_search.fit(housing_prepared, housing_label)

Wall time: 2min 31s

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_features': [2, 4, 6, 8],
                         'n_estimators': [30, 50, 100]},
             scoring='neg_mean_squared_error')

grid_search.best_params_

{'max_features': 8, 'n_estimators': 100}

grid_search.best_estimator_

RandomForestRegressor(max_features=8, random_state=42)

cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
  print(np.sqrt(-mean_score), params)

33887865031 {'max_features': 2, 'n_estimators': 30}
445796615226 {'max_features': 2, 'n_estimators': 50}
43495872492 {'max_features': 2, 'n_estimators': 100}
79774079741 {'max_features': 4, 'n_estimators': 30}
936831978 {'max_features': 4, 'n_estimators': 50}
984409225 {'max_features': 4, 'n_estimators': 100}
060190761295 {'max_features': 6, 'n_estimators': 30}
69831473408 {'max_features': 6, 'n_estimators': 50}
52108533805 {'max_features': 6, 'n_estimators': 100}
81805010987 {'max_features': 8, 'n_estimators': 30}
042709503105 {'max_features': 8, 'n_estimators': 50}
45382221812 {'max_features': 8, 'n_estimators': 100}

랜덤 탐색

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'n_estimators' : randint(low=1, high=200),
                  'max_features' : randint(low=1, high=8)}

rnd_search = RandomizedSearchCV(rf_reg, param_distribs, n_iter=10, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)                  
%time rnd_search.fit(housing_prepared, housing_label)

Wall time: 2min 45s

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   n_jobs=-1,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001CEA3CF0580>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001CEA3CF0550>},
                   scoring='neg_mean_squared_error')

rnd_search.best_params_

{'max_features': 7, 'n_estimators': 163}

rnd_search.best_estimator_

RandomForestRegressor(max_features=7, n_estimators=163, random_state=42)

cv_results = rnd_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
  print(np.sqrt(-mean_score), params)

59512716884 {'max_features': 7, 'n_estimators': 10}
08766555357 {'max_features': 3, 'n_estimators': 130}
57816434721 {'max_features': 6, 'n_estimators': 12}
12353477528 {'max_features': 5, 'n_estimators': 87}
55348866853 {'max_features': 7, 'n_estimators': 163}
728524855505 {'max_features': 4, 'n_estimators': 120}
82782355478 {'max_features': 6, 'n_estimators': 64}
44706394684 {'max_features': 4, 'n_estimators': 43}
98070924308 {'max_features': 4, 'n_estimators': 132}
67796991394 {'max_features': 3, 'n_estimators': 78}

# 참고
# 지수분포
# https://en.wikipedia.org/wiki/Exponential_distribution

# 로그 유니폼 분포
# https://en.wikipedia.org/wiki/Reciprocal_distribution

# param_distribs = {
#         'kernel': ['linear', 'rbf'],
#         'C': reciprocal(20, 200000), # 로그유니폼분포
#         'gamma': expon(scale=1.0),   # 지수분포
#     }

# reciprocal : 주어진 범위 안에서 로그 스케일로 균등하게(로그 유니폼 분포) 샘플링. 하이퍼파라미터의 스케일에 대해 잘 모를 때 사용
# expon : 하이퍼파라미터의 스케일에 대해 어느 정도 알고 있을 때 사용

best_model = rnd_search.best_estimator_

모델의 특성 중요도

특성 중요도는 트리 기반 모델만 제공

feature_importances = best_model.feature_importances_

onehot_encoder = full_pipeline.named_transformers_['cat']
cat_attrib = list(onehot_encoder.categories_[0])

attributes = num_attrib + cat_attrib
sorted(zip(feature_importances, attributes), reverse=True)

[(0.4306509289965746, 'median_income'),
 (0.14767733038782704, 'INLAND'),
 (0.11659906700600976, 'longitude'),
 (0.10277782275663969, 'latitude'),
 (0.049473849699940384, 'housing_median_age'),
 (0.04047400008194098, 'population'),
 (0.03173798410316071, 'total_rooms'),
 (0.028947917625006446, 'total_bedrooms'),
 (0.027055836949122423, 'households'),
 (0.015431473611590581, '<1H OCEAN'),
 (0.006618272198336929, 'NEAR OCEAN'),
 (0.002492445670795039, 'NEAR BAY'),
 (6.307091305548657e-05, 'ISLAND')]

8. 모델 예측과 성능 평가

테스트 데이터 변환

X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()
X_test.shape, y_test.shape

((4128, 9), (4128,))

# 훈련데이터에 대해서 전처리 했던 것들
# (1) 수치 데이터 -> 누락값처리/표준화
# (2) 범주 데이터 -> 원핫인코딩
# ==> 테스트에 대해서도 동일하게 처리해주자!

# 훈련 데이터를 변경할때는 파이프라인의 fit_transform()을 사용
# 테스트 데이터를 변경할때는 파이프라인의 transform()을 사용

X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared.shape

(4128, 13)

예측과 평가

from sklearn.metrics import mean_squared_error

# Predict on the already-transformed test features and report the RMSE.
# The square root is taken explicitly: the `squared=False` keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6.
final_predictions = best_model.predict(X_test_prepared)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))  # RMSE
final_rmse

46496.80161722716

테스트 데이터의 변환과 예측을 한번에

# Chain the preprocessing transformer and the tuned model into one pipeline so
# that raw (untransformed) test rows can be passed straight to predict().
full_pipeline_with_predictor = Pipeline([('preparation', full_pipeline),
                                         ('final', best_model)
                                         ])
final_predictions = full_pipeline_with_predictor.predict(X_test)
# Take the square root explicitly: `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))  # RMSE
final_rmse

46496.80161722716

일반화 오차 추정

테스트 RMSE에 대한 95% 신뢰 구간

from scipy.stats import t

# Per-sample squared errors (their mean is the MSE)
squared_erros = (final_predictions - y_test)**2

# Confidence level of the interval (95%)
confidence = 0.95

# Sample size
n = len(squared_erros)

# Degrees of freedom
dof = n-1

# Mean of the squared errors, i.e. the MSE point estimate
m_squared_error = np.mean(squared_erros)

# Sample standard deviation (based on the unbiased variance estimate)
sample_std = np.std(squared_erros, ddof=1) # divides by n-1 (U in the figure)

# Standard error of the mean
std_err = sample_std/n**0.5 # (U/n**0.5 in the figure)

# Student-t interval around the mean squared error, then square-rooted so the
# bounds are expressed as RMSE
mse_ci = t.interval(confidence, dof, m_squared_error, std_err)
rmse_ci = np.sqrt(mse_ci)
rmse_ci

array([44584.21439297, 48333.76612913])

9. 모델 저장

import joblib

joblib.dump(full_pipeline_with_predictor, 'my_model.pkl')

['my_model.pkl']

# Load the saved pipeline back from disk and score it on the test set; the RMSE
# should match the value computed before saving.
loaded_model = joblib.load('my_model.pkl')
final_predictions2 = loaded_model.predict(X_test)
# Take the square root explicitly: `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, final_predictions2))  # RMSE

46496.80161722716

Reference

핸즈온 머신러닝 (오렐리앙 제롱 저)

Twitter Facebook LinkedIn