7 분 소요

보스톤 주택 가격 예측

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

1. 데이터 탐색

# 1.2 이하 버전
# from sklearn.datasets import load_boston
# boston = load_boston()

# df = pd.DataFrame(boston.data, columns=boston.feature_names)
# df['PRICE'] = boston.target
# df.head()
# 1.2 이상 버전
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df = pd.DataFrame(data, columns=feature_names)
df['PRICE'] = target
df.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
 CRIM     per capita crime rate by town
 ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
 INDUS    proportion of non-retail business acres per town
 CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
 NOX      nitric oxides concentration (parts per 10 million)
 RM       average number of rooms per dwelling
 AGE      proportion of owner-occupied units built prior to 1940
 DIS      weighted distances to five Boston employment centres
 RAD      index of accessibility to radial highways
 TAX      full-value property-tax rate per $10,000
 PTRATIO  pupil-teacher ratio by town
 B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
 LSTAT    % lower status of the population
 MEDV     Median value of owner-occupied homes in $1000's
  • CRIM: 지역별 범죄 발생률
  • ZN: 25,000평방피트를 초과하는 거주 지역의 비율
  • NDUS: 비상업 지역 넓이 비율
  • CHAS: 찰스강에 대한 더미 변수(강의 경계에 위치한 경우는 1, 아니면 0)
  • NOX: 일산화질소 농도
  • RM: 거주할 수 있는 방 개수
  • AGE: 1940년 이전에 건축된 소유 주택의 비율
  • DIS: 5개 주요 고용센터까지의 가중 거리
  • RAD: 고속도로 접근 용이도
  • TAX: 10,000달러당 재산세율
  • PTRATIO: 지역의 교사와 학생 수 비율
  • B: 지역의 흑인 거주 비율
  • LSTAT: 하위 계층의 비율
  • MEDV: 본인 소유의 주택 가격(중앙값)

  • 누락 데이터 확인
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB
  • 통계 정보
df.describe()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
  • 수치형 데이터 시각화(히스토그램)
df.hist(bins=50, figsize=(20, 15))
plt.show()

png

  • 상관계수
plt.figure(figsize=(12, 8))
corr = df.corr()
sns.heatmap(corr, annot=True, fmt='.2f')
<AxesSubplot:>

png

df.plot(kind='scatter', x='LSTAT', y='PRICE', alpha=0.5) # 음의 상관관계계
plt.show()

png

df.plot(kind='scatter', x='RM', y='PRICE', alpha=0.5)
plt.show()

png

2. 데이터 준비

from sklearn.model_selection import train_test_split

# 특성과 정답 분리리
X = df.drop('PRICE', axis=1)
y = df['PRICE']
print(X.shape, y.shape)
(506, 13) (506,)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(404, 13) (404,) (102, 13) (102,)

3. 모델 훈련

3.1 기본 선형 모델

  • 기본 선형 모델 (정규방정식)
from sklearn.linear_model import LinearRegression

# 잘못된 방법 (교차검증을 사용)
lin_reg = LinearRegression() # 모델 생성
lin_reg.fit(X_train, y_train) # 모델 훈련
pred = lin_reg.predict(X_test) # 모델 예측
from sklearn.model_selection import cross_val_score
# 올바른 방법 (교차 검증을 사용)
lin_reg = LinearRegression() # 모델 생성

# cross_val_score(모델, 특성, 정답, 평가지표, 폴드수)
scores = cross_val_score(lin_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
lin_reg_rmse = np.sqrt(-scores.mean()) # RMSE
lin_reg_rmse
4.86358080742005
  • 기본 선형 모델 (경사하강법 + 특성 스케일링)
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler # 평균을 0, 분산을 1로 맞춰주는 변환기

# 특성 스케일링

# 변환기 = 변환기 객체 생성()
# 변환기.fit() # 변환할 준비
# 변환기.transform() # 실제 변환

std_scaler = StandardScaler() 
X_train_scaled = std_scaler.fit_transform(X_train)
print('특성스케일링 후 평균/표준편차:', X_train_scaled.mean(), X_train_scaled.std())

sgd_reg = SGDRegressor()

# cross_val_score(모델, 특성, 정답, 평가지표, 폴드수)
scores = cross_val_score(sgd_reg, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=5)
sgd_reg_rmse = np.sqrt(-scores.mean())
sgd_reg_rmse
특성스케일링 후 평균/표준편차: -9.740875280793452e-17 1.0





4.892560383275695

3.2 다항 회귀 모델

  • 다항 회귀(정규방정식)
from sklearn.preprocessing import PolynomialFeatures # 원본 특성에 제곱항을 추가해주는 변환기

# 변환기 = 변환기 객체 생성()
# 변환기.fit() # 변환할 준비
# 변환기.transform() # 실제 변환

poly_feature = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_feature.fit_transform(X_train)
print(X_train.shape, X_train_poly.shape)

lin_reg = LinearRegression()

# cross_val_score(모델, 특성, 정답, 평가지표, 폴드수)
scores = cross_val_score(lin_reg, X_train_poly, y_train, scoring='neg_mean_squared_error', cv=5)
poly_lin_reg_rmse = np.sqrt(-scores.mean()) # RMSE
poly_lin_reg_rmse
(404, 13) (404, 104)





4.349154691468671
  • 다항 회귀(경사하강법)
# (1) Poly(제곱특성추가) -> (2) STD scale(표준화) -> (3) SGDRegressor (경사하강법)

# (1) Poly(제곱특성추가)
poly_feature = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_feature.fit_transform(X_train)
print(X_train.shape, X_train_poly.shape)

# (2) STD scale(표준화)
std_scaler = StandardScaler() 
X_train_poly_scaled = std_scaler.fit_transform(X_train_poly)
print('특성스케일링 후 평균/표준편차:', X_train_poly_scaled.mean(), X_train_poly_scaled.std())

# (3) SGDRegressor (경사 하강법)
sgd_reg = SGDRegressor(penalty=None, random_state=42)
scores = cross_val_score(sgd_reg, X_train_poly_scaled, y_train, scoring='neg_mean_squared_error', cv=5)
sgd_reg_rmse = np.sqrt(-scores.mean())
sgd_reg_rmse
(404, 13) (404, 104)
특성스케일링 후 평균/표준편차: 3.016965538356861e-16 0.9999999999999999





3.8507394341607575

3.3 규제모델

  • 모델 파라미터 규제 되는지 확인
from sklearn.linear_model import Ridge, Lasso, ElasticNet
  • 기본 선형회귀 모델
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
LinearRegression()
X_train.shape
(404, 13)
lin_reg.intercept_, lin_reg.coef_
(30.24675099392408,
 array([-1.13055924e-01,  3.01104641e-02,  4.03807204e-02,  2.78443820e+00,
        -1.72026334e+01,  4.43883520e+00, -6.29636221e-03, -1.44786537e+00,
         2.62429736e-01, -1.06467863e-02, -9.15456240e-01,  1.23513347e-02,
        -5.08571424e-01]))
lin_coef = pd.Series(lin_reg.coef_, index=X_train.columns)
lin_coef_sort = lin_coef.sort_values(ascending=False)
sns.barplot(x=lin_coef_sort.values, y= lin_coef_sort.index)
plt.show()

png

  • 릿지 회귀(Ridge)
alphas = [0, 0.1, 1, 10, 100]

coef_df = pd.DataFrame()
for alpha in alphas:
  ridge_reg = Ridge(alpha=alpha, random_state=42)
  ridge_reg.fit(X_train, y_train)

  ridge_coef = pd.Series(ridge_reg.coef_, index=X_train.columns)
  ridge_coef_sort = ridge_coef.sort_values(ascending=False)
  column = 'alpha :' + str(alpha)
  coef_df[column] = ridge_coef_sort

coef_df
alpha :0 alpha :0.1 alpha :1 alpha :10 alpha :100
RM 4.438835 4.445779 4.464505 4.195326 2.438815
CHAS 2.784438 2.750333 2.545470 1.813291 0.550702
RAD 0.262430 0.260043 0.248882 0.248031 0.299014
INDUS 0.040381 0.034896 0.007498 -0.026277 -0.048625
ZN 0.030110 0.030459 0.032271 0.035552 0.039892
B 0.012351 0.012400 0.012642 0.012833 0.011951
AGE -0.006296 -0.007305 -0.012191 -0.015341 0.000545
TAX -0.010647 -0.010780 -0.011475 -0.012744 -0.014630
CRIM -0.113056 -0.112400 -0.109234 -0.107134 -0.110765
LSTAT -0.508571 -0.510902 -0.523833 -0.561835 -0.689539
PTRATIO -0.915456 -0.900771 -0.828604 -0.761769 -0.817852
DIS -1.447865 -1.429608 -1.338700 -1.232621 -1.129400
NOX -17.202633 -15.924459 -9.537952 -1.889245 -0.197859
  • 라쏘 회귀(Lasso)
alphas = [0.05, 0.1, 0.2, 0.5, 1]

coef_df = pd.DataFrame()
for alpha in alphas:
  lasso_reg = Lasso(alpha=alpha, random_state=42)
  lasso_reg.fit(X_train, y_train)

  lasso_coef = pd.Series(lasso_reg.coef_, index=X_train.columns)
  lasso_coef_sort = lasso_coef.sort_values(ascending=False)
  column = 'alpha :' + str(alpha)
  coef_df[column] = lasso_coef_sort

coef_df
alpha :0.05 alpha :0.1 alpha :0.2 alpha :0.5 alpha :1
RM 4.443676 4.311687 4.026917 3.129886 1.630489
CHAS 1.704029 0.919952 0.000000 0.000000 0.000000
RAD 0.234443 0.239237 0.245289 0.236596 0.219654
ZN 0.034602 0.034893 0.034848 0.032640 0.028501
B 0.013035 0.013091 0.013039 0.012350 0.011181
TAX -0.012599 -0.012962 -0.013317 -0.013032 -0.012286
AGE -0.017338 -0.015126 -0.010294 0.000000 0.016395
INDUS -0.023023 -0.016785 -0.005376 -0.000000 -0.000000
CRIM -0.104256 -0.104157 -0.103020 -0.093034 -0.076609
NOX -0.524613 -0.000000 -0.000000 -0.000000 -0.000000
LSTAT -0.549276 -0.564674 -0.590514 -0.649984 -0.747107
PTRATIO -0.729013 -0.732247 -0.741718 -0.729229 -0.708582
DIS -1.183960 -1.151487 -1.094868 -0.915255 -0.630858
  • 엘라스틱넷(ElasticNet)
alphas = [0.05, 0.1, 0.2, 0.5, 1]
coef_df = pd.DataFrame()
for alpha in alphas:
  elastic_reg = ElasticNet(alpha=alpha, random_state=42)
  elastic_reg.fit(X_train, y_train)

  elastic_coef = pd.Series(elastic_reg.coef_, index=X_train.columns)
  elastic_coef_sort = elastic_coef.sort_values(ascending=False)
  column = 'alpha :' + str(alpha)
  coef_df[column] = elastic_coef_sort

coef_df
alpha :0.05 alpha :0.1 alpha :0.2 alpha :0.5 alpha :1
RM 4.134773 3.764341 3.160552 2.051658 1.162996
CHAS 1.521003 0.977221 0.404020 0.000000 0.000000
RAD 0.247966 0.258443 0.273963 0.287364 0.275980
ZN 0.035809 0.037015 0.038071 0.037961 0.035571
B 0.012867 0.012746 0.012439 0.011721 0.011013
TAX -0.012941 -0.013479 -0.014028 -0.014505 -0.014273
AGE -0.014885 -0.011703 -0.005211 0.006508 0.018591
INDUS -0.027025 -0.030900 -0.031594 -0.030560 -0.020130
CRIM -0.106450 -0.106853 -0.107092 -0.103047 -0.093299
LSTAT -0.569848 -0.600051 -0.644219 -0.719262 -0.775576
PTRATIO -0.753753 -0.761627 -0.783677 -0.794361 -0.752705
NOX -0.959210 -0.019932 -0.000000 -0.000000 -0.000000
DIS -1.205832 -1.176020 -1.132253 -0.987340 -0.755423

참고

  • 정규화 (0~1사이로 변환)
def minmax_normalize(arr):
    return (arr-arr.min())/(arr.max()-arr.min())
  • 표준화 (평균 0, 표준편차 1)
def zscore_standize(arr): # 평균 0, 표준편차 1
    return (arr - arr.mean())/arr.std()
X = np.arange(10)
X_normalized = minmax_normalize(X)
X_normalized.min(), X_normalized.max()
(0.0, 1.0)
X = np.arange(10)
X_standardized = zscore_standize(X)
X_standardized.mean(), X_standardized.std()
(-6.661338147750939e-17, 1.0)

댓글남기기