import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

# Load the sample dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm the
# installed version still provides it before running this snippet.
dataset = load_boston()

# Explanatory variables: one column per entry in dataset.feature_names.
data_x = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

# Target values, held in a single-column frame named "target".
data_y = pd.DataFrame(data=dataset.target, columns=["target"])
VIF variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute one VIF per column of data_x and tabulate it next to the column name.
# NOTE(review): the raw values of data_x are passed as-is; no intercept column
# is added before computing VIF -- confirm this is intended.
design_matrix = data_x.values
n_features = data_x.shape[1]
vif_scores = [
    variance_inflation_factor(design_matrix, column_index)
    for column_index in range(n_features)
]
vif = pd.DataFrame({"VIF Factor": vif_scores, "features": data_x.columns})
減少法-変数選択-VIF削除システム
variance_inflation_factor によって算出した VIF 値が、指定した許容値に収まるように変数を削除していく。
変数選択法の減少法を用いる。
VIF値が最大のものから順に削除していく。
# LinearRegression is used below but was not imported anywhere in this
# snippet; import it here so the loop does not fail with NameError.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# VIF elimination system.
#
# Backward elimination driven by VIF: on every pass the VIF of each remaining
# column of data_x is computed, and while the largest VIF exceeds perm_level
# the column holding that largest VIF is dropped from data_x.  A linear
# regression is fitted and its score printed on every pass; once every VIF is
# within the tolerance, the final fit is evaluated and the loop ends.
#
# Reads:   data_x, data_y, variance_inflation_factor (defined earlier).
# Rebinds: data_x (columns are removed), x, vif, OLS_rv, y_pred.

# Tolerance: largest VIF value that is still accepted.
perm_level = 10

while True:
    x = data_x

    # VIF table for the columns that are still in play.
    # NOTE(review): the raw values are passed as-is; no intercept column is
    # added before computing VIF -- confirm this is intended.
    vif = pd.DataFrame()
    vif["VIF Factor"] = [
        variance_inflation_factor(x.values, i) for i in range(x.shape[1])
    ]
    vif["features"] = x.columns

    # Written as "not (max > level)" so that a NaN maximum ends the loop,
    # exactly as the original if/else did.
    finished = not (vif["VIF Factor"].max() > perm_level)

    if finished:
        print("finish")

    # Report the current VIF table and the fit quality on the current columns.
    print(vif)
    OLS_rv = LinearRegression(fit_intercept=True).fit(x, data_y)
    print(OLS_rv.score(x, data_y))

    if finished:
        # Final evaluation on the surviving columns (x is data_x here).
        y_pred = OLS_rv.predict(x)
        print("2乗平均誤差:", mean_squared_error(data_y, y_pred))
        print("決定係数:", r2_score(np.array(data_y), y_pred))
        break

    # Drop the column with the largest VIF and try again.
    remove_id = vif["VIF Factor"].idxmax()
    remove_data = vif["features"][remove_id]
    data_x = data_x.drop(remove_data, axis=1)