# 라이브러리 불러오기 (Load libraries)
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
# 유틸 함수 추가하기 (Add utility functions)
def get_validation(y_test, y_pred):
    """Compute regression metrics comparing predictions against ground truth.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    dict
        Keys "MSE" (mean squared error), "SQRT" (root of the MSE) and
        "R2 Score" (coefficient of determination).
    """
    # Compute MSE once and derive RMSE from it instead of recomputing.
    squared_error = mean_squared_error(y_test, y_pred)
    metrics = {}
    metrics["MSE"] = squared_error
    metrics["SQRT"] = np.sqrt(squared_error)
    metrics["R2 Score"] = r2_score(y_test, y_pred)
    return metrics
from io import StringIO
from google.colab import output
# Module-level buffer that accumulates everything printed via autosave_print,
# so log lines survive repeated output clears.
# NOTE(review): the name "io" shadows the stdlib io module just imported above.
io = StringIO()
def autosave_print(*text):
    """Append *text* as a new line to the shared log buffer and re-render it.

    Clears the Colab cell output, records the new message in the module-level
    StringIO buffer ``io``, then reprints the whole accumulated buffer so
    earlier messages remain visible after the clear.

    Parameters
    ----------
    *text
        Positional arguments forwarded to print(), joined by spaces.
    """
    # Bug fix: the original called output.clear() AND clear_output() back to
    # back; clear_output() itself calls output.clear(), so one call suffices.
    clear_output()
    print(*text, file=io)
    print(io.getvalue(), end="")
def clear_print():
    """Empty the shared StringIO log buffer in place."""
    # Rewind to the start, then truncate at the current (zero) position.
    io.seek(0)
    io.truncate()
def clear_output():
    # Thin wrapper: clear the current Colab cell's visible output area.
    output.clear()
# 데이터 불러오기 (Load the data)
# Load the scikit-learn diabetes regression dataset into a DataFrame,
# with the regression target attached as its last column.
data = load_diabetes()
df_diabetes = pd.DataFrame(data.data, columns=data.feature_names)
df_diabetes['target'] = data.target

# Quick structural overview and a peek at the first rows.
df_diabetes.info()
df_diabetes.head()
# 성별 정보 처리하기 (Process the sex feature)
# Inspect the distribution of the 'sex' feature before encoding.
df_diabetes['sex'].value_counts()

# Re-encode the two distinct (standardized float) values as integer labels.
encoder = LabelEncoder()
df_diabetes['sex'] = encoder.fit_transform(df_diabetes['sex'])

# Same counts, now keyed by the integer labels.
df_diabetes['sex'].value_counts()
# 데이터 시각화하기 (Visualize the data)
# Plot each column against the target on a 4-wide grid: violin plots for
# low-cardinality columns, regression scatter plots for continuous ones.
cols_size = len(df_diabetes.columns)
plt.figure(figsize=(15, 12))
for idx, col in enumerate(df_diabetes.columns):
    ax = plt.subplot(cols_size // 4 + 1, 4, idx + 1)
    if len(df_diabetes[col].unique()) < 10:
        # Categorical-ish feature: target distribution per level.
        sns.violinplot(x=col, y='target', hue=col, data=df_diabetes, ax=ax)
        # hue duplicates the x labels, so the legend adds no information.
        ax.get_legend().remove()
    else:
        # Continuous feature: scatter with a fitted regression line.
        sns.regplot(x=col, y='target', data=df_diabetes, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("")
    ax.set_ylabel("")
# 상관계수 출력하기 (Print correlation coefficients)
# Rank all columns by absolute correlation with the target, strongest first.
df_diabetes.corr().loc[:, ['target']].sort_values(by='target', key=abs, ascending=False)
# Train, Test 데이터 분리하기 (Split into train and test sets)
# Features are every column but the last; the last column is the target.
X = df_diabetes.iloc[:, :-1]
y = df_diabetes.iloc[:, -1]
print(X.shape, y.shape)
# NOTE(review): train_size=0.3 keeps only 30% of the rows for TRAINING and
# 70% for testing — unusual; confirm this wasn't meant to be test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=2045)
print("Train Size:", X_train.shape, y_train.shape)
print("Test Size:", X_test.shape, y_test.shape)
# 모델별 학습하기 (Train each model)
# %%time  (IPython cell magic — valid only at the top of a notebook cell, not in a plain .py file)
def _fit_timed(name, model):
    """Fit *model* on the module-level X_train/y_train and report wall time.

    Prints "<name> <elapsed>" through autosave_print so the timings for all
    models accumulate in a single output area. Returns the fitted model.
    The (negligible) estimator-construction time is excluded from the
    measurement, unlike the original copy-pasted version.
    """
    start = datetime.now()
    model.fit(X_train, y_train)
    autosave_print(name, datetime.now() - start)
    return model


# Start with an empty timing log, then fit each regressor in turn.
clear_print()
model_lr = _fit_timed("LinearRegression", LinearRegression(n_jobs=-1))
model_dt = _fit_timed("DecisionTreeRegressor", DecisionTreeRegressor(random_state=2045))
model_random = _fit_timed("RandomForestRegressor", RandomForestRegressor(n_jobs=-1, verbose=3, random_state=2045))
model_ada = _fit_timed("AdaBoostRegressor", AdaBoostRegressor(random_state=2045))
model_gbm = _fit_timed("GradientBoostingRegressor", GradientBoostingRegressor(verbose=3, random_state=2045))
model_lgbm = _fit_timed("LGBMRegressor", LGBMRegressor(n_jobs=-1, random_state=2045))
model_xgbm = _fit_timed("XGBRegressor", XGBRegressor(n_jobs=-1, random_state=2045))
model_kn = _fit_timed("KNeighborsRegressor", KNeighborsRegressor(n_jobs=-1))
# 모델별 학습 결과 평가하기 (Evaluate each trained model)
# Score every fitted model on the held-out test split. Each per-model
# prediction is kept in its own module-level variable because the kde plot
# below reuses them.
y_pred_lr = model_lr.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_random = model_random.predict(X_test)
y_pred_ada = model_ada.predict(X_test)
y_pred_gbm = model_gbm.predict(X_test)
y_pred_lgbm = model_lgbm.predict(X_test)
y_pred_xgbm = model_xgbm.predict(X_test)
y_pred_kn = model_kn.predict(X_test)

# NOTE(review): "logistic_regression" actually labels a LinearRegression
# model and "extra_gbm" labels XGBoost — the keys look misnamed; confirm
# before renaming, since the kde-plot legend reuses the same labels.
validations = {
    "logistic_regression": get_validation(y_test, y_pred_lr),
    "decision_tree": get_validation(y_test, y_pred_dt),
    "random_forest": get_validation(y_test, y_pred_random),
    "ada_boosting": get_validation(y_test, y_pred_ada),
    "gradient_boosting": get_validation(y_test, y_pred_gbm),
    "light_gbm": get_validation(y_test, y_pred_lgbm),
    "extra_gbm": get_validation(y_test, y_pred_xgbm),
    "k_neighbor": get_validation(y_test, y_pred_kn),
}

# One row per model, metrics as columns; highlight R2 with a color gradient.
df = pd.DataFrame(validations).transpose()
df.style.background_gradient(subset=['R2 Score'], cmap='BuGn')
# 그래프로 시각화하기 (Visualize with a graph)
# Overlay density curves of the actual target values (train/test) and each
# model's predictions to compare their distributions at a glance.
series_to_plot = [
    (y_train, 'y_train'),
    (y_test, 'y_test'),
    (y_pred_lr, 'linear_regression'),
    (y_pred_dt, 'decision_tree'),
    (y_pred_random, 'random_forest'),
    (y_pred_ada, 'ada_boosting'),
    (y_pred_gbm, 'gradient_boosting'),
    (y_pred_lgbm, 'light_gbm'),
    (y_pred_xgbm, 'extra_gbm'),
    (y_pred_kn, 'k_neighbor'),
]
plt.figure(figsize=(15, 8))
for values, plot_label in series_to_plot:
    sns.kdeplot(values, label=plot_label)
plt.legend()
plt.show()