# 라이브러리 불러오기 — load libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
# 유틸 함수 추가하기 — add utility functions
def get_validation(y_test, y_pred):
    """Summarize binary-classification quality for one model.

    Returns a dict (insertion order matters — it becomes the DataFrame
    column order later) with overall accuracy, the four confusion-matrix
    cells stored as strings, and per-class precision/recall/F1, each
    rounded to two decimals.

    NOTE(review): the "TP"/"FP" keys label matrix[0][*] — i.e. they read
    class 0 as the positive class, while "TN"/"FN" label matrix[1][*].
    Confirm this naming convention is intended before relying on it.
    """
    cm = confusion_matrix(y_test, y_pred).tolist()

    def per_class(label):
        # precision / recall / F1 for one class, keyed e.g. "precision_0".
        return {
            f"precision_{label}": round(precision_score(y_test, y_pred, pos_label=label), 2),
            f"recall_{label}": round(recall_score(y_test, y_pred, pos_label=label), 2),
            f"f1score_{label}": round(f1_score(y_test, y_pred, pos_label=label), 2),
        }

    report = {
        "accuracy": round(accuracy_score(y_test, y_pred), 2),
        "TP": str(cm[0][0]),
        "FP": str(cm[0][1]),
    }
    report.update(per_class(0))
    report["TN"] = str(cm[1][1])
    report["FN"] = str(cm[1][0])
    report.update(per_class(1))
    return report
from io import StringIO
from google.colab import output  # Colab-only API used to clear the cell output
# Shared buffer that accumulates everything autosave_print writes.
# NOTE(review): the name `io` shadows the stdlib `io` module — consider renaming.
io = StringIO()
def autosave_print(*messages):
    """Append *messages* to the shared buffer, then clear the Colab cell
    output and re-render the entire buffer, so earlier lines persist
    across repeated calls."""
    output.clear()
    print(*messages, file=io)
    print(io.getvalue(), end="")
def clear_print():
    """Empty the shared autosave buffer and rewind it to the start."""
    io.seek(0)
    # truncate() with no argument cuts at the current position (0 after seek).
    io.truncate()
def clear_output():
    # Thin wrapper: wipe the current Colab cell output without touching the buffer.
    output.clear()
# 데이터 불러오기 — load the dataset
# Load the scikit-learn breast-cancer dataset and wrap it in a DataFrame
# with the class label appended as a "target" column.
data = load_breast_cancer()
df_cancer = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Quick sanity checks — the bare expressions only render in a notebook cell.
df_cancer.info()
df_cancer.head()
df_cancer.target.value_counts()
# 데이터 시각화하기 — visualize the data
# One subplot per column, 4 per row: a count plot (hued by target) for
# low-cardinality columns, otherwise a violin plot split by target.
n_columns = len(df_cancer.columns)
plt.figure(figsize=(15, 30))
for position, col in enumerate(df_cancer.columns, start=1):
    ax = plt.subplot(n_columns // 4 + 1, 4, position)
    if len(df_cancer[col].unique()) < 10:
        # Discrete-looking column: show per-class counts, drop the
        # per-axes legend to keep the grid readable.
        sns.countplot(x=col, hue='target', data=df_cancer, ax=ax)
        ax.get_legend().remove()
    else:
        # Continuous column: distribution per target class.
        sns.violinplot(x='target', y=col, data=df_cancer, ax=ax)
    plt.title(col)
    plt.xlabel("")
    plt.ylabel("")
# 상관계수 출력하기 — print correlation coefficients
# Features ranked by absolute correlation with the target
# (the bare expression only renders in a notebook cell).
df_cancer.corr()[['target']].sort_values(by='target', key=abs, ascending=False)
# Train/Test 데이터 분리하기 — split into train and test sets
# Feature matrix = every column but the last; label vector = last column.
X, y = df_cancer.iloc[:, :-1], df_cancer.iloc[:, -1]
print(X.shape, y.shape)
# Stratified split with a fixed seed for reproducibility.
# NOTE(review): train_size=0.2 keeps only 20% of the rows for TRAINING
# (the common choice is test_size=0.2) — confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.2, random_state=2045
)
print("Train Size:", X_train.shape, y_train.shape)
print("Test Size:", X_test.shape, y_test.shape)
# Class proportions should match across splits thanks to stratify=y.
print("# Train")
print(y_train.value_counts() / y_train.shape[0])
print("# Test")
print(y_test.value_counts() / y_test.shape[0])
# 모델별 학습하기 — train each model
%%time
def _fit_and_report(name, model):
    """Fit *model* on the training split, report its wall-clock fit time
    via autosave_print, and return the fitted model.

    Replaces eight copy-pasted time/fit/print stanzas. The timer now
    wraps only .fit() (the original also timed model construction).
    """
    grap_time = datetime.now()
    model.fit(X_train, y_train)
    autosave_print(name, datetime.now() - grap_time)
    return model


# Start with an empty timing report, then fit each classifier with the
# same fixed seed (2045) so runs are reproducible.
clear_print()
model_lr = _fit_and_report("LogisticRegression",
                           LogisticRegression(n_jobs=-1, verbose=3, random_state=2045))
model_dt = _fit_and_report("DecisionTreeClassifier",
                           DecisionTreeClassifier(random_state=2045))
model_random = _fit_and_report("RandomForestClassifier",
                               RandomForestClassifier(n_jobs=-1, verbose=3, random_state=2045))
model_ada = _fit_and_report("AdaBoostClassifier",
                            AdaBoostClassifier(random_state=2045))
model_gbm = _fit_and_report("GradientBoostingClassifier",
                            GradientBoostingClassifier(verbose=3, random_state=2045))
model_lgbm = _fit_and_report("LGBMClassifier",
                             LGBMClassifier(n_jobs=-1, random_state=2045))
model_xgbm = _fit_and_report("XGBClassifier",
                             XGBClassifier(n_jobs=-1, random_state=2045))
# KNeighbors has no random_state — it is deterministic given the data.
model_kn = _fit_and_report("KNeighborsClassifier",
                           KNeighborsClassifier(n_jobs=-1))
# 모델별 학습 결과 평가하기 — evaluate each trained model
# Score every fitted model on the held-out test split; replaces eight
# copy-pasted predict/get_validation stanzas with one comprehension.
_fitted_models = {
    "logistic_regression": model_lr,
    "decision_tree": model_dt,
    "random_forest": model_random,
    "ada_boosting": model_ada,
    "gradient_boosting": model_gbm,
    "light_gbm": model_lgbm,
    "extra_gbm": model_xgbm,
    "k_neighbor": model_kn,
}
validations = {
    name: get_validation(y_test, model.predict(X_test))
    for name, model in _fitted_models.items()
}
# One row per model, one column per metric.
df = pd.DataFrame(validations).transpose()
# get_validation stores the confusion-matrix cells as strings, but
# Styler.background_gradient needs numeric data to compute the colormap —
# cast the count columns back to int so the gradient doesn't raise.
df[["TP", "FP", "TN", "FN"]] = df[["TP", "FP", "TN", "FN"]].astype(int)
df.style.background_gradient(subset=['TP', 'TN'], cmap='BuGn')