라이브러리 불러오기
- 코드
import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, PolynomialFeatures from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error
데이터 불러오기
- 코드
# Load the 'mpg' example dataset through seaborn into the working DataFrame.
DF = sns.load_dataset(name='mpg')

# Print a summary of the columns (dtypes and non-null counts).
DF.info()
데이터 전처리
- 코드
# Handle null data: drop the rows whose 'horsepower' value is missing.
DF.dropna(subset=['horsepower'], inplace=True)

# Encode the categorical 'origin' column as integer codes.
# LabelEncoder works on a 1-D array of labels, so the Series DF['origin'] is
# passed instead of the one-column DataFrame DF[['origin']]; the 2-D form
# makes scikit-learn emit a DataConversionWarning.
# NOTE(review): the column name 'orgin_state' looks like a typo of
# 'origin_state'; it is kept as-is so the resulting DataFrame is unchanged.
encoder = LabelEncoder()
DF['orgin_state'] = encoder.fit_transform(DF['origin'])
print('origin:', DF['orgin_state'].unique())

# Drop the object columns now that 'origin' has an encoded counterpart.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted.
DF.drop(columns=['name', 'origin'], inplace=True)
print("**" * 20)
DF.info()
데이터시각화1 (pairplot)
- 전체 컬럼
# Draw the pairwise-relationship grid for the columns of DF and display it.
sns.pairplot(data=DF)
plt.show()
- 컬럼 범위 설정
# Restrict the pair plot to three columns on both axes via `vars`.
sns.pairplot(
    data=DF,
    vars=['mpg', 'displacement', 'weight'],
)
plt.show()
- 특정 컬럼
# Fix 'mpg' as the only row variable (`y_vars`); the column variables are
# left at pairplot's default selection.
sns.pairplot(
    data=DF,
    y_vars=['mpg'],
)
plt.show()
데이터시각화2 (heatmap)
- 전체 컬럼
# Render the full correlation matrix of DF as a heatmap.
sns.heatmap(data=DF.corr())
plt.show()
- 특정 컬럼
# Heatmap of only the 'mpg' column of the correlation matrix
# (kept two-dimensional by selecting with a list).
sns.heatmap(data=DF.corr().loc[:, ['mpg']])
plt.show()
상관계수 출력
- 전체 컬럼
# Correlation matrix over the columns of DF (Pearson is the pandas default).
DF.corr(method='pearson')
- 특정 컬럼
# Correlation of every column with 'mpg', as a one-column DataFrame.
DF.corr().loc[:, ['mpg']]
- 특정 컬럼 (정렬)
# Correlation with 'mpg', ordered by absolute value (strongest first).
(
    DF.corr()
    .loc[:, ['mpg']]
    .sort_values(by='mpg', key=lambda col: col.abs(), ascending=False)
)
단순회귀 (Simple Regression)
- 예시
# Simple regression: predict 'mpg' from the single feature 'weight'.
X = DF.loc[:, ['weight']]
y = DF['mpg']

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2045,
)

# Fit the linear model on the training split (fit() returns the estimator).
RA = LinearRegression().fit(X_train, y_train)

# Report the fitted coefficient, the intercept, and the test-set score.
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test, y_test))

# Test-set predictions; the printout shows the MSE and its square root.
y_hat_test = RA.predict(X_test)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / %.2f' % (mse, np.sqrt(mse)))

# Overlay density estimates of the test targets, the predictions and the
# training targets on the same axes.
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_simple', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()
다항회귀 (Polynomial Regression)
- 예시
# Polynomial regression: predict 'mpg' from 'weight' and its square.
X = DF[['weight']]
y = DF['mpg']

# Hold out 30% of the rows for testing.
# NOTE(review): this seed (6691) differs from the 2045 used by the simple and
# multiple regression examples, so the metrics are computed on a different
# split and are not directly comparable - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=6691
)

# Expand the feature into degree-2 polynomial terms (no constant column,
# since LinearRegression fits its own intercept).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
# The transformer is fitted on the training split only; the test split is
# transformed with that fitted transformer rather than re-fitted.
X_test_poly = poly.transform(X_test)

RA = LinearRegression()
RA.fit(X_train_poly, y_train)

# Report the fitted coefficients, the intercept, and the test-set score.
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test_poly, y_test))

# Test-set predictions; the printout shows the MSE and its square root.
y_hat_test = RA.predict(X_test_poly)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / %.2f' % (mse, np.sqrt(mse)))

# Overlay density estimates of the test targets, the predictions and the
# training targets. The prediction curve is labelled as polynomial output
# (the previous 'y_hat_simple' label was copied from the simple example).
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_poly', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()
다중회귀 (Multiple Regression)
- 예시
# Multiple regression: predict 'mpg' from three features.
X = DF[['weight', 'displacement', 'horsepower']]
y = DF['mpg']

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045
)

RA = LinearRegression()
RA.fit(X_train, y_train)

# Report the fitted coefficients, the intercept, and the test-set score,
# matching the output of the simple and polynomial examples.
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test, y_test))

# Predict with THIS model. Previously no prediction was made here, so the
# plot below reused whatever `y_hat_test` an earlier cell had left behind.
y_hat_test = RA.predict(X_test)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / %.2f' % (mse, np.sqrt(mse)))

# Overlay density estimates of the test targets, the predictions and the
# training targets on the same axes.
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_multiple', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()