import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
train_url = ""
test_url = ""
train_df = pd.read_csv(train_url)
test_df = pd.read_csv(test_url)
# remove outliers: drop rows whose count is more than three standard deviations above the mean
count_mean = train_df['count'].mean()
count_std = train_df['count'].std()
train_df = train_df[train_df['count'] < (count_mean + 3 * count_std)].reset_index(drop=True)
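# Quick check (optional): report how many rows survive the three-sigma trim; the exact
# number depends on the downloaded data, so nothing here assumes a specific count.
print("rows after outlier removal:", len(train_df))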
print("reading data complete")
def rmsle(actual_values, predicted_values, convertExp=True):
    """
    Root mean squared log error: convert the values to log scale, take the squared
    differences, average them, and take the square root.
    Because np.log1p was applied to the target to reduce skewness, the values have to be
    transformed back (np.expm1) before the metric is computed on the original scale.
    """
    if convertExp:
        predicted_values = np.expm1(predicted_values)
        actual_values = np.expm1(actual_values)
    log_predicted_values = np.log1p(np.array(predicted_values))
    log_actual_values = np.log1p(np.array(actual_values))
    # subtract the actual values from the predicted values and square the result
    difference = np.square(log_predicted_values - log_actual_values)
    return np.sqrt(difference.mean())
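# Sanity check for rmsle (a minimal sketch on toy arrays, not competition data): with
# convertExp=False it should match the square root of sklearn's mean_squared_log_error.
from sklearn.metrics import mean_squared_log_error
_toy_actual = np.array([3.0, 5.0, 2.5, 7.0])
_toy_predicted = np.array([2.5, 5.0, 4.0, 8.0])
assert np.isclose(rmsle(_toy_actual, _toy_predicted, convertExp=False),
                  np.sqrt(mean_squared_log_error(_toy_actual, _toy_predicted)))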
def preprocessingX(input_df):
    r_df = input_df.copy()
    r_df['datetime'] = pd.to_datetime(r_df['datetime'])
    r_df['weekday'] = r_df['datetime'].apply(lambda dt: dt.weekday())
    r_df['year'] = r_df['datetime'].apply(lambda dt: dt.year)
    r_df['month'] = r_df['datetime'].apply(lambda dt: dt.month)
    r_df['days'] = r_df['datetime'].apply(lambda dt: dt.day)
    r_df['hour'] = r_df['datetime'].apply(lambda dt: dt.hour)
    # days elapsed since the first timestamp in the frame, plus coarser 30-day and 180-day buckets
    r_df['day_from_start'] = (r_df['datetime'] - r_df['datetime'].iloc[0]).apply(lambda td: td.days)
    r_df['day_from_start//30'] = r_df['day_from_start'] // 30
    r_df['day_from_start//180'] = r_df['day_from_start'] // 180
    r_df['non_windspeed'] = r_df['windspeed'] == 0
    # these columns only exist in the training data
    r_df = r_df.drop(columns=['registered', 'casual'], errors='ignore')
    del r_df['datetime']
    ## making categorical columns
    for col in ['holiday', 'season', 'workingday', 'hour']:
        r_df = r_df.join(pd.get_dummies(r_df[col], prefix=col))
    r_df_v = r_df.values
    #r_df_v = RobustScaler().fit_transform(r_df_v)
    ##r_df_v = MinMaxScaler().fit_transform(r_df_v)
    return pd.DataFrame(r_df_v, columns=r_df.columns)
# build the feature matrix (drop the target so it never leaks into the features)
x = preprocessingX(train_df.drop(columns=['count']))
y = train_df['count']
y_log = np.log1p(y)
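# Illustrative check (optional): log1p should pull the heavily right-skewed count
# distribution toward symmetric; pandas' skew() makes this easy to confirm.
print("target skew before log1p:", y.skew(), "after:", y_log.skew())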
x_train, x_test, y_train, y_test = train_test_split(x.values, y_log, train_size=0.8, test_size=0.2, random_state=42)
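# Optional baseline (a sketch): the hold-out split above can be used to score a plain
# LinearRegression before the grid search, giving a floor for the RMSLE to beat.
baseline = LinearRegression().fit(x_train, y_train)
print("baseline hold-out RMSLE:", rmsle(y_test, baseline.predict(x_test)))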
#reg = GradientBoostingRegressor(n_estimators=5000, alpha=0.01)
#reg = MLPRegressor(hidden_layer_sizes=[512, 64, 4], max_iter=1000, alpha=0.005, random_state=42)
#reg = GridSearchCV( Lasso(), { 'max_iter':[3000], 'alpha':1/np.array([0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000])}, cv=5)
#reg = GridSearchCV(GradientBoostingRegressor(), {"n_estimators":[5, 500, 1000], 'alpha':[0.001, 0.01, 0.1, 0.5]}, scoring=make_scorer(rmsle))
#reg = GridSearchCV(GradientBoostingRegressor(), {"n_estimators": [4000], 'alpha': [0.01]}, cv=3,
#                   scoring=make_scorer(rmsle, greater_is_better=False))
# RMSLE is an error (lower is better), so the scorer must be negated for GridSearchCV to pick the best model
reg = GridSearchCV(RandomForestRegressor(), {"n_estimators": [5, 50, 100, 500]}, cv=3,
                   scoring=make_scorer(rmsle, greater_is_better=False))
reg.fit(x.values, y_log)
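# After the search finishes, the chosen hyper-parameters and the best cross-validated
# RMSLE can be inspected; with greater_is_better=False the stored score is negated.
print("best params:", reg.best_params_)
print("best CV RMSLE:", -reg.best_score_)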
## submission
submit_df = pd.DataFrame({'datetime': test_df['datetime'],
                          # np.expm1 undoes the np.log1p transform applied to the target; clip negatives to zero
                          'count': [max(0, c) for c in np.expm1(reg.predict(preprocessingX(test_df)))],
                          #'count': list(map(lambda c: round(c, 0), np.expm1(reg.predict(preprocessingX(test_df)))))
                          })
submit_df.to_csv('bycicle.csv', index=False)
print("----complete----")