import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
# this function is only for plotting feature importances
def plot_feature(alg, dtrain, top=25):
    feat_imp = pd.Series(alg.feature_importances_, index=dtrain.columns).sort_values(ascending=False)[:top]
    feat_imp.plot(kind='bar', title='Top {} Feature Importances'.format(top))
    plt.ylabel('Feature Importance Score')
    plt.show()
# model evaluation: cross-validated score, training time, and held-out test error
def eva(model, nfold=5):
    scores = cross_val_score(model, X_train, y_train, cv=nfold, scoring='neg_mean_squared_error')
    print("cv score {0}, cv std {1}".format(scores.mean(), scores.std()))
    train_start_time = time.time()
    model.fit(X_train, y_train)
    train_elapsed = time.time() - train_start_time
    print("training time {}".format(train_elapsed))
    test_start_time = time.time()
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    test_elapsed = time.time() - test_start_time
    print("test time {}".format(test_elapsed))
    print("test mean squared error {}".format(score))
house=pd.read_csv("../data/data.csv")
# date: parse the sale timestamp and derive year / month / weekday features
house['date'] = pd.to_datetime(house['date'], format='%Y%m%dT%H%M%S')
house['sale_year'] = house['date'].dt.year
house['sale_month'] = house['date'].dt.month
house['sale_weekday'] = house['date'].dt.dayofweek
# location
house['zipcode'] =house['zipcode'].astype('category')
del house['id']
house['log_price'] = np.log(house['price'])  # prices are roughly log-normally distributed, so model log(price)
Use 1/3 of the data as the test set and 2/3 as the training set.
X_data = house.drop(columns=['price','log_price','date'])
y_data = house['log_price']
X_train, X_test, y_train, y_test=train_test_split(X_data,y_data,test_size=0.33, random_state=42)
The objective function is $ \frac{1}{2n} \|y - X\beta\|^2_2 + \alpha \cdot \text{l1\_ratio} \cdot \|\beta\|_1 + \frac{1}{2}\,\alpha\,(1 - \text{l1\_ratio})\,\|\beta\|^2_2 $. If the penalty is written with separate strengths $a\,\|\beta\|_1 + \frac{1}{2}\,b\,\|\beta\|^2_2$, then
$$\alpha = a + b \quad \text{and} \quad \text{l1\_ratio} = \frac{a}{a + b}$$
alpha controls the overall penalty strength, while l1_ratio controls the balance between the L1 and L2 terms.
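To make the parameterisation concrete, here is a minimal sketch; the strengths a and b below are arbitrary illustrative values, not derived from the data.
# hypothetical separate penalty strengths: a for the L1 term, b for the (1/2)·L2 term
a, b = 0.04, 0.06
alpha = a + b               # total penalty strength -> 0.10
l1_ratio = a / (a + b)      # share of the penalty that is L1 -> 0.4
# l1_ratio = 1 recovers the Lasso; l1_ratio = 0 is a ridge-style penalty
example_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=5000)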
elastic_model = ElasticNet(max_iter=5000)
param_test = {
    "alpha": [0.01, 0.05, 0.1],
    'l1_ratio': [0.01, 0.5, 1],
    'fit_intercept': [True, False],
    # 'normalize' has been removed from ElasticNet in recent scikit-learn releases;
    # standardise the features beforehand (e.g. StandardScaler) if scaling is needed.
}
gsearch = GridSearchCV(estimator=elastic_model,
                       param_grid=param_test, scoring='neg_mean_squared_error',
                       n_jobs=4, cv=5, verbose=0)
gsearch.fit(X_train,y_train)
gsearch.best_params_, gsearch.best_score_
elastic_model.set_params(**gsearch.best_params_)
eva(elastic_model)
rf_model=RandomForestRegressor()
eva(rf_model) # untuned random forest results
The objective function is the L2 loss. The pseudocode below is LightGBM's Gradient-based One-Side Sampling (GOSS):

Input: I: training data, d: iterations
Input: a: sampling ratio of large-gradient data
Input: b: sampling ratio of small-gradient data
Input: loss: loss function, L: weak learner
models ← {}, fact ← (1 − a)/b
topN ← a × len(I), randN ← b × len(I)
for i = 1 to d do
    preds ← models.predict(I)
    g ← loss(I, preds), w ← {1, 1, ...}
    sorted ← GetSortedIndices(abs(g))
    topSet ← sorted[1:topN]
    randSet ← RandomPick(sorted[topN:len(I)], randN)
    usedSet ← topSet + randSet
    w[randSet] ×= fact        ▷ assign weight fact to the small-gradient data
    newModel ← L(I[usedSet], −g[usedSet], w[usedSet])
    models.append(newModel)
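A rough NumPy sketch of the sampling step above; the function name goss_sample and the default ratios are made up for illustration, and this is not LightGBM's internal implementation.
def goss_sample(gradients, a=0.2, b=0.1, seed=42):
    # Gradient-based One-Side Sampling: keep the top a-fraction of instances by
    # |gradient|, randomly keep a b-fraction of the rest, and up-weight the latter.
    rng = np.random.default_rng(seed)
    n = len(gradients)
    top_n, rand_n = int(a * n), int(b * n)
    fact = (1 - a) / b                        # weight multiplier for small-gradient data
    order = np.argsort(-np.abs(gradients))    # indices sorted by |gradient|, descending
    top_set = order[:top_n]                   # large-gradient instances are always kept
    rand_set = rng.choice(order[top_n:], size=rand_n, replace=False)
    used = np.concatenate([top_set, rand_set])
    weights = np.ones(n)
    weights[rand_set] *= fact                 # compensate for down-sampling the small gradients
    return used, weights[used]

# toy usage on 1,000 fake gradients
toy_g = np.random.default_rng(0).normal(size=1000)
idx, w = goss_sample(toy_g)
print(len(idx), w[:5])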
LightGBM offers good accuracy with integer-encoded categorical features.
LightGBM uses the technique of Fisher (1958) to find the optimal split over categories, which often performs better than one-hot encoding.
Categorical features must be encoded as non-negative integers (int) less than Int32.MaxValue (2147483647). It is best to use a contiguous range of integers.
Use categorical_feature to specify the categorical features. Refer to the parameter categorical_feature in Parameters.
For a categorical feature with high cardinality (#category is large), it often works best to treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or by embedding the categories in a low-dimensional numeric space.
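A small sketch of the two usual ways to declare categorical features; zipcode is the only categorical column in this dataset, and the demo_* names are made up.
# Option 1: pandas 'category' dtype -- the sklearn wrapper picks it up automatically
# (zipcode was cast to 'category' earlier, and categorical_feature defaults to 'auto').
demo_model = LGBMRegressor()
demo_model.fit(X_train, y_train)

# Option 2: name the columns explicitly when building a Dataset for the native API.
demo_train = lgb.Dataset(X_train, label=y_train,
                         categorical_feature=['zipcode'],
                         free_raw_data=False)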
lgb_model=LGBMRegressor()
lgbm_params = {
    "n_estimators": 3000,
    "boosting_type": "gbdt",
    # "application": "regression",   # alias for objective; regression is the default
    "learning_rate": 0.1,
    "min_data_in_leaf": 80,          # helps control over-fitting
    "num_leaves": 50,
    # "min_data_per_group": [10, 30, 50],
    # "cat_smooth": [0, 0.5, 1],
    "max_depth": -1,
    # "scale_pos_weight": 2,
    # "drop_rate": 0.02,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "metric": "rmse",
    "min_split_gain": 0.0,
    # "colsample_bytree": 0.0,
    "save_binary": True,
    "max_bin": 100
}
lgb_model.set_params(**lgbm_params) #base model
lgbm_train = lgb.Dataset(data=X_train,
                         label=y_train,
                         # categorical_feature=cat_col,
                         free_raw_data=False)
cv_results = lgb.cv(train_set=lgbm_train,
                    params=lgbm_params,
                    nfold=5,
                    num_boost_round=1000,
                    early_stopping_rounds=200,
                    stratified=False,
                    # objective="regression",
                    verbose_eval=50,
                    metrics=['mse'])
# the result lists from lgb.cv stop at the best iteration, so the optimal round count is argmin + 1
optimum_boost_rounds = int(np.argmin(cv_results['l2-mean'])) + 1
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best LGBM CV result = {}'.format(np.min(cv_results['l2-mean'])))
num_leaves: the main parameter to control the complexity of the tree model. Theoretically, num_leaves = 2^(max_depth) gives the same number of leaves as a depth-wise tree of that depth (see the short sketch after this list).
min_data_in_leaf: a very important parameter to prevent over-fitting in a leaf-wise tree. Its optimal value depends on the number of training samples and on num_leaves. Setting it to a large value avoids growing overly deep trees but may cause under-fitting; in practice, hundreds or thousands is enough for a large dataset.
max_depth: can also be used to limit the tree depth explicitly.
min_data_per_group, cat_smooth: used to deal with over-fitting when #data is small or #category is large.
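A tiny sketch of the num_leaves / max_depth relationship; the numbers are arbitrary and only illustrate the bound.
demo_max_depth = 7
full_tree_leaves = 2 ** demo_max_depth   # 128 leaves: a fully grown depth-7 depth-wise tree
demo_params = {
    "max_depth": demo_max_depth,
    "num_leaves": 50,            # deliberately well below 2**max_depth to limit complexity
    "min_data_in_leaf": 80,      # larger values guard a leaf-wise tree against over-fitting
}
print(full_tree_leaves, demo_params)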
lgb_model.set_params(n_estimators=122)
param_test1 = {
    "num_leaves": [10, 30, 50],
    "cat_smooth": [0, 0.5, 1],
    'min_data_in_leaf': list(range(1, 100, 20))
}
gsearch1 = GridSearchCV(estimator=lgb_model,
                        param_grid=param_test1, scoring='neg_mean_squared_error',
                        n_jobs=4, cv=5, verbose=0)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_
lgb_model.set_params(**gsearch1.best_params_)
eva(lgb_model)
plot_feature(lgb_model,dtrain=X_train)
Important features