import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile, kaggle
from fastai.imports import *
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from nbdevAuto.functions import *
import nbdevAuto.functions
XGBoost: Forecast Energy Consumption
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
name = 'hourly-energy-consumption'
path = Path(f'Data/{name}')
user = 'robikscube'
kaggle_dataset_download(user=user,
                        name=name)
file exists
df = pd.read_csv(f'{path}/PJME_hourly.csv')
df = df.set_index('Datetime')
df.index = pd.to_datetime(df.index)
df.plot(style='.',
        figsize=(15, 5),
        color=color_pal[0],
        title='PJME Energy Use in MW')
plt.show()
Outlier Analysis and Removal
df['PJME_MW'].plot(kind='hist', bins=500)
df.query('PJME_MW < 19_000')['PJME_MW'] \
    .plot(style='.',
          figsize=(15, 5),
          color=color_pal[5],
          title='Outliers')
df = df.query('PJME_MW > 19_000').copy()
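The 19,000 MW cutoff is read off the histogram by eye. If you would rather not hardcode the threshold, a quantile-based cut is one alternative; a minimal sketch, assuming a 0.02% lower quantile catches the same handful of anomalous low readings:

# Hypothetical alternative: derive the low cutoff from the data itself
# rather than eyeballing the histogram. 0.0002 is an assumed quantile.
low_cut = df['PJME_MW'].quantile(0.0002)
df = df[df['PJME_MW'] > low_cut].copy()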
Reviewing: Train / Test Split
train = df.loc[df.index < '01-01-2015']
test = df.loc[df.index >= '01-01-2015']
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
ax.axvline('01-01-2015', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()
1. Time Series Cross Validation
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)  # 1-year folds, 24 h gap between train and validation
df = df.sort_index()
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)

fold = 0
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['PJME_MW'].plot(ax=axs[fold],
                          label='Training Set',
                          title=f'Data Train/Test Split Fold {fold}')
    test['PJME_MW'].plot(ax=axs[fold],
                         label='Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
    fold += 1
plt.show()
2. Forecasting Horizon Explained
The forecast horizon is the length of time into the future for which forecasts are prepared. Horizons generally range from short term (less than three months) to long term (more than two years).
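In this notebook the horizon is one year ahead at hourly resolution, which is also the size of each validation fold. A minimal sketch of that correspondence (illustrative, not from the original notebook):

# One year ahead at hourly resolution: 24 * 365 = 8760 forecast steps,
# the same value passed as test_size to TimeSeriesSplit above.
horizon_hours = 24 * 365
print(horizon_hours)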
def create_features(df):
    """
    Create time series features based on time series index.
    """
    df = df.copy()
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    df['weekofyear'] = df.index.isocalendar().week
    return df

df = create_features(df)
3. Lag Features
- What was the target (x) days in the past?
def add_lags(df):
    target_map = df['PJME_MW'].to_dict()
    # 364 days = 52 whole weeks, so each lag lands on the same weekday
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(target_map)
    df['lag2'] = (df.index - pd.Timedelta('728 days')).map(target_map)
    df['lag3'] = (df.index - pd.Timedelta('1092 days')).map(target_map)
    return df

df = add_lags(df)
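Because 364 days is exactly 52 weeks, each lag lands on the same weekday one, two, or three years earlier. A quick spot-check (illustrative, not in the original notebook):

# Any timestamp and its 364-day lag share the same day of week.
ts = pd.Timestamp('2016-07-04 17:00')
print(ts.dayofweek, (ts - pd.Timedelta('364 days')).dayofweek)  # 0 0 (both Mondays)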
Train Using Cross Validation
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()

fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]

    train = create_features(train)
    test = create_features(test)

    FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
                'lag1', 'lag2', 'lag3']
    TARGET = 'PJME_MW'

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:linear',  # deprecated alias of 'reg:squarederror'
                           max_depth=3,
                           learning_rate=0.01)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
[0] validation_0-rmse:32732.49608 validation_1-rmse:31956.60163
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:58:44] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12532.64369 validation_1-rmse:11906.14134
[200] validation_0-rmse:5747.92495 validation_1-rmse:5359.26490
[300] validation_0-rmse:3872.48134 validation_1-rmse:3900.86965
[400] validation_0-rmse:3434.23853 validation_1-rmse:3762.33705
[442] validation_0-rmse:3369.34730 validation_1-rmse:3764.82810
[0] validation_0-rmse:32672.16678 validation_1-rmse:32138.89241
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:58:47] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12513.65574 validation_1-rmse:12224.93373
[200] validation_0-rmse:5753.34937 validation_1-rmse:5662.07107
[300] validation_0-rmse:3902.71304 validation_1-rmse:3933.73076
[400] validation_0-rmse:3476.90515 validation_1-rmse:3590.55005
[500] validation_0-rmse:3353.72424 validation_1-rmse:3516.39915
[600] validation_0-rmse:3297.94766 validation_1-rmse:3481.94003
[700] validation_0-rmse:3258.48267 validation_1-rmse:3461.37383
[800] validation_0-rmse:3221.51553 validation_1-rmse:3436.49603
[900] validation_0-rmse:3190.11480 validation_1-rmse:3428.88699
[999] validation_0-rmse:3166.16314 validation_1-rmse:3420.31309
[0] validation_0-rmse:32631.20370 validation_1-rmse:31073.29733
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:58:52] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12499.28425 validation_1-rmse:11136.70202
[200] validation_0-rmse:5750.81453 validation_1-rmse:4813.22087
[300] validation_0-rmse:3917.04200 validation_1-rmse:3553.46419
[400] validation_0-rmse:3494.55924 validation_1-rmse:3495.32356
[410] validation_0-rmse:3476.66883 validation_1-rmse:3502.25535
[0] validation_0-rmse:32528.44438 validation_1-rmse:31475.39670
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:58:55] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12462.36581 validation_1-rmse:12020.28283
[200] validation_0-rmse:5738.57925 validation_1-rmse:5796.45874
[300] validation_0-rmse:3918.53218 validation_1-rmse:4388.39477
[400] validation_0-rmse:3501.24270 validation_1-rmse:4173.36380
[500] validation_0-rmse:3384.02490 validation_1-rmse:4119.56538
[600] validation_0-rmse:3325.50024 validation_1-rmse:4105.01446
[700] validation_0-rmse:3282.73755 validation_1-rmse:4091.23557
[800] validation_0-rmse:3250.37610 validation_1-rmse:4083.12690
[900] validation_0-rmse:3223.87814 validation_1-rmse:4081.46154
[999] validation_0-rmse:3199.82843 validation_1-rmse:4052.57120
[0] validation_0-rmse:32462.05557 validation_1-rmse:31463.90500
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:59:02] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12445.87740 validation_1-rmse:11963.42706
[200] validation_0-rmse:5752.44568 validation_1-rmse:5611.92884
[300] validation_0-rmse:3951.51709 validation_1-rmse:4156.41403
[400] validation_0-rmse:3539.25569 validation_1-rmse:4006.58873
[440] validation_0-rmse:3479.88018 validation_1-rmse:4011.05271
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
Score across folds 3742.5833
Fold scores:[3760.8277187583353, 3420.313091887879, 3478.018038580526, 4052.5712055405547, 4001.186553933809]
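The mean hides some spread across the folds (the RMSEs above range from roughly 3420 to 4053); a one-line addition (not in the original notebook) reports it:

# Standard deviation of the per-fold RMSEs, to accompany the mean.
print(f'Std across folds {np.std(scores):0.4f}')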
4. Predicting the Future
- Retraining on all data
- To predict the future we need an empty dataframe covering the future date range.
- Run those dates through our feature creation and lag creation code.
# Retrain on all data
df = create_features(df)

FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
            'lag1', 'lag2', 'lag3']
TARGET = 'PJME_MW'

X_all = df[FEATURES]
y_all = df[TARGET]

reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=500,  # no early stopping here: there is no held-out set
                       objective='reg:linear',  # deprecated alias of 'reg:squarederror'
                       max_depth=3,
                       learning_rate=0.01)
reg.fit(X_all, y_all,
        eval_set=[(X_all, y_all)],
        verbose=100)
[0] validation_0-rmse:32403.88991
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:59:05] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12426.83220
[200] validation_0-rmse:5751.73275
[300] validation_0-rmse:3971.53256
[400] validation_0-rmse:3571.21833
[499] validation_0-rmse:3456.76877
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=500, n_jobs=None, num_parallel_tree=None, objective='reg:linear', ...)
df.index.max()
Timestamp('2018-08-03 00:00:00')
# Create future dataframe
future = pd.date_range('2018-08-03', '2019-08-01', freq='1h')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)

future_w_features = df_and_future.query('isFuture').copy()
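A quick sanity check before predicting (illustrative, not in the original notebook): lag1 looks back 364 days, and the future range ends less than 364 days after the last observation, so the lag should be fully populated:

# Fraction of future rows with a usable lag1 value; expect 1.0 here.
print(future_w_features['lag1'].notna().mean())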
Predict the future
future_w_features['pred'] = reg.predict(future_w_features[FEATURES])

future_w_features['pred'].plot(figsize=(10, 5),
                               color=color_pal[4],
                               ms=1,
                               lw=1,
                               title='Future Predictions')
plt.show()
Bonus: Saving the Model for Later
# Save model
reg.save_model('model.json')

reg_new = xgb.XGBRegressor()
reg_new.load_model('model.json')
future_w_features['pred'] = reg_new.predict(future_w_features[FEATURES])
future_w_features['pred'].plot(figsize=(10, 5),
                               color=color_pal[4],
                               ms=1, lw=1,
                               title='Future Predictions')
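To confirm the save/load round trip is lossless, the two models' outputs can be compared directly (illustrative check, not in the original notebook):

# The reloaded model should reproduce the original model's predictions exactly.
assert np.allclose(reg.predict(future_w_features[FEATURES]),
                   reg_new.predict(future_w_features[FEATURES]))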