import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fastai.imports import *
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import zipfile,kaggle
import os
XGBoost
# Plot styling shared by the rest of the notebook: keep seaborn's current
# colour palette for later use and switch matplotlib to 'fivethirtyeight'.
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
from nbdevAuto.functions import *
import nbdevAuto.functions
# Dataset coordinates: `user` and `name` are passed to the download helper,
# `path` is the local folder the CSV is read from afterwards.
name = 'hourly-energy-consumption'
path = Path(f'Data/{name}')
user = 'robikscube'
# NOTE(review): kaggle_dataset_download presumably comes from
# nbdevAuto.functions (star-imported above) and is assumed to place the
# files under `path` -- confirm against that helper.
kaggle_dataset_download(user=user,
                        name=name)
# Load the hourly PJME series and index it by timestamp so that the
# date-based slicing and the `.index.hour`-style accessors below work.
df = pd.read_csv(f'{path}/PJME_hourly.csv')
df = df.set_index('Datetime')
df.index = pd.to_datetime(df.index)
# Overview scatter of the full series, drawn in the first palette colour.
df.plot(style='.',
        figsize=(15, 5),
        color=color_pal[0],
        title='PJME Energy Use in MW')
plt.show()
Train / Test Split
# Chronological split: rows before 2015-01-01 train the model, rows from
# that date onwards are held out as the test set.
train = df.loc[df.index < '01-01-2015']
test = df.loc[df.index >= '01-01-2015']
# Show both partitions on one axis, with a dashed line at the split date.
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
ax.axvline('01-01-2015', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()

# Zoom in on the rows strictly between 2010-01-01 and 2010-01-08.
df.loc[(df.index > '01-01-2010') & (df.index < '01-08-2010')] \
    .plot(figsize=(15, 5), title='Week Of Data')
plt.show()
Feature Creation
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create time series features based on time series index.

    Args:
        df: Frame whose index exposes the datetime accessors used below
            (``hour``, ``dayofweek``, ``isocalendar()``, ...), e.g. a
            ``DatetimeIndex``.

    Returns:
        A copy of ``df`` with the columns ``hour``, ``dayofweek``,
        ``quarter``, ``month``, ``year``, ``dayofyear``, ``dayofmonth`` and
        ``weekofyear`` appended. The input frame is not modified.
    """
    # Work on a copy so the caller's frame (often a slice of another frame)
    # is never mutated.
    df = df.copy()
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    # ISO-8601 week number, taken from the index's isocalendar() table.
    df['weekofyear'] = df.index.isocalendar().week
    return df
# Add the calendar feature columns to the full frame for the plots below.
df = create_features(df)
Visualize our Feature / Target Relationship
# Distribution of the target for each hour of the day.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='hour', y='PJME_MW')
ax.set_title('MW by Hour')
plt.show()
# Distribution of the target for each calendar month.
fig, ax = plt.subplots(figsize=(10, 8))
# seaborn deprecates `palette` without `hue`; its warning recommends
# assigning the x variable to `hue` and disabling the legend, which gives
# the same picture.
sns.boxplot(data=df, x='month', y='PJME_MW', hue='month',
            palette='Blues', legend=False)
ax.set_title('MW by Month')
plt.show()
/tmp/ipykernel_109462/958582662.py:3: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
sns.boxplot(data=df, x='month', y='PJME_MW', palette='Blues')
Create our Model
# Add the calendar features to each partition separately.
train = create_features(train)
test = create_features(test)

# Model inputs and the column to predict. 'dayofmonth' and 'weekofyear'
# are created by create_features but are not used as inputs here.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'PJME_MW'

X_train = train[FEATURES]
y_train = train[TARGET]

X_test = test[FEATURES]
y_test = test[TARGET]
# Gradient-boosted tree regressor: up to 1000 shallow trees at a small
# learning rate, stopping after 50 rounds without improvement.
# 'reg:squarederror' replaces 'reg:linear', which XGBoost reports as
# deprecated in favor of it.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
# Both partitions are passed as evaluation sets so train and test RMSE are
# logged every 100 rounds.
# NOTE(review): early stopping is presumably driven by the last eval_set
# entry (the test split), which would make the test score optimistic --
# confirm and consider a separate validation split.
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)
[0] validation_0-rmse:32605.13970 validation_1-rmse:31657.15729
/home/thekkel/mambaforge/envs/cfast/lib/python3.11/site-packages/xgboost/core.py:160: UserWarning: [17:58:09] WARNING: /workspace/src/objective/regression_obj.cu:209: reg:linear is now deprecated in favor of reg:squarederror.
warnings.warn(smsg, UserWarning)
[100] validation_0-rmse:12584.35462 validation_1-rmse:11747.28803
[200] validation_0-rmse:5837.33066 validation_1-rmse:5363.58554
[300] validation_0-rmse:3923.28511 validation_1-rmse:4020.48045
[400] validation_0-rmse:3447.54638 validation_1-rmse:3860.60088
[500] validation_0-rmse:3288.19208 validation_1-rmse:3816.37862
[600] validation_0-rmse:3206.55619 validation_1-rmse:3779.04119
[700] validation_0-rmse:3153.61368 validation_1-rmse:3754.45684
[800] validation_0-rmse:3114.34038 validation_1-rmse:3738.38209
[900] validation_0-rmse:3084.39550 validation_1-rmse:3730.01893
[989] validation_0-rmse:3059.85847 validation_1-rmse:3727.94591
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=50, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=1000, n_jobs=None, num_parallel_tree=None, objective='reg:linear', ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=50, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=1000, n_jobs=None, num_parallel_tree=None, objective='reg:linear', ...)
Feature Importance
# One importance value per input feature, labelled with the feature names
# recorded by the fitted model, plotted in ascending order.
fi = pd.DataFrame(data=reg.feature_importances_,
                  index=reg.feature_names_in_,
                  columns=['importance'])
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
Forecast on Test
# Predict on the held-out rows and attach the predictions to the full frame.
# The left join on the index leaves rows outside the test period with NaN
# in 'prediction'.
test['prediction'] = reg.predict(X_test)
df = df.merge(test[['prediction']], how='left', left_index=True, right_index=True)
# Overlay the predictions (dots) on the full target series.
ax = df[['PJME_MW']].plot(figsize=(15, 5))
df['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Dat and Prediction')
plt.show()
# Zoom in on the rows strictly between 2018-04-01 and 2018-04-18: the
# truth is drawn first, then the predictions as dots.
ax = df.loc[(df.index > '04-01-2018') & (df.index < '04-18-2018')]['PJME_MW'] \
    .plot(figsize=(15, 5), title='Week Of Data')
df.loc[(df.index > '04-01-2018') & (df.index < '04-18-2018')]['prediction'] \
    .plot(style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()
Score (RMSE)
# Root-mean-squared error of the predictions over the test period.
score = np.sqrt(mean_squared_error(test['PJME_MW'], test['prediction']))
print(f'RMSE Score on Test set: {score:0.2f}')
RMSE Score on Test set: 3726.80
Calculate Error
- Look at the worst and best predicted days
# Absolute error per hourly row, then the mean error per calendar date.
test['error'] = np.abs(test[TARGET] - test['prediction'])
test['date'] = test.index.date
# The ten dates with the largest mean absolute error (worst predicted days).
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)
date
2016-08-13 12879.484619
2016-08-14 12772.887207
2015-02-20 11186.031494
2016-09-09 10966.513102
2016-09-10 10889.102214
2018-01-06 10642.975830
2016-08-12 10041.172689
2015-02-21 9988.168783
2015-02-16 9900.809326
2018-01-07 9852.571370
Name: error, dtype: float64
Next Steps
- More robust cross validation
- Add more features (weather forecast, holidays)