|
@@ -20,13 +20,15 @@ UPDATE_IDENTIFIER = "RBOB"
|
|
|
|
|
|
NUM_BOOST_ROUND = 1000
|
|
|
RANDOM_STATE = 42
|
|
|
-USE_HYPERPARAM_TUNING = True # 若 False 则直接使用默认参数
|
|
|
+USE_HYPERPARAM_TUNING = False # 若 False 则直接使用默认参数
|
|
|
|
|
|
TARGET_COL = '美国RBOB汽油裂解' # 预测目标
|
|
|
TEST_PERIOD = 20 # 测试集样本数量
|
|
|
|
|
|
SEARCH_MODE = "random" # 可选 "grid"/"bayesian"/"random"
|
|
|
-SHOW_PLOTS = True # 是否显示最终预测图表
|
|
|
+SHOW_PLOTS = True # 是否显示最终预测图表
|
|
|
+
|
|
|
+ADJUST_FULL_PREDICTIONS = True
|
|
|
|
|
|
DEFAULT_PARAMS = {
|
|
|
'objective': 'reg:squarederror',
|
|
@@ -99,7 +101,6 @@ def load_and_preprocess_data(file_path):
|
|
|
df_daily.rename(columns={'index': 'Date'}, inplace=True)
|
|
|
|
|
|
df_daily = fill_missing_values(df_daily, FILL_METHODS, return_only_filled=False)
|
|
|
-
|
|
|
for col, shift_days, new_col in SHIFT_CONFIG:
|
|
|
df_daily[new_col] = df_daily[col].shift(shift_days)
|
|
|
|
|
@@ -109,19 +110,15 @@ def load_and_preprocess_data(file_path):
|
|
|
|
|
|
df_daily = df_daily[(df_daily['Date'] >= '2009-08-01') & (df_daily['Date'] <= last_day_ext)]
|
|
|
df_daily = df_daily[df_daily['Date'].dt.dayofweek < 5]
|
|
|
-
|
|
|
for base_col, new_col in REVERSE_CONFIG:
|
|
|
df_daily[new_col] = reverse_column(df_daily, base_col)
|
|
|
-
|
|
|
for special_col, config in SPECIAL_REVERSE.items():
|
|
|
base_col = config['base_column']
|
|
|
condition_date = config['condition_date']
|
|
|
df_daily[special_col] = np.where(df_daily['Date'] >= condition_date,
|
|
|
df_daily[base_col],
|
|
|
np.nan)
|
|
|
-
|
|
|
df_daily = df_daily[(df_daily['Date'] > last_day) | df_daily[TARGET_COL].notna()]
|
|
|
-
|
|
|
return df_daily, last_day
|
|
|
|
|
|
# ------------ 数据划分与特征构建 ------------
|
|
@@ -140,13 +137,11 @@ def split_and_build_features(df_daily, last_day):
|
|
|
'美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天',
|
|
|
'美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01'
|
|
|
]
|
|
|
-
|
|
|
X_train = train_data[feature_columns]
|
|
|
y_train = train_data[TARGET_COL]
|
|
|
X_test = test_data[feature_columns]
|
|
|
y_test = test_data[TARGET_COL]
|
|
|
X_future = future_data[feature_columns]
|
|
|
-
|
|
|
return X_train, y_train, X_test, y_test, X_future, train_data, test_data, future_data
|
|
|
|
|
|
# ------------ 特征缩放与异常值检测 ------------
|
|
@@ -179,11 +174,10 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
xgb_reg = XGBRegressor(objective='reg:squarederror', eval_metric='rmse',
|
|
|
n_estimators=NUM_BOOST_ROUND, seed=RANDOM_STATE)
|
|
|
tscv = TimeSeriesSplit(n_splits=3)
|
|
|
- # 设置额外参数,其中 verbose=200 表示每200轮输出一次验证指标
|
|
|
extra_fit_params = {
|
|
|
'eval_set': [(X_train_scaled, y_train), (X_test_scaled, y_test)],
|
|
|
'early_stopping_rounds': 20,
|
|
|
- 'verbose': 200
|
|
|
+ 'verbose': 200 # 每200轮输出一次验证指标
|
|
|
}
|
|
|
if SEARCH_MODE == "grid":
|
|
|
search = GridSearchCV(
|
|
@@ -191,7 +185,7 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
param_grid=param_dist,
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
- verbose=2,
|
|
|
+ verbose=1,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
elif SEARCH_MODE == "bayesian":
|
|
@@ -202,7 +196,7 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
random_state=RANDOM_STATE,
|
|
|
- verbose=2,
|
|
|
+ verbose=1,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
else:
|
|
@@ -213,13 +207,12 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
random_state=RANDOM_STATE,
|
|
|
- verbose=2,
|
|
|
+ verbose=1,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
search.fit(X_train_scaled, y_train, sample_weight=weights)
|
|
|
best_model = search.best_estimator_
|
|
|
print("调优后的最佳参数:", search.best_params_)
|
|
|
- # 在二次拟合时使用 extra_fit_params 中的 early_stopping_rounds 和 verbose 控制输出频率
|
|
|
best_model.fit(X_train_scaled, y_train,
|
|
|
eval_set=[(X_test_scaled, y_test)],
|
|
|
verbose=200)
|
|
@@ -264,8 +257,6 @@ def evaluate_and_predict(model, scaler, X_train, y_train, X_test, y_test, X_futu
|
|
|
|
|
|
# ------------ 结果后处理与保存 ------------
|
|
|
def merge_and_save_results(train_data, test_data, future_data, y_test_pred, y_future_pred):
|
|
|
- # 更新 test_data 和 future_data 的预测列,
|
|
|
- # 如果 future_data 包含 "完整模型预测值" 则优先使用
|
|
|
test_data = test_data.copy()
|
|
|
future_data = future_data.copy()
|
|
|
test_data['预测值'] = y_test_pred
|
|
@@ -316,6 +307,12 @@ def update_excel(merged_df):
|
|
|
else:
|
|
|
print("数据更新失败,请检查错误信息")
|
|
|
|
|
|
def adjust_full_predictions(y_test, future_data):
    """Shift the full-data forecast so it starts at the last test-set actual.

    The gap between the last value of ``y_test`` and the first value of the
    ``'完整数据_预测值'`` column is added to every value in that column, so
    the first forecast point equals the last observed test value.

    Args:
        y_test: Actual target values of the test set; indexed positionally
            via ``.iloc`` (a pandas Series in the visible caller).
        future_data: DataFrame holding the ``'完整数据_预测值'`` column.
            It is modified in place.

    Returns:
        The same ``future_data`` object. It is returned unchanged when
        either input is empty or when the gap cannot be computed because
        one of the two anchor values is missing.

    Raises:
        KeyError: If ``future_data`` has no ``'完整数据_预测值'`` column.
    """
    column = '完整数据_预测值'
    forecast = future_data[column]

    # Positional access on an empty object raises IndexError; there is
    # nothing to align in that case, so leave the frame untouched.
    if len(y_test) == 0 or len(forecast) == 0:
        print("跳过完整数据预测值调整: 测试集或未来数据为空")
        return future_data

    gap = y_test.iloc[-1] - forecast.iloc[0]

    # A missing anchor value would turn every forecast into NaN; keep the
    # unadjusted forecast instead of wiping it out.
    if pd.isna(gap):
        print("跳过完整数据预测值调整: 末尾真实值或首个预测值缺失")
        return future_data

    future_data[column] = forecast + gap
    print(future_data[column])
    return future_data
|
|
|
+
|
|
|
# ------------ 最终预测结果可视化 ------------
|
|
|
def plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
test_data, y_test, y_test_pred,
|
|
@@ -323,10 +320,9 @@ def plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
plt.figure(figsize=(15, 6))
|
|
|
plt.plot(train_data['Date'], y_train, label='Train True', color='blue')
|
|
|
plt.plot(train_data['Date'], y_train_pred, label='Train Predicted', color='green')
|
|
|
- plt.plot(test_data['Date'], y_test, label='Test True', color='orange', alpha=0.7)
|
|
|
+ plt.plot(test_data['Date'], y_test, label='Test True', color='blue', alpha=0.7)
|
|
|
plt.plot(test_data['Date'], y_test_pred, label='Test Predicted', color='red')
|
|
|
plt.plot(future_data['Date'], future_data['预测值'], label='Future Prediction', color='purple')
|
|
|
- # 新增使用全数据训练的预测结果,用黑线显示
|
|
|
if '完整数据_预测值' in future_data.columns:
|
|
|
plt.plot(future_data['Date'], future_data['完整数据_预测值'], label='Full Model Future Prediction', color='black')
|
|
|
plt.axvline(x=test_data['Date'].iloc[0], color='black', linestyle='--', label='Train/Test Split')
|
|
@@ -340,7 +336,6 @@ def plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
|
|
|
# ------------ 全数据训练及未来预测 ------------
|
|
|
def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future):
|
|
|
- # 合并训练集和测试集
|
|
|
X_full = pd.concat([X_train, X_test])
|
|
|
y_full = pd.concat([y_train, y_test])
|
|
|
scaler_full = StandardScaler().fit(X_full)
|
|
@@ -349,11 +344,9 @@ def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future):
|
|
|
|
|
|
params = None
|
|
|
if USE_HYPERPARAM_TUNING:
|
|
|
- # 此处可使用之前调优获得的参数;演示中仍使用默认参数
|
|
|
params = None
|
|
|
if params is None:
|
|
|
params = DEFAULT_PARAMS
|
|
|
- # 只设置n_estimators,不再重复设置seed参数
|
|
|
full_model = XGBRegressor(**params, n_estimators=NUM_BOOST_ROUND)
|
|
|
full_model.fit(X_full_scaled, y_full)
|
|
|
y_future_full_pred = full_model.predict(X_future_scaled)
|
|
@@ -368,25 +361,24 @@ def main():
|
|
|
|
|
|
model = train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weights,
|
|
|
use_tuning=USE_HYPERPARAM_TUNING)
|
|
|
-
|
|
|
y_train_pred, y_test_pred, y_future_pred = evaluate_and_predict(model, scaler, X_train, y_train, X_test, y_test, X_future,
|
|
|
use_tuning=USE_HYPERPARAM_TUNING)
|
|
|
|
|
|
- # 将预测结果添加到 test_data 和 future_data 中
|
|
|
test_data = test_data.copy()
|
|
|
test_data['预测值'] = y_test_pred
|
|
|
future_data = future_data.copy()
|
|
|
future_data['预测值'] = y_future_pred
|
|
|
|
|
|
- # 训练全数据模型并预测未来
|
|
|
full_model, y_future_full_pred, scaler_full = train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future)
|
|
|
future_data['完整数据_预测值'] = y_future_full_pred
|
|
|
|
|
|
- merged_df = merge_and_save_results(train_data, test_data, future_data, y_test_pred, y_future_pred)
|
|
|
+
|
|
|
+ if ADJUST_FULL_PREDICTIONS:
|
|
|
+ future_data = adjust_full_predictions(y_test, future_data)
|
|
|
+
|
|
|
+ merged_df = merge_and_save_results(train_data, test_data, future_data, y_test_pred, y_future_full_pred)
|
|
|
update_excel(merged_df)
|
|
|
|
|
|
-
|
|
|
- ### 画图展示
|
|
|
if SHOW_PLOTS:
|
|
|
plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
test_data, y_test, y_test_pred,
|