|
@@ -11,7 +11,6 @@ from Dcel import update_excel_data
|
|
|
from Dtool import fill_missing_values, reverse_column
|
|
|
|
|
|
# ------------ 全局配置参数 -----------
|
|
|
-# 文件路径与输出设置
|
|
|
FILE_PATH = 'data_input/RBOB.xlsx'
|
|
|
OUTPUT_DAILY = 'eta/RBOB_Daily.xlsx'
|
|
|
OUTPUT_MONTHLY = 'eta/RBOB_Monthly.xlsx'
|
|
@@ -21,22 +20,14 @@ UPDATE_IDENTIFIER = "RBOB"
|
|
|
|
|
|
NUM_BOOST_ROUND = 1000
|
|
|
RANDOM_STATE = 42
|
|
|
-# 是否进行超参数调优
|
|
|
-USE_HYPERPARAM_TUNING = True
|
|
|
+USE_HYPERPARAM_TUNING = True # 若 False 则直接使用默认参数
|
|
|
|
|
|
-# 预测目标变量
|
|
|
TARGET_COL = '美国RBOB汽油裂解' # 预测目标
|
|
|
+TEST_PERIOD = 20 # 测试集样本数量
|
|
|
|
|
|
-# 测试集样本个数(TEST_PERIOD),当前默认为 1
|
|
|
-TEST_PERIOD = 20
|
|
|
+SEARCH_MODE = "random" # 可选 "grid"/"bayesian"/"random"
|
|
|
+SHOW_PLOTS = True # 是否显示最终预测图表
|
|
|
|
|
|
-# 搜索模式:选择 "grid"(网格搜索)、"bayesian"(贝叶斯搜索)或 "random"(随机搜索)
|
|
|
-SEARCH_MODE = "random" # 可修改为 "bayesian" 或 "random"
|
|
|
-
|
|
|
-# 是否显示最终预测结果图表,True 为展示,False 则不展示
|
|
|
-SHOW_PLOTS = True
|
|
|
-
|
|
|
-# 默认模型参数(当不进行调优时使用)
|
|
|
DEFAULT_PARAMS = {
|
|
|
'objective': 'reg:squarederror',
|
|
|
'learning_rate': 0.1309,
|
|
@@ -172,6 +163,7 @@ def detect_outliers_weights(X, weight_normal=1.0, weight_outlier=0.05, threshold
|
|
|
weights = np.where(outlier_mask, weight_outlier, weight_normal)
|
|
|
return weights
|
|
|
|
|
|
+# ------------ 模型训练 ------------
|
|
|
def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weights, use_tuning=True):
|
|
|
if use_tuning:
|
|
|
param_dist = {
|
|
@@ -187,45 +179,50 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
xgb_reg = XGBRegressor(objective='reg:squarederror', eval_metric='rmse',
|
|
|
n_estimators=NUM_BOOST_ROUND, seed=RANDOM_STATE)
|
|
|
tscv = TimeSeriesSplit(n_splits=3)
|
|
|
-
|
|
|
+ # 设置额外参数,其中 verbose=200 表示每200轮输出一次验证指标
|
|
|
+ extra_fit_params = {
|
|
|
+ 'eval_set': [(X_train_scaled, y_train), (X_test_scaled, y_test)],
|
|
|
+ 'early_stopping_rounds': 20,
|
|
|
+ 'verbose': 200
|
|
|
+ }
|
|
|
if SEARCH_MODE == "grid":
|
|
|
search = GridSearchCV(
|
|
|
estimator=xgb_reg,
|
|
|
param_grid=param_dist,
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
- verbose=1, # 显示详细信息
|
|
|
+ verbose=2,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
elif SEARCH_MODE == "bayesian":
|
|
|
search = BayesSearchCV(
|
|
|
estimator=xgb_reg,
|
|
|
search_spaces=param_dist,
|
|
|
- n_iter=50, # 增加搜索次数
|
|
|
+ n_iter=50,
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
random_state=RANDOM_STATE,
|
|
|
- verbose=1, # 更详细的训练进度信息
|
|
|
+ verbose=2,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
else:
|
|
|
search = RandomizedSearchCV(
|
|
|
estimator=xgb_reg,
|
|
|
param_distributions=param_dist,
|
|
|
- n_iter=50, # 增加搜索次数
|
|
|
+ n_iter=50,
|
|
|
scoring='neg_mean_squared_error',
|
|
|
cv=tscv,
|
|
|
random_state=RANDOM_STATE,
|
|
|
- verbose=1,
|
|
|
+ verbose=2,
|
|
|
n_jobs=-1
|
|
|
)
|
|
|
search.fit(X_train_scaled, y_train, sample_weight=weights)
|
|
|
best_model = search.best_estimator_
|
|
|
print("调优后的最佳参数:", search.best_params_)
|
|
|
- # 修改为使用 early_stopping_rounds
|
|
|
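+ # Refit the best estimator on the training data, evaluating on the test set and logging every 200 rounds (note: extra_fit_params, including early_stopping_rounds, is not passed to this call)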
|
|
|
best_model.fit(X_train_scaled, y_train,
|
|
|
- eval_set=[(X_test_scaled, y_test)],
|
|
|
- verbose=1)
|
|
|
+ eval_set=[(X_test_scaled, y_test)],
|
|
|
+ verbose=200)
|
|
|
else:
|
|
|
dtrain = xgb.DMatrix(X_train_scaled, label=y_train, weight=weights)
|
|
|
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
|
|
@@ -234,14 +231,12 @@ def train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weig
|
|
|
verbose_eval=False)
|
|
|
return best_model
|
|
|
|
|
|
-
|
|
|
# ------------ 模型评价与预测 ------------
|
|
|
def evaluate_and_predict(model, scaler, X_train, y_train, X_test, y_test, X_future, use_tuning=True):
|
|
|
X_train_trans = scaler.transform(X_train)
|
|
|
X_test_trans = scaler.transform(X_test)
|
|
|
X_future_trans = scaler.transform(X_future)
|
|
|
|
|
|
- # 根据模型类型判断
|
|
|
if isinstance(model, xgb.Booster):
|
|
|
y_train_pred = model.predict(xgb.DMatrix(X_train_trans))
|
|
|
y_test_pred = model.predict(xgb.DMatrix(X_test_trans))
|
|
@@ -267,13 +262,17 @@ def evaluate_and_predict(model, scaler, X_train, y_train, X_test, y_test, X_futu
|
|
|
|
|
|
return y_train_pred, y_test_pred, y_future_pred
|
|
|
|
|
|
-
|
|
|
# ------------ 结果后处理与保存 ------------
|
|
|
def merge_and_save_results(train_data, test_data, future_data, y_test_pred, y_future_pred):
|
|
|
+ # Fill the prediction columns of test_data and future_data;
|


+ # if future_data has a "完整数据_预测值" column, prefer it over y_future_pred
|
|
|
test_data = test_data.copy()
|
|
|
future_data = future_data.copy()
|
|
|
test_data['预测值'] = y_test_pred
|
|
|
- future_data['预测值'] = y_future_pred
|
|
|
+ if '完整数据_预测值' in future_data.columns:
|
|
|
+ future_data['预测值'] = future_data['完整数据_预测值']
|
|
|
+ else:
|
|
|
+ future_data['预测值'] = y_future_pred
|
|
|
|
|
|
train_data_2023 = train_data[train_data['Date'].dt.year >= 2023][['Date', TARGET_COL]]
|
|
|
test_actual = test_data[['Date', TARGET_COL]]
|
|
@@ -327,6 +326,9 @@ def plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
plt.plot(test_data['Date'], y_test, label='Test True', color='orange', alpha=0.7)
|
|
|
plt.plot(test_data['Date'], y_test_pred, label='Test Predicted', color='red')
|
|
|
plt.plot(future_data['Date'], future_data['预测值'], label='Future Prediction', color='purple')
|
|
|
+ # 新增使用全数据训练的预测结果,用黑线显示
|
|
|
+ if '完整数据_预测值' in future_data.columns:
|
|
|
+ plt.plot(future_data['Date'], future_data['完整数据_预测值'], label='Full Model Future Prediction', color='black')
|
|
|
plt.axvline(x=test_data['Date'].iloc[0], color='black', linestyle='--', label='Train/Test Split')
|
|
|
plt.axvline(x=last_day, color='gray', linestyle='--', label='Future Split')
|
|
|
plt.title('Prediction Visualization')
|
|
@@ -336,6 +338,27 @@ def plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
plt.grid(True)
|
|
|
plt.show()
|
|
|
|
|
|
+# ------------ 全数据训练及未来预测 ------------
|
|
|
def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future, params=None):
    """Refit an XGBoost regressor on train + test data and predict the future rows.

    Args:
        X_train, X_test: feature tables of the two splits; they are combined
            with ``pd.concat`` (assumed to be pandas objects -- confirm at
            the call site).
        y_train, y_test: targets matching ``X_train`` / ``X_test``.
        X_future: feature rows to predict once the model is refitted.
        params: optional dict of XGBRegressor keyword arguments, e.g. the
            best parameters found by a previous search. When ``None`` the
            module-level ``DEFAULT_PARAMS`` is used, which is what the
            original implementation always did.

    Returns:
        tuple: ``(full_model, y_future_full_pred, scaler_full)`` -- the fitted
        regressor, its predictions for ``X_future``, and the scaler that was
        fitted on the combined features.
    """
    # Merge both splits so the final model is trained on every labelled row.
    X_full = pd.concat([X_train, X_test])
    y_full = pd.concat([y_train, y_test])

    # A fresh scaler is fitted on the merged features and reused for X_future.
    scaler_full = StandardScaler().fit(X_full)
    X_full_scaled = scaler_full.transform(X_full)
    X_future_scaled = scaler_full.transform(X_future)

    # Work on a copy so the shared DEFAULT_PARAMS dict is never mutated.
    # Setting n_estimators through the dict (instead of as a second keyword)
    # avoids a "multiple values for keyword argument" TypeError when the
    # supplied parameters already contain that key.
    # NOTE(review): no seed is set here; the original comment implies the
    # parameter dict already carries it -- confirm against DEFAULT_PARAMS.
    model_params = dict(DEFAULT_PARAMS if params is None else params)
    model_params['n_estimators'] = NUM_BOOST_ROUND

    full_model = XGBRegressor(**model_params)
    full_model.fit(X_full_scaled, y_full)
    y_future_full_pred = full_model.predict(X_future_scaled)
    return full_model, y_future_full_pred, scaler_full
|
|
|
+
|
|
|
# ------------ 主函数 ------------
|
|
|
def main():
|
|
|
df_daily, last_day = load_and_preprocess_data(FILE_PATH)
|
|
@@ -343,26 +366,33 @@ def main():
|
|
|
scaler, X_train_scaled, X_test_scaled, X_future_scaled = scale_and_weight_features(X_train, X_test, X_future)
|
|
|
weights = detect_outliers_weights(X_train, weight_normal=1.0, weight_outlier=0.05, threshold=3)
|
|
|
|
|
|
- # 根据 USE_HYPERPARAM_TUNING 决定是否进行参数调优
|
|
|
model = train_model_with_tuning(X_train_scaled, y_train, X_test_scaled, y_test, weights,
|
|
|
use_tuning=USE_HYPERPARAM_TUNING)
|
|
|
|
|
|
y_train_pred, y_test_pred, y_future_pred = evaluate_and_predict(model, scaler, X_train, y_train, X_test, y_test, X_future,
|
|
|
use_tuning=USE_HYPERPARAM_TUNING)
|
|
|
|
|
|
- # 将预测结果添加到数据框中
|
|
|
- future_data = future_data.copy()
|
|
|
- future_data['预测值'] = y_future_pred
|
|
|
+ # 将预测结果添加到 test_data 和 future_data 中
|
|
|
test_data = test_data.copy()
|
|
|
test_data['预测值'] = y_test_pred
|
|
|
+ future_data = future_data.copy()
|
|
|
+ future_data['预测值'] = y_future_pred
|
|
|
+
|
|
|
+ # 训练全数据模型并预测未来
|
|
|
+ full_model, y_future_full_pred, scaler_full = train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future)
|
|
|
+ future_data['完整数据_预测值'] = y_future_full_pred
|
|
|
|
|
|
merged_df = merge_and_save_results(train_data, test_data, future_data, y_test_pred, y_future_pred)
|
|
|
update_excel(merged_df)
|
|
|
|
|
|
+
|
|
|
+ ### 画图展示
|
|
|
if SHOW_PLOTS:
|
|
|
plot_final_predictions(train_data, y_train, y_train_pred,
|
|
|
test_data, y_test, y_test_pred,
|
|
|
future_data, last_day)
|
|
|
+
|
|
|
+ print("全数据模型对未来数据的预测结果:", y_future_full_pred)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|