Rbob.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. import pandas as pd
  2. import numpy as np
  3. import xgboost as xgb
  4. from xgboost import XGBRegressor
  5. from sklearn.metrics import mean_squared_error, r2_score
  6. import matplotlib.pyplot as plt
  7. from skopt import BayesSearchCV
  8. from sklearn.preprocessing import StandardScaler
  9. from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
  10. import argparse
  11. import itertools
  12. import random
  13. from skopt.space import Real, Integer, Categorical
  14. import json
  15. from Dtool import fill_missing_values, reverse_column
  16. from api import fetch_data_by_indicators
  17. # 添加命令行参数解析
  18. def parse_arguments():
  19. parser = argparse.ArgumentParser(description='RBOB汽油裂解预测模型')
  20. # XGBoost参数
  21. parser.add_argument('--objective', type=str, default='reg:squarederror', help='XGBoost目标函数')
  22. parser.add_argument('--learning_rate', type=float, default=0.1, help='学习率')
  23. parser.add_argument('--max_depth', type=int, default=8, help='最大树深度')
  24. parser.add_argument('--min_child_weight', type=float, default=3, help='最小子权重')
  25. parser.add_argument('--gamma', type=float, default=2, help='gamma参数')
  26. parser.add_argument('--subsample', type=float, default=0.85, help='子样本比例')
  27. parser.add_argument('--colsample_bytree', type=float, default=0.75, help='每棵树的列采样率')
  28. parser.add_argument('--eval_metric', type=str, default='rmse', help='评估指标')
  29. parser.add_argument('--seed', type=int, default=42, help='随机种子')
  30. parser.add_argument('--reg_alpha', type=float, default=0.45, help='L1正则化')
  31. parser.add_argument('--reg_lambda', type=float, default=1.29, help='L2正则化')
  32. parser.add_argument('--booster', type=str, default='gbtree', help='提升器类型')
  33. parser.add_argument('--tree_method', type=str, default='auto', help='树构建方法')
  34. parser.add_argument('--max_delta_step', type=int, default=0, help='最大步长')
  35. # 其他参数
  36. parser.add_argument('--num_boost_round', type=int, default=1000, help='提升迭代次数')
  37. parser.add_argument('--use_hyperparam_tuning', type=str, default='False', help='是否使用超参数调优')
  38. parser.add_argument('--output_prefix', type=str, default='', help='输出文件前缀,如传入1234则生成1234_update.xlsx')
  39. args = parser.parse_args()
  40. return args
  41. # 使用示例
  42. INDICATOR_IDS = ["RBWTICKMc1", "C2406121350446455",'USGGBE02 Index', "Cinjcjc4 index",'injcjc4 index','C2201059138_241106232710','C2406036178','C22411071623523660','C2312081670','REFOC-T-EIA_241114135248','C2304065621_241024124344','REFOC-T-EIA_241114135248','C22503031424010431']
  43. # 这些变量将在main函数中从命令行参数更新
  44. NUM_BOOST_ROUND = 1000
  45. RANDOM_STATE = 42
  46. USE_HYPERPARAM_TUNING = False # 若 False 则直接使用 xgb.train
  47. TARGET_COL = '美国RBOB汽油裂解'
  48. TEST_PERIOD = 20
  49. SEARCH_MODE = 'random' # 可选 'grid' / 'bayesian' / 'random'
  50. SHOW_PLOTS = True
  51. ADJUST_FULL_PREDICTIONS = True
  52. TARGET_NAME = '美国RBOB汽油裂解'
  53. CLASSIFICATION = '原油'
  54. MODEL_FRAMEWORK = 'XGBoost'
  55. CREATOR = '张立舟'
  56. #PRED_DATE = '2024/11/11'
  57. FREQUENCY = '月度'
  58. OUTPUT_PATH = 'update.xlsx'
  59. SIGNIFICANT_DIGITS = 5
  60. # XGBoost默认参数,将在main函数中从命令行参数更新
  61. DEFAULT_PARAMS = {
  62. 'objective': 'reg:squarederror',
  63. 'learning_rate': 0.1,
  64. 'max_depth': 8,
  65. 'min_child_weight': 3,
  66. 'gamma': 2,
  67. 'subsample': 0.85,
  68. 'colsample_bytree': 0.75,
  69. 'eval_metric': 'rmse',
  70. 'seed': 42,
  71. 'reg_alpha': 0.45,
  72. 'reg_lambda': 1.29,
  73. 'max_delta_step': 0,
  74. 'booster': 'gbtree',
  75. 'tree_method': 'auto'
  76. }
  77. # —— 因子预处理相关配置 ——
  78. FILL_METHODS = {
  79. '美国2年通胀预期': 'rolling_mean_5',
  80. '美国首次申领失业金人数/4WMA': 'interpolate',
  81. '道琼斯旅游与休闲/工业平均指数': 'interpolate',
  82. '美国EIA成品油总库存(预测/供应需求3年季节性)': 'interpolate',
  83. '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA': 'interpolate',
  84. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年': 'interpolate',
  85. '美国炼厂可用产能(路透)(预测)': 'interpolate',
  86. '美国炼厂CDU装置检修量(新)': 'interpolate',
  87. '美湾单位辛烷值价格(预测/季节性)': 'interpolate',
  88. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年': 'interpolate'
  89. }
  90. SHIFT_CONFIG = [
  91. ('美国2年通胀预期', 56, '美国2年通胀预期_提前56天'),
  92. ('美国首次申领失业金人数/4WMA', 100, '美国首次申领失业金人数/4WMA_提前100天'),
  93. ('美国首次申领失业金人数/4WMA', 112, '美国首次申领失业金人数/4WMA_提前112天'),
  94. ('道琼斯旅游与休闲/工业平均指数', 14, '道琼斯旅游与休闲/工业平均指数_提前14天'),
  95. ('美国EIA成品油总库存(预测/供应需求3年季节性)', 15,
  96. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天'),
  97. ('美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年',
  98. 30,
  99. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天'),
  100. ('美国炼厂CDU装置检修量(新)', 30, '美国炼厂CDU装置检修量(新)_提前30天'),
  101. ('美国炼厂可用产能(路透)(预测)', 100,
  102. '美国炼厂可用产能(路透)(预测)_提前100天')
  103. ]
  104. REVERSE_CONFIG = [
  105. ('美国首次申领失业金人数/4WMA',
  106. '美国首次申领失业金人数/4WMA_逆序'),
  107. ('美国首次申领失业金人数/4WMA_提前100天',
  108. '美国首次申领失业金人数/4WMA_提前100天_逆序'),
  109. ('美国首次申领失业金人数/4WMA_提前112天',
  110. '美国首次申领失业金人数/4WMA_提前112天_逆序'),
  111. ('美国EIA成品油总库存(预测/供应需求3年季节性)',
  112. '美国EIA成品油总库存(预测/供应需求3年季节性)_逆序'),
  113. ('美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天',
  114. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序'),
  115. ('美国炼厂可用产能(路透)(预测)_提前100天',
  116. '美国炼厂可用产能(路透)(预测)_逆序'),
  117. ('美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年',
  118. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序')
  119. ]
  120. SPECIAL_REVERSE = {
  121. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01': {
  122. 'base_column': '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序',
  123. 'condition_date': pd.Timestamp('2022-01-01')
  124. }
  125. }
  126. METRICS_JSON = 'model_metrics.json'
  127. # ------------ 数据加载与预处理 ------------
  128. def load_and_preprocess_data():
  129. # 直接从API获取数据
  130. df = fetch_data_by_indicators(INDICATOR_IDS)
  131. # print("Initial DataFrame columns:", df.columns)
  132. df.index = pd.to_datetime(df.index)
  133. df_daily = df.copy()
  134. df_daily['Date'] = df_daily.index
  135. df_daily = df_daily.reset_index(drop=True)
  136. #预处理流程
  137. df_daily = fill_missing_values(df_daily, FILL_METHODS, return_only_filled=False)
  138. for col, days, new_col in SHIFT_CONFIG:
  139. df_daily[new_col] = df_daily[col].shift(days)
  140. last_idx = df_daily[TARGET_COL].last_valid_index()
  141. last_day = df_daily.loc[last_idx, 'Date']
  142. df_daily = df_daily[(df_daily['Date'] >= '2009-08-01') & (df_daily['Date'] <= last_day + pd.Timedelta(days=30))]
  143. df_daily = df_daily[df_daily['Date'].dt.weekday < 5]
  144. for base, new in REVERSE_CONFIG:
  145. df_daily[new] = reverse_column(df_daily, base)
  146. for col, cfg in SPECIAL_REVERSE.items():
  147. df_daily[col] = np.where(df_daily['Date'] >= cfg['condition_date'],
  148. df_daily[cfg['base_column']],
  149. np.nan)
  150. df_daily = df_daily[(df_daily['Date'] > last_day)|df_daily[TARGET_COL].notna()]
  151. return df_daily, last_day
  152. # ------------ 划分与特征构建 ------------
  153. def split_and_build_features(df_daily, last_day):
  154. train = df_daily[df_daily['Date'] <= last_day].copy()
  155. test = train.tail(TEST_PERIOD).copy()
  156. train = train.iloc[:-TEST_PERIOD].copy()
  157. future = df_daily[df_daily['Date'] > last_day].copy()
  158. feature_columns = [
  159. '美湾单位辛烷值价格(预测/季节性)',
  160. '美国炼厂CDU装置检修量(新)_提前30天',
  161. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序',
  162. '美国首次申领失业金人数/4WMA_提前100天_逆序',
  163. '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA',
  164. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天',
  165. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01'
  166. ]
  167. X_train = train[feature_columns]
  168. y_train = train[TARGET_COL]
  169. X_test = test[feature_columns]
  170. y_test = test[TARGET_COL]
  171. X_future = future[feature_columns]
  172. return X_train, y_train, X_test, y_test, X_future, train, test, future
  173. # ------------ 特征缩放与异常值权重 ------------
  174. def scale_and_weight_features(X_train, X_test, X_future):
  175. scaler = StandardScaler()
  176. X_tr = scaler.fit_transform(X_train)
  177. X_te = scaler.transform(X_test)
  178. X_fu = scaler.transform(X_future)
  179. return scaler, X_tr, X_te, X_fu
  180. def detect_outliers_weights(X,weight_normal=1.0,weight_outlier=0.05,threshold=3):
  181. z = np.abs((X - X.mean()) / X.std())
  182. mask = (z > threshold).any(axis=1)
  183. return np.where(mask, weight_outlier, weight_normal)
  184. # ------------ 模型训练 ------------
  185. def train_model_with_tuning(X_tr, y_tr, X_te, y_te, weights, use_tuning):
  186. if use_tuning:
  187. param_dist = {
  188. 'learning_rate': list(np.arange(0.01, 0.11, 0.01)),
  189. 'max_depth': list(range(4, 11)),
  190. 'min_child_weight': list(range(1, 6)),
  191. 'gamma': list(np.arange(0, 0.6, 0.1)),
  192. 'subsample': list(np.arange(0.5, 1.01, 0.05)),
  193. 'colsample_bytree': list(np.arange(0.5, 1.01, 0.05)),
  194. 'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.45, 0.5],
  195. 'reg_lambda': list(np.arange(1.0, 1.6, 0.1))
  196. }
  197. # 将数据转换为DMatrix格式
  198. dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=weights)
  199. dtest = xgb.DMatrix(X_te, label=y_te)
  200. # 基础参数设置
  201. base_params = {
  202. 'objective': 'reg:squarederror',
  203. 'eval_metric': 'rmse',
  204. 'seed': RANDOM_STATE
  205. }
  206. best_score = float('inf')
  207. best_params = None
  208. # 网格搜索
  209. if SEARCH_MODE == 'grid':
  210. param_combinations = [dict(zip(param_dist.keys(), v))
  211. for v in itertools.product(*param_dist.values())]
  212. for params in param_combinations:
  213. curr_params = {**base_params, **params}
  214. cv_results = xgb.cv(curr_params, dtrain,
  215. num_boost_round=NUM_BOOST_ROUND,
  216. nfold=3,
  217. early_stopping_rounds=20,
  218. verbose_eval=False)
  219. score = cv_results['test-rmse-mean'].min()
  220. if score < best_score:
  221. best_score = score
  222. best_params = curr_params
  223. # 贝叶斯搜索
  224. elif SEARCH_MODE == 'bayesian':
  225. search_spaces = {
  226. 'learning_rate': Real(0.01, 0.11, prior='uniform'),
  227. 'max_depth': Integer(4, 11),
  228. 'min_child_weight': Integer(1, 6),
  229. 'gamma': Real(0.0, 0.6, prior='uniform'),
  230. 'subsample': Real(0.5, 1.01, prior='uniform'),
  231. 'colsample_bytree': Real(0.5, 1.01, prior='uniform'),
  232. 'reg_alpha': Real(0.0, 0.5, prior='uniform'),
  233. 'reg_lambda': Real(1.0, 1.6, prior='uniform')
  234. }
  235. def objective(params):
  236. curr_params = {**base_params, **params}
  237. cv_results = xgb.cv(curr_params, dtrain,
  238. num_boost_round=NUM_BOOST_ROUND,
  239. nfold=3,
  240. early_stopping_rounds=20,
  241. verbose_eval=False)
  242. return cv_results['test-rmse-mean'].min()
  243. # 执行贝叶斯优化
  244. from skopt import gp_minimize
  245. result = gp_minimize(
  246. objective,
  247. dimensions=[space for space in search_spaces.values()],
  248. n_calls=50,
  249. random_state=RANDOM_STATE
  250. )
  251. best_params = dict(zip(search_spaces.keys(), result.x))
  252. best_params = {**base_params, **best_params}
  253. best_score = result.fun
  254. # 随机搜索
  255. else:
  256. for _ in range(50):
  257. params = {k: random.choice(v) for k, v in param_dist.items()}
  258. curr_params = {**base_params, **params}
  259. cv_results = xgb.cv(curr_params, dtrain,
  260. num_boost_round=NUM_BOOST_ROUND,
  261. nfold=3,
  262. early_stopping_rounds=20,
  263. verbose_eval=False)
  264. score = cv_results['test-rmse-mean'].min()
  265. if score < best_score:
  266. best_score = score
  267. best_params = curr_params
  268. print("调优后的最佳参数:", best_params)
  269. print("最佳得分:", best_score)
  270. # 使用最佳参数训练最终模型
  271. best_model = xgb.train(best_params,
  272. dtrain,
  273. num_boost_round=NUM_BOOST_ROUND,
  274. evals=[(dtrain, 'Train'), (dtest, 'Test')],
  275. early_stopping_rounds=20,
  276. verbose_eval=False)
  277. else:
  278. # 直接使用默认参数训练
  279. dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=weights)
  280. dtest = xgb.DMatrix(X_te, label=y_te)
  281. best_model = xgb.train(DEFAULT_PARAMS,
  282. dtrain,
  283. num_boost_round=NUM_BOOST_ROUND,
  284. evals=[(dtrain, 'Train'),
  285. (dtest, 'Test')],
  286. verbose_eval=False)
  287. return best_model
  288. # ------------ 评估与预测 ------------
  289. def evaluate_and_predict(model, scaler, X_tr, y_tr, X_te, y_te, X_fu, use_tuning):
  290. X_tr_s = scaler.transform(X_tr)
  291. X_te_s = scaler.transform(X_te)
  292. X_fu_s = scaler.transform(X_fu)
  293. if isinstance(model, xgb.Booster):
  294. y_tr_pred = model.predict(xgb.DMatrix(X_tr_s))
  295. y_te_pred = model.predict(xgb.DMatrix(X_te_s))
  296. y_fu_pred = model.predict(xgb.DMatrix(X_fu_s))
  297. else:
  298. y_tr_pred = model.predict(X_tr_s)
  299. y_te_pred = model.predict(X_te_s)
  300. y_fu_pred = model.predict(X_fu_s)
  301. # 计算评估指标并保留4位有效数字
  302. train_mse = float(f"{mean_squared_error(y_tr, y_tr_pred):.4g}")
  303. test_mse = float(f"{mean_squared_error(y_te, y_te_pred):.4g}")
  304. train_r2 = float(f"{r2_score(y_tr, y_tr_pred):.4g}")
  305. test_r2 = float(f"{r2_score(y_te, y_te_pred):.4g}") if len(y_te) >= 2 else None
  306. print("Train MSE:", train_mse, "Test MSE:", test_mse)
  307. if len(y_te) >= 2:
  308. print("Train R2:", train_r2, "Test R2:", test_r2)
  309. else:
  310. print("Test 样本不足,跳过 R² 计算")
  311. metrics = {
  312. 'train_mse': train_mse,
  313. 'test_mse': test_mse,
  314. 'train_r2': train_r2,
  315. 'test_r2': test_r2
  316. }
  317. return y_tr_pred, y_te_pred, y_fu_pred, metrics
# ------------ Post-processing: build daily & monthly DataFrames ------------
def merge_and_prepare_df(train, test, future, y_te_pred, y_fu_pred):
    """Combine actuals with predictions into daily and monthly frames.

    Returns:
        (daily_df, monthly_df): a daily merged actual/forecast frame, and
        a month-end-averaged frame with direction-accuracy ('正确'/'错误'/'')
        and absolute-deviation columns. Both are sorted newest-first with
        'Date' formatted as '%Y-%m-%d' strings.
    """
    # Merge history with future predictions
    test = test.copy()
    future = future.copy()
    test['预测值'] = y_te_pred
    future['预测值'] = y_fu_pred
    # Actuals: train rows from 2023 onward plus the test window
    hist_actual = pd.concat([
        train[train['Date'].dt.year >= 2023][['Date', TARGET_COL]],
        test[['Date', TARGET_COL]]
    ])
    hist_actual.columns = ['Date', '实际值']
    # NOTE(review): the '2022-08-01' cutoff predates the 2023 actuals window
    # above — presumably a leftover from an earlier run; confirm intended.
    future_pred = future[future['Date'] >= '2022-08-01'][['Date', '预测值']].rename(columns={'预测值': TARGET_COL}).copy()
    last_val = hist_actual.iloc[-1]['实际值']
    future_pred[TARGET_COL] = future_pred[TARGET_COL].astype(last_val.dtype)
    # Anchor the first forecast point to the last observed actual value
    future_pred.iloc[0, 1] = last_val
    # Daily frame: outer-merge actuals and forecasts, newest first
    merged = pd.merge(hist_actual, future_pred, on='Date', how='outer').sort_values('Date', ascending=False)
    daily_df = merged.copy()
    # Monthly frame: month-end ('ME') mean resample
    monthly_df = daily_df.copy()
    monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])
    monthly_df.set_index('Date', inplace=True)
    monthly_df = monthly_df.resample('ME').mean().reset_index()
    # Direction accuracy: sign of month-over-month change, predicted vs actual;
    # rows missing either value get the empty string.
    pred_dir = np.sign(monthly_df[TARGET_COL].diff())
    true_dir = np.sign(monthly_df['实际值'].diff())
    valid = monthly_df[TARGET_COL].notna() & monthly_df['实际值'].notna()
    monthly_df['方向准确率'] = np.where(valid & (pred_dir == true_dir), '正确',
                                   np.where(valid & (pred_dir != true_dir), '错误', ''))
    # Absolute deviation as a fraction of the actual value (percentage later)
    monthly_df['绝对偏差'] = np.where(
        monthly_df[TARGET_COL].notna() & monthly_df['实际值'].notna(),
        abs((monthly_df[TARGET_COL] - monthly_df['实际值']) / monthly_df['实际值']),
        np.nan)
    monthly_df = monthly_df.sort_values('Date', ascending=False).reset_index(drop=True)
    monthly_df['Date'] = monthly_df['Date'].dt.strftime('%Y-%m-%d')
    daily_df['Date'] = daily_df['Date'].dt.strftime('%Y-%m-%d')
    return daily_df, monthly_df
  357. def generate_and_fill_excel(
  358. daily_df,
  359. monthly_df,
  360. metrics, # 新增参数
  361. target_name, # 写入的"预测标的"显示名
  362. classification, # 列表页-分类
  363. model_framework, # 列表页-模型框架
  364. creator, # 列表页-创建人
  365. # pred_date, # 列表页-预测日期
  366. frequency, # 列表页-预测频度
  367. significant_digits=5,
  368. output_path='update.xlsx'
  369. ):
  370. with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
  371. workbook = writer.book
  372. # 获取monthly_df的第一个日期作为预测日期
  373. actual_pred_date = pd.to_datetime(monthly_df['Date'].iloc[0]).strftime('%Y/%m/%d')
  374. # 格式化数值的辅助函数 - 用于测试值
  375. def format_test_value(x, sig_digits=significant_digits):
  376. if pd.isna(x):
  377. return ""
  378. return f"{float(x):.{sig_digits}g}"
  379. # 格式化百分比的辅助函数 - 用于方向准确率和偏差率(3位有效数)
  380. def format_percentage(x):
  381. if pd.isna(x):
  382. return ""
  383. return f"{float(x*100):.2f}%"
  384. # 格式化训练指标的辅助函数 - 用于训练结果页(6位有效数)
  385. def format_metrics(x):
  386. if pd.isna(x) or x == '':
  387. return ""
  388. return f"{float(x):.6g}"
  389. # —— 计算三个汇总值 ——
  390. test_value = format_test_value(monthly_df[TARGET_COL].iloc[0])
  391. total = monthly_df['方向准确率'].notna().sum()
  392. correct = (monthly_df['方向准确率'] == '正确').sum()
  393. direction_accuracy = format_percentage(correct/total) if total > 0 else ""
  394. absolute_deviation = format_percentage(monthly_df['绝对偏差'].mean())
  395. # ========= 列表页 =========
  396. ws_list = workbook.add_worksheet('列表页')
  397. writer.sheets['列表页'] = ws_list
  398. headers = ['预测标的','分类','模型框架','创建人','预测日期','测试值','预测频度','方向准确率','绝对偏差']
  399. ws_list.write_row(0, 0, headers)
  400. ws_list.write_row(1, 0, [
  401. target_name,
  402. classification,
  403. model_framework,
  404. creator,
  405. actual_pred_date,
  406. test_value,
  407. frequency,
  408. direction_accuracy,
  409. absolute_deviation
  410. ])
  411. # ========= 详情页 =========
  412. detail_df = monthly_df[['Date', '实际值', TARGET_COL, '方向准确率', '绝对偏差']].copy()
  413. detail_df.columns = ['指标日期','实际值','预测值','方向','偏差率']
  414. # 格式化日期为年/月/日
  415. detail_df['指标日期'] = pd.to_datetime(detail_df['指标日期']).dt.strftime('%Y/%m/%d')
  416. # 格式化实际值和预测值列(使用传入的significant_digits)
  417. detail_df['实际值'] = detail_df['实际值'].apply(format_test_value)
  418. detail_df['预测值'] = detail_df['预测值'].apply(format_test_value)
  419. detail_df['偏差率'] = detail_df['偏差率'].apply(
  420. lambda x: f"{float(x*100):.3g}%" if pd.notnull(x) else "")
  421. detail_df.to_excel(writer,sheet_name='详情页',index=False,header=False,startrow=2)
  422. ws_detail = writer.sheets['详情页']
  423. ws_detail.write(0, 0, target_name)
  424. ws_detail.write_row(1, 0, ['指标日期','实际值','预测值','方向','偏差率'])
  425. # ========= 日度数据表 =========
  426. daily_out = daily_df[['Date', '实际值', TARGET_COL]].copy()
  427. daily_out.columns = ['指标日期','实际值','预测值']
  428. # 格式化日期为年/月/日
  429. daily_out['指标日期'] = pd.to_datetime(daily_out['指标日期']).dt.strftime('%Y/%m/%d')
  430. # 日度数据表不限制有效数字
  431. daily_out.to_excel(writer,sheet_name='日度数据表',index=False,header=False,startrow=2)
  432. ws_daily = writer.sheets['日度数据表']
  433. ws_daily.write(0, 0, target_name)
  434. ws_daily.write_row(1, 0, ['指标日期','实际值','预测值'])
  435. # ========= 训练结果页 =========
  436. ws_metrics = workbook.add_worksheet('训练结果页')
  437. writer.sheets['训练结果页'] = ws_metrics
  438. metrics_headers = ['指标名称', '指标值']
  439. ws_metrics.write_row(0, 0, metrics_headers)
  440. metrics_rows = [
  441. ['训练集 MSE', format_metrics(metrics['train_mse'])],
  442. ['测试集 MSE', format_metrics(metrics['test_mse'])],
  443. ['训练集 R²', format_metrics(metrics['train_r2'])],
  444. ['测试集 R²', format_metrics(metrics['test_r2']) if metrics['test_r2'] is not None else '']
  445. ]
  446. for i, row in enumerate(metrics_rows, start=1):
  447. ws_metrics.write_row(i, 0, row)
  448. print(f"已生成并填充 {output_path}")
  449. # ------------ 全量训练与预测 ------------
  450. def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future):
  451. X_all = pd.concat([X_train, X_test])
  452. y_all = pd.concat([y_train, y_test])
  453. scaler_all = StandardScaler().fit(X_all)
  454. X_all_s = scaler_all.transform(X_all)
  455. X_fu_s = scaler_all.transform(X_future)
  456. model = XGBRegressor(**DEFAULT_PARAMS, n_estimators=NUM_BOOST_ROUND)
  457. model.fit(X_all_s, y_all)
  458. y_fu_full = model.predict(X_fu_s)
  459. return model, y_fu_full, scaler_all
  460. # ------------ 可视化 ------------
  461. def plot_final_predictions(train, y_tr, y_tr_pred, test, y_te, y_te_pred,
  462. future, last_day):
  463. plt.figure(figsize=(15, 6))
  464. plt.plot(train['Date'], y_tr, label='Train True')
  465. plt.plot(train['Date'], y_tr_pred, label='Train Pred')
  466. plt.plot(test['Date'], y_te, label='Test True', alpha=0.7)
  467. plt.plot(test['Date'], y_te_pred, label='Test Pred')
  468. plt.plot(future['Date'], future['预测值'], label='Future Pred')
  469. plt.axvline(test['Date'].iloc[0], color='gray', linestyle='--')
  470. plt.axvline(last_day, color='black', linestyle='--')
  471. plt.legend()
  472. plt.xlabel('Date')
  473. plt.ylabel(TARGET_COL)
  474. plt.title('Prediction Visualization')
  475. plt.grid(True)
  476. plt.show()
# ------------ Main ------------
def main():
    """End-to-end pipeline: parse CLI, load data, train, evaluate, export, plot.

    Returns:
        (daily_df, monthly_df) as produced by merge_and_prepare_df.
    """
    # Parse command-line arguments
    args = parse_arguments()
    # Push CLI values into the module-level configuration
    global NUM_BOOST_ROUND, USE_HYPERPARAM_TUNING, OUTPUT_PATH, DEFAULT_PARAMS
    NUM_BOOST_ROUND = args.num_boost_round
    USE_HYPERPARAM_TUNING = args.use_hyperparam_tuning.lower() == 'true'
    # Derive the output file name from the optional prefix
    if args.output_prefix:
        OUTPUT_PATH = f"{args.output_prefix}_update.xlsx"
    # Rebuild the XGBoost parameter dict from CLI values
    DEFAULT_PARAMS = {
        'objective': args.objective,
        'learning_rate': args.learning_rate,
        'max_depth': args.max_depth,
        'min_child_weight': args.min_child_weight,
        'gamma': args.gamma,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'eval_metric': args.eval_metric,
        'seed': args.seed,
        'reg_alpha': args.reg_alpha,
        'reg_lambda': args.reg_lambda,
        'max_delta_step': args.max_delta_step,
        'booster': args.booster,
        'tree_method': args.tree_method
    }
    df_daily, last_day = load_and_preprocess_data()
    X_tr, y_tr, X_te, y_te, X_fu, train, test, future = split_and_build_features(df_daily, last_day)
    scaler, X_tr_s, X_te_s, X_fu_s = scale_and_weight_features(X_tr, X_te, X_fu)
    weights = detect_outliers_weights(X_tr_s)
    model = train_model_with_tuning(X_tr_s, y_tr, X_te_s, y_te, weights, USE_HYPERPARAM_TUNING)
    y_tr_pred, y_te_pred, y_fu_pred, metrics = evaluate_and_predict(model, scaler, X_tr, y_tr, X_te, y_te, X_fu, USE_HYPERPARAM_TUNING)
    daily_df, monthly_df = merge_and_prepare_df(train, test, future, y_te_pred, y_fu_pred)
    generate_and_fill_excel(
        daily_df,
        monthly_df,
        metrics,
        target_name=TARGET_NAME,
        classification=CLASSIFICATION,
        model_framework=MODEL_FRAMEWORK,
        creator=CREATOR,
        # pred_date=PRED_DATE,
        frequency=FREQUENCY,
        significant_digits= SIGNIFICANT_DIGITS,  # NOTE: 5 significant digits (the original comment wrongly said 6)
        output_path=OUTPUT_PATH
    )
    # Refit on all observed data for the published forecast
    full_model, y_fu_full, scaler_full = train_full_model_and_predict(X_tr, y_tr, X_te, y_te, X_fu)
    if ADJUST_FULL_PREDICTIONS:
        # Shift the forecast so its first point matches the last observed value
        offset = y_te.iloc[-1] - y_fu_full[0]
        y_fu_full += offset
    if SHOW_PLOTS:
        plot_final_predictions(
            train, y_tr, y_tr_pred, test, y_te, y_te_pred,
            future.assign(预测值=y_fu_full), last_day)
    return daily_df, monthly_df
if __name__ == '__main__':
    daily_df, monthly_df = main()