# 2.Rbob.py — US RBOB gasoline crack spread forecasting pipeline (XGBoost).
  1. import pandas as pd
  2. import numpy as np
  3. import xgboost as xgb
  4. from xgboost import XGBRegressor
  5. from sklearn.metrics import mean_squared_error, r2_score
  6. import matplotlib.pyplot as plt
  7. from skopt import BayesSearchCV
  8. from sklearn.preprocessing import StandardScaler
  9. from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
  10. from Dtool import fill_missing_values, reverse_column
  11. from api import fetch_data_by_indicators
  12. # 使用示例
  13. INDICATOR_IDS = ["RBWTICKMc1", "C2406121350446455",'USGGBE02 Index', "Cinjcjc4 index",'injcjc4 index','C2201059138_241106232710','C2406036178','C22411071623523660','C2312081670','REFOC-T-EIA_241114135248','C2304065621_241024124344','REFOC-T-EIA_241114135248','C22503031424010431']
  14. USE_HYPERPARAM_TUNING = False # 若 False 则直接使用 xgb.train
  15. NUM_BOOST_ROUND = 1000
  16. RANDOM_STATE = 42
  17. TARGET_COL = '美国RBOB汽油裂解'
  18. TEST_PERIOD = 20
  19. SEARCH_MODE = 'random' # 可选 'grid' / 'bayesian' / 'random'
  20. SHOW_PLOTS = False
  21. ADJUST_FULL_PREDICTIONS = True
  22. TARGET_NAME = '美国RBOB汽油裂解'
  23. CLASSIFICATION = '原油'
  24. MODEL_FRAMEWORK = 'XGBoost'
  25. CREATOR = '张立舟'
  26. PRED_DATE = '2024/11/11'
  27. FREQUENCY = '月度'
  28. OUTPUT_PATH = 'update.xlsx'
  29. DEFAULT_PARAMS = {
  30. 'objective': 'reg:squarederror',
  31. 'learning_rate': 0.1,
  32. 'max_depth': 8,
  33. 'min_child_weight': 3,
  34. 'gamma': 2,
  35. 'subsample': 0.85,
  36. 'colsample_bytree': 0.75,
  37. 'eval_metric': 'rmse',
  38. 'seed': 42,
  39. 'reg_alpha': 0.45,
  40. 'reg_lambda': 1.29,
  41. }
  42. # —— 因子预处理相关配置 ——
  43. FILL_METHODS = {
  44. '美国2年通胀预期': 'rolling_mean_5',
  45. '美国首次申领失业金人数/4WMA': 'interpolate',
  46. '道琼斯旅游与休闲/工业平均指数': 'interpolate',
  47. '美国EIA成品油总库存(预测/供应需求3年季节性)': 'interpolate',
  48. '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA': 'interpolate',
  49. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年': 'interpolate',
  50. '美国炼厂可用产能(路透)(预测)': 'interpolate',
  51. '美国炼厂CDU装置检修量(新)': 'interpolate',
  52. '美湾单位辛烷值价格(预测/季节性)': 'interpolate',
  53. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年': 'interpolate'
  54. }
  55. SHIFT_CONFIG = [
  56. ('美国2年通胀预期', 56, '美国2年通胀预期_提前56天'),
  57. ('美国首次申领失业金人数/4WMA', 100, '美国首次申领失业金人数/4WMA_提前100天'),
  58. ('美国首次申领失业金人数/4WMA', 112, '美国首次申领失业金人数/4WMA_提前112天'),
  59. ('道琼斯旅游与休闲/工业平均指数', 14, '道琼斯旅游与休闲/工业平均指数_提前14天'),
  60. ('美国EIA成品油总库存(预测/供应需求3年季节性)', 15,
  61. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天'),
  62. ('美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年',
  63. 30,
  64. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天'),
  65. ('美国炼厂CDU装置检修量(新)', 30, '美国炼厂CDU装置检修量(新)_提前30天'),
  66. ('美国炼厂可用产能(路透)(预测)', 100,
  67. '美国炼厂可用产能(路透)(预测)_提前100天')
  68. ]
  69. REVERSE_CONFIG = [
  70. ('美国首次申领失业金人数/4WMA',
  71. '美国首次申领失业金人数/4WMA_逆序'),
  72. ('美国首次申领失业金人数/4WMA_提前100天',
  73. '美国首次申领失业金人数/4WMA_提前100天_逆序'),
  74. ('美国首次申领失业金人数/4WMA_提前112天',
  75. '美国首次申领失业金人数/4WMA_提前112天_逆序'),
  76. ('美国EIA成品油总库存(预测/供应需求3年季节性)',
  77. '美国EIA成品油总库存(预测/供应需求3年季节性)_逆序'),
  78. ('美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天',
  79. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序'),
  80. ('美国炼厂可用产能(路透)(预测)_提前100天',
  81. '美国炼厂可用产能(路透)(预测)_逆序'),
  82. ('美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年',
  83. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序')
  84. ]
  85. SPECIAL_REVERSE = {
  86. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01': {
  87. 'base_column': '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序',
  88. 'condition_date': pd.Timestamp('2022-01-01')
  89. }
  90. }
# ------------ Data loading and preprocessing ------------
def load_and_preprocess_data():
    """Fetch indicator data from the API and run the factor preprocessing pipeline.

    Returns:
        tuple: (df_daily, last_day) — the preprocessed weekday-only DataFrame
        and the Date of the last observed target value.
    """
    # Pull the raw series directly from the data API.
    df = fetch_data_by_indicators(INDICATOR_IDS)
    # print("Initial DataFrame columns:", df.columns)
    df.index = pd.to_datetime(df.index)
    df_daily = df.copy()
    df_daily['Date'] = df_daily.index
    df_daily = df_daily.reset_index(drop=True)
    # Preprocessing pipeline: fill gaps, build shifted features, trim window.
    df_daily = fill_missing_values(df_daily, FILL_METHODS, return_only_filled=False)
    for col, days, new_col in SHIFT_CONFIG:
        # shift(days) is row-based: the new column at row t holds the value
        # from `days` rows earlier (not `days` calendar days — TODO confirm
        # this is the intended "提前X天" semantics).
        df_daily[new_col] = df_daily[col].shift(days)
    # Date of the last non-missing target observation.
    last_idx = df_daily[TARGET_COL].last_valid_index()
    last_day = df_daily.loc[last_idx, 'Date']
    # Keep 2009-08-01 .. last observation + 30 calendar days, weekdays only.
    df_daily = df_daily[(df_daily['Date'] >= '2009-08-01') & (df_daily['Date'] <= last_day + pd.Timedelta(days=30))]
    df_daily = df_daily[df_daily['Date'].dt.weekday < 5]
    for base, new in REVERSE_CONFIG:
        df_daily[new] = reverse_column(df_daily, base)
    for col, cfg in SPECIAL_REVERSE.items():
        # Special reversed series are only populated from the cut-over date on.
        df_daily[col] = np.where(df_daily['Date'] >= cfg['condition_date'],
                                 df_daily[cfg['base_column']],
                                 np.nan)
    # Drop historical rows with a missing target; keep every future row.
    df_daily = df_daily[(df_daily['Date'] > last_day) | df_daily[TARGET_COL].notna()]
    return df_daily, last_day
  116. # ------------ 划分与特征构建 ------------
  117. def split_and_build_features(df_daily, last_day):
  118. train = df_daily[df_daily['Date'] <= last_day].copy()
  119. test = train.tail(TEST_PERIOD).copy()
  120. train = train.iloc[:-TEST_PERIOD].copy()
  121. future = df_daily[df_daily['Date'] > last_day].copy()
  122. feature_columns = [
  123. '美湾单位辛烷值价格(预测/季节性)',
  124. '美国炼厂CDU装置检修量(新)_提前30天',
  125. '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序',
  126. '美国首次申领失业金人数/4WMA_提前100天_逆序',
  127. '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA',
  128. '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天',
  129. '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01'
  130. ]
  131. X_train = train[feature_columns]
  132. y_train = train[TARGET_COL]
  133. X_test = test[feature_columns]
  134. y_test = test[TARGET_COL]
  135. X_future = future[feature_columns]
  136. return X_train, y_train, X_test, y_test, X_future, train, test, future
  137. # ------------ 特征缩放与异常值权重 ------------
  138. def scale_and_weight_features(X_train, X_test, X_future):
  139. scaler = StandardScaler()
  140. X_tr = scaler.fit_transform(X_train)
  141. X_te = scaler.transform(X_test)
  142. X_fu = scaler.transform(X_future)
  143. return scaler, X_tr, X_te, X_fu
  144. def detect_outliers_weights(X,weight_normal=1.0,weight_outlier=0.05,threshold=3):
  145. z = np.abs((X - X.mean()) / X.std())
  146. mask = (z > threshold).any(axis=1)
  147. return np.where(mask, weight_outlier, weight_normal)
# ------------ Model training ------------
def train_model_with_tuning(X_tr, y_tr, X_te, y_te, weights, use_tuning):
    """Train an XGBoost regressor, with optional hyperparameter search.

    When `use_tuning` is True, runs grid/Bayesian/random search (per
    SEARCH_MODE) over a TimeSeriesSplit CV and refits the best estimator with
    early stopping; otherwise trains a Booster directly with DEFAULT_PARAMS.

    Args:
        X_tr, y_tr: scaled training features and target.
        X_te, y_te: scaled test features and target (used as eval set).
        weights: per-row sample weights (outlier down-weighting).
        use_tuning: whether to run the hyperparameter search path.

    Returns:
        A fitted XGBRegressor (tuning path) or xgb.Booster (default path) —
        callers must handle both (see evaluate_and_predict).
    """
    print(f"超参数调优状态: {'启用' if use_tuning else '禁用'}")
    if use_tuning:
        # Search space shared by all three search strategies.
        param_dist = {
            'learning_rate': list(np.arange(0.01, 0.11, 0.01)),
            'max_depth': list(range(4, 11)),
            'min_child_weight': list(range(1, 6)),
            'gamma': list(np.arange(0, 0.6, 0.1)),
            'subsample': list(np.arange(0.5, 1.01, 0.05)),
            'colsample_bytree': list(np.arange(0.5, 1.01, 0.05)),
            'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.45, 0.5],
            'reg_lambda': list(np.arange(1.0, 1.6, 0.1))
        }
        xgb_reg = XGBRegressor(objective='reg:squarederror',
                               eval_metric='rmse',
                               n_estimators=NUM_BOOST_ROUND,
                               seed=RANDOM_STATE)
        # Time-ordered CV folds — avoids leaking future data into training.
        tscv = TimeSeriesSplit(n_splits=3)
        if SEARCH_MODE == 'grid':
            search = GridSearchCV(xgb_reg,
                                  param_grid=param_dist,
                                  scoring='neg_mean_squared_error',
                                  cv=tscv,
                                  verbose=1,
                                  n_jobs=-1)
        elif SEARCH_MODE == 'bayesian':
            search = BayesSearchCV(xgb_reg,
                                   search_spaces=param_dist,
                                   n_iter=50,
                                   scoring='neg_mean_squared_error',
                                   cv=tscv,
                                   random_state=RANDOM_STATE,
                                   verbose=1,
                                   n_jobs=-1)
        else:
            # Default / 'random': randomized search over the same space.
            search = RandomizedSearchCV(xgb_reg,
                                        param_distributions=param_dist,
                                        n_iter=50,
                                        scoring='neg_mean_squared_error',
                                        cv=tscv,
                                        random_state=RANDOM_STATE,
                                        verbose=1,
                                        n_jobs=-1)
        search.fit(X_tr, y_tr, sample_weight=weights)
        best_model = search.best_estimator_
        print("调优后的最佳参数:", search.best_params_)
        # NOTE(review): `early_stopping_rounds` as a fit() kwarg was deprecated
        # in xgboost 1.6 and removed in 2.0 (now a constructor parameter) —
        # confirm the installed xgboost version supports this call.
        best_model.fit(X_tr, y_tr,
                       eval_set=[(X_te, y_te)],
                       early_stopping_rounds=20,
                       verbose=200)
    else:
        print("使用默认参数进行训练...")
        # Default path: native Booster training with sample weights.
        dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=weights)
        dtest = xgb.DMatrix(X_te, label=y_te)
        best_model = xgb.train(DEFAULT_PARAMS,
                               dtrain,
                               num_boost_round=NUM_BOOST_ROUND,
                               evals=[(dtrain, 'Train'),
                                      (dtest, 'Test')],
                               verbose_eval=False)
    return best_model
  210. # ------------ 评估与预测 ------------
  211. def evaluate_and_predict(model, scaler, X_tr, y_tr, X_te, y_te, X_fu, use_tuning):
  212. X_tr_s = scaler.transform(X_tr)
  213. X_te_s = scaler.transform(X_te)
  214. X_fu_s = scaler.transform(X_fu)
  215. if isinstance(model, xgb.Booster):
  216. y_tr_pred = model.predict(xgb.DMatrix(X_tr_s))
  217. y_te_pred = model.predict(xgb.DMatrix(X_te_s))
  218. y_fu_pred = model.predict(xgb.DMatrix(X_fu_s))
  219. else:
  220. y_tr_pred = model.predict(X_tr_s)
  221. y_te_pred = model.predict(X_te_s)
  222. y_fu_pred = model.predict(X_fu_s)
  223. print("Train MSE:", mean_squared_error(y_tr, y_tr_pred),
  224. "Test MSE:", mean_squared_error(y_te, y_te_pred))
  225. if len(y_te) >= 2:
  226. print("Train R2:", r2_score(y_tr, y_tr_pred),
  227. "Test R2:", r2_score(y_te, y_te_pred))
  228. else:
  229. print("Test 样本不足,跳过 R² 计算")
  230. return y_tr_pred, y_te_pred, y_fu_pred
  231. # ------------ 结果后处理(生成日度 & 月度 DataFrame) ------------
  232. def merge_and_prepare_df(train, test, future, y_te_pred, y_fu_pred):
  233. # 合并历史与未来预测
  234. test = test.copy()
  235. future = future.copy()
  236. test['预测值'] = y_te_pred
  237. future['预测值'] = y_fu_pred
  238. hist_actual = pd.concat([
  239. train[train['Date'].dt.year >= 2023][['Date', TARGET_COL]],
  240. test[['Date', TARGET_COL]]
  241. ])
  242. hist_actual.columns = ['Date', '实际值']
  243. future_pred = future[future['Date'] >= '2022-08-01'][['Date', '预测值']].rename(columns={'预测值': TARGET_COL}).copy()
  244. last_val = float(hist_actual.iloc[-1]['实际值'])
  245. future_pred.iloc[0, 1] = last_val
  246. # 日度重采样
  247. merged = pd.merge(hist_actual, future_pred,on='Date', how='outer').sort_values('Date', ascending=False)
  248. daily_df = merged.copy()
  249. # 月度重采样
  250. monthly_df = daily_df.copy()
  251. monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])
  252. monthly_df.set_index('Date', inplace=True)
  253. monthly_df = monthly_df.resample('ME').mean().reset_index()
  254. # 方向准确率
  255. pred_dir = np.sign(monthly_df[TARGET_COL].diff())
  256. true_dir = np.sign(monthly_df['实际值'].diff())
  257. valid = monthly_df[TARGET_COL].notna() & monthly_df['实际值'].notna()
  258. monthly_df['方向准确率'] = np.where(valid & (pred_dir == true_dir), '正确',
  259. np.where(valid & (pred_dir != true_dir), '错误', np.nan))
  260. # 绝对偏差
  261. monthly_df['绝对偏差'] = (monthly_df[TARGET_COL] - monthly_df['实际值']).abs()
  262. monthly_df = monthly_df.sort_values('Date', ascending=False).reset_index(drop=True)
  263. return daily_df, monthly_df
  264. def generate_and_fill_excel(
  265. daily_df,
  266. monthly_df,
  267. target_name, # 写入的"预测标的"显示名
  268. classification, # 列表页-分类
  269. model_framework, # 列表页-模型框架
  270. creator, # 列表页-创建人
  271. pred_date, # 列表页-预测日期
  272. frequency, # 列表页-预测频度
  273. output_path='update.xlsx'
  274. ):
  275. with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
  276. workbook = writer.book
  277. # —— 计算三个汇总值 ——
  278. # 1) 测试值:最新月度的预测值
  279. test_value = monthly_df[TARGET_COL].iloc[0]
  280. # 2) 方向准确率:正确数 / 有效数
  281. total = monthly_df['方向准确率'].notna().sum()
  282. correct = (monthly_df['方向准确率'] == '正确').sum()
  283. direction_accuracy = f"{correct/total:.2%}" if total > 0 else ""
  284. # 3) 平均绝对偏差
  285. absolute_deviation = monthly_df['绝对偏差'].mean()
  286. # ========= 列表页 =========
  287. ws_list = workbook.add_worksheet('列表页')
  288. writer.sheets['列表页'] = ws_list
  289. headers = ['预测标的','分类','模型框架','创建人','预测日期','测试值','预测频度','方向准确率','绝对偏差']
  290. ws_list.write_row(0, 0, headers)
  291. ws_list.write_row(1, 0, [
  292. target_name,
  293. classification,
  294. model_framework,
  295. creator,
  296. pred_date,
  297. test_value,
  298. frequency,
  299. direction_accuracy,
  300. absolute_deviation
  301. ])
  302. # ========= 详情页 =========
  303. detail_df = monthly_df[['Date', '实际值', TARGET_COL, '方向准确率', '绝对偏差']].copy()
  304. detail_df.columns = ['指标日期','实际值','预测值','方向','偏差率']
  305. detail_df.to_excel(writer,sheet_name='详情页',index=False,header=False,startrow=2)
  306. ws_detail = writer.sheets['详情页']
  307. ws_detail.write(0, 0, target_name)
  308. ws_detail.write_row(1, 0, ['指标日期','实际值','预测值','方向','偏差率'])
  309. # ========= 日度数据表 =========
  310. daily_out = daily_df[['Date', '实际值', TARGET_COL]].copy()
  311. daily_out.columns = ['指标日期','实际值','预测值']
  312. daily_out.to_excel(writer,sheet_name='日度数据表',index=False,header=False,startrow=2)
  313. ws_daily = writer.sheets['日度数据表']
  314. ws_daily.write(0, 0, target_name)
  315. ws_daily.write_row(1, 0, ['指标日期','实际值','预测值'])
  316. print(f"已生成并填充 {output_path}")
  317. # ------------ 全量训练与预测 ------------
  318. def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future):
  319. X_all = pd.concat([X_train, X_test])
  320. y_all = pd.concat([y_train, y_test])
  321. scaler_all = StandardScaler().fit(X_all)
  322. X_all_s = scaler_all.transform(X_all)
  323. X_fu_s = scaler_all.transform(X_future)
  324. model = XGBRegressor(**DEFAULT_PARAMS, n_estimators=NUM_BOOST_ROUND)
  325. model.fit(X_all_s, y_all)
  326. y_fu_full = model.predict(X_fu_s)
  327. return model, y_fu_full, scaler_all
  328. # ------------ 可视化 ------------
  329. def plot_final_predictions(train, y_tr, y_tr_pred, test, y_te, y_te_pred,
  330. future, last_day):
  331. plt.figure(figsize=(15, 6))
  332. plt.plot(train['Date'], y_tr, label='Train True')
  333. plt.plot(train['Date'], y_tr_pred, label='Train Pred')
  334. plt.plot(test['Date'], y_te, label='Test True', alpha=0.7)
  335. plt.plot(test['Date'], y_te_pred, label='Test Pred')
  336. plt.plot(future['Date'], future['预测值'], label='Future Pred')
  337. plt.axvline(test['Date'].iloc[0], color='gray', linestyle='--')
  338. plt.axvline(last_day, color='black', linestyle='--')
  339. plt.legend()
  340. plt.xlabel('Date')
  341. plt.ylabel(TARGET_COL)
  342. plt.title('Prediction Visualization')
  343. plt.grid(True)
  344. plt.show()
  345. # ------------ 主函数 ------------
  346. def main():
  347. df_daily, last_day = load_and_preprocess_data()
  348. X_tr, y_tr, X_te, y_te, X_fu, train, test, future = split_and_build_features(df_daily, last_day)
  349. scaler, X_tr_s, X_te_s, X_fu_s = scale_and_weight_features(X_tr, X_te, X_fu)
  350. weights = detect_outliers_weights(X_tr_s)
  351. model = train_model_with_tuning(X_tr_s, y_tr, X_te_s, y_te, weights,USE_HYPERPARAM_TUNING)
  352. y_tr_pred, y_te_pred, y_fu_pred = evaluate_and_predict(model, scaler, X_tr, y_tr, X_te, y_te, X_fu,USE_HYPERPARAM_TUNING)
  353. daily_df, monthly_df = merge_and_prepare_df(train, test, future,y_te_pred, y_fu_pred)
  354. print(monthly_df)
  355. print(daily_df)
  356. generate_and_fill_excel(
  357. daily_df,
  358. monthly_df,
  359. target_name= TARGET_NAME,
  360. classification= CLASSIFICATION,
  361. model_framework= MODEL_FRAMEWORK,
  362. creator= CREATOR,
  363. pred_date= PRED_DATE,
  364. frequency= FREQUENCY,
  365. output_path= OUTPUT_PATH
  366. )
  367. full_model, y_fu_full, scaler_full = train_full_model_and_predict(X_tr, y_tr, X_te, y_te, X_fu)
  368. if ADJUST_FULL_PREDICTIONS:
  369. offset = y_te.iloc[-1] - y_fu_full[0]
  370. y_fu_full += offset
  371. if SHOW_PLOTS:
  372. plot_final_predictions(
  373. train, y_tr, y_tr_pred, test, y_te, y_te_pred,
  374. future.assign(预测值=y_fu_full), last_day)
  375. return daily_df, monthly_df
# Script entry point: run the pipeline and keep the result frames in scope
# for interactive use.
if __name__ == '__main__':
    daily_df, monthly_df = main()