"""RBOB gasoline crack-spread forecasting with XGBoost.

Pipeline: fetch indicator series from an internal API, preprocess them
(missing-value fill, lead shifts, value reversal), train an XGBoost
regressor (optionally with grid / random / Bayesian hyper-parameter
search), evaluate, and write prediction & accuracy reports to Excel.
"""

import argparse
import itertools
import json
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real
from xgboost import XGBRegressor

from Dtool import fill_missing_values, reverse_column
from api import fetch_data_by_indicators


# ------------ Command-line arguments ------------
def parse_arguments():
    """Parse command-line options.

    Returns:
        argparse.Namespace with the XGBoost hyper-parameters plus
        ``num_boost_round``, ``use_hyperparam_tuning`` (string 'True'/'False')
        and ``output_prefix``.
    """
    parser = argparse.ArgumentParser(description='RBOB汽油裂解预测模型')
    # XGBoost parameters
    parser.add_argument('--objective', type=str, default='reg:squarederror', help='XGBoost目标函数')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='学习率')
    parser.add_argument('--max_depth', type=int, default=8, help='最大树深度')
    parser.add_argument('--min_child_weight', type=float, default=3, help='最小子权重')
    parser.add_argument('--gamma', type=float, default=2, help='gamma参数')
    parser.add_argument('--subsample', type=float, default=0.85, help='子样本比例')
    parser.add_argument('--colsample_bytree', type=float, default=0.75, help='每棵树的列采样率')
    parser.add_argument('--eval_metric', type=str, default='rmse', help='评估指标')
    parser.add_argument('--seed', type=int, default=42, help='随机种子')
    parser.add_argument('--reg_alpha', type=float, default=0.45, help='L1正则化')
    parser.add_argument('--reg_lambda', type=float, default=1.29, help='L2正则化')
    parser.add_argument('--booster', type=str, default='gbtree', help='提升器类型')
    parser.add_argument('--tree_method', type=str, default='auto', help='树构建方法')
    parser.add_argument('--max_delta_step', type=int, default=0, help='最大步长')
    # Other parameters
    parser.add_argument('--num_boost_round', type=int, default=1000, help='提升迭代次数')
    parser.add_argument('--use_hyperparam_tuning', type=str, default='False', help='是否使用超参数调优')
    parser.add_argument('--output_prefix', type=str, default='',
                        help='输出文件前缀,如传入1234则生成1234_update.xlsx')
    args = parser.parse_args()
    return args


# Indicator ids fetched from the API.
# NOTE(review): 'REFOC-T-EIA_241114135248' appears twice — presumably a
# copy-paste slip; left as-is because the API's handling of duplicates is
# not visible here. Confirm and dedupe if safe.
INDICATOR_IDS = ["RBWTICKMc1", "C2406121350446455", 'USGGBE02 Index',
                 "Cinjcjc4 index", 'injcjc4 index',
                 'C2201059138_241106232710', 'C2406036178',
                 'C22411071623523660', 'C2312081670',
                 'REFOC-T-EIA_241114135248', 'C2304065621_241024124344',
                 'REFOC-T-EIA_241114135248', 'C22503031424010431']

# These globals are overwritten in main() from the CLI arguments.
NUM_BOOST_ROUND = 1000
RANDOM_STATE = 42
USE_HYPERPARAM_TUNING = False  # if False, train directly with xgb.train
TARGET_COL = '美国RBOB汽油裂解'
TEST_PERIOD = 20
SEARCH_MODE = 'random'  # one of 'grid' / 'bayesian' / 'random'
SHOW_PLOTS = True
ADJUST_FULL_PREDICTIONS = True
TARGET_NAME = '美国RBOB汽油裂解'
CLASSIFICATION = '原油'
MODEL_FRAMEWORK = 'XGBoost'
CREATOR = '张立舟'
# PRED_DATE = '2024/11/11'
FREQUENCY = '月度'
OUTPUT_PATH = 'update.xlsx'
SIGNIFICANT_DIGITS = 5

# Default XGBoost parameters; replaced in main() from CLI arguments.
DEFAULT_PARAMS = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 3,
    'gamma': 2,
    'subsample': 0.85,
    'colsample_bytree': 0.75,
    'eval_metric': 'rmse',
    'seed': 42,
    'reg_alpha': 0.45,
    'reg_lambda': 1.29,
    'max_delta_step': 0,
    'booster': 'gbtree',
    'tree_method': 'auto'
}

# —— Factor preprocessing configuration ——
# Column -> fill strategy passed to Dtool.fill_missing_values.
FILL_METHODS = {
    '美国2年通胀预期': 'rolling_mean_5',
    '美国首次申领失业金人数/4WMA': 'interpolate',
    '道琼斯旅游与休闲/工业平均指数': 'interpolate',
    '美国EIA成品油总库存(预测/供应需求3年季节性)': 'interpolate',
    '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA': 'interpolate',
    '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年': 'interpolate',
    '美国炼厂可用产能(路透)(预测)': 'interpolate',
    '美国炼厂CDU装置检修量(新)': 'interpolate',
    '美湾单位辛烷值价格(预测/季节性)': 'interpolate',
    '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年': 'interpolate'
}

# (source column, shift days, new column name) — builds leading indicators.
SHIFT_CONFIG = [
    ('美国2年通胀预期', 56, '美国2年通胀预期_提前56天'),
    ('美国首次申领失业金人数/4WMA', 100, '美国首次申领失业金人数/4WMA_提前100天'),
    ('美国首次申领失业金人数/4WMA', 112, '美国首次申领失业金人数/4WMA_提前112天'),
    ('道琼斯旅游与休闲/工业平均指数', 14, '道琼斯旅游与休闲/工业平均指数_提前14天'),
    ('美国EIA成品油总库存(预测/供应需求3年季节性)', 15, '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天'),
    ('美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年', 30,
     '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天'),
    ('美国炼厂CDU装置检修量(新)', 30, '美国炼厂CDU装置检修量(新)_提前30天'),
    ('美国炼厂可用产能(路透)(预测)', 100, '美国炼厂可用产能(路透)(预测)_提前100天')
]

# (source column, new column name) — columns to pass through Dtool.reverse_column.
REVERSE_CONFIG = [
    ('美国首次申领失业金人数/4WMA', '美国首次申领失业金人数/4WMA_逆序'),
    ('美国首次申领失业金人数/4WMA_提前100天', '美国首次申领失业金人数/4WMA_提前100天_逆序'),
    ('美国首次申领失业金人数/4WMA_提前112天', '美国首次申领失业金人数/4WMA_提前112天_逆序'),
    ('美国EIA成品油总库存(预测/供应需求3年季节性)', '美国EIA成品油总库存(预测/供应需求3年季节性)_逆序'),
    ('美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天', '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序'),
    ('美国炼厂可用产能(路透)(预测)_提前100天', '美国炼厂可用产能(路透)(预测)_逆序'),
    ('美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年', '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序')
]

# Reversed columns that are only valid from a cut-over date onward (NaN before).
SPECIAL_REVERSE = {
    '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01': {
        'base_column': '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序',
        'condition_date': pd.Timestamp('2022-01-01')
    }
}

METRICS_JSON = 'model_metrics.json'


# ------------ Data loading & preprocessing ------------
def load_and_preprocess_data():
    """Fetch indicators from the API and build the daily feature frame.

    Returns:
        (df_daily, last_day): preprocessed business-day DataFrame and the
        last date with an observed target value.
    """
    # Fetch data directly from the API
    df = fetch_data_by_indicators(INDICATOR_IDS)
    df.index = pd.to_datetime(df.index)
    df_daily = df.copy()
    df_daily['Date'] = df_daily.index
    df_daily = df_daily.reset_index(drop=True)
    # Preprocessing pipeline: fill gaps, then create shifted lead columns.
    df_daily = fill_missing_values(df_daily, FILL_METHODS, return_only_filled=False)
    for col, days, new_col in SHIFT_CONFIG:
        df_daily[new_col] = df_daily[col].shift(days)
    # Keep the window [2009-08-01, last observed target + 30 days], weekdays only.
    last_idx = df_daily[TARGET_COL].last_valid_index()
    last_day = df_daily.loc[last_idx, 'Date']
    df_daily = df_daily[(df_daily['Date'] >= '2009-08-01') &
                        (df_daily['Date'] <= last_day + pd.Timedelta(days=30))]
    df_daily = df_daily[df_daily['Date'].dt.weekday < 5]
    # Reversed-direction factors (and date-gated variants).
    for base, new in REVERSE_CONFIG:
        df_daily[new] = reverse_column(df_daily, base)
    for col, cfg in SPECIAL_REVERSE.items():
        df_daily[col] = np.where(df_daily['Date'] >= cfg['condition_date'],
                                 df_daily[cfg['base_column']], np.nan)
    # Drop historical rows with a missing target; keep all future rows.
    df_daily = df_daily[(df_daily['Date'] > last_day) | df_daily[TARGET_COL].notna()]
    return df_daily, last_day


# ------------ Split & feature construction ------------
def split_and_build_features(df_daily, last_day):
    """Split into train / test (last TEST_PERIOD rows) / future and select features."""
    train = df_daily[df_daily['Date'] <= last_day].copy()
    test = train.tail(TEST_PERIOD).copy()
    train = train.iloc[:-TEST_PERIOD].copy()
    future = df_daily[df_daily['Date'] > last_day].copy()
    feature_columns = [
        '美湾单位辛烷值价格(预测/季节性)',
        '美国炼厂CDU装置检修量(新)_提前30天',
        '美国EIA成品油总库存(预测/供应需求3年季节性)_提前15天_逆序',
        '美国首次申领失业金人数/4WMA_提前100天_逆序',
        '美国成品车用汽油倒推产量(预测/汽油库存维持上年季节性)/8WMA',
        '美国成品车用汽油炼厂与调和装置净产量/4WMA(预测/上年季节性)超季节性/5年_提前30天',
        '美国汽油调和组分RBOB库存(预测/线性外推)超季节性/3年_逆序_2022-01-01'
    ]
    X_train = train[feature_columns]
    y_train = train[TARGET_COL]
    X_test = test[feature_columns]
    y_test = test[TARGET_COL]
    X_future = future[feature_columns]
    return X_train, y_train, X_test, y_test, X_future, train, test, future


# ------------ Feature scaling & outlier weighting ------------
def scale_and_weight_features(X_train, X_test, X_future):
    """Fit a StandardScaler on the training set and transform all three splits."""
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_train)
    X_te = scaler.transform(X_test)
    X_fu = scaler.transform(X_future)
    return scaler, X_tr, X_te, X_fu


def detect_outliers_weights(X, weight_normal=1.0, weight_outlier=0.05, threshold=3):
    """Down-weight rows containing any |z| > threshold value.

    NOTE(review): z-scores use the *global* mean/std of X rather than
    per-column (no axis given). X arrives already standardised, so the
    two are nearly equivalent here — confirm intent before reusing.
    """
    z = np.abs((X - X.mean()) / X.std())
    mask = (z > threshold).any(axis=1)
    return np.where(mask, weight_outlier, weight_normal)


# ------------ Model training ------------
def train_model_with_tuning(X_tr, y_tr, X_te, y_te, weights, use_tuning):
    """Train an XGBoost booster, optionally searching hyper-parameters.

    Search strategy is picked by the module-level SEARCH_MODE
    ('grid' / 'bayesian' / anything else -> random search).
    Returns the trained xgb.Booster.
    """
    if use_tuning:
        param_dist = {
            'learning_rate': list(np.arange(0.01, 0.11, 0.01)),
            'max_depth': list(range(4, 11)),
            'min_child_weight': list(range(1, 6)),
            'gamma': list(np.arange(0, 0.6, 0.1)),
            'subsample': list(np.arange(0.5, 1.01, 0.05)),
            'colsample_bytree': list(np.arange(0.5, 1.01, 0.05)),
            'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.45, 0.5],
            'reg_lambda': list(np.arange(1.0, 1.6, 0.1))
        }
        # Convert data to DMatrix format
        dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=weights)
        dtest = xgb.DMatrix(X_te, label=y_te)
        base_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'seed': RANDOM_STATE
        }
        best_score = float('inf')
        best_params = None

        if SEARCH_MODE == 'grid':
            # Exhaustive grid search over the cartesian product.
            param_combinations = [dict(zip(param_dist.keys(), v))
                                  for v in itertools.product(*param_dist.values())]
            for params in param_combinations:
                curr_params = {**base_params, **params}
                cv_results = xgb.cv(curr_params, dtrain,
                                    num_boost_round=NUM_BOOST_ROUND, nfold=3,
                                    early_stopping_rounds=20, verbose_eval=False)
                score = cv_results['test-rmse-mean'].min()
                if score < best_score:
                    best_score = score
                    best_params = curr_params
        elif SEARCH_MODE == 'bayesian':
            # BUGFIX: subsample/colsample_bytree upper bounds were 1.01,
            # which can sample values > 1 that XGBoost rejects; clamped to 1.0.
            search_spaces = {
                'learning_rate': Real(0.01, 0.11, prior='uniform'),
                'max_depth': Integer(4, 11),
                'min_child_weight': Integer(1, 6),
                'gamma': Real(0.0, 0.6, prior='uniform'),
                'subsample': Real(0.5, 1.0, prior='uniform'),
                'colsample_bytree': Real(0.5, 1.0, prior='uniform'),
                'reg_alpha': Real(0.0, 0.5, prior='uniform'),
                'reg_lambda': Real(1.0, 1.6, prior='uniform')
            }
            dim_names = list(search_spaces.keys())

            # BUGFIX: gp_minimize passes a plain list of values to the
            # objective; the original did {**base_params, **params} on that
            # list and crashed. Zip the values back to their names first.
            def objective(values):
                curr_params = {**base_params, **dict(zip(dim_names, values))}
                cv_results = xgb.cv(curr_params, dtrain,
                                    num_boost_round=NUM_BOOST_ROUND, nfold=3,
                                    early_stopping_rounds=20, verbose_eval=False)
                return cv_results['test-rmse-mean'].min()

            from skopt import gp_minimize
            result = gp_minimize(
                objective,
                dimensions=[space for space in search_spaces.values()],
                n_calls=50,
                random_state=RANDOM_STATE
            )
            best_params = dict(zip(dim_names, result.x))
            best_params = {**base_params, **best_params}
            best_score = result.fun
        else:
            # Random search: 50 uniformly-sampled combinations.
            for _ in range(50):
                params = {k: random.choice(v) for k, v in param_dist.items()}
                curr_params = {**base_params, **params}
                cv_results = xgb.cv(curr_params, dtrain,
                                    num_boost_round=NUM_BOOST_ROUND, nfold=3,
                                    early_stopping_rounds=20, verbose_eval=False)
                score = cv_results['test-rmse-mean'].min()
                if score < best_score:
                    best_score = score
                    best_params = curr_params

        print("调优后的最佳参数:", best_params)
        print("最佳得分:", best_score)
        # Train the final model with the best parameters
        best_model = xgb.train(best_params, dtrain,
                               num_boost_round=NUM_BOOST_ROUND,
                               evals=[(dtrain, 'Train'), (dtest, 'Test')],
                               early_stopping_rounds=20, verbose_eval=False)
    else:
        # Train directly with the default parameters
        dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=weights)
        dtest = xgb.DMatrix(X_te, label=y_te)
        best_model = xgb.train(DEFAULT_PARAMS, dtrain,
                               num_boost_round=NUM_BOOST_ROUND,
                               evals=[(dtrain, 'Train'), (dtest, 'Test')],
                               verbose_eval=False)
    return best_model


# ------------ Evaluation & prediction ------------
def evaluate_and_predict(model, scaler, X_tr, y_tr, X_te, y_te, X_fu, use_tuning):
    """Predict on all splits and compute MSE/R² metrics (4 significant digits).

    ``use_tuning`` is kept for interface compatibility; it is not used here.
    """
    X_tr_s = scaler.transform(X_tr)
    X_te_s = scaler.transform(X_te)
    X_fu_s = scaler.transform(X_fu)
    if isinstance(model, xgb.Booster):
        y_tr_pred = model.predict(xgb.DMatrix(X_tr_s))
        y_te_pred = model.predict(xgb.DMatrix(X_te_s))
        y_fu_pred = model.predict(xgb.DMatrix(X_fu_s))
    else:
        y_tr_pred = model.predict(X_tr_s)
        y_te_pred = model.predict(X_te_s)
        y_fu_pred = model.predict(X_fu_s)
    # Metrics rounded to 4 significant digits; R² needs >= 2 test samples.
    train_mse = float(f"{mean_squared_error(y_tr, y_tr_pred):.4g}")
    test_mse = float(f"{mean_squared_error(y_te, y_te_pred):.4g}")
    train_r2 = float(f"{r2_score(y_tr, y_tr_pred):.4g}")
    test_r2 = float(f"{r2_score(y_te, y_te_pred):.4g}") if len(y_te) >= 2 else None
    print("Train MSE:", train_mse, "Test MSE:", test_mse)
    if len(y_te) >= 2:
        print("Train R2:", train_r2, "Test R2:", test_r2)
    else:
        print("Test 样本不足,跳过 R² 计算")
    metrics = {
        'train_mse': train_mse,
        'test_mse': test_mse,
        'train_r2': train_r2,
        'test_r2': test_r2
    }
    return y_tr_pred, y_te_pred, y_fu_pred, metrics


# ------------ Post-processing (daily & monthly DataFrames) ------------
def merge_and_prepare_df(train, test, future, y_te_pred, y_fu_pred):
    """Merge history with predictions; build daily and monthly report frames.

    The monthly frame adds direction-accuracy ('正确'/'错误'/'') and the
    absolute relative deviation columns.
    """
    test = test.copy()
    future = future.copy()
    test['预测值'] = y_te_pred
    future['预测值'] = y_fu_pred
    # Actuals: training rows from 2023 onward plus the test window.
    hist_actual = pd.concat([
        train[train['Date'].dt.year >= 2023][['Date', TARGET_COL]],
        test[['Date', TARGET_COL]]
    ])
    hist_actual.columns = ['Date', '实际值']
    future_pred = future[future['Date'] >= '2022-08-01'][['Date', '预测值']] \
        .rename(columns={'预测值': TARGET_COL}).copy()
    # Anchor the first future prediction to the last observed actual.
    last_val = hist_actual.iloc[-1]['实际值']
    future_pred[TARGET_COL] = future_pred[TARGET_COL].astype(last_val.dtype)
    future_pred.iloc[0, 1] = last_val
    merged = pd.merge(hist_actual, future_pred, on='Date', how='outer') \
        .sort_values('Date', ascending=False)
    daily_df = merged.copy()
    # Monthly resample (calendar month-end mean).
    monthly_df = daily_df.copy()
    monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])
    monthly_df.set_index('Date', inplace=True)
    monthly_df = monthly_df.resample('ME').mean().reset_index()
    # Direction accuracy: compare signs of month-over-month changes.
    pred_dir = np.sign(monthly_df[TARGET_COL].diff())
    true_dir = np.sign(monthly_df['实际值'].diff())
    valid = monthly_df[TARGET_COL].notna() & monthly_df['实际值'].notna()
    monthly_df['方向准确率'] = np.where(valid & (pred_dir == true_dir), '正确',
                                   np.where(valid & (pred_dir != true_dir), '错误', ''))
    # Absolute deviation as a fraction of the actual value.
    monthly_df['绝对偏差'] = np.where(
        monthly_df[TARGET_COL].notna() & monthly_df['实际值'].notna(),
        abs((monthly_df[TARGET_COL] - monthly_df['实际值']) / monthly_df['实际值']),
        np.nan)
    monthly_df = monthly_df.sort_values('Date', ascending=False).reset_index(drop=True)
    monthly_df['Date'] = monthly_df['Date'].dt.strftime('%Y-%m-%d')
    daily_df['Date'] = daily_df['Date'].dt.strftime('%Y-%m-%d')
    return daily_df, monthly_df


def generate_and_fill_excel(
        daily_df,
        monthly_df,
        metrics,              # evaluation metrics dict
        target_name,          # display name of the prediction target
        classification,       # list sheet - classification
        model_framework,      # list sheet - model framework
        creator,              # list sheet - creator
        # pred_date,          # list sheet - prediction date (now derived)
        frequency,            # list sheet - prediction frequency
        significant_digits=5,
        output_path='update.xlsx'):
    """Write the four report sheets (列表页/详情页/日度数据表/训练结果页) to Excel."""
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        workbook = writer.book
        # First monthly date doubles as the prediction date.
        actual_pred_date = pd.to_datetime(monthly_df['Date'].iloc[0]).strftime('%Y/%m/%d')

        # Helper: format a test value with the configured significant digits.
        def format_test_value(x, sig_digits=significant_digits):
            if pd.isna(x):
                return ""
            return f"{float(x):.{sig_digits}g}"

        # Helper: format a ratio as a percentage with 2 decimals.
        def format_percentage(x):
            if pd.isna(x):
                return ""
            return f"{float(x*100):.2f}%"

        # Helper: format training metrics (6 significant digits).
        def format_metrics(x):
            if pd.isna(x) or x == '':
                return ""
            return f"{float(x):.6g}"

        # —— Three summary values ——
        test_value = format_test_value(monthly_df[TARGET_COL].iloc[0])
        # BUGFIX: the column holds '' (not NaN) for months without a valid
        # comparison, so notna().sum() counted every row and deflated the
        # accuracy. Count only rows with an actual verdict.
        total = monthly_df['方向准确率'].isin(['正确', '错误']).sum()
        correct = (monthly_df['方向准确率'] == '正确').sum()
        direction_accuracy = format_percentage(correct/total) if total > 0 else ""
        absolute_deviation = format_percentage(monthly_df['绝对偏差'].mean())

        # ========= 列表页 =========
        ws_list = workbook.add_worksheet('列表页')
        writer.sheets['列表页'] = ws_list
        headers = ['预测标的', '分类', '模型框架', '创建人', '预测日期', '测试值', '预测频度', '方向准确率', '绝对偏差']
        ws_list.write_row(0, 0, headers)
        ws_list.write_row(1, 0, [
            target_name, classification, model_framework, creator,
            actual_pred_date, test_value, frequency,
            direction_accuracy, absolute_deviation
        ])

        # ========= 详情页 =========
        detail_df = monthly_df[['Date', '实际值', TARGET_COL, '方向准确率', '绝对偏差']].copy()
        detail_df.columns = ['指标日期', '实际值', '预测值', '方向', '偏差率']
        detail_df['指标日期'] = pd.to_datetime(detail_df['指标日期']).dt.strftime('%Y/%m/%d')
        detail_df['实际值'] = detail_df['实际值'].apply(format_test_value)
        detail_df['预测值'] = detail_df['预测值'].apply(format_test_value)
        detail_df['偏差率'] = detail_df['偏差率'].apply(
            lambda x: f"{float(x*100):.3g}%" if pd.notnull(x) else "")
        detail_df.to_excel(writer, sheet_name='详情页', index=False, header=False, startrow=2)
        ws_detail = writer.sheets['详情页']
        ws_detail.write(0, 0, target_name)
        ws_detail.write_row(1, 0, ['指标日期', '实际值', '预测值', '方向', '偏差率'])

        # ========= 日度数据表 =========
        daily_out = daily_df[['Date', '实际值', TARGET_COL]].copy()
        daily_out.columns = ['指标日期', '实际值', '预测值']
        daily_out['指标日期'] = pd.to_datetime(daily_out['指标日期']).dt.strftime('%Y/%m/%d')
        # Daily sheet keeps full precision.
        daily_out.to_excel(writer, sheet_name='日度数据表', index=False, header=False, startrow=2)
        ws_daily = writer.sheets['日度数据表']
        ws_daily.write(0, 0, target_name)
        ws_daily.write_row(1, 0, ['指标日期', '实际值', '预测值'])

        # ========= 训练结果页 =========
        ws_metrics = workbook.add_worksheet('训练结果页')
        writer.sheets['训练结果页'] = ws_metrics
        metrics_headers = ['指标名称', '指标值']
        ws_metrics.write_row(0, 0, metrics_headers)
        metrics_rows = [
            ['训练集 MSE', format_metrics(metrics['train_mse'])],
            ['测试集 MSE', format_metrics(metrics['test_mse'])],
            ['训练集 R²', format_metrics(metrics['train_r2'])],
            ['测试集 R²', format_metrics(metrics['test_r2'])
             if metrics['test_r2'] is not None else '']
        ]
        for i, row in enumerate(metrics_rows, start=1):
            ws_metrics.write_row(i, 0, row)
    print(f"已生成并填充 {output_path}")


# ------------ Full-sample retraining & prediction ------------
def train_full_model_and_predict(X_train, y_train, X_test, y_test, X_future):
    """Retrain on train+test with DEFAULT_PARAMS and predict the future window."""
    X_all = pd.concat([X_train, X_test])
    y_all = pd.concat([y_train, y_test])
    scaler_all = StandardScaler().fit(X_all)
    X_all_s = scaler_all.transform(X_all)
    X_fu_s = scaler_all.transform(X_future)
    model = XGBRegressor(**DEFAULT_PARAMS, n_estimators=NUM_BOOST_ROUND)
    model.fit(X_all_s, y_all)
    y_fu_full = model.predict(X_fu_s)
    return model, y_fu_full, scaler_all


# ------------ Visualisation ------------
def plot_final_predictions(train, y_tr, y_tr_pred, test, y_te, y_te_pred, future, last_day):
    """Plot train/test actuals & predictions and the future forecast."""
    plt.figure(figsize=(15, 6))
    plt.plot(train['Date'], y_tr, label='Train True')
    plt.plot(train['Date'], y_tr_pred, label='Train Pred')
    plt.plot(test['Date'], y_te, label='Test True', alpha=0.7)
    plt.plot(test['Date'], y_te_pred, label='Test Pred')
    plt.plot(future['Date'], future['预测值'], label='Future Pred')
    plt.axvline(test['Date'].iloc[0], color='gray', linestyle='--')
    plt.axvline(last_day, color='black', linestyle='--')
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel(TARGET_COL)
    plt.title('Prediction Visualization')
    plt.grid(True)
    plt.show()


# ------------ Main ------------
def main():
    """CLI entry point: run the full load/train/evaluate/report pipeline."""
    args = parse_arguments()
    # Overwrite module-level configuration from the CLI.
    global NUM_BOOST_ROUND, USE_HYPERPARAM_TUNING, OUTPUT_PATH, DEFAULT_PARAMS
    NUM_BOOST_ROUND = args.num_boost_round
    USE_HYPERPARAM_TUNING = args.use_hyperparam_tuning.lower() == 'true'
    if args.output_prefix:
        OUTPUT_PATH = f"{args.output_prefix}_update.xlsx"
    DEFAULT_PARAMS = {
        'objective': args.objective,
        'learning_rate': args.learning_rate,
        'max_depth': args.max_depth,
        'min_child_weight': args.min_child_weight,
        'gamma': args.gamma,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'eval_metric': args.eval_metric,
        'seed': args.seed,
        'reg_alpha': args.reg_alpha,
        'reg_lambda': args.reg_lambda,
        'max_delta_step': args.max_delta_step,
        'booster': args.booster,
        'tree_method': args.tree_method
    }

    df_daily, last_day = load_and_preprocess_data()
    X_tr, y_tr, X_te, y_te, X_fu, train, test, future = \
        split_and_build_features(df_daily, last_day)
    scaler, X_tr_s, X_te_s, X_fu_s = scale_and_weight_features(X_tr, X_te, X_fu)
    weights = detect_outliers_weights(X_tr_s)
    model = train_model_with_tuning(X_tr_s, y_tr, X_te_s, y_te, weights,
                                    USE_HYPERPARAM_TUNING)
    y_tr_pred, y_te_pred, y_fu_pred, metrics = evaluate_and_predict(
        model, scaler, X_tr, y_tr, X_te, y_te, X_fu, USE_HYPERPARAM_TUNING)
    daily_df, monthly_df = merge_and_prepare_df(train, test, future,
                                                y_te_pred, y_fu_pred)
    generate_and_fill_excel(
        daily_df, monthly_df, metrics,
        target_name=TARGET_NAME,
        classification=CLASSIFICATION,
        model_framework=MODEL_FRAMEWORK,
        creator=CREATOR,
        frequency=FREQUENCY,
        significant_digits=SIGNIFICANT_DIGITS,
        output_path=OUTPUT_PATH
    )
    # Retrain on the full sample and optionally level-shift the forecast so
    # it starts at the last observed test value.
    full_model, y_fu_full, scaler_full = train_full_model_and_predict(
        X_tr, y_tr, X_te, y_te, X_fu)
    if ADJUST_FULL_PREDICTIONS:
        offset = y_te.iloc[-1] - y_fu_full[0]
        y_fu_full += offset
    if SHOW_PLOTS:
        plot_final_predictions(
            train, y_tr, y_tr_pred, test, y_te, y_te_pred,
            future.assign(预测值=y_fu_full), last_day)
    return daily_df, monthly_df


if __name__ == '__main__':
    daily_df, monthly_df = main()