data_processor.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. package ruizide
  2. import (
  3. "context"
  4. "encoding/json"
  5. "eta/eta_data_analysis/models"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/xuri/excelize/v2"
  9. "io"
  10. "log"
  11. "os"
  12. "path/filepath"
  13. "time"
  14. "github.com/chromedp/chromedp"
  15. )
  16. // 定义选择器
  17. var (
  18. downloadDir = utils.RZD_EXCEL_PATH
  19. defaultDir = utils.RZD_DOWNLOAD_PATH
  20. rzdLoginPath = "https://portal.rystadenergy.com/home"
  21. rzdBatchSize = 500
  22. )
  23. // 处理数据下载的步骤
  24. func downloadData(ctx context.Context) error {
  25. // Analytics Library
  26. if err := chromedp.Run(ctx,
  27. chromedp.Sleep(5*time.Second), // 考虑移除这一行,如果不必要的话
  28. chromedp.Navigate(rzdLoginPath),
  29. chromedp.WaitVisible(`a.mat-tooltip-trigger.analytics.home__link-card-icon[href="/analytics"]`, chromedp.ByQuery), // 等待元素可见
  30. chromedp.Click(`a.mat-tooltip-trigger.analytics.home__link-card-icon[href="/analytics"]`, chromedp.ByQuery), // 点击链接
  31. chromedp.Sleep(5*time.Second),
  32. ); err != nil {
  33. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  34. }
  35. if err := chromedp.Run(ctx,
  36. chromedp.WaitVisible(`#intro-home-page-step1`, chromedp.ByID), // 等待 input 元素可见
  37. chromedp.SetValue(`#intro-home-page-step1`, "oil demand signals weekly report", chromedp.ByID), // 设置值
  38. chromedp.SendKeys(`#intro-home-page-step1`, "\u000D", chromedp.ByID), // 模拟按下回车键 (\u000D 是回车的 Unicode)
  39. chromedp.Sleep(5*time.Second),
  40. ); err != nil {
  41. return fmt.Errorf("设置 input 标签值时发生错误: %v", err)
  42. }
  43. if err := chromedp.Run(ctx,
  44. // 等待第一个 download-btns div 和下载按钮可见
  45. chromedp.WaitVisible(`//div[contains(@class, 'download-btns')][1]//ul//li//button[contains(@class, 'mat-tooltip-trigger') and contains(@class, 'download__excel') and normalize-space(text())='Data']`, chromedp.BySearch),
  46. // 点击第一个 div 中的 Excel 下载按钮
  47. chromedp.Click(`//div[contains(@class, 'download-btns')][1]//ul//li//button[contains(@class, 'mat-tooltip-trigger') and contains(@class, 'download__excel') and normalize-space(text())='Data']`, chromedp.BySearch),
  48. // 可选:等待下载完成,可以调整等待时间
  49. chromedp.Sleep(5*time.Second), // 根据下载时间设置
  50. ); err != nil {
  51. return fmt.Errorf("点击第一个下载按钮时发生错误: %v", err)
  52. }
  53. // 解析文件移动到目标目录
  54. if err := waitAndRenameDownloadedFile("Oil_Demand_Signals_Weekly_Report_"+utils.GetCurrentYearMonth()+".xlsx", downloadDir); err != nil {
  55. return err
  56. }
  57. return nil
  58. }
  59. // 等待下载文件并重命名
  60. func waitAndRenameDownloadedFile(newFileName, targetDir string) error {
  61. // 等待一段时间以确保文件下载完成
  62. time.Sleep(100 * time.Second) // 可能需要根据实际情况调整
  63. // 查找下载目录中的文件
  64. files, err := filepath.Glob(filepath.Join(defaultDir, "*.xlsx"))
  65. if err != nil {
  66. return fmt.Errorf("查找文件时出错: %v", err)
  67. }
  68. // 如果没有找到文件,返回错误
  69. if len(files) == 0 {
  70. return fmt.Errorf("未找到任何下载的文件")
  71. }
  72. // 找到最新的文件
  73. var latestFile string
  74. var latestTime time.Time
  75. for _, file := range files {
  76. info, err := os.Stat(file)
  77. if err != nil {
  78. return fmt.Errorf("获取文件信息时出错: %v", err)
  79. }
  80. if info.ModTime().After(latestTime) {
  81. latestTime = info.ModTime()
  82. latestFile = file
  83. }
  84. }
  85. // 目标文件的完整路径
  86. targetFilePath := filepath.Join(targetDir, newFileName)
  87. // 重命名并移动到目标目录
  88. if latestFile != "" {
  89. if err := moveFile(latestFile, targetFilePath); err != nil {
  90. return fmt.Errorf("重命名文件时出错: %v", err)
  91. }
  92. // 打印重命名后的文件名
  93. fmt.Printf("文件重命名并移动到: %s\n", targetFilePath)
  94. }
  95. return nil
  96. }
  97. func moveFile(source, destination string) error {
  98. // 复制文件
  99. srcFile, err := os.Open(source)
  100. if err != nil {
  101. return fmt.Errorf("打开源文件时出错: %v", err)
  102. }
  103. defer srcFile.Close()
  104. dstFile, err := os.Create(destination)
  105. if err != nil {
  106. return fmt.Errorf("创建目标文件时出错: %v", err)
  107. }
  108. defer dstFile.Close()
  109. if _, err := io.Copy(dstFile, srcFile); err != nil {
  110. return fmt.Errorf("复制文件时出错: %v", err)
  111. }
  112. time.Sleep(60 * time.Second)
  113. // 删除源文件
  114. /*if err := os.Remove(source); err != nil {
  115. return fmt.Errorf("删除源文件时出错: %v", err)
  116. }*/
  117. return nil
  118. }
  119. func main() {
  120. ResolverNet(context.Background())
  121. }
  122. // 解析网页数据,下载文件
  123. // func main() {
  124. func ResolverNet(cont context.Context) (err error) {
  125. // 创建下载目录
  126. if err := os.MkdirAll(downloadDir, os.ModePerm); err != nil {
  127. fmt.Printf("创建下载目录时出错: %v\n", err)
  128. return nil
  129. }
  130. // 创建 chromedp 执行上下文
  131. options := []chromedp.ExecAllocatorOption{
  132. chromedp.Flag("headless", false),
  133. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  134. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  135. // 设置了但并不生效,直接从默认下载路径读取过来
  136. //chromedp.Flag("download.default_directory", downloadDir),
  137. //chromedp.Flag("download.prompt_for_download", false), // 不弹出下载对话框
  138. chromedp.Flag("safebrowsing.enabled", true), // 启用安全浏览
  139. //chromedp.UserDataDir(filepath.Join(downloadDir, "user-data")),
  140. }
  141. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  142. defer cancel()
  143. ctx, cancel := chromedp.NewContext(allocCtx)
  144. defer cancel()
  145. // 启动 Chrome 实例
  146. if err := chromedp.Run(ctx); err != nil {
  147. fmt.Printf("启动 Chrome 实例时出错: %v\n", err)
  148. return nil
  149. }
  150. // 设置下载行为
  151. /*if err := setDownloadBehavior(ctx); err != nil {
  152. fmt.Printf("设置下载路径时出错: %v\n", err)
  153. return
  154. }*/
  155. // 登录操作
  156. if err := login(ctx); err != nil {
  157. fmt.Printf("登录错误: %v\n", err)
  158. return nil
  159. }
  160. // 下载数据
  161. if err := downloadData(ctx); err != nil {
  162. fmt.Printf("数据下载错误: %v\n", err)
  163. return nil
  164. }
  165. fmt.Println("数据下载完成")
  166. // 解析表格 读取数据
  167. fileResolver()
  168. return nil
  169. }
  170. // 解析本地文件
  171. func fileResolver() {
  172. //func main() {
  173. var tableNameList = []string{
  174. "Oil_Demand_Signals_Weekly_Report",
  175. "Oil_Supply_Analysis",
  176. "Supply_Revision_Analysis",
  177. "Oil_Market_Cube_Upstream_Supply_Oil_Quality_API",
  178. "Oil_Market_Cube_Upstream_Supply_Oil_Quality_Sulphur",
  179. "Oil_Market_Cube_Upstream_Supply_Capacity_Capacity",
  180. "Oil_Market_Cube_Upstream_Supply_Production",
  181. "Oil_Market_Cube_Upstream_Supply_Production_Wo_Seasonality",
  182. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Reference_Production",
  183. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Production",
  184. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Cut",
  185. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Actual_Cut",
  186. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Compliance",
  187. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Production_Subject_To_Cut",
  188. "Oil_Market_Cube_Products_Demand_Products_Demand_Mean",
  189. "Oil_Market_Cube_Products_Demand_Products_Demand+Sigma",
  190. "Oil_Market_Cube_Products_Demand_Products_Demand-Sigma",
  191. "Oil_Market_Cube_Balances_Total_Liquids_Balances",
  192. "Oil_Market_Cube_Geography_Latitude",
  193. "Oil_Market_Cube_Geography_Longitude",
  194. "Oil_Demand_Analysis_Product_Detail",
  195. "Oil_Demand_Analysis_Region",
  196. "Oil_Demand_Analysis_Scenario",
  197. "Oil_Demand_Analysis_Continent",
  198. "Oil_Demand_Analysis_Country",
  199. "Oil_Demand_Analysis_Product_Category",
  200. "Oil_Demand_Analysis_Sector_Category",
  201. "Oil_Demand_Analysis_Sector_Detail",
  202. }
  203. for _, tableName := range tableNameList {
  204. var fileName string
  205. // 解析表格
  206. fileName = tableName + "_" + utils.GetCurrentYearMonth() + ".xlsx"
  207. filePath := filepath.Join(downloadDir, fileName)
  208. // 打开 Excel 文件
  209. f, err := excelize.OpenFile(filePath)
  210. if err != nil {
  211. log.Fatalf("无法打开 Excel 文件: %v", err)
  212. }
  213. // 获取所有工作表
  214. sheetNames := f.GetSheetList()
  215. for _, sheetName := range sheetNames {
  216. fmt.Printf("读取工作表: %s\n", sheetName)
  217. // 获取工作表的最大行数
  218. maxRow, err := f.GetRows(sheetName) // 直接获取所有行数据
  219. if err != nil {
  220. log.Fatalf("获取工作表数据时出错: %v", err)
  221. continue
  222. }
  223. // 遍历行并打印内容
  224. indexData := []models.BaseFromRzdData{}
  225. for rowIndex, rowData := range maxRow {
  226. // 因为excel文件中的sheet表格不固定 对于 Supply_Revision_Analysis, Oil_Supply_Analysis 文件 手动调整sheet表格顺序
  227. if tableName == "Supply_Revision_Analysis" && rowIndex == 0 {
  228. if rowData[0] == "YearQuarter" && rowData[1] == "Revision" && rowData[2] == "CountryRevisionGroup" {
  229. sheetName = "Chart1"
  230. }
  231. if rowData[0] == "YearQuarter" && rowData[1] == "Current" && rowData[2] == "Previous" {
  232. sheetName = "Chart2"
  233. }
  234. if rowData[0] == "Year" && rowData[1] == "Revision" && rowData[2] == "CountryRevisionGroup" {
  235. sheetName = "Chart3"
  236. }
  237. if rowData[0] == "Year" && rowData[1] == "Current" && rowData[2] == "Previous" {
  238. sheetName = "Chart4"
  239. }
  240. if rowData[0] == "Previous" && rowData[1] == "Current" && rowData[2] == "YearMonth" {
  241. sheetName = "Chart5"
  242. }
  243. if rowData[0] == "YearMonth" && rowData[1] == "CountryRevisionGroup" && rowData[2] == "Revision" {
  244. sheetName = "Chart6"
  245. }
  246. } else if tableName == "Oil_Supply_Analysis" && rowIndex == 0 {
  247. if rowData[0] == "Viz Date" && rowData[1] == "OilAndGasCategory" && rowData[2] == "supply_kbbld" {
  248. sheetName = "Chart1"
  249. }
  250. if rowData[0] == "Viz Date" && rowData[1] == "supply_kbbld" && rowData[2] == "Region" {
  251. sheetName = "Chart2"
  252. }
  253. if rowData[0] == "Viz Date" && rowData[1] == "CapacityDetail" && rowData[2] == "Capacity_kbbld" {
  254. sheetName = "Chart3"
  255. }
  256. if rowData[0] == "Viz Date" && rowData[1] == "Oil Classification Group" && rowData[2] == "supply_kbbld" {
  257. sheetName = "Chart4"
  258. }
  259. }
  260. processor, err := GetProcessor(tableName, sheetName)
  261. if err != nil {
  262. continue
  263. }
  264. baseFromRzdDataList, err := processor.Process(tableName, sheetName, rowIndex, rowData)
  265. if err != nil {
  266. log.Printf("processor.Process err: %v", err)
  267. return
  268. }
  269. indexData = append(indexData, baseFromRzdDataList...)
  270. }
  271. // 新增数据源指标数据
  272. if len(indexData) > 0 {
  273. for i := 0; i < len(indexData); i += rzdBatchSize {
  274. // 计算当前批次的结束索引
  275. end := i + rzdBatchSize
  276. if end > len(indexData) {
  277. end = len(indexData)
  278. }
  279. // 获取当前批次的数据
  280. batchData := indexData[i:end]
  281. // 转换成json
  282. marshal, err := json.Marshal(batchData)
  283. if err != nil {
  284. log.Printf("json.Marshal err: %v", err)
  285. return
  286. }
  287. // 发送 HTTP POST 请求
  288. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_DATA, string(marshal), "application/json")
  289. if err != nil {
  290. log.Printf("postEdbLib err: %v", err)
  291. return
  292. }
  293. }
  294. }
  295. // 新增指标库数据
  296. edbDataList := []models.EdbDataRzd{}
  297. for _, index := range indexData {
  298. // 补充 判断是否存在于指标库
  299. paramsLib := make(map[string]interface{})
  300. paramsLib["IndexCode"] = index.IndexCode
  301. paramsLib["Source"] = utils.DATA_SOURCE_RZD
  302. postEdbLib, err := httpRequestFill(paramsLib, utils.GET_RZD_EDB_INFO_BY_INDEX_CODE)
  303. if err != nil {
  304. // 有错误就不继续执行
  305. log.Printf("postEdbLib err: %v", err)
  306. continue
  307. }
  308. var requestResponse models.RequestResponse[models.EdbInfo]
  309. err = json.Unmarshal(postEdbLib, &requestResponse)
  310. if err != nil {
  311. log.Printf("postEdbLib err: %v", err)
  312. continue
  313. }
  314. if requestResponse.Data.EdbInfoId != 0 {
  315. edbDataRzd := models.EdbDataRzd{
  316. CreateTime: utils.GetCurrentTime(),
  317. ModifyTime: utils.GetCurrentTime(),
  318. EdbInfoId: index.BaseFromRzdIndexId,
  319. EdbCode: index.IndexCode,
  320. DataTime: index.DataTime,
  321. Value: index.Value,
  322. DataTimestamp: uint64(utils.StringToTimeFormat(index.DataTime, utils.FormatDate).UnixMilli()),
  323. }
  324. edbDataList = append(edbDataList, edbDataRzd)
  325. }
  326. }
  327. if len(edbDataList) > 0 {
  328. // 转换成json
  329. marshal, err := json.Marshal(edbDataList)
  330. if err != nil {
  331. log.Printf("postEdbLib err: %v", err)
  332. return
  333. }
  334. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_EDB_DATA, string(marshal), "application/json")
  335. if err != nil {
  336. // 有错误就不继续执行
  337. log.Printf("postEdbLib err: %v", err)
  338. return
  339. }
  340. }
  341. }
  342. }
  343. }
  344. func login(ctx context.Context) error {
  345. return chromedp.Run(ctx,
  346. chromedp.Navigate(rzdLoginPath),
  347. chromedp.SetValue(`input[id="Username"]`, utils.RZD_USERNAME, chromedp.ByQuery),
  348. chromedp.SetValue(`input[id="Password"]`, utils.RZD_PASSWORD, chromedp.ByQuery),
  349. chromedp.WaitEnabled(`//button[text()='Login']`, chromedp.BySearch),
  350. chromedp.Click(`//button[text()='Login']`, chromedp.BySearch),
  351. chromedp.Sleep(5*time.Second),
  352. // 等待并点击登录后页面的链接
  353. /*chromedp.WaitVisible(`a[href="/home"]`, chromedp.ByQuery), // 等待 Analytics Library 链接可见
  354. chromedp.Sleep(5*time.Second), */ // 等待页面加载完成
  355. )
  356. }
  357. func httpRequestFill(data interface{}, urlMethod string) (postEdbLib []byte, err error) {
  358. // 转换成json
  359. marshal, err := json.Marshal(data)
  360. if err != nil {
  361. return nil, err
  362. }
  363. // json 转 interface
  364. var result map[string]interface{}
  365. err = json.Unmarshal(marshal, &result)
  366. if err != nil {
  367. return nil, err
  368. }
  369. postEdbLib, err = utils.PostEdbLibRequest(result, urlMethod)
  370. if err != nil {
  371. // 有错误就不继续执行
  372. log.Printf("postEdbLib err: %v", err)
  373. return nil, err
  374. }
  375. return postEdbLib, nil
  376. }