data_processor.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. package main
  2. import (
  3. "context"
  4. "encoding/json"
  5. "eta/eta_data_analysis/models"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/chromedp/cdproto/cdp"
  9. "github.com/xuri/excelize/v2"
  10. "io"
  11. "log"
  12. "os"
  13. "path/filepath"
  14. "strings"
  15. "time"
  16. "github.com/chromedp/chromedp"
  17. )
  18. // 定义选择器
  19. var (
  20. downloadDir = "D:\\download"
  21. defaultDir = "C:\\Users\\Guo Mengyuan\\Downloads"
  22. rzdLoginPath = "https://clients.rystadenergy.com/clients/"
  23. rzdBatchSize = 500
  24. clientSearchLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/search/"]`
  25. clientsCubeDashboardsLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/cube-dashboards/"]`
  26. supplyRevisionAnalysisSelector = `div.ais-Hits li h5:contains("Supply Revision Analysis")`
  27. oilDemandAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Demand Analysis')]`
  28. oilSupplyAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Supply Analysis')]`
  29. dateSlicerInputSelector = `div.visualContainer.unselectable.readMode.hideBorder.visualHeaderBelow.droppableElement.ui-droppable div.date-slicer-control input.date-slicer-input.enable-hover`
  30. downloadButtonSelector = `div.btn.btn-link.btn-sm.dashboard-action.dashboard-action--download-data`
  31. oilDemandIframeSelector = `div#WithPollingInFrame iframe` // 根据实际 iframe 的选择器更新
  32. tabSelectorBase = `h3.preTextWithEllipsis` // H3 标签中每个标签的基础选择器
  33. )
  34. // 函数用于点击下载按钮
  35. func clickDownload(ctx context.Context) error {
  36. return chromedp.Run(ctx, chromedp.Click(downloadButtonSelector, chromedp.ByQuery))
  37. }
  38. // 处理数据下载的步骤
  39. func downloadData(ctx context.Context) error {
  40. // Analytics Library
  41. if err := chromedp.Run(ctx,
  42. chromedp.Sleep(5*time.Second), // 考虑移除这一行,如果不必要的话
  43. chromedp.Navigate(rzdLoginPath),
  44. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  45. chromedp.Click(clientSearchLink, chromedp.ByQuery),
  46. chromedp.WaitVisible(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, chromedp.ByQuery),
  47. chromedp.SetValue(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, "oil demand signals weekly report", chromedp.ByQuery),
  48. //chromedp.Click(`div.ais-InfiniteHits li a:has(img[src="/Static/img/icons/xls.png"])`, chromedp.ByQuery),
  49. ); err != nil {
  50. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  51. }
  52. xpath := `//div[@id='search-page-hits']//li//a[.//div//span[@class='align-middle' and text()='Data']]`
  53. var inputCount int
  54. var nodes []*cdp.Node // 使用 *cdp.Node
  55. if err := chromedp.Run(ctx,
  56. chromedp.ActionFunc(func(ctx context.Context) error {
  57. // 获取匹配的节点
  58. if err := chromedp.Nodes(xpath, &nodes, chromedp.BySearch).Do(ctx); err != nil {
  59. return fmt.Errorf("检查节点失败: %v", err)
  60. }
  61. // 获取节点数量
  62. inputCount = len(nodes)
  63. fmt.Printf("找到 %d 个匹配的元素\n", inputCount)
  64. if inputCount > 0 {
  65. // 点击第一个节点
  66. return chromedp.MouseClickNode(nodes[0]).Do(ctx) // 使用 []cdp.NodeID
  67. }
  68. return nil
  69. }),
  70. chromedp.Sleep(10*time.Second),
  71. ); err != nil {
  72. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  73. }
  74. // 解析文件移动到目标目录
  75. if err := waitAndRenameDownloadedFile("Oil_Demand_Signals_Weekly_Report_"+utils.GetCurrentYearMonth()+".xlsx", downloadDir); err != nil {
  76. return err
  77. }
  78. // Cube Dashboards: Supply Revision Analysis
  79. if err := chromedp.Run(ctx,
  80. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  81. chromedp.Click(clientsCubeDashboardsLink, chromedp.ByQuery),
  82. chromedp.Sleep(5*time.Second),
  83. chromedp.WaitVisible(`div.ais-Hits`, chromedp.ByQuery),
  84. chromedp.ActionFunc(func(ctx context.Context) error {
  85. var elements []string
  86. // 获取所有 h5 标签的文本内容
  87. if err := chromedp.Evaluate(`Array.from(document.querySelectorAll('div.ais-Hits li h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name')).map(h => h.textContent)`, &elements).Do(ctx); err != nil {
  88. return err
  89. }
  90. // 遍历文本,查找完全匹配的元素并点击
  91. for i, text := range elements {
  92. if strings.Contains(text, "Supply Revision Analysis") {
  93. // 构造选择器,点击找到的匹配元素
  94. selector := fmt.Sprintf(`div.ais-Hits ol li:nth-child(%d) h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name`, i+2)
  95. if err := chromedp.Click(selector, chromedp.ByQuery).Do(ctx); err != nil {
  96. return fmt.Errorf("点击 'Supply Revision Analysis' 失败: %v", err)
  97. }
  98. break // 找到后跳出循环
  99. }
  100. }
  101. return nil
  102. }),
  103. ); err != nil {
  104. return err
  105. }
  106. if err := clickDownload(ctx); err != nil {
  107. return err
  108. }
  109. if err := waitAndRenameDownloadedFile("Supply_Revision_Analysis_2020.xlsx", downloadDir); err != nil {
  110. return err
  111. }
  112. // Oil Supply Analysis
  113. if err := chromedp.Run(ctx,
  114. chromedp.Click(`a[href="/clients/subscription/"]`, chromedp.ByQuery),
  115. chromedp.Click(oilSupplyAnalysisSelector, chromedp.ByQuery),
  116. ); err != nil {
  117. return err
  118. }
  119. if err := clickDownload(ctx); err != nil {
  120. return err
  121. }
  122. if err := waitAndRenameDownloadedFile("Oil_Supply_Analysis_2010.xlsx", downloadDir); err != nil {
  123. return err
  124. }
  125. return nil
  126. }
  127. // 等待下载文件并重命名
  128. func waitAndRenameDownloadedFile(newFileName, targetDir string) error {
  129. // 等待一段时间以确保文件下载完成
  130. time.Sleep(60 * time.Second) // 可能需要根据实际情况调整
  131. // 查找下载目录中的文件
  132. files, err := filepath.Glob(filepath.Join(defaultDir, "*.xlsx"))
  133. if err != nil {
  134. return fmt.Errorf("查找文件时出错: %v", err)
  135. }
  136. // 如果没有找到文件,返回错误
  137. if len(files) == 0 {
  138. return fmt.Errorf("未找到任何下载的文件")
  139. }
  140. // 找到最新的文件
  141. var latestFile string
  142. var latestTime time.Time
  143. for _, file := range files {
  144. info, err := os.Stat(file)
  145. if err != nil {
  146. return fmt.Errorf("获取文件信息时出错: %v", err)
  147. }
  148. if info.ModTime().After(latestTime) {
  149. latestTime = info.ModTime()
  150. latestFile = file
  151. }
  152. }
  153. // 目标文件的完整路径
  154. targetFilePath := filepath.Join(targetDir, newFileName)
  155. // 重命名并移动到目标目录
  156. if latestFile != "" {
  157. if err := moveFile(latestFile, targetFilePath); err != nil {
  158. return fmt.Errorf("重命名文件时出错: %v", err)
  159. }
  160. // 打印重命名后的文件名
  161. fmt.Printf("文件重命名并移动到: %s\n", targetFilePath)
  162. }
  163. return nil
  164. }
  165. func moveFile(source, destination string) error {
  166. // 复制文件
  167. srcFile, err := os.Open(source)
  168. if err != nil {
  169. return fmt.Errorf("打开源文件时出错: %v", err)
  170. }
  171. defer srcFile.Close()
  172. dstFile, err := os.Create(destination)
  173. if err != nil {
  174. return fmt.Errorf("创建目标文件时出错: %v", err)
  175. }
  176. defer dstFile.Close()
  177. if _, err := io.Copy(dstFile, srcFile); err != nil {
  178. return fmt.Errorf("复制文件时出错: %v", err)
  179. }
  180. time.Sleep(60 * time.Second)
  181. // 删除源文件
  182. if err := os.Remove(source); err != nil {
  183. return fmt.Errorf("删除源文件时出错: %v", err)
  184. }
  185. return nil
  186. }
  187. // 解析网页数据,下载文件
  188. // func main() {
  189. func resolverNet() {
  190. // 创建下载目录
  191. if err := os.MkdirAll(downloadDir, os.ModePerm); err != nil {
  192. fmt.Printf("创建下载目录时出错: %v\n", err)
  193. return
  194. }
  195. // 创建 chromedp 执行上下文
  196. options := []chromedp.ExecAllocatorOption{
  197. chromedp.Flag("headless", false),
  198. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  199. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  200. // 设置了但并不生效,直接从默认下载路径读取过来
  201. //chromedp.Flag("download.default_directory", downloadDir),
  202. //chromedp.Flag("download.prompt_for_download", false), // 不弹出下载对话框
  203. chromedp.Flag("safebrowsing.enabled", true), // 启用安全浏览
  204. //chromedp.UserDataDir(filepath.Join(downloadDir, "user-data")),
  205. }
  206. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  207. defer cancel()
  208. ctx, cancel := chromedp.NewContext(allocCtx)
  209. defer cancel()
  210. // 启动 Chrome 实例
  211. if err := chromedp.Run(ctx); err != nil {
  212. fmt.Printf("启动 Chrome 实例时出错: %v\n", err)
  213. return
  214. }
  215. // 设置下载行为
  216. /*if err := setDownloadBehavior(ctx); err != nil {
  217. fmt.Printf("设置下载路径时出错: %v\n", err)
  218. return
  219. }*/
  220. // 登录操作
  221. if err := login(ctx); err != nil {
  222. fmt.Printf("登录错误: %v\n", err)
  223. return
  224. }
  225. // 下载数据
  226. if err := downloadData(ctx); err != nil {
  227. fmt.Printf("数据下载错误: %v\n", err)
  228. return
  229. }
  230. fmt.Println("数据下载完成")
  231. }
  232. // 解析本地文件
  233. // func fileResolver() {
  234. func main() {
  235. var tableNameList = []string{
  236. //"Oil_Demand_Signals_Weekly_Report",
  237. "Supply_Revision_Analysis",
  238. /*"Oil_Market_Cube_Upstream_Supply_Oil_Quality_Api",
  239. "Oil_Market_Cube_Upstream_Supply_Oil_Quality_Sulphur",
  240. "Oil_Market_Cube_Upstream_Supply_Capacity_Capacity",
  241. "Oil_Market_Cube_Upstream_Supply_Production",
  242. "Oil_Market_Cube_Upstream_Supply_Production_Wo_Seasonality",
  243. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Reference_Production",
  244. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Production",
  245. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Cut",
  246. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Actual_Cut",
  247. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Compliance",
  248. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Production_Subject_To_Cut",
  249. "Oil_Market_Cube_Products_Demand_Products_Demand_Mean",
  250. "Oil_Market_Cube_Products_Demand_Products_Demand+Sigma",
  251. "Oil_Market_Cube_Products_Demand_Products_Demand-Sigma",
  252. "Oil_Market_Cube_Balances_Total_Liquids_Balances",
  253. "Oil_Market_Cube_Geography_Latitude",
  254. "Oil_Market_Cube_Geography_Longitude",*/
  255. }
  256. for _, tableName := range tableNameList {
  257. var fileName string
  258. // 解析表格
  259. fileName = tableName + "_" + utils.GetCurrentYearMonth() + ".xlsx"
  260. filePath := filepath.Join(downloadDir, fileName)
  261. // 打开 Excel 文件
  262. f, err := excelize.OpenFile(filePath)
  263. if err != nil {
  264. log.Fatalf("无法打开 Excel 文件: %v", err)
  265. }
  266. // 获取所有工作表
  267. sheetNames := f.GetSheetList()
  268. for _, sheetName := range sheetNames {
  269. fmt.Printf("读取工作表: %s\n", sheetName)
  270. /*if strings.Contains(sheetName, "Content") {
  271. continue
  272. }
  273. if strings.Contains(sheetName, "Road Index") {
  274. continue
  275. }
  276. if strings.Contains(sheetName, "Road Active Fleet") {
  277. continue
  278. }
  279. if strings.Contains(sheetName, "Aviation Index") {
  280. continue
  281. }
  282. if strings.Contains(sheetName, "Aviation Active Fleet") {
  283. continue
  284. }
  285. if strings.Contains(sheetName, "Demand - Gasoline") {
  286. continue
  287. }
  288. if strings.Contains(sheetName, "Demand - Diesel") {
  289. continue
  290. }
  291. if strings.Contains(sheetName, "Demand - Jet Fuel") {
  292. continue
  293. }
  294. if strings.Contains(sheetName, "Demand - Maritime Bunker") {
  295. continue
  296. }*/
  297. if strings.Contains(sheetName, "Chart1") {
  298. continue
  299. }
  300. if strings.Contains(sheetName, "Chart2") {
  301. continue
  302. }
  303. if strings.Contains(sheetName, "Chart3") {
  304. continue
  305. }
  306. if strings.Contains(sheetName, "Chart4") {
  307. continue
  308. }
  309. if strings.Contains(sheetName, "Chart5") {
  310. continue
  311. }
  312. // 获取工作表的最大行数
  313. maxRow, err := f.GetRows(sheetName) // 直接获取所有行数据
  314. if err != nil {
  315. log.Fatalf("获取工作表数据时出错: %v", err)
  316. continue
  317. }
  318. // 遍历行并打印内容
  319. indexData := []models.BaseFromRzdData{}
  320. for rowIndex, rowData := range maxRow {
  321. processor, err := GetProcessor(tableName, sheetName)
  322. if err != nil {
  323. continue
  324. }
  325. baseFromLyDataList, err := processor.Process(tableName, sheetName, rowIndex, rowData)
  326. if err != nil {
  327. log.Printf("processor.Process err: %v", err)
  328. return
  329. }
  330. indexData = append(indexData, baseFromLyDataList...)
  331. }
  332. // 新增数据源指标数据
  333. if len(indexData) > 0 {
  334. for i := 0; i < len(indexData); i += rzdBatchSize {
  335. // 计算当前批次的结束索引
  336. end := i + rzdBatchSize
  337. if end > len(indexData) {
  338. end = len(indexData)
  339. }
  340. // 获取当前批次的数据
  341. batchData := indexData[i:end]
  342. // 转换成json
  343. marshal, err := json.Marshal(batchData)
  344. if err != nil {
  345. log.Printf("json.Marshal err: %v", err)
  346. return
  347. }
  348. // 发送 HTTP POST 请求
  349. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_DATA, string(marshal), "application/json")
  350. if err != nil {
  351. log.Printf("postEdbLib err: %v", err)
  352. return
  353. }
  354. }
  355. }
  356. // 新增指标库数据
  357. edbDataList := []models.EdbDataRzd{}
  358. for _, index := range indexData {
  359. // 补充 判断是否存在于指标库
  360. paramsLib := make(map[string]interface{})
  361. paramsLib["IndexCode"] = index.IndexCode
  362. paramsLib["Source"] = utils.DATA_SOURCE_RZD
  363. postEdbLib, err := httpRequestFill(paramsLib, utils.GET_RZD_EDB_INFO_BY_INDEX_CODE)
  364. if err != nil {
  365. // 有错误就不继续执行
  366. log.Printf("postEdbLib err: %v", err)
  367. continue
  368. }
  369. var requestResponse models.RequestResponse[models.EdbInfo]
  370. err = json.Unmarshal(postEdbLib, &requestResponse)
  371. if err != nil {
  372. log.Printf("postEdbLib err: %v", err)
  373. continue
  374. }
  375. if requestResponse.Data.EdbInfoId != 0 {
  376. edbDataRzd := models.EdbDataRzd{
  377. CreateTime: utils.GetCurrentTime(),
  378. ModifyTime: utils.GetCurrentTime(),
  379. EdbInfoId: index.BaseFromRzdIndexId,
  380. EdbCode: index.IndexCode,
  381. DataTime: index.DataTime,
  382. Value: index.Value,
  383. DataTimestamp: uint64(time.Now().UnixNano() / int64(time.Millisecond)),
  384. }
  385. edbDataList = append(edbDataList, edbDataRzd)
  386. }
  387. }
  388. if len(edbDataList) > 0 {
  389. // 转换成json
  390. marshal, err := json.Marshal(edbDataList)
  391. if err != nil {
  392. log.Printf("postEdbLib err: %v", err)
  393. return
  394. }
  395. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_EDB_DATA, string(marshal), "application/json")
  396. if err != nil {
  397. // 有错误就不继续执行
  398. log.Printf("postEdbLib err: %v", err)
  399. return
  400. }
  401. }
  402. }
  403. }
  404. }
  405. func login(ctx context.Context) error {
  406. return chromedp.Run(ctx,
  407. chromedp.Navigate(rzdLoginPath),
  408. chromedp.SetValue(`input[id="Username"]`, utils.RZD_USERNAME, chromedp.ByQuery),
  409. chromedp.SetValue(`input[id="Password"]`, utils.RZD_PASSWORD, chromedp.ByQuery),
  410. chromedp.WaitEnabled(`//button[text()='Login']`, chromedp.BySearch),
  411. chromedp.Click(`//button[text()='Login']`, chromedp.BySearch),
  412. chromedp.Sleep(5*time.Second),
  413. // 等待并点击登录后页面的链接
  414. chromedp.WaitVisible(`a[href="/clients/"]`, chromedp.ByQuery), // 等待 Analytics Library 链接可见
  415. chromedp.Sleep(5*time.Second), // 等待页面加载完成
  416. )
  417. }
  418. func httpRequestFill(data interface{}, urlMethod string) (postEdbLib []byte, err error) {
  419. // 转换成json
  420. marshal, err := json.Marshal(data)
  421. if err != nil {
  422. return nil, err
  423. }
  424. // json 转 interface
  425. var result map[string]interface{}
  426. err = json.Unmarshal(marshal, &result)
  427. if err != nil {
  428. return nil, err
  429. }
  430. postEdbLib, err = utils.PostEdbLibRequest(result, urlMethod)
  431. if err != nil {
  432. // 有错误就不继续执行
  433. log.Printf("postEdbLib err: %v", err)
  434. return nil, err
  435. }
  436. return postEdbLib, nil
  437. }