data_processor.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. package ruizide
  2. import (
  3. "context"
  4. "encoding/json"
  5. "eta/eta_data_analysis/models"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/chromedp/cdproto/cdp"
  9. "github.com/xuri/excelize/v2"
  10. "io"
  11. "log"
  12. "os"
  13. "path/filepath"
  14. "strings"
  15. "time"
  16. "github.com/chromedp/chromedp"
  17. )
  // Directories, URLs and page selectors used by the scraping steps in this file.
  var (
  	// downloadDir is the directory downloaded workbooks are moved into and later read from.
  	downloadDir = "D:\\download"
  	// defaultDir is the directory scanned for freshly downloaded *.xlsx files.
  	// NOTE(review): hard-coded to one user's Windows profile — confirm before running elsewhere.
  	defaultDir = "C:\\Users\\Guo Mengyuan\\Downloads"
  	// rzdLoginPath is the URL navigated to for login and as the starting page for downloads.
  	rzdLoginPath = "https://clients.rystadenergy.com/clients/"
  	// Navigation-bar links.
  	clientSearchLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/search/"]`
  	clientsCubeDashboardsLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/cube-dashboards/"]`
  	// Dashboard tile selectors.
  	// NOTE(review): `:contains(...)` is a jQuery extension and `li[contains(., ...)]` is XPath-like
  	// syntax; neither looks like valid CSS for a querySelector-based lookup — confirm they match.
  	supplyRevisionAnalysisSelector = `div.ais-Hits li h5:contains("Supply Revision Analysis")`
  	oilDemandAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Demand Analysis')]`
  	oilSupplyAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Supply Analysis')]`
  	dateSlicerInputSelector = `div.visualContainer.unselectable.readMode.hideBorder.visualHeaderBelow.droppableElement.ui-droppable div.date-slicer-control input.date-slicer-input.enable-hover`
  	// downloadButtonSelector is the button clicked by clickDownload.
  	downloadButtonSelector = `div.btn.btn-link.btn-sm.dashboard-action.dashboard-action--download-data`
  	oilDemandIframeSelector = `div#WithPollingInFrame iframe` // update to match the actual iframe's selector
  	tabSelectorBase = `h3.preTextWithEllipsis` // base selector shared by every tab's H3 element
  	// Per-tab selectors built from tabSelectorBase (same `:contains` caveat as above).
  	continentTabSelector = tabSelectorBase + `:contains("Continent")`
  	regionTabSelector = tabSelectorBase + `:contains("Region")`
  	countryTabSelector = tabSelectorBase + `:contains("Country")`
  	productCategoryTabSelector = tabSelectorBase + `:contains("Product category")`
  	productDetailTabSelector = tabSelectorBase + `:contains("Product detail")`
  	sectorCategoryTabSelector = tabSelectorBase + `:contains("Sector category")`
  	sectorDetailTabSelector = tabSelectorBase + `:contains("Sector detail")`
  	scenarioTabSelector = tabSelectorBase + `:contains("Scenario")`
  )
  41. // 函数用于点击下载按钮
  42. func clickDownload(ctx context.Context) error {
  43. return chromedp.Run(ctx, chromedp.Click(downloadButtonSelector, chromedp.ByQuery))
  44. }
  45. // 处理数据下载的步骤
  46. func downloadData(ctx context.Context) error {
  47. // Analytics Library
  48. if err := chromedp.Run(ctx,
  49. chromedp.Sleep(5*time.Second), // 考虑移除这一行,如果不必要的话
  50. chromedp.Navigate(rzdLoginPath),
  51. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  52. chromedp.Click(clientSearchLink, chromedp.ByQuery),
  53. chromedp.WaitVisible(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, chromedp.ByQuery),
  54. chromedp.SetValue(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, "oil demand signals weekly report", chromedp.ByQuery),
  55. //chromedp.Click(`div.ais-InfiniteHits li a:has(img[src="/Static/img/icons/xls.png"])`, chromedp.ByQuery),
  56. ); err != nil {
  57. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  58. }
  59. xpath := `//div[@id='search-page-hits']//li//a[.//div//span[@class='align-middle' and text()='Data']]`
  60. var inputCount int
  61. var nodes []*cdp.Node // 使用 *cdp.Node
  62. if err := chromedp.Run(ctx,
  63. chromedp.ActionFunc(func(ctx context.Context) error {
  64. // 获取匹配的节点
  65. if err := chromedp.Nodes(xpath, &nodes, chromedp.BySearch).Do(ctx); err != nil {
  66. return fmt.Errorf("检查节点失败: %v", err)
  67. }
  68. // 获取节点数量
  69. inputCount = len(nodes)
  70. fmt.Printf("找到 %d 个匹配的元素\n", inputCount)
  71. if inputCount > 0 {
  72. // 点击第一个节点
  73. return chromedp.MouseClickNode(nodes[0]).Do(ctx) // 使用 []cdp.NodeID
  74. }
  75. return nil
  76. }),
  77. chromedp.Sleep(10*time.Second),
  78. ); err != nil {
  79. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  80. }
  81. // 解析文件移动到目标目录
  82. if err := waitAndRenameDownloadedFile("Oil_Demand_Signals_Weekly_Report_"+utils.GetCurrentYearMonth()+".xlsx", downloadDir); err != nil {
  83. return err
  84. }
  85. // Cube Dashboards: Supply Revision Analysis
  86. if err := chromedp.Run(ctx,
  87. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  88. chromedp.Click(clientsCubeDashboardsLink, chromedp.ByQuery),
  89. chromedp.Sleep(5*time.Second),
  90. chromedp.WaitVisible(`div.ais-Hits`, chromedp.ByQuery),
  91. chromedp.ActionFunc(func(ctx context.Context) error {
  92. var elements []string
  93. // 获取所有 h5 标签的文本内容
  94. if err := chromedp.Evaluate(`Array.from(document.querySelectorAll('div.ais-Hits li h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name')).map(h => h.textContent)`, &elements).Do(ctx); err != nil {
  95. return err
  96. }
  97. // 遍历文本,查找完全匹配的元素并点击
  98. for i, text := range elements {
  99. if strings.Contains(text, "Supply Revision Analysis") {
  100. // 构造选择器,点击找到的匹配元素
  101. selector := fmt.Sprintf(`div.ais-Hits ol li:nth-child(%d) h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name`, i+2)
  102. if err := chromedp.Click(selector, chromedp.ByQuery).Do(ctx); err != nil {
  103. return fmt.Errorf("点击 'Supply Revision Analysis' 失败: %v", err)
  104. }
  105. break // 找到后跳出循环
  106. }
  107. }
  108. return nil
  109. }),
  110. ); err != nil {
  111. return err
  112. }
  113. if err := clickDownload(ctx); err != nil {
  114. return err
  115. }
  116. if err := waitAndRenameDownloadedFile("Supply_Revision_Analysis_2020.xlsx", downloadDir); err != nil {
  117. return err
  118. }
  119. // Oil Supply Analysis
  120. if err := chromedp.Run(ctx,
  121. chromedp.Click(`a[href="/clients/subscription/"]`, chromedp.ByQuery),
  122. chromedp.Click(oilSupplyAnalysisSelector, chromedp.ByQuery),
  123. ); err != nil {
  124. return err
  125. }
  126. if err := clickDownload(ctx); err != nil {
  127. return err
  128. }
  129. if err := waitAndRenameDownloadedFile("Oil_Supply_Analysis_2010.xlsx", downloadDir); err != nil {
  130. return err
  131. }
  132. return nil
  133. }
  134. // 函数用于处理不同标签的下载
  135. func downloadOilDemandByTab(ctx context.Context, tabSelector string, year string, fileName string) error {
  136. // 切换到 iframe 并在 iframe 内进行操作
  137. if err := chromedp.Run(ctx,
  138. chromedp.WaitVisible(oilDemandIframeSelector, chromedp.ByQuery), // 等待 iframe 可见
  139. chromedp.ActionFunc(func(ctx context.Context) error {
  140. // 点击指定的标签
  141. if err := chromedp.Click(tabSelector, chromedp.ByQuery).Do(ctx); err != nil {
  142. return fmt.Errorf("点击标签失败: %v", err)
  143. }
  144. // 等待页面加载完成
  145. if err := chromedp.Sleep(2 * time.Second).Do(ctx); err != nil {
  146. return fmt.Errorf("等待页面加载失败: %v", err)
  147. }
  148. // 点击下载按钮
  149. if err := clickDownload(ctx); err != nil {
  150. return fmt.Errorf("点击下载按钮失败: %v", err)
  151. }
  152. return nil
  153. }),
  154. ); err != nil {
  155. return fmt.Errorf("操作失败: %v", err)
  156. }
  157. // 下载完成后,重命名文件
  158. if err := waitAndRenameDownloadedFile(fileName, downloadDir); err != nil {
  159. return fmt.Errorf("重命名文件失败: %v", err)
  160. }
  161. return nil
  162. }
  163. // 等待下载文件并重命名
  164. func waitAndRenameDownloadedFile(newFileName, targetDir string) error {
  165. // 等待一段时间以确保文件下载完成
  166. time.Sleep(60 * time.Second) // 可能需要根据实际情况调整
  167. // 查找下载目录中的文件
  168. files, err := filepath.Glob(filepath.Join(defaultDir, "*.xlsx"))
  169. if err != nil {
  170. return fmt.Errorf("查找文件时出错: %v", err)
  171. }
  172. // 如果没有找到文件,返回错误
  173. if len(files) == 0 {
  174. return fmt.Errorf("未找到任何下载的文件")
  175. }
  176. // 找到最新的文件
  177. var latestFile string
  178. var latestTime time.Time
  179. for _, file := range files {
  180. info, err := os.Stat(file)
  181. if err != nil {
  182. return fmt.Errorf("获取文件信息时出错: %v", err)
  183. }
  184. if info.ModTime().After(latestTime) {
  185. latestTime = info.ModTime()
  186. latestFile = file
  187. }
  188. }
  189. // 目标文件的完整路径
  190. targetFilePath := filepath.Join(targetDir, newFileName)
  191. // 重命名并移动到目标目录
  192. if latestFile != "" {
  193. if err := moveFile(latestFile, targetFilePath); err != nil {
  194. return fmt.Errorf("重命名文件时出错: %v", err)
  195. }
  196. // 打印重命名后的文件名
  197. fmt.Printf("文件重命名并移动到: %s\n", targetFilePath)
  198. }
  199. return nil
  200. }
  // moveFile moves source to destination by copying the contents and then
  // deleting source. Copy-then-delete is used rather than a rename so the move
  // also works when the two paths are on different volumes.
  //
  // Both file handles are closed before source is removed: on Windows, deleting
  // a file that this process still holds open typically fails. A failure to
  // close (flush) the destination is reported, and in that case source is left
  // in place so no data is lost.
  func moveFile(source, destination string) error {
  	if err := copyFileContents(source, destination); err != nil {
  		return err
  	}
  	if err := os.Remove(source); err != nil {
  		return fmt.Errorf("删除源文件时出错: %w", err)
  	}
  	return nil
  }

  // copyFileContents copies source into destination (created or truncated) and
  // closes both files before returning.
  func copyFileContents(source, destination string) (err error) {
  	srcFile, err := os.Open(source)
  	if err != nil {
  		return fmt.Errorf("打开源文件时出错: %w", err)
  	}
  	defer srcFile.Close() // read-only handle: a close error carries no data-loss risk

  	dstFile, err := os.Create(destination)
  	if err != nil {
  		return fmt.Errorf("创建目标文件时出错: %w", err)
  	}
  	defer func() {
  		// Surface a failed close of the written file unless an earlier error
  		// is already being returned.
  		if closeErr := dstFile.Close(); closeErr != nil && err == nil {
  			err = fmt.Errorf("关闭目标文件时出错: %w", closeErr)
  		}
  	}()

  	if _, err = io.Copy(dstFile, srcFile); err != nil {
  		return fmt.Errorf("复制文件时出错: %w", err)
  	}
  	return nil
  }
  223. // 解析网页数据,下载文件
  224. // func main() {
  225. func resolverNet() {
  226. // 创建下载目录
  227. if err := os.MkdirAll(downloadDir, os.ModePerm); err != nil {
  228. fmt.Printf("创建下载目录时出错: %v\n", err)
  229. return
  230. }
  231. // 创建 chromedp 执行上下文
  232. options := []chromedp.ExecAllocatorOption{
  233. chromedp.Flag("headless", false),
  234. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  235. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  236. // 设置了但并不生效,直接从默认下载路径读取过来
  237. //chromedp.Flag("download.default_directory", downloadDir),
  238. //chromedp.Flag("download.prompt_for_download", false), // 不弹出下载对话框
  239. chromedp.Flag("safebrowsing.enabled", true), // 启用安全浏览
  240. //chromedp.UserDataDir(filepath.Join(downloadDir, "user-data")),
  241. }
  242. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  243. defer cancel()
  244. ctx, cancel := chromedp.NewContext(allocCtx)
  245. defer cancel()
  246. // 启动 Chrome 实例
  247. if err := chromedp.Run(ctx); err != nil {
  248. fmt.Printf("启动 Chrome 实例时出错: %v\n", err)
  249. return
  250. }
  251. // 设置下载行为
  252. /*if err := setDownloadBehavior(ctx); err != nil {
  253. fmt.Printf("设置下载路径时出错: %v\n", err)
  254. return
  255. }*/
  256. // 登录操作
  257. if err := login(ctx); err != nil {
  258. fmt.Printf("登录错误: %v\n", err)
  259. return
  260. }
  261. // 下载数据
  262. if err := downloadData(ctx); err != nil {
  263. fmt.Printf("数据下载错误: %v\n", err)
  264. return
  265. }
  266. fmt.Println("数据下载完成")
  267. }
  268. // 解析本地文件
  269. // func fileResolver() {
  270. func main() {
  271. var fileName string
  272. var tableName = "Oil_Demand_Signals_Weekly_Report"
  273. // 解析Oil_Demand_Signals_Weekly_Report_表格
  274. fileName = tableName + "_" + utils.GetCurrentYearMonth() + ".xlsx"
  275. filePath := filepath.Join(downloadDir, fileName)
  276. // 打开 Excel 文件
  277. f, err := excelize.OpenFile(filePath)
  278. if err != nil {
  279. log.Fatalf("无法打开 Excel 文件: %v", err)
  280. }
  281. // 获取所有工作表
  282. sheetNames := f.GetSheetList()
  283. for sheetIndex, sheetName := range sheetNames {
  284. fmt.Printf("读取工作表: %s\n", sheetName)
  285. if sheetIndex == 0 {
  286. continue
  287. }
  288. // 获取工作表的最大行数
  289. maxRow, err := f.GetRows(sheetName) // 直接获取所有行数据
  290. if err != nil {
  291. log.Fatalf("获取工作表数据时出错: %v", err)
  292. }
  293. // 遍历行并打印内容
  294. indexData := []models.BaseFromRzdData{}
  295. for rowIndex, rowData := range maxRow {
  296. if rowIndex < 4 {
  297. continue
  298. }
  299. processor, err := GetProcessor(tableName, sheetName)
  300. if err != nil {
  301. return
  302. }
  303. baseFromLyDataList, err := processor.Process(tableName, sheetName, rowData)
  304. indexData = append(indexData, baseFromLyDataList...)
  305. }
  306. // 新增数据源指标数据
  307. if len(indexData) > 0 {
  308. // 转换成json
  309. marshal, err := json.Marshal(indexData)
  310. if err != nil {
  311. log.Printf("postEdbLib err: %v", err)
  312. return
  313. }
  314. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_RZD_DATA_LIST, string(marshal), "application/json")
  315. if err != nil {
  316. // 有错误就不继续执行
  317. log.Printf("postEdbLib err: %v", err)
  318. return
  319. }
  320. }
  321. // 新增指标库数据
  322. edbDataList := []models.EdbDataRzd{}
  323. for _, index := range indexData {
  324. // 补充 判断是否存在于指标库
  325. paramsLib := make(map[string]interface{})
  326. paramsLib["IndexCode"] = index.IndexCode
  327. paramsLib["DataTime"] = index.DataTime
  328. postEdbLib, err := httpRequestFill(paramsLib, utils.GET_EDB_DATA_BY_INDEX_CODE)
  329. if err != nil {
  330. // 有错误就不继续执行
  331. log.Printf("postEdbLib err: %v", err)
  332. continue
  333. }
  334. var requestResponse models.RequestResponse[models.EdbInfo]
  335. err = json.Unmarshal(postEdbLib, &requestResponse)
  336. if err != nil {
  337. log.Printf("postEdbLib err: %v", err)
  338. continue
  339. }
  340. if requestResponse.Data.EdbInfoId == 0 {
  341. edbDataRzd := models.EdbDataRzd{
  342. CreateTime: utils.GetCurrentTime(),
  343. ModifyTime: utils.GetCurrentTime(),
  344. EdbInfoId: index.BaseFromRzdIndexId,
  345. EdbCode: index.IndexCode,
  346. DataTime: index.DataTime,
  347. Value: index.Value,
  348. DataTimestamp: uint64(time.Now().UnixNano() / int64(time.Millisecond)),
  349. }
  350. edbDataList = append(edbDataList, edbDataRzd)
  351. }
  352. }
  353. if len(edbDataList) > 0 {
  354. // 转换成json
  355. marshal, err := json.Marshal(edbDataList)
  356. if err != nil {
  357. log.Printf("postEdbLib err: %v", err)
  358. return
  359. }
  360. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_EDB_DATA, string(marshal), "application/json")
  361. if err != nil {
  362. // 有错误就不继续执行
  363. log.Printf("postEdbLib err: %v", err)
  364. return
  365. }
  366. }
  367. }
  368. }
  369. // setDownloadBehavior 设置下载路径
  370. func setDownloadBehavior(ctx context.Context) error {
  371. return chromedp.Run(ctx,
  372. chromedp.ActionFunc(func(ctx context.Context) error {
  373. // 使用 chromedp.Exec 提交下载行为
  374. var result interface{}
  375. if err := chromedp.Evaluate(`(function() {
  376. return new Promise((resolve, reject) => {
  377. chrome.page.setDownloadBehavior({
  378. behavior: 'allow',
  379. downloadPath: '`+downloadDir+`'
  380. }, function() {
  381. resolve();
  382. });
  383. });
  384. })()`, &result).Do(ctx); err != nil {
  385. return fmt.Errorf("设置下载行为失败: %v", err)
  386. }
  387. return nil
  388. }),
  389. )
  390. }
  391. func login(ctx context.Context) error {
  392. return chromedp.Run(ctx,
  393. chromedp.Navigate(rzdLoginPath),
  394. chromedp.SetValue(`input[id="Username"]`, utils.RZD_USERNAME, chromedp.ByQuery),
  395. chromedp.SetValue(`input[id="Password"]`, utils.RZD_PASSWORD, chromedp.ByQuery),
  396. chromedp.WaitEnabled(`//button[text()='Login']`, chromedp.BySearch),
  397. chromedp.Click(`//button[text()='Login']`, chromedp.BySearch),
  398. chromedp.Sleep(5*time.Second),
  399. // 等待并点击登录后页面的链接
  400. chromedp.WaitVisible(`a[href="/clients/"]`, chromedp.ByQuery), // 等待 Analytics Library 链接可见
  401. chromedp.Sleep(5*time.Second), // 等待页面加载完成
  402. )
  403. }
  404. func httpRequestFill(data interface{}, urlMethod string) (postEdbLib []byte, err error) {
  405. // 转换成json
  406. marshal, err := json.Marshal(data)
  407. if err != nil {
  408. return nil, err
  409. }
  410. // json 转 interface
  411. var result map[string]interface{}
  412. err = json.Unmarshal(marshal, &result)
  413. if err != nil {
  414. return nil, err
  415. }
  416. postEdbLib, err = utils.PostEdbLibRequest(result, urlMethod)
  417. if err != nil {
  418. // 有错误就不继续执行
  419. log.Printf("postEdbLib err: %v", err)
  420. return nil, err
  421. }
  422. return postEdbLib, nil
  423. }