data_processor.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. package main
  2. import (
  3. "context"
  4. "eta/eta_data_analysis/utils"
  5. "fmt"
  6. "log"
  7. "os"
  8. "path/filepath"
  9. "strings"
  10. "time"
  11. "github.com/chromedp/chromedp"
  12. )
  // downloadDir is the directory that is created at start-up and searched for downloaded .xlsx files.
const downloadDir = "./downloads"

// Entry URL and top navigation links of the client portal.
var (
	rzdLoginPath              = "https://clients.rystadenergy.com/clients/"
	clientSearchLink          = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/search/"]`
	clientsCubeDashboardsLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/cube-dashboards/"]`
)

// Selectors for individual dashboard hits.
//
// NOTE(review): `:contains(...)` is a jQuery extension and `[contains(., ...)]`
// is an XPath predicate; neither is standard CSS selector syntax. Verify they
// match anything when passed to chromedp with chromedp.ByQuery.
var (
	supplyRevisionAnalysisSelector = `div.ais-Hits li h5:contains("Supply Revision Analysis")`
	oilDemandAnalysisSelector      = `div.ais-Hits li[contains(., 'Oil Demand Analysis')]`
	oilSupplyAnalysisSelector      = `div.ais-Hits li[contains(., 'Oil Supply Analysis')]`
)

// Selectors for the controls of an opened dashboard.
var (
	dateSlicerInputSelector = `div.visualContainer.unselectable.readMode.hideBorder.visualHeaderBelow.droppableElement.ui-droppable div.date-slicer-control input.date-slicer-input.enable-hover`
	downloadButtonSelector  = `div.btn.btn-link.btn-sm.dashboard-action.dashboard-action--download-data`
	oilDemandIframeSelector = `div#WithPollingInFrame iframe` // update to match the actual iframe selector
)

// Selectors for the tabs of the Oil Demand Analysis dashboard, each built from
// the shared <h3> base selector.
//
// NOTE(review): these also rely on `:contains(...)`; see the note above.
var (
	tabSelectorBase            = `h3.preTextWithEllipsis` // base selector shared by every tab's <h3> label
	continentTabSelector       = tabSelectorBase + `:contains("Continent")`
	regionTabSelector          = tabSelectorBase + `:contains("Region")`
	countryTabSelector         = tabSelectorBase + `:contains("Country")`
	productCategoryTabSelector = tabSelectorBase + `:contains("Product category")`
	productDetailTabSelector   = tabSelectorBase + `:contains("Product detail")`
	sectorCategoryTabSelector  = tabSelectorBase + `:contains("Sector category")`
	sectorDetailTabSelector    = tabSelectorBase + `:contains("Sector detail")`
	scenarioTabSelector        = tabSelectorBase + `:contains("Scenario")`
)
  35. // 函数用于设置查询时间范围
  36. func setQueryTime(ctx context.Context, year string) error {
  37. // 在这里可以直接操作 iframe 中的元素
  38. var inputCount int
  39. if err := chromedp.Run(ctx,
  40. chromedp.WaitVisible(`#reportContainer`, chromedp.ByQuery),
  41. // 获取 reportContainer 下的第一个 iframe 的内容文档
  42. chromedp.ActionFunc(func(ctx context.Context) error {
  43. // 获取 iframe 的内容文档
  44. var iframeSrc string
  45. // 获取 iframe 的 src
  46. err := chromedp.Evaluate(`document.querySelector('#reportContainer iframe').src`, &iframeSrc).Do(ctx)
  47. if err != nil {
  48. return fmt.Errorf("获取 iframe ID 或 src 失败: %v", err)
  49. }
  50. // 在 iframe 的上下文中操作
  51. return chromedp.Run(ctx,
  52. // 等待 iframe 可见
  53. chromedp.WaitVisible(`iframe[src="`+iframeSrc+`"]`, chromedp.ByQuery),
  54. chromedp.Sleep(5*time.Second),
  55. // 在 iframe 中执行操作
  56. chromedp.ActionFunc(func(ctx context.Context) error {
  57. // 选择器
  58. selector := `div.landingContainer`
  59. // 获取元素数量
  60. if err := chromedp.Evaluate(`document.querySelectorAll("`+selector+`").length`, &inputCount).Do(ctx); err != nil {
  61. return fmt.Errorf("检查输入框失败: %v", err)
  62. }
  63. if inputCount == 0 {
  64. return fmt.Errorf("没有找到匹配的 div.landingContainer 标签")
  65. }
  66. return nil
  67. }),
  68. )
  69. }),
  70. ); err != nil {
  71. log.Fatal(err)
  72. }
  73. /*return chromedp.Run(ctx,
  74. chromedp.Sleep(3*time.Second),
  75. chromedp.WaitVisible(dateSlicerInputSelector, chromedp.ByQuery),
  76. chromedp.SetValue(dateSlicerInputSelector, year, chromedp.ByQuery),
  77. chromedp.SendKeys(dateSlicerInputSelector, "\n"), // 回车查询
  78. )*/
  79. return nil
  80. }
  81. // 函数用于点击下载按钮
  82. func clickDownload(ctx context.Context) error {
  83. return chromedp.Run(ctx, chromedp.Click(downloadButtonSelector, chromedp.ByQuery))
  84. }
  85. // 处理数据下载的步骤
  86. func downloadData(ctx context.Context) error {
  87. // Analytics Library
  88. if err := chromedp.Run(ctx,
  89. chromedp.Sleep(5*time.Second), // 考虑移除这一行,如果不必要的话
  90. chromedp.Navigate(rzdLoginPath),
  91. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  92. chromedp.Click(clientSearchLink, chromedp.ByQuery),
  93. chromedp.WaitVisible(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, chromedp.ByQuery),
  94. chromedp.SetValue(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, "oil demand signals weekly report", chromedp.ByQuery),
  95. chromedp.Click(`div.ais-InfiniteHits img[src="/Static/img/icons/xls.png"]`, chromedp.ByQuery),
  96. ); err != nil {
  97. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  98. }
  99. // Cube Dashboards: Supply Revision Analysis
  100. if err := chromedp.Run(ctx,
  101. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  102. chromedp.Click(clientsCubeDashboardsLink, chromedp.ByQuery),
  103. chromedp.Sleep(5*time.Second),
  104. chromedp.WaitVisible(`div.ais-Hits`, chromedp.ByQuery),
  105. chromedp.ActionFunc(func(ctx context.Context) error {
  106. var elements []string
  107. // 获取所有 h5 标签的文本内容
  108. if err := chromedp.Evaluate(`Array.from(document.querySelectorAll('div.ais-Hits li h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name')).map(h => h.textContent)`, &elements).Do(ctx); err != nil {
  109. return err
  110. }
  111. // 遍历文本,查找完全匹配的元素并点击
  112. for i, text := range elements {
  113. if strings.Contains(text, "Supply Revision Analysis") {
  114. // 构造选择器,点击找到的匹配元素
  115. selector := fmt.Sprintf(`div.ais-Hits ol li:nth-child(%d) h5.text-body.overflow-hidden.mb-1.mr-3.font-weight-bold.line-height-1.dashboards-hit__name`, i+2)
  116. if err := chromedp.Click(selector, chromedp.ByQuery).Do(ctx); err != nil {
  117. return fmt.Errorf("点击 'Supply Revision Analysis' 失败: %v", err)
  118. }
  119. break // 找到后跳出循环
  120. }
  121. }
  122. return nil
  123. }),
  124. ); err != nil {
  125. return err
  126. }
  127. if err := setQueryTime(ctx, "2020"); err != nil {
  128. return err
  129. }
  130. if err := clickDownload(ctx); err != nil {
  131. return err
  132. }
  133. if err := waitAndRenameDownloadedFile("Supply_Revision_Analysis_2020.xlsx"); err != nil {
  134. return err
  135. }
  136. // Oil Demand Analysis
  137. if err := downloadOilDemandAnalysis(ctx); err != nil {
  138. return fmt.Errorf("下载 Oil Demand Analysis 错误: %v", err)
  139. }
  140. // Oil Supply Analysis
  141. if err := chromedp.Run(ctx,
  142. chromedp.Click(`a[href="/clients/subscription/"]`, chromedp.ByQuery),
  143. chromedp.Click(oilSupplyAnalysisSelector, chromedp.ByQuery),
  144. ); err != nil {
  145. return err
  146. }
  147. if err := setQueryTime(ctx, "2010"); err != nil {
  148. return err
  149. }
  150. if err := clickDownload(ctx); err != nil {
  151. return err
  152. }
  153. if err := waitAndRenameDownloadedFile("Oil_Supply_Analysis_2010.xlsx"); err != nil {
  154. return err
  155. }
  156. return nil
  157. }
  158. // 下载 Oil Demand Analysis 的所有标签数据
  159. func downloadOilDemandAnalysis(ctx context.Context) error {
  160. // 下载 "Continent" 标签的数据
  161. if err := downloadOilDemandByTab(ctx, continentTabSelector, "2015", "Oil_Demand_Continent_2015.xlsx"); err != nil {
  162. return err
  163. }
  164. // 下载 "Region" 标签的数据
  165. if err := downloadOilDemandByTab(ctx, regionTabSelector, "2015", "Oil_Demand_Region_2015.xlsx"); err != nil {
  166. return err
  167. }
  168. // 下载 "Country" 标签的数据
  169. if err := downloadOilDemandByTab(ctx, countryTabSelector, "2015", "Oil_Demand_Country_2015.xlsx"); err != nil {
  170. return err
  171. }
  172. // 下载 "Product_Category" 标签的数据
  173. if err := downloadOilDemandByTab(ctx, productCategoryTabSelector, "2015", "Oil_Demand_Product_Category_2015.xlsx"); err != nil {
  174. return err
  175. }
  176. // 下载 "Product_Detail" 标签的数据
  177. if err := downloadOilDemandByTab(ctx, productDetailTabSelector, "2015", "Oil_Demand_Product_Detail_2015.xlsx"); err != nil {
  178. return err
  179. }
  180. // 下载 "Sector_Category" 标签的数据
  181. if err := downloadOilDemandByTab(ctx, sectorCategoryTabSelector, "2015", "Oil_Demand_Sector_Category_2015.xlsx"); err != nil {
  182. return err
  183. }
  184. // 下载 "Sector_Detail" 标签的数据
  185. if err := downloadOilDemandByTab(ctx, sectorDetailTabSelector, "2015", "Oil_Demand_Sector_Detail_2015.xlsx"); err != nil {
  186. return err
  187. }
  188. // 下载 "Scenario" 标签的数据
  189. if err := downloadOilDemandByTab(ctx, scenarioTabSelector, "2015", "Oil_Demand_Scenario_2015.xlsx"); err != nil {
  190. return err
  191. }
  192. return nil
  193. }
  194. // 函数用于处理不同标签的下载
  195. func downloadOilDemandByTab(ctx context.Context, tabSelector string, year string, fileName string) error {
  196. // 切换到 iframe 并在 iframe 内进行操作
  197. if err := chromedp.Run(ctx,
  198. chromedp.WaitVisible(oilDemandIframeSelector, chromedp.ByQuery), // 等待 iframe 可见
  199. chromedp.ActionFunc(func(ctx context.Context) error {
  200. // 点击指定的标签
  201. if err := chromedp.Click(tabSelector, chromedp.ByQuery).Do(ctx); err != nil {
  202. return fmt.Errorf("点击标签失败: %v", err)
  203. }
  204. // 等待页面加载完成
  205. if err := chromedp.Sleep(2 * time.Second).Do(ctx); err != nil {
  206. return fmt.Errorf("等待页面加载失败: %v", err)
  207. }
  208. // 设置时间范围
  209. if err := setQueryTime(ctx, year); err != nil {
  210. return fmt.Errorf("设置查询时间失败: %v", err)
  211. }
  212. // 点击下载按钮
  213. if err := clickDownload(ctx); err != nil {
  214. return fmt.Errorf("点击下载按钮失败: %v", err)
  215. }
  216. return nil
  217. }),
  218. ); err != nil {
  219. return fmt.Errorf("操作失败: %v", err)
  220. }
  221. // 下载完成后,重命名文件
  222. if err := waitAndRenameDownloadedFile(fileName); err != nil {
  223. return fmt.Errorf("重命名文件失败: %v", err)
  224. }
  225. return nil
  226. }
  227. // 等待下载文件并重命名
  228. func waitAndRenameDownloadedFile(newFileName string) error {
  229. // 等待一段时间以确保文件下载完成
  230. time.Sleep(5 * time.Second) // 可能需要根据实际情况调整
  231. // 查找下载目录中的文件
  232. files, err := filepath.Glob(filepath.Join(downloadDir, "*.xlsx"))
  233. if err != nil {
  234. return fmt.Errorf("查找文件时出错: %v", err)
  235. }
  236. // 重命名最新的文件
  237. for _, file := range files {
  238. if err := os.Rename(file, filepath.Join(downloadDir, newFileName)); err != nil {
  239. return fmt.Errorf("重命名文件时出错: %v", err)
  240. }
  241. // 打印重命名后的文件名
  242. fmt.Printf("文件重命名为: %s\n", newFileName)
  243. break // 只重命名第一个找到的文件
  244. }
  245. return nil
  246. }
  247. func main() {
  248. // 创建下载目录
  249. if err := os.MkdirAll(downloadDir, os.ModePerm); err != nil {
  250. fmt.Printf("创建下载目录时出错: %v\n", err)
  251. return
  252. }
  253. // 创建 chromedp 执行上下文
  254. options := []chromedp.ExecAllocatorOption{
  255. chromedp.Flag("headless", false),
  256. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  257. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  258. chromedp.Flag("download.default_directory", downloadDir),
  259. chromedp.Flag("download.prompt_for_download", false), // 不弹出下载对话框
  260. chromedp.Flag("safebrowsing.enabled", true), // 启用安全浏览
  261. }
  262. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  263. defer cancel()
  264. ctx, cancel := chromedp.NewContext(allocCtx)
  265. defer cancel()
  266. // 启动 Chrome 实例
  267. if err := chromedp.Run(ctx); err != nil {
  268. fmt.Printf("启动 Chrome 实例时出错: %v\n", err)
  269. return
  270. }
  271. // 登录操作
  272. if err := login(ctx); err != nil {
  273. fmt.Printf("登录错误: %v\n", err)
  274. return
  275. }
  276. // 下载数据
  277. if err := downloadData(ctx); err != nil {
  278. fmt.Printf("数据下载错误: %v\n", err)
  279. return
  280. }
  281. fmt.Println("数据下载完成")
  282. }
  283. func login(ctx context.Context) error {
  284. return chromedp.Run(ctx,
  285. chromedp.Navigate(rzdLoginPath),
  286. chromedp.SetValue(`input[id="Username"]`, utils.RZD_USERNAME, chromedp.ByQuery),
  287. chromedp.SetValue(`input[id="Password"]`, utils.RZD_PASSWORD, chromedp.ByQuery),
  288. chromedp.WaitEnabled(`//button[text()='Login']`, chromedp.BySearch),
  289. chromedp.Click(`//button[text()='Login']`, chromedp.BySearch),
  290. chromedp.Sleep(5*time.Second),
  291. // 等待并点击登录后页面的链接
  292. chromedp.WaitVisible(`a[href="/clients/"]`, chromedp.ByQuery), // 等待 Analytics Library 链接可见
  293. chromedp.Sleep(5*time.Second), // 等待页面加载完成
  294. )
  295. }