data_processor.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. package main
  2. import (
  3. "context"
  4. "encoding/json"
  5. "eta/eta_data_analysis/models"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/chromedp/cdproto/cdp"
  9. "github.com/xuri/excelize/v2"
  10. "io"
  11. "log"
  12. "os"
  13. "path/filepath"
  14. "time"
  15. "github.com/chromedp/chromedp"
  16. )
  17. // 定义选择器
  18. var (
  19. downloadDir = "D:\\download"
  20. defaultDir = "C:\\Users\\Guo Mengyuan\\Downloads"
  21. rzdLoginPath = "https://clients.rystadenergy.com/clients/"
  22. rzdBatchSize = 500
  23. clientSearchLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/search/"]`
  24. clientsCubeDashboardsLink = `div.d-none.d-lg-flex.flex-grow-1 a[href="/clients/cube-dashboards/"]`
  25. supplyRevisionAnalysisSelector = `div.ais-Hits li h5:contains("Supply Revision Analysis")`
  26. oilDemandAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Demand Analysis')]`
  27. oilSupplyAnalysisSelector = `div.ais-Hits li[contains(., 'Oil Supply Analysis')]`
  28. dateSlicerInputSelector = `div.visualContainer.unselectable.readMode.hideBorder.visualHeaderBelow.droppableElement.ui-droppable div.date-slicer-control input.date-slicer-input.enable-hover`
  29. downloadButtonSelector = `div.btn.btn-link.btn-sm.dashboard-action.dashboard-action--download-data`
  30. oilDemandIframeSelector = `div#WithPollingInFrame iframe` // 根据实际 iframe 的选择器更新
  31. tabSelectorBase = `h3.preTextWithEllipsis` // H3 标签中每个标签的基础选择器
  32. )
  33. // 函数用于点击下载按钮
  34. func clickDownload(ctx context.Context) error {
  35. return chromedp.Run(ctx, chromedp.Click(downloadButtonSelector, chromedp.ByQuery))
  36. }
  37. // 处理数据下载的步骤
  38. func downloadData(ctx context.Context) error {
  39. // Analytics Library
  40. if err := chromedp.Run(ctx,
  41. chromedp.Sleep(5*time.Second), // 考虑移除这一行,如果不必要的话
  42. chromedp.Navigate(rzdLoginPath),
  43. chromedp.WaitVisible(`div.d-none.d-lg-flex.flex-grow-1`, chromedp.ByQuery),
  44. chromedp.Click(clientSearchLink, chromedp.ByQuery),
  45. chromedp.WaitVisible(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, chromedp.ByQuery),
  46. chromedp.SetValue(`input[class="ais-SearchBox-input rounded border py-2 px-3 shadow-sm font-size-14 w-100"]`, "oil demand signals weekly report", chromedp.ByQuery),
  47. //chromedp.Click(`div.ais-InfiniteHits li a:has(img[src="/Static/img/icons/xls.png"])`, chromedp.ByQuery),
  48. ); err != nil {
  49. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  50. }
  51. xpath := `//div[@id='search-page-hits']//li//a[.//div//span[@class='align-middle' and text()='Data']]`
  52. var inputCount int
  53. var nodes []*cdp.Node // 使用 *cdp.Node
  54. if err := chromedp.Run(ctx,
  55. chromedp.ActionFunc(func(ctx context.Context) error {
  56. // 获取匹配的节点
  57. if err := chromedp.Nodes(xpath, &nodes, chromedp.BySearch).Do(ctx); err != nil {
  58. return fmt.Errorf("检查节点失败: %v", err)
  59. }
  60. // 获取节点数量
  61. inputCount = len(nodes)
  62. fmt.Printf("找到 %d 个匹配的元素\n", inputCount)
  63. if inputCount > 0 {
  64. // 点击第一个节点
  65. return chromedp.MouseClickNode(nodes[0]).Do(ctx) // 使用 []cdp.NodeID
  66. }
  67. return nil
  68. }),
  69. chromedp.Sleep(10*time.Second),
  70. ); err != nil {
  71. return fmt.Errorf("下载 Analytics Library 数据错误: %v", err)
  72. }
  73. // 解析文件移动到目标目录
  74. if err := waitAndRenameDownloadedFile("Oil_Demand_Signals_Weekly_Report_"+utils.GetCurrentYearMonth()+".xlsx", downloadDir); err != nil {
  75. return err
  76. }
  77. return nil
  78. }
  79. // 等待下载文件并重命名
  80. func waitAndRenameDownloadedFile(newFileName, targetDir string) error {
  81. // 等待一段时间以确保文件下载完成
  82. time.Sleep(60 * time.Second) // 可能需要根据实际情况调整
  83. // 查找下载目录中的文件
  84. files, err := filepath.Glob(filepath.Join(defaultDir, "*.xlsx"))
  85. if err != nil {
  86. return fmt.Errorf("查找文件时出错: %v", err)
  87. }
  88. // 如果没有找到文件,返回错误
  89. if len(files) == 0 {
  90. return fmt.Errorf("未找到任何下载的文件")
  91. }
  92. // 找到最新的文件
  93. var latestFile string
  94. var latestTime time.Time
  95. for _, file := range files {
  96. info, err := os.Stat(file)
  97. if err != nil {
  98. return fmt.Errorf("获取文件信息时出错: %v", err)
  99. }
  100. if info.ModTime().After(latestTime) {
  101. latestTime = info.ModTime()
  102. latestFile = file
  103. }
  104. }
  105. // 目标文件的完整路径
  106. targetFilePath := filepath.Join(targetDir, newFileName)
  107. // 重命名并移动到目标目录
  108. if latestFile != "" {
  109. if err := moveFile(latestFile, targetFilePath); err != nil {
  110. return fmt.Errorf("重命名文件时出错: %v", err)
  111. }
  112. // 打印重命名后的文件名
  113. fmt.Printf("文件重命名并移动到: %s\n", targetFilePath)
  114. }
  115. return nil
  116. }
  117. func moveFile(source, destination string) error {
  118. // 复制文件
  119. srcFile, err := os.Open(source)
  120. if err != nil {
  121. return fmt.Errorf("打开源文件时出错: %v", err)
  122. }
  123. defer srcFile.Close()
  124. dstFile, err := os.Create(destination)
  125. if err != nil {
  126. return fmt.Errorf("创建目标文件时出错: %v", err)
  127. }
  128. defer dstFile.Close()
  129. if _, err := io.Copy(dstFile, srcFile); err != nil {
  130. return fmt.Errorf("复制文件时出错: %v", err)
  131. }
  132. time.Sleep(60 * time.Second)
  133. // 删除源文件
  134. if err := os.Remove(source); err != nil {
  135. return fmt.Errorf("删除源文件时出错: %v", err)
  136. }
  137. return nil
  138. }
  139. // 解析网页数据,下载文件
  140. // func main() {
  141. func resolverNet() {
  142. // 创建下载目录
  143. if err := os.MkdirAll(downloadDir, os.ModePerm); err != nil {
  144. fmt.Printf("创建下载目录时出错: %v\n", err)
  145. return
  146. }
  147. // 创建 chromedp 执行上下文
  148. options := []chromedp.ExecAllocatorOption{
  149. chromedp.Flag("headless", false),
  150. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  151. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  152. // 设置了但并不生效,直接从默认下载路径读取过来
  153. //chromedp.Flag("download.default_directory", downloadDir),
  154. //chromedp.Flag("download.prompt_for_download", false), // 不弹出下载对话框
  155. chromedp.Flag("safebrowsing.enabled", true), // 启用安全浏览
  156. //chromedp.UserDataDir(filepath.Join(downloadDir, "user-data")),
  157. }
  158. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  159. defer cancel()
  160. ctx, cancel := chromedp.NewContext(allocCtx)
  161. defer cancel()
  162. // 启动 Chrome 实例
  163. if err := chromedp.Run(ctx); err != nil {
  164. fmt.Printf("启动 Chrome 实例时出错: %v\n", err)
  165. return
  166. }
  167. // 设置下载行为
  168. /*if err := setDownloadBehavior(ctx); err != nil {
  169. fmt.Printf("设置下载路径时出错: %v\n", err)
  170. return
  171. }*/
  172. // 登录操作
  173. if err := login(ctx); err != nil {
  174. fmt.Printf("登录错误: %v\n", err)
  175. return
  176. }
  177. // 下载数据
  178. if err := downloadData(ctx); err != nil {
  179. fmt.Printf("数据下载错误: %v\n", err)
  180. return
  181. }
  182. fmt.Println("数据下载完成")
  183. // 解析表格 读取数据
  184. fileResolver()
  185. }
  186. // 解析本地文件
  187. func fileResolver() {
  188. //func main() {
  189. var tableNameList = []string{
  190. "Oil_Demand_Signals_Weekly_Report",
  191. "Oil_Supply_Analysis",
  192. "Supply_Revision_Analysis",
  193. "Oil_Market_Cube_Upstream_Supply_Oil_Quality_Api",
  194. "Oil_Market_Cube_Upstream_Supply_Oil_Quality_Sulphur",
  195. "Oil_Market_Cube_Upstream_Supply_Capacity_Capacity",
  196. "Oil_Market_Cube_Upstream_Supply_Production",
  197. "Oil_Market_Cube_Upstream_Supply_Production_Wo_Seasonality",
  198. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Reference_Production",
  199. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Production",
  200. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Target_Cut",
  201. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Actual_Cut",
  202. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Compliance",
  203. "Oil_Market_Cube_Upstream_Supply_OPEC_Policy_Production_Subject_To_Cut",
  204. "Oil_Market_Cube_Products_Demand_Products_Demand_Mean",
  205. "Oil_Market_Cube_Products_Demand_Products_Demand+Sigma",
  206. "Oil_Market_Cube_Products_Demand_Products_Demand-Sigma",
  207. "Oil_Market_Cube_Balances_Total_Liquids_Balances",
  208. "Oil_Market_Cube_Geography_Latitude",
  209. "Oil_Market_Cube_Geography_Longitude",
  210. "Oil_Demand_Analysis_Product_Detail",
  211. "Oil_Demand_Analysis_Region",
  212. "Oil_Demand_Analysis_Scenario",
  213. "Oil_Demand_Analysis_Continent",
  214. "Oil_Demand_Analysis_Country",
  215. "Oil_Demand_Analysis_Product_Category",
  216. "Oil_Demand_Analysis_Sector_Category",
  217. "Oil_Demand_Analysis_Sector_Detail",
  218. }
  219. for _, tableName := range tableNameList {
  220. var fileName string
  221. // 解析表格
  222. fileName = tableName + "_" + utils.GetCurrentYearMonth() + ".xlsx"
  223. filePath := filepath.Join(downloadDir, fileName)
  224. // 打开 Excel 文件
  225. f, err := excelize.OpenFile(filePath)
  226. if err != nil {
  227. log.Fatalf("无法打开 Excel 文件: %v", err)
  228. }
  229. // 获取所有工作表
  230. sheetNames := f.GetSheetList()
  231. for _, sheetName := range sheetNames {
  232. fmt.Printf("读取工作表: %s\n", sheetName)
  233. // 获取工作表的最大行数
  234. maxRow, err := f.GetRows(sheetName) // 直接获取所有行数据
  235. if err != nil {
  236. log.Fatalf("获取工作表数据时出错: %v", err)
  237. continue
  238. }
  239. // 遍历行并打印内容
  240. indexData := []models.BaseFromRzdData{}
  241. for rowIndex, rowData := range maxRow {
  242. // 因为excel文件中的sheet表格不固定 对于 Supply_Revision_Analysis, Oil_Supply_Analysis 文件 手动调整sheet表格顺序
  243. if tableName == "Supply_Revision_Analysis" && rowIndex == 0 {
  244. if rowData[0] == "YearQuarter" && rowData[1] == "Revision" && rowData[2] == "CountryRevisionGroup" {
  245. sheetName = "Chart1"
  246. }
  247. if rowData[0] == "YearQuarter" && rowData[1] == "Current" && rowData[2] == "Previous" {
  248. sheetName = "Chart2"
  249. }
  250. if rowData[0] == "Year" && rowData[1] == "Revision" && rowData[2] == "CountryRevisionGroup" {
  251. sheetName = "Chart3"
  252. }
  253. if rowData[0] == "Year" && rowData[1] == "Current" && rowData[2] == "Previous" {
  254. sheetName = "Chart4"
  255. }
  256. if rowData[0] == "Previous" && rowData[1] == "Current" && rowData[2] == "YearMonth" {
  257. sheetName = "Chart5"
  258. }
  259. if rowData[0] == "YearMonth" && rowData[1] == "CountryRevisionGroup" && rowData[2] == "Revision" {
  260. sheetName = "Chart6"
  261. }
  262. } else if tableName == "Oil_Supply_Analysis" && rowIndex == 0 {
  263. if rowData[0] == "Viz Date" && rowData[1] == "OilAndGasCategory" && rowData[2] == "supply_kbbld" {
  264. sheetName = "Chart1"
  265. }
  266. if rowData[0] == "Viz Date" && rowData[1] == "supply_kbbld" && rowData[2] == "Region" {
  267. sheetName = "Chart2"
  268. }
  269. if rowData[0] == "Viz Date" && rowData[1] == "CapacityDetail" && rowData[2] == "Capacity_kbbld" {
  270. sheetName = "Chart3"
  271. }
  272. if rowData[0] == "Viz Date" && rowData[1] == "Oil Classification Group" && rowData[2] == "supply_kbbld" {
  273. sheetName = "Chart4"
  274. }
  275. }
  276. processor, err := GetProcessor(tableName, sheetName)
  277. if err != nil {
  278. continue
  279. }
  280. baseFromLyDataList, err := processor.Process(tableName, sheetName, rowIndex, rowData)
  281. if err != nil {
  282. log.Printf("processor.Process err: %v", err)
  283. return
  284. }
  285. indexData = append(indexData, baseFromLyDataList...)
  286. }
  287. // 新增数据源指标数据
  288. if len(indexData) > 0 {
  289. for i := 0; i < len(indexData); i += rzdBatchSize {
  290. // 计算当前批次的结束索引
  291. end := i + rzdBatchSize
  292. if end > len(indexData) {
  293. end = len(indexData)
  294. }
  295. // 获取当前批次的数据
  296. batchData := indexData[i:end]
  297. // 转换成json
  298. marshal, err := json.Marshal(batchData)
  299. if err != nil {
  300. log.Printf("json.Marshal err: %v", err)
  301. return
  302. }
  303. // 发送 HTTP POST 请求
  304. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_DATA, string(marshal), "application/json")
  305. if err != nil {
  306. log.Printf("postEdbLib err: %v", err)
  307. return
  308. }
  309. }
  310. }
  311. // 新增指标库数据
  312. edbDataList := []models.EdbDataRzd{}
  313. for _, index := range indexData {
  314. // 补充 判断是否存在于指标库
  315. paramsLib := make(map[string]interface{})
  316. paramsLib["IndexCode"] = index.IndexCode
  317. paramsLib["Source"] = utils.DATA_SOURCE_RZD
  318. postEdbLib, err := httpRequestFill(paramsLib, utils.GET_RZD_EDB_INFO_BY_INDEX_CODE)
  319. if err != nil {
  320. // 有错误就不继续执行
  321. log.Printf("postEdbLib err: %v", err)
  322. continue
  323. }
  324. var requestResponse models.RequestResponse[models.EdbInfo]
  325. err = json.Unmarshal(postEdbLib, &requestResponse)
  326. if err != nil {
  327. log.Printf("postEdbLib err: %v", err)
  328. continue
  329. }
  330. if requestResponse.Data.EdbInfoId != 0 {
  331. edbDataRzd := models.EdbDataRzd{
  332. CreateTime: utils.GetCurrentTime(),
  333. ModifyTime: utils.GetCurrentTime(),
  334. EdbInfoId: index.BaseFromRzdIndexId,
  335. EdbCode: index.IndexCode,
  336. DataTime: index.DataTime,
  337. Value: index.Value,
  338. DataTimestamp: uint64(time.Now().UnixNano() / int64(time.Millisecond)),
  339. }
  340. edbDataList = append(edbDataList, edbDataRzd)
  341. }
  342. }
  343. if len(edbDataList) > 0 {
  344. // 转换成json
  345. marshal, err := json.Marshal(edbDataList)
  346. if err != nil {
  347. log.Printf("postEdbLib err: %v", err)
  348. return
  349. }
  350. _, err = utils.HttpPostRequest(utils.EDB_LIB_URL+utils.ADD_BATCH_RZD_EDB_DATA, string(marshal), "application/json")
  351. if err != nil {
  352. // 有错误就不继续执行
  353. log.Printf("postEdbLib err: %v", err)
  354. return
  355. }
  356. }
  357. }
  358. }
  359. }
  360. func login(ctx context.Context) error {
  361. return chromedp.Run(ctx,
  362. chromedp.Navigate(rzdLoginPath),
  363. chromedp.SetValue(`input[id="Username"]`, utils.RZD_USERNAME, chromedp.ByQuery),
  364. chromedp.SetValue(`input[id="Password"]`, utils.RZD_PASSWORD, chromedp.ByQuery),
  365. chromedp.WaitEnabled(`//button[text()='Login']`, chromedp.BySearch),
  366. chromedp.Click(`//button[text()='Login']`, chromedp.BySearch),
  367. chromedp.Sleep(5*time.Second),
  368. // 等待并点击登录后页面的链接
  369. chromedp.WaitVisible(`a[href="/clients/"]`, chromedp.ByQuery), // 等待 Analytics Library 链接可见
  370. chromedp.Sleep(5*time.Second), // 等待页面加载完成
  371. )
  372. }
  373. func httpRequestFill(data interface{}, urlMethod string) (postEdbLib []byte, err error) {
  374. // 转换成json
  375. marshal, err := json.Marshal(data)
  376. if err != nil {
  377. return nil, err
  378. }
  379. // json 转 interface
  380. var result map[string]interface{}
  381. err = json.Unmarshal(marshal, &result)
  382. if err != nil {
  383. return nil, err
  384. }
  385. postEdbLib, err = utils.PostEdbLibRequest(result, urlMethod)
  386. if err != nil {
  387. // 有错误就不继续执行
  388. log.Printf("postEdbLib err: %v", err)
  389. return nil, err
  390. }
  391. return postEdbLib, nil
  392. }