commodity_liangyou.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. package liangyou
  2. import (
  3. "context"
  4. "encoding/json"
  5. models "eta/eta_crawler/models"
  6. "eta/eta_crawler/services/alarm_msg"
  7. "eta/eta_crawler/utils"
  8. "fmt"
  9. "github.com/beego/beego/v2/core/logs"
  10. "github.com/chromedp/cdproto/cdp"
  11. "os"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "time"
  16. "github.com/chromedp/chromedp"
  17. )
var (
	// lyLoginPath is the home page of the liangyou site (fao.com.cn, 粮油商务网).
	// It is both the login entry point and the base URL prepended to
	// relative report URLs when navigating (see processReport).
	lyLoginPath = "https://www.fao.com.cn/"
)
  21. func LyDataDeal(cont context.Context) (err error) {
  22. // 读取 JSON 文件
  23. configFile, err := os.ReadFile(utils.LY_JSON_PATH)
  24. if err != nil {
  25. fmt.Printf("读取配置文件错误: %v\n", err)
  26. return nil
  27. }
  28. // 定义通用的 map 结构体来解析 JSON
  29. var data map[string]map[string]map[string][]string
  30. // 解析 JSON 文件内容
  31. err = json.Unmarshal(configFile, &data)
  32. if err != nil {
  33. fmt.Printf("解析配置文件错误: %v\n", err)
  34. return nil
  35. }
  36. // 打印解析后的数据以验证
  37. fmt.Printf("%+v\n", data)
  38. // 创建 chromedp 执行上下文
  39. options := []chromedp.ExecAllocatorOption{
  40. chromedp.Flag("headless", false),
  41. chromedp.Flag("disable-blink-features", "AutomationControlled"),
  42. chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36`),
  43. }
  44. allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
  45. defer cancel()
  46. ctx, cancel := chromedp.NewContext(allocCtx)
  47. defer cancel()
  48. // 登录操作
  49. err = login(ctx)
  50. if err != nil {
  51. fmt.Printf("登录错误: %v\n", err)
  52. return nil
  53. }
  54. // 遍历配置并爬取数据
  55. for product, productData := range data {
  56. for category, categoryData := range productData {
  57. for report, keywords := range categoryData {
  58. fmt.Printf("正在获取数据: %s -> %s -> %s\n", product, category, report)
  59. err = fetchReportData(ctx, product, category, report, keywords)
  60. if err != nil {
  61. fmt.Printf("获取数据错误: %s -> %s -> %s: %v\n", product, category, report, err)
  62. // 您看文章的速度太快了,歇一会再看吧
  63. if strings.Contains(err.Error(), "您看文章的速度太快了,歇一会再看吧") {
  64. return
  65. }
  66. }
  67. }
  68. }
  69. }
  70. return nil
  71. }
// login opens the site home page and signs in with the configured
// credentials (utils.LY_USERNAME / utils.LY_PASSWORD).
// Fixed sleeps are used instead of explicit waits to let each page settle.
func login(ctx context.Context) error {
	return chromedp.Run(ctx,
		chromedp.Navigate(lyLoginPath),
		chromedp.Sleep(5*time.Second),
		// Open the login dialog.
		chromedp.Click(`a[id="btnLogin"]`, chromedp.ByQuery),
		chromedp.Sleep(2*time.Second),
		// Fill in the credentials.
		chromedp.SetValue(`input[id="userName"]`, utils.LY_USERNAME, chromedp.ByQuery),
		chromedp.SetValue(`input[id="pwd"]`, utils.LY_PASSWORD, chromedp.ByQuery),
		chromedp.Sleep(2*time.Second),
		// Submit the login form.
		chromedp.Click(`input[id="btn_Login"]`, chromedp.ByQuery),
		chromedp.Sleep(5*time.Second),
	)
}
// fetchReportData crawls one report series. It walks from the home page to
// the product page (fillProductPageURL), clicks through to the category
// listing, pages through the listing collecting every report link whose
// title contains report, then processes each report not yet recorded.
//
// Navigation/scrape failures are returned as errors. Per-report processing
// errors are logged and skipped, except the site's rate-limit message, which
// sends an SMS alarm and ends the run early (returning nil).
func fetchReportData(ctx context.Context, product, category, report string, keywords []string) error {
	// Start from the home page so navigation state is fresh.
	err := chromedp.Run(ctx,
		chromedp.Navigate(lyLoginPath),
		chromedp.Sleep(5*time.Second),
	)
	if err != nil {
		return err
	}
	// Resolve and open the product page.
	productPageURL, err := fillProductPageURL(ctx, product, category)
	if err != nil {
		return err
	}
	// From the product page, click the category link and capture the
	// resulting listing page URL.
	var categoryPageURL string
	err = chromedp.Run(ctx,
		chromedp.Navigate(productPageURL),
		chromedp.Sleep(5*time.Second),
		chromedp.Click(fmt.Sprintf(`//div[contains(@class, "newBox")]//a[contains(text(), '%s')]`, category), chromedp.BySearch),
		chromedp.Sleep(5*time.Second),
		chromedp.Location(&categoryPageURL),
	)
	if err != nil {
		return err
	}
	logs.Info("categoryPageURL: %s: %s: %s", product, category, categoryPageURL)
	//var allReportURLs []string
	// Accumulated across all listing pages: report URL -> publish info text.
	var allReportURLMap = make(map[string]string)
	for {
		var htmlContent string
		err = chromedp.Run(ctx,
			chromedp.Navigate(categoryPageURL),
			chromedp.Sleep(5*time.Second),
			chromedp.OuterHTML("html", &htmlContent),
		)
		if err != nil {
			return err
		}
		fmt.Printf("页面内容: %s\n", htmlContent)
		// Extract report URLs whose title contains the (partial) report name.
		reportURLMap := extractReportURLs(htmlContent, report)
		//allReportURLs = append(allReportURLs, reportURLs...)
		for key, value := range reportURLMap {
			allReportURLMap[key] = value
		}
		// Test environment: crawl only part of the data — re-enable for production.
		//break
		// Stop when the "next page" button is disabled (last page reached).
		var nextPageDisabled bool
		err = chromedp.Run(ctx,
			chromedp.Evaluate(`document.querySelector('div.my-page-next').classList.contains('my-page-forbid')`, &nextPageDisabled),
		)
		if err != nil {
			return err
		}
		if nextPageDisabled {
			break
		}
		// Advance to the next listing page.
		err = chromedp.Run(ctx,
			chromedp.Click(`div.my-page-next`, chromedp.ByQuery),
			chromedp.Sleep(5*time.Second),
			chromedp.Location(&categoryPageURL),
		)
		if err != nil {
			return err
		}
	}
	logs.Info("所有报告 URLs: %s: %s: %v", product, category, allReportURLMap)
	if len(allReportURLMap) == 0 {
		return fmt.Errorf("未找到报告 URL")
	}
	// Process each collected report.
	for key, value := range allReportURLMap {
		// Check whether this report URL was already processed; only reports
		// recorded within the last 7 days are considered again.
		lyIndexRecord, err := models.GetLyIndexRecordByUrl(key)
		if err != nil {
			// NOTE(review): lookup errors are silently skipped; the report
			// will simply be retried on the next run.
			continue
		}
		if lyIndexRecord != nil {
			toTime, err := utils.StringToTime(lyIndexRecord.DataTime + " 00:00:00")
			if err != nil {
				logs.Error("时间格式转换错误: %s: %s: %s: %s: %v", product, category, report, key, err)
				continue
			}
			// Skip records whose DataTime is older than 7 days.
			if time.Now().Sub(toTime) > 7*24*time.Hour {
				logs.Info("报告已处理: %s: %s: %s: %s", product, category, report, key)
				continue
			}
		}
		// Random sleep (RangeRand(20, 100) seconds) to avoid tripping the
		// site's "reading too fast" rate limit.
		rand := utils.RangeRand(20, 100)
		fmt.Println(report+";sleep:", strconv.Itoa(int(rand)))
		time.Sleep(time.Duration(rand) * time.Second)
		err = processReport(ctx, product, category, key, keywords)
		if err != nil {
			logs.Error("处理报告错误: %s: %s: %s: %s: %v", product, category, report, key, err)
			// Rate-limited by the site: alert via SMS and stop processing.
			if strings.Contains(err.Error(), "您看文章的速度太快了,歇一会再看吧") {
				alarm_msg.SendAlarmMsg(fmt.Sprintf("粮油商务网-爬取指标数据被限制,请稍后重试, ErrMsg: %s", err.Error()), 1)
				return nil
			}
			continue
		}
		// value is the publish info pulled from the listing page; normalize
		// it into the record's DataTime format.
		format, err := utils.ConvertTimeFormat(value)
		if err != nil {
			logs.Error("时间格式转换错误: %s, %s, %v: %v", product, category, value, err)
			continue
		}
		// Persist progress so this report is not re-read on later runs.
		recordId, err := models.AddLyIndexRecord(&models.BaseFromLyIndexRecord{
			CreateTime: utils.GetCurrentTime(),
			ModifyTime: utils.GetCurrentTime(),
			Product:    product,
			Category:   category,
			Url:        key,
			DataTime:   format,
		})
		if err != nil {
			logs.Error("维护指标数据读取进度错误: %s, %s, %v: %v", product, category, recordId, err)
			continue
		}
		logs.Info("维护指标数据读取进度成功: %s, %s, %v", product, category, recordId)
	}
	return nil
}
// fillProductPageURL locates the product link in the home page's hot-product
// list (dl.dl_hot), clicks it, and returns the URL of the page it lands on.
//
// category is accepted but not used here; navigation into the category
// happens afterwards in fetchReportData.
func fillProductPageURL(ctx context.Context, product string, category string) (string, error) {
	// XPath selecting every <a> under the hot-product <dl>.
	selector := `//dl[contains(@class, 'dl_hot')]//a`
	logs.Info("选择器表达式: %s", selector)
	var nodes []*cdp.Node
	var productPageURL string
	// Collect all candidate anchor nodes from the list.
	err := chromedp.Run(ctx,
		chromedp.WaitReady(selector, chromedp.BySearch),
		chromedp.Nodes(selector, &nodes, chromedp.BySearch),
	)
	if err != nil {
		return "", err
	}
	// Inspect each anchor's OuterHTML until one's text equals product.
	var targetURL string
	for _, node := range nodes {
		var outerHTML string
		// Fetch the anchor's full markup.
		err = chromedp.Run(ctx,
			chromedp.OuterHTML(node.FullXPath(), &outerHTML, chromedp.BySearch),
		)
		if err != nil {
			return "", err
		}
		logs.Info("Link OuterHTML: %s", outerHTML)
		// Pull the href attribute and link text out of the markup.
		href, linkText := extractHrefAndText(outerHTML)
		logs.Info("Link Text: %s, Href: %s", linkText, href)
		// Exact match on the product name.
		if linkText == product {
			// The href is kept as-is (possibly relative).
			/*if !strings.HasPrefix(href, "http") {
			href = lyLoginPath + href
			}*/
			targetURL = href
			break
		}
	}
	if targetURL == "" {
		return "", fmt.Errorf("未找到匹配的产品链接")
	}
	// Expand the hidden "more" list so the link is actually clickable.
	err = chromedp.Run(ctx,
		chromedp.Evaluate(`document.getElementById("moreSpeList").style.display = "block";`, nil),
	)
	if err != nil {
		return "", err
	}
	// Click the product link and capture the URL we end up on.
	clickSelector := fmt.Sprintf(`//a[@href='%s']`, targetURL)
	err = chromedp.Run(ctx,
		chromedp.WaitReady(clickSelector, chromedp.BySearch),
		chromedp.Click(clickSelector, chromedp.BySearch),
		chromedp.Sleep(5*time.Second),
		chromedp.Location(&productPageURL),
	)
	if err != nil {
		return "", err
	}
	// Return the post-click page URL.
	logs.Info("productPageURL: %s", productPageURL)
	return productPageURL, nil
}
  281. // extractHrefAndText 从 OuterHTML 提取 href 和文本内容的辅助函数
  282. func extractHrefAndText(outerHTML string) (string, string) {
  283. // 使用正则表达式或其他字符串处理方法提取 href 和文本内容
  284. // 这里只是一个简单的例子,具体实现需要根据 HTML 结构来调整
  285. hrefRegex := `href="([^"]+)"`
  286. textRegex := `>([^<]+)<`
  287. hrefMatches := regexp.MustCompile(hrefRegex).FindStringSubmatch(outerHTML)
  288. textMatches := regexp.MustCompile(textRegex).FindStringSubmatch(outerHTML)
  289. href := ""
  290. linkText := ""
  291. if len(hrefMatches) > 1 {
  292. href = hrefMatches[1]
  293. }
  294. if len(textMatches) > 1 {
  295. linkText = textMatches[1]
  296. }
  297. return href, linkText
  298. }
  299. // Extract report URLs from the HTML content
  300. func extractReportURLs(htmlContent, keyword string) map[string]string {
  301. //var reportURLs []string
  302. var reportURLMap = make(map[string]string)
  303. var reportURL string
  304. // Find all occurrences of the keyword and extract report URLs
  305. content := htmlContent
  306. for {
  307. startIdx := strings.Index(content, keyword)
  308. if startIdx == -1 {
  309. break
  310. }
  311. startIdx += len(keyword)
  312. // Extract the URL from the HTML content
  313. urlStartIdx := strings.LastIndex(content[:startIdx], `href="`) + len(`href="`)
  314. urlEndIdx := strings.Index(content[urlStartIdx:], `"`) + urlStartIdx
  315. if urlStartIdx > 0 && urlEndIdx > urlStartIdx {
  316. reportURL = content[urlStartIdx:urlEndIdx]
  317. //reportURLs = append(reportURLs, content[urlStartIdx:urlEndIdx])
  318. }
  319. content = content[startIdx:]
  320. // Now extract the content inside the first <div class="short_right">
  321. divStartIdx := strings.Index(content, `<div class="short_right">`)
  322. if divStartIdx != -1 {
  323. divStartIdx += len(`<div class="short_right">`)
  324. divEndIdx := strings.Index(content[divStartIdx:], `</div>`) + divStartIdx
  325. if divEndIdx > divStartIdx {
  326. shortRightContent := content[divStartIdx:divEndIdx]
  327. // Extract the first <div> content inside <div class="short_right">
  328. innerDivStartIdx := strings.Index(shortRightContent, `<div>`)
  329. if innerDivStartIdx != -1 {
  330. innerDivStartIdx += len(`<div>`)
  331. //innerDivEndIdx := strings.Index(shortRightContent[innerDivStartIdx:], `</div>`) + innerDivStartIdx
  332. innerDivContent := shortRightContent[innerDivStartIdx:]
  333. fmt.Println("Inner Div Content:", innerDivContent)
  334. reportURLMap[reportURL] = innerDivContent
  335. }
  336. }
  337. }
  338. }
  339. return reportURLMap
  340. }
  341. // Process the report data
  342. func processReport(ctx context.Context, product string, category string, reportURL string, keywords []string) error {
  343. // Navigate to the report page
  344. var reportContent string
  345. err := chromedp.Run(ctx,
  346. chromedp.Navigate(lyLoginPath+reportURL),
  347. chromedp.WaitVisible("body", chromedp.ByQuery), // 等待 body 元素可见,确保页面已加载
  348. chromedp.Sleep(5*time.Second), // 等待额外时间,以确保动态内容加载
  349. chromedp.OuterHTML("html", &reportContent), // 获取页面 HTML 内容
  350. )
  351. if err != nil {
  352. return err
  353. }
  354. // 如果文章内容包含 “您看文章的速度太快了,歇一会再看吧” 则返回指定错误
  355. if strings.Contains(reportContent, "您看文章的速度太快了,歇一会再看吧") {
  356. return fmt.Errorf("您看文章的速度太快了,歇一会再看吧")
  357. }
  358. var lyIndexDataList []models.BaseFromLyData
  359. // Process the data based on keywords
  360. for _, keyword := range keywords {
  361. partialKeyword := strings.Split(keyword, ":")
  362. // Select appropriate processor based on product and category
  363. processor, err := GetProcessor(product, category)
  364. if err != nil {
  365. return err
  366. }
  367. // 查询报告所属分类
  368. classify, err := models.GetLyClassifyByName(product)
  369. if err != nil {
  370. return err
  371. }
  372. // Process the report content using the selected processor
  373. baseFromLyDataList, err := processor.Process(ctx, product, reportContent, partialKeyword, classify.BaseFromLyClassifyId)
  374. if err != nil {
  375. return err
  376. }
  377. if len(baseFromLyDataList) > 0 {
  378. for _, baseFromLyData := range baseFromLyDataList {
  379. if baseFromLyData.DataTime != "" && baseFromLyData.IndexCode != "" && baseFromLyData.IndexCode != "lysww" {
  380. baseFromLyData.CreateTime = utils.GetCurrentTime()
  381. baseFromLyData.ModifyTime = utils.GetCurrentTime()
  382. lyIndexDataList = append(lyIndexDataList, baseFromLyData)
  383. }
  384. }
  385. }
  386. }
  387. // 新增指标数据
  388. if len(lyIndexDataList) > 0 {
  389. err = models.AddLyDataList(lyIndexDataList)
  390. if err != nil {
  391. return err
  392. }
  393. }
  394. return nil
  395. }
  396. func addLyIndex(classifyId int, indexCode string, indexName string, unit string, frequency string) (int, error) {
  397. // 添加指标
  398. index := &models.BaseFromLyIndex{
  399. CreateTime: utils.GetCurrentTime(),
  400. ModifyTime: utils.GetCurrentTime(),
  401. BaseFromLyClassifyId: classifyId,
  402. IndexCode: indexCode,
  403. IndexName: indexName,
  404. Frequency: frequency,
  405. Unit: unit,
  406. EdbExist: 0,
  407. }
  408. indexId, err := models.AddLyIndex(index)
  409. if err != nil {
  410. return 0, err
  411. }
  412. return int(indexId), nil
  413. }