common.go 27 KB


  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "context"
  6. "encoding/json"
  7. "eta/eta_data_analysis/utils"
  8. "fmt"
  9. "io"
  10. "log"
  11. "mime/multipart"
  12. "net/http"
  13. "net/url"
  14. "os"
  15. "path/filepath"
  16. "regexp"
  17. "strconv"
  18. "strings"
  19. "time"
  20. "github.com/PuerkitoBio/goquery"
  21. "github.com/chromedp/cdproto/network"
  22. "github.com/chromedp/chromedp"
  23. "golang.org/x/net/html/charset"
  24. "golang.org/x/text/encoding/simplifiedchinese"
  25. "golang.org/x/text/transform"
  26. )
// Endpoints of the CCF website used by this package.
const (
	CCFSearchPageUrl       = "https://www.ccf.com.cn/newscenter/simplesearch.php" // CCF search page URL
	CCFReportDetailBaseUrl = "https://www.ccf.com.cn"                             // base URL prepended to report detail hrefs
	// NOTE(review): presumably the chart-data endpoint; it is not referenced in
	// the part of the file visible here — confirm against its callers.
	CCFCHARTDATAURL = "https://www.ccf.com.cn/datacenter/index.php"
)
  32. // postEdbLib 调用指标接口
  33. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  34. postUrl := utils.EDB_LIB_URL + method
  35. postData, err := json.Marshal(param)
  36. if err != nil {
  37. return
  38. }
  39. result, err = httpPost(postUrl, string(postData), "application/json")
  40. if err != nil {
  41. return
  42. }
  43. return
  44. }
  45. // httpPost HTTP请求
  46. func httpPost(url, postData string, params ...string) ([]byte, error) {
  47. fmt.Println("httpPost Url:" + url)
  48. body := io.NopCloser(strings.NewReader(postData))
  49. client := &http.Client{}
  50. req, err := http.NewRequest("POST", url, body)
  51. if err != nil {
  52. return nil, err
  53. }
  54. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  55. if len(params) > 0 && params[0] != "" {
  56. contentType = params[0]
  57. }
  58. req.Header.Set("Content-Type", contentType)
  59. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  60. resp, err := client.Do(req)
  61. if err != nil {
  62. fmt.Println("client.Do err:" + err.Error())
  63. return nil, err
  64. }
  65. defer func() {
  66. _ = resp.Body.Close()
  67. }()
  68. b, err := io.ReadAll(resp.Body)
  69. if err != nil {
  70. fmt.Println("httpPost:" + string(b))
  71. }
  72. return b, err
  73. }
  74. // fetchPageHtml 获取网站HTML文本
  75. func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
  76. defer func() {
  77. if err != nil {
  78. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  79. utils.FileLog.Info(tips)
  80. fmt.Println(tips)
  81. }
  82. }()
  83. // 查询次数
  84. fetchNum++
  85. if baseUrl == "" {
  86. err = fmt.Errorf("CCF请求地址为空")
  87. return
  88. }
  89. // 获取Cookie
  90. strCookie, e := getCookie()
  91. if e != nil {
  92. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  93. return
  94. }
  95. if strCookie == "" && fetchNum < 2 {
  96. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  97. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  98. _, err = getCookieByChrome()
  99. if err != nil {
  100. return
  101. }
  102. return fetchPageHtml(baseUrl, fetchNum)
  103. }
  104. // 拉取网站内容
  105. cli := new(http.Client)
  106. req, e := http.NewRequest("GET", baseUrl, nil)
  107. if e != nil {
  108. err = fmt.Errorf("")
  109. return
  110. }
  111. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  112. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  113. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  114. req.Header.Set("Connection", "keep-alive")
  115. req.Header.Set("Cookie", strCookie)
  116. req.Header.Set("Host", "www.ccf.com.cn")
  117. req.Header.Set("Referer", baseUrl)
  118. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  119. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  120. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  121. req.Header.Set("Sec-Fetch-Dest", "empty")
  122. req.Header.Set("Sec-Fetch-Mode", "cors")
  123. req.Header.Set("Sec-Fetch-Site", "same-origin")
  124. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  125. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  126. resp, e := cli.Do(req)
  127. if e != nil {
  128. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  129. return
  130. }
  131. defer func() {
  132. _ = resp.Body.Close()
  133. }()
  134. // 读取响应的内容
  135. reader, e := gzip.NewReader(resp.Body)
  136. if e != nil {
  137. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  138. return
  139. }
  140. body, e := io.ReadAll(reader)
  141. if e != nil {
  142. err = fmt.Errorf("read body err: %s", e.Error())
  143. return
  144. }
  145. // 转换编码
  146. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  147. if e != nil {
  148. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  149. return
  150. }
  151. utf8Body, e := io.ReadAll(utf8Reader)
  152. if e != nil {
  153. err = fmt.Errorf("utf8 body err: %s", e.Error())
  154. return
  155. }
  156. respBody = utf8Body
  157. isLoginPage := checkIsLoginPage(string(respBody))
  158. fmt.Println("是否登录页:", isLoginPage)
  159. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  160. if isLoginPage && fetchNum < 2 {
  161. _, err = getCookieByChrome()
  162. if err != nil {
  163. return
  164. }
  165. return fetchPageHtml(baseUrl, fetchNum)
  166. }
  167. return
  168. }
  169. // postPageHtml 获取网站HTML文本
  170. func postPageHtml(baseUrl string, formData map[string]string, fetchNum int) (respBody []byte, err error) {
  171. defer func() {
  172. if err != nil {
  173. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  174. utils.FileLog.Info(tips)
  175. fmt.Println(tips)
  176. }
  177. }()
  178. // 查询次数
  179. fetchNum++
  180. if baseUrl == "" {
  181. err = fmt.Errorf("CCF请求地址为空")
  182. return
  183. }
  184. // 获取Cookie
  185. strCookie, e := getCookie()
  186. if e != nil {
  187. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  188. return
  189. }
  190. if strCookie == "" && fetchNum < 2 {
  191. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  192. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  193. _, err = getCookieByChrome()
  194. if err != nil {
  195. return
  196. }
  197. return postPageHtml(baseUrl, formData, fetchNum)
  198. }
  199. var b bytes.Buffer
  200. writer := multipart.NewWriter(&b)
  201. for k, v := range formData {
  202. _ = writer.WriteField(k, v)
  203. }
  204. writer.Close()
  205. req, e := http.NewRequest("POST", baseUrl, &b)
  206. if e != nil {
  207. err = e
  208. return
  209. }
  210. cli := new(http.Client)
  211. // 设置请求头
  212. req.Header.Add("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  213. req.Header.Add("Accept-Encoding", "gzip, deflate, br, zstd")
  214. req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
  215. req.Header.Add("Cache-Control", "max-age=0")
  216. req.Header.Add("Connection", "keep-alive")
  217. req.Header.Add("Content-Type", writer.FormDataContentType())
  218. req.Header.Add("Cookie", strCookie)
  219. req.Header.Add("Host", "www.ccf.com.cn")
  220. req.Header.Add("Origin", "https://www.ccf.com.cn")
  221. req.Header.Add("Referer", baseUrl)
  222. req.Header.Add("Sec-Ch-Ua", `"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"`)
  223. req.Header.Add("Sec-Ch-Ua-Mobile", " ?0")
  224. req.Header.Add("Sec-Ch-Ua-Platform", `"Windows"`)
  225. req.Header.Add("Sec-Fetch-Dest", "document")
  226. req.Header.Add("Sec-Fetch-Mode", "navigate")
  227. req.Header.Add("Sec-Fetch-Site", "same-origin")
  228. req.Header.Add("Sec-Fetch-User", "?1")
  229. req.Header.Add("Upgrade-Insecure-Requests", "1")
  230. req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
  231. resp, e := cli.Do(req)
  232. if e != nil {
  233. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  234. fmt.Println("HTTP client Do err:", e.Error())
  235. return
  236. }
  237. defer func() {
  238. _ = resp.Body.Close()
  239. }()
  240. reader, e := gzip.NewReader(resp.Body)
  241. if e != nil {
  242. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  243. return
  244. }
  245. body, e := io.ReadAll(reader)
  246. if e != nil {
  247. err = fmt.Errorf("读取body失败, err: %s", e.Error())
  248. return
  249. }
  250. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  251. if e != nil {
  252. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  253. return
  254. }
  255. utf8Body, e := io.ReadAll(utf8Reader)
  256. if e != nil {
  257. err = fmt.Errorf("utf8 body err: %s", e.Error())
  258. return
  259. }
  260. respBody = utf8Body
  261. isLoginPage := checkIsLoginPage(string(respBody))
  262. fmt.Println("是否登录页:", isLoginPage)
  263. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  264. if isLoginPage && fetchNum < 2 {
  265. _, err = getCookieByChrome()
  266. if err != nil {
  267. return
  268. }
  269. return postPageHtml(baseUrl, formData, fetchNum)
  270. }
  271. return
  272. }
// DataRule is one data-crawling rule as stored in the JSON rule file read by
// loadDataRule.
type DataRule struct {
	// Name identifies the rule; loadDataRule selects the rule whose Name
	// equals the requested key.
	Name      string `json:"Name"`
	Frequency string `json:"Frequency"`
	PageDir   string `json:"PageDir"`
	// Search holds the filters savePageHtml appends to the CCF search URL.
	// Empty values are left out of the URL.
	Search struct {
		ClassId      string `json:"ClassId"`
		SubClassId   string `json:"SubClassId"`
		ProductId    string `json:"ProductId"`
		SubProductId string `json:"SubProductId"`
		// SimpleTerms is percent-encoded via gb2312ToPercentEncoding before
		// it is put into the URL.
		SimpleTerms string `json:"SimpleTerms"`
	} `json:"Search"`
	// TableFetch is not used in the part of the file visible here.
	// NOTE(review): presumably keyword/unit pairs used to locate tables in a
	// report — confirm against its consumers.
	TableFetch []struct {
		Keyword string `json:"Keyword"`
		Unit    string `json:"Unit"`
	} `json:"TableFetch"`
	// EdbMatch lists the indicator-matching rules (see DataRuleEdbMatch).
	EdbMatch   []DataRuleEdbMatch `json:"EdbMatch"`
	StockTable struct {
		ClassifyId int `json:"ClassifyId"`
	} `json:"StockTable"`
}
// DataRuleEdbMatch is the indicator-matching part of a crawl rule.
//
// formatTableRow2ValidEdb compares Product, Market and MatchUnit with a table
// row and copies IndexCode, IndexName, ClassifyId, Frequency and Unit into the
// indicator it produces for every matching row.
type DataRuleEdbMatch struct {
	IndexCode  string `json:"IndexCode"`
	IndexName  string `json:"IndexName"`
	ClassifyId int    `json:"ClassifyId"`
	Frequency  string `json:"Frequency"`
	Product    string `json:"Product"`                      // matched against the row's product
	Market     string `json:"Market"`                       // matched against the row's market
	MatchUnit  string `json:"MatchUnit" description:"匹配单位"` // unit used for matching a row
	Unit       string `json:"Unit" description:"实际单位"`      // actual unit stored on the indicator
}
  305. // loadDataRule 从配置中读取爬取规则
  306. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  307. if utils.CCFDataRuleFile == "" {
  308. err = fmt.Errorf("rule文件不存在")
  309. return
  310. }
  311. b, e := os.ReadFile(utils.CCFDataRuleFile)
  312. if e != nil {
  313. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  314. return
  315. }
  316. rules := make([]*DataRule, 0)
  317. if e = json.Unmarshal(b, &rules); e != nil {
  318. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  319. return
  320. }
  321. for _, v := range rules {
  322. if v.Name != "" && v.Name == nameKey {
  323. fetchRule = v
  324. return
  325. }
  326. }
  327. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  328. return
  329. }
  330. // savePageHtml 拉取历史报告详情
  331. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  332. if nameKey == "" {
  333. return
  334. }
  335. defer func() {
  336. if err != nil {
  337. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  338. utils.FileLog.Info(tips)
  339. fmt.Println(tips)
  340. }
  341. }()
  342. fetchRule, e := loadDataRule(nameKey)
  343. if e != nil {
  344. err = fmt.Errorf("loadDataRule, err: %v", e)
  345. return
  346. }
  347. if saveDir == "" {
  348. saveDir = "static/ccf"
  349. }
  350. // 获取品种第一页
  351. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  352. if fetchRule.Search.ClassId != "" {
  353. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  354. }
  355. if fetchRule.Search.SubClassId != "" {
  356. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  357. }
  358. if fetchRule.Search.ProductId != "" {
  359. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  360. }
  361. if fetchRule.Search.SubProductId != "" {
  362. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  363. }
  364. if fetchRule.Search.SimpleTerms != "" {
  365. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  366. if e != nil {
  367. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  368. return
  369. }
  370. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  371. }
  372. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  373. // 首页报告链接
  374. firstHtml, e := fetchPageHtml(firstPage, 0)
  375. if e != nil {
  376. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  377. return
  378. }
  379. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  380. if e != nil {
  381. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  382. return
  383. }
  384. var historyHrefs []ReportHrefs
  385. historyHrefs = append(historyHrefs, firstHrefs...)
  386. ticker := time.NewTicker(5 * time.Second)
  387. defer ticker.Stop()
  388. // 历史报告
  389. if historyPage {
  390. endPage, e := analysisEndPage(firstHtml)
  391. if e != nil {
  392. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  393. return
  394. }
  395. if endPage > 1 {
  396. for i := 2; i <= endPage; i++ {
  397. <-ticker.C
  398. fmt.Printf("开始读取历史页%d\n", i)
  399. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  400. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  401. fmt.Println("pageUrl: ", pageUrl)
  402. pageContents, e := fetchPageHtml(pageUrl, 0)
  403. if e != nil {
  404. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  405. return
  406. }
  407. pageHrefs, e := analysisReportHrefs(pageContents, i)
  408. if e != nil {
  409. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  410. return
  411. }
  412. historyHrefs = append(historyHrefs, pageHrefs...)
  413. fmt.Printf("结束读取历史页%d\n", i)
  414. }
  415. }
  416. fmt.Println("endPage: ", endPage)
  417. }
  418. fmt.Println("historyHrefs len: ", len(historyHrefs))
  419. fmt.Println("historyHrefs: ", historyHrefs)
  420. // 拉取报告留档
  421. strDate := time.Now().Format("20060102")
  422. reportCount := 0
  423. for _, v := range historyHrefs {
  424. <-ticker.C
  425. if reportMax > 0 {
  426. reportCount += 1
  427. if reportCount > reportMax {
  428. break
  429. }
  430. }
  431. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  432. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
  433. if e != nil {
  434. utils.FileLog.Info("获取页面失败, err: %v", e)
  435. continue
  436. }
  437. dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
  438. if e = utils.MkDir(dateDir); e != nil {
  439. utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
  440. continue
  441. }
  442. outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
  443. if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
  444. utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
  445. continue
  446. }
  447. files = append(files, outputPath)
  448. }
  449. fmt.Println("拉取报告 end")
  450. return
  451. }
  452. // analysisEndPage 读取列表页最后一页页码
  453. func analysisEndPage(contents []byte) (endPage int, err error) {
  454. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  455. if e != nil {
  456. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  457. return
  458. }
  459. // 查找页码元素并遍历a标签
  460. sectionDigg := doc.Find(".digg")
  461. aElements := sectionDigg.Find("a")
  462. // 获取倒数第二个a标签中的页码
  463. totalAElements := aElements.Length()
  464. targetIndex := totalAElements - 2
  465. if targetIndex >= 0 && targetIndex < totalAElements {
  466. targetA := aElements.Eq(targetIndex)
  467. txt := targetA.Text()
  468. endPage, e = strconv.Atoi(txt)
  469. if e != nil {
  470. err = fmt.Errorf("页码文本有误, %s", txt)
  471. return
  472. }
  473. fmt.Println(endPage)
  474. return
  475. }
  476. endPage = 1
  477. return
  478. }
// ReportHrefs is one report link collected from a list page.
type ReportHrefs struct {
	Title string `description:"报告标题"`   // report title (text of the link)
	Href  string `description:"报告详情链接"` // link to the report detail page (the link's href attribute)
	Page  int    `description:"页码"`     // number of the list page the link was found on
}
  485. // analysisReportHrefs 解析列表页报告链接
  486. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  487. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  488. if e != nil {
  489. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  490. return
  491. }
  492. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  493. href, exists := s.Attr("href")
  494. if exists {
  495. title := s.Text()
  496. hrefs = append(hrefs, ReportHrefs{
  497. Title: title,
  498. Href: href,
  499. Page: page,
  500. })
  501. }
  502. })
  503. return
  504. }
  505. // extractReportPublishTime 提取报告发布时间
  506. func extractReportPublishTime(text string) (time.Time, error) {
  507. datePattern := `(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`
  508. re := regexp.MustCompile(datePattern)
  509. var strTime string
  510. match := re.FindStringSubmatch(text)
  511. if len(match) <= 0 {
  512. return time.Time{}, fmt.Errorf("没有读取出日期")
  513. }
  514. strTime = match[0]
  515. // 转为时间格式
  516. dateFormat := "2006年01月02日15:04"
  517. parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
  518. if e != nil {
  519. return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  520. }
  521. return parsedDate, nil
  522. }
  523. // calculateDataHalfVal 取出数据区间的折中值, 如"7-9天"返回结果为"8"
  524. func calculateDataHalfVal(duration string) (result string, err error) {
  525. re := regexp.MustCompile(`\d+`)
  526. matches := re.FindAllString(duration, -1)
  527. if len(matches) != 2 {
  528. err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
  529. return
  530. }
  531. a, e := strconv.Atoi(matches[0])
  532. if e != nil {
  533. err = e
  534. return
  535. }
  536. b, e := strconv.Atoi(matches[1])
  537. if e != nil {
  538. err = e
  539. return
  540. }
  541. average := float64(a+b) / 2.0
  542. // 格式化结果
  543. if average == float64(int(average)) {
  544. result = strconv.Itoa(int(average))
  545. } else {
  546. result = fmt.Sprintf("%.1f", average)
  547. }
  548. return
  549. }
  550. // gb2312ToPercentEncoding 中文字符转码
  551. func gb2312ToPercentEncoding(input string) (string, error) {
  552. // 创建GB18030编码转换器(兼容GB2312)
  553. encoder := simplifiedchinese.GB18030.NewEncoder()
  554. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  555. var buf bytes.Buffer
  556. writer := transform.NewWriter(&buf, encoder)
  557. _, err := writer.Write([]byte(input))
  558. if err != nil {
  559. return "", err
  560. }
  561. err = writer.Close()
  562. if err != nil {
  563. return "", err
  564. }
  565. // 将字节流转换为百分号编码
  566. percentEncoded := url.QueryEscape(buf.String())
  567. return percentEncoded, nil
  568. }
// AnalysisNoneMergeTablePars is the input of analysisNoneMergeTable, which
// parses a simple table that has no merged cells.
type AnalysisNoneMergeTablePars struct {
	// DocTable is the table selection; its "tbody" rows are parsed.
	DocTable *goquery.Selection
	// MarketCol describes the optional market column.
	MarketCol struct {
		HasCol   bool `description:"是否有市场列"` // whether the table has a market column
		ColIndex int  `description:"市场列"`    // index of the market column
	}
	// DateCol describes the date columns of the header row.
	DateCol struct {
		StartIndex  int       `description:"日期开始列"`  // first date column (inclusive)
		EndIndex    int       `description:"日期结束列"`  // last date column (inclusive)
		PublishTime time.Time `description:"报告发布时间"` // report publish time; supplies the year and the cross-year check
		// StrTimeFormat, when non-empty, is a fmt format that receives the
		// publish year and the header cell text to build the full date string.
		StrTimeFormat string   `description:"数据日期格式-需拼接日期列中的变量"`
		TimeFormat    []string `description:"标准日期格式, 可能存在多种分别进行遍历"` // time layouts tried in order
		SplitLast     bool     `description:"是否分隔日期: 如1.24-1.28"`     // take the last part of a split date, e.g. 1.24-1.28
		SplitFlag     string   `description:"分隔日期分隔符: 如-"`            // separator used for the split, e.g. "-"
	}
	// ValCol describes how value cells are post-processed.
	ValCol struct {
		SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"` // replace a range such as "8-10天" by its midpoint
	}
}
// TableRow is one data row read from a table.
type TableRow struct {
	Product string // text of the row's first cell
	Market  string // text of the market column, when the table has one
	// DateData maps a formatted column date (utils.FormatDate) to the cell
	// value in that column; empty values are not stored.
	DateData map[string]string
	// Unit is not set by analysisNoneMergeTable; formatTableRow2ValidEdb
	// compares it with the rule's MatchUnit.
	Unit string
}
  597. // analysisNoneMergeTable 解析无合并单元格的简单表格
  598. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  599. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  600. return
  601. }
  602. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  603. colDate := make(map[int]string)
  604. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  605. cells := s.Find("td")
  606. // 表头取出日期
  607. if i == 0 {
  608. cells.Each(func(ii int, ss *goquery.Selection) {
  609. cellTxt := strings.TrimSpace(ss.Text())
  610. //fmt.Println("cellTxt", cellTxt)
  611. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  612. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  613. //var strTimeFormat string
  614. completeTime := cellTxt
  615. // 是否需要拼接年份
  616. if params.DateCol.StrTimeFormat != "" {
  617. strDate := cellTxt
  618. // 是否取分隔日期的后一个日期
  619. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  620. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  621. if len(dateArr) > 1 {
  622. strDate = dateArr[len(dateArr)-1]
  623. }
  624. }
  625. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  626. }
  627. //fmt.Println("completeTime: ", completeTime)
  628. // 遍历多种可能的日期格式
  629. var colTime time.Time
  630. for _, f := range params.DateCol.TimeFormat {
  631. t, e := time.ParseInLocation(f, completeTime, time.Local)
  632. if e != nil {
  633. continue
  634. }
  635. colTime = t
  636. break
  637. }
  638. // 统一判断一次, 入参的日期格式可能不全
  639. if colTime.IsZero() {
  640. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  641. for _, f := range attemptDates {
  642. t, e := time.ParseInLocation(f, completeTime, time.Local)
  643. if e != nil {
  644. continue
  645. }
  646. colTime = t
  647. break
  648. }
  649. }
  650. // 判断报告是否跨年
  651. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  652. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  653. colTime = colTime.AddDate(-1, 0, 0)
  654. }
  655. if !colTime.IsZero() {
  656. colDate[ii] = colTime.Format(utils.FormatDate)
  657. }
  658. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  659. }
  660. })
  661. }
  662. // 取指标
  663. if i > 0 {
  664. row := TableRow{
  665. DateData: make(map[string]string),
  666. }
  667. cells.Each(func(ii int, ss *goquery.Selection) {
  668. cellTxt := filterInvalidVal(ss.Text())
  669. //fmt.Println("cellTxt", cellTxt)
  670. if ii == 0 {
  671. row.Product = cellTxt
  672. }
  673. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  674. row.Market = cellTxt
  675. }
  676. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  677. d, ok := colDate[ii]
  678. if !ok {
  679. return
  680. }
  681. // 是否取折中值
  682. if params.ValCol.SplitHalfVal {
  683. val, e := calculateDataHalfVal(cellTxt)
  684. if e != nil {
  685. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  686. return
  687. }
  688. cellTxt = val
  689. }
  690. if cellTxt != "" {
  691. row.DateData[d] = cellTxt
  692. }
  693. }
  694. })
  695. //fmt.Println(row)
  696. items = append(items, row)
  697. }
  698. })
  699. return
  700. }
  701. // formatTableRow2ValidEdb 表格行转换为有效指标
  702. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  703. indexes = make([]*HandleIndexData, 0)
  704. for _, m := range edbMatch {
  705. for _, v := range rows {
  706. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  707. var productOk, marketOk, unitOk bool
  708. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  709. productOk = true
  710. }
  711. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  712. marketOk = true
  713. }
  714. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  715. unitOk = true
  716. }
  717. if productOk && marketOk && unitOk {
  718. edb := new(HandleIndexData)
  719. edb.IndexCode = m.IndexCode
  720. edb.IndexName = m.IndexName
  721. edb.ClassifyId = m.ClassifyId
  722. edb.Frequency = m.Frequency
  723. edb.Unit = m.Unit
  724. edb.DateData = v.DateData
  725. edb.TerminalCode = utils.TerminalCode
  726. indexes = append(indexes, edb)
  727. }
  728. }
  729. }
  730. return
  731. }
  732. // listFiles 列出目录下所有文件名
  733. func listFiles(dirPath string) ([]string, error) {
  734. var files []string
  735. err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
  736. if err != nil {
  737. return err
  738. }
  739. if !info.IsDir() {
  740. files = append(files, info.Name())
  741. }
  742. return nil
  743. })
  744. if err != nil {
  745. return nil, err
  746. }
  747. return files, nil
  748. }
  749. // filterInvalidVal 过滤无效值
  750. func filterInvalidVal(cellTxt string) string {
  751. cellTxt = strings.TrimSpace(cellTxt)
  752. if cellTxt == "休市" || cellTxt == "/" || cellTxt == "-" || cellTxt == "—" {
  753. return ""
  754. }
  755. return cellTxt
  756. }
  757. // formatIntervalData 格式化区间值
  758. func formatIntervalData(cellTxt, flag string) string {
  759. cellTxt = filterInvalidVal(cellTxt)
  760. if flag == "" {
  761. flag = "-"
  762. }
  763. matches := strings.Split(cellTxt, flag)
  764. if len(matches) < 2 {
  765. return cellTxt
  766. }
  767. if len(matches) != 2 {
  768. return ""
  769. }
  770. // 转换不了直接返回空值
  771. a, e := strconv.ParseFloat(matches[0], 64)
  772. if e != nil {
  773. return ""
  774. }
  775. b, e := strconv.ParseFloat(matches[1], 64)
  776. if e != nil {
  777. return ""
  778. }
  779. average := (a + b) / 2
  780. return fmt.Sprint(average)
  781. }
  782. // getCookie
  783. // @Description: 获取cookie
  784. // @author: Roc
  785. // @datetime 2024-07-09 14:00:53
  786. // @return cookieStr string
  787. // @return err error
  788. func getCookie() (cookieStr string, err error) {
  789. // 读取Cookie
  790. if utils.CCFCookieFile == "" {
  791. err = fmt.Errorf("cookie文件未配置")
  792. return
  793. }
  794. cookieByte, e := os.ReadFile(utils.CCFCookieFile)
  795. if e != nil {
  796. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  797. return
  798. }
  799. cookieStr = strings.TrimSpace(string(cookieByte))
  800. //if cookieStr == "" {
  801. // err = fmt.Errorf("cookie为空")
  802. // return
  803. //}
  804. return
  805. }
  806. // getCookieByChrome
  807. // @Description: 获取cookie
  808. // @author: Roc
  809. // @datetime 2024-07-09 14:00:53
  810. // @return cookieStr string
  811. // @return err error
  812. func getCookieByChrome() (cookieStr string, err error) {
  813. // 读取Cookie
  814. if utils.CCFUseName == "" {
  815. err = fmt.Errorf("CCF账号未设置")
  816. return
  817. }
  818. if utils.CCFPassword == "" {
  819. err = fmt.Errorf("CCF密码未设置")
  820. return
  821. }
  822. opts := append(
  823. chromedp.DefaultExecAllocatorOptions[:],
  824. chromedp.Flag("headless", false),
  825. )
  826. allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
  827. defer cancel1()
  828. // 创建chrome实例
  829. ctx, cancel2 := chromedp.NewContext(
  830. allocCtx,
  831. chromedp.WithLogf(log.Printf),
  832. )
  833. defer cancel2()
  834. err = chromedp.Run(ctx,
  835. chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
  836. chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
  837. chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
  838. chromedp.Sleep(2*time.Second),
  839. chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
  840. chromedp.Sleep(5*time.Second),
  841. chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
  842. chromedp.Sleep(2*time.Second),
  843. chromedp.ActionFunc(func(ctx context.Context) error {
  844. cookies, err := network.GetCookies().Do(ctx)
  845. if err != nil {
  846. return err
  847. }
  848. //cookieJson, err := json.Marshal(cookies)
  849. //if err != nil {
  850. // return err
  851. //}
  852. //fmt.Println("cookieJson:", string(cookieJson))
  853. //utils.FileLog.Info("cookieJson:" + string(cookieJson))
  854. for _, v := range cookies {
  855. cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
  856. }
  857. fmt.Println("header cookie:", cookieStr)
  858. utils.FileLog.Info("header cookie:" + cookieStr)
  859. tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
  860. if tmpErr != nil {
  861. fmt.Println("创建cookie文件失败:", tmpErr.Error())
  862. return nil
  863. }
  864. if _, err := tmpFile.WriteString(cookieStr); err != nil {
  865. fmt.Println("写入cookie到文件失败:", err.Error())
  866. return nil
  867. }
  868. return nil
  869. }),
  870. )
  871. //if err != nil {
  872. // fmt.Println(err)
  873. //}
  874. return
  875. }
  876. // checkIsLoginPage
  877. // @Description: 校验是否是登录页
  878. // @author: Roc
  879. // @datetime 2024-07-09 16:34:17
  880. // @param bodyStr string
  881. // @return isLoginPage bool
  882. func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
  883. // 初始化goquery.Document
  884. doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
  885. if err != nil {
  886. log.Fatal(err)
  887. }
  888. // 查找name为LoginForm的表单
  889. doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
  890. // 如果找到了,打印信息表示这是登录页
  891. //fmt.Println("这是一个登录页面")
  892. isLoginPage = true
  893. return
  894. })
  895. // 如果没有找到,打印信息表示这不是登录页
  896. //if doc.Find("form[name=LoginForm]").Length() == 0 {
  897. // fmt.Println("这不是一个登录页面")
  898. //}
  899. return
  900. }