common.go 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "context"
  6. "encoding/json"
  7. "eta/eta_data_analysis/utils"
  8. "fmt"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/chromedp/cdproto/network"
  11. "github.com/chromedp/chromedp"
  12. "golang.org/x/net/html/charset"
  13. "golang.org/x/text/encoding/simplifiedchinese"
  14. "golang.org/x/text/transform"
  15. "io"
  16. "log"
  17. "net/http"
  18. "net/url"
  19. "os"
  20. "path/filepath"
  21. "regexp"
  22. "strconv"
  23. "strings"
  24. "time"
  25. )
const (
	CCFSearchPageUrl       = "https://www.ccf.com.cn/newscenter/simplesearch.php" // CCF search (report list) page URL
	CCFReportDetailBaseUrl = "https://www.ccf.com.cn"                             // base URL prepended to report detail links
	// CCFCHARTDATAURL is the CCF data-center endpoint. It is not referenced in
	// the code visible here; presumably used for chart data — TODO confirm.
	CCFCHARTDATAURL = "https://www.ccf.com.cn/datacenter/index.php"
)
  31. // postEdbLib 调用指标接口
  32. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  33. postUrl := utils.EDB_LIB_URL + method
  34. postData, err := json.Marshal(param)
  35. if err != nil {
  36. return
  37. }
  38. result, err = httpPost(postUrl, string(postData), "application/json")
  39. if err != nil {
  40. return
  41. }
  42. return
  43. }
  44. // httpPost HTTP请求
  45. func httpPost(url, postData string, params ...string) ([]byte, error) {
  46. fmt.Println("httpPost Url:" + url)
  47. body := io.NopCloser(strings.NewReader(postData))
  48. client := &http.Client{}
  49. req, err := http.NewRequest("POST", url, body)
  50. if err != nil {
  51. return nil, err
  52. }
  53. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  54. if len(params) > 0 && params[0] != "" {
  55. contentType = params[0]
  56. }
  57. req.Header.Set("Content-Type", contentType)
  58. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  59. resp, err := client.Do(req)
  60. if err != nil {
  61. fmt.Println("client.Do err:" + err.Error())
  62. return nil, err
  63. }
  64. defer func() {
  65. _ = resp.Body.Close()
  66. }()
  67. b, err := io.ReadAll(resp.Body)
  68. if err != nil {
  69. fmt.Println("httpPost:" + string(b))
  70. }
  71. return b, err
  72. }
  73. // fetchPageHtml 获取网站HTML文本
  74. func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
  75. defer func() {
  76. if err != nil {
  77. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  78. utils.FileLog.Info(tips)
  79. fmt.Println(tips)
  80. }
  81. }()
  82. // 查询次数
  83. fetchNum++
  84. if baseUrl == "" {
  85. err = fmt.Errorf("CCF请求地址为空")
  86. return
  87. }
  88. // 获取Cookie
  89. strCookie, e := getCookie()
  90. if e != nil {
  91. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  92. return
  93. }
  94. if strCookie == "" && fetchNum < 2 {
  95. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  96. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  97. _, err = getCookieByChrome()
  98. if err != nil {
  99. return
  100. }
  101. return fetchPageHtml(baseUrl, fetchNum)
  102. }
  103. // 拉取网站内容
  104. cli := new(http.Client)
  105. req, e := http.NewRequest("GET", baseUrl, nil)
  106. if e != nil {
  107. err = fmt.Errorf("")
  108. return
  109. }
  110. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  111. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  112. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  113. req.Header.Set("Connection", "keep-alive")
  114. req.Header.Set("Cookie", strCookie)
  115. req.Header.Set("Host", "www.ccf.com.cn")
  116. req.Header.Set("Referer", baseUrl)
  117. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  118. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  119. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  120. req.Header.Set("Sec-Fetch-Dest", "empty")
  121. req.Header.Set("Sec-Fetch-Mode", "cors")
  122. req.Header.Set("Sec-Fetch-Site", "same-origin")
  123. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  124. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  125. resp, e := cli.Do(req)
  126. if e != nil {
  127. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  128. return
  129. }
  130. defer func() {
  131. _ = resp.Body.Close()
  132. }()
  133. // 读取响应的内容
  134. reader, e := gzip.NewReader(resp.Body)
  135. if e != nil {
  136. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  137. return
  138. }
  139. body, e := io.ReadAll(reader)
  140. if e != nil {
  141. err = fmt.Errorf("read body err: %s", e.Error())
  142. return
  143. }
  144. // 转换编码
  145. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  146. if e != nil {
  147. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  148. return
  149. }
  150. utf8Body, e := io.ReadAll(utf8Reader)
  151. if e != nil {
  152. err = fmt.Errorf("utf8 body err: %s", e.Error())
  153. return
  154. }
  155. respBody = utf8Body
  156. isLoginPage := checkIsLoginPage(string(respBody))
  157. fmt.Println("是否登录页:", isLoginPage)
  158. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  159. if isLoginPage && fetchNum < 2 {
  160. _, err = getCookieByChrome()
  161. if err != nil {
  162. return
  163. }
  164. return fetchPageHtml(baseUrl, fetchNum)
  165. }
  166. return
  167. }
// DataRule is one scraping rule decoded from the JSON rule file.
type DataRule struct {
	Name      string `json:"Name"`      // rule key; loadDataRule returns the rule whose Name equals the requested key
	Frequency string `json:"Frequency"` // presumably the data frequency — not read in the code visible here
	PageDir   string `json:"PageDir"`   // presumably a page directory — not read in the code visible here
	// Search holds the query parameters that savePageHtml appends to the CCF
	// search URL; empty values are left out of the URL.
	Search struct {
		ClassId      string `json:"ClassId"`      // sent as ClassID
		SubClassId   string `json:"SubClassId"`   // sent as SubClassID
		ProductId    string `json:"ProductId"`    // sent as ProductID
		SubProductId string `json:"SubProductId"` // also sent as ProductID by savePageHtml
		SimpleTerms  string `json:"SimpleTerms"`  // search terms; percent-encoded and sent as simpleterms
	} `json:"Search"`
	// TableFetch is not read in the code visible here; presumably it lists
	// which tables to extract from a report — TODO confirm.
	TableFetch []struct {
		Keyword string `json:"Keyword"`
		Unit    string `json:"Unit"`
	} `json:"TableFetch"`
	EdbMatch []DataRuleEdbMatch `json:"EdbMatch"` // indicator matching rules (see DataRuleEdbMatch)
	// StockTable is not read in the code visible here.
	StockTable struct {
		ClassifyId int `json:"ClassifyId"`
	} `json:"StockTable"`
}
// DataRuleEdbMatch is the indicator-matching part of a scraping rule: it
// describes which table row feeds which indicator.
//
// formatTableRow2ValidEdb compares Product, Market and MatchUnit with a table
// row and copies the remaining fields onto the resulting indicator.
type DataRuleEdbMatch struct {
	IndexCode  string `json:"IndexCode"`                       // copied to the indicator on a match
	IndexName  string `json:"IndexName"`                       // copied to the indicator on a match
	ClassifyId int    `json:"ClassifyId"`                      // copied to the indicator on a match
	Frequency  string `json:"Frequency"`                       // copied to the indicator on a match
	Product    string `json:"Product"`                         // matched against the row's product (substring; empty matches only empty)
	Market     string `json:"Market"`                          // matched against the row's market (substring; empty matches only empty)
	MatchUnit  string `json:"MatchUnit" description:"匹配单位"` // unit used for matching against the row's unit
	Unit       string `json:"Unit" description:"实际单位"`      // actual unit stored on the indicator
}
  200. // loadDataRule 从配置中读取爬取规则
  201. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  202. if utils.CCFDataRuleFile == "" {
  203. err = fmt.Errorf("rule文件不存在")
  204. return
  205. }
  206. b, e := os.ReadFile(utils.CCFDataRuleFile)
  207. if e != nil {
  208. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  209. return
  210. }
  211. rules := make([]*DataRule, 0)
  212. if e = json.Unmarshal(b, &rules); e != nil {
  213. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  214. return
  215. }
  216. for _, v := range rules {
  217. if v.Name != "" && v.Name == nameKey {
  218. fetchRule = v
  219. return
  220. }
  221. }
  222. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  223. return
  224. }
  225. // savePageHtml 拉取历史报告详情
  226. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  227. if nameKey == "" {
  228. return
  229. }
  230. defer func() {
  231. if err != nil {
  232. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  233. utils.FileLog.Info(tips)
  234. fmt.Println(tips)
  235. }
  236. }()
  237. fetchRule, e := loadDataRule(nameKey)
  238. if e != nil {
  239. err = fmt.Errorf("loadDataRule, err: %v", e)
  240. return
  241. }
  242. if saveDir == "" {
  243. saveDir = "static/ccf"
  244. }
  245. // 获取品种第一页
  246. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  247. if fetchRule.Search.ClassId != "" {
  248. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  249. }
  250. if fetchRule.Search.SubClassId != "" {
  251. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  252. }
  253. if fetchRule.Search.ProductId != "" {
  254. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  255. }
  256. if fetchRule.Search.SubProductId != "" {
  257. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  258. }
  259. if fetchRule.Search.SimpleTerms != "" {
  260. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  261. if e != nil {
  262. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  263. return
  264. }
  265. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  266. }
  267. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  268. // 首页报告链接
  269. firstHtml, e := fetchPageHtml(firstPage, 0)
  270. if e != nil {
  271. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  272. return
  273. }
  274. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  275. if e != nil {
  276. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  277. return
  278. }
  279. var historyHrefs []ReportHrefs
  280. historyHrefs = append(historyHrefs, firstHrefs...)
  281. ticker := time.NewTicker(5 * time.Second)
  282. defer ticker.Stop()
  283. // 历史报告
  284. if historyPage {
  285. endPage, e := analysisEndPage(firstHtml)
  286. if e != nil {
  287. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  288. return
  289. }
  290. if endPage > 1 {
  291. for i := 2; i <= endPage; i++ {
  292. <-ticker.C
  293. fmt.Printf("开始读取历史页%d\n", i)
  294. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  295. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  296. fmt.Println("pageUrl: ", pageUrl)
  297. pageContents, e := fetchPageHtml(pageUrl, 0)
  298. if e != nil {
  299. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  300. return
  301. }
  302. pageHrefs, e := analysisReportHrefs(pageContents, i)
  303. if e != nil {
  304. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  305. return
  306. }
  307. historyHrefs = append(historyHrefs, pageHrefs...)
  308. fmt.Printf("结束读取历史页%d\n", i)
  309. }
  310. }
  311. fmt.Println("endPage: ", endPage)
  312. }
  313. fmt.Println("historyHrefs len: ", len(historyHrefs))
  314. fmt.Println("historyHrefs: ", historyHrefs)
  315. // 拉取报告留档
  316. strDate := time.Now().Format("20060102")
  317. reportCount := 0
  318. for _, v := range historyHrefs {
  319. <-ticker.C
  320. if reportMax > 0 {
  321. reportCount += 1
  322. if reportCount > reportMax {
  323. break
  324. }
  325. }
  326. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  327. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
  328. if e != nil {
  329. utils.FileLog.Info("获取页面失败, err: %v", e)
  330. continue
  331. }
  332. dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
  333. if e = utils.MkDir(dateDir); e != nil {
  334. utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
  335. continue
  336. }
  337. outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
  338. if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
  339. utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
  340. continue
  341. }
  342. files = append(files, outputPath)
  343. }
  344. fmt.Println("拉取报告 end")
  345. return
  346. }
  347. // analysisEndPage 读取列表页最后一页页码
  348. func analysisEndPage(contents []byte) (endPage int, err error) {
  349. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  350. if e != nil {
  351. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  352. return
  353. }
  354. // 查找页码元素并遍历a标签
  355. sectionDigg := doc.Find(".digg")
  356. aElements := sectionDigg.Find("a")
  357. // 获取倒数第二个a标签中的页码
  358. totalAElements := aElements.Length()
  359. targetIndex := totalAElements - 2
  360. if targetIndex >= 0 && targetIndex < totalAElements {
  361. targetA := aElements.Eq(targetIndex)
  362. txt := targetA.Text()
  363. endPage, e = strconv.Atoi(txt)
  364. if e != nil {
  365. err = fmt.Errorf("页码文本有误, %s", txt)
  366. return
  367. }
  368. fmt.Println(endPage)
  369. return
  370. }
  371. endPage = 1
  372. return
  373. }
// ReportHrefs is one report link collected from a list page.
type ReportHrefs struct {
	Title string `description:"报告标题"`     // report title: the text of the link
	Href  string `description:"报告详情链接"` // link to the report detail page, as found in the href attribute
	Page  int    `description:"页码"`         // number of the list page the link was found on
}
  380. // analysisReportHrefs 解析列表页报告链接
  381. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  382. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  383. if e != nil {
  384. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  385. return
  386. }
  387. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  388. href, exists := s.Attr("href")
  389. if exists {
  390. title := s.Text()
  391. hrefs = append(hrefs, ReportHrefs{
  392. Title: title,
  393. Href: href,
  394. Page: page,
  395. })
  396. }
  397. })
  398. return
  399. }
  400. // extractReportPublishTime 提取报告发布时间
  401. func extractReportPublishTime(text string) (time.Time, error) {
  402. datePattern := `(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`
  403. re := regexp.MustCompile(datePattern)
  404. var strTime string
  405. match := re.FindStringSubmatch(text)
  406. if len(match) <= 0 {
  407. return time.Time{}, fmt.Errorf("没有读取出日期")
  408. }
  409. strTime = match[0]
  410. // 转为时间格式
  411. dateFormat := "2006年01月02日15:04"
  412. parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
  413. if e != nil {
  414. return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  415. }
  416. return parsedDate, nil
  417. }
  418. // calculateDataHalfVal 取出数据区间的折中值, 如"7-9天"返回结果为"8"
  419. func calculateDataHalfVal(duration string) (result string, err error) {
  420. re := regexp.MustCompile(`\d+`)
  421. matches := re.FindAllString(duration, -1)
  422. if len(matches) != 2 {
  423. err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
  424. return
  425. }
  426. a, e := strconv.Atoi(matches[0])
  427. if e != nil {
  428. err = e
  429. return
  430. }
  431. b, e := strconv.Atoi(matches[1])
  432. if e != nil {
  433. err = e
  434. return
  435. }
  436. average := float64(a+b) / 2.0
  437. // 格式化结果
  438. if average == float64(int(average)) {
  439. result = strconv.Itoa(int(average))
  440. } else {
  441. result = fmt.Sprintf("%.1f", average)
  442. }
  443. return
  444. }
  445. // gb2312ToPercentEncoding 中文字符转码
  446. func gb2312ToPercentEncoding(input string) (string, error) {
  447. // 创建GB18030编码转换器(兼容GB2312)
  448. encoder := simplifiedchinese.GB18030.NewEncoder()
  449. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  450. var buf bytes.Buffer
  451. writer := transform.NewWriter(&buf, encoder)
  452. _, err := writer.Write([]byte(input))
  453. if err != nil {
  454. return "", err
  455. }
  456. err = writer.Close()
  457. if err != nil {
  458. return "", err
  459. }
  460. // 将字节流转换为百分号编码
  461. percentEncoded := url.QueryEscape(buf.String())
  462. return percentEncoded, nil
  463. }
// AnalysisNoneMergeTablePars holds the input of analysisNoneMergeTable, which
// parses a simple table without merged cells.
type AnalysisNoneMergeTablePars struct {
	DocTable *goquery.Selection // selection of the table to parse
	// MarketCol says whether, and in which column, each row names a market.
	MarketCol struct {
		HasCol   bool `description:"是否有市场列"` // whether the table has a market column
		ColIndex int  `description:"市场列"`       // index of the market column
	}
	// DateCol describes the columns whose header cell is a date.
	DateCol struct {
		StartIndex  int       `description:"日期开始列"`   // first date column (inclusive)
		EndIndex    int       `description:"日期结束列"`   // last date column (inclusive)
		PublishTime time.Time `description:"报告发布时间"` // report publish time; supplies the year and the cross-year check
		//PublishYear int `description:"report publish year"`
		StrTimeFormat string   `description:"数据日期格式-需拼接日期列中的变量"`       // fmt format applied to (publish year, header text) when non-empty
		TimeFormat    []string `description:"标准日期格式, 可能存在多种分别进行遍历"` // time layouts tried in order on the completed date text
		SplitLast     bool     `description:"是否分隔日期: 如1.24-1.28"`              // take the last part of a split header such as 1.24-1.28
		SplitFlag     string   `description:"分隔日期分隔符: 如-"`                    // separator used for that split, e.g. "-"
	}
	// ValCol controls how the value cells are interpreted.
	ValCol struct {
		SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"` // replace a range such as "8-10天" by its midpoint
	}
}
// TableRow is one data row read from a table.
type TableRow struct {
	Product  string            // text of the row's first cell
	Market   string            // text of the market column, when the table has one
	DateData map[string]string // cell text keyed by the formatted date of the cell's column
	Unit     string            // unit of the row; not filled in by analysisNoneMergeTable
}
  492. // analysisNoneMergeTable 解析无合并单元格的简单表格
  493. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  494. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  495. return
  496. }
  497. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  498. colDate := make(map[int]string)
  499. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  500. cells := s.Find("td")
  501. // 表头取出日期
  502. if i == 0 {
  503. cells.Each(func(ii int, ss *goquery.Selection) {
  504. cellTxt := strings.TrimSpace(ss.Text())
  505. //fmt.Println("cellTxt", cellTxt)
  506. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  507. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  508. //var strTimeFormat string
  509. completeTime := cellTxt
  510. // 是否需要拼接年份
  511. if params.DateCol.StrTimeFormat != "" {
  512. strDate := cellTxt
  513. // 是否取分隔日期的后一个日期
  514. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  515. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  516. if len(dateArr) > 1 {
  517. strDate = dateArr[len(dateArr)-1]
  518. }
  519. }
  520. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  521. }
  522. //fmt.Println("completeTime: ", completeTime)
  523. // 遍历多种可能的日期格式
  524. var colTime time.Time
  525. for _, f := range params.DateCol.TimeFormat {
  526. t, e := time.ParseInLocation(f, completeTime, time.Local)
  527. if e != nil {
  528. continue
  529. }
  530. colTime = t
  531. break
  532. }
  533. // 统一判断一次, 入参的日期格式可能不全
  534. if colTime.IsZero() {
  535. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  536. for _, f := range attemptDates {
  537. t, e := time.ParseInLocation(f, completeTime, time.Local)
  538. if e != nil {
  539. continue
  540. }
  541. colTime = t
  542. break
  543. }
  544. }
  545. // 判断报告是否跨年
  546. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  547. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  548. colTime = colTime.AddDate(-1, 0, 0)
  549. }
  550. if !colTime.IsZero() {
  551. colDate[ii] = colTime.Format(utils.FormatDate)
  552. }
  553. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  554. }
  555. })
  556. }
  557. // 取指标
  558. if i > 0 {
  559. row := TableRow{
  560. DateData: make(map[string]string),
  561. }
  562. cells.Each(func(ii int, ss *goquery.Selection) {
  563. cellTxt := filterInvalidVal(ss.Text())
  564. //fmt.Println("cellTxt", cellTxt)
  565. if ii == 0 {
  566. row.Product = cellTxt
  567. }
  568. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  569. row.Market = cellTxt
  570. }
  571. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  572. d, ok := colDate[ii]
  573. if !ok {
  574. return
  575. }
  576. // 是否取折中值
  577. if params.ValCol.SplitHalfVal {
  578. val, e := calculateDataHalfVal(cellTxt)
  579. if e != nil {
  580. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  581. return
  582. }
  583. cellTxt = val
  584. }
  585. if cellTxt != "" {
  586. row.DateData[d] = cellTxt
  587. }
  588. }
  589. })
  590. //fmt.Println(row)
  591. items = append(items, row)
  592. }
  593. })
  594. return
  595. }
  596. // formatTableRow2ValidEdb 表格行转换为有效指标
  597. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  598. indexes = make([]*HandleIndexData, 0)
  599. for _, m := range edbMatch {
  600. for _, v := range rows {
  601. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  602. var productOk, marketOk, unitOk bool
  603. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  604. productOk = true
  605. }
  606. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  607. marketOk = true
  608. }
  609. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  610. unitOk = true
  611. }
  612. if productOk && marketOk && unitOk {
  613. edb := new(HandleIndexData)
  614. edb.IndexCode = m.IndexCode
  615. edb.IndexName = m.IndexName
  616. edb.ClassifyId = m.ClassifyId
  617. edb.Frequency = m.Frequency
  618. edb.Unit = m.Unit
  619. edb.DateData = v.DateData
  620. edb.TerminalCode = utils.TerminalCode
  621. indexes = append(indexes, edb)
  622. }
  623. }
  624. }
  625. return
  626. }
  627. // listFiles 列出目录下所有文件名
  628. func listFiles(dirPath string) ([]string, error) {
  629. var files []string
  630. err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
  631. if err != nil {
  632. return err
  633. }
  634. if !info.IsDir() {
  635. files = append(files, info.Name())
  636. }
  637. return nil
  638. })
  639. if err != nil {
  640. return nil, err
  641. }
  642. return files, nil
  643. }
  644. // filterInvalidVal 过滤无效值
  645. func filterInvalidVal(cellTxt string) string {
  646. cellTxt = strings.TrimSpace(cellTxt)
  647. if cellTxt == "休市" || cellTxt == "/" || cellTxt == "-" || cellTxt == "—" {
  648. return ""
  649. }
  650. return cellTxt
  651. }
  652. // formatIntervalData 格式化区间值
  653. func formatIntervalData(cellTxt, flag string) string {
  654. cellTxt = filterInvalidVal(cellTxt)
  655. if flag == "" {
  656. flag = "-"
  657. }
  658. matches := strings.Split(cellTxt, flag)
  659. if len(matches) < 2 {
  660. return cellTxt
  661. }
  662. if len(matches) != 2 {
  663. return ""
  664. }
  665. // 转换不了直接返回空值
  666. a, e := strconv.ParseFloat(matches[0], 64)
  667. if e != nil {
  668. return ""
  669. }
  670. b, e := strconv.ParseFloat(matches[1], 64)
  671. if e != nil {
  672. return ""
  673. }
  674. average := (a + b) / 2
  675. return fmt.Sprint(average)
  676. }
  677. // getCookie
  678. // @Description: 获取cookie
  679. // @author: Roc
  680. // @datetime 2024-07-09 14:00:53
  681. // @return cookieStr string
  682. // @return err error
  683. func getCookie() (cookieStr string, err error) {
  684. // 读取Cookie
  685. if utils.CCFCookieFile == "" {
  686. err = fmt.Errorf("cookie文件未配置")
  687. return
  688. }
  689. cookieByte, e := os.ReadFile(utils.CCFCookieFile)
  690. if e != nil {
  691. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  692. return
  693. }
  694. cookieStr = strings.TrimSpace(string(cookieByte))
  695. //if cookieStr == "" {
  696. // err = fmt.Errorf("cookie为空")
  697. // return
  698. //}
  699. return
  700. }
  701. // getCookieByChrome
  702. // @Description: 获取cookie
  703. // @author: Roc
  704. // @datetime 2024-07-09 14:00:53
  705. // @return cookieStr string
  706. // @return err error
  707. func getCookieByChrome() (cookieStr string, err error) {
  708. // 读取Cookie
  709. if utils.CCFUseName == "" {
  710. err = fmt.Errorf("CCF账号未设置")
  711. return
  712. }
  713. if utils.CCFPassword == "" {
  714. err = fmt.Errorf("CCF密码未设置")
  715. return
  716. }
  717. opts := append(
  718. chromedp.DefaultExecAllocatorOptions[:],
  719. chromedp.Flag("headless", false),
  720. )
  721. allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
  722. defer cancel1()
  723. // 创建chrome实例
  724. ctx, cancel2 := chromedp.NewContext(
  725. allocCtx,
  726. chromedp.WithLogf(log.Printf),
  727. )
  728. defer cancel2()
  729. err = chromedp.Run(ctx,
  730. chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
  731. chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
  732. chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
  733. chromedp.Sleep(2*time.Second),
  734. chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
  735. chromedp.Sleep(5*time.Second),
  736. chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
  737. chromedp.Sleep(2*time.Second),
  738. chromedp.ActionFunc(func(ctx context.Context) error {
  739. cookies, err := network.GetCookies().Do(ctx)
  740. if err != nil {
  741. return err
  742. }
  743. //cookieJson, err := json.Marshal(cookies)
  744. //if err != nil {
  745. // return err
  746. //}
  747. //fmt.Println("cookieJson:", string(cookieJson))
  748. //utils.FileLog.Info("cookieJson:" + string(cookieJson))
  749. for _, v := range cookies {
  750. cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
  751. }
  752. fmt.Println("header cookie:", cookieStr)
  753. utils.FileLog.Info("header cookie:" + cookieStr)
  754. tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
  755. if tmpErr != nil {
  756. fmt.Println("创建cookie文件失败:", tmpErr.Error())
  757. return nil
  758. }
  759. if _, err := tmpFile.WriteString(cookieStr); err != nil {
  760. fmt.Println("写入cookie到文件失败:", err.Error())
  761. return nil
  762. }
  763. return nil
  764. }),
  765. )
  766. //if err != nil {
  767. // fmt.Println(err)
  768. //}
  769. return
  770. }
  771. // checkIsLoginPage
  772. // @Description: 校验是否是登录页
  773. // @author: Roc
  774. // @datetime 2024-07-09 16:34:17
  775. // @param bodyStr string
  776. // @return isLoginPage bool
  777. func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
  778. // 初始化goquery.Document
  779. doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
  780. if err != nil {
  781. log.Fatal(err)
  782. }
  783. // 查找name为LoginForm的表单
  784. doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
  785. // 如果找到了,打印信息表示这是登录页
  786. //fmt.Println("这是一个登录页面")
  787. isLoginPage = true
  788. return
  789. })
  790. // 如果没有找到,打印信息表示这不是登录页
  791. //if doc.Find("form[name=LoginForm]").Length() == 0 {
  792. // fmt.Println("这不是一个登录页面")
  793. //}
  794. return
  795. }