common.go 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855
  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "context"
  6. "encoding/json"
  7. "eta/eta_data_analysis/utils"
  8. "fmt"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/chromedp/cdproto/network"
  11. "github.com/chromedp/chromedp"
  12. "golang.org/x/net/html/charset"
  13. "golang.org/x/text/encoding/simplifiedchinese"
  14. "golang.org/x/text/transform"
  15. "io"
  16. "log"
  17. "net/http"
  18. "net/url"
  19. "os"
  20. "path/filepath"
  21. "regexp"
  22. "strconv"
  23. "strings"
  24. "time"
  25. )
const (
	CCFSearchPageUrl       = "https://www.ccf.com.cn/newscenter/simplesearch.php" // CCF search page URL
	CCFReportDetailBaseUrl = "https://www.ccf.com.cn"                             // CCF report detail page base URL
)
  30. // postEdbLib 调用指标接口
  31. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  32. postUrl := utils.EDB_LIB_URL + method
  33. postData, err := json.Marshal(param)
  34. if err != nil {
  35. return
  36. }
  37. result, err = httpPost(postUrl, string(postData), "application/json")
  38. if err != nil {
  39. return
  40. }
  41. return
  42. }
  43. // httpPost HTTP请求
  44. func httpPost(url, postData string, params ...string) ([]byte, error) {
  45. fmt.Println("httpPost Url:" + url)
  46. body := io.NopCloser(strings.NewReader(postData))
  47. client := &http.Client{}
  48. req, err := http.NewRequest("POST", url, body)
  49. if err != nil {
  50. return nil, err
  51. }
  52. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  53. if len(params) > 0 && params[0] != "" {
  54. contentType = params[0]
  55. }
  56. req.Header.Set("Content-Type", contentType)
  57. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  58. resp, err := client.Do(req)
  59. if err != nil {
  60. fmt.Println("client.Do err:" + err.Error())
  61. return nil, err
  62. }
  63. defer func() {
  64. _ = resp.Body.Close()
  65. }()
  66. b, err := io.ReadAll(resp.Body)
  67. if err != nil {
  68. fmt.Println("httpPost:" + string(b))
  69. }
  70. return b, err
  71. }
  72. // fetchPageHtml 获取网站HTML文本
  73. func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
  74. defer func() {
  75. if err != nil {
  76. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  77. utils.FileLog.Info(tips)
  78. fmt.Println(tips)
  79. }
  80. }()
  81. // 查询次数
  82. fetchNum++
  83. if baseUrl == "" {
  84. err = fmt.Errorf("CCF请求地址为空")
  85. return
  86. }
  87. // 获取Cookie
  88. strCookie, e := getCookie()
  89. if e != nil {
  90. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  91. return
  92. }
  93. // 拉取网站内容
  94. cli := new(http.Client)
  95. req, e := http.NewRequest("GET", baseUrl, nil)
  96. if e != nil {
  97. err = fmt.Errorf("")
  98. return
  99. }
  100. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  101. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  102. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  103. req.Header.Set("Connection", "keep-alive")
  104. req.Header.Set("Cookie", strCookie)
  105. req.Header.Set("Host", "www.ccf.com.cn")
  106. req.Header.Set("Referer", baseUrl)
  107. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  108. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  109. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  110. req.Header.Set("Sec-Fetch-Dest", "empty")
  111. req.Header.Set("Sec-Fetch-Mode", "cors")
  112. req.Header.Set("Sec-Fetch-Site", "same-origin")
  113. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  114. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  115. resp, e := cli.Do(req)
  116. if e != nil {
  117. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  118. return
  119. }
  120. defer func() {
  121. _ = resp.Body.Close()
  122. }()
  123. // 读取响应的内容
  124. reader, e := gzip.NewReader(resp.Body)
  125. if e != nil {
  126. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  127. return
  128. }
  129. body, e := io.ReadAll(reader)
  130. if e != nil {
  131. err = fmt.Errorf("read body err: %s", e.Error())
  132. return
  133. }
  134. // 转换编码
  135. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  136. if e != nil {
  137. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  138. return
  139. }
  140. utf8Body, e := io.ReadAll(utf8Reader)
  141. if e != nil {
  142. err = fmt.Errorf("utf8 body err: %s", e.Error())
  143. return
  144. }
  145. respBody = utf8Body
  146. isLoginPage := checkIsLoginPage(string(respBody))
  147. fmt.Println("是否登录页:", isLoginPage)
  148. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  149. if isLoginPage && fetchNum < 2 {
  150. _, err = getCookieByChrome()
  151. if err != nil {
  152. return
  153. }
  154. return fetchPageHtml(baseUrl, fetchNum)
  155. }
  156. return
  157. }
// DataRule is one data-scraping rule as stored in the JSON rule file that
// loadDataRule reads.
type DataRule struct {
	// Name identifies the rule; loadDataRule selects a rule by comparing
	// this value with the requested key.
	Name      string `json:"Name"`
	Frequency string `json:"Frequency"`
	PageDir   string `json:"PageDir"`
	// Search holds the filters savePageHtml appends to the CCF search URL.
	// Empty fields are left out of the query string.
	Search struct {
		ClassId      string `json:"ClassId"`
		SubClassId   string `json:"SubClassId"`
		ProductId    string `json:"ProductId"`
		SubProductId string `json:"SubProductId"`
		SimpleTerms  string `json:"SimpleTerms"` // free-text term; percent-encoded by gb2312ToPercentEncoding before use
	} `json:"Search"`
	// TableFetch is not read anywhere in this file.
	// NOTE(review): presumably keyword/unit pairs used to locate tables in a report — confirm against the callers.
	TableFetch []struct {
		Keyword string `json:"Keyword"`
		Unit    string `json:"Unit"`
	} `json:"TableFetch"`
	// EdbMatch lists the indicators that table rows are matched against.
	EdbMatch   []DataRuleEdbMatch `json:"EdbMatch"`
	StockTable struct {
		ClassifyId int `json:"ClassifyId"`
	} `json:"StockTable"`
}
// DataRuleEdbMatch is the indicator-matching part of a scraping rule.
// formatTableRow2ValidEdb compares Product, Market and MatchUnit with a table
// row and copies the remaining fields onto the resulting indicator.
type DataRuleEdbMatch struct {
	IndexCode  string `json:"IndexCode"`
	IndexName  string `json:"IndexName"`
	ClassifyId int    `json:"ClassifyId"`
	Frequency  string `json:"Frequency"`
	Product    string `json:"Product"`                        // substring expected in the row's product; empty matches only an empty product
	Market     string `json:"Market"`                         // substring expected in the row's market; empty matches only an empty market
	MatchUnit  string `json:"MatchUnit" description:"匹配单位"` // unit used for matching against the row's unit
	Unit       string `json:"Unit" description:"实际单位"`      // actual unit stored on the indicator
}
  190. // loadDataRule 从配置中读取爬取规则
  191. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  192. if utils.CCFDataRuleFile == "" {
  193. err = fmt.Errorf("rule文件不存在")
  194. return
  195. }
  196. b, e := os.ReadFile(utils.CCFDataRuleFile)
  197. if e != nil {
  198. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  199. return
  200. }
  201. rules := make([]*DataRule, 0)
  202. if e = json.Unmarshal(b, &rules); e != nil {
  203. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  204. return
  205. }
  206. for _, v := range rules {
  207. if v.Name != "" && v.Name == nameKey {
  208. fetchRule = v
  209. return
  210. }
  211. }
  212. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  213. return
  214. }
  215. // savePageHtml 拉取历史报告详情
  216. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  217. if nameKey == "" {
  218. return
  219. }
  220. defer func() {
  221. if err != nil {
  222. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  223. utils.FileLog.Info(tips)
  224. fmt.Println(tips)
  225. }
  226. }()
  227. fetchRule, e := loadDataRule(nameKey)
  228. if e != nil {
  229. err = fmt.Errorf("loadDataRule, err: %v", e)
  230. return
  231. }
  232. if saveDir == "" {
  233. saveDir = "static/ccf"
  234. }
  235. // 获取品种第一页
  236. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  237. if fetchRule.Search.ClassId != "" {
  238. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  239. }
  240. if fetchRule.Search.SubClassId != "" {
  241. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  242. }
  243. if fetchRule.Search.ProductId != "" {
  244. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  245. }
  246. if fetchRule.Search.SubProductId != "" {
  247. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  248. }
  249. if fetchRule.Search.SimpleTerms != "" {
  250. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  251. if e != nil {
  252. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  253. return
  254. }
  255. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  256. }
  257. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  258. // 首页报告链接
  259. firstHtml, e := fetchPageHtml(firstPage, 0)
  260. if e != nil {
  261. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  262. return
  263. }
  264. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  265. if e != nil {
  266. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  267. return
  268. }
  269. var historyHrefs []ReportHrefs
  270. historyHrefs = append(historyHrefs, firstHrefs...)
  271. ticker := time.NewTicker(5 * time.Second)
  272. defer ticker.Stop()
  273. // 历史报告
  274. if historyPage {
  275. endPage, e := analysisEndPage(firstHtml)
  276. if e != nil {
  277. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  278. return
  279. }
  280. if endPage > 1 {
  281. for i := 2; i <= endPage; i++ {
  282. <-ticker.C
  283. fmt.Printf("开始读取历史页%d\n", i)
  284. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  285. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  286. fmt.Println("pageUrl: ", pageUrl)
  287. pageContents, e := fetchPageHtml(pageUrl, 0)
  288. if e != nil {
  289. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  290. return
  291. }
  292. pageHrefs, e := analysisReportHrefs(pageContents, i)
  293. if e != nil {
  294. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  295. return
  296. }
  297. historyHrefs = append(historyHrefs, pageHrefs...)
  298. fmt.Printf("结束读取历史页%d\n", i)
  299. }
  300. }
  301. fmt.Println("endPage: ", endPage)
  302. }
  303. fmt.Println("historyHrefs len: ", len(historyHrefs))
  304. fmt.Println("historyHrefs: ", historyHrefs)
  305. // 拉取报告留档
  306. strDate := time.Now().Format("20060102")
  307. reportCount := 0
  308. for _, v := range historyHrefs {
  309. <-ticker.C
  310. if reportMax > 0 {
  311. reportCount += 1
  312. if reportCount > reportMax {
  313. break
  314. }
  315. }
  316. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  317. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
  318. if e != nil {
  319. utils.FileLog.Info("获取页面失败, err: %v", e)
  320. continue
  321. }
  322. dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
  323. if e = utils.MkDir(dateDir); e != nil {
  324. utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
  325. continue
  326. }
  327. outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
  328. if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
  329. utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
  330. continue
  331. }
  332. files = append(files, outputPath)
  333. }
  334. fmt.Println("拉取报告 end")
  335. return
  336. }
  337. // analysisEndPage 读取列表页最后一页页码
  338. func analysisEndPage(contents []byte) (endPage int, err error) {
  339. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  340. if e != nil {
  341. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  342. return
  343. }
  344. // 查找页码元素并遍历a标签
  345. sectionDigg := doc.Find(".digg")
  346. aElements := sectionDigg.Find("a")
  347. // 获取倒数第二个a标签中的页码
  348. totalAElements := aElements.Length()
  349. targetIndex := totalAElements - 2
  350. if targetIndex >= 0 && targetIndex < totalAElements {
  351. targetA := aElements.Eq(targetIndex)
  352. txt := targetA.Text()
  353. endPage, e = strconv.Atoi(txt)
  354. if e != nil {
  355. err = fmt.Errorf("页码文本有误, %s", txt)
  356. return
  357. }
  358. fmt.Println(endPage)
  359. return
  360. }
  361. endPage = 1
  362. return
  363. }
// ReportHrefs is one report link taken from a search result page.
type ReportHrefs struct {
	Title string `description:"报告标题"`   // report title (text of the link)
	Href  string `description:"报告详情链接"` // link to the report detail page (the href attribute)
	Page  int    `description:"页码"`     // number of the result page the link was found on
}
  370. // analysisReportHrefs 解析列表页报告链接
  371. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  372. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  373. if e != nil {
  374. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  375. return
  376. }
  377. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  378. href, exists := s.Attr("href")
  379. if exists {
  380. title := s.Text()
  381. hrefs = append(hrefs, ReportHrefs{
  382. Title: title,
  383. Href: href,
  384. Page: page,
  385. })
  386. }
  387. })
  388. return
  389. }
  390. // extractReportPublishTime 提取报告发布时间
  391. func extractReportPublishTime(text string) (time.Time, error) {
  392. datePattern := `(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`
  393. re := regexp.MustCompile(datePattern)
  394. var strTime string
  395. match := re.FindStringSubmatch(text)
  396. if len(match) <= 0 {
  397. return time.Time{}, fmt.Errorf("没有读取出日期")
  398. }
  399. strTime = match[0]
  400. // 转为时间格式
  401. dateFormat := "2006年01月02日15:04"
  402. parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
  403. if e != nil {
  404. return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  405. }
  406. return parsedDate, nil
  407. }
  408. // calculateDataHalfVal 取出数据区间的折中值, 如"7-9天"返回结果为"8"
  409. func calculateDataHalfVal(duration string) (result string, err error) {
  410. re := regexp.MustCompile(`\d+`)
  411. matches := re.FindAllString(duration, -1)
  412. if len(matches) != 2 {
  413. err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
  414. return
  415. }
  416. a, e := strconv.Atoi(matches[0])
  417. if e != nil {
  418. err = e
  419. return
  420. }
  421. b, e := strconv.Atoi(matches[1])
  422. if e != nil {
  423. err = e
  424. return
  425. }
  426. average := float64(a+b) / 2.0
  427. // 格式化结果
  428. if average == float64(int(average)) {
  429. result = strconv.Itoa(int(average))
  430. } else {
  431. result = fmt.Sprintf("%.1f", average)
  432. }
  433. return
  434. }
  435. // gb2312ToPercentEncoding 中文字符转码
  436. func gb2312ToPercentEncoding(input string) (string, error) {
  437. // 创建GB18030编码转换器(兼容GB2312)
  438. encoder := simplifiedchinese.GB18030.NewEncoder()
  439. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  440. var buf bytes.Buffer
  441. writer := transform.NewWriter(&buf, encoder)
  442. _, err := writer.Write([]byte(input))
  443. if err != nil {
  444. return "", err
  445. }
  446. err = writer.Close()
  447. if err != nil {
  448. return "", err
  449. }
  450. // 将字节流转换为百分号编码
  451. percentEncoded := url.QueryEscape(buf.String())
  452. return percentEncoded, nil
  453. }
// AnalysisNoneMergeTablePars is the input of analysisNoneMergeTable, which
// parses a simple table that has no merged cells.
type AnalysisNoneMergeTablePars struct {
	// DocTable is the table selection; its "tbody" children are read as rows.
	DocTable  *goquery.Selection
	MarketCol struct {
		HasCol   bool `description:"是否有市场列"` // whether the table has a market column
		ColIndex int  `description:"市场列"`    // index of the market column
	}
	DateCol struct {
		StartIndex  int       `description:"日期开始列"`  // first date column (inclusive)
		EndIndex    int       `description:"日期结束列"`  // last date column (inclusive)
		PublishTime time.Time `description:"报告发布时间"` // report publish time; supplies the year and the cross-year check
		//PublishYear int `description:"报告发布年份"`
		StrTimeFormat string   `description:"数据日期格式-需拼接日期列中的变量"`   // fmt format applied to (publish year, header text); empty means the header text is used as-is
		TimeFormat    []string `description:"标准日期格式, 可能存在多种分别进行遍历"` // time layouts tried in order on the header date
		SplitLast     bool     `description:"是否分隔日期: 如1.24-1.28"`  // take the last part of a split header date, e.g. 1.24-1.28
		SplitFlag     string   `description:"分隔日期分隔符: 如-"`         // separator used when SplitLast is set, e.g. "-"
	}
	ValCol struct {
		SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"` // replace a range value such as "8-10天" with its midpoint
	}
}
// TableRow is one data row read from a table by analysisNoneMergeTable.
type TableRow struct {
	Product  string            // text of the row's first cell
	Market   string            // text of the market column, when the table has one
	DateData map[string]string // cell text keyed by the formatted date of its column
	Unit     string            // not set by analysisNoneMergeTable; compared with MatchUnit in formatTableRow2ValidEdb
}
  482. // analysisNoneMergeTable 解析无合并单元格的简单表格
  483. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  484. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  485. return
  486. }
  487. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  488. colDate := make(map[int]string)
  489. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  490. cells := s.Find("td")
  491. // 表头取出日期
  492. if i == 0 {
  493. cells.Each(func(ii int, ss *goquery.Selection) {
  494. cellTxt := strings.TrimSpace(ss.Text())
  495. //fmt.Println("cellTxt", cellTxt)
  496. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  497. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  498. //var strTimeFormat string
  499. completeTime := cellTxt
  500. // 是否需要拼接年份
  501. if params.DateCol.StrTimeFormat != "" {
  502. strDate := cellTxt
  503. // 是否取分隔日期的后一个日期
  504. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  505. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  506. if len(dateArr) > 1 {
  507. strDate = dateArr[len(dateArr)-1]
  508. }
  509. }
  510. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  511. }
  512. //fmt.Println("completeTime: ", completeTime)
  513. // 遍历多种可能的日期格式
  514. var colTime time.Time
  515. for _, f := range params.DateCol.TimeFormat {
  516. t, e := time.ParseInLocation(f, completeTime, time.Local)
  517. if e != nil {
  518. continue
  519. }
  520. colTime = t
  521. break
  522. }
  523. // 统一判断一次, 入参的日期格式可能不全
  524. if colTime.IsZero() {
  525. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  526. for _, f := range attemptDates {
  527. t, e := time.ParseInLocation(f, completeTime, time.Local)
  528. if e != nil {
  529. continue
  530. }
  531. colTime = t
  532. break
  533. }
  534. }
  535. // 判断报告是否跨年
  536. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  537. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  538. colTime = colTime.AddDate(-1, 0, 0)
  539. }
  540. if !colTime.IsZero() {
  541. colDate[ii] = colTime.Format(utils.FormatDate)
  542. }
  543. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  544. }
  545. })
  546. }
  547. // 取指标
  548. if i > 0 {
  549. row := TableRow{
  550. DateData: make(map[string]string),
  551. }
  552. cells.Each(func(ii int, ss *goquery.Selection) {
  553. cellTxt := filterInvalidVal(ss.Text())
  554. //fmt.Println("cellTxt", cellTxt)
  555. if ii == 0 {
  556. row.Product = cellTxt
  557. }
  558. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  559. row.Market = cellTxt
  560. }
  561. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  562. d, ok := colDate[ii]
  563. if !ok {
  564. return
  565. }
  566. // 是否取折中值
  567. if params.ValCol.SplitHalfVal {
  568. val, e := calculateDataHalfVal(cellTxt)
  569. if e != nil {
  570. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  571. return
  572. }
  573. cellTxt = val
  574. }
  575. if cellTxt != "" {
  576. row.DateData[d] = cellTxt
  577. }
  578. }
  579. })
  580. //fmt.Println(row)
  581. items = append(items, row)
  582. }
  583. })
  584. return
  585. }
  586. // formatTableRow2ValidEdb 表格行转换为有效指标
  587. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  588. indexes = make([]*HandleIndexData, 0)
  589. for _, m := range edbMatch {
  590. for _, v := range rows {
  591. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  592. var productOk, marketOk, unitOk bool
  593. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  594. productOk = true
  595. }
  596. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  597. marketOk = true
  598. }
  599. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  600. unitOk = true
  601. }
  602. if productOk && marketOk && unitOk {
  603. edb := new(HandleIndexData)
  604. edb.IndexCode = m.IndexCode
  605. edb.IndexName = m.IndexName
  606. edb.ClassifyId = m.ClassifyId
  607. edb.Frequency = m.Frequency
  608. edb.Unit = m.Unit
  609. edb.DateData = v.DateData
  610. edb.TerminalCode = utils.TerminalCode
  611. indexes = append(indexes, edb)
  612. }
  613. }
  614. }
  615. return
  616. }
  617. // listFiles 列出目录下所有文件名
  618. func listFiles(dirPath string) ([]string, error) {
  619. var files []string
  620. err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
  621. if err != nil {
  622. return err
  623. }
  624. if !info.IsDir() {
  625. files = append(files, info.Name())
  626. }
  627. return nil
  628. })
  629. if err != nil {
  630. return nil, err
  631. }
  632. return files, nil
  633. }
  634. // filterInvalidVal 过滤无效值
  635. func filterInvalidVal(cellTxt string) string {
  636. cellTxt = strings.TrimSpace(cellTxt)
  637. if cellTxt == "休市" || cellTxt == "/" || cellTxt == "-" || cellTxt == "—" {
  638. return ""
  639. }
  640. return cellTxt
  641. }
  642. // formatIntervalData 格式化区间值
  643. func formatIntervalData(cellTxt, flag string) string {
  644. cellTxt = filterInvalidVal(cellTxt)
  645. if flag == "" {
  646. flag = "-"
  647. }
  648. matches := strings.Split(cellTxt, flag)
  649. if len(matches) < 2 {
  650. return cellTxt
  651. }
  652. if len(matches) != 2 {
  653. return ""
  654. }
  655. // 转换不了直接返回空值
  656. a, e := strconv.ParseFloat(matches[0], 64)
  657. if e != nil {
  658. return ""
  659. }
  660. b, e := strconv.ParseFloat(matches[1], 64)
  661. if e != nil {
  662. return ""
  663. }
  664. average := (a + b) / 2
  665. return fmt.Sprint(average)
  666. }
  667. // getCookie
  668. // @Description: 获取cookie
  669. // @author: Roc
  670. // @datetime 2024-07-09 14:00:53
  671. // @return cookieStr string
  672. // @return err error
  673. func getCookie() (cookieStr string, err error) {
  674. // 读取Cookie
  675. if utils.CCFCookieFile == "" {
  676. err = fmt.Errorf("cookie文件未配置")
  677. return
  678. }
  679. cookieByte, e := os.ReadFile(utils.CCFCookieFile)
  680. if e != nil {
  681. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  682. return
  683. }
  684. cookieStr = strings.TrimSpace(string(cookieByte))
  685. if cookieStr == "" {
  686. err = fmt.Errorf("cookie为空")
  687. return
  688. }
  689. return
  690. }
  691. // getCookieByChrome
  692. // @Description: 获取cookie
  693. // @author: Roc
  694. // @datetime 2024-07-09 14:00:53
  695. // @return cookieStr string
  696. // @return err error
  697. func getCookieByChrome() (cookieStr string, err error) {
  698. // 读取Cookie
  699. if utils.CCFUseName == "" {
  700. err = fmt.Errorf("CCF账号未设置")
  701. return
  702. }
  703. if utils.CCFPassword == "" {
  704. err = fmt.Errorf("CCF密码未设置")
  705. return
  706. }
  707. opts := append(
  708. chromedp.DefaultExecAllocatorOptions[:],
  709. chromedp.Flag("headless", false),
  710. )
  711. allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
  712. defer cancel1()
  713. // 创建chrome实例
  714. ctx, cancel2 := chromedp.NewContext(
  715. allocCtx,
  716. chromedp.WithLogf(log.Printf),
  717. )
  718. defer cancel2()
  719. err = chromedp.Run(ctx,
  720. chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
  721. chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
  722. chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
  723. chromedp.Sleep(2*time.Second),
  724. chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
  725. chromedp.Sleep(5*time.Second),
  726. chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
  727. chromedp.Sleep(2*time.Second),
  728. chromedp.ActionFunc(func(ctx context.Context) error {
  729. cookies, err := network.GetCookies().Do(ctx)
  730. if err != nil {
  731. return err
  732. }
  733. //cookieJson, err := json.Marshal(cookies)
  734. //if err != nil {
  735. // return err
  736. //}
  737. //fmt.Println("cookieJson:", string(cookieJson))
  738. //utils.FileLog.Info("cookieJson:" + string(cookieJson))
  739. for _, v := range cookies {
  740. cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
  741. }
  742. //fmt.Println("header cookie:", cookieStr)
  743. //utils.FileLog.Info("header cookie:" + cookieStr)
  744. tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
  745. if tmpErr != nil {
  746. fmt.Println("创建cookie文件失败:", tmpErr.Error())
  747. return nil
  748. }
  749. if _, err := tmpFile.WriteString(cookieStr); err != nil {
  750. fmt.Println("写入cookie到文件失败:", err.Error())
  751. return nil
  752. }
  753. return nil
  754. }),
  755. )
  756. //if err != nil {
  757. // fmt.Println(err)
  758. //}
  759. return
  760. }
  761. // checkIsLoginPage
  762. // @Description: 校验是否是登录页
  763. // @author: Roc
  764. // @datetime 2024-07-09 16:34:17
  765. // @param bodyStr string
  766. // @return isLoginPage bool
  767. func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
  768. // 初始化goquery.Document
  769. doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
  770. if err != nil {
  771. log.Fatal(err)
  772. }
  773. // 查找name为LoginForm的表单
  774. doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
  775. // 如果找到了,打印信息表示这是登录页
  776. //fmt.Println("这是一个登录页面")
  777. isLoginPage = true
  778. return
  779. })
  780. // 如果没有找到,打印信息表示这不是登录页
  781. //if doc.Find("form[name=LoginForm]").Length() == 0 {
  782. // fmt.Println("这不是一个登录页面")
  783. //}
  784. return
  785. }