common.go 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863
  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "context"
  6. "encoding/json"
  7. "eta/eta_data_analysis/utils"
  8. "fmt"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/chromedp/cdproto/network"
  11. "github.com/chromedp/chromedp"
  12. "golang.org/x/net/html/charset"
  13. "golang.org/x/text/encoding/simplifiedchinese"
  14. "golang.org/x/text/transform"
  15. "io"
  16. "log"
  17. "net/http"
  18. "net/url"
  19. "os"
  20. "path/filepath"
  21. "regexp"
  22. "strconv"
  23. "strings"
  24. "time"
  25. )
  // Addresses of the CCF (www.ccf.com.cn) site used by this package.
  const (
  	// CCFSearchPageUrl is the address of the CCF search page.
  	CCFSearchPageUrl = "https://www.ccf.com.cn/newscenter/simplesearch.php"
  	// CCFReportDetailBaseUrl is the base address of CCF report detail pages.
  	CCFReportDetailBaseUrl = "https://www.ccf.com.cn"
  )
  30. // postEdbLib 调用指标接口
  31. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  32. postUrl := utils.EDB_LIB_URL + method
  33. postData, err := json.Marshal(param)
  34. if err != nil {
  35. return
  36. }
  37. result, err = httpPost(postUrl, string(postData), "application/json")
  38. if err != nil {
  39. return
  40. }
  41. return
  42. }
  43. // httpPost HTTP请求
  44. func httpPost(url, postData string, params ...string) ([]byte, error) {
  45. fmt.Println("httpPost Url:" + url)
  46. body := io.NopCloser(strings.NewReader(postData))
  47. client := &http.Client{}
  48. req, err := http.NewRequest("POST", url, body)
  49. if err != nil {
  50. return nil, err
  51. }
  52. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  53. if len(params) > 0 && params[0] != "" {
  54. contentType = params[0]
  55. }
  56. req.Header.Set("Content-Type", contentType)
  57. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  58. resp, err := client.Do(req)
  59. if err != nil {
  60. fmt.Println("client.Do err:" + err.Error())
  61. return nil, err
  62. }
  63. defer func() {
  64. _ = resp.Body.Close()
  65. }()
  66. b, err := io.ReadAll(resp.Body)
  67. if err != nil {
  68. fmt.Println("httpPost:" + string(b))
  69. }
  70. return b, err
  71. }
  72. // fetchPageHtml 获取网站HTML文本
  73. func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
  74. defer func() {
  75. if err != nil {
  76. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  77. utils.FileLog.Info(tips)
  78. fmt.Println(tips)
  79. }
  80. }()
  81. // 查询次数
  82. fetchNum++
  83. if baseUrl == "" {
  84. err = fmt.Errorf("CCF请求地址为空")
  85. return
  86. }
  87. // 获取Cookie
  88. strCookie, e := getCookie()
  89. if e != nil {
  90. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  91. return
  92. }
  93. if strCookie == "" && fetchNum < 2 {
  94. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  95. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  96. _, err = getCookieByChrome()
  97. if err != nil {
  98. return
  99. }
  100. return fetchPageHtml(baseUrl, fetchNum)
  101. }
  102. // 拉取网站内容
  103. cli := new(http.Client)
  104. req, e := http.NewRequest("GET", baseUrl, nil)
  105. if e != nil {
  106. err = fmt.Errorf("")
  107. return
  108. }
  109. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  110. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  111. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  112. req.Header.Set("Connection", "keep-alive")
  113. req.Header.Set("Cookie", strCookie)
  114. req.Header.Set("Host", "www.ccf.com.cn")
  115. req.Header.Set("Referer", baseUrl)
  116. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  117. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  118. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  119. req.Header.Set("Sec-Fetch-Dest", "empty")
  120. req.Header.Set("Sec-Fetch-Mode", "cors")
  121. req.Header.Set("Sec-Fetch-Site", "same-origin")
  122. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  123. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  124. resp, e := cli.Do(req)
  125. if e != nil {
  126. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  127. return
  128. }
  129. defer func() {
  130. _ = resp.Body.Close()
  131. }()
  132. // 读取响应的内容
  133. reader, e := gzip.NewReader(resp.Body)
  134. if e != nil {
  135. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  136. return
  137. }
  138. body, e := io.ReadAll(reader)
  139. if e != nil {
  140. err = fmt.Errorf("read body err: %s", e.Error())
  141. return
  142. }
  143. // 转换编码
  144. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  145. if e != nil {
  146. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  147. return
  148. }
  149. utf8Body, e := io.ReadAll(utf8Reader)
  150. if e != nil {
  151. err = fmt.Errorf("utf8 body err: %s", e.Error())
  152. return
  153. }
  154. respBody = utf8Body
  155. isLoginPage := checkIsLoginPage(string(respBody))
  156. fmt.Println("是否登录页:", isLoginPage)
  157. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  158. if isLoginPage && fetchNum < 2 {
  159. _, err = getCookieByChrome()
  160. if err != nil {
  161. return
  162. }
  163. return fetchPageHtml(baseUrl, fetchNum)
  164. }
  165. return
  166. }
  167. // DataRule 数据爬取规则
  168. type DataRule struct {
  169. Name string `json:"Name"`
  170. Frequency string `json:"Frequency"`
  171. PageDir string `json:"PageDir"`
  172. Search struct {
  173. ClassId string `json:"ClassId"`
  174. SubClassId string `json:"SubClassId"`
  175. ProductId string `json:"ProductId"`
  176. SubProductId string `json:"SubProductId"`
  177. SimpleTerms string `json:"SimpleTerms"`
  178. } `json:"Search"`
  179. TableFetch []struct {
  180. Keyword string `json:"Keyword"`
  181. Unit string `json:"Unit"`
  182. } `json:"TableFetch"`
  183. EdbMatch []DataRuleEdbMatch `json:"EdbMatch"`
  184. StockTable struct {
  185. ClassifyId int `json:"ClassifyId"`
  186. } `json:"StockTable"`
  187. }
  // DataRuleEdbMatch is the indicator-matching part of a crawl rule: it names an
  // indicator and the product/market/unit values used to match it.
  type DataRuleEdbMatch struct {
  	IndexCode  string `json:"IndexCode"`
  	IndexName  string `json:"IndexName"`
  	ClassifyId int    `json:"ClassifyId"`
  	Frequency  string `json:"Frequency"`
  	Product    string `json:"Product"`
  	Market     string `json:"Market"`
  	// MatchUnit is the unit used for matching.
  	MatchUnit string `json:"MatchUnit" description:"匹配单位"`
  	// Unit is the actual unit.
  	Unit string `json:"Unit" description:"实际单位"`
  }
  199. // loadDataRule 从配置中读取爬取规则
  200. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  201. if utils.CCFDataRuleFile == "" {
  202. err = fmt.Errorf("rule文件不存在")
  203. return
  204. }
  205. b, e := os.ReadFile(utils.CCFDataRuleFile)
  206. if e != nil {
  207. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  208. return
  209. }
  210. rules := make([]*DataRule, 0)
  211. if e = json.Unmarshal(b, &rules); e != nil {
  212. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  213. return
  214. }
  215. for _, v := range rules {
  216. if v.Name != "" && v.Name == nameKey {
  217. fetchRule = v
  218. return
  219. }
  220. }
  221. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  222. return
  223. }
  224. // savePageHtml 拉取历史报告详情
  225. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  226. if nameKey == "" {
  227. return
  228. }
  229. defer func() {
  230. if err != nil {
  231. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  232. utils.FileLog.Info(tips)
  233. fmt.Println(tips)
  234. }
  235. }()
  236. fetchRule, e := loadDataRule(nameKey)
  237. if e != nil {
  238. err = fmt.Errorf("loadDataRule, err: %v", e)
  239. return
  240. }
  241. if saveDir == "" {
  242. saveDir = "static/ccf"
  243. }
  244. // 获取品种第一页
  245. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  246. if fetchRule.Search.ClassId != "" {
  247. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  248. }
  249. if fetchRule.Search.SubClassId != "" {
  250. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  251. }
  252. if fetchRule.Search.ProductId != "" {
  253. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  254. }
  255. if fetchRule.Search.SubProductId != "" {
  256. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  257. }
  258. if fetchRule.Search.SimpleTerms != "" {
  259. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  260. if e != nil {
  261. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  262. return
  263. }
  264. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  265. }
  266. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  267. // 首页报告链接
  268. firstHtml, e := fetchPageHtml(firstPage, 0)
  269. if e != nil {
  270. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  271. return
  272. }
  273. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  274. if e != nil {
  275. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  276. return
  277. }
  278. var historyHrefs []ReportHrefs
  279. historyHrefs = append(historyHrefs, firstHrefs...)
  280. ticker := time.NewTicker(5 * time.Second)
  281. defer ticker.Stop()
  282. // 历史报告
  283. if historyPage {
  284. endPage, e := analysisEndPage(firstHtml)
  285. if e != nil {
  286. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  287. return
  288. }
  289. if endPage > 1 {
  290. for i := 2; i <= endPage; i++ {
  291. <-ticker.C
  292. fmt.Printf("开始读取历史页%d\n", i)
  293. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  294. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  295. fmt.Println("pageUrl: ", pageUrl)
  296. pageContents, e := fetchPageHtml(pageUrl, 0)
  297. if e != nil {
  298. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  299. return
  300. }
  301. pageHrefs, e := analysisReportHrefs(pageContents, i)
  302. if e != nil {
  303. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  304. return
  305. }
  306. historyHrefs = append(historyHrefs, pageHrefs...)
  307. fmt.Printf("结束读取历史页%d\n", i)
  308. }
  309. }
  310. fmt.Println("endPage: ", endPage)
  311. }
  312. fmt.Println("historyHrefs len: ", len(historyHrefs))
  313. fmt.Println("historyHrefs: ", historyHrefs)
  314. // 拉取报告留档
  315. strDate := time.Now().Format("20060102")
  316. reportCount := 0
  317. for _, v := range historyHrefs {
  318. <-ticker.C
  319. if reportMax > 0 {
  320. reportCount += 1
  321. if reportCount > reportMax {
  322. break
  323. }
  324. }
  325. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  326. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
  327. if e != nil {
  328. utils.FileLog.Info("获取页面失败, err: %v", e)
  329. continue
  330. }
  331. dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
  332. if e = utils.MkDir(dateDir); e != nil {
  333. utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
  334. continue
  335. }
  336. outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
  337. if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
  338. utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
  339. continue
  340. }
  341. files = append(files, outputPath)
  342. }
  343. fmt.Println("拉取报告 end")
  344. return
  345. }
  346. // analysisEndPage 读取列表页最后一页页码
  347. func analysisEndPage(contents []byte) (endPage int, err error) {
  348. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  349. if e != nil {
  350. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  351. return
  352. }
  353. // 查找页码元素并遍历a标签
  354. sectionDigg := doc.Find(".digg")
  355. aElements := sectionDigg.Find("a")
  356. // 获取倒数第二个a标签中的页码
  357. totalAElements := aElements.Length()
  358. targetIndex := totalAElements - 2
  359. if targetIndex >= 0 && targetIndex < totalAElements {
  360. targetA := aElements.Eq(targetIndex)
  361. txt := targetA.Text()
  362. endPage, e = strconv.Atoi(txt)
  363. if e != nil {
  364. err = fmt.Errorf("页码文本有误, %s", txt)
  365. return
  366. }
  367. fmt.Println(endPage)
  368. return
  369. }
  370. endPage = 1
  371. return
  372. }
  // ReportHrefs describes one report link taken from a list page.
  type ReportHrefs struct {
  	// Title is the report title.
  	Title string `description:"报告标题"`
  	// Href is the link to the report detail page.
  	Href string `description:"报告详情链接"`
  	// Page is the list page number.
  	Page int `description:"页码"`
  }
  379. // analysisReportHrefs 解析列表页报告链接
  380. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  381. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  382. if e != nil {
  383. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  384. return
  385. }
  386. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  387. href, exists := s.Attr("href")
  388. if exists {
  389. title := s.Text()
  390. hrefs = append(hrefs, ReportHrefs{
  391. Title: title,
  392. Href: href,
  393. Page: page,
  394. })
  395. }
  396. })
  397. return
  398. }
  // reportPublishTimeRe matches a publish timestamp such as "2024年7月9日14:05".
  // Compiled once at package level instead of on every call.
  var reportPublishTimeRe = regexp.MustCompile(`\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2}`)

  // extractReportPublishTime finds the first "YYYY年M月D日H:MM" timestamp in text
  // and returns it as a time in the local time zone.
  //
  // An error is returned when text contains no such timestamp or the matched
  // text is not a valid date.
  func extractReportPublishTime(text string) (time.Time, error) {
  	strTime := reportPublishTimeRe.FindString(text)
  	if strTime == "" {
  		return time.Time{}, fmt.Errorf("没有读取出日期")
  	}

  	// The layout uses "1" and "2" (not "01"/"02") so that both padded and
  	// unpadded month/day values parse; the pattern above accepts one or two
  	// digits, and the zero-padded layout rejected the single-digit form.
  	parsedDate, e := time.ParseInLocation("2006年1月2日15:04", strTime, time.Local)
  	if e != nil {
  		return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  	}
  	return parsedDate, nil
  }
  // intervalNumberRe matches each run of digits in an interval such as "7-9天".
  // Compiled once at package level; calculateDataHalfVal runs for every table cell.
  var intervalNumberRe = regexp.MustCompile(`\d+`)

  // calculateDataHalfVal returns the midpoint of the two integers found in
  // duration, e.g. "7-9天" gives "8" and "8-11天" gives "9.5".
  //
  // A whole-number midpoint is formatted without decimals, otherwise with one
  // decimal place. An error is returned unless duration contains exactly two
  // digit runs that both fit in an int.
  func calculateDataHalfVal(duration string) (result string, err error) {
  	matches := intervalNumberRe.FindAllString(duration, -1)
  	if len(matches) != 2 {
  		err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
  		return
  	}

  	a, e := strconv.Atoi(matches[0])
  	if e != nil {
  		err = e
  		return
  	}
  	b, e := strconv.Atoi(matches[1])
  	if e != nil {
  		err = e
  		return
  	}

  	average := float64(a+b) / 2.0
  	if average == float64(int(average)) {
  		result = strconv.Itoa(int(average))
  		return
  	}
  	result = fmt.Sprintf("%.1f", average)
  	return
  }
  444. // gb2312ToPercentEncoding 中文字符转码
  445. func gb2312ToPercentEncoding(input string) (string, error) {
  446. // 创建GB18030编码转换器(兼容GB2312)
  447. encoder := simplifiedchinese.GB18030.NewEncoder()
  448. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  449. var buf bytes.Buffer
  450. writer := transform.NewWriter(&buf, encoder)
  451. _, err := writer.Write([]byte(input))
  452. if err != nil {
  453. return "", err
  454. }
  455. err = writer.Close()
  456. if err != nil {
  457. return "", err
  458. }
  459. // 将字节流转换为百分号编码
  460. percentEncoded := url.QueryEscape(buf.String())
  461. return percentEncoded, nil
  462. }
  463. // AnalysisNoneMergeTablePars 解析无合并单元格的简单表格入参
  464. type AnalysisNoneMergeTablePars struct {
  465. DocTable *goquery.Selection
  466. MarketCol struct {
  467. HasCol bool `description:"是否有市场列"`
  468. ColIndex int `description:"市场列"`
  469. }
  470. DateCol struct {
  471. StartIndex int `description:"日期开始列"`
  472. EndIndex int `description:"日期结束列"`
  473. PublishTime time.Time `description:"报告发布时间"`
  474. //PublishYear int `description:"报告发布年份"`
  475. StrTimeFormat string `description:"数据日期格式-需拼接日期列中的变量"`
  476. TimeFormat []string `description:"标准日期格式, 可能存在多种分别进行遍历"`
  477. SplitLast bool `description:"是否分隔日期: 如1.24-1.28"`
  478. SplitFlag string `description:"分隔日期分隔符: 如-"`
  479. }
  480. ValCol struct {
  481. SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"`
  482. }
  483. }
  // TableRow holds the information read from one table row.
  type TableRow struct {
  	Product string
  	Market  string
  	// DateData holds the row's cell values keyed by date string.
  	DateData map[string]string
  	Unit     string
  }
  491. // analysisNoneMergeTable 解析无合并单元格的简单表格
  492. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  493. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  494. return
  495. }
  496. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  497. colDate := make(map[int]string)
  498. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  499. cells := s.Find("td")
  500. // 表头取出日期
  501. if i == 0 {
  502. cells.Each(func(ii int, ss *goquery.Selection) {
  503. cellTxt := strings.TrimSpace(ss.Text())
  504. //fmt.Println("cellTxt", cellTxt)
  505. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  506. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  507. //var strTimeFormat string
  508. completeTime := cellTxt
  509. // 是否需要拼接年份
  510. if params.DateCol.StrTimeFormat != "" {
  511. strDate := cellTxt
  512. // 是否取分隔日期的后一个日期
  513. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  514. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  515. if len(dateArr) > 1 {
  516. strDate = dateArr[len(dateArr)-1]
  517. }
  518. }
  519. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  520. }
  521. //fmt.Println("completeTime: ", completeTime)
  522. // 遍历多种可能的日期格式
  523. var colTime time.Time
  524. for _, f := range params.DateCol.TimeFormat {
  525. t, e := time.ParseInLocation(f, completeTime, time.Local)
  526. if e != nil {
  527. continue
  528. }
  529. colTime = t
  530. break
  531. }
  532. // 统一判断一次, 入参的日期格式可能不全
  533. if colTime.IsZero() {
  534. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  535. for _, f := range attemptDates {
  536. t, e := time.ParseInLocation(f, completeTime, time.Local)
  537. if e != nil {
  538. continue
  539. }
  540. colTime = t
  541. break
  542. }
  543. }
  544. // 判断报告是否跨年
  545. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  546. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  547. colTime = colTime.AddDate(-1, 0, 0)
  548. }
  549. if !colTime.IsZero() {
  550. colDate[ii] = colTime.Format(utils.FormatDate)
  551. }
  552. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  553. }
  554. })
  555. }
  556. // 取指标
  557. if i > 0 {
  558. row := TableRow{
  559. DateData: make(map[string]string),
  560. }
  561. cells.Each(func(ii int, ss *goquery.Selection) {
  562. cellTxt := filterInvalidVal(ss.Text())
  563. //fmt.Println("cellTxt", cellTxt)
  564. if ii == 0 {
  565. row.Product = cellTxt
  566. }
  567. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  568. row.Market = cellTxt
  569. }
  570. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  571. d, ok := colDate[ii]
  572. if !ok {
  573. return
  574. }
  575. // 是否取折中值
  576. if params.ValCol.SplitHalfVal {
  577. val, e := calculateDataHalfVal(cellTxt)
  578. if e != nil {
  579. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  580. return
  581. }
  582. cellTxt = val
  583. }
  584. if cellTxt != "" {
  585. row.DateData[d] = cellTxt
  586. }
  587. }
  588. })
  589. //fmt.Println(row)
  590. items = append(items, row)
  591. }
  592. })
  593. return
  594. }
  595. // formatTableRow2ValidEdb 表格行转换为有效指标
  596. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  597. indexes = make([]*HandleIndexData, 0)
  598. for _, m := range edbMatch {
  599. for _, v := range rows {
  600. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  601. var productOk, marketOk, unitOk bool
  602. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  603. productOk = true
  604. }
  605. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  606. marketOk = true
  607. }
  608. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  609. unitOk = true
  610. }
  611. if productOk && marketOk && unitOk {
  612. edb := new(HandleIndexData)
  613. edb.IndexCode = m.IndexCode
  614. edb.IndexName = m.IndexName
  615. edb.ClassifyId = m.ClassifyId
  616. edb.Frequency = m.Frequency
  617. edb.Unit = m.Unit
  618. edb.DateData = v.DateData
  619. edb.TerminalCode = utils.TerminalCode
  620. indexes = append(indexes, edb)
  621. }
  622. }
  623. }
  624. return
  625. }
  // listFiles walks dirPath recursively and returns the base name of every
  // non-directory entry found. The first error reported by the walk aborts it
  // and is returned with a nil slice.
  func listFiles(dirPath string) ([]string, error) {
  	var names []string

  	collect := func(_ string, entry os.FileInfo, walkErr error) error {
  		if walkErr != nil {
  			return walkErr
  		}
  		if entry.IsDir() {
  			return nil
  		}
  		names = append(names, entry.Name())
  		return nil
  	}

  	if walkErr := filepath.Walk(dirPath, collect); walkErr != nil {
  		return nil, walkErr
  	}
  	return names, nil
  }
  // filterInvalidVal trims cellTxt and returns "" when the result is one of the
  // placeholders that stand for "no value" ("休市", "/", "-", "—"); otherwise it
  // returns the trimmed text.
  func filterInvalidVal(cellTxt string) string {
  	trimmed := strings.TrimSpace(cellTxt)
  	switch trimmed {
  	case "休市", "/", "-", "—":
  		return ""
  	}
  	return trimmed
  }
  651. // formatIntervalData 格式化区间值
  652. func formatIntervalData(cellTxt, flag string) string {
  653. cellTxt = filterInvalidVal(cellTxt)
  654. if flag == "" {
  655. flag = "-"
  656. }
  657. matches := strings.Split(cellTxt, flag)
  658. if len(matches) < 2 {
  659. return cellTxt
  660. }
  661. if len(matches) != 2 {
  662. return ""
  663. }
  664. // 转换不了直接返回空值
  665. a, e := strconv.ParseFloat(matches[0], 64)
  666. if e != nil {
  667. return ""
  668. }
  669. b, e := strconv.ParseFloat(matches[1], 64)
  670. if e != nil {
  671. return ""
  672. }
  673. average := (a + b) / 2
  674. return fmt.Sprint(average)
  675. }
  676. // getCookie
  677. // @Description: 获取cookie
  678. // @author: Roc
  679. // @datetime 2024-07-09 14:00:53
  680. // @return cookieStr string
  681. // @return err error
  682. func getCookie() (cookieStr string, err error) {
  683. // 读取Cookie
  684. if utils.CCFCookieFile == "" {
  685. err = fmt.Errorf("cookie文件未配置")
  686. return
  687. }
  688. cookieByte, e := os.ReadFile(utils.CCFCookieFile)
  689. if e != nil {
  690. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  691. return
  692. }
  693. cookieStr = strings.TrimSpace(string(cookieByte))
  694. //if cookieStr == "" {
  695. // err = fmt.Errorf("cookie为空")
  696. // return
  697. //}
  698. return
  699. }
  700. // getCookieByChrome
  701. // @Description: 获取cookie
  702. // @author: Roc
  703. // @datetime 2024-07-09 14:00:53
  704. // @return cookieStr string
  705. // @return err error
  706. func getCookieByChrome() (cookieStr string, err error) {
  707. // 读取Cookie
  708. if utils.CCFUseName == "" {
  709. err = fmt.Errorf("CCF账号未设置")
  710. return
  711. }
  712. if utils.CCFPassword == "" {
  713. err = fmt.Errorf("CCF密码未设置")
  714. return
  715. }
  716. opts := append(
  717. chromedp.DefaultExecAllocatorOptions[:],
  718. chromedp.Flag("headless", false),
  719. )
  720. allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
  721. defer cancel1()
  722. // 创建chrome实例
  723. ctx, cancel2 := chromedp.NewContext(
  724. allocCtx,
  725. chromedp.WithLogf(log.Printf),
  726. )
  727. defer cancel2()
  728. err = chromedp.Run(ctx,
  729. chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
  730. chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
  731. chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
  732. chromedp.Sleep(2*time.Second),
  733. chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
  734. chromedp.Sleep(5*time.Second),
  735. chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
  736. chromedp.Sleep(2*time.Second),
  737. chromedp.ActionFunc(func(ctx context.Context) error {
  738. cookies, err := network.GetCookies().Do(ctx)
  739. if err != nil {
  740. return err
  741. }
  742. //cookieJson, err := json.Marshal(cookies)
  743. //if err != nil {
  744. // return err
  745. //}
  746. //fmt.Println("cookieJson:", string(cookieJson))
  747. //utils.FileLog.Info("cookieJson:" + string(cookieJson))
  748. for _, v := range cookies {
  749. cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
  750. }
  751. fmt.Println("header cookie:", cookieStr)
  752. utils.FileLog.Info("header cookie:" + cookieStr)
  753. tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
  754. if tmpErr != nil {
  755. fmt.Println("创建cookie文件失败:", tmpErr.Error())
  756. return nil
  757. }
  758. if _, err := tmpFile.WriteString(cookieStr); err != nil {
  759. fmt.Println("写入cookie到文件失败:", err.Error())
  760. return nil
  761. }
  762. return nil
  763. }),
  764. )
  765. //if err != nil {
  766. // fmt.Println(err)
  767. //}
  768. return
  769. }
  770. // checkIsLoginPage
  771. // @Description: 校验是否是登录页
  772. // @author: Roc
  773. // @datetime 2024-07-09 16:34:17
  774. // @param bodyStr string
  775. // @return isLoginPage bool
  776. func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
  777. // 初始化goquery.Document
  778. doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
  779. if err != nil {
  780. log.Fatal(err)
  781. }
  782. // 查找name为LoginForm的表单
  783. doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
  784. // 如果找到了,打印信息表示这是登录页
  785. //fmt.Println("这是一个登录页面")
  786. isLoginPage = true
  787. return
  788. })
  789. // 如果没有找到,打印信息表示这不是登录页
  790. //if doc.Find("form[name=LoginForm]").Length() == 0 {
  791. // fmt.Println("这不是一个登录页面")
  792. //}
  793. return
  794. }