// common.go
  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "context"
  6. "encoding/json"
  7. "eta/eta_data_analysis/utils"
  8. "fmt"
  9. "io"
  10. "log"
  11. "math"
  12. "mime/multipart"
  13. "net/http"
  14. "net/url"
  15. "os"
  16. "path/filepath"
  17. "regexp"
  18. "strconv"
  19. "strings"
  20. "time"
  21. "github.com/PuerkitoBio/goquery"
  22. "github.com/chromedp/cdproto/network"
  23. "github.com/chromedp/chromedp"
  24. "golang.org/x/net/html/charset"
  25. "golang.org/x/text/encoding/simplifiedchinese"
  26. "golang.org/x/text/transform"
  27. )
// Endpoints of the CCF website used by this package.
const (
	CCFSearchPageUrl       = "https://www.ccf.com.cn/newscenter/simplesearch.php" // CCF search (report list) page URL
	CCFReportDetailBaseUrl = "https://www.ccf.com.cn"                             // base URL for CCF report detail pages
	CCFCHARTDATAURL        = "https://www.ccf.com.cn/datacenter/index.php"
)
  33. // postEdbLib 调用指标接口
  34. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  35. postUrl := utils.EDB_LIB_URL + method
  36. postData, err := json.Marshal(param)
  37. if err != nil {
  38. return
  39. }
  40. result, err = httpPost(postUrl, string(postData), "application/json")
  41. if err != nil {
  42. return
  43. }
  44. return
  45. }
  46. // httpPost HTTP请求
  47. func httpPost(url, postData string, params ...string) ([]byte, error) {
  48. fmt.Println("httpPost Url:" + url)
  49. body := io.NopCloser(strings.NewReader(postData))
  50. client := &http.Client{}
  51. req, err := http.NewRequest("POST", url, body)
  52. if err != nil {
  53. return nil, err
  54. }
  55. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  56. if len(params) > 0 && params[0] != "" {
  57. contentType = params[0]
  58. }
  59. req.Header.Set("Content-Type", contentType)
  60. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  61. resp, err := client.Do(req)
  62. if err != nil {
  63. fmt.Println("client.Do err:" + err.Error())
  64. return nil, err
  65. }
  66. defer func() {
  67. _ = resp.Body.Close()
  68. }()
  69. b, err := io.ReadAll(resp.Body)
  70. if err != nil {
  71. fmt.Println("httpPost:" + string(b))
  72. }
  73. return b, err
  74. }
  75. // fetchPageHtml 获取网站HTML文本
  76. func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
  77. defer func() {
  78. if err != nil {
  79. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  80. utils.FileLog.Info(tips)
  81. fmt.Println(tips)
  82. }
  83. }()
  84. // 查询次数
  85. fetchNum++
  86. if baseUrl == "" {
  87. err = fmt.Errorf("CCF请求地址为空")
  88. return
  89. }
  90. // 获取Cookie
  91. strCookie, e := getCookie()
  92. if e != nil {
  93. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  94. return
  95. }
  96. if strCookie == "" && fetchNum < 2 {
  97. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  98. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  99. _, err = getCookieByChrome()
  100. if err != nil {
  101. return
  102. }
  103. return fetchPageHtml(baseUrl, fetchNum)
  104. }
  105. // 拉取网站内容
  106. cli := new(http.Client)
  107. req, e := http.NewRequest("GET", baseUrl, nil)
  108. if e != nil {
  109. err = fmt.Errorf("")
  110. return
  111. }
  112. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  113. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  114. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  115. req.Header.Set("Connection", "keep-alive")
  116. req.Header.Set("Cookie", strCookie)
  117. req.Header.Set("Host", "www.ccf.com.cn")
  118. req.Header.Set("Referer", baseUrl)
  119. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  120. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  121. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  122. req.Header.Set("Sec-Fetch-Dest", "empty")
  123. req.Header.Set("Sec-Fetch-Mode", "cors")
  124. req.Header.Set("Sec-Fetch-Site", "same-origin")
  125. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  126. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  127. resp, e := cli.Do(req)
  128. if e != nil {
  129. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  130. return
  131. }
  132. defer func() {
  133. _ = resp.Body.Close()
  134. }()
  135. // 读取响应的内容
  136. reader, e := gzip.NewReader(resp.Body)
  137. if e != nil {
  138. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  139. return
  140. }
  141. body, e := io.ReadAll(reader)
  142. if e != nil {
  143. err = fmt.Errorf("read body err: %s", e.Error())
  144. return
  145. }
  146. // 转换编码
  147. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  148. if e != nil {
  149. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  150. return
  151. }
  152. utf8Body, e := io.ReadAll(utf8Reader)
  153. if e != nil {
  154. err = fmt.Errorf("utf8 body err: %s", e.Error())
  155. return
  156. }
  157. respBody = utf8Body
  158. isLoginPage := checkIsLoginPage(string(respBody))
  159. fmt.Println("是否登录页:", isLoginPage)
  160. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  161. if isLoginPage && fetchNum < 2 {
  162. _, err = getCookieByChrome()
  163. if err != nil {
  164. return
  165. }
  166. return fetchPageHtml(baseUrl, fetchNum)
  167. }
  168. return
  169. }
  170. // postPageHtml 获取网站HTML文本
  171. func postPageHtml(baseUrl string, formData map[string]string, fetchNum int) (respBody []byte, err error) {
  172. defer func() {
  173. if err != nil {
  174. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  175. utils.FileLog.Info(tips)
  176. fmt.Println(tips)
  177. }
  178. }()
  179. // 查询次数
  180. fetchNum++
  181. if baseUrl == "" {
  182. err = fmt.Errorf("CCF请求地址为空")
  183. return
  184. }
  185. // 获取Cookie
  186. strCookie, e := getCookie()
  187. if e != nil {
  188. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  189. return
  190. }
  191. if strCookie == "" && fetchNum < 2 {
  192. fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
  193. utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
  194. _, err = getCookieByChrome()
  195. if err != nil {
  196. return
  197. }
  198. return postPageHtml(baseUrl, formData, fetchNum)
  199. }
  200. var b bytes.Buffer
  201. writer := multipart.NewWriter(&b)
  202. for k, v := range formData {
  203. _ = writer.WriteField(k, v)
  204. }
  205. writer.Close()
  206. req, e := http.NewRequest("POST", baseUrl, &b)
  207. if e != nil {
  208. err = e
  209. return
  210. }
  211. cli := new(http.Client)
  212. // 设置请求头
  213. req.Header.Add("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  214. req.Header.Add("Accept-Encoding", "gzip, deflate, br, zstd")
  215. req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
  216. req.Header.Add("Cache-Control", "max-age=0")
  217. req.Header.Add("Connection", "keep-alive")
  218. req.Header.Add("Content-Type", writer.FormDataContentType())
  219. req.Header.Add("Cookie", strCookie)
  220. req.Header.Add("Host", "www.ccf.com.cn")
  221. req.Header.Add("Origin", "https://www.ccf.com.cn")
  222. req.Header.Add("Referer", baseUrl)
  223. req.Header.Add("Sec-Ch-Ua", `"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"`)
  224. req.Header.Add("Sec-Ch-Ua-Mobile", " ?0")
  225. req.Header.Add("Sec-Ch-Ua-Platform", `"Windows"`)
  226. req.Header.Add("Sec-Fetch-Dest", "document")
  227. req.Header.Add("Sec-Fetch-Mode", "navigate")
  228. req.Header.Add("Sec-Fetch-Site", "same-origin")
  229. req.Header.Add("Sec-Fetch-User", "?1")
  230. req.Header.Add("Upgrade-Insecure-Requests", "1")
  231. req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
  232. resp, e := cli.Do(req)
  233. if e != nil {
  234. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  235. fmt.Println("HTTP client Do err:", e.Error())
  236. return
  237. }
  238. defer func() {
  239. _ = resp.Body.Close()
  240. }()
  241. reader, e := gzip.NewReader(resp.Body)
  242. if e != nil {
  243. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  244. return
  245. }
  246. body, e := io.ReadAll(reader)
  247. if e != nil {
  248. err = fmt.Errorf("读取body失败, err: %s", e.Error())
  249. return
  250. }
  251. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  252. if e != nil {
  253. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  254. return
  255. }
  256. utf8Body, e := io.ReadAll(utf8Reader)
  257. if e != nil {
  258. err = fmt.Errorf("utf8 body err: %s", e.Error())
  259. return
  260. }
  261. respBody = utf8Body
  262. isLoginPage := checkIsLoginPage(string(respBody))
  263. fmt.Println("是否登录页:", isLoginPage)
  264. // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
  265. if isLoginPage && fetchNum < 2 {
  266. _, err = getCookieByChrome()
  267. if err != nil {
  268. return
  269. }
  270. return postPageHtml(baseUrl, formData, fetchNum)
  271. }
  272. return
  273. }
// DataRule is one crawl rule as stored in the JSON rule file.
type DataRule struct {
	Name      string `json:"Name"` // rule name; loadDataRule selects a rule by this value
	Frequency string `json:"Frequency"`
	PageDir   string `json:"PageDir"`
	// Search holds the query parameters for the report list page.
	Search struct {
		ClassId      string `json:"ClassId"`
		SubClassId   string `json:"SubClassId"`
		ProductId    string `json:"ProductId"`
		SubProductId string `json:"SubProductId"`
		SimpleTerms  string `json:"SimpleTerms"`
	} `json:"Search"`
	// TableFetch lists keyword/unit pairs; their use is outside this part of the file.
	TableFetch []struct {
		Keyword string `json:"Keyword"`
		Unit    string `json:"Unit"`
	} `json:"TableFetch"`
	// EdbMatch maps table rows to indicators.
	EdbMatch   []DataRuleEdbMatch `json:"EdbMatch"`
	StockTable struct {
		ClassifyId int `json:"ClassifyId"`
	} `json:"StockTable"`
}
// DataRuleEdbMatch is the indicator-matching part of a crawl rule: it names
// the indicator and the product/market/unit values a table row must match.
type DataRuleEdbMatch struct {
	IndexCode  string `json:"IndexCode"`
	IndexName  string `json:"IndexName"`
	ClassifyId int    `json:"ClassifyId"`
	Frequency  string `json:"Frequency"`
	Product    string `json:"Product"`
	Market     string `json:"Market"`
	MatchUnit  string `json:"MatchUnit" description:"匹配单位"` // unit used for matching
	Unit       string `json:"Unit" description:"实际单位"`      // actual unit
}
  306. // loadDataRule 从配置中读取爬取规则
  307. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  308. if utils.CCFDataRuleFile == "" {
  309. err = fmt.Errorf("rule文件不存在")
  310. return
  311. }
  312. b, e := os.ReadFile(utils.CCFDataRuleFile)
  313. if e != nil {
  314. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  315. return
  316. }
  317. rules := make([]*DataRule, 0)
  318. if e = json.Unmarshal(b, &rules); e != nil {
  319. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  320. return
  321. }
  322. for _, v := range rules {
  323. if v.Name != "" && v.Name == nameKey {
  324. fetchRule = v
  325. return
  326. }
  327. }
  328. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  329. return
  330. }
  331. // savePageHtml 拉取历史报告详情
  332. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  333. if nameKey == "" {
  334. return
  335. }
  336. defer func() {
  337. if err != nil {
  338. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  339. utils.FileLog.Info(tips)
  340. fmt.Println(tips)
  341. }
  342. }()
  343. fetchRule, e := loadDataRule(nameKey)
  344. if e != nil {
  345. err = fmt.Errorf("loadDataRule, err: %v", e)
  346. return
  347. }
  348. if saveDir == "" {
  349. saveDir = "static/ccf"
  350. }
  351. // 获取品种第一页
  352. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  353. if fetchRule.Search.ClassId != "" {
  354. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  355. }
  356. if fetchRule.Search.SubClassId != "" {
  357. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  358. }
  359. if fetchRule.Search.ProductId != "" {
  360. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  361. }
  362. if fetchRule.Search.SubProductId != "" {
  363. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  364. }
  365. if fetchRule.Search.SimpleTerms != "" {
  366. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  367. if e != nil {
  368. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  369. return
  370. }
  371. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  372. }
  373. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  374. // 首页报告链接
  375. firstHtml, e := fetchPageHtml(firstPage, 0)
  376. if e != nil {
  377. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  378. return
  379. }
  380. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  381. if e != nil {
  382. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  383. return
  384. }
  385. var historyHrefs []ReportHrefs
  386. historyHrefs = append(historyHrefs, firstHrefs...)
  387. ticker := time.NewTicker(5 * time.Second)
  388. defer ticker.Stop()
  389. // 历史报告
  390. if historyPage {
  391. endPage, e := analysisEndPage(firstHtml)
  392. if e != nil {
  393. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  394. return
  395. }
  396. if endPage > 1 {
  397. for i := 2; i <= endPage; i++ {
  398. <-ticker.C
  399. fmt.Printf("开始读取历史页%d\n", i)
  400. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  401. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  402. fmt.Println("pageUrl: ", pageUrl)
  403. pageContents, e := fetchPageHtml(pageUrl, 0)
  404. if e != nil {
  405. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  406. return
  407. }
  408. pageHrefs, e := analysisReportHrefs(pageContents, i)
  409. if e != nil {
  410. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  411. return
  412. }
  413. historyHrefs = append(historyHrefs, pageHrefs...)
  414. fmt.Printf("结束读取历史页%d\n", i)
  415. }
  416. }
  417. fmt.Println("endPage: ", endPage)
  418. }
  419. fmt.Println("historyHrefs len: ", len(historyHrefs))
  420. fmt.Println("historyHrefs: ", historyHrefs)
  421. // 拉取报告留档
  422. strDate := time.Now().Format("20060102")
  423. reportCount := 0
  424. for _, v := range historyHrefs {
  425. <-ticker.C
  426. if reportMax > 0 {
  427. reportCount += 1
  428. if reportCount > reportMax {
  429. break
  430. }
  431. }
  432. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  433. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
  434. if e != nil {
  435. utils.FileLog.Info("获取页面失败, err: %v", e)
  436. continue
  437. }
  438. dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
  439. if e = utils.MkDir(dateDir); e != nil {
  440. utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
  441. continue
  442. }
  443. outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
  444. if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
  445. utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
  446. continue
  447. }
  448. files = append(files, outputPath)
  449. }
  450. fmt.Println("拉取报告 end")
  451. return
  452. }
  453. // analysisEndPage 读取列表页最后一页页码
  454. func analysisEndPage(contents []byte) (endPage int, err error) {
  455. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  456. if e != nil {
  457. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  458. return
  459. }
  460. // 查找页码元素并遍历a标签
  461. sectionDigg := doc.Find(".digg")
  462. aElements := sectionDigg.Find("a")
  463. // 获取倒数第二个a标签中的页码
  464. totalAElements := aElements.Length()
  465. targetIndex := totalAElements - 2
  466. if targetIndex >= 0 && targetIndex < totalAElements {
  467. targetA := aElements.Eq(targetIndex)
  468. txt := targetA.Text()
  469. endPage, e = strconv.Atoi(txt)
  470. if e != nil {
  471. err = fmt.Errorf("页码文本有误, %s", txt)
  472. return
  473. }
  474. fmt.Println(endPage)
  475. return
  476. }
  477. endPage = 1
  478. return
  479. }
// ReportHrefs is one report link taken from a list page.
type ReportHrefs struct {
	Title string `description:"报告标题"`   // report title (link text)
	Href  string `description:"报告详情链接"` // link to the report detail page
	Page  int    `description:"页码"`     // list page number the link was found on
}
  486. // analysisReportHrefs 解析列表页报告链接
  487. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  488. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  489. if e != nil {
  490. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  491. return
  492. }
  493. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  494. href, exists := s.Attr("href")
  495. if exists {
  496. title := s.Text()
  497. hrefs = append(hrefs, ReportHrefs{
  498. Title: title,
  499. Href: href,
  500. Page: page,
  501. })
  502. }
  503. })
  504. return
  505. }
  506. // extractReportPublishTime 提取报告发布时间
  507. func extractReportPublishTime(text string) (time.Time, error) {
  508. datePattern := `(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`
  509. re := regexp.MustCompile(datePattern)
  510. var strTime string
  511. match := re.FindStringSubmatch(text)
  512. if len(match) <= 0 {
  513. return time.Time{}, fmt.Errorf("没有读取出日期")
  514. }
  515. strTime = match[0]
  516. // 转为时间格式
  517. dateFormat := "2006年01月02日15:04"
  518. parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
  519. if e != nil {
  520. return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  521. }
  522. return parsedDate, nil
  523. }
  524. // calculateDataHalfVal 提取字符串中的两个数字(支持整数和浮点数),计算它们的平均值,并格式化输出
  525. func calculateDataHalfVal(duration string) (string, error) {
  526. // 支持整数和浮点数匹配
  527. re := regexp.MustCompile(`\d+(?:\.\d+)?`)
  528. matches := re.FindAllString(duration, -1)
  529. if len(matches) != 2 {
  530. return "", fmt.Errorf("未找到两个数字: %s", duration)
  531. }
  532. a, err := strconv.ParseFloat(matches[0], 64)
  533. if err != nil {
  534. return "", err
  535. }
  536. b, err := strconv.ParseFloat(matches[1], 64)
  537. if err != nil {
  538. return "", err
  539. }
  540. average := (a + b) / 2.0
  541. // 四舍五入到两位小数
  542. rounded := math.Round(average*100) / 100
  543. // 使用 Sprintf 自动判断是否显示小数位
  544. switch rounded {
  545. case math.Trunc(rounded):
  546. return fmt.Sprintf("%.0f", rounded), nil
  547. case math.Round(rounded*10) / 10:
  548. return fmt.Sprintf("%.1f", rounded), nil
  549. default:
  550. return fmt.Sprintf("%.2f", rounded), nil
  551. }
  552. }
  553. // gb2312ToPercentEncoding 中文字符转码
  554. func gb2312ToPercentEncoding(input string) (string, error) {
  555. // 创建GB18030编码转换器(兼容GB2312)
  556. encoder := simplifiedchinese.GB18030.NewEncoder()
  557. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  558. var buf bytes.Buffer
  559. writer := transform.NewWriter(&buf, encoder)
  560. _, err := writer.Write([]byte(input))
  561. if err != nil {
  562. return "", err
  563. }
  564. err = writer.Close()
  565. if err != nil {
  566. return "", err
  567. }
  568. // 将字节流转换为百分号编码
  569. percentEncoded := url.QueryEscape(buf.String())
  570. return percentEncoded, nil
  571. }
// AnalysisNoneMergeTablePars holds the input for analysisNoneMergeTable, which
// parses a simple table without merged cells.
type AnalysisNoneMergeTablePars struct {
	// DocTable is the table selection to parse.
	DocTable *goquery.Selection
	// MarketCol describes the optional market column.
	MarketCol struct {
		HasCol   bool `description:"是否有市场列"` // whether the table has a market column
		ColIndex int  `description:"市场列"`    // index of the market column
	}
	// DateCol describes the date columns of the header row.
	DateCol struct {
		StartIndex  int       `description:"日期开始列"`  // first date column
		EndIndex    int       `description:"日期结束列"`  // last date column
		PublishTime time.Time `description:"报告发布时间"` // report publish time
		//PublishYear int `description:"报告发布年份"`
		StrTimeFormat string   `description:"数据日期格式-需拼接日期列中的变量"`     // Sprintf format combining the publish year and the header cell text
		TimeFormat    []string `description:"标准日期格式, 可能存在多种分别进行遍历"` // time layouts tried in order
		SplitLast     bool     `description:"是否分隔日期: 如1.24-1.28"`   // use the last part of a date range such as 1.24-1.28
		SplitFlag     string   `description:"分隔日期分隔符: 如-"`          // separator of the date range, e.g. "-"
	}
	// ValCol describes how value cells are interpreted.
	ValCol struct {
		SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"` // take the midpoint of a range such as "8-10天"
	}
}
// TableRow is one data row read from a table.
type TableRow struct {
	Product  string            // text of the first cell
	Market   string            // text of the market column, when configured
	DateData map[string]string // formatted date -> cell value
	Unit     string
}
  600. // analysisNoneMergeTable 解析无合并单元格的简单表格
  601. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  602. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  603. return
  604. }
  605. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  606. colDate := make(map[int]string)
  607. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  608. cells := s.Find("td")
  609. // 表头取出日期
  610. if i == 0 {
  611. cells.Each(func(ii int, ss *goquery.Selection) {
  612. cellTxt := strings.TrimSpace(ss.Text())
  613. //fmt.Println("cellTxt", cellTxt)
  614. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  615. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  616. //var strTimeFormat string
  617. completeTime := cellTxt
  618. // 是否需要拼接年份
  619. if params.DateCol.StrTimeFormat != "" {
  620. strDate := cellTxt
  621. // 是否取分隔日期的后一个日期
  622. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  623. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  624. if len(dateArr) > 1 {
  625. strDate = dateArr[len(dateArr)-1]
  626. }
  627. }
  628. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  629. }
  630. //fmt.Println("completeTime: ", completeTime)
  631. // 遍历多种可能的日期格式
  632. var colTime time.Time
  633. for _, f := range params.DateCol.TimeFormat {
  634. t, e := time.ParseInLocation(f, completeTime, time.Local)
  635. if e != nil {
  636. continue
  637. }
  638. colTime = t
  639. break
  640. }
  641. // 统一判断一次, 入参的日期格式可能不全
  642. if colTime.IsZero() {
  643. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  644. for _, f := range attemptDates {
  645. t, e := time.ParseInLocation(f, completeTime, time.Local)
  646. if e != nil {
  647. continue
  648. }
  649. colTime = t
  650. break
  651. }
  652. }
  653. // 判断报告是否跨年
  654. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  655. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  656. colTime = colTime.AddDate(-1, 0, 0)
  657. }
  658. if !colTime.IsZero() {
  659. colDate[ii] = colTime.Format(utils.FormatDate)
  660. }
  661. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  662. }
  663. })
  664. }
  665. // 取指标
  666. if i > 0 {
  667. row := TableRow{
  668. DateData: make(map[string]string),
  669. }
  670. cells.Each(func(ii int, ss *goquery.Selection) {
  671. cellTxt := filterInvalidVal(ss.Text())
  672. //fmt.Println("cellTxt", cellTxt)
  673. if ii == 0 {
  674. row.Product = cellTxt
  675. }
  676. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  677. row.Market = cellTxt
  678. }
  679. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  680. d, ok := colDate[ii]
  681. if !ok {
  682. return
  683. }
  684. // 是否取折中值
  685. if params.ValCol.SplitHalfVal {
  686. val, e := calculateDataHalfVal(cellTxt)
  687. if e != nil {
  688. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  689. return
  690. }
  691. cellTxt = val
  692. }
  693. if cellTxt != "" {
  694. row.DateData[d] = cellTxt
  695. }
  696. }
  697. })
  698. //fmt.Println(row)
  699. items = append(items, row)
  700. }
  701. })
  702. return
  703. }
  704. // formatTableRow2ValidEdb 表格行转换为有效指标
  705. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  706. indexes = make([]*HandleIndexData, 0)
  707. for _, m := range edbMatch {
  708. for _, v := range rows {
  709. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  710. var productOk, marketOk, unitOk bool
  711. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  712. productOk = true
  713. }
  714. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  715. marketOk = true
  716. }
  717. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  718. unitOk = true
  719. }
  720. if productOk && marketOk && unitOk {
  721. edb := new(HandleIndexData)
  722. edb.IndexCode = m.IndexCode
  723. edb.IndexName = m.IndexName
  724. edb.ClassifyId = m.ClassifyId
  725. edb.Frequency = m.Frequency
  726. edb.Unit = m.Unit
  727. edb.DateData = v.DateData
  728. edb.TerminalCode = utils.TerminalCode
  729. indexes = append(indexes, edb)
  730. }
  731. }
  732. }
  733. return
  734. }
  735. // listFiles 列出目录下所有文件名
  736. func listFiles(dirPath string) ([]string, error) {
  737. var files []string
  738. err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
  739. if err != nil {
  740. return err
  741. }
  742. if !info.IsDir() {
  743. files = append(files, info.Name())
  744. }
  745. return nil
  746. })
  747. if err != nil {
  748. return nil, err
  749. }
  750. return files, nil
  751. }
  752. // filterInvalidVal 过滤无效值
  753. func filterInvalidVal(cellTxt string) string {
  754. cellTxt = strings.TrimSpace(cellTxt)
  755. if cellTxt == "休市" || cellTxt == "/" || cellTxt == "-" || cellTxt == "—" {
  756. return ""
  757. }
  758. return cellTxt
  759. }
  760. // formatIntervalData 格式化区间值
  761. func formatIntervalData(cellTxt, flag string) string {
  762. cellTxt = filterInvalidVal(cellTxt)
  763. if flag == "" {
  764. flag = "-"
  765. }
  766. matches := strings.Split(cellTxt, flag)
  767. if len(matches) < 2 {
  768. return cellTxt
  769. }
  770. if len(matches) != 2 {
  771. return ""
  772. }
  773. // 转换不了直接返回空值
  774. a, e := strconv.ParseFloat(matches[0], 64)
  775. if e != nil {
  776. return ""
  777. }
  778. b, e := strconv.ParseFloat(matches[1], 64)
  779. if e != nil {
  780. return ""
  781. }
  782. average := (a + b) / 2
  783. return fmt.Sprint(average)
  784. }
  785. // getCookie
  786. // @Description: 获取cookie
  787. // @author: Roc
  788. // @datetime 2024-07-09 14:00:53
  789. // @return cookieStr string
  790. // @return err error
  791. func getCookie() (cookieStr string, err error) {
  792. // 读取Cookie
  793. if utils.CCFCookieFile == "" {
  794. err = fmt.Errorf("cookie文件未配置")
  795. return
  796. }
  797. cookieByte, e := os.ReadFile(utils.CCFCookieFile)
  798. if e != nil {
  799. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  800. return
  801. }
  802. cookieStr = strings.TrimSpace(string(cookieByte))
  803. //if cookieStr == "" {
  804. // err = fmt.Errorf("cookie为空")
  805. // return
  806. //}
  807. return
  808. }
  809. // getCookieByChrome
  810. // @Description: 获取cookie
  811. // @author: Roc
  812. // @datetime 2024-07-09 14:00:53
  813. // @return cookieStr string
  814. // @return err error
  815. func getCookieByChrome() (cookieStr string, err error) {
  816. // 读取Cookie
  817. if utils.CCFUseName == "" {
  818. err = fmt.Errorf("CCF账号未设置")
  819. return
  820. }
  821. if utils.CCFPassword == "" {
  822. err = fmt.Errorf("CCF密码未设置")
  823. return
  824. }
  825. opts := append(
  826. chromedp.DefaultExecAllocatorOptions[:],
  827. chromedp.Flag("headless", false),
  828. )
  829. allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
  830. defer cancel1()
  831. // 创建chrome实例
  832. ctx, cancel2 := chromedp.NewContext(
  833. allocCtx,
  834. chromedp.WithLogf(log.Printf),
  835. )
  836. defer cancel2()
  837. err = chromedp.Run(ctx,
  838. chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
  839. chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
  840. chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
  841. chromedp.Sleep(2*time.Second),
  842. chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
  843. chromedp.Sleep(5*time.Second),
  844. chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
  845. chromedp.Sleep(2*time.Second),
  846. chromedp.ActionFunc(func(ctx context.Context) error {
  847. cookies, err := network.GetCookies().Do(ctx)
  848. if err != nil {
  849. return err
  850. }
  851. //cookieJson, err := json.Marshal(cookies)
  852. //if err != nil {
  853. // return err
  854. //}
  855. //fmt.Println("cookieJson:", string(cookieJson))
  856. //utils.FileLog.Info("cookieJson:" + string(cookieJson))
  857. for _, v := range cookies {
  858. cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
  859. }
  860. fmt.Println("header cookie:", cookieStr)
  861. utils.FileLog.Info("header cookie:" + cookieStr)
  862. tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
  863. if tmpErr != nil {
  864. fmt.Println("创建cookie文件失败:", tmpErr.Error())
  865. return nil
  866. }
  867. if _, err := tmpFile.WriteString(cookieStr); err != nil {
  868. fmt.Println("写入cookie到文件失败:", err.Error())
  869. return nil
  870. }
  871. return nil
  872. }),
  873. )
  874. //if err != nil {
  875. // fmt.Println(err)
  876. //}
  877. return
  878. }
  879. // checkIsLoginPage
  880. // @Description: 校验是否是登录页
  881. // @author: Roc
  882. // @datetime 2024-07-09 16:34:17
  883. // @param bodyStr string
  884. // @return isLoginPage bool
  885. func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
  886. // 初始化goquery.Document
  887. doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
  888. if err != nil {
  889. log.Fatal(err)
  890. }
  891. // 查找name为LoginForm的表单
  892. doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
  893. // 如果找到了,打印信息表示这是登录页
  894. //fmt.Println("这是一个登录页面")
  895. isLoginPage = true
  896. return
  897. })
  898. // 如果没有找到,打印信息表示这不是登录页
  899. //if doc.Find("form[name=LoginForm]").Length() == 0 {
  900. // fmt.Println("这不是一个登录页面")
  901. //}
  902. return
  903. }