common.go 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724
  1. package base_from_ccf
  2. import (
  3. "bytes"
  4. "compress/gzip"
  5. "encoding/json"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/PuerkitoBio/goquery"
  9. "golang.org/x/net/html/charset"
  10. "golang.org/x/text/encoding/simplifiedchinese"
  11. "golang.org/x/text/transform"
  12. "io/ioutil"
  13. "net/http"
  14. "net/url"
  15. "os"
  16. "path/filepath"
  17. "regexp"
  18. "strconv"
  19. "strings"
  20. "time"
  21. )
  22. const (
  23. CCFSearchPageUrl = "https://www.ccf.com.cn/newscenter/simplesearch.php" // CCF搜索页地址
  24. CCFReportDetailBaseUrl = "https://www.ccf.com.cn" // CCF报告详情页地址
  25. )
  26. // postEdbLib 调用指标接口
  27. func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
  28. postUrl := utils.EDB_LIB_URL + method
  29. postData, err := json.Marshal(param)
  30. if err != nil {
  31. return
  32. }
  33. result, err = httpPost(postUrl, string(postData), "application/json")
  34. if err != nil {
  35. return
  36. }
  37. return
  38. }
  39. // httpPost HTTP请求
  40. func httpPost(url, postData string, params ...string) ([]byte, error) {
  41. fmt.Println("httpPost Url:" + url)
  42. body := ioutil.NopCloser(strings.NewReader(postData))
  43. client := &http.Client{}
  44. req, err := http.NewRequest("POST", url, body)
  45. if err != nil {
  46. return nil, err
  47. }
  48. contentType := "application/x-www-form-urlencoded;charset=utf-8"
  49. if len(params) > 0 && params[0] != "" {
  50. contentType = params[0]
  51. }
  52. req.Header.Set("Content-Type", contentType)
  53. req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
  54. resp, err := client.Do(req)
  55. if err != nil {
  56. fmt.Println("client.Do err:" + err.Error())
  57. return nil, err
  58. }
  59. defer resp.Body.Close()
  60. b, err := ioutil.ReadAll(resp.Body)
  61. if err != nil {
  62. fmt.Println("httpPost:" + string(b))
  63. }
  64. return b, err
  65. }
  66. // fetchPageHtml 获取网站HTML文本
  67. func fetchPageHtml(baseUrl string) (respBody []byte, err error) {
  68. defer func() {
  69. if err != nil {
  70. tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
  71. utils.FileLog.Info(tips)
  72. fmt.Println(tips)
  73. }
  74. }()
  75. if baseUrl == "" {
  76. err = fmt.Errorf("CCF请求地址为空")
  77. return
  78. }
  79. // 读取Cookie
  80. if utils.CCFCookieFile == "" {
  81. err = fmt.Errorf("cookie文件未配置")
  82. return
  83. }
  84. cookieByte, e := ioutil.ReadFile(utils.CCFCookieFile)
  85. if e != nil {
  86. err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
  87. return
  88. }
  89. strCookie := strings.TrimSpace(string(cookieByte))
  90. if strCookie == "" {
  91. err = fmt.Errorf("cookie为空")
  92. return
  93. }
  94. // 拉取网站内容
  95. cli := new(http.Client)
  96. req, e := http.NewRequest("GET", baseUrl, nil)
  97. if e != nil {
  98. err = fmt.Errorf("")
  99. return
  100. }
  101. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
  102. req.Header.Set("Accept-Encoding", "gzip, deflate, br")
  103. req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
  104. req.Header.Set("Connection", "keep-alive")
  105. req.Header.Set("Cookie", strCookie)
  106. req.Header.Set("Host", "www.ccf.com.cn")
  107. req.Header.Set("Referer", baseUrl)
  108. req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
  109. req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
  110. req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
  111. req.Header.Set("Sec-Fetch-Dest", "empty")
  112. req.Header.Set("Sec-Fetch-Mode", "cors")
  113. req.Header.Set("Sec-Fetch-Site", "same-origin")
  114. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
  115. req.Header.Set("X-Requested-With", "XMLHttpRequest")
  116. resp, e := cli.Do(req)
  117. if e != nil {
  118. err = fmt.Errorf("HTTP client Do err: %s", e.Error())
  119. return
  120. }
  121. defer func() {
  122. _ = resp.Body.Close()
  123. }()
  124. // 读取响应的内容
  125. reader, e := gzip.NewReader(resp.Body)
  126. if e != nil {
  127. err = fmt.Errorf("gzip NewReader err: %s", e.Error())
  128. return
  129. }
  130. body, e := ioutil.ReadAll(reader)
  131. if e != nil {
  132. err = fmt.Errorf("read body err: %s", e.Error())
  133. return
  134. }
  135. // 转换编码
  136. utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
  137. if e != nil {
  138. err = fmt.Errorf("utf8 reader err: %s", e.Error())
  139. return
  140. }
  141. utf8Body, e := ioutil.ReadAll(utf8Reader)
  142. if e != nil {
  143. err = fmt.Errorf("utf8 body err: %s", e.Error())
  144. return
  145. }
  146. respBody = utf8Body
  147. return
  148. }
// DataRule is one data crawl rule, decoded from the JSON rule file by
// loadDataRule.
type DataRule struct {
	// Name is the rule key; loadDataRule returns the rule whose Name equals
	// the requested nameKey.
	Name string `json:"Name"`
	// Frequency is not read by the visible code — presumably the data
	// frequency of the rule; TODO confirm.
	Frequency string `json:"Frequency"`
	// PageDir is not read by the visible code — presumably the directory that
	// holds the saved pages; TODO confirm.
	PageDir string `json:"PageDir"`
	// Search holds the optional filters of the search request. savePageHtml
	// adds every non-empty value to the search URL as a query parameter.
	Search struct {
		ClassId      string `json:"ClassId"`
		SubClassId   string `json:"SubClassId"`
		ProductId    string `json:"ProductId"`
		SubProductId string `json:"SubProductId"`
		// SimpleTerms is converted with gb2312ToPercentEncoding before it is
		// added to the URL.
		SimpleTerms string `json:"SimpleTerms"`
	} `json:"Search"`
	// TableFetch is not read by the visible code — presumably the keyword and
	// unit used to locate tables inside a report; TODO confirm.
	TableFetch []struct {
		Keyword string `json:"Keyword"`
		Unit    string `json:"Unit"`
	} `json:"TableFetch"`
	// EdbMatch lists the indicator matching rules of this crawl rule.
	EdbMatch []DataRuleEdbMatch `json:"EdbMatch"`
	// StockTable is not read by the visible code; TODO confirm its purpose.
	StockTable struct {
		ClassifyId int `json:"ClassifyId"`
	} `json:"StockTable"`
}
// DataRuleEdbMatch is the indicator-matching part of a data crawl rule.
//
// formatTableRow2ValidEdb compares Product, Market and MatchUnit against a
// table row and, on a match, copies IndexCode, IndexName, ClassifyId,
// Frequency and Unit into the resulting indicator.
type DataRuleEdbMatch struct {
	IndexCode  string `json:"IndexCode"`
	IndexName  string `json:"IndexName"`
	ClassifyId int    `json:"ClassifyId"`
	Frequency  string `json:"Frequency"`
	// Product must be contained in the row's Product (or both must be empty).
	Product string `json:"Product"`
	// Market must be contained in the row's Market (or both must be empty).
	Market string `json:"Market"`
	// MatchUnit must be contained in the row's Unit (or both must be empty).
	MatchUnit string `json:"MatchUnit" description:"匹配单位"`
	// Unit is the unit stored on the resulting indicator.
	Unit string `json:"Unit" description:"实际单位"`
}
  181. // loadDataRule 从配置中读取爬取规则
  182. func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
  183. if utils.CCFDataRuleFile == "" {
  184. err = fmt.Errorf("rule文件不存在")
  185. return
  186. }
  187. b, e := ioutil.ReadFile(utils.CCFDataRuleFile)
  188. if e != nil {
  189. err = fmt.Errorf("读取rule文件失败, err: %v", e)
  190. return
  191. }
  192. rules := make([]*DataRule, 0)
  193. if e = json.Unmarshal(b, &rules); e != nil {
  194. err = fmt.Errorf("解析rule文件失败, err: %v", e)
  195. return
  196. }
  197. for _, v := range rules {
  198. if v.Name != "" && v.Name == nameKey {
  199. fetchRule = v
  200. return
  201. }
  202. }
  203. err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
  204. return
  205. }
  206. // savePageHtml 拉取历史报告详情
  207. func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
  208. if nameKey == "" {
  209. return
  210. }
  211. defer func() {
  212. if err != nil {
  213. tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
  214. utils.FileLog.Info(tips)
  215. fmt.Println(tips)
  216. }
  217. }()
  218. fetchRule, e := loadDataRule(nameKey)
  219. if e != nil {
  220. err = fmt.Errorf("loadDataRule, err: %v", e)
  221. return
  222. }
  223. if saveDir == "" {
  224. saveDir = "static/ccf"
  225. }
  226. // 获取品种第一页
  227. baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
  228. if fetchRule.Search.ClassId != "" {
  229. baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
  230. }
  231. if fetchRule.Search.SubClassId != "" {
  232. baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
  233. }
  234. if fetchRule.Search.ProductId != "" {
  235. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
  236. }
  237. if fetchRule.Search.SubProductId != "" {
  238. baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
  239. }
  240. if fetchRule.Search.SimpleTerms != "" {
  241. termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
  242. if e != nil {
  243. err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
  244. return
  245. }
  246. baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
  247. }
  248. firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
  249. // 首页报告链接
  250. firstHtml, e := fetchPageHtml(firstPage)
  251. if e != nil {
  252. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  253. return
  254. }
  255. firstHrefs, e := analysisReportHrefs(firstHtml, 1)
  256. if e != nil {
  257. err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
  258. return
  259. }
  260. var historyHrefs []ReportHrefs
  261. historyHrefs = append(historyHrefs, firstHrefs...)
  262. ticker := time.NewTicker(5 * time.Second)
  263. defer ticker.Stop()
  264. // 历史报告
  265. if historyPage {
  266. endPage, e := analysisEndPage(firstHtml)
  267. if e != nil {
  268. err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
  269. return
  270. }
  271. if endPage > 1 {
  272. for i := 2; i <= endPage; i++ {
  273. <-ticker.C
  274. fmt.Printf("开始读取历史页%d\n", i)
  275. // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
  276. pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
  277. fmt.Println("pageUrl: ", pageUrl)
  278. pageContents, e := fetchPageHtml(pageUrl)
  279. if e != nil {
  280. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  281. return
  282. }
  283. pageHrefs, e := analysisReportHrefs(pageContents, i)
  284. if e != nil {
  285. err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
  286. return
  287. }
  288. historyHrefs = append(historyHrefs, pageHrefs...)
  289. fmt.Printf("结束读取历史页%d\n", i)
  290. }
  291. }
  292. fmt.Println("endPage: ", endPage)
  293. }
  294. fmt.Println("historyHrefs len: ", len(historyHrefs))
  295. fmt.Println("historyHrefs: ", historyHrefs)
  296. // 拉取报告留档
  297. strDate := time.Now().Format("20060102")
  298. reportCount := 0
  299. for _, v := range historyHrefs {
  300. <-ticker.C
  301. if reportMax > 0 {
  302. reportCount += 1
  303. if reportCount > reportMax {
  304. break
  305. }
  306. }
  307. fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
  308. htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href))
  309. if e != nil {
  310. err = fmt.Errorf("获取首页HTML失败, err: %v", e)
  311. return
  312. }
  313. outputPath := fmt.Sprintf("%s/%d-%s-%s.html", saveDir, v.Page, strDate, v.Title)
  314. if e = writeHTMLToFile(string(htm), outputPath); e != nil {
  315. fmt.Printf("写入html出错, err: %v", e)
  316. continue
  317. }
  318. files = append(files, outputPath)
  319. }
  320. fmt.Println("拉取报告 end")
  321. return
  322. }
  323. // analysisEndPage 读取列表页最后一页页码
  324. func analysisEndPage(contents []byte) (endPage int, err error) {
  325. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  326. if e != nil {
  327. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  328. return
  329. }
  330. // 查找页码元素并遍历a标签
  331. sectionDigg := doc.Find(".digg")
  332. aElements := sectionDigg.Find("a")
  333. // 获取倒数第二个a标签中的页码
  334. totalAElements := aElements.Length()
  335. targetIndex := totalAElements - 2
  336. if targetIndex >= 0 && targetIndex < totalAElements {
  337. targetA := aElements.Eq(targetIndex)
  338. txt := targetA.Text()
  339. endPage, e = strconv.Atoi(txt)
  340. if e != nil {
  341. err = fmt.Errorf("页码文本有误, %s", txt)
  342. return
  343. }
  344. fmt.Println(endPage)
  345. return
  346. }
  347. endPage = 1
  348. return
  349. }
// ReportHrefs is one report link taken from a list page.
type ReportHrefs struct {
	// Title is the text of the link element.
	Title string `description:"报告标题"`
	// Href is the value of the link's href attribute.
	Href string `description:"报告详情链接"`
	// Page is the number of the list page the link was found on.
	Page int `description:"页码"`
}
  356. // analysisReportHrefs 解析列表页报告链接
  357. func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
  358. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
  359. if e != nil {
  360. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  361. return
  362. }
  363. doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
  364. href, exists := s.Attr("href")
  365. if exists {
  366. title := s.Text()
  367. hrefs = append(hrefs, ReportHrefs{
  368. Title: title,
  369. Href: href,
  370. Page: page,
  371. })
  372. }
  373. })
  374. return
  375. }
  376. // writeHTMLToFile 将HTML内容写入指定的文件中
  377. func writeHTMLToFile(content string, filePath string) error {
  378. // 使用os.Create创建文件,如果文件已存在则会被截断
  379. file, err := os.Create(filePath)
  380. if err != nil {
  381. return err
  382. }
  383. defer file.Close()
  384. // 将HTML内容写入文件
  385. _, err = file.WriteString(content)
  386. if err != nil {
  387. return err
  388. }
  389. return nil
  390. }
  391. // extractReportPublishTime 提取报告发布时间
  392. func extractReportPublishTime(text string) (time.Time, error) {
  393. datePattern := `(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`
  394. re := regexp.MustCompile(datePattern)
  395. var strTime string
  396. match := re.FindStringSubmatch(text)
  397. if len(match) <= 0 {
  398. return time.Time{}, fmt.Errorf("没有读取出日期")
  399. }
  400. strTime = match[0]
  401. // 转为时间格式
  402. dateFormat := "2006年01月02日15:04"
  403. parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
  404. if e != nil {
  405. return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
  406. }
  407. return parsedDate, nil
  408. }
  409. // calculateDataHalfVal 取出数据区间的折中值, 如"7-9天"返回结果为"8"
  410. func calculateDataHalfVal(duration string) (result string, err error) {
  411. re := regexp.MustCompile(`\d+`)
  412. matches := re.FindAllString(duration, -1)
  413. if len(matches) != 2 {
  414. err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
  415. return
  416. }
  417. a, e := strconv.Atoi(matches[0])
  418. if e != nil {
  419. err = e
  420. return
  421. }
  422. b, e := strconv.Atoi(matches[1])
  423. if e != nil {
  424. err = e
  425. return
  426. }
  427. average := float64(a+b) / 2.0
  428. // 格式化结果
  429. if average == float64(int(average)) {
  430. result = strconv.Itoa(int(average))
  431. } else {
  432. result = fmt.Sprintf("%.1f", average)
  433. }
  434. return
  435. }
  436. // gb2312ToPercentEncoding 中文字符转码
  437. func gb2312ToPercentEncoding(input string) (string, error) {
  438. // 创建GB18030编码转换器(兼容GB2312)
  439. encoder := simplifiedchinese.GB18030.NewEncoder()
  440. // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
  441. var buf bytes.Buffer
  442. writer := transform.NewWriter(&buf, encoder)
  443. _, err := writer.Write([]byte(input))
  444. if err != nil {
  445. return "", err
  446. }
  447. err = writer.Close()
  448. if err != nil {
  449. return "", err
  450. }
  451. // 将字节流转换为百分号编码
  452. percentEncoded := url.QueryEscape(buf.String())
  453. return percentEncoded, nil
  454. }
// AnalysisNoneMergeTablePars is the input of analysisNoneMergeTable, which
// parses a simple table without merged cells.
type AnalysisNoneMergeTablePars struct {
	// DocTable is the table selection; the children of its tbody are the rows.
	DocTable *goquery.Selection
	// MarketCol describes the optional market column: when HasCol is set, the
	// cell at ColIndex is stored as the row's Market.
	MarketCol struct {
		HasCol   bool `description:"是否有市场列"`
		ColIndex int  `description:"市场列"`
	}
	// DateCol describes the date columns. Cells whose index lies in
	// [StartIndex, EndIndex] are dates in the first row and values in the
	// following rows.
	DateCol struct {
		StartIndex int `description:"日期开始列"`
		EndIndex   int `description:"日期结束列"`
		// PublishTime supplies the year inserted through StrTimeFormat and is
		// the reference of the cross-year check.
		PublishTime time.Time `description:"报告发布时间"`
		// (disabled) PublishYear int — report publish year
		// StrTimeFormat, when non-empty, is a fmt format that receives the
		// publish year (int) and the header text (string).
		StrTimeFormat string `description:"数据日期格式-需拼接日期列中的变量"`
		// TimeFormat lists the time layouts tried in order on the header date.
		TimeFormat []string `description:"标准日期格式, 可能存在多种分别进行遍历"`
		// SplitLast and SplitFlag: when both are set (and StrTimeFormat is
		// non-empty) the header is split on SplitFlag and the last part is used.
		SplitLast bool   `description:"是否分隔日期: 如1.24-1.28"`
		SplitFlag string `description:"分隔日期分隔符: 如-"`
	}
	// ValCol controls value handling: with SplitHalfVal set, each value is
	// replaced by the result of calculateDataHalfVal.
	ValCol struct {
		SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"`
	}
}
// TableRow is the information read from one table row.
type TableRow struct {
	// Product is the text of the row's first cell.
	Product string
	// Market is the text of the market column, when one is configured.
	Market string
	// DateData maps a column date (formatted with utils.FormatDate) to the
	// cell value of that column; empty values are left out.
	DateData map[string]string
	// Unit is not filled by analysisNoneMergeTable; formatTableRow2ValidEdb
	// compares it with the rule's MatchUnit.
	Unit string
}
  483. // analysisNoneMergeTable 解析无合并单元格的简单表格
  484. func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
  485. if params.DocTable != nil && params.DocTable.Length() <= 0 {
  486. return
  487. }
  488. attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
  489. colDate := make(map[int]string)
  490. params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  491. cells := s.Find("td")
  492. // 表头取出日期
  493. if i == 0 {
  494. cells.Each(func(ii int, ss *goquery.Selection) {
  495. cellTxt := strings.TrimSpace(ss.Text())
  496. //fmt.Println("cellTxt", cellTxt)
  497. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  498. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  499. //var strTimeFormat string
  500. completeTime := cellTxt
  501. // 是否需要拼接年份
  502. if params.DateCol.StrTimeFormat != "" {
  503. strDate := cellTxt
  504. // 是否取分隔日期的后一个日期
  505. if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
  506. dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
  507. if len(dateArr) > 1 {
  508. strDate = dateArr[len(dateArr)-1]
  509. }
  510. }
  511. completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
  512. }
  513. //fmt.Println("completeTime: ", completeTime)
  514. // 遍历多种可能的日期格式
  515. var colTime time.Time
  516. for _, f := range params.DateCol.TimeFormat {
  517. t, e := time.ParseInLocation(f, completeTime, time.Local)
  518. if e != nil {
  519. continue
  520. }
  521. colTime = t
  522. break
  523. }
  524. // 统一判断一次, 入参的日期格式可能不全
  525. if colTime.IsZero() {
  526. utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
  527. for _, f := range attemptDates {
  528. t, e := time.ParseInLocation(f, completeTime, time.Local)
  529. if e != nil {
  530. continue
  531. }
  532. colTime = t
  533. break
  534. }
  535. }
  536. // 判断报告是否跨年
  537. if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
  538. utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
  539. colTime = colTime.AddDate(-1, 0, 0)
  540. }
  541. if !colTime.IsZero() {
  542. colDate[ii] = colTime.Format(utils.FormatDate)
  543. }
  544. fmt.Println("日期:", colTime.Format(utils.FormatDate))
  545. }
  546. })
  547. }
  548. // 取指标
  549. if i > 0 {
  550. row := TableRow{
  551. DateData: make(map[string]string),
  552. }
  553. cells.Each(func(ii int, ss *goquery.Selection) {
  554. cellTxt := filterInvalidVal(ss.Text())
  555. //fmt.Println("cellTxt", cellTxt)
  556. if ii == 0 {
  557. row.Product = cellTxt
  558. }
  559. if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
  560. row.Market = cellTxt
  561. }
  562. if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
  563. d, ok := colDate[ii]
  564. if !ok {
  565. return
  566. }
  567. // 是否取折中值
  568. if params.ValCol.SplitHalfVal {
  569. val, e := calculateDataHalfVal(cellTxt)
  570. if e != nil {
  571. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  572. return
  573. }
  574. cellTxt = val
  575. }
  576. if cellTxt != "" {
  577. row.DateData[d] = cellTxt
  578. }
  579. }
  580. })
  581. //fmt.Println(row)
  582. items = append(items, row)
  583. }
  584. })
  585. return
  586. }
  587. // formatTableRow2ValidEdb 表格行转换为有效指标
  588. func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
  589. indexes = make([]*HandleIndexData, 0)
  590. for _, m := range edbMatch {
  591. for _, v := range rows {
  592. fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
  593. var productOk, marketOk, unitOk bool
  594. if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
  595. productOk = true
  596. }
  597. if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
  598. marketOk = true
  599. }
  600. if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
  601. unitOk = true
  602. }
  603. if productOk && marketOk && unitOk {
  604. edb := new(HandleIndexData)
  605. edb.IndexCode = m.IndexCode
  606. edb.IndexName = m.IndexName
  607. edb.ClassifyId = m.ClassifyId
  608. edb.Frequency = m.Frequency
  609. edb.Unit = m.Unit
  610. edb.DateData = v.DateData
  611. edb.TerminalCode = utils.TerminalCode
  612. indexes = append(indexes, edb)
  613. }
  614. }
  615. }
  616. return
  617. }
  618. // listFiles 列出目录下所有文件名
  619. func listFiles(dirPath string) ([]string, error) {
  620. var files []string
  621. err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
  622. if err != nil {
  623. return err
  624. }
  625. if !info.IsDir() {
  626. files = append(files, info.Name())
  627. }
  628. return nil
  629. })
  630. if err != nil {
  631. return nil, err
  632. }
  633. return files, nil
  634. }
  635. // filterInvalidVal 过滤无效值
  636. func filterInvalidVal(cellTxt string) string {
  637. cellTxt = strings.TrimSpace(cellTxt)
  638. if cellTxt == "休市" || cellTxt == "/" || cellTxt == "-" || cellTxt == "—" {
  639. return ""
  640. }
  641. return cellTxt
  642. }
  643. // formatIntervalData 格式化区间值
  644. func formatIntervalData(cellTxt, flag string) string {
  645. cellTxt = filterInvalidVal(cellTxt)
  646. if flag == "" {
  647. flag = "-"
  648. }
  649. matches := strings.Split(cellTxt, flag)
  650. if len(matches) < 2 {
  651. return cellTxt
  652. }
  653. if len(matches) != 2 {
  654. return ""
  655. }
  656. // 转换不了直接返回空值
  657. a, e := strconv.ParseFloat(matches[0], 64)
  658. if e != nil {
  659. return ""
  660. }
  661. b, e := strconv.ParseFloat(matches[1], 64)
  662. if e != nil {
  663. return ""
  664. }
  665. average := (a + b) / 2
  666. return fmt.Sprint(average)
  667. }