12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863 |
- package base_from_ccf
- import (
- "bytes"
- "compress/gzip"
- "context"
- "encoding/json"
- "eta/eta_data_analysis/utils"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "github.com/chromedp/cdproto/network"
- "github.com/chromedp/chromedp"
- "golang.org/x/net/html/charset"
- "golang.org/x/text/encoding/simplifiedchinese"
- "golang.org/x/text/transform"
- "io"
- "log"
- "net/http"
- "net/url"
- "os"
- "path/filepath"
- "regexp"
- "strconv"
- "strings"
- "time"
- )
- const (
- CCFSearchPageUrl = "https://www.ccf.com.cn/newscenter/simplesearch.php" // URL of the CCF search (report list) page
- CCFReportDetailBaseUrl = "https://www.ccf.com.cn" // base URL prepended to report detail hrefs
- )
- // postEdbLib 调用指标接口
- func postEdbLib(param map[string]interface{}, method string) (result []byte, err error) {
- postUrl := utils.EDB_LIB_URL + method
- postData, err := json.Marshal(param)
- if err != nil {
- return
- }
- result, err = httpPost(postUrl, string(postData), "application/json")
- if err != nil {
- return
- }
- return
- }
- // httpPost HTTP请求
- func httpPost(url, postData string, params ...string) ([]byte, error) {
- fmt.Println("httpPost Url:" + url)
- body := io.NopCloser(strings.NewReader(postData))
- client := &http.Client{}
- req, err := http.NewRequest("POST", url, body)
- if err != nil {
- return nil, err
- }
- contentType := "application/x-www-form-urlencoded;charset=utf-8"
- if len(params) > 0 && params[0] != "" {
- contentType = params[0]
- }
- req.Header.Set("Content-Type", contentType)
- req.Header.Set("authorization", utils.MD5(utils.APP_EDB_LIB_NAME_EN+utils.EDB_LIB_Md5_KEY))
- resp, err := client.Do(req)
- if err != nil {
- fmt.Println("client.Do err:" + err.Error())
- return nil, err
- }
- defer func() {
- _ = resp.Body.Close()
- }()
- b, err := io.ReadAll(resp.Body)
- if err != nil {
- fmt.Println("httpPost:" + string(b))
- }
- return b, err
- }
- // fetchPageHtml 获取网站HTML文本
- func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
- defer func() {
- if err != nil {
- tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
- utils.FileLog.Info(tips)
- fmt.Println(tips)
- }
- }()
- // 查询次数
- fetchNum++
- if baseUrl == "" {
- err = fmt.Errorf("CCF请求地址为空")
- return
- }
- // 获取Cookie
- strCookie, e := getCookie()
- if e != nil {
- err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
- return
- }
- if strCookie == "" && fetchNum < 2 {
- fmt.Printf("文件cookie为空, 重新获取, fetchNum: %d\n", fetchNum)
- utils.FileLog.Info(fmt.Sprintf("文件cookie为空, 重新获取, fetchNum: %d", fetchNum))
- _, err = getCookieByChrome()
- if err != nil {
- return
- }
- return fetchPageHtml(baseUrl, fetchNum)
- }
- // 拉取网站内容
- cli := new(http.Client)
- req, e := http.NewRequest("GET", baseUrl, nil)
- if e != nil {
- err = fmt.Errorf("")
- return
- }
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
- req.Header.Set("Accept-Encoding", "gzip, deflate, br")
- req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
- req.Header.Set("Connection", "keep-alive")
- req.Header.Set("Cookie", strCookie)
- req.Header.Set("Host", "www.ccf.com.cn")
- req.Header.Set("Referer", baseUrl)
- req.Header.Set("Sec-Ch-Ua", "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"")
- req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
- req.Header.Set("Sec-Ch-Ua-Platform", "\"Windows\"")
- req.Header.Set("Sec-Fetch-Dest", "empty")
- req.Header.Set("Sec-Fetch-Mode", "cors")
- req.Header.Set("Sec-Fetch-Site", "same-origin")
- req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0")
- req.Header.Set("X-Requested-With", "XMLHttpRequest")
- resp, e := cli.Do(req)
- if e != nil {
- err = fmt.Errorf("HTTP client Do err: %s", e.Error())
- return
- }
- defer func() {
- _ = resp.Body.Close()
- }()
- // 读取响应的内容
- reader, e := gzip.NewReader(resp.Body)
- if e != nil {
- err = fmt.Errorf("gzip NewReader err: %s", e.Error())
- return
- }
- body, e := io.ReadAll(reader)
- if e != nil {
- err = fmt.Errorf("read body err: %s", e.Error())
- return
- }
- // 转换编码
- utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(body))
- if e != nil {
- err = fmt.Errorf("utf8 reader err: %s", e.Error())
- return
- }
- utf8Body, e := io.ReadAll(utf8Reader)
- if e != nil {
- err = fmt.Errorf("utf8 body err: %s", e.Error())
- return
- }
- respBody = utf8Body
- isLoginPage := checkIsLoginPage(string(respBody))
- fmt.Println("是否登录页:", isLoginPage)
- // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
- if isLoginPage && fetchNum < 2 {
- _, err = getCookieByChrome()
- if err != nil {
- return
- }
- return fetchPageHtml(baseUrl, fetchNum)
- }
- return
- }
- // DataRule is one crawl-rule entry decoded from the JSON rule file read by loadDataRule.
- type DataRule struct {
- Name string `json:"Name"` // rule key; loadDataRule returns the entry whose Name equals nameKey
- Frequency string `json:"Frequency"`
- PageDir string `json:"PageDir"`
- Search struct { // list-page search filters; savePageHtml appends every non-empty field to the query string
- ClassId string `json:"ClassId"` // sent as the ClassID query parameter
- SubClassId string `json:"SubClassId"` // sent as the SubClassID query parameter
- ProductId string `json:"ProductId"` // sent as the ProductID query parameter
- SubProductId string `json:"SubProductId"` // savePageHtml also sends this one under the ProductID key
- SimpleTerms string `json:"SimpleTerms"` // sent as simpleterms after gb2312ToPercentEncoding
- } `json:"Search"`
- TableFetch []struct {
- Keyword string `json:"Keyword"`
- Unit string `json:"Unit"`
- } `json:"TableFetch"`
- EdbMatch []DataRuleEdbMatch `json:"EdbMatch"`
- StockTable struct {
- ClassifyId int `json:"ClassifyId"`
- } `json:"StockTable"`
- }
- // DataRuleEdbMatch is the index-matching part of a crawl rule; formatTableRow2ValidEdb compares it with parsed table rows.
- type DataRuleEdbMatch struct {
- IndexCode string `json:"IndexCode"` // copied to the resulting HandleIndexData
- IndexName string `json:"IndexName"` // copied to the resulting HandleIndexData
- ClassifyId int `json:"ClassifyId"` // copied to the resulting HandleIndexData
- Frequency string `json:"Frequency"` // copied to the resulting HandleIndexData
- Product string `json:"Product"` // must be contained in the row's Product; empty matches only an empty Product
- Market string `json:"Market"` // must be contained in the row's Market; empty matches only an empty Market
- MatchUnit string `json:"MatchUnit" description:"匹配单位"` // unit used for matching: must be contained in the row's Unit
- Unit string `json:"Unit" description:"实际单位"` // actual unit written to the resulting HandleIndexData
- }
- // loadDataRule 从配置中读取爬取规则
- func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
- if utils.CCFDataRuleFile == "" {
- err = fmt.Errorf("rule文件不存在")
- return
- }
- b, e := os.ReadFile(utils.CCFDataRuleFile)
- if e != nil {
- err = fmt.Errorf("读取rule文件失败, err: %v", e)
- return
- }
- rules := make([]*DataRule, 0)
- if e = json.Unmarshal(b, &rules); e != nil {
- err = fmt.Errorf("解析rule文件失败, err: %v", e)
- return
- }
- for _, v := range rules {
- if v.Name != "" && v.Name == nameKey {
- fetchRule = v
- return
- }
- }
- err = fmt.Errorf("rule不存在, nameKey: %s", nameKey)
- return
- }
- // savePageHtml 拉取历史报告详情
- func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (files []string, err error) {
- if nameKey == "" {
- return
- }
- defer func() {
- if err != nil {
- tips := fmt.Sprintf("GetCCFOilEdbHistory ErrMsg: %s", err.Error())
- utils.FileLog.Info(tips)
- fmt.Println(tips)
- }
- }()
- fetchRule, e := loadDataRule(nameKey)
- if e != nil {
- err = fmt.Errorf("loadDataRule, err: %v", e)
- return
- }
- if saveDir == "" {
- saveDir = "static/ccf"
- }
- // 获取品种第一页
- baseUrl := fmt.Sprintf(`%s?newssubmit=1&sitename=localhost`, CCFSearchPageUrl)
- if fetchRule.Search.ClassId != "" {
- baseUrl = fmt.Sprintf(`%s&ClassID=%s`, baseUrl, fetchRule.Search.ClassId)
- }
- if fetchRule.Search.SubClassId != "" {
- baseUrl = fmt.Sprintf(`%s&SubClassID=%s`, baseUrl, fetchRule.Search.SubClassId)
- }
- if fetchRule.Search.ProductId != "" {
- baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.ProductId)
- }
- if fetchRule.Search.SubProductId != "" {
- baseUrl = fmt.Sprintf(`%s&ProductID=%s`, baseUrl, fetchRule.Search.SubProductId)
- }
- if fetchRule.Search.SimpleTerms != "" {
- termsEncode, e := gb2312ToPercentEncoding(fetchRule.Search.SimpleTerms)
- if e != nil {
- err = fmt.Errorf("gb2312ToPercentEncoding err: %v", e)
- return
- }
- baseUrl = fmt.Sprintf(`%s&simpleterms=%s`, baseUrl, termsEncode)
- }
- firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
- // 首页报告链接
- firstHtml, e := fetchPageHtml(firstPage, 0)
- if e != nil {
- err = fmt.Errorf("获取首页HTML失败, err: %v", e)
- return
- }
- firstHrefs, e := analysisReportHrefs(firstHtml, 1)
- if e != nil {
- err = fmt.Errorf("读取首页列表报告链接失败, err: %v", e)
- return
- }
- var historyHrefs []ReportHrefs
- historyHrefs = append(historyHrefs, firstHrefs...)
- ticker := time.NewTicker(5 * time.Second)
- defer ticker.Stop()
- // 历史报告
- if historyPage {
- endPage, e := analysisEndPage(firstHtml)
- if e != nil {
- err = fmt.Errorf("解析首页最后页码失败, err: %v", e)
- return
- }
- if endPage > 1 {
- for i := 2; i <= endPage; i++ {
- <-ticker.C
- fmt.Printf("开始读取历史页%d\n", i)
- // 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
- pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
- fmt.Println("pageUrl: ", pageUrl)
- pageContents, e := fetchPageHtml(pageUrl, 0)
- if e != nil {
- err = fmt.Errorf("获取首页HTML失败, err: %v", e)
- return
- }
- pageHrefs, e := analysisReportHrefs(pageContents, i)
- if e != nil {
- err = fmt.Errorf("读取第%d页列表报告链接失败, err: %v", i, e)
- return
- }
- historyHrefs = append(historyHrefs, pageHrefs...)
- fmt.Printf("结束读取历史页%d\n", i)
- }
- }
- fmt.Println("endPage: ", endPage)
- }
- fmt.Println("historyHrefs len: ", len(historyHrefs))
- fmt.Println("historyHrefs: ", historyHrefs)
- // 拉取报告留档
- strDate := time.Now().Format("20060102")
- reportCount := 0
- for _, v := range historyHrefs {
- <-ticker.C
- if reportMax > 0 {
- reportCount += 1
- if reportCount > reportMax {
- break
- }
- }
- fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
- htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
- if e != nil {
- utils.FileLog.Info("获取页面失败, err: %v", e)
- continue
- }
- dateDir := fmt.Sprintf("%s/%s", saveDir, strDate)
- if e = utils.MkDir(dateDir); e != nil {
- utils.FileLog.Info(fmt.Sprintf("创建目录失败, err: %v", e))
- continue
- }
- outputPath := fmt.Sprintf("%s/%d-%s.html", dateDir, v.Page, v.Title)
- if e = utils.WriteHTMLToFile(string(htm), outputPath); e != nil {
- utils.FileLog.Info(fmt.Sprintf("写入HTML出错, err: %v", e))
- continue
- }
- files = append(files, outputPath)
- }
- fmt.Println("拉取报告 end")
- return
- }
- // analysisEndPage 读取列表页最后一页页码
- func analysisEndPage(contents []byte) (endPage int, err error) {
- doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
- if e != nil {
- err = fmt.Errorf("NewDocumentFromReader err: %v", e)
- return
- }
- // 查找页码元素并遍历a标签
- sectionDigg := doc.Find(".digg")
- aElements := sectionDigg.Find("a")
- // 获取倒数第二个a标签中的页码
- totalAElements := aElements.Length()
- targetIndex := totalAElements - 2
- if targetIndex >= 0 && targetIndex < totalAElements {
- targetA := aElements.Eq(targetIndex)
- txt := targetA.Text()
- endPage, e = strconv.Atoi(txt)
- if e != nil {
- err = fmt.Errorf("页码文本有误, %s", txt)
- return
- }
- fmt.Println(endPage)
- return
- }
- endPage = 1
- return
- }
- // ReportHrefs is one report link collected from a list page by analysisReportHrefs.
- type ReportHrefs struct {
- Title string `description:"报告标题"` // report title: text of the link element
- Href string `description:"报告详情链接"` // report detail link: the href attribute, appended to CCFReportDetailBaseUrl when fetched
- Page int `description:"页码"` // number of the list page the link was found on
- }
- // analysisReportHrefs 解析列表页报告链接
- func analysisReportHrefs(contents []byte, page int) (hrefs []ReportHrefs, err error) {
- doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(contents)))
- if e != nil {
- err = fmt.Errorf("NewDocumentFromReader err: %v", e)
- return
- }
- doc.Find("ul.newslist li a").Each(func(_ int, s *goquery.Selection) {
- href, exists := s.Attr("href")
- if exists {
- title := s.Text()
- hrefs = append(hrefs, ReportHrefs{
- Title: title,
- Href: href,
- Page: page,
- })
- }
- })
- return
- }
// reportPublishTimeRe matches a publish timestamp such as "2024年7月9日14:00".
var reportPublishTimeRe = regexp.MustCompile(`(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{2})`)

// extractReportPublishTime returns the first publish timestamp found in text,
// interpreted in the local time zone.
//
// The expected form is "<yyyy>年<m>月<d>日<h>:<mm>"; month, day and hour may
// have one or two digits. An error is returned when no timestamp is present or
// the matched text is not a valid date.
func extractReportPublishTime(text string) (time.Time, error) {
	strTime := reportPublishTimeRe.FindString(text)
	if strTime == "" {
		return time.Time{}, fmt.Errorf("没有读取出日期")
	}

	// "1" and "2" accept both one- and two-digit month/day values, whereas
	// "01"/"02" would reject the single-digit values the pattern allows.
	const dateFormat = "2006年1月2日15:04"
	parsedDate, e := time.ParseInLocation(dateFormat, strTime, time.Local)
	if e != nil {
		return time.Time{}, fmt.Errorf("日期转换失败, str: %s, err: %v", strTime, e)
	}
	return parsedDate, nil
}
// halfValNumberRe matches an unsigned integer or decimal number.
var halfValNumberRe = regexp.MustCompile(`\d+(?:\.\d+)?`)

// calculateDataHalfVal returns the midpoint of a numeric range embedded in
// duration, e.g. "7-9天" yields "8" and "8-11天" yields "9.5".
//
// duration must contain exactly two numbers (integers or decimals); anything
// else is reported as an error. The result is formatted with the shortest
// decimal representation, so whole midpoints carry no fractional part.
func calculateDataHalfVal(duration string) (result string, err error) {
	matches := halfValNumberRe.FindAllString(duration, -1)
	if len(matches) != 2 {
		err = fmt.Errorf("未找到两个数字, Num: %d", len(matches))
		return
	}

	low, e := strconv.ParseFloat(matches[0], 64)
	if e != nil {
		err = e
		return
	}
	high, e := strconv.ParseFloat(matches[1], 64)
	if e != nil {
		err = e
		return
	}

	result = strconv.FormatFloat((low+high)/2, 'f', -1, 64)
	return
}
- // gb2312ToPercentEncoding 中文字符转码
- func gb2312ToPercentEncoding(input string) (string, error) {
- // 创建GB18030编码转换器(兼容GB2312)
- encoder := simplifiedchinese.GB18030.NewEncoder()
- // 使用转换器将字符串转换为GB18030编码的字节流,并写入bytes.Buffer
- var buf bytes.Buffer
- writer := transform.NewWriter(&buf, encoder)
- _, err := writer.Write([]byte(input))
- if err != nil {
- return "", err
- }
- err = writer.Close()
- if err != nil {
- return "", err
- }
- // 将字节流转换为百分号编码
- percentEncoded := url.QueryEscape(buf.String())
- return percentEncoded, nil
- }
- // AnalysisNoneMergeTablePars holds the input of analysisNoneMergeTable, which parses a simple table without merged cells.
- type AnalysisNoneMergeTablePars struct {
- DocTable *goquery.Selection // table selection; its tbody children are iterated as rows
- MarketCol struct {
- HasCol bool `description:"是否有市场列"` // whether the table has a market column
- ColIndex int `description:"市场列"` // index of the market column
- }
- DateCol struct {
- StartIndex int `description:"日期开始列"` // first date column (inclusive)
- EndIndex int `description:"日期结束列"` // last date column (inclusive)
- PublishTime time.Time `description:"报告发布时间"` // report publish time; its year is inserted via StrTimeFormat and it drives the cross-year check
- //PublishYear int `description:"report publish year"`
- StrTimeFormat string `description:"数据日期格式-需拼接日期列中的变量"` // fmt format applied to (publish year, header cell text) to build the full date string
- TimeFormat []string `description:"标准日期格式, 可能存在多种分别进行遍历"` // time layouts tried in order on the full date string
- SplitLast bool `description:"是否分隔日期: 如1.24-1.28"` // take the last part of a split header date such as 1.24-1.28
- SplitFlag string `description:"分隔日期分隔符: 如-"` // separator used when SplitLast is set, e.g. "-"
- }
- ValCol struct {
- SplitHalfVal bool `description:"是否取折中值: 如8-10天, 9-12天"` // replace range values such as "8-10天" with their midpoint
- }
- }
- // TableRow is one data row produced by analysisNoneMergeTable.
- type TableRow struct {
- Product string // text of the row's first cell
- Market string // text of the market column, when one is configured
- DateData map[string]string // cell value keyed by the formatted date of its column
- Unit string // not set by analysisNoneMergeTable; compared with MatchUnit in formatTableRow2ValidEdb
- }
- // analysisNoneMergeTable 解析无合并单元格的简单表格
- func analysisNoneMergeTable(params AnalysisNoneMergeTablePars) (items []TableRow) {
- if params.DocTable != nil && params.DocTable.Length() <= 0 {
- return
- }
- attemptDates := []string{"2006/1/2", "2006/01/02", "2006/01/2", "2006/1/02", "2006-1-2", "2006-01-02", "2006-01-2", "2006-1-02", "2006.01.02", "2006.1.2", "2006.1.02", "2006.01.2", "2006年01月02日", "2006年1月2日", "2006年1月02日", "2006年01月2日"}
- colDate := make(map[int]string)
- params.DocTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
- cells := s.Find("td")
- // 表头取出日期
- if i == 0 {
- cells.Each(func(ii int, ss *goquery.Selection) {
- cellTxt := strings.TrimSpace(ss.Text())
- //fmt.Println("cellTxt", cellTxt)
- if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
- //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
- //var strTimeFormat string
- completeTime := cellTxt
- // 是否需要拼接年份
- if params.DateCol.StrTimeFormat != "" {
- strDate := cellTxt
- // 是否取分隔日期的后一个日期
- if params.DateCol.SplitLast && params.DateCol.SplitFlag != "" {
- dateArr := strings.Split(cellTxt, params.DateCol.SplitFlag)
- if len(dateArr) > 1 {
- strDate = dateArr[len(dateArr)-1]
- }
- }
- completeTime = fmt.Sprintf(params.DateCol.StrTimeFormat, params.DateCol.PublishTime.Year(), strDate)
- }
- //fmt.Println("completeTime: ", completeTime)
- // 遍历多种可能的日期格式
- var colTime time.Time
- for _, f := range params.DateCol.TimeFormat {
- t, e := time.ParseInLocation(f, completeTime, time.Local)
- if e != nil {
- continue
- }
- colTime = t
- break
- }
- // 统一判断一次, 入参的日期格式可能不全
- if colTime.IsZero() {
- utils.FileLog.Info(fmt.Sprintf("日期格式异常: cellTxt-%s; completeTime-%s", cellTxt, completeTime))
- for _, f := range attemptDates {
- t, e := time.ParseInLocation(f, completeTime, time.Local)
- if e != nil {
- continue
- }
- colTime = t
- break
- }
- }
- // 判断报告是否跨年
- if colTime.AddDate(0, -6, 0).After(params.DateCol.PublishTime) {
- utils.FileLog.Info(fmt.Sprintf("跨年判断: ColTime-%v; PublishTime-%v", colTime, params.DateCol.PublishTime))
- colTime = colTime.AddDate(-1, 0, 0)
- }
- if !colTime.IsZero() {
- colDate[ii] = colTime.Format(utils.FormatDate)
- }
- fmt.Println("日期:", colTime.Format(utils.FormatDate))
- }
- })
- }
- // 取指标
- if i > 0 {
- row := TableRow{
- DateData: make(map[string]string),
- }
- cells.Each(func(ii int, ss *goquery.Selection) {
- cellTxt := filterInvalidVal(ss.Text())
- //fmt.Println("cellTxt", cellTxt)
- if ii == 0 {
- row.Product = cellTxt
- }
- if params.MarketCol.HasCol && ii == params.MarketCol.ColIndex {
- row.Market = cellTxt
- }
- if ii >= params.DateCol.StartIndex && ii <= params.DateCol.EndIndex {
- d, ok := colDate[ii]
- if !ok {
- return
- }
- // 是否取折中值
- if params.ValCol.SplitHalfVal {
- val, e := calculateDataHalfVal(cellTxt)
- if e != nil {
- fmt.Printf("calculateDataHalfVal err: %v\n", e)
- return
- }
- cellTxt = val
- }
- if cellTxt != "" {
- row.DateData[d] = cellTxt
- }
- }
- })
- //fmt.Println(row)
- items = append(items, row)
- }
- })
- return
- }
- // formatTableRow2ValidEdb 表格行转换为有效指标
- func formatTableRow2ValidEdb(rows []TableRow, edbMatch []DataRuleEdbMatch) (indexes []*HandleIndexData) {
- indexes = make([]*HandleIndexData, 0)
- for _, m := range edbMatch {
- for _, v := range rows {
- fmt.Printf("产品: %s, 市场: %s, 日期数据: %v, 单位: %s\n", v.Product, v.Market, v.DateData, v.Unit)
- var productOk, marketOk, unitOk bool
- if (m.Product == "" && v.Product == "") || (m.Product != "" && strings.Contains(v.Product, m.Product)) {
- productOk = true
- }
- if (m.Market == "" && v.Market == "") || (m.Market != "" && strings.Contains(v.Market, m.Market)) {
- marketOk = true
- }
- if (m.MatchUnit == "" && v.Unit == "") || (m.MatchUnit != "" && strings.Contains(v.Unit, m.MatchUnit)) {
- unitOk = true
- }
- if productOk && marketOk && unitOk {
- edb := new(HandleIndexData)
- edb.IndexCode = m.IndexCode
- edb.IndexName = m.IndexName
- edb.ClassifyId = m.ClassifyId
- edb.Frequency = m.Frequency
- edb.Unit = m.Unit
- edb.DateData = v.DateData
- edb.TerminalCode = utils.TerminalCode
- indexes = append(indexes, edb)
- }
- }
- }
- return
- }
// listFiles returns the base names of all non-directory entries found under
// dirPath, descending into sub-directories, in lexical walk order.
//
// Only names are returned, not paths, so files with the same name in
// different sub-directories appear as duplicates. On any walk error the
// result is nil and the error is returned.
func listFiles(dirPath string) ([]string, error) {
	var files []string
	// WalkDir reports entries straight from the directory listing and avoids
	// the extra Lstat per entry that filepath.Walk performs.
	err := filepath.WalkDir(dirPath, func(_ string, entry os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if !entry.IsDir() {
			files = append(files, entry.Name())
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return files, nil
}
// filterInvalidVal trims surrounding whitespace from cellTxt and maps the
// placeholder values "休市", "/", "-" and "—" to the empty string; any other
// value is returned trimmed.
func filterInvalidVal(cellTxt string) string {
	trimmed := strings.TrimSpace(cellTxt)
	switch trimmed {
	case "休市", "/", "-", "—":
		return ""
	}
	return trimmed
}
- // formatIntervalData 格式化区间值
- func formatIntervalData(cellTxt, flag string) string {
- cellTxt = filterInvalidVal(cellTxt)
- if flag == "" {
- flag = "-"
- }
- matches := strings.Split(cellTxt, flag)
- if len(matches) < 2 {
- return cellTxt
- }
- if len(matches) != 2 {
- return ""
- }
- // 转换不了直接返回空值
- a, e := strconv.ParseFloat(matches[0], 64)
- if e != nil {
- return ""
- }
- b, e := strconv.ParseFloat(matches[1], 64)
- if e != nil {
- return ""
- }
- average := (a + b) / 2
- return fmt.Sprint(average)
- }
- // getCookie
- // @Description: 获取cookie
- // @author: Roc
- // @datetime 2024-07-09 14:00:53
- // @return cookieStr string
- // @return err error
- func getCookie() (cookieStr string, err error) {
- // 读取Cookie
- if utils.CCFCookieFile == "" {
- err = fmt.Errorf("cookie文件未配置")
- return
- }
- cookieByte, e := os.ReadFile(utils.CCFCookieFile)
- if e != nil {
- err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
- return
- }
- cookieStr = strings.TrimSpace(string(cookieByte))
- //if cookieStr == "" {
- // err = fmt.Errorf("cookie为空")
- // return
- //}
- return
- }
- // getCookieByChrome
- // @Description: 获取cookie
- // @author: Roc
- // @datetime 2024-07-09 14:00:53
- // @return cookieStr string
- // @return err error
- func getCookieByChrome() (cookieStr string, err error) {
- // 读取Cookie
- if utils.CCFUseName == "" {
- err = fmt.Errorf("CCF账号未设置")
- return
- }
- if utils.CCFPassword == "" {
- err = fmt.Errorf("CCF密码未设置")
- return
- }
- opts := append(
- chromedp.DefaultExecAllocatorOptions[:],
- chromedp.Flag("headless", false),
- )
- allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
- defer cancel1()
- // 创建chrome实例
- ctx, cancel2 := chromedp.NewContext(
- allocCtx,
- chromedp.WithLogf(log.Printf),
- )
- defer cancel2()
- err = chromedp.Run(ctx,
- chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
- chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
- chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
- chromedp.Sleep(2*time.Second),
- chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
- chromedp.Sleep(5*time.Second),
- chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
- chromedp.Sleep(2*time.Second),
- chromedp.ActionFunc(func(ctx context.Context) error {
- cookies, err := network.GetCookies().Do(ctx)
- if err != nil {
- return err
- }
- //cookieJson, err := json.Marshal(cookies)
- //if err != nil {
- // return err
- //}
- //fmt.Println("cookieJson:", string(cookieJson))
- //utils.FileLog.Info("cookieJson:" + string(cookieJson))
- for _, v := range cookies {
- cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
- }
- fmt.Println("header cookie:", cookieStr)
- utils.FileLog.Info("header cookie:" + cookieStr)
- tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
- if tmpErr != nil {
- fmt.Println("创建cookie文件失败:", tmpErr.Error())
- return nil
- }
- if _, err := tmpFile.WriteString(cookieStr); err != nil {
- fmt.Println("写入cookie到文件失败:", err.Error())
- return nil
- }
- return nil
- }),
- )
- //if err != nil {
- // fmt.Println(err)
- //}
- return
- }
- // checkIsLoginPage
- // @Description: 校验是否是登录页
- // @author: Roc
- // @datetime 2024-07-09 16:34:17
- // @param bodyStr string
- // @return isLoginPage bool
- func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
- // 初始化goquery.Document
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
- if err != nil {
- log.Fatal(err)
- }
- // 查找name为LoginForm的表单
- doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
- // 如果找到了,打印信息表示这是登录页
- //fmt.Println("这是一个登录页面")
- isLoginPage = true
- return
- })
- // 如果没有找到,打印信息表示这不是登录页
- //if doc.Find("form[name=LoginForm]").Length() == 0 {
- // fmt.Println("这不是一个登录页面")
- //}
- return
- }
|