123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- package base_from_ccf
- import (
- "context"
- "encoding/json"
- "eta/eta_data_analysis/models"
- "eta/eta_data_analysis/utils"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "os"
- "strconv"
- "strings"
- )
// CCFChartRule is one entry of the CCF chart rule file. Rules are decoded
// from JSON and may nest: Child holds the rules of sub-pages.
type CCFChartRule struct {
	// Name is compared against the active tab title of the fetched page.
	Name string `json:"Name"`
	// ClassifyId is copied onto every index parsed under this rule.
	ClassifyId int `json:"ClassifyId"`
	// CustNo is sent as the cust_no query parameter when fetching the page.
	CustNo int `json:"CustNo"`
	// Frequency is the default frequency given to indexes parsed under this rule.
	Frequency string `json:"Frequency"`
	// Child lists optional sub-rules; pointers are used so that absent
	// children stay out of the encoded JSON.
	Child []*CCFChartRule `json:"Child,omitempty"`
}
- func loadCCFChartRule() (rules []*CCFChartRule, err error) {
- if utils.CCFChartRuleFile == "" {
- err = fmt.Errorf("rule文件不存在")
- return
- }
- b, e := os.ReadFile(utils.CCFChartRuleFile)
- if e != nil {
- err = fmt.Errorf("读取rule文件失败, err: %v", e)
- return
- }
- rules = make([]*CCFChartRule, 0)
- if e = json.Unmarshal(b, &rules); e != nil {
- err = fmt.Errorf("解析rule文件失败, err: %v", e)
- return
- }
- return
- }
- func TaskGetCCFChartEdb(context.Context) (err error) {
- _ = GetCCFChartEdb()
- return
- }
- func GetCCFChartEdb() (err error) {
- defer func() {
- if err != nil {
- tips := fmt.Sprintf("GetCCFChartEdb ErrMsg: %s", err.Error())
- utils.FileLog.Info(tips)
- fmt.Println(tips)
- }
- }()
- rules, err := loadCCFChartRule()
- if err != nil {
- return
- }
- indexes := make([]*HandleIndexData, 0)
- for _, v := range rules {
- // 首页报告链接
- pageHtml := fmt.Sprintf("%s?cust_no=%d", CCFCHARTDATAURL, v.CustNo)
- fmt.Println(pageHtml)
- fileContent, e := fetchPageHtml(pageHtml, 0)
- /*fName := v.Name
- if strings.Contains(v.Name, "/") {
- fName = strings.ReplaceAll(fName, "/", "")
- }
- filePath := fmt.Sprintf("/Users/xiexiaoyuan/工作/数据源ccf/ccf图表/%s/index.html", fName)
- fmt.Println(filePath)
- // 打开文件
- file, e := os.Open(filePath)
- if e != nil {
- err = fmt.Errorf("无法打开文件: %v", err)
- return
- }
- defer file.Close()
- // 读取文件内容
- fileContent, e := io.ReadAll(file)
- if e != nil {
- err = fmt.Errorf("读取文件内容失败: %v", e)
- fmt.Printf("无法读取文件内容: %v", e)
- return
- }*/
- // 转换编码
- // 转换编码
- /*utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(fileContent))
- if e != nil {
- err = fmt.Errorf("utf8 reader err: %s", e.Error())
- return
- }
- utf8Body, e := io.ReadAll(utf8Reader)
- if e != nil {
- err = fmt.Errorf("读取utf8 body err: %s", e.Error())
- return
- }*/
- //firstHtml := string(utf8Body)
- //fmt.Println(firstHtml)
- isStop, indexList, e := AnalysisChartInventoryWeeklyEdb(fileContent, v)
- if e != nil {
- err = fmt.Errorf("解析图表失败, err: %v", e)
- return
- }
- if isStop {
- err = fmt.Errorf("图表名称不存在,停止爬取")
- break
- }
- if len(indexList) > 0 {
- indexes = append(indexes, indexList...)
- }
- }
- if len(indexes) == 0 {
- return
- }
- // 写入数据库
- params := make(map[string]interface{})
- params["List"] = indexes
- params["TerminalCode"] = utils.TerminalCode
- result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE)
- if e != nil {
- b, _ := json.Marshal(params)
- err = fmt.Errorf("postEdbLib err: %v, params: %s", e, string(b))
- return
- }
- resp := new(models.BaseEdbLibResponse)
- if e = json.Unmarshal(result, &resp); e != nil {
- err = fmt.Errorf(" postEdbLib resp json.Unmarshal err: %v", e)
- return
- }
- if resp.Ret != 200 {
- err = fmt.Errorf("postEdbLib resp Msg: %s, ErrMsg: %s", resp.Msg, resp.ErrMsg)
- return
- }
- return
- }
- // AnalysisChartInventoryWeeklyEdb 解析周度库存中的日均值
- func AnalysisChartInventoryWeeklyEdb(htm []byte, rule *CCFChartRule) (isStop bool, indexes []*HandleIndexData, err error) {
- if len(htm) == 0 {
- utils.FileLog.Info("htm empty")
- return
- }
- doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
- if e != nil {
- err = fmt.Errorf("NewDocumentFromReader err: %v", e)
- return
- }
- // 判断图表名称是否相符,如果不符合放弃爬取
- title := doc.Find("p a.activated.now").Text()
- fmt.Println(title)
- if title != rule.Name {
- utils.FileLog.Info("图表名称不存在,停止爬取")
- isStop = true
- return
- }
- doc.Find("div.tabCont").Each(func(i int, item *goquery.Selection) {
- // 提取单位(这里假设单位总是位于 .tips 类的 div 中)
- unit := item.Find(".tips").Text()
- unit = strings.TrimSpace(unit)
- unit = strings.TrimPrefix(unit, "编制说明:单位(")
- unit = strings.TrimSuffix(unit, ")")
- fmt.Println("单位: ", unit)
- indexCode := ""
- indexName := ""
- // 获取频度和分类ID
- classifyId := rule.ClassifyId
- frequency := rule.Frequency
- childRule := make(map[string]*CCFChartRule)
- // 判断是否存在子页面
- if len(rule.Child) > 0 {
- for _, v := range childRule {
- childRule[v.Name] = v
- }
- }
- dataMap := make(map[string]string)
- // 遍历表格中的每一行(跳过表头)
- item.Find("table tbody tr").Each(func(k int, row *goquery.Selection) {
- if k == 0 {
- return
- }
- // 提取产品名称
- if indexCode == "" {
- productName := row.Find("td:nth-child(1)").Text()
- productName = strings.TrimSpace(productName)
- indexCode = fmt.Sprintf("ccf%s", utils.GetFirstPingYin(productName))
- indexName = fmt.Sprintf("CCF%s", productName)
- //判断子页面的频度
- if newRule, ok := childRule[productName]; ok {
- frequency = newRule.Frequency
- }
- }
- // 提取日期
- date := row.Find("td:nth-child(2)").Text()
- date = strings.TrimSpace(date)
- // 提取日均值
- dailyAvg := row.Find("td:nth-child(3)").Text()
- dailyAvg = strings.TrimSpace(dailyAvg)
- // 打印提取的信息
- fmt.Printf("单位: %s\n产品名称: %s\n日期: %s\n日均值: %s\n\n", unit, indexName, date, dailyAvg)
- _, e = strconv.ParseFloat(dailyAvg, 64)
- if e != nil {
- utils.FileLog.Info("数据转换失败 err:%s", e.Error())
- return
- }
- dataMap[date] = dailyAvg
- })
- if indexName == "" {
- return
- }
- edb := new(HandleIndexData)
- edb.IndexCode = strings.ToLower(indexCode)
- edb.IndexName = indexName
- edb.ClassifyId = classifyId
- edb.Frequency = frequency
- edb.Unit = unit
- edb.DateData = dataMap
- edb.TerminalCode = utils.TerminalCode
- indexes = append(indexes, edb)
- })
- return
- }
|