package base_from_ccf

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"strconv"
	"strings"

	"eta/eta_data_analysis/models"
	"eta/eta_data_analysis/utils"

	"github.com/PuerkitoBio/goquery"
)

// CCFChartRule describes one chart page to crawl, as loaded from the JSON
// rule file. Rules may nest: Child holds optional per-product overrides.
type CCFChartRule struct {
	Name       string          `json:"Name"`            // chart name; compared against the page title before parsing
	ClassifyId int             `json:"ClassifyId"`      // classify ID copied onto every index parsed from the page
	CustNo     int             `json:"CustNo"`          // value of the cust_no query parameter used to fetch the page
	Frequency  string          `json:"Frequency"`       // default frequency for indexes parsed from the page
	Child      []*CCFChartRule `json:"Child,omitempty"` // optional child rules; pointers so absent children stay nil
}

// loadCCFChartRule reads the file named by utils.CCFChartRuleFile and decodes
// it as a JSON array of rules. It returns an error when the path is not
// configured, the file cannot be read, or the content is not valid JSON.
func loadCCFChartRule() (rules []*CCFChartRule, err error) {
	if utils.CCFChartRuleFile == "" {
		err = fmt.Errorf("rule文件不存在")
		return
	}
	b, e := os.ReadFile(utils.CCFChartRuleFile)
	if e != nil {
		err = fmt.Errorf("读取rule文件失败, err: %v", e)
		return
	}
	rules = make([]*CCFChartRule, 0)
	if e = json.Unmarshal(b, &rules); e != nil {
		err = fmt.Errorf("解析rule文件失败, err: %v", e)
		return
	}
	return
}

// TaskGetCCFChartEdb is the task entry point for the CCF chart crawl.
// The error from GetCCFChartEdb is intentionally discarded here because
// GetCCFChartEdb already logs it in its deferred handler; this function
// therefore always returns nil.
func TaskGetCCFChartEdb(context.Context) (err error) {
	_ = GetCCFChartEdb()
	return
}

// GetCCFChartEdb loads the chart rules, fetches and parses the page for each
// rule, and posts all collected indexes to the edb lib in a single request.
//
// Any non-nil error is written to the file log and stdout before returning.
// When a page title does not match its rule the crawl stops early; indexes
// collected so far are still posted, and the "stop" error is returned unless
// a later step fails with its own error.
func GetCCFChartEdb() (err error) {
	defer func() {
		if err != nil {
			tips := fmt.Sprintf("GetCCFChartEdb ErrMsg: %s", err.Error())
			utils.FileLog.Info(tips)
			fmt.Println(tips)
		}
	}()

	rules, err := loadCCFChartRule()
	if err != nil {
		return
	}

	indexes := make([]*HandleIndexData, 0)
	for _, v := range rules {
		// A JSON null entry decodes to a nil rule; skip it instead of panicking.
		if v == nil {
			continue
		}

		// Chart page URL for this rule.
		pageHtml := fmt.Sprintf("%s?cust_no=%d", CCFCHARTDATAURL, v.CustNo)
		fmt.Println(pageHtml)
		fileContent, e := fetchPageHtml(pageHtml, 0)
		if e != nil {
			// Previously this error was silently overwritten by the next
			// assignment to e. Record it and move on to the next rule so one
			// failed download does not abort the whole crawl.
			utils.FileLog.Info(fmt.Sprintf("fetchPageHtml err: %v, url: %s", e, pageHtml))
			continue
		}

		// Disabled debugging path: read the page from a local file instead of
		// fetching it.
		/*fName := v.Name
		if strings.Contains(v.Name, "/") {
			fName = strings.ReplaceAll(fName, "/", "")
		}
		filePath := fmt.Sprintf("/Users/xiexiaoyuan/工作/数据源ccf/ccf图表/%s/index.html", fName)
		fmt.Println(filePath)
		// open the file
		file, e := os.Open(filePath)
		if e != nil {
			err = fmt.Errorf("无法打开文件: %v", err)
			return
		}
		defer file.Close()
		// read the file content
		fileContent, e := io.ReadAll(file)
		if e != nil {
			err = fmt.Errorf("读取文件内容失败: %v", e)
			fmt.Printf("无法读取文件内容: %v", e)
			return
		}*/

		// Disabled: convert the page encoding from gb2312 to UTF-8.
		/*utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(fileContent))
		if e != nil {
			err = fmt.Errorf("utf8 reader err: %s", e.Error())
			return
		}
		utf8Body, e := io.ReadAll(utf8Reader)
		if e != nil {
			err = fmt.Errorf("读取utf8 body err: %s", e.Error())
			return
		}*/
		//firstHtml := string(utf8Body)
		//fmt.Println(firstHtml)

		isStop, indexList, e := AnalysisChartInventoryWeeklyEdb(fileContent, v)
		if e != nil {
			err = fmt.Errorf("解析图表失败, err: %v", e)
			return
		}
		if isStop {
			// Title mismatch: stop crawling but still post what was collected.
			err = fmt.Errorf("图表名称不存在,停止爬取")
			break
		}
		if len(indexList) > 0 {
			indexes = append(indexes, indexList...)
		}
	}
	if len(indexes) == 0 {
		return
	}

	// Push the collected indexes to the edb lib.
	params := make(map[string]interface{})
	params["List"] = indexes
	params["TerminalCode"] = utils.TerminalCode
	result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE)
	if e != nil {
		b, _ := json.Marshal(params)
		err = fmt.Errorf("postEdbLib err: %v, params: %s", e, string(b))
		return
	}
	resp := new(models.BaseEdbLibResponse)
	if e = json.Unmarshal(result, &resp); e != nil {
		err = fmt.Errorf(" postEdbLib resp json.Unmarshal err: %v", e)
		return
	}
	if resp.Ret != 200 {
		err = fmt.Errorf("postEdbLib resp Msg: %s, ErrMsg: %s", resp.Msg, resp.ErrMsg)
		return
	}
	return
}

// AnalysisChartInventoryWeeklyEdb parses the daily-average values out of a
// weekly-inventory chart page.
//
// htm is the page HTML and rule is the rule the page was fetched for. Each
// "div.tabCont" element that yields a product name produces one
// HandleIndexData whose DateData maps date text to the daily-average text.
//
// It returns isStop == true (with no indexes and no error) when the page title
// differs from rule.Name. Empty htm yields no indexes and no error. err is
// non-nil only when the HTML cannot be loaded into a goquery document.
func AnalysisChartInventoryWeeklyEdb(htm []byte, rule *CCFChartRule) (isStop bool, indexes []*HandleIndexData, err error) {
	if len(htm) == 0 {
		utils.FileLog.Info("htm empty")
		return
	}
	doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
	if e != nil {
		err = fmt.Errorf("NewDocumentFromReader err: %v", e)
		return
	}

	// Give up on this page when the active chart title does not match the rule.
	title := doc.Find("p a.activated.now").Text()
	fmt.Println(title)
	if title != rule.Name {
		utils.FileLog.Info("图表名称不存在,停止爬取")
		isStop = true
		return
	}

	// Index child rules by name so a product can override the frequency.
	// The original code ranged over this (still empty) map instead of
	// rule.Child, so overrides were never applied.
	childRule := make(map[string]*CCFChartRule, len(rule.Child))
	for _, child := range rule.Child {
		if child != nil {
			childRule[child.Name] = child
		}
	}

	doc.Find("div.tabCont").Each(func(i int, item *goquery.Selection) {
		// Extract the unit; it is taken from the .tips element with the fixed
		// surrounding text stripped.
		unit := item.Find(".tips").Text()
		unit = strings.TrimSpace(unit)
		unit = strings.TrimPrefix(unit, "编制说明:单位(")
		unit = strings.TrimSuffix(unit, ")")
		fmt.Println("单位: ", unit)

		indexCode := ""
		indexName := ""
		// Defaults come from the page-level rule.
		classifyId := rule.ClassifyId
		frequency := rule.Frequency

		dataMap := make(map[string]string)
		// Walk the table rows, skipping the header row.
		item.Find("table tbody tr").Each(func(k int, row *goquery.Selection) {
			if k == 0 {
				return
			}
			// The product name is taken once, from the first data row.
			if indexCode == "" {
				productName := row.Find("td:nth-child(1)").Text()
				productName = strings.TrimSpace(productName)
				indexCode = fmt.Sprintf("ccf%s", utils.GetFirstPingYin(productName))
				indexName = fmt.Sprintf("CCF%s", productName)
				// A matching child rule overrides the frequency.
				if newRule, ok := childRule[productName]; ok {
					frequency = newRule.Frequency
				}
			}
			// Date column.
			date := row.Find("td:nth-child(2)").Text()
			date = strings.TrimSpace(date)
			// Daily-average column.
			dailyAvg := row.Find("td:nth-child(3)").Text()
			dailyAvg = strings.TrimSpace(dailyAvg)
			// Debug output of the extracted row.
			fmt.Printf("单位: %s\n产品名称: %s\n日期: %s\n日均值: %s\n\n", unit, indexName, date, dailyAvg)

			// Only keep rows whose value is numeric; the text form is stored.
			if _, parseErr := strconv.ParseFloat(dailyAvg, 64); parseErr != nil {
				utils.FileLog.Info("数据转换失败 err:%s", parseErr.Error())
				return
			}
			dataMap[date] = dailyAvg
		})

		// No data rows means no product name was found; nothing to emit.
		if indexName == "" {
			return
		}
		edb := new(HandleIndexData)
		edb.IndexCode = strings.ToLower(indexCode)
		edb.IndexName = indexName
		edb.ClassifyId = classifyId
		edb.Frequency = frequency
		edb.Unit = unit
		edb.DateData = dataMap
		edb.TerminalCode = utils.TerminalCode
		indexes = append(indexes, edb)
	})
	return
}