package base_from_ccf import ( "context" "encoding/json" "eta/eta_data_analysis/models" "eta/eta_data_analysis/utils" "fmt" "os" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" ) // 定义主结构体 type CCFChartRule struct { Name string `json:"Name"` ClassifyId int `json:"ClassifyId"` CustNo int `json:"CustNo"` Frequency string `json:"Frequency"` IndexType string `json:"IndexType"` Child []*CCFChartRule `json:"Child,omitempty"` // 使用指针来处理可能不存在的子对象 } func loadCCFChartRule() (rules []*CCFChartRule, err error) { if utils.CCFChartRuleFile == "" { err = fmt.Errorf("rule文件不存在") return } b, e := os.ReadFile(utils.CCFChartRuleFile) if e != nil { err = fmt.Errorf("读取rule文件失败, err: %v", e) return } rules = make([]*CCFChartRule, 0) if e = json.Unmarshal(b, &rules); e != nil { err = fmt.Errorf("解析rule文件失败, err: %v", e) return } return } type CCFChartAdditionRule struct { Name string `json:"Name"` ClassifyId int `json:"ClassifyId"` Frequency string `json:"Frequency"` ProdNames string `json:"prodNames"` LastNYear int `json:"LastNYear"` IndexType string `json:"IndexType"` } func LoadCCFChartAdditionRule() (rules []*CCFChartAdditionRule, err error) { if utils.CCFChartAdditionRuleFile == "" { err = fmt.Errorf("rule文件不存在") return } b, e := os.ReadFile(utils.CCFChartAdditionRuleFile) if e != nil { err = fmt.Errorf("读取rule文件失败, err: %v", e) return } rules = make([]*CCFChartAdditionRule, 0) if e = json.Unmarshal(b, &rules); e != nil { err = fmt.Errorf("解析rule文件失败, err: %v", e) return } return } func TaskGetCCFChartEdb(context.Context) (err error) { _ = GetCCFChartEdb() return } func GetCCFChartEdb() (err error) { defer func() { if err != nil { tips := fmt.Sprintf("GetCCFChartEdb ErrMsg: %s", err.Error()) utils.FileLog.Info(tips) fmt.Println(tips) } }() rules, err := loadCCFChartRule() if err != nil { return } indexes := make([]*HandleIndexData, 0) for _, v := range rules { // 首页报告链接 pageHtml := fmt.Sprintf("%s?cust_no=%d", CCFCHARTDATAURL, v.CustNo) fmt.Println(pageHtml) fileContent, e := fetchPageHtml(pageHtml, 0) if e != nil { err = fmt.Errorf("获取首页报告失败, err: %v", e) return } /*fName := v.Name if strings.Contains(v.Name, "/") { fName = strings.ReplaceAll(fName, "/", "") } filePath := fmt.Sprintf("/Users/xiexiaoyuan/工作/数据源ccf/ccf图表/%s/index.html", fName) fmt.Println(filePath) // 打开文件 file, e := os.Open(filePath) if e != nil { err = fmt.Errorf("无法打开文件: %v", err) return } defer file.Close() // 读取文件内容 fileContent, e := io.ReadAll(file) if e != nil { err = fmt.Errorf("读取文件内容失败: %v", e) fmt.Printf("无法读取文件内容: %v", e) return }*/ // 转换编码 // 转换编码 /*utf8Reader, e := charset.NewReaderLabel("gb2312", bytes.NewReader(fileContent)) if e != nil { err = fmt.Errorf("utf8 reader err: %s", e.Error()) return } utf8Body, e := io.ReadAll(utf8Reader) if e != nil { err = fmt.Errorf("读取utf8 body err: %s", e.Error()) return }*/ //firstHtml := string(utf8Body) //fmt.Println(firstHtml) isStop, indexList, e := AnalysisChartInventoryWeeklyEdb(fileContent, v) if e != nil { err = fmt.Errorf("解析图表失败, err: %v", e) return } if isStop { err = fmt.Errorf("图表名称不存在,停止爬取") break } if len(indexList) > 0 { indexes = append(indexes, indexList...) } } additionRules, err := LoadCCFChartAdditionRule() if err != nil { err = fmt.Errorf("加载额外图表规则失败 err: %v", err) return } now := time.Now() for _, v := range additionRules { param := make(map[string]string) if v.LastNYear == 0 { v.LastNYear = 5 } param["startdate"] = time.Date(now.Year()+1-v.LastNYear, 1, 1, 0, 0, 0, 0, time.Local).Format(utils.FormatDate2) param["enddate"] = now.Format(utils.FormatDate2) param["type"] = "1" param["prodNames"] = v.ProdNames param["skin"] = "infographic" param["page"] = "index.php" fmt.Println(param) htmlContent, er := postPageHtml(CCFCHARTDATAURL, param, 0) if er != nil { err = fmt.Errorf("获取首页报告失败, err: %v", er) return } isStop, indexList, e := AnalysisAdditionChartInventoryWeeklyEdb(htmlContent, v) if e != nil { err = fmt.Errorf("解析图表失败, err: %v", e) return } if isStop { err = fmt.Errorf("图表名称不存在,停止爬取") break } if len(indexList) > 0 { indexes = append(indexes, indexList...) } } if len(indexes) == 0 { return } // 写入数据库 params := make(map[string]interface{}) params["List"] = indexes params["TerminalCode"] = utils.TerminalCode result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE) if e != nil { b, _ := json.Marshal(params) err = fmt.Errorf("postEdbLib err: %v, params: %s", e, string(b)) return } resp := new(models.BaseEdbLibResponse) if e = json.Unmarshal(result, &resp); e != nil { err = fmt.Errorf(" postEdbLib resp json.Unmarshal err: %v", e) return } if resp.Ret != 200 { err = fmt.Errorf("postEdbLib resp Msg: %s, ErrMsg: %s", resp.Msg, resp.ErrMsg) return } return } // AnalysisChartInventoryWeeklyEdb 解析周度库存中的日均值 func AnalysisChartInventoryWeeklyEdb(htm []byte, rule *CCFChartRule) (isStop bool, indexes []*HandleIndexData, err error) { if len(htm) == 0 { utils.FileLog.Info("htm empty") return } doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm))) if e != nil { err = fmt.Errorf("NewDocumentFromReader err: %v", e) return } // 判断图表名称是否相符,如果不符合放弃爬取 title := doc.Find("p a.activated.now").Text() fmt.Println(title) if title != rule.Name { utils.FileLog.Info("图表名称不存在,停止爬取") isStop = true return } doc.Find("div.tabCont").Each(func(i int, item *goquery.Selection) { // 提取单位(这里假设单位总是位于 .tips 类的 div 中) unit := item.Find(".tips").Text() unit = strings.TrimSpace(unit) unit = strings.TrimPrefix(unit, "编制说明:单位(") unit = strings.TrimSuffix(unit, ")") fmt.Println("单位: ", unit) indexCode := "" indexName := "" // 获取频度和分类ID classifyId := rule.ClassifyId frequency := rule.Frequency childRule := make(map[string]*CCFChartRule) // 判断是否存在子页面 if len(rule.Child) > 0 { for _, v := range rule.Child { childRule[v.Name] = v } } dataMap := make(map[string]string) // 遍历表格中的每一行(跳过表头) item.Find("table tbody tr").Each(func(k int, row *goquery.Selection) { if k == 0 { return } // 提取产品名称 if indexCode == "" { productName := row.Find("td:nth-child(1)").Text() productName = strings.TrimSpace(productName) //判断子页面的频度 if newRule, ok := childRule[productName]; ok { frequency = newRule.Frequency // 在存在子类的情况下,判断产品是否属于子类,不属于则跳过 } else if len(childRule) > 0 { return } indexCode = fmt.Sprintf("ccf%s", utils.GetFirstPingYin(productName)) indexName = fmt.Sprintf("CCF%s", productName) } // 提取日期 date := row.Find("td:nth-child(2)").Text() date = strings.TrimSpace(date) // 提取日均值 var dailyAvg string if rule.IndexType == "周均" { dailyAvg = row.Find("td:nth-child(4)").Text() dailyAvg = strings.TrimSpace(dailyAvg) } else { dailyAvg = row.Find("td:nth-child(3)").Text() dailyAvg = strings.TrimSpace(dailyAvg) } // 打印提取的信息 fmt.Printf("单位: %s\n产品名称: %s\n日期: %s\n日均值: %s\n\n", unit, indexName, date, dailyAvg) _, e = strconv.ParseFloat(dailyAvg, 64) if e != nil { utils.FileLog.Info("数据转换失败 err:%s", e.Error()) return } dataMap[date] = dailyAvg }) if indexName == "" { return } edb := new(HandleIndexData) edb.IndexCode = strings.ToLower(indexCode) edb.IndexName = indexName edb.ClassifyId = classifyId edb.Frequency = frequency edb.Unit = unit edb.DateData = dataMap edb.TerminalCode = utils.TerminalCode indexes = append(indexes, edb) }) return } func AnalysisAdditionChartInventoryWeeklyEdb(htm []byte, rule *CCFChartAdditionRule) (isStop bool, indexes []*HandleIndexData, err error) { if len(htm) == 0 { utils.FileLog.Info("htm empty") return } doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm))) if e != nil { err = fmt.Errorf("NewDocumentFromReader err: %v", e) return } // 判断图表名称是否相符,如果不符合放弃爬取 doc.Find("div.tabCont").Each(func(i int, item *goquery.Selection) { // 提取单位(这里假设单位总是位于 .tips 类的 div 中) unit := item.Find(".tips").Text() unit = strings.TrimSpace(unit) unit = strings.TrimPrefix(unit, "编制说明:单位(") unit = strings.TrimSuffix(unit, ")") fmt.Println("单位: ", unit) indexCode := "" indexName := "" // 获取频度和分类ID classifyId := rule.ClassifyId frequency := rule.Frequency dataMap := make(map[string]string) // 遍历表格中的每一行(跳过表头) item.Find("table tbody tr").Each(func(k int, row *goquery.Selection) { if k == 0 { return } // 提取产品名称 if indexCode == "" { productName := row.Find("td:nth-child(1)").Text() productName = strings.TrimSpace(productName) if strings.Contains(rule.Name, productName) { indexName = rule.Name code := strings.ToLower(utils.GetFirstPingYin(indexName)) code = strings.ReplaceAll(code, "/", "") code = strings.ReplaceAll(code, " ", "") indexCode = strings.ToLower(code) } } // 提取日期 date := row.Find("td:nth-child(2)").Text() date = strings.TrimSpace(date) // 提取周(日)均值 var dailyAvg string if rule.IndexType == "周均" { dailyAvg = row.Find("td:nth-child(4)").Text() dailyAvg = strings.TrimSpace(dailyAvg) } else { dailyAvg = row.Find("td:nth-child(3)").Text() dailyAvg = strings.TrimSpace(dailyAvg) } // 打印提取的信息 fmt.Printf("单位: %s\n产品名称: %s\n日期: %s\n日均值: %s\n\n", unit, indexName, date, dailyAvg) _, e = strconv.ParseFloat(dailyAvg, 64) if e != nil { utils.FileLog.Info("数据转换失败 err:%s", e.Error()) return } dataMap[date] = dailyAvg }) if indexName == "" { return } edb := new(HandleIndexData) edb.IndexCode = strings.ToLower(indexCode) edb.IndexName = indexName edb.ClassifyId = classifyId edb.Frequency = frequency edb.Unit = unit edb.DateData = dataMap edb.TerminalCode = utils.TerminalCode indexes = append(indexes, edb) }) return }