Browse Source

Merge branch 'refs/heads/eta_2.0.8_ly_0804@guomengyuan' into debug

gmy 7 months ago
parent
commit
91cd2a9aeb

+ 44 - 0
models/base_from_ly_index_record.go

@@ -0,0 +1,44 @@
+// Package models
+// @Author gmy 2024/8/7 9:38:00
+package models
+
+import (
+	"errors"
+	"github.com/beego/beego/v2/client/orm"
+)
+
+type BaseFromLyIndexRecord struct { // BaseFromLyIndexRecord is the ORM model for rows of base_from_ly_index_record; the orm tags bind each field to its column.
+	BaseFromLyIndexRecordId int    `orm:"column(base_from_ly_index_record_id);pk"` // Primary key: ID of the index record.
+	CreateTime              string `orm:"column(create_time)"`                     // Creation time, kept as a string.
+	ModifyTime              string `orm:"column(modify_time)"`                     // Last modification time, kept as a string.
+	Url                     string `orm:"column(url)"`                             // Address of the index page; used as the lookup key by GetLyIndexRecordByUrl.
+	DataTime                string `orm:"column(data_time)"`                       // Data date, kept as a string.
+}
+
+// init registers the BaseFromLyIndexRecord model with the beego ORM when the
+// package is loaded.
+func init() {
+	orm.RegisterModel(&BaseFromLyIndexRecord{})
+}
+
+// AddLyIndexRecord inserts one index record through the "data" database
+// connection.
+//
+// It returns the ID reported by the ORM insert on success, or 0 together with
+// the insert error on failure.
+func AddLyIndexRecord(item *BaseFromLyIndexRecord) (int64, error) {
+	insertedID, insertErr := orm.NewOrmUsingDB("data").Insert(item)
+	if insertErr == nil {
+		return insertedID, nil
+	}
+	// Never hand back an ID alongside an error.
+	return 0, insertErr
+}
+
+// GetLyIndexRecordByUrl looks up the index record stored for a report page URL
+// through the "data" database connection.
+//
+// It returns (nil, nil) when no row matches, so callers must nil-check the
+// record before using it. Any other query failure is returned together with a
+// nil record.
+//
+// More than one row may share the same url, so the query is pinned to a single
+// row with the highest base_from_ly_index_record_id instead of leaving the
+// choice to the database.
+// NOTE(review): this assumes the primary key grows with insertion order, so the
+// highest ID is the newest record — confirm against the table definition.
+func GetLyIndexRecordByUrl(url string) (item *BaseFromLyIndexRecord, err error) {
+	o := orm.NewOrmUsingDB("data")
+	sql := `SELECT * FROM base_from_ly_index_record WHERE url=? ORDER BY base_from_ly_index_record_id DESC LIMIT 1`
+
+	// Scan into a concrete struct so a failed query can never leak a
+	// half-populated record to the caller.
+	record := new(BaseFromLyIndexRecord)
+	err = o.Raw(sql, url).QueryRow(record)
+	if errors.Is(err, orm.ErrNoRows) {
+		// "Not found" is an expected outcome here, not an error.
+		return nil, nil
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return record, nil
+}

+ 83 - 15
services/liangyou/commodity_liangyou.go

@@ -121,7 +121,8 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 	}
 	logs.Info("categoryPageURL: %s: %s: %s", product, category, categoryPageURL)
 
-	var allReportURLs []string
+	//var allReportURLs []string
+	var allReportURLMap = make(map[string]string)
 	for {
 		var htmlContent string
 		err = chromedp.Run(ctx,
@@ -135,11 +136,16 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 		fmt.Printf("页面内容: %s\n", htmlContent)
 
 		// Extract report URLs containing the partial keyword
-		reportURLs := extractReportURLs(htmlContent, report)
-		allReportURLs = append(allReportURLs, reportURLs...)
+		reportURLMap := extractReportURLs(htmlContent, report)
+		//allReportURLs = append(allReportURLs, reportURLs...)
+		for key, value := range reportURLMap {
+			allReportURLMap[key] = value
+		}
 
+		//  测试环境跑部分数据,上线放开
+		//break
 		// Check if next page button is disabled
-		//  测试环境跑部分数据,上线放开...
+		//  测试环境跑部分数据,上线放开
 		var nextPageDisabled bool
 		err = chromedp.Run(ctx,
 			chromedp.Evaluate(`document.querySelector('div.my-page-next').classList.contains('my-page-forbid')`, &nextPageDisabled),
@@ -161,25 +167,64 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 		if err != nil {
 			return err
 		}
+
 	}
 
-	logs.Info("所有报告 URLs: %s: %s: %v", product, category, allReportURLs)
+	logs.Info("所有报告 URLs: %s: %s: %v", product, category, allReportURLMap)
 
-	if len(allReportURLs) == 0 {
+	if len(allReportURLMap) == 0 {
 		return fmt.Errorf("未找到报告 URL")
 	}
 
 	// 处理报告数据
-	for _, reportURL := range allReportURLs {
-		// 随机睡眠 todo 跑数据时放开
-		rand := utils.RangeRand(10, 100)
+	for key, value := range allReportURLMap {
+		// 查询报告是否已经处理  这里只对近7天的数据进行处理
+		lyIndexRecord, err := models.GetLyIndexRecordByUrl(key)
+		if err != nil {
+			continue
+		}
+		if lyIndexRecord != nil {
+			toTime, err := utils.StringToTime(lyIndexRecord.DataTime + " 00:00:00")
+			if err != nil {
+				logs.Error("时间格式转换错误: %s: %s: %s: %s: %v", product, category, report, key, err)
+				continue
+			}
+
+			if time.Now().Sub(toTime) > 7*24*time.Hour {
+				logs.Info("报告已处理: %s: %s: %s: %s", product, category, report, key)
+				continue
+			}
+		}
+
+		// 随机睡眠
+		rand := utils.RangeRand(20, 100)
 		fmt.Println(report+";sleep:", strconv.Itoa(int(rand)))
 		time.Sleep(time.Duration(rand) * time.Second)
 
-		err = processReport(ctx, product, category, reportURL, keywords)
+		err = processReport(ctx, product, category, key, keywords)
 		if err != nil {
-			logs.Error("处理报告错误: %s: %s: %s: %s: %v", product, category, report, reportURL, err)
+			logs.Error("处理报告错误: %s: %s: %s: %s: %v", product, category, report, key, err)
+			continue
 		}
+
+		format, err := utils.ConvertTimeFormat(value)
+		if err != nil {
+			logs.Error("时间格式转换错误: %s, %s, %v: %v", product, category, value, err)
+			continue
+		}
+
+		// 处理报告成功,将维护指标数据读取进度到数据库,避免后面重复读取
+		recordId, err := models.AddLyIndexRecord(&models.BaseFromLyIndexRecord{
+			CreateTime: utils.GetCurrentTime(),
+			ModifyTime: utils.GetCurrentTime(),
+			Url:        key,
+			DataTime:   format,
+		})
+		if err != nil {
+			logs.Error("维护指标数据读取进度错误: %s, %s, %v: %v", product, category, recordId, err)
+			continue
+		}
+		logs.Info("维护指标数据读取进度成功: %s, %s, %v", product, category, recordId)
 	}
 
 	return nil
@@ -288,8 +333,10 @@ func extractHrefAndText(outerHTML string) (string, string) {
 }
 
 // Extract report URLs from the HTML content
-func extractReportURLs(htmlContent, keyword string) []string {
-	var reportURLs []string
+func extractReportURLs(htmlContent, keyword string) map[string]string {
+	//var reportURLs []string
+	var reportURLMap = make(map[string]string)
+	var reportURL string
 
 	// Find all occurrences of the keyword and extract report URLs
 	content := htmlContent
@@ -304,13 +351,34 @@ func extractReportURLs(htmlContent, keyword string) []string {
 		urlStartIdx := strings.LastIndex(content[:startIdx], `href="`) + len(`href="`)
 		urlEndIdx := strings.Index(content[urlStartIdx:], `"`) + urlStartIdx
 		if urlStartIdx > 0 && urlEndIdx > urlStartIdx {
-			reportURLs = append(reportURLs, content[urlStartIdx:urlEndIdx])
+			reportURL = content[urlStartIdx:urlEndIdx]
+			//reportURLs = append(reportURLs, content[urlStartIdx:urlEndIdx])
 		}
 
 		content = content[startIdx:]
+
+		// Now extract the content inside the first <div class="short_right">
+		divStartIdx := strings.Index(content, `<div class="short_right">`)
+		if divStartIdx != -1 {
+			divStartIdx += len(`<div class="short_right">`)
+			divEndIdx := strings.Index(content[divStartIdx:], `</div>`) + divStartIdx
+			if divEndIdx > divStartIdx {
+				shortRightContent := content[divStartIdx:divEndIdx]
+
+				// Extract the first <div> content inside <div class="short_right">
+				innerDivStartIdx := strings.Index(shortRightContent, `<div>`)
+				if innerDivStartIdx != -1 {
+					innerDivStartIdx += len(`<div>`)
+					//innerDivEndIdx := strings.Index(shortRightContent[innerDivStartIdx:], `</div>`) + innerDivStartIdx
+					innerDivContent := shortRightContent[innerDivStartIdx:]
+					fmt.Println("Inner Div Content:", innerDivContent)
+					reportURLMap[reportURL] = innerDivContent
+				}
+			}
+		}
 	}
 
-	return reportURLs
+	return reportURLMap
 }
 
 // Process the report data

+ 4 - 1
services/liangyou/processor_business_logic.go

@@ -1961,11 +1961,14 @@ func extractValueInParentheses(input string) (string, error) {
 }
 
 // 获取指标id,根据指标名称判断,没有插入指标生成返回
-func getIndexId(indexCode string, indexName string, classifyId int, lySourceName string, frequency string, unit string) (int, error) {
+func getIndexId(indexCode string, indexName string, classifyId int, sourceName string, frequency string, unit string) (int, error) {
 	// 判断指标是否存在
 	var indexId int
 	indexInfo, err := models.GetLyIndexByCode(indexCode)
 	if err != nil {
+		return indexId, err
+	}
+	if indexInfo == nil {
 		// 新增指标
 		index, err := addLyIndex(classifyId, indexCode, indexName, frequency, unit)
 		if err != nil {