@@ -121,7 +121,8 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 	}
 	logs.Info("categoryPageURL: %s: %s: %s", product, category, categoryPageURL)
-	var allReportURLs []string
+
+	var allReportURLMap = make(map[string]string)
 	for {
 		var htmlContent string
 		err = chromedp.Run(ctx,
@@ -135,11 +136,16 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 		fmt.Printf("page content: %s\n", htmlContent)

-		reportURLs := extractReportURLs(htmlContent, report)
-		allReportURLs = append(allReportURLs, reportURLs...)
+		reportURLMap := extractReportURLs(htmlContent, report)
+
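+		// Merge this page's URL -> date map into the accumulated map; repeated URLs are simply overwritten.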
+		for key, value := range reportURLMap {
+			allReportURLMap[key] = value
+		}
+
+
-
+
 		var nextPageDisabled bool
 		err = chromedp.Run(ctx,
 			chromedp.Evaluate(`document.querySelector('div.my-page-next').classList.contains('my-page-forbid')`, &nextPageDisabled),
@@ -161,25 +167,64 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 		if err != nil {
 			return err
 		}
+
 	}

-	logs.Info("all report URLs: %s: %s: %v", product, category, allReportURLs)
+	logs.Info("all report URLs: %s: %s: %v", product, category, allReportURLMap)

-	if len(allReportURLs) == 0 {
+	if len(allReportURLMap) == 0 {
 		return fmt.Errorf("no report URL found")
 	}

-	for _, reportURL := range allReportURLs {
-
-		rand := utils.RangeRand(10, 100)
+	for key, value := range allReportURLMap {
+
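+		// Skip this URL if it already has a progress record whose data time is more than 7 days old (treated as already processed).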
+		lyIndexRecord, err := models.GetLyIndexRecordByUrl(key)
+		if err != nil {
+			continue
+		}
+		if lyIndexRecord != nil {
+			toTime, err := utils.StringToTime(lyIndexRecord.DataTime + " 00:00:00")
+			if err != nil {
+				logs.Error("time format conversion error: %s: %s: %s: %s: %v", product, category, report, key, err)
+				continue
+			}
+
+			if time.Now().Sub(toTime) > 7*24*time.Hour {
+				logs.Info("report already processed: %s: %s: %s: %s", product, category, report, key)
+				continue
+			}
+		}
+
+
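+		// Sleep a random number of seconds (RangeRand(20, 100)) between reports to throttle the crawl.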
+		rand := utils.RangeRand(20, 100)
 		fmt.Println(report+";sleep:", strconv.Itoa(int(rand)))
 		time.Sleep(time.Duration(rand) * time.Second)

-		err = processReport(ctx, product, category, reportURL, keywords)
+		err = processReport(ctx, product, category, key, keywords)
 		if err != nil {
-			logs.Error("error processing report: %s: %s: %s: %s: %v", product, category, report, reportURL, err)
+			logs.Error("error processing report: %s: %s: %s: %s: %v", product, category, report, key, err)
+			continue
 		}
+
+		format, err := utils.ConvertTimeFormat(value)
+		if err != nil {
+			logs.Error("time format conversion error: %s, %s, %v: %v", product, category, value, err)
+			continue
+		}
+
+
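+		// Persist the read progress for this URL so later runs can find it via GetLyIndexRecordByUrl and skip it.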
+		recordId, err := models.AddLyIndexRecord(&models.BaseFromLyIndexRecord{
+			CreateTime: utils.GetCurrentTime(),
+			ModifyTime: utils.GetCurrentTime(),
+			Url:        key,
+			DataTime:   format,
+		})
+		if err != nil {
+			logs.Error("failed to record index data read progress: %s, %s, %v: %v", product, category, recordId, err)
+			continue
+		}
+		logs.Info("index data read progress recorded: %s, %s, %v", product, category, recordId)
 	}

 	return nil
@@ -288,8 +333,10 @@ func extractHrefAndText(outerHTML string) (string, string) {
 }

-func extractReportURLs(htmlContent, keyword string) []string {
-	var reportURLs []string
+func extractReportURLs(htmlContent, keyword string) map[string]string {
+
+	var reportURLMap = make(map[string]string)
+	var reportURL string

 	content := htmlContent
@@ -304,13 +351,34 @@ func extractReportURLs(htmlContent, keyword string) []string {
 		urlStartIdx := strings.LastIndex(content[:startIdx], `href="`) + len(`href="`)
 		urlEndIdx := strings.Index(content[urlStartIdx:], `"`) + urlStartIdx
 		if urlStartIdx > 0 && urlEndIdx > urlStartIdx {
-			reportURLs = append(reportURLs, content[urlStartIdx:urlEndIdx])
+			reportURL = content[urlStartIdx:urlEndIdx]
+
 		}

 		content = content[startIdx:]
+
+
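+		// Take the inner <div> text of the following <div class="short_right"> block as the value (date text) stored for this report URL.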
+		divStartIdx := strings.Index(content, `<div class="short_right">`)
+		if divStartIdx != -1 {
+			divStartIdx += len(`<div class="short_right">`)
+			divEndIdx := strings.Index(content[divStartIdx:], `</div>`) + divStartIdx
+			if divEndIdx > divStartIdx {
+				shortRightContent := content[divStartIdx:divEndIdx]
+
+
+				innerDivStartIdx := strings.Index(shortRightContent, `<div>`)
+				if innerDivStartIdx != -1 {
+					innerDivStartIdx += len(`<div>`)
+
+					innerDivContent := shortRightContent[innerDivStartIdx:]
+					fmt.Println("Inner Div Content:", innerDivContent)
+					reportURLMap[reportURL] = innerDivContent
+				}
+			}
+		}
 	}

-	return reportURLs
+	return reportURLMap
 }