8 月之前 · dc13f45246
--- a/cmd/commodity_liangyou.go
+++ b/cmd/commodity_liangyou.go
@@ -9,6 +9,8 @@ import (
 
				 	"github.com/beego/beego/v2/core/logs"
			
 
				 	"github.com/chromedp/cdproto/cdp"
			
 
				 	"os"
			
 
				+	"regexp"
			
 
				+	"strconv"
			
 
				 	"strings"
			
 
				 	"time"
			
 
				 
			
@@ -146,10 +148,9 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 
				 		allReportURLs = append(allReportURLs, reportURLs...)
			
 
				 
			
 
				 		// 随机睡眠
			
 
				-		// todo 上线放开
			
 
				-		/*rand := utils.RangeRand(10, 100)
			
 
				+		rand := utils.RangeRand(10, 100)
			
 
				 		fmt.Println(report+";sleep:", strconv.Itoa(int(rand)))
			
 
				-		time.Sleep(time.Duration(rand) * time.Second)*/
			
 
				+		time.Sleep(time.Duration(rand) * time.Second)
			
 
				 
			
 
				 		// todo 测试环境跑部分数据，上线放开
			
 
				 		break
			
@@ -197,7 +198,8 @@ func fetchReportData(ctx context.Context, product, category, report string, keyw
 
				 }
			
 
				 
			
 
				 func fillProductPageURL(ctx context.Context, product string, category string) (string, error) {
			
 
				-	selector := `//dl[contains(@class, 'dl_hot')]`
			
 
				+	// 选择 dl 标签下所有 a 标签的 XPath
			
 
				+	selector := `//dl[contains(@class, 'dl_hot')]//a`
			
 
				 	logs.Info("选择器表达式: %s", selector)
			
 
				 
			
 
				 	var nodes []*cdp.Node
			
@@ -206,26 +208,60 @@ func fillProductPageURL(ctx context.Context, product string, category string) (s
 
				 	// 获取 dl 标签下的所有 a 标签节点
			
 
				 	err := chromedp.Run(ctx,
			
 
				 		chromedp.WaitReady(selector, chromedp.BySearch),
			
 
				-		chromedp.Nodes(selector+"//a", &nodes, chromedp.BySearch),
			
 
				+		chromedp.Nodes(selector, &nodes, chromedp.BySearch),
			
 
				 	)
			
 
				 	if err != nil {
			
 
				 		return "", err
			
 
				 	}
			
 
				 
			
 
				-	// 提取并打印所有 a 标签的文本内容
			
 
				+	// 提取并打印所有 a 标签的 OuterHTML
			
 
				+	var targetURL string
			
 
				 	for _, node := range nodes {
			
 
				-		var linkText string
			
 
				+		var outerHTML string
			
 
				+
			
 
				+		// 获取 a 标签的 OuterHTML
			
 
				 		err = chromedp.Run(ctx,
			
 
				-			chromedp.Text(node.FullXPath(), &linkText),
			
 
				+			chromedp.OuterHTML(node.FullXPath(), &outerHTML, chromedp.BySearch),
			
 
				 		)
			
 
				 		if err != nil {
			
 
				 			return "", err
			
 
				 		}
			
 
				-		logs.Info("Link Text: %s", linkText)
			
 
				+
			
 
				+		// 打印获取的 OuterHTML 内容
			
 
				+		logs.Info("Link OuterHTML: %s", outerHTML)
			
 
				+
			
 
				+		// 从 OuterHTML 中提取 href 和文本内容
			
 
				+		// 使用正则或字符串处理提取 href 和文本内容
			
 
				+		href, linkText := extractHrefAndText(outerHTML)
			
 
				+
			
 
				+		// 打印提取的 href 和文本内容
			
 
				+		logs.Info("Link Text: %s, Href: %s", linkText, href)
			
 
				+
			
 
				+		// 如果文本内容匹配目标产品
			
 
				+		if linkText == product {
			
 
				+			// 拼接完整的 URL
			
 
				+			/*if !strings.HasPrefix(href, "http") {
			
 
				+				href = lyLoginPath + href
			
 
				+			}*/
			
 
				+			targetURL = href
			
 
				+			break
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if targetURL == "" {
			
 
				+		return "", fmt.Errorf("未找到匹配的产品链接")
			
 
				+	}
			
 
				+
			
 
				+	// 显示更多内容
			
 
				+	err = chromedp.Run(ctx,
			
 
				+		chromedp.Evaluate(`document.getElementById("moreSpeList").style.display = "block";`, nil),
			
 
				+	)
			
 
				+	if err != nil {
			
 
				+		return "", err
			
 
				 	}
			
 
				 
			
 
				 	// 点击目标产品的链接
			
 
				-	clickSelector := fmt.Sprintf(`//dl[contains(@class, 'dl_hot')]//a[text()='%s']`, product)
			
 
				+	clickSelector := fmt.Sprintf(`//a[@href='%s']`, targetURL)
			
 
				 	err = chromedp.Run(ctx,
			
 
				 		chromedp.WaitReady(clickSelector, chromedp.BySearch),
			
 
				 		chromedp.Click(clickSelector, chromedp.BySearch),
			
@@ -241,6 +277,28 @@ func fillProductPageURL(ctx context.Context, product string, category string) (s
 
				 	return productPageURL, nil
			
 
				 }
			
 
				 
			
 
				// Patterns and entity table used by extractHrefAndText. They are built once
				// at package scope because the helper is called for every anchor on a page.
				var (
					// anchorHrefRe matches a standalone href attribute with a double- or
					// single-quoted value. The leading \s keeps attributes that merely end in
					// "href" (for example data-href) from matching.
					anchorHrefRe = regexp.MustCompile(`(?i)\shref\s*=\s*(?:"([^"]*)"|'([^']*)')`)

					// anchorTextRe matches one run of character data between two tags.
					anchorTextRe = regexp.MustCompile(`>([^<]+)<`)

					// outerHTMLEntities decodes the character references expected in
					// serialized markup. A Replacer works in a single pass, so "&amp;lt;"
					// becomes "&lt;" and is not decoded twice.
					outerHTMLEntities = strings.NewReplacer(
						"&amp;", "&",
						"&lt;", "<",
						"&gt;", ">",
						"&quot;", `"`,
						"&nbsp;", "\u00a0",
					)
				)

				// extractHrefAndText pulls the href attribute value and the visible text out
				// of the serialized markup of a single <a> element.
				//
				// The returned href is entity-decoded so it equals the attribute value as it
				// exists in the DOM (e.g. "?a=1&amp;b=2" becomes "?a=1&b=2"). The returned
				// text is the first non-blank run of character data, entity-decoded and
				// trimmed of surrounding whitespace. Either result is "" when the
				// corresponding part is not present.
				//
				// This is a lightweight regexp-based extraction, not an HTML parser: it is
				// intended for small anchor fragments only.
				func extractHrefAndText(outerHTML string) (string, string) {
					href := ""
					if m := anchorHrefRe.FindStringSubmatch(outerHTML); m != nil {
						// Only one of the two quote-style groups participates in a match,
						// the other is empty, so concatenation selects the matched value.
						href = outerHTMLEntities.Replace(m[1] + m[2])
					}

					linkText := ""
					for _, m := range anchorTextRe.FindAllStringSubmatch(outerHTML, -1) {
						// Skip whitespace-only runs such as the indentation that precedes a
						// nested element inside the anchor.
						if text := strings.TrimSpace(outerHTMLEntities.Replace(m[1])); text != "" {
							linkText = text
							break
						}
					}

					return href, linkText
				}
			
 
				+
			
 
				 // Extract report URLs from the HTML content
			
 
				 func extractReportURLs(htmlContent, keyword string) []string {
			
 
				 	var reportURLs []string