ziwen 1 年之前
父节点
当前提交
4a2a7c5ab3
共有 2 个文件被更改,包括 12 次插入和 15 次删除
  1. 4 3
      controllers/yanxuan_special.go
  2. 8 12
      utils/common.go

+ 4 - 3
controllers/yanxuan_special.go

@@ -2,7 +2,6 @@ package controllers
 
 import (
 	"encoding/json"
-	"golang.org/x/net/html"
 	"hongze/hongze_cygx/models"
 	"hongze/hongze_cygx/services"
 	"hongze/hongze_cygx/utils"
@@ -85,8 +84,10 @@ func (this *YanxuanSpecialController) List() {
 		}
 		v.Content = utils.ArticleRemoveImgUrl(v.Content)
 
-		doc, _ := html.Parse(strings.NewReader(v.Content))
-		v.Content = utils.ExtractText(doc)
+		v.Content, err = utils.ExtractText(v.Content)
+		if err != nil {
+			return
+		}
 
 		if v.DocUrl != "" {
 			var docs []models.Doc

+ 8 - 12
utils/common.go

@@ -8,7 +8,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
-	"golang.org/x/net/html"
 	"image"
 	"image/png"
 	"math"
@@ -919,17 +918,14 @@ func FindArticleImgUrls(body string) (imgUrls []string, err error) {
 	return
 }
 
-//提取纯文本
-func ExtractText(node *html.Node) string {
-	var texts []string
-
-	if node.Type == html.TextNode {
-		texts = append(texts, node.Data)
-	}
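+// ExtractText strips the opening <section style...> and <span style...> tags from body; the returned error is always nil.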
+func ExtractText(body string) (result string, err error) {
+	// Remove opening <section style...> tags with a regular expression (closing </section> tags are left in place).
+	re := regexp.MustCompile(`<section style[\s\S]*?>`)
+	result = re.ReplaceAllString(body, "")
 
-	for child := node.FirstChild; child != nil; child = child.NextSibling {
-		texts = append(texts, ExtractText(child))
-	}
+	re = regexp.MustCompile(`<span style[\s\S]*?>`)
+	result = re.ReplaceAllString(result, "")
 
-	return strings.Join(texts, "")
+	return
 }