|
@@ -8,7 +8,6 @@ import (
|
|
|
"encoding/json"
|
|
|
"fmt"
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
- "golang.org/x/net/html"
|
|
|
"image"
|
|
|
"image/png"
|
|
|
"math"
|
|
@@ -919,17 +918,14 @@ func FindArticleImgUrls(body string) (imgUrls []string, err error) {
|
|
|
return
|
|
|
}
|
|
|
|
|
|
-//提取纯文本
|
|
|
-func ExtractText(node *html.Node) string {
|
|
|
- var texts []string
|
|
|
-
|
|
|
- if node.Type == html.TextNode {
|
|
|
- texts = append(texts, node.Data)
|
|
|
- }
|
|
|
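+// ExtractText strips the opening <section style...> and <span style...> tags from body; closing tags and all other markup are kept.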
|
|
|
+func ExtractText(body string) (result string, err error) {
|
|
|
+ // Use a regular expression to drop opening <section style...> tags (only the tag itself, up to its first '>').
|
|
|
+ re := regexp.MustCompile(`<section style[\s\S]*?>`)
|
|
|
+ result = re.ReplaceAllString(body, "")
|
|
|
|
|
|
- for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
- texts = append(texts, ExtractText(child))
|
|
|
- }
|
|
|
+ re = regexp.MustCompile(`<span style[\s\S]*?>`)
|
|
|
+ result = re.ReplaceAllString(result, "")
|
|
|
|
|
|
- return strings.Join(texts, "")
|
|
|
+ return
|
|
|
}
|