123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276 |
- package services
- import (
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "hongze/hongze_cygx/models"
- "html"
- "strings"
- )
- func GetReportContentSub(content string) (contentSub string, err error) {
- content = html.UnescapeString(content)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return
- }
- n := 0
- doc.Find("p").Each(func(i int, s *goquery.Selection) {
- if n > 3 {
- return
- }
- n++
- phtml, err := s.Html()
- if err != nil {
- fmt.Println("get html err", err.Error())
- return
- }
- if s.Text() != "" || strings.Contains(phtml, "src") {
- contentSub = contentSub + "<p>" + phtml + "</p>"
- }
- })
- return
- }
- func GetReportContentTextSub(content string) (contentSub string, err error) {
- content = html.UnescapeString(content)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return
- }
- maxRow := 5
- n := 0
- doc.Find("p").Each(func(i int, s *goquery.Selection) {
- pHtml, _ := s.Html()
- if !strings.Contains(pHtml, "img") && !strings.Contains(pHtml, "table") {
- if n > maxRow {
- return
- }
- text := s.Text()
- if text != "" && !strings.Contains(text, "访谈时间") && !strings.Contains(text, "纪要详情") {
- n++
- contentSub = contentSub + s.Text()
- }
- }
- })
- if contentSub == "" || len(contentSub) < 200 {
- m := 0
- doc.Find("span").Each(func(i int, s *goquery.Selection) {
- spanHtml, _ := s.Html()
- if !strings.Contains(spanHtml, "img") && !strings.Contains(spanHtml, "table") {
- if m > maxRow {
- return
- }
- text := s.Text()
- if text != "" && !strings.Contains(text, "访谈时间") && !strings.Contains(text, "纪要详情") {
- n++
- contentSub = contentSub + s.Text()
- }
- }
- })
- }
- return
- }
- //解析文章内容
- func GetArticleAll() {
- var err error
- defer func() {
- if err != nil {
- fmt.Println("err:", err.Error())
- return
- }
- }()
- list, err := models.GetArticleAll()
- if err != nil {
- return
- }
- for _, v := range list {
- fmt.Println(v.ArticleId, v.Title)
- FixArticleContent(v.ArticleId)
- }
- }
- //解析报告
- func FixArticleContent(articleId int) {
- item, err := models.GetArticleDetailById(articleId)
- if err != nil {
- fmt.Println("GetArticleDetailById Err:" + err.Error())
- return
- }
- content := item.Body
- bodyText, _ := GetReportContentTextSub(content)
- content = html.UnescapeString(content)
- content = strings.Replace(content, "http", "https", -1)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return
- }
- var expertNumArr []string
- var expertContentArr []string
- var interviewDateArr []string
- doc.Find("p").Each(func(i int, s *goquery.Selection) {
- contentTxt := s.Text()
- if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
- interviewDate := s.Next().Text()
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- expertContent := s.Next().Text()
- if expertContent == "" {
- expertContent = contentTxt
- }
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- if rightIndex > 0 {
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- if len(expertContentArr) <= 0 {
- doc.Find("pre").Each(func(i int, pre *goquery.Selection) {
- pre.Find("span").Each(func(n int, span *goquery.Selection) {
- contentTxt := span.Text()
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- span.Find("span").Each(func(m int, subspan *goquery.Selection) {
- subspanText := subspan.Text()
- if strings.Contains(subspanText, "专家评价") {
- expertContent := subspan.Next().Text()
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- if rightIndex > 0 {
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- }
- span.Find("span").Each(func(k int, sspan *goquery.Selection) {
- sspanText := sspan.Text()
- if strings.Contains(sspanText, "访谈时间") {
- sspanText = strings.Replace(sspanText, "#访谈时间:", "", -1)
- sspanText = strings.Replace(sspanText, "访谈时间:", "", -1)
- sspanText = strings.Replace(sspanText, "\n", "", -1)
- sspanText = strings.Replace(sspanText, " ", "", -1)
- sspanText = strings.Trim(sspanText, " ")
- sspanText = sspanText[:10]
- interviewDate := sspanText
- if interviewDate != "" {
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- }
- })
- })
- })
- }
- if len(expertContentArr) <= 0 {
- doc.Find("span").Each(func(i int, span *goquery.Selection) {
- span.Find("strong").Each(func(n int, strong *goquery.Selection) {
- spanText := span.Text()
- strongText := strong.Text()
- if strings.Contains(strongText, "#专家评价") || strings.Contains(strongText, "专家评价") {
- expertContent := strong.Parents().Text()
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- if rightIndex > 0 {
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- expertNum = strings.Replace(expertNum, "#", "", -1)
- expertNum = strings.Replace(expertNum, ":", "", -1)
- expertNum = strings.Replace(expertNum, "\n", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- expertContentArr = append(expertContentArr, expertContentStr)
- return
- }
- }
- }
- }
- if strings.Contains(spanText, "访谈时间") {
- spanText = strings.Replace(spanText, "#访谈时间:", "", -1)
- spanText = strings.Replace(spanText, "访谈时间:", "", -1)
- spanText = strings.Replace(spanText, "\n", "", -1)
- spanText = strings.Replace(spanText, " ", "", -1)
- spanText = strings.Trim(spanText, " ")
- spanText = spanText[:10]
- interviewDate := spanText
- if interviewDate != "" {
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- }
- })
- })
- }
- var expertNumStr, expertContentStr, interviewDateStr string
- if len(expertNumArr) > 0 {
- expertNumStr = expertNumArr[0]
- }
- if len(expertContentArr) > 0 {
- expertContentStr = expertContentArr[0]
- }
- if len(interviewDateArr) > 0 {
- interviewDateStr = interviewDateArr[0]
- }
- expertNumStr = strings.Replace(expertNumStr, "#:", "", -1)
- err = models.ModifyArticleExpert(articleId, expertNumStr, expertContentStr, interviewDateStr, bodyText)
- if err != nil {
- fmt.Println("ModifyArticleExpert Err:" + err.Error())
- return
- }
- }
|