article.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. package services
  2. import (
  3. "fmt"
  4. "github.com/PuerkitoBio/goquery"
  5. "hongze/hongze_cygx/models"
  6. "html"
  7. "strings"
  8. )
  9. func GetReportContentSub(content string) (contentSub string, err error) {
  10. content = html.UnescapeString(content)
  11. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  12. if err != nil {
  13. fmt.Println("create doc err:", err.Error())
  14. return
  15. }
  16. n := 0
  17. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  18. if n > 3 {
  19. return
  20. }
  21. n++
  22. phtml, err := s.Html()
  23. if err != nil {
  24. fmt.Println("get html err", err.Error())
  25. return
  26. }
  27. if s.Text() != "" || strings.Contains(phtml, "src") {
  28. contentSub = contentSub + "<p>" + phtml + "</p>"
  29. }
  30. })
  31. return
  32. }
  33. func GetReportContentTextSub(content string) (contentSub string, err error) {
  34. content = html.UnescapeString(content)
  35. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  36. //if err != nil {
  37. // fmt.Println("create doc err:", err.Error())
  38. // return
  39. //}
  40. //doc.Find("p").Each(func(i int, s *goquery.Selection) {
  41. // pHtml, _ := s.Html()
  42. // if strings.Contains(pHtml, "img") || strings.Contains(pHtml, "table") {
  43. // s.Remove()
  44. // }
  45. //})
  46. //if contentSub == "" || len(contentSub) < 200 {
  47. // //m := 0
  48. // doc.Find("span").Each(func(i int, s *goquery.Selection) {
  49. // spanHtml, _ := s.Html()
  50. // if strings.Contains(spanHtml, "img") || strings.Contains(spanHtml, "table") {
  51. // s.Remove()
  52. // }
  53. // })
  54. //}
  55. docText := doc.Text()
  56. bodyRune := []rune(docText)
  57. bodyRuneLen := len(bodyRune)
  58. if bodyRuneLen > 200 {
  59. bodyRuneLen = 200
  60. }
  61. body := string(bodyRune[:bodyRuneLen])
  62. contentSub = body
  63. return
  64. }
  65. //解析文章内容
  66. func GetArticleAll() {
  67. var err error
  68. defer func() {
  69. if err != nil {
  70. fmt.Println("err:", err.Error())
  71. return
  72. }
  73. }()
  74. list, err := models.GetArticleAll()
  75. if err != nil {
  76. return
  77. }
  78. for _, v := range list {
  79. fmt.Println(v.ArticleId, v.Title)
  80. FixArticleContent(v.ArticleId)
  81. }
  82. }
  83. //解析报告
  84. func FixArticleContent(articleId int) {
  85. item, err := models.GetArticleDetailById(articleId)
  86. if err != nil {
  87. fmt.Println("GetArticleDetailById Err:" + err.Error())
  88. return
  89. }
  90. content := item.Body
  91. bodyText, _ := GetReportContentTextSub(content)
  92. content = html.UnescapeString(content)
  93. content = strings.Replace(content, "http", "https", -1)
  94. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  95. if err != nil {
  96. fmt.Println("create doc err:", err.Error())
  97. return
  98. }
  99. var expertNumArr []string
  100. var expertContentArr []string
  101. var interviewDateArr []string
  102. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  103. contentTxt := s.Text()
  104. if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
  105. interviewDate := s.Next().Text()
  106. interviewDateArr = append(interviewDateArr, interviewDate)
  107. }
  108. if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
  109. expertContent := s.Next().Text()
  110. if expertContent == "" {
  111. expertContent = contentTxt
  112. }
  113. if expertContent != "" {
  114. rightIndex := strings.Index(expertContent, ")")
  115. if rightIndex == 0 {
  116. rightIndex = strings.Index(expertContent, ")")
  117. }
  118. if rightIndex > 0 {
  119. expertNum := expertContent[:rightIndex]
  120. expertNum = strings.Replace(expertNum, "(", "", -1)
  121. expertNum = strings.Replace(expertNum, "(", "", -1)
  122. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  123. if expertNum != "" {
  124. expertNumArr = append(expertNumArr, expertNum)
  125. rightIndex = rightIndex
  126. expertContentStr := expertContent[rightIndex:]
  127. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  128. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  129. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  130. expertContentArr = append(expertContentArr, expertContentStr)
  131. }
  132. }
  133. }
  134. }
  135. })
  136. if len(expertContentArr) <= 0 {
  137. doc.Find("pre").Each(func(i int, pre *goquery.Selection) {
  138. pre.Find("span").Each(func(n int, span *goquery.Selection) {
  139. contentTxt := span.Text()
  140. if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
  141. span.Find("span").Each(func(m int, subspan *goquery.Selection) {
  142. subspanText := subspan.Text()
  143. if strings.Contains(subspanText, "专家评价") {
  144. expertContent := subspan.Next().Text()
  145. if expertContent != "" {
  146. rightIndex := strings.Index(expertContent, ")")
  147. if rightIndex == 0 {
  148. rightIndex = strings.Index(expertContent, ")")
  149. }
  150. if rightIndex > 0 {
  151. expertNum := expertContent[:rightIndex]
  152. expertNum = strings.Replace(expertNum, "(", "", -1)
  153. expertNum = strings.Replace(expertNum, "(", "", -1)
  154. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  155. if expertNum != "" {
  156. expertNumArr = append(expertNumArr, expertNum)
  157. rightIndex = rightIndex
  158. expertContentStr := expertContent[rightIndex:]
  159. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  160. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  161. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  162. expertContentArr = append(expertContentArr, expertContentStr)
  163. }
  164. }
  165. }
  166. }
  167. })
  168. }
  169. span.Find("span").Each(func(k int, sspan *goquery.Selection) {
  170. sspanText := sspan.Text()
  171. if strings.Contains(sspanText, "访谈时间") {
  172. sspanText = strings.Replace(sspanText, "#访谈时间:", "", -1)
  173. sspanText = strings.Replace(sspanText, "访谈时间:", "", -1)
  174. sspanText = strings.Replace(sspanText, "\n", "", -1)
  175. sspanText = strings.Replace(sspanText, " ", "", -1)
  176. sspanText = strings.Trim(sspanText, " ")
  177. sspanText = sspanText[:10]
  178. interviewDate := sspanText
  179. if interviewDate != "" {
  180. interviewDateArr = append(interviewDateArr, interviewDate)
  181. }
  182. }
  183. })
  184. })
  185. })
  186. }
  187. if len(expertContentArr) <= 0 {
  188. doc.Find("span").Each(func(i int, span *goquery.Selection) {
  189. span.Find("strong").Each(func(n int, strong *goquery.Selection) {
  190. spanText := span.Text()
  191. strongText := strong.Text()
  192. if strings.Contains(strongText, "#专家评价") || strings.Contains(strongText, "专家评价") {
  193. expertContent := strong.Parents().Text()
  194. if expertContent != "" {
  195. rightIndex := strings.Index(expertContent, ")")
  196. if rightIndex == 0 {
  197. rightIndex = strings.Index(expertContent, ")")
  198. }
  199. if rightIndex > 0 {
  200. expertNum := expertContent[:rightIndex]
  201. expertNum = strings.Replace(expertNum, "(", "", -1)
  202. expertNum = strings.Replace(expertNum, "(", "", -1)
  203. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  204. expertNum = strings.Replace(expertNum, "#", "", -1)
  205. expertNum = strings.Replace(expertNum, ":", "", -1)
  206. expertNum = strings.Replace(expertNum, "\n", "", -1)
  207. if expertNum != "" {
  208. expertNumArr = append(expertNumArr, expertNum)
  209. rightIndex = rightIndex
  210. expertContentStr := expertContent[rightIndex:]
  211. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  212. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  213. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  214. expertContentArr = append(expertContentArr, expertContentStr)
  215. return
  216. }
  217. }
  218. }
  219. }
  220. if strings.Contains(spanText, "访谈时间") {
  221. spanText = strings.Replace(spanText, "#访谈时间:", "", -1)
  222. spanText = strings.Replace(spanText, "访谈时间:", "", -1)
  223. spanText = strings.Replace(spanText, "\n", "", -1)
  224. spanText = strings.Replace(spanText, " ", "", -1)
  225. spanText = strings.Trim(spanText, " ")
  226. spanText = spanText[:10]
  227. interviewDate := spanText
  228. if interviewDate != "" {
  229. interviewDateArr = append(interviewDateArr, interviewDate)
  230. }
  231. }
  232. })
  233. })
  234. }
  235. var expertNumStr, expertContentStr, interviewDateStr string
  236. if len(expertNumArr) > 0 {
  237. expertNumStr = expertNumArr[0]
  238. }
  239. if len(expertContentArr) > 0 {
  240. expertContentStr = expertContentArr[0]
  241. }
  242. if len(interviewDateArr) > 0 {
  243. interviewDateStr = interviewDateArr[0]
  244. }
  245. expertNumStr = strings.Replace(expertNumStr, "#:", "", -1)
  246. err = models.ModifyArticleExpert(articleId, expertNumStr, expertContentStr, interviewDateStr, bodyText)
  247. if err != nil {
  248. fmt.Println("ModifyArticleExpert Err:" + err.Error())
  249. return
  250. }
  251. }