article.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. package services
  2. import (
  3. "fmt"
  4. "github.com/PuerkitoBio/goquery"
  5. "hongze/hongze_cygx/models"
  6. "html"
  7. "strings"
  8. )
  9. func GetReportContentSub(content string) (contentSub string, err error) {
  10. content = html.UnescapeString(content)
  11. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  12. if err != nil {
  13. fmt.Println("create doc err:", err.Error())
  14. return
  15. }
  16. n := 0
  17. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  18. if n > 3 {
  19. return
  20. }
  21. n++
  22. phtml, err := s.Html()
  23. if err != nil {
  24. fmt.Println("get html err", err.Error())
  25. return
  26. }
  27. if s.Text() != "" || strings.Contains(phtml, "src") {
  28. contentSub = contentSub + "<p>" + phtml + "</p>"
  29. }
  30. })
  31. return
  32. }
  33. func GetReportContentTextSub(content string) (contentSub string, err error) {
  34. content = html.UnescapeString(content)
  35. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  36. if err != nil {
  37. fmt.Println("create doc err:", err.Error())
  38. return
  39. }
  40. maxRow := 5
  41. n := 0
  42. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  43. pHtml, _ := s.Html()
  44. if !strings.Contains(pHtml, "img") && !strings.Contains(pHtml, "table") {
  45. if n > maxRow {
  46. return
  47. }
  48. text := s.Text()
  49. if text != "" && !strings.Contains(text, "访谈时间") && !strings.Contains(text, "纪要详情") {
  50. n++
  51. contentSub = contentSub + s.Text()
  52. }
  53. }
  54. })
  55. if contentSub == "" || len(contentSub) < 200 {
  56. m := 0
  57. doc.Find("span").Each(func(i int, s *goquery.Selection) {
  58. spanHtml, _ := s.Html()
  59. if !strings.Contains(spanHtml, "img") && !strings.Contains(spanHtml, "table") {
  60. if m > maxRow {
  61. return
  62. }
  63. text := s.Text()
  64. if text != "" && !strings.Contains(text, "访谈时间") && !strings.Contains(text, "纪要详情") {
  65. n++
  66. contentSub = contentSub + s.Text()
  67. }
  68. }
  69. })
  70. }
  71. return
  72. }
  73. //解析文章内容
  74. func GetArticleAll() {
  75. var err error
  76. defer func() {
  77. if err != nil {
  78. fmt.Println("err:", err.Error())
  79. return
  80. }
  81. }()
  82. list, err := models.GetArticleAll()
  83. if err != nil {
  84. return
  85. }
  86. for _, v := range list {
  87. fmt.Println(v.ArticleId, v.Title)
  88. FixArticleContent(v.ArticleId)
  89. }
  90. }
  91. //解析报告
  92. func FixArticleContent(articleId int) {
  93. item, err := models.GetArticleDetailById(articleId)
  94. if err != nil {
  95. fmt.Println("GetArticleDetailById Err:" + err.Error())
  96. return
  97. }
  98. content := item.Body
  99. bodyText, _ := GetReportContentTextSub(content)
  100. content = html.UnescapeString(content)
  101. content = strings.Replace(content, "http", "https", -1)
  102. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  103. if err != nil {
  104. fmt.Println("create doc err:", err.Error())
  105. return
  106. }
  107. var expertNumArr []string
  108. var expertContentArr []string
  109. var interviewDateArr []string
  110. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  111. contentTxt := s.Text()
  112. if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
  113. interviewDate := s.Next().Text()
  114. interviewDateArr = append(interviewDateArr, interviewDate)
  115. }
  116. if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
  117. expertContent := s.Next().Text()
  118. if expertContent == "" {
  119. expertContent = contentTxt
  120. }
  121. if expertContent != "" {
  122. rightIndex := strings.Index(expertContent, ")")
  123. if rightIndex == 0 {
  124. rightIndex = strings.Index(expertContent, ")")
  125. }
  126. if rightIndex > 0 {
  127. expertNum := expertContent[:rightIndex]
  128. expertNum = strings.Replace(expertNum, "(", "", -1)
  129. expertNum = strings.Replace(expertNum, "(", "", -1)
  130. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  131. if expertNum != "" {
  132. expertNumArr = append(expertNumArr, expertNum)
  133. rightIndex = rightIndex
  134. expertContentStr := expertContent[rightIndex:]
  135. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  136. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  137. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  138. expertContentArr = append(expertContentArr, expertContentStr)
  139. }
  140. }
  141. }
  142. }
  143. })
  144. if len(expertContentArr) <= 0 {
  145. doc.Find("pre").Each(func(i int, pre *goquery.Selection) {
  146. pre.Find("span").Each(func(n int, span *goquery.Selection) {
  147. contentTxt := span.Text()
  148. if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
  149. span.Find("span").Each(func(m int, subspan *goquery.Selection) {
  150. subspanText := subspan.Text()
  151. if strings.Contains(subspanText, "专家评价") {
  152. expertContent := subspan.Next().Text()
  153. if expertContent != "" {
  154. rightIndex := strings.Index(expertContent, ")")
  155. if rightIndex == 0 {
  156. rightIndex = strings.Index(expertContent, ")")
  157. }
  158. if rightIndex > 0 {
  159. expertNum := expertContent[:rightIndex]
  160. expertNum = strings.Replace(expertNum, "(", "", -1)
  161. expertNum = strings.Replace(expertNum, "(", "", -1)
  162. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  163. if expertNum != "" {
  164. expertNumArr = append(expertNumArr, expertNum)
  165. rightIndex = rightIndex
  166. expertContentStr := expertContent[rightIndex:]
  167. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  168. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  169. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  170. expertContentArr = append(expertContentArr, expertContentStr)
  171. }
  172. }
  173. }
  174. }
  175. })
  176. }
  177. span.Find("span").Each(func(k int, sspan *goquery.Selection) {
  178. sspanText := sspan.Text()
  179. if strings.Contains(sspanText, "访谈时间") {
  180. sspanText = strings.Replace(sspanText, "#访谈时间:", "", -1)
  181. sspanText = strings.Replace(sspanText, "访谈时间:", "", -1)
  182. sspanText = strings.Replace(sspanText, "\n", "", -1)
  183. sspanText = strings.Replace(sspanText, " ", "", -1)
  184. sspanText = strings.Trim(sspanText, " ")
  185. sspanText = sspanText[:10]
  186. interviewDate := sspanText
  187. if interviewDate != "" {
  188. interviewDateArr = append(interviewDateArr, interviewDate)
  189. }
  190. }
  191. })
  192. })
  193. })
  194. }
  195. if len(expertContentArr) <= 0 {
  196. doc.Find("span").Each(func(i int, span *goquery.Selection) {
  197. span.Find("strong").Each(func(n int, strong *goquery.Selection) {
  198. spanText := span.Text()
  199. strongText := strong.Text()
  200. if strings.Contains(strongText, "#专家评价") || strings.Contains(strongText, "专家评价") {
  201. expertContent := strong.Parents().Text()
  202. if expertContent != "" {
  203. rightIndex := strings.Index(expertContent, ")")
  204. if rightIndex == 0 {
  205. rightIndex = strings.Index(expertContent, ")")
  206. }
  207. if rightIndex > 0 {
  208. expertNum := expertContent[:rightIndex]
  209. expertNum = strings.Replace(expertNum, "(", "", -1)
  210. expertNum = strings.Replace(expertNum, "(", "", -1)
  211. expertNum = strings.Replace(expertNum, "专家评价", "", -1)
  212. expertNum = strings.Replace(expertNum, "#", "", -1)
  213. expertNum = strings.Replace(expertNum, ":", "", -1)
  214. expertNum = strings.Replace(expertNum, "\n", "", -1)
  215. if expertNum != "" {
  216. expertNumArr = append(expertNumArr, expertNum)
  217. rightIndex = rightIndex
  218. expertContentStr := expertContent[rightIndex:]
  219. expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
  220. expertContentStr = strings.TrimLeft(expertContentStr, ":")
  221. expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
  222. expertContentArr = append(expertContentArr, expertContentStr)
  223. return
  224. }
  225. }
  226. }
  227. }
  228. if strings.Contains(spanText, "访谈时间") {
  229. spanText = strings.Replace(spanText, "#访谈时间:", "", -1)
  230. spanText = strings.Replace(spanText, "访谈时间:", "", -1)
  231. spanText = strings.Replace(spanText, "\n", "", -1)
  232. spanText = strings.Replace(spanText, " ", "", -1)
  233. spanText = strings.Trim(spanText, " ")
  234. spanText = spanText[:10]
  235. interviewDate := spanText
  236. if interviewDate != "" {
  237. interviewDateArr = append(interviewDateArr, interviewDate)
  238. }
  239. }
  240. })
  241. })
  242. }
  243. var expertNumStr, expertContentStr, interviewDateStr string
  244. if len(expertNumArr) > 0 {
  245. expertNumStr = expertNumArr[0]
  246. }
  247. if len(expertContentArr) > 0 {
  248. expertContentStr = expertContentArr[0]
  249. }
  250. if len(interviewDateArr) > 0 {
  251. interviewDateStr = interviewDateArr[0]
  252. }
  253. expertNumStr = strings.Replace(expertNumStr, "#:", "", -1)
  254. err = models.ModifyArticleExpert(articleId, expertNumStr, expertContentStr, interviewDateStr, bodyText)
  255. if err != nil {
  256. fmt.Println("ModifyArticleExpert Err:" + err.Error())
  257. return
  258. }
  259. }