package services import ( "context" "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" "hongze/hongze_cygx/models" "hongze/hongze_cygx/utils" "html" "io/ioutil" nhttp "net/http" "strconv" "strings" "time" ) func GetReportContentSub(content string) (contentSub string, err error) { content = html.UnescapeString(content) doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) if err != nil { fmt.Println("create doc err:", err.Error()) return } n := 0 doc.Find("p").Each(func(i int, s *goquery.Selection) { if n > 3 { return } n++ phtml, err := s.Html() if err != nil { fmt.Println("get html err", err.Error()) return } if s.Text() != "" || strings.Contains(phtml, "src") { contentSub = contentSub + "
" + phtml + "
" } }) return } func GetReportContentTextSub(content string) (contentSub string, err error) { content = html.UnescapeString(content) doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) docText := doc.Text() bodyRune := []rune(docText) bodyRuneLen := len(bodyRune) if bodyRuneLen > 200 { bodyRuneLen = 200 } body := string(bodyRune[:bodyRuneLen]) contentSub = body return } //解析文章内容 func GetArticleAll() { var err error defer func() { if err != nil { fmt.Println("err:", err.Error()) return } }() list, err := models.GetArticleAll() if err != nil { return } for _, v := range list { fmt.Println(v.ArticleId, v.Title) FixArticleContent(v.ArticleId) } } //解析报告 func FixArticleContent(articleId int) { item, err := models.GetArticleDetailById(articleId) if err != nil { fmt.Println("GetArticleDetailById Err:" + err.Error()) return } content := item.Body bodyText, _ := GetReportContentTextSub(content) content = html.UnescapeString(content) content = strings.Replace(content, "http", "https", -1) doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) if err != nil { fmt.Println("create doc err:", err.Error()) return } var expertNumArr []string var expertContentArr []string var interviewDateArr []string doc.Find("p").Each(func(i int, s *goquery.Selection) { contentTxt := s.Text() if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") { interviewDate := s.Next().Text() interviewDateArr = append(interviewDateArr, interviewDate) } if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") { expertContent := s.Next().Text() if expertContent == "" { expertContent = contentTxt } if expertContent != "" { rightIndex := strings.Index(expertContent, ")") if rightIndex == 0 { rightIndex = strings.Index(expertContent, ")") } if rightIndex > 0 { expertNum := expertContent[:rightIndex] expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "专家评价", "", -1) if expertNum != "" { expertNumArr = append(expertNumArr, expertNum) rightIndex = rightIndex expertContentStr := expertContent[rightIndex:] expertContentStr = strings.Replace(expertContentStr, ")", "", -1) expertContentStr = strings.TrimLeft(expertContentStr, ":") expertContentStr = strings.TrimRight(expertContentStr, "(推荐") expertContentArr = append(expertContentArr, expertContentStr) } } } } }) if len(expertContentArr) <= 0 { doc.Find("pre").Each(func(i int, pre *goquery.Selection) { pre.Find("span").Each(func(n int, span *goquery.Selection) { contentTxt := span.Text() if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") { span.Find("span").Each(func(m int, subspan *goquery.Selection) { subspanText := subspan.Text() if strings.Contains(subspanText, "专家评价") { expertContent := subspan.Next().Text() if expertContent != "" { rightIndex := strings.Index(expertContent, ")") if rightIndex == 0 { rightIndex = strings.Index(expertContent, ")") } if rightIndex > 0 { expertNum := expertContent[:rightIndex] expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "专家评价", "", -1) if expertNum != "" { expertNumArr = append(expertNumArr, expertNum) rightIndex = rightIndex expertContentStr := expertContent[rightIndex:] expertContentStr = strings.Replace(expertContentStr, ")", "", -1) expertContentStr = strings.TrimLeft(expertContentStr, ":") expertContentStr = strings.TrimRight(expertContentStr, "(推荐") expertContentArr = append(expertContentArr, expertContentStr) } } } } }) } span.Find("span").Each(func(k int, sspan *goquery.Selection) { sspanText := sspan.Text() if strings.Contains(sspanText, "访谈时间") { sspanText = strings.Replace(sspanText, "#访谈时间:", "", -1) sspanText = strings.Replace(sspanText, "访谈时间:", "", -1) sspanText = strings.Replace(sspanText, "\n", "", -1) sspanText = strings.Replace(sspanText, " ", "", -1) sspanText = strings.Trim(sspanText, " ") sspanText = sspanText[:10] interviewDate := sspanText if interviewDate != "" { interviewDateArr = append(interviewDateArr, interviewDate) } } }) }) }) } if len(expertContentArr) <= 0 { doc.Find("span").Each(func(i int, span *goquery.Selection) { span.Find("strong").Each(func(n int, strong *goquery.Selection) { spanText := span.Text() strongText := strong.Text() if strings.Contains(strongText, "#专家评价") || strings.Contains(strongText, "专家评价") { expertContent := strong.Parents().Text() if expertContent != "" { rightIndex := strings.Index(expertContent, ")") if rightIndex == 0 { rightIndex = strings.Index(expertContent, ")") } if rightIndex > 0 { expertNum := expertContent[:rightIndex] expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "(", "", -1) expertNum = strings.Replace(expertNum, "专家评价", "", -1) expertNum = strings.Replace(expertNum, "#", "", -1) expertNum = strings.Replace(expertNum, ":", "", -1) expertNum = strings.Replace(expertNum, "\n", "", -1) if expertNum != "" { expertNumArr = append(expertNumArr, expertNum) rightIndex = rightIndex expertContentStr := expertContent[rightIndex:] expertContentStr = strings.Replace(expertContentStr, ")", "", -1) expertContentStr = strings.TrimLeft(expertContentStr, ":") expertContentStr = strings.TrimRight(expertContentStr, "(推荐") expertContentArr = append(expertContentArr, expertContentStr) return } } } } if strings.Contains(spanText, "访谈时间") { spanText = strings.Replace(spanText, "#访谈时间:", "", -1) spanText = strings.Replace(spanText, "访谈时间:", "", -1) spanText = strings.Replace(spanText, "\n", "", -1) spanText = strings.Replace(spanText, " ", "", -1) spanText = strings.Trim(spanText, " ") spanText = spanText[:10] interviewDate := spanText if interviewDate != "" { interviewDateArr = append(interviewDateArr, interviewDate) } } }) }) } var expertNumStr, expertContentStr, interviewDateStr string if len(expertNumArr) > 0 { expertNumStr = expertNumArr[0] } if len(expertContentArr) > 0 { expertContentStr = expertContentArr[0] } if len(interviewDateArr) > 0 { interviewDateStr = interviewDateArr[0] } expertNumStr = strings.Replace(expertNumStr, "#:", "", -1) err = models.ModifyArticleExpert(articleId, expertNumStr, expertContentStr, interviewDateStr, bodyText) if err != nil { fmt.Println("ModifyArticleExpert Err:" + err.Error()) return } } func FixArticleImgUrl(body string) (contentSub string, err error) { r := strings.NewReader(string(body)) doc, err := goquery.NewDocumentFromReader(r) if err != nil { fmt.Println(err) } doc.Find("img").Each(func(i int, s *goquery.Selection) { src, _ := s.Attr("src") if i == 0 && src != "" { contentSub = src } }) return } //获取标签里的第一个内容 func FixArticleFirstCount(body string) (contentSub string, err error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(body)) if err != nil { fmt.Println("create doc err:", err.Error()) return } doc.Find("p").Each(func(i int, s *goquery.Selection) { contentTxt := s.Text() fmt.Println(contentTxt) }) return } func GetArticleListByApi(cont context.Context) (err error) { defer func() { if err != nil { fmt.Println("GetArticleListByApi Err:" + err.Error()) go utils.SendEmail(utils.APPNAME+"【"+utils.RunMode+"】"+"失败提醒", "GetArticleListByApi ErrMsg:"+err.Error(), utils.EmailSendToUsers) } }() url := "https://vmp.hzinsights.com/v2api/articles/mp?take=100&skip=0&publish_status=1" method := "GET" client := &nhttp.Client{} req, err := nhttp.NewRequest(method, url, nil) if err != nil { fmt.Println("GetListApi Err:", err.Error()) return err } req.Header.Add("Authorization", "bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkiLCJwaG9uZV9udW1iZXIiOiIxMjM0NTY3ODkiLCJuYW1lIjoi5YW25LuWIiwiZW50cmFuY2UiOiJwYXNzd3dvcmQiLCJpYXQiOjE2MzQ4NzA1OTQsImV4cCI6MTYzNDg3NDE5NH0.tho2L9jsbDPn8ltEGUVDve_nHsh0Kzf6ZrSz0RcZ0ag") res, err := client.Do(req) if err != nil { fmt.Println(err) return err } defer res.Body.Close() body, err := ioutil.ReadAll(res.Body) if err != nil { fmt.Println("Getres.Body Err:", err.Error()) return err } var pdfResult models.ArticleResultApi err = json.Unmarshal(body, &pdfResult) if err != nil { fmt.Println("Getres.pdfResult Err:", err.Error()) return err } exitMap := make(map[int]int) classMap := make(map[int]int) reportMap := make(map[int]int) summaryMap := make(map[int]int) listMap, err := models.GetArticleApiMap() if err != nil { fmt.Println("GetlistMap Err:", err.Error()) return err } //新旧分类 反向隐射,是否归类,是否是报告,是否是纪要库 for _, v := range listMap { exitMap[v.Id] = v.OldId if v.IsClass == 1 { classMap[v.OldId] = 1 } if v.IsReport == 1 { reportMap[v.OldId] = 1 } if v.IsSummary == 1 { summaryMap[v.OldId] = 1 } } listData := pdfResult.Data var list []*models.Tactics2 var listAuthor []*models.CygxArticleAuthor for _, v := range listData { if exitMap[v.SeriesId] > 0 { v.PublishDate = time.Date(v.PublishDate.Year(), v.PublishDate.Month(), v.PublishDate.Day(), v.PublishDate.Hour(), v.PublishDate.Minute(), v.PublishDate.Second(), v.PublishDate.Nanosecond(), time.Local) item := new(models.Tactics2) itemAuthor := new(models.CygxArticleAuthor) item.ArticleId = v.ArticleId item.Title = v.Title item.TitleEn = v.TitleEn if v.Frequency == "日度" { item.UpdateFrequency = "daily" } else if v.Frequency == "周度" { item.UpdateFrequency = "weekly" } else if v.Frequency == "月度" { item.UpdateFrequency = "monthly" } else if v.Frequency == "季度" { item.UpdateFrequency = "quarterly" } else if v.Frequency == "年度" { item.UpdateFrequency = "yearly" } else { item.UpdateFrequency = "unknow" } item.CreateDate = v.CreateDate item.PublishDate = v.PublishDate item.PublishStatus = v.PublishStatus item.Body = v.Content.Body item.Abstract = v.Content.Abstract item.CategoryName = v.Industry.Name item.CategoryId = exitMap[v.SeriesId] item.SubCategoryName = v.Series.Name list = append(list, item) itemAuthor.ArticleId = v.ArticleId itemAuthor.Name = v.Author.Name itemAuthor.Mobile = v.Author.PhoneNumber listAuthor = append(listAuthor, itemAuthor) } } //同步作者 for _, v := range listAuthor { var count int count, err = models.GetActivityAuthorCount(v.ArticleId, v.Mobile) if err != nil { fmt.Println("GetCount Err:", err.Error()) return err } if count == 0 { _, err = models.AddCygxActivityAuthor(v) if err != nil { fmt.Println("AddCygxActivityAuthor Err:", err.Error()) return err } } } fmt.Println("同步文章条数:", len(list)) listCustomArticle, err := models.GetCustomArticleId() //手动归类的文章,不替换文章类型 if err != nil { fmt.Println("GetTacticsList Err:", err.Error()) return err } listGetMatchTypeName, errMatch := models.GetMatchTypeNamenNotNull() //手动归类的文章,不替换文章类型 if errMatch != nil { fmt.Println("GetTacticsList Err:", errMatch.Error()) return err } fmt.Println("list len:", len(list)) noSummaryArticleIds := "3454,3456,3457,3459,2449,2450,2453,2454,2459,2530,2583,2663,2670,2699,2715,2732,2748,2759,2399,2356,2870,3173,2978,2826,3470" //非纪要库类型的文章ID listNoSummaryArticleIds := strings.Split(noSummaryArticleIds, ",") for k, v := range list { //同步匹配类型 matchTypeName := "" for _, vMatch := range listGetMatchTypeName { if v.CategoryId == vMatch.CategoryId { matchTypeName = vMatch.MatchTypeName } } //是否属于纪要库的数据 if _, has := summaryMap[v.CategoryId]; has { v.IsSummary = 1 } //排除不属于纪要库类型的文章 for _, vArt := range listNoSummaryArticleIds { vArtInt, _ := strconv.Atoi(vArt) if v.ArticleId == vArtInt { v.IsSummary = 0 } } if _, has := reportMap[v.CategoryId]; has { v.IsReport = 1 if _, ok := classMap[v.CategoryId]; ok { v.IsClass = 1 v.ReportType = 1 //是否属于行业报告 } else { v.ReportType = 2 //是否属于产业报告 } } v.Department = "弘则权益研究" //判断是否已经存在 if v.ArticleId < 0 { fmt.Println("AddCygxArticle Err:") return err } var count int count, err = models.GetArticleCountById(v.ArticleId) if err != nil && err.Error() != utils.ErrNoRow() { fmt.Println("AddCygxArticle Err:", err.Error()) return err } v.Body = strings.Replace(v.Body, "http://vmp.hzinsights.com", "https://vmp.hzinsights.com", -1) expertNumStr, expertContentStr, interviewDateStr, fileLink, bodyReturn := BodyAnalysis2(v.Body) if strings.Index(v.Body, "报告全文(") > 0 && strings.Index(v.Body, "PDF格式报告下载.pdf") > 0 { v.Body = strings.Replace(v.Body, "报告全文(", "", -1) v.Body = strings.Replace(v.Body, "PDF格式报告下载.pdf", "", -1) v.Body = strings.Replace(v.Body, "):", "", -1) } var titleNew string titleNew = v.Title // 7资金流向 、11大类资产 、51每日复盘 、80医药周报、9估值研究 if v.CategoryId == 7 || v.CategoryId == 11 || v.CategoryId == 51 || v.CategoryId == 9 { if v.UpdateFrequency == "daily" { var daystr string daystr = strconv.Itoa(v.PublishDate.Day()) if len(daystr) == 1 { daystr = "0" + daystr } titleNew = v.Title + "(" + strconv.Itoa(v.PublishDate.Year())[2:len(strconv.Itoa(v.PublishDate.Year()))-0] + v.PublishDate.Format("01") + daystr + ")" } else if v.UpdateFrequency == "weekly" { titleNew = v.Title + utils.WeekByDate(v.PublishDate) } } if v.CategoryId == 80 { titleNew = v.Title + utils.WeekByDate(v.PublishDate) } if count > 0 { fmt.Println(k, v.ArticleId, "edit") var isCustom bool bodyText, _ := GetReportContentTextSub(v.Body) updateParams := make(map[string]interface{}) //updateParams["Title"] = v.Title updateParams["Title"] = titleNew updateParams["TitleEn"] = v.TitleEn updateParams["UpdateFrequency"] = v.UpdateFrequency updateParams["CreateDate"] = v.CreateDate updateParams["PublishDate"] = v.PublishDate //updateParams["Body"] = html.EscapeString(v.Body) updateParams["Body"] = html.EscapeString(bodyReturn) updateParams["BodyText"] = bodyText updateParams["Abstract"] = html.EscapeString(v.Abstract) updateParams["CategoryName"] = v.CategoryName for _, vCustom := range listCustomArticle { if v.ArticleId == vCustom.ArticleId { fmt.Println("手动归类的文章:" + strconv.Itoa(v.ArticleId)) isCustom = true } } if isCustom == false { updateParams["CategoryId"] = v.CategoryId updateParams["MatchTypeName"] = matchTypeName updateParams["IsSummary"] = v.IsSummary updateParams["IsReport"] = v.IsReport updateParams["ReportType"] = v.ReportType updateParams["SubCategoryName"] = v.SubCategoryName } //updateParams["CategoryId"] = v.CategoryId updateParams["PublishStatus"] = v.PublishStatus updateParams["ExpertBackground"] = expertContentStr updateParams["ExpertNumber"] = expertNumStr updateParams["InterviewDate"] = interviewDateStr //updateParams["IsClass"] = v.IsClass v.Department = "弘则权益研究" updateParams["Department"] = v.Department updateParams["FileLink"] = fileLink whereParam := map[string]interface{}{"article_id": v.ArticleId} err = models.UpdateByExpr(models.CygxArticle{}, whereParam, updateParams) if err != nil { fmt.Println("UpdateByExpr Err:" + err.Error()) return err } } else { fmt.Println(k, v.ArticleId, "add") item := new(models.CygxArticle) articleIdInt := v.ArticleId item.ArticleId = articleIdInt //item.Title = v.Title item.Title = titleNew item.TitleEn = v.TitleEn item.UpdateFrequency = v.UpdateFrequency item.CreateDate = v.CreateDate item.PublishDate = v.PublishDate.Format(utils.FormatDateTime) //item.Body = html.EscapeString(v.Body) item.Body = html.EscapeString(bodyReturn) item.Abstract = html.EscapeString(v.Abstract) item.CategoryName = v.CategoryName item.SubCategoryName = v.SubCategoryName item.CategoryId = v.CategoryId item.PublishStatus = v.PublishStatus item.ExpertBackground = expertContentStr item.ExpertNumber = expertNumStr item.InterviewDate = interviewDateStr item.Department = v.Department item.ArticleIdMd5 = utils.MD5(strconv.Itoa(articleIdInt)) item.IsClass = v.IsClass item.IsSummary = v.IsSummary item.IsReport = v.IsReport item.ReportType = v.ReportType item.FileLink = fileLink item.MatchTypeName = matchTypeName _, err = models.AddCygxArticles(item) if err != nil { fmt.Println("AddCygxArticle Err:", err.Error()) return err } } } return } func SynchronizationArtclehistory() { fmt.Println("同步开始") list, err := models.GetArticleHistoryList() if err != nil { fmt.Println("获取列表失败", err) } fmt.Println(len(list)) for _, v := range list { //endDate := v.ModifyTime.Add(+time.Minute * 10).Format(utils.FormatDateTime) //detail, err := models.GetNewArticleHistoryRecordNewpv(v.UserId, v.ArticleId, endDate) //if err != nil && err.Error() != utils.ErrNoRow() { // fmt.Println("获取信息失败", err) //} v.OutType = 1 //fmt.Println(v.Id) //if detail == nil { // _, err = models.AddCygxArticleViewRecordNewpv(v) // if err != nil { // fmt.Println("新增失败", err) // } //} else { // err = models.UpdateCygxArticleViewRecordNewpvList(v, v.StopTime) // if err != nil { // fmt.Println("修改失败", err) // } //} newId, err := models.AddCygxArticleViewRecordNewpv(v) fmt.Println("新增", newId) if err != nil { fmt.Println("新增失败", err) } } fmt.Println("同步结束") }