123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769 |
- package services
- import (
- "context"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "hongze/hongze_cygx/models"
- "hongze/hongze_cygx/utils"
- "html"
- "regexp"
- "strconv"
- "strings"
- "time"
- )
- // 同步策略文章
- func SyncTacticsList() (err error) {
- defer func() {
- if err != nil {
- fmt.Println("同步失败,Err:", err.Error())
- }
- }()
- fmt.Println("同步数据")
- indexName := utils.IndexName
- endDate := time.Now().AddDate(0, 0, -30).Format(utils.FormatDate)
- list, err := models.GetTacticsList2(endDate)
- //list, err := models.GetTacticsListAll()
- if err != nil {
- fmt.Println("GetTacticsList Err:", err.Error())
- return
- }
- fmt.Println("list len:", len(list))
- for k, v := range list {
- v.Department = "弘则权益研究"
- fmt.Println(k, v.ArticleId)
- //
- //publishDate, err := time.Parse(utils.FormatDateTime, v.PublishDate)
- //if err != nil {
- // fmt.Println("time.Parse:", err.Error())
- // return err
- //}
- //fmt.Println(publishDate)
- hh, _ := time.ParseDuration("8h")
- //pDate := publishDate.Add(hh)
- v.PublishDate = v.PublishDate.Add(hh)
- //判断是否已经存在
- if v.ArticleId < 0 {
- fmt.Println("参数错误")
- return err
- }
- count, err := models.GetArticleCountById(v.ArticleId)
- if err != nil && err.Error() != utils.ErrNoRow() {
- return err
- }
- v.Body = strings.Replace(v.Body, "http://vmp.hzinsights.com", "https://vmp.hzinsights.com", -1)
- expertNumStr, expertContentStr, interviewDateStr := BodyAnalysis(v.Body)
- if count > 0 {
- bodyText, _ := GetReportContentTextSub(v.Body)
- updateParams := make(map[string]interface{})
- updateParams["Title"] = v.Title
- updateParams["TitleEn"] = v.TitleEn
- updateParams["UpdateFrequency"] = v.UpdateFrequency
- updateParams["CreateDate"] = v.CreateDate
- updateParams["PublishDate"] = v.PublishDate
- updateParams["Body"] = html.EscapeString(v.Body)
- updateParams["BodyText"] = bodyText
- updateParams["Abstract"] = html.EscapeString(v.Abstract)
- updateParams["CategoryName"] = v.CategoryName
- updateParams["SubCategoryName"] = v.SubCategoryName
- updateParams["CategoryId"] = v.CategoryId
- updateParams["PublishStatus"] = v.PublishStatus
- updateParams["ExpertBackground"] = expertContentStr
- updateParams["ExpertNumber"] = expertNumStr
- updateParams["InterviewDate"] = interviewDateStr
- if v.Department != "弘则权益研究" {
- v.Department = "弘则权益研究"
- }
- updateParams["Department"] = v.Department
- whereParam := map[string]interface{}{"article_id": v.ArticleId}
- err = models.UpdateByExpr(models.CygxArticle{}, whereParam, updateParams)
- if err != nil {
- fmt.Println("UpdateByExpr Err:" + err.Error())
- }
- } else {
- fmt.Println(k, v.ArticleId, "add")
- item := new(models.CygxArticle)
- articleIdInt := v.ArticleId
- item.ArticleId = articleIdInt
- item.Title = v.Title
- item.TitleEn = v.TitleEn
- item.UpdateFrequency = v.UpdateFrequency
- item.CreateDate = v.CreateDate
- item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- item.Body = html.EscapeString(v.Body)
- item.Abstract = html.EscapeString(v.Abstract)
- item.CategoryName = v.CategoryName
- item.SubCategoryName = v.SubCategoryName
- item.CategoryId = v.CategoryId
- item.PublishStatus = v.PublishStatus
- item.ExpertBackground = expertContentStr
- item.ExpertNumber = expertNumStr
- item.InterviewDate = interviewDateStr
- item.Department = v.Department
- item.ArticleIdMd5 = utils.MD5(strconv.Itoa(articleIdInt))
- _, err = models.AddCygxArticle(item)
- if err != nil {
- fmt.Println("AddCygxArticle Err:", err.Error())
- return err
- }
- }
- content := html.UnescapeString(v.Body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return err
- }
- doc.Find("a").Each(func(i int, a *goquery.Selection) {
- a.Remove()
- })
- bodyText := doc.Text()
- item := new(ElasticTestArticleDetail)
- item.ArticleId = v.ArticleId
- item.Title = v.Title
- item.BodyText = bodyText
- item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- EsAddOrEditData(indexName, strconv.Itoa(v.ArticleId), item)
- }
- return
- }
- // 同步策略文章
- func SyncCygxArticleList() (err error) {
- defer func() {
- if err != nil {
- fmt.Println("同步失败,Err:", err.Error())
- }
- }()
- fmt.Println("同步数据")
- indexName := utils.IndexName
- fmt.Println("indexName:", indexName)
- time.Sleep(5 * time.Second)
- list, err := models.GetCygxArticleListAll()
- if err != nil {
- fmt.Println("GetTacticsList Err:", err.Error())
- return
- }
- fmt.Println("list len:", len(list))
- for k, v := range list {
- v.Department = "弘则权益研究"
- fmt.Println(k, v.ArticleId)
- //判断是否已经存在
- if v.ArticleId < 0 {
- fmt.Println("参数错误")
- return err
- }
- content := html.UnescapeString(v.Body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return err
- }
- doc.Find("a").Each(func(i int, a *goquery.Selection) {
- a.Remove()
- })
- bodyText := doc.Text()
- item := new(ElasticTestArticleDetail)
- item.ArticleId = v.ArticleId
- item.Title = v.Title
- item.BodyText = bodyText
- item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- EsAddOrEditData(indexName, strconv.Itoa(v.ArticleId), item)
- }
- return
- }
- // body 解析
- func BodyAnalysis(body string) (expertNumStr, expertContentStr, interviewDateStr string) {
- body = html.UnescapeString(body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return
- }
- var expertNumArr []string
- var expertContentArr []string
- var interviewDateArr []string
- doc.Find("p").Each(func(i int, s *goquery.Selection) {
- contentTxt := s.Text()
- if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
- interviewDate := s.Next().Text()
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- expertContent := s.Next().Text()
- if expertContent == "" {
- expertContent = contentTxt
- }
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- if rightIndex > 0 {
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- if len(expertContentArr) == 0 {
- doc.Find("pre").Each(func(i int, s *goquery.Selection) {
- contentTxt := s.Text()
- if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
- interviewDate := s.Next().Text()
- if interviewDate != "" {
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- }
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- expertContent := s.Next().Text()
- if expertContent == "" {
- expertContent = contentTxt
- }
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- if expertContentStr != "" {
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- }
- if len(expertNumArr) > 0 {
- expertNumStr = expertNumArr[0]
- if expertNumStr != "" {
- expertNumStr = strings.Replace(expertNumStr, "#:", "", -1)
- expertNumStr = strings.Replace(expertNumStr, "# ", "", -1)
- expertNumStr = strings.Trim(expertNumStr, "")
- }
- }
- if len(expertContentArr) > 0 {
- expertContentStr = expertContentArr[0]
- }
- if len(interviewDateArr) > 0 {
- interviewDateStr = interviewDateArr[0]
- }
- return
- }
- // 同步策略,报告文章
- func SyncTacticsListAddreport(cont context.Context) (err error) {
- defer func() {
- if err != nil {
- fmt.Println("同步失败,Err:", err.Error())
- }
- }()
- fmt.Println("同步数据")
- //indexName := utils.IndexName
- endDate := time.Now().AddDate(0, 0, -30).Format(utils.FormatDate)
- list, err := models.GetTacticsList2(endDate)
- //list, err := models.GetTacticsListAll2()
- if err != nil {
- fmt.Println("GetTacticsList Err:", err.Error())
- return
- }
- listCustomArticle, err := models.GetCustomArticleId() //手动归类的文章,不替换文章类型
- if err != nil {
- fmt.Println("GetTacticsList Err:", err.Error())
- return
- }
- listGetMatchTypeName, errMatch := models.GetMatchTypeNamenNotNull() //手动归类的文章,不替换文章类型
- if errMatch != nil {
- fmt.Println("GetTacticsList Err:", errMatch.Error())
- return
- }
- fmt.Println("list len:", len(list))
- summaryCategoryIds := "28,32,45,50,57,62,72,74,79,84,86,88,90,93,95,96" //纪要库的文章类型categoty_id
- listSummary := strings.Split(summaryCategoryIds, ",")
- noSummaryArticleIds := "3454,3456,3457,3459,2449,2450,2453,2454,2459,2530,2583,2663,2670,2699,2715,2732,2748,2759,2399,2356,2870,3173,2978,2826,3470" //非纪要库类型的文章ID
- listNoSummaryArticleIds := strings.Split(noSummaryArticleIds, ",")
- listPermission, errper := models.GetPermissionMappingCategoryID()
- if errper != nil {
- fmt.Println("GetTacticsList Err:", errper.Error())
- return
- }
- summaryMap := make(map[int]int)
- for _, vSum := range listSummary {
- vSumInt, _ := strconv.Atoi(vSum)
- summaryMap[vSumInt] = 1
- }
- for k, v := range list {
- //同步匹配类型
- matchTypeName := ""
- for _, vMatch := range listGetMatchTypeName {
- if v.CategoryId == vMatch.CategoryId {
- matchTypeName = vMatch.MatchTypeName
- }
- }
- //是否属于纪要库的数据
- if _, has := summaryMap[v.CategoryId]; has {
- v.IsSummary = 1
- }
- //for _, vSum := range listSummary {
- // vSumInt, _ := strconv.Atoi(vSum)
- // if v.CategoryId == vSumInt {
- // v.IsSummary = 1
- // }
- //}
- //排除不属于纪要库类型的文章
- for _, vArt := range listNoSummaryArticleIds {
- vArtInt, _ := strconv.Atoi(vArt)
- if v.ArticleId == vArtInt {
- v.IsSummary = 0
- }
- }
- for _, vPer := range listPermission {
- if v.CategoryId == vPer.CategoryId {
- v.IsReport = 1
- }
- }
- if v.IsReport > 0 {
- //是否属于策略 策略自动归类
- //是否属于行业报告 行业报告自动归类
- if v.CategoryId == 7 || v.CategoryId == 9 || v.CategoryId == 11 || v.CategoryId == 51 || v.CategoryId == 52 || v.CategoryId == 64 || v.CategoryId == 80 || v.CategoryId == 87 {
- v.IsClass = 1
- v.ReportType = 1 //是否属于行业报告
- } else {
- v.ReportType = 2 //是否属于产业报告
- }
- }
- v.Department = "弘则权益研究"
- //fmt.Println(k, v.ArticleId)
- hh, _ := time.ParseDuration("8h")
- //pDate := publishDate.Add(hh)
- v.PublishDate = v.PublishDate.Add(hh)
- //判断是否已经存在
- if v.ArticleId < 0 {
- fmt.Println("AddCygxArticle Err:")
- return err
- }
- count, err := models.GetArticleCountById(v.ArticleId)
- if err != nil && err.Error() != utils.ErrNoRow() {
- fmt.Println("AddCygxArticle Err:", err.Error())
- return err
- }
- v.Body = strings.Replace(v.Body, "http://vmp.hzinsights.com", "https://vmp.hzinsights.com", -1)
- expertNumStr, expertContentStr, interviewDateStr, fileLink, bodyReturn := BodyAnalysis2(v.Body)
- if strings.Index(v.Body, "报告全文(") > 0 && strings.Index(v.Body, "PDF格式报告下载.pdf") > 0 {
- v.Body = strings.Replace(v.Body, "报告全文(", "", -1)
- v.Body = strings.Replace(v.Body, "PDF格式报告下载.pdf", "", -1)
- v.Body = strings.Replace(v.Body, "):", "", -1)
- }
- //fmt.Println(fileLink)
- var titleNew string
- titleNew = v.Title
- // 7资金流向 、11大类资产 、51每日复盘 、80医药周报、9估值研究
- if v.CategoryId == 7 || v.CategoryId == 11 || v.CategoryId == 51 || v.CategoryId == 9 {
- if v.UpdateFrequency == "daily" {
- var daystr string
- daystr = strconv.Itoa(v.PublishDate.Day())
- if len(daystr) == 1 {
- daystr = "0" + daystr
- }
- titleNew = v.Title + "(" + strconv.Itoa(v.PublishDate.Year())[2:len(strconv.Itoa(v.PublishDate.Year()))-0] + v.PublishDate.Format("01") + daystr + ")"
- } else if v.UpdateFrequency == "weekly" {
- titleNew = v.Title + utils.WeekByDate(v.PublishDate)
- }
- }
- if v.CategoryId == 80 {
- titleNew = v.Title + utils.WeekByDate(v.PublishDate)
- }
- //fmt.Println(k)
- //fmt.Println(expertContentStr)
- if count > 0 {
- fmt.Println(k, v.ArticleId, "edit")
- var isCustom bool
- bodyText, _ := GetReportContentTextSub(v.Body)
- updateParams := make(map[string]interface{})
- //updateParams["Title"] = v.Title
- updateParams["Title"] = titleNew
- updateParams["TitleEn"] = v.TitleEn
- updateParams["UpdateFrequency"] = v.UpdateFrequency
- updateParams["CreateDate"] = v.CreateDate
- updateParams["PublishDate"] = v.PublishDate
- //updateParams["Body"] = html.EscapeString(v.Body)
- updateParams["Body"] = html.EscapeString(bodyReturn)
- updateParams["BodyText"] = bodyText
- updateParams["Abstract"] = html.EscapeString(v.Abstract)
- updateParams["CategoryName"] = v.CategoryName
- for _, vCustom := range listCustomArticle {
- if v.ArticleId == vCustom.ArticleId {
- fmt.Println("手动归类的文章:" + strconv.Itoa(v.ArticleId))
- isCustom = true
- }
- }
- if isCustom == false {
- updateParams["CategoryId"] = v.CategoryId
- updateParams["MatchTypeName"] = matchTypeName
- updateParams["IsSummary"] = v.IsSummary
- updateParams["IsReport"] = v.IsReport
- updateParams["ReportType"] = v.ReportType
- updateParams["SubCategoryName"] = v.SubCategoryName
- }
- //updateParams["CategoryId"] = v.CategoryId
- updateParams["PublishStatus"] = v.PublishStatus
- updateParams["ExpertBackground"] = expertContentStr
- updateParams["ExpertNumber"] = expertNumStr
- updateParams["InterviewDate"] = interviewDateStr
- //updateParams["IsClass"] = v.IsClass
- if v.Department != "弘则权益研究" {
- v.Department = "弘则权益研究"
- }
- updateParams["Department"] = v.Department
- updateParams["FileLink"] = fileLink
- whereParam := map[string]interface{}{"article_id": v.ArticleId}
- err = models.UpdateByExpr(models.CygxArticle{}, whereParam, updateParams)
- if err != nil {
- fmt.Println("UpdateByExpr Err:" + err.Error())
- }
- } else {
- fmt.Println(k, v.ArticleId, "add")
- item := new(models.CygxArticle)
- articleIdInt := v.ArticleId
- item.ArticleId = articleIdInt
- //item.Title = v.Title
- item.Title = titleNew
- item.TitleEn = v.TitleEn
- item.UpdateFrequency = v.UpdateFrequency
- item.CreateDate = v.CreateDate
- item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- //item.Body = html.EscapeString(v.Body)
- item.Body = html.EscapeString(bodyReturn)
- item.Abstract = html.EscapeString(v.Abstract)
- item.CategoryName = v.CategoryName
- item.SubCategoryName = v.SubCategoryName
- item.CategoryId = v.CategoryId
- item.PublishStatus = v.PublishStatus
- item.ExpertBackground = expertContentStr
- item.ExpertNumber = expertNumStr
- item.InterviewDate = interviewDateStr
- item.Department = v.Department
- item.ArticleIdMd5 = utils.MD5(strconv.Itoa(articleIdInt))
- item.IsClass = v.IsClass
- item.IsSummary = v.IsSummary
- item.IsReport = v.IsReport
- item.ReportType = v.ReportType
- item.FileLink = fileLink
- item.MatchTypeName = matchTypeName
- _, err = models.AddCygxArticles(item)
- if err != nil {
- fmt.Println("AddCygxArticle Err:", err.Error())
- return err
- }
- }
- ////纪要库的数据同步到Es
- //if v.IsSummary == 1 {
- // content := html.UnescapeString(v.Body)
- // doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- // if err != nil {
- // fmt.Println("AddCygxArticle Err:", err.Error())
- // return err
- // }
- // doc.Find("a").Each(func(i int, a *goquery.Selection) {
- // a.Remove()
- // })
- // bodyText := doc.Text()
- // item := new(ElasticTestArticleDetail)
- // item.ArticleId = v.ArticleId
- // item.Title = v.Title
- // item.BodyText = bodyText
- // item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- // EsAddOrEditData(indexName, strconv.Itoa(v.ArticleId), item)
- //}
- }
- return
- }
- // 同步策略到Es
- func SyncTacticsListToEs() (err error) {
- defer func() {
- if err != nil {
- fmt.Println("同步失败,Err:", err.Error())
- }
- }()
- fmt.Println("同步数据到Es")
- indexName := utils.IndexName
- endDate := time.Now().AddDate(0, 0, -30).Format(utils.FormatDate)
- list, err := models.GetTacticsList(endDate)
- //list, err := models.GetTacticsListAll()
- if err != nil {
- fmt.Println("GetTacticsList Err:", err.Error())
- return
- }
- fmt.Println("list len:", len(list))
- for k, v := range list {
- //是否属于纪要库的数据
- v.Department = "弘则权益研究"
- fmt.Println(k, v.ArticleId)
- hh, _ := time.ParseDuration("8h")
- //pDate := publishDate.Add(hh)
- v.PublishDate = v.PublishDate.Add(hh)
- //判断是否已经存在
- if v.ArticleId < 0 {
- fmt.Println("AddCygxArticle Err:")
- return err
- }
- //纪要库的数据同步到Es
- content := html.UnescapeString(v.Body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("AddCygxArticle Err:", err.Error())
- return err
- }
- doc.Find("a").Each(func(i int, a *goquery.Selection) {
- a.Remove()
- })
- bodyText := doc.Text()
- item := new(ElasticTestArticleDetail)
- item.ArticleId = v.ArticleId
- item.Title = v.Title
- item.BodyText = bodyText
- item.PublishDate = v.PublishDate.Format(utils.FormatDateTime)
- EsAddOrEditData(indexName, strconv.Itoa(v.ArticleId), item)
- }
- return
- }
- // body 解析
- func BodyAnalysis2(body string) (expertNumStr, expertContentStr, interviewDateStr, fileLink, bodyReturn string) {
- body = html.UnescapeString(body)
- //fmt.Println(body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return
- }
- var expertNumArr []string
- var expertContentArr []string
- var interviewDateArr []string
- //var fileLink string
- doc.Find("p").Each(func(i int, s *goquery.Selection) {
- contentTxt := s.Text()
- if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
- interviewDate := s.Next().Text()
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- expertContent := s.Next().Text()
- if expertContent == "" {
- expertContent = contentTxt
- }
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- if rightIndex > 0 {
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- if len(expertContentArr) == 0 {
- doc.Find("pre").Each(func(i int, s *goquery.Selection) {
- contentTxt := s.Text()
- if strings.Contains(contentTxt, "#访谈时间:") || strings.Contains(contentTxt, "访谈时间:") {
- interviewDate := s.Next().Text()
- if interviewDate != "" {
- interviewDateArr = append(interviewDateArr, interviewDate)
- }
- }
- if strings.Contains(contentTxt, "#专家评价") || strings.Contains(contentTxt, "专家评价") {
- expertContent := s.Next().Text()
- if expertContent == "" {
- expertContent = contentTxt
- }
- if expertContent != "" {
- rightIndex := strings.Index(expertContent, ")")
- if rightIndex == 0 {
- rightIndex = strings.Index(expertContent, ")")
- }
- expertNum := expertContent[:rightIndex]
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "(", "", -1)
- expertNum = strings.Replace(expertNum, "专家评价", "", -1)
- if expertNum != "" {
- expertNumArr = append(expertNumArr, expertNum)
- rightIndex = rightIndex
- expertContentStr := expertContent[rightIndex:]
- expertContentStr = strings.Replace(expertContentStr, ")", "", -1)
- expertContentStr = strings.TrimLeft(expertContentStr, ":")
- expertContentStr = strings.TrimRight(expertContentStr, "(推荐")
- if expertContentStr != "" {
- expertContentArr = append(expertContentArr, expertContentStr)
- }
- }
- }
- }
- })
- }
- if len(expertNumArr) > 0 {
- expertNumStr = expertNumArr[0]
- if expertNumStr != "" {
- expertNumStr = strings.Replace(expertNumStr, "#:", "", -1)
- expertNumStr = strings.Replace(expertNumStr, "# ", "", -1)
- expertNumStr = strings.Replace(expertNumStr, " ", "", -1)
- expertNumStr = strings.Replace(expertNumStr, "\n", "", -1)
- expertNumStr = strings.Trim(expertNumStr, "")
- }
- }
- if len(expertContentArr) > 0 {
- expertContentStr = expertContentArr[0]
- }
- //当处理过之后的专家背景长度大于600的时候,说明他的格式跟之前的不一样,还要做二次处理 600 是一个约值,先运行看看
- if len(expertContentStr) > 600 {
- strnum := strings.Index(expertContentStr, "#专家评价:")
- content := expertContentStr[strnum:len(expertContentStr)]
- strnum2 := strings.Index(content, "(")
- content = content[strnum2+9 : len(content)] //中文括号3位 专家编号6位
- expertContentStr = content
- }
- //if strings.Index(body, "报告全文(") > 0 && strings.Index(body, "PDF格式报告下载.pdf") > 0 {
- // numStar := strings.Index(body, "http")
- // numEnd := strings.Index(body, ".pdf")
- // fmt.Println("获取PDF链接")
- // fileLink = body[numStar : numEnd+4]
- //}
- var hrefRegexp = regexp.MustCompile("(?m)<a.*?[^<]>.*?</a>")
- match := hrefRegexp.FindAllString(body, -1)
- if match != nil {
- for _, v := range match {
- //if k == 0 && strings.Index(v, ".pdf") > 0 {
- // numStar := strings.Index(v, "http")
- // numEnd := strings.Index(v, ".pdf")
- // fileLink = v[numStar : numEnd+4]
- //}
- //处理a标签中的PDF
- numStarAcount := strings.Index(v, "<a")
- numEndAcount := strings.Index(v, "<img")
- if numStarAcount < numEndAcount && strings.Index(fileLink, ".pdf") > 0 {
- Acount := v[numStarAcount:numEndAcount]
- if Acount != "" {
- body = strings.Replace(body, Acount, "", -1)
- }
- }
- }
- if !strings.HasPrefix(fileLink, "https") && len(fileLink) > 0 {
- fileLink = "https" + fileLink[4:len(fileLink)]
- }
- fileLink = strings.Replace(fileLink, "https://vmp.hzinsights.com/article/pdfviewer/?file=", "", -1)
- fileLink = strings.Replace(fileLink, "http://vmp.hzinsights.com/article/pdfviewer/?file=", "", -1)
- body = strings.Replace(body, "完整报告请点击链接:", "", -1)
- body = strings.Replace(body, "PDF格式报告下载.pdf", "", -1)
- body = strings.Replace(body, "报告全文():", "", -1)
- }
- bodyReturn = body
- if len(interviewDateArr) > 0 {
- interviewDateStr = interviewDateArr[0]
- }
- return
- }
- //func init() {
- // GetSummarytoEs(7720)
- //}
- func GetSummarytoEs(articleId int) (err error) {
- defer func() {
- if err != nil {
- fmt.Println("同步ES记录失败" + err.Error())
- go utils.SendAlarmMsg("同步ES记录失败"+err.Error(), 2)
- }
- }()
- //endDate := time.Now().AddDate(0, 0, -30).Format(utils.FormatDate)
- v, err := models.GetArticleDetailTestById(articleId)
- //allList, err := models.GetArticleAllDate(endDate)
- if err != nil && err.Error() != utils.ErrNoRow() {
- fmt.Println("GetArticleAll Err:", err.Error())
- return
- }
- if v == nil {
- return
- }
- if v.IsSummary != 1 {
- return
- }
- indexName := utils.IndexName
- //for _, v := range allList {
- content := html.UnescapeString(v.Body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- fmt.Println("create doc err:", err.Error())
- return err
- }
- bodyText := doc.Text()
- item := new(ElasticTestArticleDetail)
- item.ArticleId = v.ArticleId
- item.Title = v.Title
- item.PublishDate = v.PublishDate
- bodyText, _ = GetReportContentTextSubNew(v.Body)
- item.BodyText = bodyText
- item.CategoryId = strconv.Itoa(v.CategoryId)
- item.ExpertBackground = v.ExpertBackground
- item.Annotation, _ = GetReportContentTextSubNew(v.Annotation)
- item.Abstract, _ = GetReportContentTextSubNew(v.Abstract)
- EsAddOrEditData(indexName, strconv.Itoa(v.ArticleId), item)
- fmt.Println(v.ArticleId)
- //}
- return
- }
|