llm_report.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535
  1. package services
  2. import (
  3. "eta/eta_api/models"
  4. "eta/eta_api/models/rag"
  5. "eta/eta_api/services/elastic"
  6. "eta/eta_api/services/llm"
  7. "eta/eta_api/utils"
  8. "fmt"
  9. "golang.org/x/net/html"
  10. "golang.org/x/net/html/atom"
  11. "os"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "time"
  16. )
  17. // ReportAddOrModifyKnowledge
  18. // @Description: ETA报告加入/修改到知识库
  19. // @author: Roc
  20. // @datetime 2025-04-07 14:41:45
  21. // @param reportId int
  22. // @param reportChapterId int
  23. func ReportAddOrModifyKnowledge(reportId, reportChapterId int) {
  24. if reportId <= 0 {
  25. return
  26. }
  27. var err error
  28. defer func() {
  29. if err != nil {
  30. //fmt.Println("ReportAddOrModifyKnowledge error:", err)
  31. utils.FileLog.Error("ReportAddOrModifyKnowledge error:", err)
  32. }
  33. }()
  34. var title, author, htmlContent string
  35. var publishTime time.Time
  36. if reportChapterId > 0 {
  37. chapterInfo, tmpErr := models.GetReportChapterInfoById(reportChapterId)
  38. if tmpErr != nil {
  39. return
  40. }
  41. title = chapterInfo.Title
  42. author = chapterInfo.Author
  43. publishTime = chapterInfo.PublishTime
  44. htmlContent = chapterInfo.Content
  45. } else {
  46. reportInfo, tmpErr := models.GetReportByReportId(reportId)
  47. if tmpErr != nil {
  48. return
  49. }
  50. title = reportInfo.Title
  51. author = reportInfo.Author
  52. publishTime = reportInfo.PublishTime
  53. htmlContent = reportInfo.Content
  54. }
  55. err = handleReportAddOrModifyKnowledge(reportId, reportChapterId, title, author, htmlContent, publishTime)
  56. return
  57. }
  58. // ReportAddOrModifyKnowledgeByReportId
  59. // @Description: ETA报告加入/修改到知识库(只传id的情况)
  60. // @author: Roc
  61. // @datetime 2025-04-07 15:41:15
  62. // @param reportId int
  63. func ReportAddOrModifyKnowledgeByReportId(reportId int) {
  64. if reportId <= 0 {
  65. return
  66. }
  67. errList := make([]string, 0)
  68. defer func() {
  69. if len(errList) > 0 {
  70. utils.FileLog.Error("ReportAddOrModifyKnowledge error,报告ID:%d:%s", reportId, strings.Join(errList, "\n"))
  71. }
  72. }()
  73. reportInfo, err := models.GetReportByReportId(reportId)
  74. if err != nil {
  75. errList = append(errList, err.Error())
  76. return
  77. }
  78. // 如果是单篇报告,那么直接处理
  79. if reportInfo.HasChapter == 0 {
  80. err = handleReportAddOrModifyKnowledge(reportId, 0, reportInfo.Title, reportInfo.Author, reportInfo.Content, reportInfo.PublishTime)
  81. if err != nil {
  82. errList = append(errList, err.Error())
  83. }
  84. return
  85. }
  86. // 章节类型的报告,需要查询出来后再处理
  87. chapterInfoList, err := models.GetPublishedChapterListByReportId(reportId)
  88. if err != nil {
  89. errList = append(errList, err.Error())
  90. return
  91. }
  92. for _, v := range chapterInfoList {
  93. err = handleReportAddOrModifyKnowledge(reportId, v.ReportChapterId, v.Title, reportInfo.Author, v.Content, v.PublishTime)
  94. if err != nil {
  95. errList = append(errList, fmt.Sprintf("第%d章:%s,异常:\n%s", v.ReportChapterId, v.Title, err.Error()))
  96. continue
  97. }
  98. }
  99. return
  100. }
  101. // handleReportAddOrModifyKnowledge
  102. // @Description: 处理ETA报告加入/修改到知识库
  103. // @author: Roc
  104. // @datetime 2025-04-07 15:33:38
  105. // @param reportId int
  106. // @param reportChapterId int
  107. // @param title string
  108. // @param author string
  109. // @param htmlContent string
  110. // @param publishTime time.Time
  111. // @return err error
  112. func handleReportAddOrModifyKnowledge(reportId, reportChapterId int, title, author, htmlContent string, publishTime time.Time) (err error) {
  113. htmlContent = html.UnescapeString(htmlContent)
  114. doc, err := html.Parse(strings.NewReader(htmlContent))
  115. if err != nil {
  116. return
  117. }
  118. // 只获取文本内容
  119. content := &strings.Builder{}
  120. getArticleContent(content, doc)
  121. textContent := content.String()
  122. textContent = regexp.MustCompile(`\n+`).ReplaceAllString(textContent, "\n")
  123. textContent = strings.Trim(textContent, "\n")
  124. publishTimeStr := `未知`
  125. if !publishTime.IsZero() {
  126. title = fmt.Sprintf("%s(%s)", title, publishTime.Format(utils.FormatMonthDayUnSpace))
  127. publishTimeStr = publishTime.Format(utils.FormatDateTime)
  128. }
  129. textContent = fmt.Sprintf("标题:%s\n发布时间:%s\n%s", title, publishTimeStr, textContent)
  130. obj := rag.RagEtaReport{}
  131. item, err := obj.GetByReportAndChapterId(reportId, reportChapterId)
  132. if err != nil && !utils.IsErrNoRow(err) {
  133. // 查询异常,且不是没找到数据的报错
  134. return
  135. }
  136. if err == nil {
  137. // 标记删除了的话,那就不处理了
  138. if item.IsDeleted == 1 {
  139. return
  140. }
  141. item.Title = title
  142. item.Author = author
  143. item.TextContent = textContent
  144. item.IsPublished = 1
  145. //item.PublishTime = publishTime
  146. item.ModifyTime = time.Now()
  147. //err = item.Update([]string{"title", "author", "text_content", "is_published", "publish_time", "modify_time"})
  148. err = item.Update([]string{"title", "author", "text_content", "is_published", "modify_time"})
  149. } else {
  150. // 无数据的时候,需要新增
  151. err = nil
  152. item = &rag.RagEtaReport{
  153. RagEtaReportId: 0,
  154. ReportId: reportId,
  155. ReportChapterId: reportChapterId,
  156. Title: title,
  157. Author: author,
  158. TextContent: textContent,
  159. VectorKey: "",
  160. IsPublished: 1,
  161. IsDeleted: 0,
  162. PublishTime: publishTime,
  163. ModifyTime: time.Now(),
  164. CreateTime: time.Now(),
  165. }
  166. err = item.Create()
  167. }
  168. return
  169. }
  170. // ReportUnPublishedKnowledge
  171. // @Description: 知识库取消发布
  172. // @author: Roc
  173. // @datetime 2025-04-07 14:58:25
  174. // @param reportId int
  175. // @param reportChapterId int
  176. func ReportUnPublishedKnowledge(reportId, reportChapterId int) {
  177. if reportId <= 0 && reportChapterId <= 0 {
  178. return
  179. }
  180. var err error
  181. defer func() {
  182. if err != nil {
  183. //fmt.Println("ReportAddOrModifyKnowledge error:", err)
  184. utils.FileLog.Error("ReportAddOrModifyKnowledge error:", err)
  185. }
  186. }()
  187. obj := rag.RagEtaReport{}
  188. item, err := obj.GetByReportAndChapterId(reportId, reportChapterId)
  189. if err != nil && !utils.IsErrNoRow(err) {
  190. // 查询异常,且不是没找到数据的报错
  191. return
  192. }
  193. if item.RagEtaReportId > 0 {
  194. item.IsPublished = 0
  195. item.ModifyTime = time.Now()
  196. err = item.Update([]string{"is_published", "modify_time"})
  197. }
  198. return
  199. }
  200. // ReportUnPublishedKnowledgeByReportId
  201. // @Description: ETA报告取消发布同步到知识库(只传报告id的情况)
  202. // @author: Roc
  203. // @datetime 2025-04-07 15:41:15
  204. // @param reportId int
  205. func ReportUnPublishedKnowledgeByReportId(reportId int) {
  206. errList := make([]string, 0)
  207. defer func() {
  208. if len(errList) > 0 {
  209. utils.FileLog.Error("ReportUnPublishedKnowledgeByReportId error,报告ID:%d:%s", reportId, strings.Join(errList, "\n"))
  210. }
  211. }()
  212. obj := rag.RagEtaReport{}
  213. list, err := obj.GetListByCondition(``, ` AND report_id = ? `, []interface{}{reportId}, 0, 1000)
  214. if err != nil && !utils.IsErrNoRow(err) {
  215. // 查询异常,且不是没找到数据的报错
  216. return
  217. }
  218. for _, item := range list {
  219. item.IsPublished = 0
  220. item.ModifyTime = time.Now()
  221. err = item.Update([]string{"is_published", "modify_time"})
  222. if err != nil {
  223. errList = append(errList, fmt.Sprintf("第%d章:%s,异常:\n%s", item.ReportChapterId, item.Title, err.Error()))
  224. continue
  225. }
  226. }
  227. return
  228. }
  229. func getArticleContent(content *strings.Builder, htmlContentNode *html.Node) {
  230. if htmlContentNode.Type == html.TextNode {
  231. cleanData := strings.TrimSpace(htmlContentNode.Data)
  232. if cleanData != `` && cleanData != "</p>" {
  233. content.WriteString(cleanData)
  234. }
  235. } else if htmlContentNode.Type == html.ElementNode {
  236. switch htmlContentNode.DataAtom {
  237. case atom.Ul:
  238. content.WriteString("\n")
  239. case atom.Br:
  240. // 遇到 <br> 标签时添加换行符
  241. content.WriteString("\n")
  242. case atom.P:
  243. content.WriteString("\n")
  244. }
  245. }
  246. for c := htmlContentNode.FirstChild; c != nil; c = c.NextSibling {
  247. getArticleContent(content, c)
  248. }
  249. }
  250. // GenerateArticleAbstract
  251. // @Description: 文章摘要生成(默认提示词批量生成)
  252. // @author: Roc
  253. // @datetime 2025-03-10 16:17:53
  254. // @param item *rag.RagEtaReport
  255. func GenerateArticleAbstract(item *rag.RagEtaReport, forceGenerate bool) {
  256. var err error
  257. defer func() {
  258. if err != nil {
  259. utils.FileLog.Error("文章转临时文件失败,err:%v", err)
  260. fmt.Println("文章转临时文件失败,err:", err)
  261. }
  262. }()
  263. // 内容为空,那就不需要生成摘要
  264. if item.TextContent == `` {
  265. return
  266. }
  267. questionObj := rag.Question{}
  268. questionList, err := questionObj.GetListByCondition(``, ` AND is_default = 1 `, []interface{}{}, 0, 100)
  269. if err != nil {
  270. err = fmt.Errorf("获取问题列表失败,Err:" + err.Error())
  271. return
  272. }
  273. // 没问题就不生成了
  274. if len(questionList) <= 0 {
  275. return
  276. }
  277. for _, question := range questionList {
  278. GenerateArticleAbstractByQuestion(item, question, forceGenerate)
  279. }
  280. return
  281. }
  282. // GenerateArticleAbstractByQuestion
  283. // @Description: 文章摘要生成(根据提示词生成)
  284. // @author: Roc
  285. // @datetime 2025-03-10 16:17:53
  286. // @param item *rag.RagEtaReport
  287. func GenerateArticleAbstractByQuestion(item *rag.RagEtaReport, question *rag.Question, forceGenerate bool) {
  288. var err error
  289. defer func() {
  290. if err != nil {
  291. utils.FileLog.Error("文章转临时文件失败,err:%v", err)
  292. fmt.Println("文章转临时文件失败,err:", err)
  293. }
  294. }()
  295. // 内容为空,那就不需要生成摘要
  296. if item.TextContent == `` {
  297. return
  298. }
  299. abstractObj := rag.RagEtaReportAbstract{}
  300. abstractItem, err := abstractObj.GetByRagEtaReportIdAndQuestionId(item.RagEtaReportId, question.QuestionId)
  301. // 如果找到了,同时不是强制生成,那么就直接处理到知识库中
  302. if err == nil && !forceGenerate {
  303. // 摘要已经生成,不需要重复生成,只需要重新加入到向量库中
  304. ReportAbstractToKnowledge(item, abstractItem, false)
  305. return
  306. }
  307. if !utils.IsErrNoRow(err) {
  308. return
  309. }
  310. //你现在是一名资深的期货行业分析师,请基于以下的问题进行汇总总结,如果不能正常总结出来,那么就只需要回复我:sorry
  311. questionStr := fmt.Sprintf(`%s\n%s`, `你现在是一名资深的期货行业分析师,请基于以下的问题进行汇总总结,如果不能正常总结出来,那么就只需要回复我:sorry。以下是问题:`, question.QuestionContent)
  312. //开始对话
  313. abstract, _, tmpErr := getAnswerByContent(item.RagEtaReportId, utils.AI_ARTICLE_SOURCE_ETA_REPORT, questionStr)
  314. if tmpErr != nil {
  315. err = fmt.Errorf("LLM对话失败,Err:" + tmpErr.Error())
  316. return
  317. }
  318. // 添加问答记录
  319. //if len(addArticleChatRecordList) > 0 {
  320. // recordObj := rag.RagEtaReportChatRecord{}
  321. // err = recordObj.CreateInBatches(addArticleChatRecordList)
  322. // if err != nil {
  323. // return
  324. // }
  325. //}
  326. if abstract == `` {
  327. return
  328. }
  329. //if abstract == `sorry` || strings.Index(abstract, `根据已知信息无法回答该问题`) == 0 {
  330. // item.AbstractStatus = 2
  331. // item.ModifyTime = time.Now()
  332. // err = item.Update([]string{"AbstractStatus", "ModifyTime"})
  333. // return
  334. //}
  335. //item.AbstractStatus = 1
  336. //item.ModifyTime = time.Now()
  337. //err = item.Update([]string{"AbstractStatus", "ModifyTime"})
  338. if abstractItem == nil || abstractItem.RagEtaReportAbstractId <= 0 {
  339. abstractItem = &rag.RagEtaReportAbstract{
  340. RagEtaReportAbstractId: 0,
  341. RagEtaReportId: item.RagEtaReportId,
  342. Content: item.TextContent,
  343. QuestionId: question.QuestionId,
  344. QuestionContent: question.QuestionContent,
  345. Version: 1,
  346. Tags: "",
  347. VectorKey: "",
  348. ModifyTime: time.Now(),
  349. CreateTime: time.Now(),
  350. }
  351. err = abstractItem.Create()
  352. } else {
  353. abstractItem.Content = abstract
  354. abstractItem.Version++
  355. abstractItem.ModifyTime = time.Now()
  356. abstractItem.Tags = ""
  357. abstractItem.QuestionContent = question.QuestionContent
  358. err = abstractItem.Update([]string{"content", "version", "modify_time", "tags", "question_content"})
  359. }
  360. if err != nil {
  361. return
  362. }
  363. // 数据入ES库
  364. go AddOrEditEsRagEtaReportAbstract(abstractItem.RagEtaReportAbstractId)
  365. ReportAbstractToKnowledge(item, abstractItem, false)
  366. }
  367. // AddOrEditEsWechatArticleAbstract
  368. // @Description: 新增/编辑微信文章摘要入ES
  369. // @author: Roc
  370. // @datetime 2025-03-13 14:13:47
  371. // @param articleAbstractId int
  372. func AddOrEditEsRagEtaReportAbstract(ragEtaReportAbstractId int) {
  373. if utils.EsRagEtaReportAbstractName == `` {
  374. return
  375. }
  376. var err error
  377. defer func() {
  378. if err != nil {
  379. utils.FileLog.Error("添加ETA报告微信信息到ES失败,err:%v", err)
  380. fmt.Println("添加ETA报告微信信息到ES失败,err:", err)
  381. }
  382. }()
  383. obj := rag.RagEtaReportAbstract{}
  384. abstractInfo, err := obj.GetById(ragEtaReportAbstractId)
  385. if err != nil {
  386. err = fmt.Errorf("获取ETA报告文章信息失败,Err:" + err.Error())
  387. return
  388. }
  389. ragEtaReportObj := rag.RagEtaReport{}
  390. articleInfo, err := ragEtaReportObj.GetById(abstractInfo.RagEtaReportAbstractId)
  391. if err != nil {
  392. err = fmt.Errorf("获取ETA报告文章信息失败,Err:" + err.Error())
  393. return
  394. }
  395. tagIdList := make([]int, 0)
  396. if abstractInfo.Tags != `` {
  397. tagIdStrList := strings.Split(abstractInfo.Tags, ",")
  398. for _, tagIdStr := range tagIdStrList {
  399. tagId, tmpErr := strconv.Atoi(tagIdStr)
  400. if tmpErr != nil {
  401. err = fmt.Errorf("报告标签ID转int失败,Err:" + tmpErr.Error())
  402. return
  403. }
  404. tagIdList = append(tagIdList, tagId)
  405. }
  406. }
  407. esItem := elastic.RagEtaReportAbstractItem{
  408. RagEtaReportAbstractId: abstractInfo.RagEtaReportAbstractId,
  409. RagEtaReportId: abstractInfo.RagEtaReportId,
  410. Abstract: abstractInfo.Content,
  411. QuestionId: abstractInfo.QuestionId,
  412. Version: abstractInfo.Version,
  413. VectorKey: abstractInfo.VectorKey,
  414. ModifyTime: abstractInfo.ModifyTime,
  415. CreateTime: abstractInfo.CreateTime,
  416. Title: articleInfo.Title,
  417. TagIdList: tagIdList,
  418. }
  419. err = elastic.RagEtaReportAbstractEsAddOrEdit(strconv.Itoa(abstractInfo.RagEtaReportAbstractId), esItem)
  420. }
  421. // WechatArticleAbstractToKnowledge
  422. // @Description: 摘要入向量库
  423. // @author: Roc
  424. // @datetime 2025-03-10 16:14:59
  425. // @param wechatArticleItem *rag.RagEtaReport
  426. // @param abstractItem *rag.RagEtaReportAbstract
  427. func ReportAbstractToKnowledge(ragEtaReport *rag.RagEtaReport, abstractItem *rag.RagEtaReportAbstract, isReUpload bool) {
  428. if abstractItem.Content == `` {
  429. return
  430. }
  431. // 已经生成了,那就不处理了
  432. if abstractItem.VectorKey != `` && !isReUpload {
  433. return
  434. }
  435. var err error
  436. defer func() {
  437. if err != nil {
  438. utils.FileLog.Error("摘要入向量库失败,err:%v", err)
  439. fmt.Println("摘要入向量库失败,err:", err)
  440. }
  441. // 数据入ES库
  442. go AddOrEditEsRagEtaReportAbstract(abstractItem.RagEtaReportAbstractId)
  443. }()
  444. // 生成临时文件
  445. //dateDir := time.Now().Format("20060102")
  446. //uploadDir := + "./static/ai/article/" + dateDir
  447. uploadDir := "./static/ai/abstract"
  448. err = os.MkdirAll(uploadDir, utils.DIR_MOD)
  449. if err != nil {
  450. err = fmt.Errorf("存储目录创建失败,Err:" + err.Error())
  451. return
  452. }
  453. fileName := utils.MD5(fmt.Sprintf("%d_%d", utils.AI_ARTICLE_SOURCE_ETA_REPORT, ragEtaReport.RagEtaReportId)) + `.md`
  454. tmpFilePath := uploadDir + "/" + fileName
  455. err = utils.SaveToFile(abstractItem.Content, tmpFilePath)
  456. if err != nil {
  457. err = fmt.Errorf("生成临时文件失败,Err:" + err.Error())
  458. return
  459. }
  460. defer func() {
  461. os.Remove(tmpFilePath)
  462. }()
  463. knowledgeArticleName := models.BusinessConfMap[models.KnowledgeBaseName]
  464. // 上传临时文件到LLM
  465. uploadFileResp, err := llm.UploadDocsToKnowledge(tmpFilePath, knowledgeArticleName)
  466. if err != nil {
  467. err = fmt.Errorf("上传文章原文到知识库失败,Err:" + err.Error())
  468. return
  469. }
  470. if len(uploadFileResp.FailedFiles) > 0 {
  471. for _, v := range uploadFileResp.FailedFiles {
  472. err = fmt.Errorf("上传文章原文到知识库失败,Err:" + v)
  473. }
  474. }
  475. abstractItem.VectorKey = tmpFilePath
  476. abstractItem.ModifyTime = time.Now()
  477. err = abstractItem.Update([]string{"vector_key", "modify_time"})
  478. }