semantic_analysis.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. package services
  2. import (
  3. saModel "eta/eta_api/models/semantic_analysis"
  4. "eta/eta_api/services/alarm_msg"
  5. "eta/eta_api/utils"
  6. "fmt"
  7. "github.com/PuerkitoBio/goquery"
  8. "html"
  9. "sort"
  10. "strings"
  11. )
  12. // LoadSaDocContent2Section 读取文档内容为段落, 以<p>标签划分
  13. func LoadSaDocContent2Section(content string) (sections []string, err error) {
  14. if content == `` {
  15. return
  16. }
  17. defer func() {
  18. if err != nil {
  19. fmt.Println(err.Error())
  20. }
  21. }()
  22. doc, e := goquery.NewDocumentFromReader(strings.NewReader(content))
  23. if e != nil {
  24. err = fmt.Errorf("读取html内容失败, Err: %s", e.Error())
  25. return
  26. }
  27. doc.Find("p").Each(func(i int, s *goquery.Selection) {
  28. h, e := s.Html()
  29. if e != nil {
  30. err = fmt.Errorf("读取html标签失败, Err: %s", e.Error())
  31. return
  32. }
  33. t := strings.TrimSpace(s.Text())
  34. if t != "" {
  35. sections = append(sections, fmt.Sprintf(`<p>%s</p>`, h))
  36. }
  37. })
  38. return
  39. }
  40. // GetSaCompareTableData 获取文档比对表格数据
  41. func GetSaCompareTableData(labelIds, docIds, secIds []int, contentMap map[string]int) (resp *saModel.SaCompareSaveResp, err error) {
  42. resp = new(saModel.SaCompareSaveResp)
  43. // 文档列表
  44. docMap := make(map[int]*saModel.SaDoc)
  45. if len(docIds) > 0 {
  46. docOB := new(saModel.SaDoc)
  47. docCond := fmt.Sprintf(` AND %s IN (%s)`, saModel.SaDocColumns.SaDocId, utils.GetOrmInReplace(len(docIds)))
  48. docPars := make([]interface{}, 0)
  49. docPars = append(docPars, docIds)
  50. docFields := []string{saModel.SaDocColumns.SaDocId, saModel.SaDocColumns.Title, saModel.SaDocColumns.Theme}
  51. docItems, e := docOB.GetItemsByCondition(docCond, docPars, docFields, "")
  52. if e != nil {
  53. err = fmt.Errorf("获取比对文档列表失败, Err: %s", e.Error())
  54. return
  55. }
  56. for _, d := range docItems {
  57. docMap[d.SaDocId] = d
  58. }
  59. }
  60. // 段落map
  61. secMap := make(map[int]string)
  62. if len(secIds) > 0 {
  63. secOB := new(saModel.SaDocSection)
  64. secCond := fmt.Sprintf(` AND %s IN (%s)`, saModel.SaDocSectionColumns.SaDocSectionId, utils.GetOrmInReplace(len(secIds)))
  65. secPars := make([]interface{}, 0)
  66. secPars = append(secPars, secIds)
  67. secFields := []string{saModel.SaDocSectionColumns.SaDocSectionId, saModel.SaDocSectionColumns.DocId, saModel.SaDocSectionColumns.Content}
  68. secItems, e := secOB.GetItemsByCondition(secCond, secPars, secFields, "")
  69. if e != nil {
  70. err = fmt.Errorf("获取比对文档列表失败, Err: %s", e.Error())
  71. return
  72. }
  73. for _, s := range secItems {
  74. secMap[s.SaDocSectionId] = html.UnescapeString(s.Content)
  75. }
  76. }
  77. // 标签map
  78. labelMap := make(map[int]string)
  79. labelOB := new(saModel.SaLabel)
  80. labelCond := ``
  81. labelPars := make([]interface{}, 0)
  82. labelFields := []string{saModel.SaLabelColumns.SaLabelId, saModel.SaLabelColumns.LabelName}
  83. labelItems, e := labelOB.GetItemsByCondition(labelCond, labelPars, labelFields, "")
  84. if e != nil {
  85. err = fmt.Errorf("获取标签列表失败, Err: %s", e.Error())
  86. return
  87. }
  88. for _, l := range labelItems {
  89. labelMap[l.SaLabelId] = l.LabelName
  90. }
  91. // 表头信息, 按照docIds的顺序不然会乱
  92. for _, d := range docIds {
  93. dv := docMap[d]
  94. if dv != nil {
  95. resp.TitleList = append(resp.TitleList, dv.Title)
  96. //resp.ThemeList = append(resp.ThemeList, dv.Theme)
  97. }
  98. }
  99. // 标签列表
  100. respLabel := make([]*saModel.SaCompareSaveRespLabel, 0)
  101. for _, l := range labelIds {
  102. v := new(saModel.SaCompareSaveRespLabel)
  103. v.SaLabelId = l
  104. v.LabelName = labelMap[l]
  105. // 文档列表
  106. docList := make([]*saModel.SaCompareSaveRespDoc, 0)
  107. for _, d := range docIds {
  108. vd := new(saModel.SaCompareSaveRespDoc)
  109. vd.SaDocId = d
  110. dv := docMap[d]
  111. if dv != nil {
  112. vd.Title = dv.Title
  113. }
  114. secList := make([]*saModel.SaCompareSaveRespSection, 0)
  115. // 段落列表
  116. for _, s := range secIds {
  117. k := fmt.Sprintf("%d-%d-%d", l, d, s)
  118. if contentMap[k] == 0 {
  119. continue
  120. }
  121. vs := new(saModel.SaCompareSaveRespSection)
  122. vs.SaDocSectionId = s
  123. vs.Content = secMap[s]
  124. secList = append(secList, vs)
  125. }
  126. vd.SectionList = secList
  127. docList = append(docList, vd)
  128. }
  129. v.DocList = docList
  130. respLabel = append(respLabel, v)
  131. }
  132. resp.LabelList = respLabel
  133. return
  134. }
  135. // GetSaCompareDetailByDocIds 根据文档IDs获取比对详情信息
  136. // compareId大于0: docIds为比对关联的文档IDs, 段落标签展示当前标签、历史标签、Ta的标签
  137. // compareId等于0: docIds为选择的文档IDs, 段落标签展示历史标签、Ta的标签
  138. func GetSaCompareDetailByDocIds(docIds []int, compareId, sysAdminId int) (detail *saModel.SaCompareDetail, err error) {
  139. if len(docIds) == 0 {
  140. return
  141. }
  142. // 获取文档信息
  143. docOB := new(saModel.SaDoc)
  144. docCond := fmt.Sprintf(` AND %s IN (%s)`, saModel.SaDocColumns.SaDocId, utils.GetOrmInReplace(len(docIds)))
  145. docPars := make([]interface{}, 0)
  146. docPars = append(docPars, docIds)
  147. docs, e := docOB.GetItemsByCondition(docCond, docPars, []string{}, "")
  148. if e != nil {
  149. err = fmt.Errorf("获取文档信息失败, Err: %s", e.Error())
  150. return
  151. }
  152. // 获取文档段落信息
  153. secOB := new(saModel.SaDocSection)
  154. secCond := fmt.Sprintf(` AND %s IN (%s)`, saModel.SaDocSectionColumns.DocId, utils.GetOrmInReplace(len(docIds)))
  155. secPars := make([]interface{}, 0)
  156. secPars = append(secPars, docIds)
  157. secsItems, e := secOB.GetItemsByCondition(secCond, secPars, []string{}, "doc_id ASC, sort ASC")
  158. secsMap := make(map[int][]*saModel.SaDocSection)
  159. for _, s := range secsItems {
  160. if secsMap[s.DocId] == nil {
  161. secsMap[s.DocId] = make([]*saModel.SaDocSection, 0)
  162. }
  163. secsMap[s.DocId] = append(secsMap[s.DocId], s)
  164. }
  165. // 获取文档打的标签
  166. compLabs := make([]*saModel.SaCompareLabel, 0)
  167. compLabOB := new(saModel.SaCompareLabel)
  168. compLabCond := fmt.Sprintf(` AND %s IN (%s)`, saModel.SaCompareLabelColumns.DocId, utils.GetOrmInReplace(len(docIds)))
  169. compLabPars := make([]interface{}, 0)
  170. compLabPars = append(compLabPars, docIds)
  171. compLabQuery, e := compLabOB.GetItemsByCondition(compLabCond, compLabPars, []string{}, "")
  172. if e != nil {
  173. err = fmt.Errorf("获取比对标签失败, Err: %s", e.Error())
  174. return
  175. }
  176. compLabs = compLabQuery
  177. // 历史搜索关键词
  178. keywordsOB := new(saModel.SaCompareSearchKeyword)
  179. keywordsCond := fmt.Sprintf(` AND %s = ?`, saModel.SaCompareSearchKeywordColumns.CompareId)
  180. keywordsPars := make([]interface{}, 0)
  181. keywordsPars = append(keywordsPars, compareId)
  182. keywordsItems, e := keywordsOB.GetItemsByCondition(keywordsCond, keywordsPars, []string{}, "")
  183. if e != nil {
  184. err = fmt.Errorf("获取历史搜索关键词失败, Err: %s", e.Error())
  185. return
  186. }
  187. keywords := make([]string, 0)
  188. for _, k := range keywordsItems {
  189. keywords = append(keywords, k.Keyword)
  190. }
  191. // 段落标签Map
  192. secLabelMap, isMineMap := formatCompareLabelStatusGroupSection(compLabs, compareId, sysAdminId)
  193. partSecMap := make(map[int][]*saModel.SaCompareLabel)
  194. partSecExistMap := make(map[string]bool)
  195. // 头部标签列表-包含多个文档中引用的所有标签并进行去重
  196. tabLabelIds := make([]int, 0)
  197. tabLabels := make([]*saModel.SaCompareDetailHeadLabel, 0)
  198. for _, l := range compLabs {
  199. if !utils.InArrayByInt(tabLabelIds, l.LabelId) {
  200. t := new(saModel.SaCompareDetailHeadLabel)
  201. t.LabelId = l.LabelId
  202. t.LabelName = l.LabelName
  203. t.IsMine = isMineMap[l.LabelId]
  204. tabLabelIds = append(tabLabelIds, l.LabelId)
  205. tabLabels = append(tabLabels, t)
  206. }
  207. // 文档片段Map
  208. if l.IsPart != 1 {
  209. continue
  210. }
  211. if partSecMap[l.DocId] == nil {
  212. partSecMap[l.DocId] = make([]*saModel.SaCompareLabel, 0)
  213. }
  214. ek := fmt.Sprintf("%d-%s", l.SectionId, utils.MD5(l.Content))
  215. if partSecExistMap[ek] {
  216. continue
  217. }
  218. partSecMap[l.DocId] = append(partSecMap[l.DocId], l)
  219. }
  220. // 详情
  221. detail = new(saModel.SaCompareDetail)
  222. docList := make([]*saModel.SaCompareDetailDoc, 0)
  223. for _, d := range docs {
  224. dv := new(saModel.SaCompareDetailDoc)
  225. dv.DocId = d.SaDocId
  226. dv.Title = d.Title
  227. dv.Theme = d.Theme
  228. dv.ClassifyName = d.ClassifyName
  229. // 整段
  230. secList := make([]*saModel.SaCompareDetailSection, 0)
  231. secs := secsMap[d.SaDocId]
  232. if secs != nil {
  233. for _, s := range secs {
  234. sv := new(saModel.SaCompareDetailSection)
  235. sv.SectionId = s.SaDocSectionId
  236. sv.Content = html.UnescapeString(s.Content)
  237. sv.Sort = s.Sort
  238. sv.LabelList = secLabelMap[fmt.Sprintf("%d-%s", s.SaDocSectionId, utils.MD5(``))]
  239. secList = append(secList, sv)
  240. }
  241. }
  242. // 片段
  243. parts := partSecMap[d.SaDocId]
  244. if parts != nil {
  245. for _, p := range parts {
  246. pv := new(saModel.SaCompareDetailSection)
  247. pv.SectionId = p.SectionId
  248. pv.Content = html.UnescapeString(p.Content)
  249. pv.IsPart = 1
  250. pv.StartIndex = p.StartIndex
  251. pv.EndIndex = p.EndIndex
  252. pv.LabelList = secLabelMap[fmt.Sprintf("%d-%s", p.SectionId, utils.MD5(p.Content))]
  253. fmt.Println("kkk", fmt.Sprintf("%d-%s", p.SectionId, utils.MD5(p.Content)))
  254. secList = append(secList, pv)
  255. }
  256. }
  257. dv.SectionList = secList
  258. docList = append(docList, dv)
  259. }
  260. detail.HeadLabel = tabLabels
  261. detail.DocList = docList
  262. detail.KeywordsList = keywords
  263. return
  264. }
  265. // formatCompareLabelStatusGroupSection 根据段落格式化段落标签的状态
  266. func formatCompareLabelStatusGroupSection(compLabels []*saModel.SaCompareLabel, compareId, sysAdminId int) (labelMap map[string][]*saModel.SaCompareDetailFormatLabel, isMineMap map[int]int) {
  267. labelMap = make(map[string][]*saModel.SaCompareDetailFormatLabel)
  268. repeatMap := make(map[string][]int)
  269. thisMap := make(map[string]int)
  270. historyMap := make(map[string]int)
  271. otherMap := make(map[string]int)
  272. isMineMap = make(map[int]int) // 用于判断标签是否自己曾经使用或者当前使用过
  273. for _, l := range compLabels {
  274. // 判断段落标签的三种状态
  275. m := utils.MD5(l.Content)
  276. k := fmt.Sprintf("%d-%s-%d", l.SectionId, m, l.LabelId)
  277. if l.CompareId == compareId {
  278. thisMap[k] = 1
  279. if l.SysAdminId == sysAdminId {
  280. isMineMap[l.LabelId] = 1
  281. }
  282. }
  283. if l.CompareId != compareId && l.SysAdminId == sysAdminId {
  284. historyMap[k] = 1
  285. isMineMap[l.LabelId] = 1
  286. }
  287. if l.SysAdminId != sysAdminId {
  288. otherMap[k] = 1
  289. }
  290. k2 := fmt.Sprintf("%d-%s", l.SectionId, m)
  291. // 判断每段落内的标签是否重复添加
  292. if repeatMap[k2] == nil {
  293. repeatMap[k2] = make([]int, 0)
  294. }
  295. if utils.InArrayByInt(repeatMap[k2], l.LabelId) {
  296. continue
  297. }
  298. repeatMap[k2] = append(repeatMap[k2], l.LabelId)
  299. // 初始化段落标签
  300. if labelMap[k2] == nil {
  301. labelMap[k2] = make([]*saModel.SaCompareDetailFormatLabel, 0)
  302. }
  303. labelMap[k2] = append(labelMap[k2], &saModel.SaCompareDetailFormatLabel{
  304. LabelId: l.LabelId,
  305. LabelName: l.LabelName,
  306. })
  307. }
  308. for s, l := range labelMap {
  309. for _, v := range l {
  310. k := fmt.Sprintf("%s-%d", s, v.LabelId)
  311. v.IsThis = thisMap[k]
  312. v.IsHistory = historyMap[k]
  313. v.IsOther = otherMap[k]
  314. }
  315. }
  316. return
  317. }
  318. // HandleElasticSaDocAndSection Elastic-新增/编辑文档和段落
  319. func HandleElasticSaDocAndSection(saDoc *saModel.SaDoc, sections []*saModel.SaDocSection, delIds []int) (err error) {
  320. defer func() {
  321. if err != nil {
  322. alarm_msg.SendAlarmMsg(fmt.Sprintf("Elastic-语义分析文档, Err: %s", err.Error()), 2)
  323. }
  324. }()
  325. indexName := utils.EsSemanticAnalysisDocIndexName
  326. content := ``
  327. // 段落
  328. items := make([]*saModel.ElasticSaDoc, 0)
  329. for _, s := range sections {
  330. h := html.UnescapeString(s.Content)
  331. content += h
  332. items = append(items, &saModel.ElasticSaDoc{
  333. SaDocId: s.DocId,
  334. SaDocSectionId: s.SaDocSectionId,
  335. ClassifyId: saDoc.ClassifyId,
  336. ClassifyName: saDoc.ClassifyName,
  337. Title: saDoc.Title,
  338. Theme: saDoc.Theme,
  339. BodyContent: h,
  340. Author: saDoc.SysAdminName,
  341. CoverImg: saDoc.CoverImg,
  342. CreateTime: saDoc.CreateTime.Format(utils.FormatDateTime),
  343. })
  344. }
  345. // 文档
  346. docId := fmt.Sprintf("%d-0", saDoc.SaDocId)
  347. item := &saModel.ElasticSaDoc{
  348. SaDocId: saDoc.SaDocId,
  349. SaDocSectionId: 0,
  350. ClassifyId: saDoc.ClassifyId,
  351. ClassifyName: saDoc.ClassifyName,
  352. Title: saDoc.Title,
  353. Theme: saDoc.Theme,
  354. BodyContent: content,
  355. Author: saDoc.SysAdminName,
  356. CoverImg: saDoc.CoverImg,
  357. CreateTime: saDoc.CreateTime.Format(utils.FormatDateTime),
  358. }
  359. // 新增/更新
  360. if e := EsAddOrEditSaDoc(indexName, docId, item); e != nil {
  361. err = fmt.Errorf("新增/更新ES语义分析文档失败, Err: %s", e.Error())
  362. return
  363. }
  364. for _, v := range items {
  365. docId = fmt.Sprintf("%d-%d", v.SaDocId, v.SaDocSectionId)
  366. if e := EsAddOrEditSaDoc(indexName, docId, v); e != nil {
  367. err = fmt.Errorf("新增/更新ES语义分析文档段落失败, Err: %s", e.Error())
  368. return
  369. }
  370. }
  371. // 删除段落
  372. if len(delIds) > 0 {
  373. for _, d := range delIds {
  374. docId = fmt.Sprintf("%d-%d", saDoc.SaDocId, d)
  375. if e := EsDeleteData(indexName, docId); e != nil && !strings.Contains(e.Error(), "404") {
  376. err = fmt.Errorf("删除ES语义分析文档段落失败, Err: %s", e.Error())
  377. return
  378. }
  379. }
  380. }
  381. return
  382. }
  383. // DeleteElasticSaDocAndSection Elastic-删除文档和段落
  384. func DeleteElasticSaDocAndSection(saDocId int, secIds []int) (err error) {
  385. defer func() {
  386. if err != nil {
  387. alarm_msg.SendAlarmMsg(fmt.Sprintf("Elastic-语义分析文档, Err: %s", err.Error()), 2)
  388. }
  389. }()
  390. indexName := utils.EsSemanticAnalysisDocIndexName
  391. docId := fmt.Sprintf("%d-0", saDocId)
  392. if e := EsDeleteData(indexName, docId); e != nil && !strings.Contains(e.Error(), "404") {
  393. err = fmt.Errorf("删除ES语义分析文档失败, Err: %s", e.Error())
  394. return
  395. }
  396. if len(secIds) > 0 {
  397. for _, d := range secIds {
  398. docId = fmt.Sprintf("%d-%d", saDocId, d)
  399. if e := EsDeleteData(indexName, docId); e != nil && !strings.Contains(e.Error(), "404") {
  400. err = fmt.Errorf("删除ES语义分析文档段落失败, Err: %s", e.Error())
  401. return
  402. }
  403. }
  404. }
  405. return
  406. }
  407. // FormatCompareLabels2TableData 格式化比对标签为表格数据
  408. func FormatCompareLabels2TableData(compareLabels []*saModel.SaCompareLabelItem) (resp *saModel.SaCompareSaveResp, err error) {
  409. resp = new(saModel.SaCompareSaveResp)
  410. resp.LabelList = make([]*saModel.SaCompareSaveRespLabel, 0)
  411. // 取出文档作为X轴, 标签作为Y轴
  412. labelMap := make(map[int]*saModel.SaCompareLabelItem)
  413. docMap := make(map[int]*saModel.SaCompareLabelItem)
  414. secMap := make(map[string][]*saModel.SaCompareLabelItem)
  415. for _, v := range compareLabels {
  416. // 标签-Y轴
  417. if labelMap[v.LabelId] == nil {
  418. labelMap[v.LabelId] = v
  419. resp.LabelList = append(resp.LabelList, &saModel.SaCompareSaveRespLabel{
  420. SaLabelId: v.LabelId,
  421. LabelName: v.LabelName,
  422. DocList: make([]*saModel.SaCompareSaveRespDoc, 0),
  423. })
  424. }
  425. // 文档-X轴
  426. if docMap[v.DocId] == nil {
  427. docMap[v.DocId] = v
  428. resp.TitleList = append(resp.TitleList, v.Title)
  429. }
  430. // 标签ID-文档ID作为key写入map, 后续段落匹配
  431. k := fmt.Sprintf("%d-%d", v.LabelId, v.DocId)
  432. if secMap[k] == nil {
  433. secMap[k] = make([]*saModel.SaCompareLabelItem, 0)
  434. }
  435. secMap[k] = append(secMap[k], v)
  436. }
  437. // 填充标签数据
  438. secExistMap := make(map[string]bool)
  439. for _, l := range resp.LabelList {
  440. docs := make([]*saModel.SaCompareSaveRespDoc, 0)
  441. for _, d := range docMap {
  442. dv := new(saModel.SaCompareSaveRespDoc)
  443. dv.SaDocId = d.DocId
  444. dv.Title = d.Title
  445. // 文档段落
  446. k := fmt.Sprintf("%d-%d", l.SaLabelId, d.DocId)
  447. secs := make([]*saModel.SaCompareSaveRespSection, 0)
  448. secList := secMap[k]
  449. if secList != nil && len(secList) > 0 {
  450. for _, s := range secList {
  451. sv := new(saModel.SaCompareSaveRespSection)
  452. sv.SaDocSectionId = s.SectionId
  453. content := html.UnescapeString(s.SectionContent)
  454. if s.CompareContent != "" {
  455. sv.IsPart = 1
  456. content = html.UnescapeString(s.CompareContent)
  457. }
  458. sv.Content = content
  459. // 同标签同文档同段落中的整段/片段去重
  460. ek := fmt.Sprintf("%d-%d-%d-%s", l.SaLabelId, d.DocId, s.SectionId, utils.MD5(content))
  461. fmt.Println(ek)
  462. if secExistMap[ek] {
  463. fmt.Println("跳过", ek)
  464. continue
  465. }
  466. secExistMap[ek] = true
  467. secs = append(secs, sv)
  468. }
  469. }
  470. dv.SectionList = secs
  471. docs = append(docs, dv)
  472. }
  473. // 标签对应的文档列表排序, 与resp.TitleList排序保持一致
  474. sort.Slice(docs, func(i, j int) bool {
  475. return docs[j].SaDocId > docs[i].SaDocId
  476. })
  477. l.DocList = docs
  478. }
  479. return
  480. }