processor_business_logic.go 45 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464
  1. // @Author gmy 2024/8/6 10:50:00
  2. package main
  3. import (
  4. "context"
  5. "eta/eta_crawler/models"
  6. "eta/eta_crawler/utils"
  7. "fmt"
  8. "github.com/PuerkitoBio/goquery"
  9. "github.com/beego/beego/v2/core/logs"
  10. "github.com/chromedp/chromedp"
  11. "log"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "unicode"
  16. )
  17. const (
  18. sourceName = "lysww" // 粮油商务网
  19. )
  20. // TableData 用于存储表格的数据
  21. type TableData struct {
  22. Headers []string `json:"headers"`
  23. Rows [][]string `json:"rows"`
  24. }
  25. // ImportCostProcessor
  26. // @Description: 进口成本处理器
  27. type ImportCostProcessor struct{}
  28. func (p *ImportCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  29. fmt.Println("Processing import cost...")
  30. // 解析关键字
  31. if len(keywords) < 5 {
  32. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : keywords must contain at least 5 elements")
  33. }
  34. // 拿到 行关键字和列关键字
  35. columnName := keywords[len(keywords)-4]
  36. rowVariety := keywords[0]
  37. rowPort := keywords[len(keywords)-3]
  38. indexNamePrefix := keywords[:1]
  39. indexNameSuffix := keywords[1:]
  40. // 提取所有表格数据
  41. tableData := getNoHeadTableData(reportContent)
  42. // 提取日期信息
  43. dateText, err := getDateInfo(ctx)
  44. if err != nil {
  45. return []models.BaseFromLyData{}, err
  46. }
  47. // 时间格式转换
  48. format, err := utils.ConvertTimeFormat(dateText)
  49. if err != nil {
  50. return []models.BaseFromLyData{}, err
  51. }
  52. // 解析日期并计算当前月份
  53. var targetMonths []string
  54. if product == "油菜籽" {
  55. targetMonths, err = utils.ParseDateAndMonthColzaOil(format)
  56. } else {
  57. targetMonths, err = utils.ParseDateAndMonth(dateText)
  58. }
  59. if err != nil {
  60. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : Failed to parse date: %v", err)
  61. }
  62. fmt.Printf("Target Month: %s\n", targetMonths)
  63. // 处理提取的表格数据
  64. var result []models.BaseFromLyData
  65. for _, data := range tableData {
  66. tableHeaders := data.Headers
  67. tableRows := data.Rows
  68. // 查找目标列
  69. columnIdx := -1
  70. for i, header := range tableHeaders {
  71. if strings.Contains(header, columnName) {
  72. columnIdx = i
  73. break
  74. }
  75. }
  76. if columnIdx == -1 {
  77. log.Printf("ProcessingImportCostProcessor Process() : Column '%s' not found in table", columnName)
  78. continue
  79. }
  80. // 处理表格中的每一行
  81. //var flag bool = true
  82. var previousRowVariety string
  83. var previousRowPort string
  84. for rowIndex, row := range tableRows {
  85. if len(row) == len(tableHeaders) {
  86. previousRowVariety = row[0]
  87. previousRowPort = row[1]
  88. } else if len(row) == len(tableHeaders)-1 {
  89. previousRowPort = row[0]
  90. row = append([]string{previousRowVariety}, row...)
  91. tableRows[rowIndex] = row
  92. } else if len(row) == len(tableHeaders)-2 {
  93. row = append([]string{previousRowVariety, previousRowPort}, row...)
  94. tableRows[rowIndex] = row
  95. }
  96. for _, targetMonth := range targetMonths {
  97. if len(row) >= len(tableHeaders) && strings.Contains(rowVariety, row[0]) && row[1] == targetMonth && row[len(row)-1] == rowPort {
  98. if columnIdx < len(row) {
  99. // 指标名称
  100. indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
  101. indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
  102. // 指标编码
  103. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  104. // 指标id获取
  105. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  106. if err != nil {
  107. logs.Error("ProcessingImportCostProcessor Process() : Failed to get index id: %v", err)
  108. continue
  109. }
  110. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  111. if err != nil {
  112. logs.Error("ProcessingImportCostProcessor Process() : Failed to get data by index id and date: %v", err)
  113. continue
  114. }
  115. if len(indexData) > 0 {
  116. logs.Info("ProcessingImportCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  117. continue
  118. }
  119. valueStr := row[columnIdx]
  120. value, err := strconv.ParseFloat(valueStr, 64)
  121. if err != nil {
  122. return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
  123. }
  124. // 创建并添加到结果列表
  125. baseFromLyData := models.BaseFromLyData{
  126. DataTime: format,
  127. Value: value,
  128. BaseFromLyIndexId: indexId,
  129. IndexCode: indexCode,
  130. }
  131. result = append(result, baseFromLyData)
  132. } else {
  133. log.Printf("ProcessingImportCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, rowPort)
  134. }
  135. break
  136. }
  137. }
  138. }
  139. }
  140. return result, nil
  141. }
  142. // ProcessingProfitProcessor
  143. // @Description: 加工利润处理器
  144. type ProcessingProfitProcessor struct{}
  145. func (p *ProcessingProfitProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  146. fmt.Println("Processing processing profit...")
  147. // 解析关键字
  148. if len(keywords) < 4 {
  149. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingProfitProcessor Process() : keywords must contain at least 4 elements")
  150. }
  151. // 拿到 行关键字和列关键字
  152. columnName := keywords[1]
  153. rowVariety := keywords[0]
  154. indexNamePrefix := keywords[:1]
  155. indexNameSuffix := keywords[1:]
  156. // 提取所有表格数据
  157. tableData := getNoHeadTableData(reportContent)
  158. // 提取日期信息
  159. dateText, err := getDateInfo(ctx)
  160. if err != nil {
  161. return []models.BaseFromLyData{}, err
  162. }
  163. // 时间格式转换
  164. format, err := utils.ConvertTimeFormat(dateText)
  165. if err != nil {
  166. return []models.BaseFromLyData{}, err
  167. }
  168. // 解析日期并计算当前月份 和 后两月
  169. yearMonths, err := utils.ConvertTimeFormatToYearMonth(format)
  170. if err != nil {
  171. return nil, err
  172. }
  173. fmt.Printf("Target yearMonth: %s\n", yearMonths)
  174. // 处理提取的表格数据
  175. var result []models.BaseFromLyData
  176. for _, data := range tableData {
  177. tableHeaders := data.Headers
  178. tableRows := data.Rows
  179. // 查找目标列
  180. columnIdx := -1
  181. for i, header := range tableHeaders {
  182. if strings.Contains(columnName, header) {
  183. columnIdx = i
  184. break
  185. }
  186. }
  187. if columnIdx == -1 {
  188. log.Printf("ProcessingProfitProcessor Process() : Column '%s' not found in table", columnName)
  189. continue
  190. }
  191. // 处理表格中的每一行
  192. var previousRowVariety string
  193. for rowIndex, row := range tableRows {
  194. if len(row) == len(tableHeaders) {
  195. previousRowVariety = row[0]
  196. } else if len(row) == len(tableHeaders)-1 {
  197. row = append([]string{previousRowVariety}, row...)
  198. tableRows[rowIndex] = row
  199. }
  200. for _, targetMonth := range yearMonths {
  201. if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == targetMonth {
  202. if columnIdx < len(row) {
  203. // 指标名称
  204. indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
  205. indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
  206. // 指标编码
  207. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  208. // 指标id获取
  209. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  210. if err != nil {
  211. logs.Error("ProcessingProfitProcessor Process() : Failed to get index id: %v", err)
  212. continue
  213. }
  214. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  215. if err != nil {
  216. logs.Error("ProcessingProfitProcessor Process() : Failed to get data by index id and date: %v", err)
  217. continue
  218. }
  219. if len(indexData) > 0 {
  220. logs.Info("ProcessingProfitProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  221. continue
  222. }
  223. valueStr := row[columnIdx]
  224. value, err := strconv.ParseFloat(valueStr, 64)
  225. if err != nil {
  226. return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
  227. }
  228. // 创建并添加到结果列表
  229. baseFromLyData := models.BaseFromLyData{
  230. DataTime: format,
  231. Value: value,
  232. BaseFromLyIndexId: indexId,
  233. IndexCode: indexCode,
  234. }
  235. result = append(result, baseFromLyData)
  236. } else {
  237. log.Printf("ProcessingProfitProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
  238. }
  239. break
  240. }
  241. }
  242. }
  243. }
  244. return result, nil
  245. }
  246. // ShippingCostProcessor
  247. // @Description: 船运费用处理器
  248. type ShippingCostProcessor struct{}
  249. func (p *ShippingCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  250. fmt.Println("Processing processing profit...")
  251. // 解析关键字
  252. if len(keywords) < 4 {
  253. return []models.BaseFromLyData{}, fmt.Errorf("ShippingCostProcessor Process() : keywords must contain at least 5 elements")
  254. }
  255. // 拿到 行关键字和列关键字
  256. columnName := keywords[len(keywords)-3]
  257. rowVariety := keywords[0]
  258. rowDestination := keywords[1]
  259. rowShipType := keywords[2]
  260. // 提取所有表格数据
  261. tableData := getNoHeadTableData(reportContent)
  262. // 提取日期信息
  263. dateText, err := getDateInfo(ctx)
  264. if err != nil {
  265. return []models.BaseFromLyData{}, err
  266. }
  267. // 时间格式转换
  268. format, err := utils.ConvertTimeFormat(dateText)
  269. if err != nil {
  270. return []models.BaseFromLyData{}, err
  271. }
  272. // 处理提取的表格数据
  273. var result []models.BaseFromLyData
  274. for _, data := range tableData {
  275. tableHeaders := data.Headers
  276. tableRows := data.Rows
  277. // 查找目标列
  278. columnIdx := -1
  279. for i, header := range tableHeaders {
  280. if strings.Contains(header, columnName) {
  281. columnIdx = i
  282. break
  283. }
  284. }
  285. if columnIdx == -1 {
  286. log.Printf("ShippingCostProcessor Process() : Column '%s' not found in table", columnName)
  287. continue
  288. }
  289. // 处理表格中的每一行
  290. for rowIndex, row := range tableRows {
  291. if len(row) == len(tableHeaders)-1 {
  292. row = append([]string{rowVariety}, row...)
  293. tableRows[rowIndex] = row
  294. rowShipType, err = extractValueInParentheses(rowVariety)
  295. if err != nil {
  296. logs.Error("ShippingCostProcessor Process() : Failed to extract value in parentheses: %v", err)
  297. continue
  298. }
  299. }
  300. if len(row) >= len(tableHeaders) && row[0] == rowVariety && (row[1] == rowDestination || strings.Contains(row[0], row[1])) && row[2] == rowShipType {
  301. if columnIdx < len(row) {
  302. // 指标名称
  303. indexName := strings.Join(keywords[:len(keywords)-3], `:`)
  304. // 指标编码
  305. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  306. // 指标id获取
  307. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  308. if err != nil {
  309. logs.Error("ShippingCostProcessor Process() : Failed to get index id: %v", err)
  310. continue
  311. }
  312. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  313. if err != nil {
  314. logs.Error("ShippingCostProcessor Process() : Failed to get data by index id and date: %v", err)
  315. continue
  316. }
  317. if len(indexData) > 0 {
  318. logs.Info("ShippingCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  319. continue
  320. }
  321. valueStr := row[columnIdx]
  322. value, err := strconv.ParseFloat(valueStr, 64)
  323. if err != nil {
  324. return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
  325. }
  326. // 创建并添加到结果列表
  327. baseFromLyData := models.BaseFromLyData{
  328. DataTime: format,
  329. Value: value,
  330. BaseFromLyIndexId: indexId,
  331. IndexCode: indexCode,
  332. }
  333. result = append(result, baseFromLyData)
  334. } else {
  335. log.Printf("ShippingCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
  336. }
  337. break
  338. }
  339. }
  340. }
  341. return result, nil
  342. }
  343. // SupplyDemandBalanceProcessor
  344. // @Description: 供需平衡处理器
  345. type SupplyDemandBalanceProcessor struct{}
  346. func (p *SupplyDemandBalanceProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  347. // https://www.fao.com.cn/art/gG7gKTCNDHLJNsq9QRYjoQ==.htm
  348. logs.Info("Processing processing report...")
  349. // 解析关键字
  350. if len(keywords) < 4 {
  351. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 4 elements")
  352. }
  353. // 拿到 行关键字和列关键字
  354. var columnName string
  355. rowVariety := keywords[1]
  356. // 提取所有表格数据
  357. tableData := getTableData(reportContent, true)
  358. logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
  359. // 提取日期信息
  360. dateText, err := getDateInfo(ctx)
  361. if err != nil {
  362. return []models.BaseFromLyData{}, err
  363. }
  364. // 时间格式转换
  365. format, err := utils.ConvertTimeFormat(dateText)
  366. if err != nil {
  367. return []models.BaseFromLyData{}, err
  368. }
  369. currentYearAndNextYear, err := utils.GetCurrentYearAndNextYear(format)
  370. if err != nil {
  371. return nil, err
  372. }
  373. month, err := utils.GetCurrentMonth(format)
  374. if err != nil {
  375. return nil, err
  376. }
  377. monthSuffix := "预估"
  378. logs.Info("SupplyDemandBalanceProcessor Process() : Target Year: %s:%s\n", currentYearAndNextYear, month+monthSuffix)
  379. // 处理提取的表格数据
  380. var result []models.BaseFromLyData
  381. headers := tableData.Headers
  382. rows := tableData.Rows
  383. for _, year := range currentYearAndNextYear {
  384. columnName = year + month + monthSuffix
  385. isCurrentYear, err := utils.IsCurrentYear(year)
  386. if err != nil {
  387. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to determine if year is current year: %v", err)
  388. continue
  389. }
  390. if !isCurrentYear {
  391. format, err = utils.GetNextYearLastDay(format)
  392. if err != nil {
  393. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get next year last day: %v", err)
  394. continue
  395. }
  396. }
  397. // 查找目标列
  398. columnIdx := -1
  399. for i, header := range headers {
  400. if strings.Contains(columnName, header) {
  401. columnIdx = i
  402. break
  403. }
  404. }
  405. if columnIdx == -1 {
  406. logs.Error("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
  407. continue
  408. }
  409. // 处理表格中的每一行
  410. for _, row := range rows {
  411. if len(row) >= len(headers) && row[0] == rowVariety {
  412. if columnIdx < len(row) {
  413. // 指标名称
  414. indexName := strings.Join(keywords[:len(keywords)-2], ":")
  415. // 指标编码
  416. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  417. // 指标id获取
  418. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  419. if err != nil {
  420. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
  421. continue
  422. }
  423. valueStr := row[columnIdx]
  424. value, err := strconv.ParseFloat(valueStr, 64)
  425. if err != nil {
  426. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
  427. }
  428. yearMonth, err := utils.GetYearMonth(format)
  429. if err != nil {
  430. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get year month: %v", err)
  431. continue
  432. }
  433. indexData, err := models.GetLyDataByIndexIdAndDataTimeYM(indexId, yearMonth)
  434. if err != nil {
  435. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
  436. continue
  437. }
  438. if len(indexData) > 0 {
  439. logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  440. // 存在走更新逻辑 主要更新今年在去年的预估值
  441. indexData := indexData[0]
  442. if indexData.Value != value {
  443. err := models.UpdateLyDataById(indexData.BaseFromLyDataId, value)
  444. if err != nil {
  445. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to update data: %v", err)
  446. continue
  447. }
  448. }
  449. continue
  450. }
  451. // 创建并添加到结果列表
  452. baseFromLyData := models.BaseFromLyData{
  453. DataTime: format,
  454. Value: value,
  455. BaseFromLyIndexId: indexId,
  456. IndexCode: indexCode,
  457. }
  458. result = append(result, baseFromLyData)
  459. } else {
  460. log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
  461. }
  462. break
  463. }
  464. }
  465. }
  466. return result, nil
  467. }
  468. // PurchaseShippingProcessor
  469. // @Description: 采购装船处理器
  470. type PurchaseShippingProcessor struct{}
  471. func (p *PurchaseShippingProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  472. logs.Info("Processing purchase shipping...")
  473. // TODO 卡住了
  474. // 解析关键字
  475. if len(keywords) < 3 {
  476. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 3 elements")
  477. }
  478. // 拿到 行关键字和列关键字
  479. var columnName string
  480. rowVariety := keywords[1]
  481. // 提取所有表格数据
  482. tableData := getPurchaseShippingTableData(reportContent)
  483. logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
  484. // 提取日期信息
  485. dateText, err := getDateInfo(ctx)
  486. if err != nil {
  487. return []models.BaseFromLyData{}, err
  488. }
  489. // 时间格式转换
  490. format, err := utils.ConvertTimeFormat(dateText)
  491. if err != nil {
  492. return []models.BaseFromLyData{}, err
  493. }
  494. currentYearAndNextYear, err := utils.GetCurrentYearAndNextYear(format)
  495. if err != nil {
  496. return nil, err
  497. }
  498. month, err := utils.GetCurrentMonth(format)
  499. if err != nil {
  500. return nil, err
  501. }
  502. monthSuffix := "预估"
  503. logs.Info("SupplyDemandBalanceProcessor Process() : Target Year: %s:%s\n", currentYearAndNextYear, month+monthSuffix)
  504. // 处理提取的表格数据
  505. var result []models.BaseFromLyData
  506. headers := tableData.Headers
  507. rows := tableData.Rows
  508. for _, year := range currentYearAndNextYear {
  509. columnName = year + month + monthSuffix
  510. // 查找目标列
  511. columnIdx := -1
  512. for i, header := range headers {
  513. if strings.Contains(columnName, header) {
  514. columnIdx = i
  515. break
  516. }
  517. }
  518. if columnIdx == -1 {
  519. log.Printf("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
  520. continue
  521. }
  522. // 处理表格中的每一行
  523. for _, row := range rows {
  524. if len(row) >= len(headers) && row[0] == rowVariety {
  525. if columnIdx < len(row) {
  526. // 指标名称
  527. indexName := strings.Join(keywords[:len(keywords)-2], ":")
  528. // 指标编码
  529. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  530. // 指标id获取
  531. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  532. if err != nil {
  533. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
  534. continue
  535. }
  536. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  537. if err != nil {
  538. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
  539. continue
  540. }
  541. if len(indexData) > 0 {
  542. logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  543. continue
  544. }
  545. valueStr := row[columnIdx]
  546. value, err := strconv.ParseFloat(valueStr, 64)
  547. if err != nil {
  548. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
  549. }
  550. // 创建并添加到结果列表
  551. baseFromLyData := models.BaseFromLyData{
  552. DataTime: format,
  553. Value: value,
  554. BaseFromLyIndexId: indexId,
  555. IndexCode: indexCode,
  556. }
  557. result = append(result, baseFromLyData)
  558. } else {
  559. log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
  560. }
  561. break
  562. }
  563. }
  564. }
  565. return result, nil
  566. }
  567. // ProcessingReportProcessor
  568. // @Description: 加工报告处理器
  569. type ProcessingReportProcessor struct {
  570. }
  571. func (p *ProcessingReportProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  572. logs.Info("Processing processing report...")
  573. // 解析关键字
  574. if len(keywords) < 3 {
  575. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : keywords must contain at least 3 elements")
  576. }
  577. // 拿到 行关键字和列关键字
  578. columnName := keywords[0]
  579. rowName := keywords[1]
  580. // 提取所有表格数据
  581. tableData := getAllTableData(reportContent)
  582. // 提取日期信息
  583. dateText, err := getDateInfo(ctx)
  584. if err != nil {
  585. return []models.BaseFromLyData{}, err
  586. }
  587. indexName := strings.Join(keywords[:len(keywords)-2], ":")
  588. // 指标编码
  589. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  590. // 指标id获取
  591. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  592. if err != nil {
  593. return nil, err
  594. }
  595. // 校验指标数据是否存在 根据指标id和日期 存在则跳过,不存在正常往下走
  596. format, err := utils.ConvertTimeFormat(dateText)
  597. if err != nil {
  598. return []models.BaseFromLyData{}, err
  599. }
  600. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  601. if err != nil {
  602. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to get data by index id and date: %v", err)
  603. }
  604. if len(indexData) > 0 {
  605. logs.Info("ProcessingReportProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  606. return []models.BaseFromLyData{}, nil
  607. }
  608. // 解析日期并计算当前周数
  609. targetWeek, err := utils.ParseDateAndWeek(dateText)
  610. if err != nil {
  611. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to parse date: %v", err)
  612. }
  613. fmt.Printf("Target Week: %s\n", targetWeek)
  614. var result []models.BaseFromLyData
  615. // 处理提取的表格数据
  616. for _, data := range tableData {
  617. tableHeaders := data.Headers
  618. tableRows := data.Rows
  619. // 查找目标列
  620. columnIdx := -1
  621. for i, header := range tableHeaders {
  622. headerString := extractChinese(header)
  623. if strings.Contains(columnName, headerString) {
  624. columnIdx = i
  625. break
  626. }
  627. }
  628. if columnIdx == -1 {
  629. logs.Error("ProcessingReportProcessor Process() : Column '%s' not found in table", columnName)
  630. continue
  631. }
  632. // 查找本周的列位置
  633. weekIdx := -1
  634. for i, header := range tableHeaders {
  635. if strings.Contains(header, targetWeek) && i > columnIdx {
  636. weekIdx = i
  637. break
  638. }
  639. }
  640. if weekIdx == -1 {
  641. fmt.Printf("Week column '%s' not found in table\n", targetWeek)
  642. continue
  643. }
  644. // 查找目标行
  645. for _, row := range tableRows {
  646. if len(row) > 0 && strings.Contains(row[0], rowName) {
  647. if weekIdx < len(row) {
  648. logs.Info("Value in column '%s' - '%s': %s", columnName, rowName, row[columnIdx])
  649. numFlag := isNumeric(row[columnIdx])
  650. if numFlag {
  651. value, err := strconv.ParseFloat(row[columnIdx], 64)
  652. if err != nil {
  653. logs.Error("ProcessingReportProcessor Process() : Error converting value to float64: %v", err)
  654. return []models.BaseFromLyData{}, err
  655. }
  656. // 返回BaseFromLyData对象的数据
  657. baseFromLyData := models.BaseFromLyData{
  658. DataTime: dateText,
  659. Value: value,
  660. }
  661. result = append(result, baseFromLyData)
  662. }
  663. } else {
  664. logs.Error("ProcessingReportProcessor Process() : Column index out of range")
  665. }
  666. }
  667. }
  668. }
  669. // TODO 后面把这个日志打印,不做返回错误处理,一个指标找不到会导致后续指标无法处理
  670. return result, nil
  671. }
  672. // InventoryAnalysisProcessor
  673. // @Description: 库存分析处理器
  674. type InventoryAnalysisProcessor struct{}
  675. func (p *InventoryAnalysisProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  676. // https://www.fao.com.cn/art/yg1IKj9FpPEIDv2LefnPhQ==.htm
  677. logs.Info("Processing inventory analysis...")
  678. // 解析关键字
  679. if len(keywords) < 4 {
  680. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 4 elements")
  681. }
  682. // 拿到 行关键字和列关键字
  683. columnName := keywords[0]
  684. rowVariety := keywords[1]
  685. columnSuffix := "本周"
  686. columnName = columnName + columnSuffix
  687. // 提取所有表格数据
  688. tableData := getTableData(reportContent, true)
  689. logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
  690. // 提取日期信息
  691. dateText, err := getDateInfo(ctx)
  692. if err != nil {
  693. return []models.BaseFromLyData{}, err
  694. }
  695. // 时间格式转换
  696. format, err := utils.ConvertTimeFormat(dateText)
  697. if err != nil {
  698. return []models.BaseFromLyData{}, err
  699. }
  700. // 处理提取的表格数据
  701. var result []models.BaseFromLyData
  702. headers := tableData.Headers
  703. rows := tableData.Rows
  704. // 查找目标列
  705. columnIdx := -1
  706. for i, header := range headers {
  707. header := removeParentheses(header)
  708. if strings.Contains(columnName, header) {
  709. columnIdx = i
  710. break
  711. }
  712. }
  713. if columnIdx == -1 {
  714. logs.Error("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
  715. } else {
  716. // 处理表格中的每一行
  717. for _, row := range rows {
  718. if len(row) >= len(headers) && strings.Contains(row[0], rowVariety) {
  719. if columnIdx < len(row) {
  720. // 指标名称
  721. indexName := strings.Join(keywords[:len(keywords)-2], ":")
  722. indexName = removeParentheses(indexName)
  723. // 指标编码
  724. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  725. // 指标id获取
  726. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  727. if err != nil {
  728. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
  729. continue
  730. }
  731. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  732. if err != nil {
  733. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
  734. continue
  735. }
  736. if len(indexData) > 0 {
  737. logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  738. continue
  739. }
  740. valueStr := row[columnIdx]
  741. value, err := strconv.ParseFloat(valueStr, 64)
  742. if err != nil {
  743. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
  744. }
  745. // 创建并添加到结果列表
  746. baseFromLyData := models.BaseFromLyData{
  747. DataTime: format,
  748. Value: value,
  749. BaseFromLyIndexId: indexId,
  750. IndexCode: indexCode,
  751. }
  752. result = append(result, baseFromLyData)
  753. } else {
  754. log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
  755. }
  756. break
  757. }
  758. }
  759. }
  760. return result, nil
  761. }
  762. // PriceSpreadArbitrageProcessor
  763. // @Description: 价差套利处理器
  764. type PriceSpreadArbitrageProcessor struct{}
  765. func (p *PriceSpreadArbitrageProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  766. fmt.Println("Processing processing profit...")
  767. // 解析关键字
  768. if len(keywords) < 4 {
  769. return []models.BaseFromLyData{}, fmt.Errorf("ProcessingProfitProcessor Process() : keywords must contain at least 4 elements")
  770. }
  771. // 拿到 行关键字和列关键字
  772. var columnDate string
  773. rowVariety := keywords[0]
  774. rowBourse := keywords[1]
  775. // 提取所有表格数据
  776. tableData := getNoHeadTableData(reportContent)
  777. // 提取日期信息
  778. dateText, err := getDateInfo(ctx)
  779. if err != nil {
  780. return []models.BaseFromLyData{}, err
  781. }
  782. // 时间格式转换
  783. format, err := utils.ConvertTimeFormat(dateText)
  784. if err != nil {
  785. return []models.BaseFromLyData{}, err
  786. }
  787. day, err := utils.ConvertTimeFormatToYearMonthDay(format)
  788. if err != nil {
  789. return nil, err
  790. }
  791. columnDate = day
  792. // 处理提取的表格数据
  793. var result []models.BaseFromLyData
  794. for _, data := range tableData {
  795. tableHeaders := data.Headers
  796. tableRows := data.Rows
  797. // 查找目标列
  798. columnIdx := -1
  799. for i, header := range tableHeaders {
  800. if strings.Contains(header, columnDate) {
  801. columnIdx = i
  802. break
  803. }
  804. }
  805. if columnIdx == -1 {
  806. log.Printf("ProcessingProfitProcessor Process() : Column '%s' not found in table", columnDate)
  807. continue
  808. }
  809. // 处理表格中的每一行
  810. for _, row := range tableRows {
  811. if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == rowBourse {
  812. if columnIdx < len(row) {
  813. // 指标名称
  814. indexName := strings.Join(keywords[:len(keywords)-2], ":")
  815. // 指标编码
  816. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  817. // 指标id获取
  818. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  819. if err != nil {
  820. logs.Error("ProcessingProfitProcessor Process() : Failed to get index id: %v", err)
  821. continue
  822. }
  823. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  824. if err != nil {
  825. logs.Error("ProcessingProfitProcessor Process() : Failed to get data by index id and date: %v", err)
  826. continue
  827. }
  828. if len(indexData) > 0 {
  829. logs.Info("ProcessingProfitProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  830. continue
  831. }
  832. valueStr := row[columnIdx]
  833. value, err := strconv.ParseFloat(valueStr, 64)
  834. if err != nil {
  835. return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
  836. }
  837. // 创建并添加到结果列表
  838. baseFromLyData := models.BaseFromLyData{
  839. DataTime: format,
  840. Value: value,
  841. BaseFromLyIndexId: indexId,
  842. IndexCode: indexCode,
  843. }
  844. result = append(result, baseFromLyData)
  845. } else {
  846. log.Printf("ProcessingProfitProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnDate)
  847. }
  848. break
  849. }
  850. }
  851. }
  852. return result, nil
  853. }
  854. // DailyTransactionProcessor
  855. // @Description: 每日成交处理器
  856. type DailyTransactionProcessor struct{}
  857. func (p *DailyTransactionProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
  858. fmt.Println("Processing processing profit...")
  859. // 解析关键字
  860. if len(keywords) < 4 {
  861. return []models.BaseFromLyData{}, fmt.Errorf("DailyTransactionProcessor Process() : keywords must contain at least 4 elements")
  862. }
  863. // 获取第一个表格
  864. areaTableData := getNoHeadTableData(reportContent)[0]
  865. // 获取第二个表格
  866. blocTableData := getTableData(reportContent, false)
  867. logs.Info("SupplyDemandBalanceProcessor Process() : areaTableData data: %v, blocTableData data: %v", areaTableData, blocTableData)
  868. // 提取日期信息
  869. dateText, err := getDateInfo(ctx)
  870. if err != nil {
  871. return []models.BaseFromLyData{}, err
  872. }
  873. // 时间格式转换
  874. format, err := utils.ConvertTimeFormat(dateText)
  875. if err != nil {
  876. return []models.BaseFromLyData{}, err
  877. }
  878. // 处理提取的表格数据
  879. var result []models.BaseFromLyData
  880. areaHeaders := areaTableData.Headers
  881. areaRows := areaTableData.Rows
  882. // 第一个表格
  883. // 拿到 行关键字和列关键字
  884. columnArea := keywords[1]
  885. var rowAreaMonthDays []string
  886. rowWeek := "均价"
  887. monthDay, err := utils.GetWeekdaysInSameWeek(format)
  888. if err != nil {
  889. return nil, err
  890. }
  891. rowAreaMonthDays = monthDay
  892. // 查找目标列
  893. areaColumnIdx := -1
  894. for i, header := range areaHeaders {
  895. if strings.Contains(header, columnArea) {
  896. areaColumnIdx = i
  897. break
  898. }
  899. }
  900. if areaColumnIdx == -1 {
  901. log.Printf("DailyTransactionProcessor Process() : One Column '%s' not found in table", columnArea)
  902. } else {
  903. for _, row := range areaRows {
  904. for _, monthDay := range rowAreaMonthDays {
  905. if len(row) >= len(areaHeaders) && (row[0] == monthDay || row[0] == rowWeek) {
  906. if areaColumnIdx < len(row) {
  907. // 指标名称
  908. indexName := strings.Join(keywords[:len(keywords)-3], ":")
  909. // 指标编码
  910. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  911. // 指标id获取
  912. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  913. if err != nil {
  914. logs.Error("DailyTransactionProcessor Process() : Failed to get index id: %v", err)
  915. continue
  916. }
  917. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  918. if err != nil {
  919. logs.Error("DailyTransactionProcessor Process() : Failed to get data by index id and date: %v", err)
  920. continue
  921. }
  922. if len(indexData) > 0 {
  923. logs.Info("DailyTransactionProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  924. continue
  925. }
  926. valueStr := row[areaColumnIdx]
  927. isChinese := IsChinese(valueStr)
  928. if isChinese {
  929. continue
  930. }
  931. value, err := strconv.ParseFloat(valueStr, 64)
  932. if err != nil {
  933. return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
  934. }
  935. // 创建并添加到结果列表
  936. var dealDate string
  937. if row[0] == rowWeek {
  938. dealDate = format
  939. } else {
  940. date, err := utils.ConvertToDate(row[0])
  941. if err != nil {
  942. return nil, err
  943. }
  944. dealDate = date
  945. }
  946. baseFromLyData := models.BaseFromLyData{
  947. DataTime: dealDate,
  948. Value: value,
  949. BaseFromLyIndexId: indexId,
  950. IndexCode: indexCode,
  951. }
  952. result = append(result, baseFromLyData)
  953. } else {
  954. log.Printf("DailyTransactionProcessor Process() : Column index out of range for row '%s', '%s'", monthDay, columnArea)
  955. }
  956. break
  957. }
  958. }
  959. }
  960. }
  961. // 第二个表格
  962. // 拿到 行关键字和列关键字
  963. columnBloc := keywords[len(keywords)-3]
  964. rowBloc := keywords[1]
  965. blocHeaders := blocTableData.Headers
  966. blocRows := blocTableData.Rows
  967. // 查找目标列
  968. blocColumnIdx := -1
  969. for i, header := range blocHeaders {
  970. if strings.Contains(header, columnBloc) {
  971. blocColumnIdx = i
  972. break
  973. }
  974. }
  975. if blocColumnIdx == -1 {
  976. log.Printf("DailyTransactionProcessor Process() : Two Column '%s' not found in table", columnBloc)
  977. } else {
  978. // 处理表格中的每一行
  979. for _, row := range blocRows {
  980. if len(row) >= len(blocHeaders) && strings.Contains(row[0], rowBloc) {
  981. if blocColumnIdx < len(row) {
  982. // 指标名称
  983. indexName := strings.Join(keywords[:len(keywords)-3], ":")
  984. indexName = removeParentheses(indexName)
  985. // 指标编码
  986. indexCode := utils.GenerateIndexCode(sourceName, indexName)
  987. // 指标id获取
  988. indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
  989. if err != nil {
  990. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
  991. continue
  992. }
  993. indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
  994. if err != nil {
  995. logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
  996. continue
  997. }
  998. if len(indexData) > 0 {
  999. logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
  1000. continue
  1001. }
  1002. valueStr := row[blocColumnIdx]
  1003. value, err := strconv.ParseFloat(valueStr, 64)
  1004. if err != nil {
  1005. return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
  1006. }
  1007. // 创建并添加到结果列表
  1008. baseFromLyData := models.BaseFromLyData{
  1009. DataTime: format,
  1010. Value: value,
  1011. BaseFromLyIndexId: indexId,
  1012. IndexCode: indexCode,
  1013. }
  1014. result = append(result, baseFromLyData)
  1015. } else {
  1016. log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowBloc, columnBloc)
  1017. }
  1018. break
  1019. }
  1020. }
  1021. }
  1022. return result, nil
  1023. }
  1024. // ExtractValueInParentheses 从字符串中提取括号中的值
  1025. func extractValueInParentheses(input string) (string, error) {
  1026. re := regexp.MustCompile(`(([^)]+))`)
  1027. matches := re.FindStringSubmatch(input)
  1028. if len(matches) > 1 {
  1029. return matches[1], nil
  1030. }
  1031. return "", fmt.Errorf("no value found in parentheses")
  1032. }
  1033. // 获取指标id,根据指标名称判断,没有插入指标生成返回
  1034. func getIndexId(indexCode string, indexName string, classifyId int, sourceName string, frequency string, unit string) (int, error) {
  1035. // 判断指标是否存在
  1036. var indexId int
  1037. indexInfo, err := models.GetLyIndexByCode(indexCode)
  1038. if err != nil {
  1039. // 新增指标
  1040. index, err := addLyIndex(classifyId, indexCode, indexName, frequency, unit)
  1041. if err != nil {
  1042. return 0, err
  1043. }
  1044. indexId = index
  1045. } else {
  1046. indexId = indexInfo.BaseFromLyIndexId
  1047. }
  1048. return indexId, nil
  1049. }
  1050. // 获取页面时间信息
  1051. func getDateInfo(ctx context.Context) (string, error) {
  1052. var dateText string
  1053. err := chromedp.Run(ctx,
  1054. chromedp.Evaluate(`document.querySelector('div.a_date span').innerText`, &dateText),
  1055. )
  1056. if err != nil {
  1057. return "", fmt.Errorf("processing Process() : Failed to extract report date: %v", err)
  1058. }
  1059. logs.Info("Processing Process() : Report Extracted Date: %s", dateText)
  1060. return dateText, nil
  1061. }
  1062. // 获取所有表格数据 获取表格中有thead标签的数据
  1063. func getAllTableData(reportContent string) []TableData {
  1064. var tableData []TableData
  1065. doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
  1066. if err != nil {
  1067. log.Fatal(err)
  1068. }
  1069. // 选择 id 为 "a_content" 的 div
  1070. doc.Find("#a_content").Each(func(index int, item *goquery.Selection) {
  1071. item.Find("table").Each(func(index int, table *goquery.Selection) {
  1072. var headers []string
  1073. var rows [][]string
  1074. // 提取表头
  1075. table.Find("thead th").Each(func(index int, th *goquery.Selection) {
  1076. headers = append(headers, th.Text())
  1077. })
  1078. // 提取表格行数据
  1079. table.Find("tbody tr").Each(func(index int, row *goquery.Selection) {
  1080. var rowData []string
  1081. row.Find("td").Each(func(index int, td *goquery.Selection) {
  1082. rowData = append(rowData, td.Text())
  1083. })
  1084. rows = append(rows, rowData)
  1085. })
  1086. // 仅在表头存在时添加到结果中
  1087. if len(headers) > 0 {
  1088. tableData = append(tableData, TableData{
  1089. Headers: headers,
  1090. Rows: rows,
  1091. })
  1092. }
  1093. })
  1094. })
  1095. return tableData
  1096. }
  1097. // 获取无头表格数据
  1098. func getNoHeadTableData(reportContent string) []TableData {
  1099. var tableData []TableData
  1100. doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
  1101. if err != nil {
  1102. log.Fatal(err)
  1103. }
  1104. // Find the div with id "a_content"
  1105. doc.Find("#a_content").Each(func(index int, div *goquery.Selection) {
  1106. // Find all tables within the div
  1107. div.Find("table").Each(func(index int, table *goquery.Selection) {
  1108. var headers []string
  1109. var rows [][]string
  1110. // Extract table headers if any
  1111. table.Find("tr").Each(func(index int, tr *goquery.Selection) {
  1112. var rowData []string
  1113. tr.Find("td, th").Each(func(index int, cell *goquery.Selection) {
  1114. cellText := cell.Text()
  1115. rowData = append(rowData, cellText)
  1116. })
  1117. if index == 0 && len(rowData) > 0 {
  1118. // The first row is treated as the header row
  1119. headers = rowData
  1120. } else if len(rowData) > 0 {
  1121. // Add the row data to the rows slice
  1122. rows = append(rows, rowData)
  1123. }
  1124. })
  1125. // Only add table data if headers are present
  1126. if len(headers) > 0 {
  1127. tableData = append(tableData, TableData{
  1128. Headers: headers,
  1129. Rows: rows,
  1130. })
  1131. }
  1132. })
  1133. })
  1134. return tableData
  1135. }
  1136. // 获取表格数据 获取id 为 a_content 的 div 中的第一个表格 左上角那个单元格会拼在第一个,会拼上列上的合并单元格
  1137. func getTableData(reportContent string, isFirst bool) TableData {
  1138. doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
  1139. if err != nil {
  1140. log.Fatal(err)
  1141. }
  1142. tableData := &TableData{}
  1143. // 只提取 id 为 a_content 的 div 中的第一个表格
  1144. var firstTable *goquery.Selection
  1145. if isFirst {
  1146. firstTable = doc.Find("div#a_content table").First()
  1147. } else {
  1148. firstTable = doc.Find("div#a_content table").Last()
  1149. }
  1150. var combinedHeaders []string
  1151. // 提取表头
  1152. firstTable.Find("tr").Each(func(i int, row *goquery.Selection) {
  1153. if i == 0 {
  1154. // 第一行处理合并单元格,保存到 combinedHeaders
  1155. row.Find("td,th").Each(func(j int, cell *goquery.Selection) {
  1156. if j == 0 {
  1157. // 把左上角的“年度(10/9月)”放入 Headers 第一个元素
  1158. tableData.Headers = append(tableData.Headers, strings.TrimSpace(cell.Text()))
  1159. } else {
  1160. // 处理其他单元格
  1161. colspan, exists := cell.Attr("colspan")
  1162. if exists {
  1163. spanCount := 0
  1164. fmt.Sscanf(colspan, "%d", &spanCount)
  1165. for k := 0; k < spanCount; k++ {
  1166. combinedHeaders = append(combinedHeaders, strings.TrimSpace(cell.Text()))
  1167. }
  1168. } else {
  1169. combinedHeaders = append(combinedHeaders, strings.TrimSpace(cell.Text()))
  1170. }
  1171. }
  1172. })
  1173. } else if i == 1 {
  1174. // 第二行处理具体标题,组合后保存到 Headers
  1175. row.Find("td,th").Each(func(j int, cell *goquery.Selection) {
  1176. if j < len(combinedHeaders) {
  1177. fullHeader := combinedHeaders[j] + strings.TrimSpace(cell.Text())
  1178. tableData.Headers = append(tableData.Headers, fullHeader)
  1179. }
  1180. })
  1181. } else {
  1182. // 处理数据行
  1183. var rowData []string
  1184. row.Find("td").Each(func(j int, cell *goquery.Selection) {
  1185. rowData = append(rowData, strings.TrimSpace(cell.Text()))
  1186. })
  1187. if len(rowData) > 0 {
  1188. tableData.Rows = append(tableData.Rows, rowData)
  1189. }
  1190. }
  1191. })
  1192. return *tableData
  1193. }
  1194. // 获取采购装船表格数据
  1195. func getPurchaseShippingTableData(reportContent string) TableData {
  1196. doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
  1197. if err != nil {
  1198. log.Fatal(err)
  1199. }
  1200. tableData := &TableData{}
  1201. // 只提取 id 为 a_content 的 div 中的第一个表格
  1202. firstTable := doc.Find("div#a_content table").First()
  1203. // 处理表头的变量
  1204. headers := []string{}
  1205. combinedHeaders := []string{}
  1206. headerRowspans := make(map[int]int)
  1207. // 遍历所有表头行
  1208. firstTable.Find("tr").Each(func(rowIndex int, row *goquery.Selection) {
  1209. row.Find("th, td").Each(func(cellIndex int, cell *goquery.Selection) {
  1210. text := strings.TrimSpace(cell.Text())
  1211. // 处理 colspan 属性
  1212. colspan, exists := cell.Attr("colspan")
  1213. if exists {
  1214. spanCount := 0
  1215. fmt.Sscanf(colspan, "%d", &spanCount)
  1216. for k := 0; k < spanCount; k++ {
  1217. combinedHeaders = append(combinedHeaders, text)
  1218. }
  1219. } else {
  1220. combinedHeaders = append(combinedHeaders, text)
  1221. }
  1222. // 处理 rowspan 属性
  1223. rowspan, exists := cell.Attr("rowspan")
  1224. if exists {
  1225. rowspanCount := 0
  1226. fmt.Sscanf(rowspan, "%d", &rowspanCount)
  1227. if rowspanCount > 1 {
  1228. // 记录该单元格的行合并信息
  1229. for i := 1; i < rowspanCount; i++ {
  1230. headerRowspans[rowIndex+i]++
  1231. }
  1232. }
  1233. }
  1234. })
  1235. // 处理第二行的具体标题
  1236. if rowIndex == 1 {
  1237. combinedHeadersIndex := 0
  1238. row.Find("th, td").Each(func(cellIndex int, cell *goquery.Selection) {
  1239. if combinedHeadersIndex < len(combinedHeaders) {
  1240. if colspan, _ := cell.Attr("colspan"); colspan != "" {
  1241. combinedHeadersIndex += len(strings.Split(colspan, " ")) - 1
  1242. }
  1243. headers = append(headers, combinedHeaders[combinedHeadersIndex])
  1244. combinedHeadersIndex++
  1245. }
  1246. })
  1247. }
  1248. })
  1249. // 处理数据行
  1250. firstTable.Find("tr").Each(func(rowIndex int, row *goquery.Selection) {
  1251. if rowIndex >= 2 {
  1252. var rowData []string
  1253. row.Find("td").Each(func(cellIndex int, cell *goquery.Selection) {
  1254. rowData = append(rowData, strings.TrimSpace(cell.Text()))
  1255. })
  1256. if len(rowData) > 0 {
  1257. tableData.Rows = append(tableData.Rows, rowData)
  1258. }
  1259. }
  1260. })
  1261. tableData.Headers = headers
  1262. return *tableData
  1263. }
  1264. // 判断字符串是否是数字
  1265. func isNumeric(value string) bool {
  1266. // 正则表达式匹配整数和浮点数
  1267. re := regexp.MustCompile(`^[+-]?(\d+(\.\d*)?|\.\d+)$`)
  1268. return re.MatchString(value)
  1269. }
  1270. // 只保留汉字
  1271. func extractChinese(text string) string {
  1272. re := regexp.MustCompile(`[^\p{Han}]`) // 匹配非汉字字符
  1273. return re.ReplaceAllString(text, "")
  1274. }
  1275. // 去除括号中的内容 包含括号 ()
  1276. func removeParentheses(text string) string {
  1277. re := regexp.MustCompile(`\([^)]*\)`)
  1278. return re.ReplaceAllString(text, "")
  1279. }
  1280. // IsChinese 判断传入的是否是汉字
  1281. func IsChinese(str string) bool {
  1282. for _, r := range str {
  1283. if unicode.Is(unicode.Han, r) {
  1284. return true
  1285. }
  1286. }
  1287. return false
  1288. }