|
@@ -3,7 +3,6 @@ package main
|
|
|
|
|
|
import (
|
|
|
"context"
|
|
|
- "encoding/json"
|
|
|
"eta/eta_crawler/models"
|
|
|
"eta/eta_crawler/utils"
|
|
|
"fmt"
|
|
@@ -20,122 +19,556 @@ const (
|
|
|
sourceName = "lysww" // 粮油商务网
|
|
|
)
|
|
|
|
|
|
// TableData stores the data extracted from one table.
type TableData struct {
	// Headers holds the header cell texts of the table.
	Headers []string `json:"headers"`
	// Rows holds the data rows; each row is the list of its cell texts.
	Rows [][]string `json:"rows"`
}
|
|
|
+
|
|
|
// ImportCostProcessor
|
|
|
// @Description: 进口成本处理器
|
|
|
type ImportCostProcessor struct{}
|
|
|
|
|
|
-func (p *ImportCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, indexId int) (models.BaseFromLyData, error) {
|
|
|
+func (p *ImportCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
|
|
|
fmt.Println("Processing import cost...")
|
|
|
- // 实现具体的处理逻辑
|
|
|
- return models.BaseFromLyData{}, nil
|
|
|
+
|
|
|
+ // 解析关键字
|
|
|
+ if len(keywords) < 5 {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : keywords must contain at least 5 elements")
|
|
|
+ }
|
|
|
+
|
|
|
+ // 拿到 行关键字和列关键字
|
|
|
+ columnName := keywords[len(keywords)-4]
|
|
|
+ rowVariety := keywords[0]
|
|
|
+ rowPort := keywords[len(keywords)-3]
|
|
|
+ indexNamePrefix := keywords[:1]
|
|
|
+ indexNameSuffix := keywords[1:]
|
|
|
+
|
|
|
+ // 提取所有表格数据
|
|
|
+ tableData := getNoHeadTableData(reportContent)
|
|
|
+
|
|
|
+ // 提取日期信息
|
|
|
+ dateText, err := getDateInfo(ctx)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 解析日期并计算当前月份
|
|
|
+ targetMonths, err := utils.ParseDateAndMonth(dateText)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : Failed to parse date: %v", err)
|
|
|
+ }
|
|
|
+ fmt.Printf("Target Month: %s\n", targetMonths)
|
|
|
+
|
|
|
+ // 时间格式转换
|
|
|
+ format, err := utils.ConvertTimeFormat(dateText)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理提取的表格数据
|
|
|
+ var result []models.BaseFromLyData
|
|
|
+
|
|
|
+ for _, data := range tableData {
|
|
|
+ tableHeaders := data.Headers
|
|
|
+ tableRows := data.Rows
|
|
|
+
|
|
|
+ // 查找目标列
|
|
|
+ columnIdx := -1
|
|
|
+ for i, header := range tableHeaders {
|
|
|
+ if strings.Contains(header, columnName) {
|
|
|
+ columnIdx = i
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if columnIdx == -1 {
|
|
|
+ log.Printf("ProcessingImportCostProcessor Process() : Column '%s' not found in table", columnName)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理表格中的每一行
|
|
|
+ //var flag bool = true
|
|
|
+ var previousRowVariety string
|
|
|
+ var previousRowPort string
|
|
|
+ for rowIndex, row := range tableRows {
|
|
|
+ if len(row) == len(tableHeaders) {
|
|
|
+ previousRowVariety = row[0]
|
|
|
+ previousRowPort = row[1]
|
|
|
+ } else if len(row) == len(tableHeaders)-1 {
|
|
|
+ previousRowPort = row[0]
|
|
|
+ row = append([]string{previousRowVariety}, row...)
|
|
|
+ tableRows[rowIndex] = row
|
|
|
+ } else if len(row) == len(tableHeaders)-2 {
|
|
|
+ row = append([]string{previousRowVariety, previousRowPort}, row...)
|
|
|
+ tableRows[rowIndex] = row
|
|
|
+ }
|
|
|
+ for _, targetMonth := range targetMonths {
|
|
|
+ if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == targetMonth && row[len(row)-1] == rowPort {
|
|
|
+ if columnIdx < len(row) {
|
|
|
+ // 指标名称
|
|
|
+ indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
|
|
|
+ indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
|
|
|
+ // 指标编码
|
|
|
+ indexCode := utils.GenerateIndexCode(sourceName, indexName)
|
|
|
+ // 指标id获取
|
|
|
+ indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ProcessingImportCostProcessor Process() : Failed to get index id: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ProcessingImportCostProcessor Process() : Failed to get data by index id and date: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if len(indexData) > 0 {
|
|
|
+ logs.Info("ProcessingImportCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ valueStr := row[columnIdx]
|
|
|
+ value, err := strconv.ParseFloat(valueStr, 64)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
|
|
|
+ }
|
|
|
+ // 创建并添加到结果列表
|
|
|
+ baseFromLyData := models.BaseFromLyData{
|
|
|
+ DataTime: format,
|
|
|
+ Value: value,
|
|
|
+ BaseFromLyIndexId: indexId,
|
|
|
+ IndexCode: indexCode,
|
|
|
+ }
|
|
|
+ result = append(result, baseFromLyData)
|
|
|
+ } else {
|
|
|
+ log.Printf("ProcessingImportCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, rowPort)
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ return result, nil
|
|
|
}
|
|
|
|
|
|
// ProcessingProfitProcessor
|
|
|
// @Description: 加工利润处理器
|
|
|
type ProcessingProfitProcessor struct{}
|
|
|
|
|
|
-func (p *ProcessingProfitProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, indexId int) (models.BaseFromLyData, error) {
|
|
|
+func (p *ProcessingProfitProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
|
|
|
fmt.Println("Processing processing profit...")
|
|
|
- // 实现具体的处理逻辑
|
|
|
- return models.BaseFromLyData{}, nil
|
|
|
-}
|
|
|
+ // 解析关键字
|
|
|
+ if len(keywords) < 4 {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingProfitProcessor Process() : keywords must contain at least 4 elements")
|
|
|
+ }
|
|
|
|
|
|
-// ProcessingReportProcessor
|
|
|
-// @Description: 加工报告处理器
|
|
|
-type ProcessingReportProcessor struct {
|
|
|
+ // 拿到 行关键字和列关键字
|
|
|
+ columnName := keywords[1]
|
|
|
+ rowVariety := keywords[0]
|
|
|
+ indexNamePrefix := keywords[:1]
|
|
|
+ indexNameSuffix := keywords[1:]
|
|
|
+
|
|
|
+ // 提取所有表格数据
|
|
|
+ tableData := getNoHeadTableData(reportContent)
|
|
|
+
|
|
|
+ // 提取日期信息
|
|
|
+ dateText, err := getDateInfo(ctx)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 时间格式转换
|
|
|
+ format, err := utils.ConvertTimeFormat(dateText)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 解析日期并计算当前月份 和 后两月
|
|
|
+ yearMonths, err := utils.ConvertTimeFormatToYearMonth(format)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ fmt.Printf("Target yearMonth: %s\n", yearMonths)
|
|
|
+
|
|
|
+ // 处理提取的表格数据
|
|
|
+ var result []models.BaseFromLyData
|
|
|
+
|
|
|
+ for _, data := range tableData {
|
|
|
+ tableHeaders := data.Headers
|
|
|
+ tableRows := data.Rows
|
|
|
+
|
|
|
+ // 查找目标列
|
|
|
+ columnIdx := -1
|
|
|
+ for i, header := range tableHeaders {
|
|
|
+ if strings.Contains(columnName, header) {
|
|
|
+ columnIdx = i
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if columnIdx == -1 {
|
|
|
+ log.Printf("ProcessingProfitProcessor Process() : Column '%s' not found in table", columnName)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理表格中的每一行
|
|
|
+ var previousRowVariety string
|
|
|
+ for rowIndex, row := range tableRows {
|
|
|
+ if len(row) == len(tableHeaders) {
|
|
|
+ previousRowVariety = row[0]
|
|
|
+ } else if len(row) == len(tableHeaders)-1 {
|
|
|
+ row = append([]string{previousRowVariety}, row...)
|
|
|
+ tableRows[rowIndex] = row
|
|
|
+ }
|
|
|
+
|
|
|
+ for _, targetMonth := range yearMonths {
|
|
|
+ if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == targetMonth {
|
|
|
+ if columnIdx < len(row) {
|
|
|
+ // 指标名称
|
|
|
+ indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
|
|
|
+ indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
|
|
|
+ // 指标编码
|
|
|
+ indexCode := utils.GenerateIndexCode(sourceName, indexName)
|
|
|
+ // 指标id获取
|
|
|
+ indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ProcessingProfitProcessor Process() : Failed to get index id: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ProcessingProfitProcessor Process() : Failed to get data by index id and date: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if len(indexData) > 0 {
|
|
|
+ logs.Info("ProcessingProfitProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ valueStr := row[columnIdx]
|
|
|
+ value, err := strconv.ParseFloat(valueStr, 64)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
|
|
|
+ }
|
|
|
+ // 创建并添加到结果列表
|
|
|
+ baseFromLyData := models.BaseFromLyData{
|
|
|
+ DataTime: format,
|
|
|
+ Value: value,
|
|
|
+ BaseFromLyIndexId: indexId,
|
|
|
+ IndexCode: indexCode,
|
|
|
+ }
|
|
|
+ result = append(result, baseFromLyData)
|
|
|
+ } else {
|
|
|
+ log.Printf("ProcessingProfitProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return result, nil
|
|
|
}
|
|
|
|
|
|
-// TableData 用于存储表格的数据
|
|
|
-type TableData struct {
|
|
|
- Headers []string `json:"headers"`
|
|
|
- Rows [][]string `json:"rows"`
|
|
|
+// ShippingCostProcessor
|
|
|
+// @Description: 船运费用处理器
|
|
|
+type ShippingCostProcessor struct{}
|
|
|
+
|
|
|
+func (p *ShippingCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
|
|
|
+ fmt.Println("Processing processing profit...")
|
|
|
+ // 解析关键字
|
|
|
+ if len(keywords) < 4 {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ShippingCostProcessor Process() : keywords must contain at least 5 elements")
|
|
|
+ }
|
|
|
+
|
|
|
+ // 拿到 行关键字和列关键字
|
|
|
+ columnName := keywords[len(keywords)-3]
|
|
|
+ rowVariety := keywords[0]
|
|
|
+ rowDestination := keywords[1]
|
|
|
+ rowShipType := keywords[2]
|
|
|
+
|
|
|
+ // 提取所有表格数据
|
|
|
+ tableData := getNoHeadTableData(reportContent)
|
|
|
+
|
|
|
+ // 提取日期信息
|
|
|
+ dateText, err := getDateInfo(ctx)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 时间格式转换
|
|
|
+ format, err := utils.ConvertTimeFormat(dateText)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理提取的表格数据
|
|
|
+ var result []models.BaseFromLyData
|
|
|
+
|
|
|
+ for _, data := range tableData {
|
|
|
+ tableHeaders := data.Headers
|
|
|
+ tableRows := data.Rows
|
|
|
+
|
|
|
+ // 查找目标列
|
|
|
+ columnIdx := -1
|
|
|
+ for i, header := range tableHeaders {
|
|
|
+ if strings.Contains(header, columnName) {
|
|
|
+ columnIdx = i
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if columnIdx == -1 {
|
|
|
+ log.Printf("ShippingCostProcessor Process() : Column '%s' not found in table", columnName)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理表格中的每一行
|
|
|
+ for rowIndex, row := range tableRows {
|
|
|
+ if len(row) == len(tableHeaders)-1 {
|
|
|
+ row = append([]string{rowVariety}, row...)
|
|
|
+ tableRows[rowIndex] = row
|
|
|
+ rowShipType, err = extractValueInParentheses(rowVariety)
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ShippingCostProcessor Process() : Failed to extract value in parentheses: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ if len(row) >= len(tableHeaders) && row[0] == rowVariety && (row[1] == rowDestination || strings.Contains(row[0], row[1])) && row[2] == rowShipType {
|
|
|
+ if columnIdx < len(row) {
|
|
|
+ // 指标名称
|
|
|
+ indexName := strings.Join(keywords[:len(keywords)-3], `:`)
|
|
|
+ // 指标编码
|
|
|
+ indexCode := utils.GenerateIndexCode(sourceName, indexName)
|
|
|
+ // 指标id获取
|
|
|
+ indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ShippingCostProcessor Process() : Failed to get index id: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
|
|
|
+ if err != nil {
|
|
|
+ logs.Error("ShippingCostProcessor Process() : Failed to get data by index id and date: %v", err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if len(indexData) > 0 {
|
|
|
+ logs.Info("ShippingCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ valueStr := row[columnIdx]
|
|
|
+ value, err := strconv.ParseFloat(valueStr, 64)
|
|
|
+ if err != nil {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
|
|
|
+ }
|
|
|
+ // 创建并添加到结果列表
|
|
|
+ baseFromLyData := models.BaseFromLyData{
|
|
|
+ DataTime: format,
|
|
|
+ Value: value,
|
|
|
+ BaseFromLyIndexId: indexId,
|
|
|
+ IndexCode: indexCode,
|
|
|
+ }
|
|
|
+ result = append(result, baseFromLyData)
|
|
|
+ } else {
|
|
|
+ log.Printf("ShippingCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return result, nil
|
|
|
}
|
|
|
|
|
|
-func (p *ProcessingReportProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, indexId int) (models.BaseFromLyData, error) {
|
|
|
// SupplyDemandBalanceProcessor
// @Description: supply-demand balance processor
type SupplyDemandBalanceProcessor struct{}

// Process is the entry point for supply-demand balance reports.
//
// In its current form it validates that keywords has at least 4 elements,
// extracts the table via getTableData, reads the report date and derives the
// target years and month from it, logs them, and returns an empty result:
// the row-processing step is not implemented yet (see the TODO below).
// product and classifyId are currently unused.
//
// It returns an error when keywords is too short or when the report date
// cannot be read or converted.
func (p *SupplyDemandBalanceProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
	logs.Info("Processing processing report...")
	// Validate the keywords.
	if len(keywords) < 4 {
		return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 4 elements")
	}

	// TODO: row/column keywords are not read yet. The disabled draft used
	// keywords[1] as the column keyword, keywords[0] as the row variety and
	// keywords[:1] / keywords[1:] as index-name prefix and suffix.

	// Extract the table data.
	tableData := getTableData(reportContent)
	logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)

	// Report date as shown on the page.
	dateText, err := getDateInfo(ctx)
	if err != nil {
		return []models.BaseFromLyData{}, err
	}

	// Date converted to the format used for stored data.
	format, err := utils.ConvertTimeFormat(dateText)
	if err != nil {
		return []models.BaseFromLyData{}, err
	}

	currentYearAndNextYear, err := utils.GetCurrentYearAndNextYear(format)
	if err != nil {
		return nil, err
	}

	month, err := utils.GetCurrentMonth(format)
	if err != nil {
		return nil, err
	}
	// "预估" ("estimate") is appended to the month for the log line below.
	monthSuffix := "预估"
	logs.Info("SupplyDemandBalanceProcessor Process() : Target Year: %s:%s\n", currentYearAndNextYear, month+monthSuffix)

	// Result list; nothing is appended to it yet.
	var result []models.BaseFromLyData

	// TODO: table processing is disabled. The commented-out draft treated the
	// first row as the header, located the column whose header is contained
	// in the column keyword, forward-filled the first cell of rows that are
	// one cell short, and for each row matching the variety and a target
	// year-month built the index name/code, resolved the index id via
	// getIndexId, skipped dates that already have data, parsed the cell as a
	// float and appended a models.BaseFromLyData entry.
	return result, nil
}
|
|
|
+
|
|
|
// ProcessingReportProcessor
// @Description: processing-report processor
type ProcessingReportProcessor struct {
}
|
|
|
+
|
|
|
+func (p *ProcessingReportProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
|
|
|
+ logs.Info("Processing processing report...")
|
|
|
+ // 解析关键字
|
|
|
+ if len(keywords) < 3 {
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : keywords must contain at least 3 elements")
|
|
|
+ }
|
|
|
+
|
|
|
+ // 拿到 行关键字和列关键字
|
|
|
+ columnName := keywords[0]
|
|
|
+ rowName := keywords[1]
|
|
|
+
|
|
|
+ // 提取所有表格数据
|
|
|
+ tableData := getAllTableData(reportContent)
|
|
|
|
|
|
// 提取日期信息
|
|
|
- var dateText string
|
|
|
- err = chromedp.Run(ctx,
|
|
|
- chromedp.Evaluate(`document.querySelector('div.a_date span').innerText`, &dateText),
|
|
|
- )
|
|
|
+ dateText, err := getDateInfo(ctx)
|
|
|
if err != nil {
|
|
|
- return models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to extract date: %v", err)
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
+ }
|
|
|
+ indexName := strings.Join(keywords[:len(keywords)-2], ":")
|
|
|
+ // 指标编码
|
|
|
+ indexCode := utils.GenerateIndexCode(sourceName, indexName)
|
|
|
+ // 指标id获取
|
|
|
+ indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
}
|
|
|
-
|
|
|
- logs.Info("ProcessingReportProcessor Process() : Extracted Date: %s", dateText)
|
|
|
|
|
|
// 校验指标数据是否存在 根据指标id和日期 存在则跳过,不存在正常往下走
|
|
|
format, err := utils.ConvertTimeFormat(dateText)
|
|
|
if err != nil {
|
|
|
- return models.BaseFromLyData{}, err
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
}
|
|
|
indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
|
|
|
if err != nil {
|
|
|
- return models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to get data by index id and date: %v", err)
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to get data by index id and date: %v", err)
|
|
|
}
|
|
|
if len(indexData) > 0 {
|
|
|
logs.Info("ProcessingReportProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
|
|
|
- return models.BaseFromLyData{}, nil
|
|
|
+ return []models.BaseFromLyData{}, nil
|
|
|
}
|
|
|
|
|
|
// 解析日期并计算当前周数
|
|
|
targetWeek, err := utils.ParseDateAndWeek(dateText)
|
|
|
if err != nil {
|
|
|
- return models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to parse date: %v", err)
|
|
|
+ return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to parse date: %v", err)
|
|
|
}
|
|
|
|
|
|
fmt.Printf("Target Week: %s\n", targetWeek)
|
|
|
|
|
|
+ var result []models.BaseFromLyData
|
|
|
// 处理提取的表格数据
|
|
|
for _, data := range tableData {
|
|
|
tableHeaders := data.Headers
|
|
@@ -180,14 +613,14 @@ func (p *ProcessingReportProcessor) Process(ctx context.Context, product string,
|
|
|
value, err := strconv.ParseFloat(row[columnIdx], 64)
|
|
|
if err != nil {
|
|
|
logs.Error("ProcessingReportProcessor Process() : Error converting value to float64: %v", err)
|
|
|
- return models.BaseFromLyData{}, err
|
|
|
+ return []models.BaseFromLyData{}, err
|
|
|
}
|
|
|
// 返回BaseFromLyData对象的数据
|
|
|
baseFromLyData := models.BaseFromLyData{
|
|
|
DataTime: dateText,
|
|
|
Value: value,
|
|
|
}
|
|
|
- return baseFromLyData, nil
|
|
|
+ result = append(result, baseFromLyData)
|
|
|
}
|
|
|
} else {
|
|
|
logs.Error("ProcessingReportProcessor Process() : Column index out of range")
|
|
@@ -196,7 +629,193 @@ func (p *ProcessingReportProcessor) Process(ctx context.Context, product string,
|
|
|
}
|
|
|
}
|
|
|
// TODO 后面把这个日志打印,不做返回错误处理,一个指标找不到会导致后续指标无法处理
|
|
|
- return models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : No matching row found for '%s'", rowName)
|
|
|
+ return result, nil
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
// parenthesizedValueRe matches the first parenthesised group in a string.
// Both ASCII parentheses and full-width parentheses (U+FF08 / U+FF09) are
// accepted; the capture group is the text between them. Compiled once at
// package scope rather than on every call.
var parenthesizedValueRe = regexp.MustCompile(`[(\x{FF08}]([^)\x{FF09}]+)[)\x{FF09}]`)

// extractValueInParentheses returns the text enclosed by the first pair of
// parentheses in input.
//
// The previous pattern consisted of grouping parentheses only, so it matched
// any text up to the first ")" and never reported a missing group.
//
// It returns an error when input contains no non-empty parenthesised group.
func extractValueInParentheses(input string) (string, error) {
	matches := parenthesizedValueRe.FindStringSubmatch(input)
	if len(matches) > 1 {
		return matches[1], nil
	}

	return "", fmt.Errorf("no value found in parentheses")
}
|
|
|
+
|
|
|
+// 获取指标id,根据指标名称判断,没有插入指标生成返回
|
|
|
+func getIndexId(indexCode string, indexName string, classifyId int, sourceName string, frequency string, unit string) (int, error) {
|
|
|
+ // 判断指标是否存在
|
|
|
+ var indexId int
|
|
|
+ indexInfo, err := models.GetLyIndexByCode(indexCode)
|
|
|
+ if err != nil {
|
|
|
+ // 新增指标
|
|
|
+ index, err := addLyIndex(classifyId, indexCode, indexName, frequency, unit)
|
|
|
+ if err != nil {
|
|
|
+ return 0, err
|
|
|
+ }
|
|
|
+ indexId = index
|
|
|
+ } else {
|
|
|
+ indexId = indexInfo.BaseFromLyIndexId
|
|
|
+ }
|
|
|
+ return indexId, nil
|
|
|
+}
|
|
|
+
|
|
|
+// 获取页面时间信息
|
|
|
+func getDateInfo(ctx context.Context) (string, error) {
|
|
|
+ var dateText string
|
|
|
+ err := chromedp.Run(ctx,
|
|
|
+ chromedp.Evaluate(`document.querySelector('div.a_date span').innerText`, &dateText),
|
|
|
+ )
|
|
|
+ if err != nil {
|
|
|
+ return "", fmt.Errorf("processing Process() : Failed to extract report date: %v", err)
|
|
|
+ }
|
|
|
+
|
|
|
+ logs.Info("Processing Process() : Report Extracted Date: %s", dateText)
|
|
|
+ return dateText, nil
|
|
|
+}
|
|
|
+
|
|
|
+// 获取所有表格数据 获取表格中有thead标签的数据
|
|
|
+func getAllTableData(reportContent string) []TableData {
|
|
|
+ var tableData []TableData
|
|
|
+
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
|
|
|
+ if err != nil {
|
|
|
+ log.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 选择 id 为 "a_content" 的 div
|
|
|
+ doc.Find("#a_content").Each(func(index int, item *goquery.Selection) {
|
|
|
+ item.Find("table").Each(func(index int, table *goquery.Selection) {
|
|
|
+ var headers []string
|
|
|
+ var rows [][]string
|
|
|
+
|
|
|
+ // 提取表头
|
|
|
+ table.Find("thead th").Each(func(index int, th *goquery.Selection) {
|
|
|
+ headers = append(headers, th.Text())
|
|
|
+ })
|
|
|
+
|
|
|
+ // 提取表格行数据
|
|
|
+ table.Find("tbody tr").Each(func(index int, row *goquery.Selection) {
|
|
|
+ var rowData []string
|
|
|
+ row.Find("td").Each(func(index int, td *goquery.Selection) {
|
|
|
+ rowData = append(rowData, td.Text())
|
|
|
+ })
|
|
|
+ rows = append(rows, rowData)
|
|
|
+ })
|
|
|
+
|
|
|
+ // 仅在表头存在时添加到结果中
|
|
|
+ if len(headers) > 0 {
|
|
|
+ tableData = append(tableData, TableData{
|
|
|
+ Headers: headers,
|
|
|
+ Rows: rows,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ })
|
|
|
+ })
|
|
|
+ return tableData
|
|
|
+}
|
|
|
+
|
|
|
+// 获取无头表格数据
|
|
|
+func getNoHeadTableData(reportContent string) []TableData {
|
|
|
+ var tableData []TableData
|
|
|
+
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
|
|
|
+ if err != nil {
|
|
|
+ log.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // Find the div with id "a_content"
|
|
|
+ doc.Find("#a_content").Each(func(index int, div *goquery.Selection) {
|
|
|
+ // Find all tables within the div
|
|
|
+ div.Find("table").Each(func(index int, table *goquery.Selection) {
|
|
|
+ var headers []string
|
|
|
+ var rows [][]string
|
|
|
+
|
|
|
+ // Extract table headers if any
|
|
|
+ table.Find("tr").Each(func(index int, tr *goquery.Selection) {
|
|
|
+ var rowData []string
|
|
|
+ tr.Find("td, th").Each(func(index int, cell *goquery.Selection) {
|
|
|
+ cellText := cell.Text()
|
|
|
+ rowData = append(rowData, cellText)
|
|
|
+ })
|
|
|
+
|
|
|
+ if index == 0 && len(rowData) > 0 {
|
|
|
+ // The first row is treated as the header row
|
|
|
+ headers = rowData
|
|
|
+ } else if len(rowData) > 0 {
|
|
|
+ // Add the row data to the rows slice
|
|
|
+ rows = append(rows, rowData)
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ // Only add table data if headers are present
|
|
|
+ if len(headers) > 0 {
|
|
|
+ tableData = append(tableData, TableData{
|
|
|
+ Headers: headers,
|
|
|
+ Rows: rows,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ })
|
|
|
+ })
|
|
|
+
|
|
|
+ return tableData
|
|
|
+}
|
|
|
+
|
|
|
+// 获取表格数据 有tr td标签的数据 列转行存储==>Rows, 行转头存储==>Headers
|
|
|
+func getTableData(reportContent string) TableData {
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
|
|
|
+ if err != nil {
|
|
|
+ fmt.Println("Error:", err)
|
|
|
+ }
|
|
|
+
|
|
|
+ var tableData TableData
|
|
|
+
|
|
|
+ // 查找 id 为 a_content 的 div
|
|
|
+ doc.Find("#a_content").Each(func(index int, divHtml *goquery.Selection) {
|
|
|
+ divHtml.Find("table").Each(func(index int, tableHtml *goquery.Selection) {
|
|
|
+ // 提取 Headers(列信息)
|
|
|
+ tableHtml.Find("tr").Each(func(rowIndex int, rowHtml *goquery.Selection) {
|
|
|
+ if rowIndex == 0 { // 处理第一行(包含年度信息)
|
|
|
+ var headerRow []string
|
|
|
+ rowHtml.Find("td").Each(func(colIndex int, colHtml *goquery.Selection) {
|
|
|
+ text := colHtml.Text()
|
|
|
+ if colIndex > 0 { // 忽略第一列“年度(10/9月)”
|
|
|
+ headerRow = append(headerRow, strings.TrimSpace(text))
|
|
|
+ }
|
|
|
+ })
|
|
|
+ if len(headerRow) > 0 {
|
|
|
+ tableData.Headers = append(tableData.Headers, headerRow...)
|
|
|
+ }
|
|
|
+ } else if rowIndex == 1 { // 处理第二行(列标题)
|
|
|
+ rowHtml.Find("td").Each(func(colIndex int, colHtml *goquery.Selection) {
|
|
|
+ text := colHtml.Text()
|
|
|
+ if colIndex > 0 { // 忽略第一列“年度(10/9月)”
|
|
|
+ tableData.Headers = append(tableData.Headers, strings.TrimSpace(text))
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ // 提取数据行
|
|
|
+ tableHtml.Find("tr").Each(func(rowIndex int, rowHtml *goquery.Selection) {
|
|
|
+ if rowIndex > 1 { // 从第三行开始
|
|
|
+ var row []string
|
|
|
+ rowHtml.Find("td").Each(func(colIndex int, colHtml *goquery.Selection) {
|
|
|
+ text := colHtml.Text()
|
|
|
+ row = append(row, strings.TrimSpace(text))
|
|
|
+ })
|
|
|
+ if len(row) > 0 {
|
|
|
+ tableData.Rows = append(tableData.Rows, row)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ })
|
|
|
+ })
|
|
|
+
|
|
|
+ return tableData
|
|
|
}
|
|
|
|
|
|
// 判断字符串是否是数字
|