12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464 |
- // @Author gmy 2024/8/6 10:50:00
- package main
- import (
- "context"
- "eta/eta_crawler/models"
- "eta/eta_crawler/utils"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "github.com/beego/beego/v2/core/logs"
- "github.com/chromedp/chromedp"
- "log"
- "regexp"
- "strconv"
- "strings"
- "unicode"
- )
// sourceName is the data-source identifier used for the "Liangyou Shangwu Wang"
// (grain and oil business network) site when generating index codes.
const sourceName = "lysww"
// TableData stores the contents of one extracted table.
type TableData struct {
	// Headers holds the header cells; serialized under the JSON key "headers".
	Headers []string `json:"headers"`
	// Rows holds the body rows, one string slice per row; serialized under
	// the JSON key "rows".
	Rows [][]string `json:"rows"`
}
- // ImportCostProcessor
- // @Description: 进口成本处理器
- type ImportCostProcessor struct{}
- func (p *ImportCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- fmt.Println("Processing import cost...")
- // 解析关键字
- if len(keywords) < 5 {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : keywords must contain at least 5 elements")
- }
- // 拿到 行关键字和列关键字
- columnName := keywords[len(keywords)-4]
- rowVariety := keywords[0]
- rowPort := keywords[len(keywords)-3]
- indexNamePrefix := keywords[:1]
- indexNameSuffix := keywords[1:]
- // 提取所有表格数据
- tableData := getNoHeadTableData(reportContent)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 解析日期并计算当前月份
- var targetMonths []string
- if product == "油菜籽" {
- targetMonths, err = utils.ParseDateAndMonthColzaOil(format)
- } else {
- targetMonths, err = utils.ParseDateAndMonth(dateText)
- }
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingImportCostProcessor Process() : Failed to parse date: %v", err)
- }
- fmt.Printf("Target Month: %s\n", targetMonths)
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- for _, data := range tableData {
- tableHeaders := data.Headers
- tableRows := data.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range tableHeaders {
- if strings.Contains(header, columnName) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- log.Printf("ProcessingImportCostProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 处理表格中的每一行
- //var flag bool = true
- var previousRowVariety string
- var previousRowPort string
- for rowIndex, row := range tableRows {
- if len(row) == len(tableHeaders) {
- previousRowVariety = row[0]
- previousRowPort = row[1]
- } else if len(row) == len(tableHeaders)-1 {
- previousRowPort = row[0]
- row = append([]string{previousRowVariety}, row...)
- tableRows[rowIndex] = row
- } else if len(row) == len(tableHeaders)-2 {
- row = append([]string{previousRowVariety, previousRowPort}, row...)
- tableRows[rowIndex] = row
- }
- for _, targetMonth := range targetMonths {
- if len(row) >= len(tableHeaders) && strings.Contains(rowVariety, row[0]) && row[1] == targetMonth && row[len(row)-1] == rowPort {
- if columnIdx < len(row) {
- // 指标名称
- indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
- indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("ProcessingImportCostProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("ProcessingImportCostProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("ProcessingImportCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("ProcessingImportCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, rowPort)
- }
- break
- }
- }
- }
- }
- return result, nil
- }
- // ProcessingProfitProcessor
- // @Description: 加工利润处理器
- type ProcessingProfitProcessor struct{}
- func (p *ProcessingProfitProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- fmt.Println("Processing processing profit...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingProfitProcessor Process() : keywords must contain at least 4 elements")
- }
- // 拿到 行关键字和列关键字
- columnName := keywords[1]
- rowVariety := keywords[0]
- indexNamePrefix := keywords[:1]
- indexNameSuffix := keywords[1:]
- // 提取所有表格数据
- tableData := getNoHeadTableData(reportContent)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 解析日期并计算当前月份 和 后两月
- yearMonths, err := utils.ConvertTimeFormatToYearMonth(format)
- if err != nil {
- return nil, err
- }
- fmt.Printf("Target yearMonth: %s\n", yearMonths)
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- for _, data := range tableData {
- tableHeaders := data.Headers
- tableRows := data.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range tableHeaders {
- if strings.Contains(columnName, header) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- log.Printf("ProcessingProfitProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 处理表格中的每一行
- var previousRowVariety string
- for rowIndex, row := range tableRows {
- if len(row) == len(tableHeaders) {
- previousRowVariety = row[0]
- } else if len(row) == len(tableHeaders)-1 {
- row = append([]string{previousRowVariety}, row...)
- tableRows[rowIndex] = row
- }
- for _, targetMonth := range yearMonths {
- if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == targetMonth {
- if columnIdx < len(row) {
- // 指标名称
- indexNameList := append(indexNamePrefix, append([]string{targetMonth}, indexNameSuffix...)...)
- indexName := strings.Join(indexNameList[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("ProcessingProfitProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("ProcessingProfitProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("ProcessingProfitProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("ProcessingProfitProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
- }
- break
- }
- }
- }
- }
- return result, nil
- }
- // ShippingCostProcessor
- // @Description: 船运费用处理器
- type ShippingCostProcessor struct{}
- func (p *ShippingCostProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- fmt.Println("Processing processing profit...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("ShippingCostProcessor Process() : keywords must contain at least 5 elements")
- }
- // 拿到 行关键字和列关键字
- columnName := keywords[len(keywords)-3]
- rowVariety := keywords[0]
- rowDestination := keywords[1]
- rowShipType := keywords[2]
- // 提取所有表格数据
- tableData := getNoHeadTableData(reportContent)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- for _, data := range tableData {
- tableHeaders := data.Headers
- tableRows := data.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range tableHeaders {
- if strings.Contains(header, columnName) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- log.Printf("ShippingCostProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 处理表格中的每一行
- for rowIndex, row := range tableRows {
- if len(row) == len(tableHeaders)-1 {
- row = append([]string{rowVariety}, row...)
- tableRows[rowIndex] = row
- rowShipType, err = extractValueInParentheses(rowVariety)
- if err != nil {
- logs.Error("ShippingCostProcessor Process() : Failed to extract value in parentheses: %v", err)
- continue
- }
- }
- if len(row) >= len(tableHeaders) && row[0] == rowVariety && (row[1] == rowDestination || strings.Contains(row[0], row[1])) && row[2] == rowShipType {
- if columnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-3], `:`)
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("ShippingCostProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("ShippingCostProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("ShippingCostProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("ShippingCostProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
- }
- break
- }
- }
- }
- return result, nil
- }
- // SupplyDemandBalanceProcessor
- // @Description: 供需平衡处理器
- type SupplyDemandBalanceProcessor struct{}
- func (p *SupplyDemandBalanceProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- // https://www.fao.com.cn/art/gG7gKTCNDHLJNsq9QRYjoQ==.htm
- logs.Info("Processing processing report...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 4 elements")
- }
- // 拿到 行关键字和列关键字
- var columnName string
- rowVariety := keywords[1]
- // 提取所有表格数据
- tableData := getTableData(reportContent, true)
- logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- currentYearAndNextYear, err := utils.GetCurrentYearAndNextYear(format)
- if err != nil {
- return nil, err
- }
- month, err := utils.GetCurrentMonth(format)
- if err != nil {
- return nil, err
- }
- monthSuffix := "预估"
- logs.Info("SupplyDemandBalanceProcessor Process() : Target Year: %s:%s\n", currentYearAndNextYear, month+monthSuffix)
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- headers := tableData.Headers
- rows := tableData.Rows
- for _, year := range currentYearAndNextYear {
- columnName = year + month + monthSuffix
- isCurrentYear, err := utils.IsCurrentYear(year)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to determine if year is current year: %v", err)
- continue
- }
- if !isCurrentYear {
- format, err = utils.GetNextYearLastDay(format)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get next year last day: %v", err)
- continue
- }
- }
- // 查找目标列
- columnIdx := -1
- for i, header := range headers {
- if strings.Contains(columnName, header) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- logs.Error("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 处理表格中的每一行
- for _, row := range rows {
- if len(row) >= len(headers) && row[0] == rowVariety {
- if columnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
- }
- yearMonth, err := utils.GetYearMonth(format)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get year month: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTimeYM(indexId, yearMonth)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- // 存在走更新逻辑 主要更新今年在去年的预估值
- indexData := indexData[0]
- if indexData.Value != value {
- err := models.UpdateLyDataById(indexData.BaseFromLyDataId, value)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to update data: %v", err)
- continue
- }
- }
- continue
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
- }
- break
- }
- }
- }
- return result, nil
- }
- // PurchaseShippingProcessor
- // @Description: 采购装船处理器
- type PurchaseShippingProcessor struct{}
- func (p *PurchaseShippingProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- logs.Info("Processing purchase shipping...")
- // TODO 卡住了
- // 解析关键字
- if len(keywords) < 3 {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 3 elements")
- }
- // 拿到 行关键字和列关键字
- var columnName string
- rowVariety := keywords[1]
- // 提取所有表格数据
- tableData := getPurchaseShippingTableData(reportContent)
- logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- currentYearAndNextYear, err := utils.GetCurrentYearAndNextYear(format)
- if err != nil {
- return nil, err
- }
- month, err := utils.GetCurrentMonth(format)
- if err != nil {
- return nil, err
- }
- monthSuffix := "预估"
- logs.Info("SupplyDemandBalanceProcessor Process() : Target Year: %s:%s\n", currentYearAndNextYear, month+monthSuffix)
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- headers := tableData.Headers
- rows := tableData.Rows
- for _, year := range currentYearAndNextYear {
- columnName = year + month + monthSuffix
- // 查找目标列
- columnIdx := -1
- for i, header := range headers {
- if strings.Contains(columnName, header) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- log.Printf("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 处理表格中的每一行
- for _, row := range rows {
- if len(row) >= len(headers) && row[0] == rowVariety {
- if columnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
- }
- break
- }
- }
- }
- return result, nil
- }
- // ProcessingReportProcessor
- // @Description: 加工报告处理器
- type ProcessingReportProcessor struct {
- }
- func (p *ProcessingReportProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- logs.Info("Processing processing report...")
- // 解析关键字
- if len(keywords) < 3 {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : keywords must contain at least 3 elements")
- }
- // 拿到 行关键字和列关键字
- columnName := keywords[0]
- rowName := keywords[1]
- // 提取所有表格数据
- tableData := getAllTableData(reportContent)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- indexName := strings.Join(keywords[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- return nil, err
- }
- // 校验指标数据是否存在 根据指标id和日期 存在则跳过,不存在正常往下走
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to get data by index id and date: %v", err)
- }
- if len(indexData) > 0 {
- logs.Info("ProcessingReportProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- return []models.BaseFromLyData{}, nil
- }
- // 解析日期并计算当前周数
- targetWeek, err := utils.ParseDateAndWeek(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingReportProcessor Process() : Failed to parse date: %v", err)
- }
- fmt.Printf("Target Week: %s\n", targetWeek)
- var result []models.BaseFromLyData
- // 处理提取的表格数据
- for _, data := range tableData {
- tableHeaders := data.Headers
- tableRows := data.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range tableHeaders {
- headerString := extractChinese(header)
- if strings.Contains(columnName, headerString) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- logs.Error("ProcessingReportProcessor Process() : Column '%s' not found in table", columnName)
- continue
- }
- // 查找本周的列位置
- weekIdx := -1
- for i, header := range tableHeaders {
- if strings.Contains(header, targetWeek) && i > columnIdx {
- weekIdx = i
- break
- }
- }
- if weekIdx == -1 {
- fmt.Printf("Week column '%s' not found in table\n", targetWeek)
- continue
- }
- // 查找目标行
- for _, row := range tableRows {
- if len(row) > 0 && strings.Contains(row[0], rowName) {
- if weekIdx < len(row) {
- logs.Info("Value in column '%s' - '%s': %s", columnName, rowName, row[columnIdx])
- numFlag := isNumeric(row[columnIdx])
- if numFlag {
- value, err := strconv.ParseFloat(row[columnIdx], 64)
- if err != nil {
- logs.Error("ProcessingReportProcessor Process() : Error converting value to float64: %v", err)
- return []models.BaseFromLyData{}, err
- }
- // 返回BaseFromLyData对象的数据
- baseFromLyData := models.BaseFromLyData{
- DataTime: dateText,
- Value: value,
- }
- result = append(result, baseFromLyData)
- }
- } else {
- logs.Error("ProcessingReportProcessor Process() : Column index out of range")
- }
- }
- }
- }
- // TODO 后面把这个日志打印,不做返回错误处理,一个指标找不到会导致后续指标无法处理
- return result, nil
- }
- // InventoryAnalysisProcessor
- // @Description: 库存分析处理器
- type InventoryAnalysisProcessor struct{}
- func (p *InventoryAnalysisProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- // https://www.fao.com.cn/art/yg1IKj9FpPEIDv2LefnPhQ==.htm
- logs.Info("Processing inventory analysis...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : keywords must contain at least 4 elements")
- }
- // 拿到 行关键字和列关键字
- columnName := keywords[0]
- rowVariety := keywords[1]
- columnSuffix := "本周"
- columnName = columnName + columnSuffix
- // 提取所有表格数据
- tableData := getTableData(reportContent, true)
- logs.Info("SupplyDemandBalanceProcessor Process() : Table data: %v", tableData)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- headers := tableData.Headers
- rows := tableData.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range headers {
- header := removeParentheses(header)
- if strings.Contains(columnName, header) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- logs.Error("SupplyDemandBalanceProcessor Process() : Column '%s' not found in table", columnName)
- } else {
- // 处理表格中的每一行
- for _, row := range rows {
- if len(row) >= len(headers) && strings.Contains(row[0], rowVariety) {
- if columnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-2], ":")
- indexName = removeParentheses(indexName)
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnName)
- }
- break
- }
- }
- }
- return result, nil
- }
- // PriceSpreadArbitrageProcessor
- // @Description: 价差套利处理器
- type PriceSpreadArbitrageProcessor struct{}
- func (p *PriceSpreadArbitrageProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- fmt.Println("Processing processing profit...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("ProcessingProfitProcessor Process() : keywords must contain at least 4 elements")
- }
- // 拿到 行关键字和列关键字
- var columnDate string
- rowVariety := keywords[0]
- rowBourse := keywords[1]
- // 提取所有表格数据
- tableData := getNoHeadTableData(reportContent)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- day, err := utils.ConvertTimeFormatToYearMonthDay(format)
- if err != nil {
- return nil, err
- }
- columnDate = day
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- for _, data := range tableData {
- tableHeaders := data.Headers
- tableRows := data.Rows
- // 查找目标列
- columnIdx := -1
- for i, header := range tableHeaders {
- if strings.Contains(header, columnDate) {
- columnIdx = i
- break
- }
- }
- if columnIdx == -1 {
- log.Printf("ProcessingProfitProcessor Process() : Column '%s' not found in table", columnDate)
- continue
- }
- // 处理表格中的每一行
- for _, row := range tableRows {
- if len(row) >= len(tableHeaders) && row[0] == rowVariety && row[1] == rowBourse {
- if columnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-2], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("ProcessingProfitProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("ProcessingProfitProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("ProcessingProfitProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[columnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("ProcessingProfitProcessor Process() : Column index out of range for row '%s', '%s'", rowVariety, columnDate)
- }
- break
- }
- }
- }
- return result, nil
- }
- // DailyTransactionProcessor
- // @Description: 每日成交处理器
- type DailyTransactionProcessor struct{}
- func (p *DailyTransactionProcessor) Process(ctx context.Context, product string, reportContent string, keywords []string, classifyId int) ([]models.BaseFromLyData, error) {
- fmt.Println("Processing processing profit...")
- // 解析关键字
- if len(keywords) < 4 {
- return []models.BaseFromLyData{}, fmt.Errorf("DailyTransactionProcessor Process() : keywords must contain at least 4 elements")
- }
- // 获取第一个表格
- areaTableData := getNoHeadTableData(reportContent)[0]
- // 获取第二个表格
- blocTableData := getTableData(reportContent, false)
- logs.Info("SupplyDemandBalanceProcessor Process() : areaTableData data: %v, blocTableData data: %v", areaTableData, blocTableData)
- // 提取日期信息
- dateText, err := getDateInfo(ctx)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 时间格式转换
- format, err := utils.ConvertTimeFormat(dateText)
- if err != nil {
- return []models.BaseFromLyData{}, err
- }
- // 处理提取的表格数据
- var result []models.BaseFromLyData
- areaHeaders := areaTableData.Headers
- areaRows := areaTableData.Rows
- // 第一个表格
- // 拿到 行关键字和列关键字
- columnArea := keywords[1]
- var rowAreaMonthDays []string
- rowWeek := "均价"
- monthDay, err := utils.GetWeekdaysInSameWeek(format)
- if err != nil {
- return nil, err
- }
- rowAreaMonthDays = monthDay
- // 查找目标列
- areaColumnIdx := -1
- for i, header := range areaHeaders {
- if strings.Contains(header, columnArea) {
- areaColumnIdx = i
- break
- }
- }
- if areaColumnIdx == -1 {
- log.Printf("DailyTransactionProcessor Process() : One Column '%s' not found in table", columnArea)
- } else {
- for _, row := range areaRows {
- for _, monthDay := range rowAreaMonthDays {
- if len(row) >= len(areaHeaders) && (row[0] == monthDay || row[0] == rowWeek) {
- if areaColumnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-3], ":")
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("DailyTransactionProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("DailyTransactionProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("DailyTransactionProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[areaColumnIdx]
- isChinese := IsChinese(valueStr)
- if isChinese {
- continue
- }
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- var dealDate string
- if row[0] == rowWeek {
- dealDate = format
- } else {
- date, err := utils.ConvertToDate(row[0])
- if err != nil {
- return nil, err
- }
- dealDate = date
- }
- baseFromLyData := models.BaseFromLyData{
- DataTime: dealDate,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("DailyTransactionProcessor Process() : Column index out of range for row '%s', '%s'", monthDay, columnArea)
- }
- break
- }
- }
- }
- }
- // 第二个表格
- // 拿到 行关键字和列关键字
- columnBloc := keywords[len(keywords)-3]
- rowBloc := keywords[1]
- blocHeaders := blocTableData.Headers
- blocRows := blocTableData.Rows
- // 查找目标列
- blocColumnIdx := -1
- for i, header := range blocHeaders {
- if strings.Contains(header, columnBloc) {
- blocColumnIdx = i
- break
- }
- }
- if blocColumnIdx == -1 {
- log.Printf("DailyTransactionProcessor Process() : Two Column '%s' not found in table", columnBloc)
- } else {
- // 处理表格中的每一行
- for _, row := range blocRows {
- if len(row) >= len(blocHeaders) && strings.Contains(row[0], rowBloc) {
- if blocColumnIdx < len(row) {
- // 指标名称
- indexName := strings.Join(keywords[:len(keywords)-3], ":")
- indexName = removeParentheses(indexName)
- // 指标编码
- indexCode := utils.GenerateIndexCode(sourceName, indexName)
- // 指标id获取
- indexId, err := getIndexId(indexCode, indexName, classifyId, sourceName, keywords[len(keywords)-2], keywords[len(keywords)-1])
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get index id: %v", err)
- continue
- }
- indexData, err := models.GetLyDataByIndexIdAndDataTime(indexId, format)
- if err != nil {
- logs.Error("SupplyDemandBalanceProcessor Process() : Failed to get data by index id and date: %v", err)
- continue
- }
- if len(indexData) > 0 {
- logs.Info("SupplyDemandBalanceProcessor Process() : Data already exists for index %d and date %s", indexId, dateText)
- continue
- }
- valueStr := row[blocColumnIdx]
- value, err := strconv.ParseFloat(valueStr, 64)
- if err != nil {
- return []models.BaseFromLyData{}, fmt.Errorf("SupplyDemandBalanceProcessor Process() : failed to parse value '%s': %v", valueStr, err)
- }
- // 创建并添加到结果列表
- baseFromLyData := models.BaseFromLyData{
- DataTime: format,
- Value: value,
- BaseFromLyIndexId: indexId,
- IndexCode: indexCode,
- }
- result = append(result, baseFromLyData)
- } else {
- log.Printf("SupplyDemandBalanceProcessor Process() : Column index out of range for row '%s', '%s'", rowBloc, columnBloc)
- }
- break
- }
- }
- }
- return result, nil
- }
// parenthesizedValuePattern captures the text between the first pair of
// parentheses. Both ASCII "()" and full-width "（）" parentheses are accepted.
var parenthesizedValuePattern = regexp.MustCompile(`[(（]([^)）]+)[)）]`)

// extractValueInParentheses returns the non-empty text enclosed by the first
// pair of parentheses in input. It returns an error when input holds no
// parenthesized value (including an empty pair such as "()").
func extractValueInParentheses(input string) (string, error) {
	matches := parenthesizedValuePattern.FindStringSubmatch(input)
	if len(matches) > 1 {
		return matches[1], nil
	}
	return "", fmt.Errorf("no value found in parentheses")
}
- // 获取指标id,根据指标名称判断,没有插入指标生成返回
- func getIndexId(indexCode string, indexName string, classifyId int, sourceName string, frequency string, unit string) (int, error) {
- // 判断指标是否存在
- var indexId int
- indexInfo, err := models.GetLyIndexByCode(indexCode)
- if err != nil {
- // 新增指标
- index, err := addLyIndex(classifyId, indexCode, indexName, frequency, unit)
- if err != nil {
- return 0, err
- }
- indexId = index
- } else {
- indexId = indexInfo.BaseFromLyIndexId
- }
- return indexId, nil
- }
- // 获取页面时间信息
- func getDateInfo(ctx context.Context) (string, error) {
- var dateText string
- err := chromedp.Run(ctx,
- chromedp.Evaluate(`document.querySelector('div.a_date span').innerText`, &dateText),
- )
- if err != nil {
- return "", fmt.Errorf("processing Process() : Failed to extract report date: %v", err)
- }
- logs.Info("Processing Process() : Report Extracted Date: %s", dateText)
- return dateText, nil
- }
- // 获取所有表格数据 获取表格中有thead标签的数据
- func getAllTableData(reportContent string) []TableData {
- var tableData []TableData
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
- if err != nil {
- log.Fatal(err)
- }
- // 选择 id 为 "a_content" 的 div
- doc.Find("#a_content").Each(func(index int, item *goquery.Selection) {
- item.Find("table").Each(func(index int, table *goquery.Selection) {
- var headers []string
- var rows [][]string
- // 提取表头
- table.Find("thead th").Each(func(index int, th *goquery.Selection) {
- headers = append(headers, th.Text())
- })
- // 提取表格行数据
- table.Find("tbody tr").Each(func(index int, row *goquery.Selection) {
- var rowData []string
- row.Find("td").Each(func(index int, td *goquery.Selection) {
- rowData = append(rowData, td.Text())
- })
- rows = append(rows, rowData)
- })
- // 仅在表头存在时添加到结果中
- if len(headers) > 0 {
- tableData = append(tableData, TableData{
- Headers: headers,
- Rows: rows,
- })
- }
- })
- })
- return tableData
- }
- // 获取无头表格数据
- func getNoHeadTableData(reportContent string) []TableData {
- var tableData []TableData
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
- if err != nil {
- log.Fatal(err)
- }
- // Find the div with id "a_content"
- doc.Find("#a_content").Each(func(index int, div *goquery.Selection) {
- // Find all tables within the div
- div.Find("table").Each(func(index int, table *goquery.Selection) {
- var headers []string
- var rows [][]string
- // Extract table headers if any
- table.Find("tr").Each(func(index int, tr *goquery.Selection) {
- var rowData []string
- tr.Find("td, th").Each(func(index int, cell *goquery.Selection) {
- cellText := cell.Text()
- rowData = append(rowData, cellText)
- })
- if index == 0 && len(rowData) > 0 {
- // The first row is treated as the header row
- headers = rowData
- } else if len(rowData) > 0 {
- // Add the row data to the rows slice
- rows = append(rows, rowData)
- }
- })
- // Only add table data if headers are present
- if len(headers) > 0 {
- tableData = append(tableData, TableData{
- Headers: headers,
- Rows: rows,
- })
- }
- })
- })
- return tableData
- }
- // 获取表格数据 获取id 为 a_content 的 div 中的第一个表格 左上角那个单元格会拼在第一个,会拼上列上的合并单元格
- func getTableData(reportContent string, isFirst bool) TableData {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
- if err != nil {
- log.Fatal(err)
- }
- tableData := &TableData{}
- // 只提取 id 为 a_content 的 div 中的第一个表格
- var firstTable *goquery.Selection
- if isFirst {
- firstTable = doc.Find("div#a_content table").First()
- } else {
- firstTable = doc.Find("div#a_content table").Last()
- }
- var combinedHeaders []string
- // 提取表头
- firstTable.Find("tr").Each(func(i int, row *goquery.Selection) {
- if i == 0 {
- // 第一行处理合并单元格,保存到 combinedHeaders
- row.Find("td,th").Each(func(j int, cell *goquery.Selection) {
- if j == 0 {
- // 把左上角的“年度(10/9月)”放入 Headers 第一个元素
- tableData.Headers = append(tableData.Headers, strings.TrimSpace(cell.Text()))
- } else {
- // 处理其他单元格
- colspan, exists := cell.Attr("colspan")
- if exists {
- spanCount := 0
- fmt.Sscanf(colspan, "%d", &spanCount)
- for k := 0; k < spanCount; k++ {
- combinedHeaders = append(combinedHeaders, strings.TrimSpace(cell.Text()))
- }
- } else {
- combinedHeaders = append(combinedHeaders, strings.TrimSpace(cell.Text()))
- }
- }
- })
- } else if i == 1 {
- // 第二行处理具体标题,组合后保存到 Headers
- row.Find("td,th").Each(func(j int, cell *goquery.Selection) {
- if j < len(combinedHeaders) {
- fullHeader := combinedHeaders[j] + strings.TrimSpace(cell.Text())
- tableData.Headers = append(tableData.Headers, fullHeader)
- }
- })
- } else {
- // 处理数据行
- var rowData []string
- row.Find("td").Each(func(j int, cell *goquery.Selection) {
- rowData = append(rowData, strings.TrimSpace(cell.Text()))
- })
- if len(rowData) > 0 {
- tableData.Rows = append(tableData.Rows, rowData)
- }
- }
- })
- return *tableData
- }
- // 获取采购装船表格数据
- func getPurchaseShippingTableData(reportContent string) TableData {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(reportContent))
- if err != nil {
- log.Fatal(err)
- }
- tableData := &TableData{}
- // 只提取 id 为 a_content 的 div 中的第一个表格
- firstTable := doc.Find("div#a_content table").First()
- // 处理表头的变量
- headers := []string{}
- combinedHeaders := []string{}
- headerRowspans := make(map[int]int)
- // 遍历所有表头行
- firstTable.Find("tr").Each(func(rowIndex int, row *goquery.Selection) {
- row.Find("th, td").Each(func(cellIndex int, cell *goquery.Selection) {
- text := strings.TrimSpace(cell.Text())
- // 处理 colspan 属性
- colspan, exists := cell.Attr("colspan")
- if exists {
- spanCount := 0
- fmt.Sscanf(colspan, "%d", &spanCount)
- for k := 0; k < spanCount; k++ {
- combinedHeaders = append(combinedHeaders, text)
- }
- } else {
- combinedHeaders = append(combinedHeaders, text)
- }
- // 处理 rowspan 属性
- rowspan, exists := cell.Attr("rowspan")
- if exists {
- rowspanCount := 0
- fmt.Sscanf(rowspan, "%d", &rowspanCount)
- if rowspanCount > 1 {
- // 记录该单元格的行合并信息
- for i := 1; i < rowspanCount; i++ {
- headerRowspans[rowIndex+i]++
- }
- }
- }
- })
- // 处理第二行的具体标题
- if rowIndex == 1 {
- combinedHeadersIndex := 0
- row.Find("th, td").Each(func(cellIndex int, cell *goquery.Selection) {
- if combinedHeadersIndex < len(combinedHeaders) {
- if colspan, _ := cell.Attr("colspan"); colspan != "" {
- combinedHeadersIndex += len(strings.Split(colspan, " ")) - 1
- }
- headers = append(headers, combinedHeaders[combinedHeadersIndex])
- combinedHeadersIndex++
- }
- })
- }
- })
- // 处理数据行
- firstTable.Find("tr").Each(func(rowIndex int, row *goquery.Selection) {
- if rowIndex >= 2 {
- var rowData []string
- row.Find("td").Each(func(cellIndex int, cell *goquery.Selection) {
- rowData = append(rowData, strings.TrimSpace(cell.Text()))
- })
- if len(rowData) > 0 {
- tableData.Rows = append(tableData.Rows, rowData)
- }
- }
- })
- tableData.Headers = headers
- return *tableData
- }
// numericPattern matches an optionally signed decimal integer or float such as
// "12", "-1.5", "3." or ".25". It is compiled once at package scope instead of
// on every call.
var numericPattern = regexp.MustCompile(`^[+-]?(\d+(\.\d*)?|\.\d+)$`)

// isNumeric reports whether value is, in its entirety, an optionally signed
// decimal integer or floating-point literal. Exponent notation, surrounding
// whitespace and the empty string are rejected.
func isNumeric(value string) bool {
	return numericPattern.MatchString(value)
}
// nonHanPattern matches any single character that is not in the Han script.
// It is compiled once at package scope instead of on every call.
var nonHanPattern = regexp.MustCompile(`[^\p{Han}]`)

// extractChinese returns text with every non-Han character removed, keeping
// the Han characters in their original order.
func extractChinese(text string) string {
	return nonHanPattern.ReplaceAllString(text, "")
}
// asciiParenGroupPattern matches an ASCII "(" up to and including the next
// ")". It is compiled once at package scope instead of on every call.
var asciiParenGroupPattern = regexp.MustCompile(`\([^)]*\)`)

// removeParentheses deletes every ASCII-parenthesized segment from text,
// parentheses included. Full-width parentheses are left untouched, and an
// opening parenthesis without a closing one is kept as-is.
func removeParentheses(text string) string {
	return asciiParenGroupPattern.ReplaceAllString(text, "")
}
// IsChinese reports whether str contains at least one Han character.
// It returns false for the empty string.
func IsChinese(str string) bool {
	hasHan := false
	for _, ch := range str {
		if unicode.In(ch, unicode.Han) {
			hasHan = true
			break
		}
	}
	return hasHan
}
|