package base_from_ccf

import (
	"context"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"strings"
	"time"

	"eta/eta_data_analysis/models"
	"eta/eta_data_analysis/utils"

	"github.com/PuerkitoBio/goquery"
)

// stockTableTaskNames lists the rule keys that both stock-table tasks iterate
// over. Each key is passed to loadDataRule to obtain its fetch rule.
var stockTableTaskNames = []string{"PTA装置", "MEG装置", "PX装置"}

// HandleTableData is the payload describing one table extracted from a saved
// report page. It is sent under the "Table" key of the request posted to
// utils.LIB_ROUTE_CCF_TABLE_HANDLE.
type HandleTableData struct {
	// ClassifyId is the classification ID taken from the fetch rule.
	ClassifyId int `description:"分类ID"`
	// FromPage is the path of the page file the table was read from.
	FromPage string `description:"表格来源"`
	// TableDate is the report publish date parsed from the page.
	TableDate time.Time `description:"表格日期"`
	// TableContent is the HTML extracted for the table.
	TableContent string `description:"表格HTML"`
}

// TaskStockTable processes the most recent report pages of every stock-table
// task: for each task name it loads the fetch rule, saves the pages via
// savePageHtml and pushes at most utils.CCFStockFetchNum of the returned files.
//
// Failures are logged and the affected task or file is skipped; the function
// itself always returns nil. The context argument is currently unused.
func TaskStockTable(context.Context) (err error) {
	readLimit := utils.CCFStockFetchNum

	for _, nameKey := range stockTableTaskNames {
		fetchRule, e := loadDataRule(nameKey)
		if e != nil {
			utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
			continue
		}

		// Only the first readLimit reports are parsed.
		files, e := savePageHtml(nameKey, fetchRule.PageDir, false, readLimit)
		if e != nil {
			utils.FileLog.Info(fmt.Sprintf("%s保存首页失败, err: %v\n", nameKey, e))
			continue
		}

		for i, filePath := range files {
			// Guard against savePageHtml returning more files than requested.
			if i >= readLimit {
				break
			}
			pushStockTableFile(fetchRule.StockTable.ClassifyId, filePath)
		}
	}
	return nil
}

// AnalysisStockTable parses a report page and extracts its table.
//
// It returns the HTML of the first <table> that is a direct child of
// #newscontent, followed by a newline, together with the publish time parsed
// (by extractReportPublishTime) from the first-child cells of the table that
// encloses the #savenews element.
//
// An error is returned when the document cannot be parsed, the publish time
// cannot be extracted or is zero, or no table is found.
func AnalysisStockTable(htm []byte) (tableContent string, tableTime time.Time, err error) {
	doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
	if e != nil {
		err = fmt.Errorf("NewDocumentFromReader err: %v", e)
		return
	}

	// Walk up from the "save" button to its enclosing table; the text of that
	// table's first-child cells carries the report publish date.
	publishTable := doc.Find("#savenews").ParentsFiltered("table").First()
	publishTxt := publishTable.Find("td:first-child").Text()

	publishTime, e := extractReportPublishTime(publishTxt)
	if e != nil {
		err = fmt.Errorf("extractReportPublishTime err: %v", e)
		return
	}
	if publishTime.IsZero() {
		err = fmt.Errorf("发布日期有误")
		return
	}
	tableTime = publishTime

	// The table of interest is the first table directly under #newscontent.
	table := doc.Find("#newscontent").ChildrenFiltered("table").First()
	if table.Length() == 0 {
		err = fmt.Errorf("表格未找到")
		return
	}

	h, e := table.Html()
	if e != nil {
		err = fmt.Errorf("表格HTML有误, err: %v", e)
		return
	}

	// NOTE(review): Selection.Html returns the element's inner HTML, so the
	// enclosing <table> tag itself is not part of tableContent — confirm that
	// the consumer of this payload expects that.
	tableContent = fmt.Sprintf("%s\n", h)
	return
}

// ReadStockHistoryFiles re-processes every file already present in each
// stock-table task's page directory: it lists the directory via listFiles and
// pushes each file. Failures are logged and the affected task or file is
// skipped. The context argument is currently unused.
func ReadStockHistoryFiles(context.Context) {
	for _, nameKey := range stockTableTaskNames {
		fetchRule, e := loadDataRule(nameKey)
		if e != nil {
			utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
			continue
		}

		fileNames, e := listFiles(fetchRule.PageDir)
		if e != nil {
			utils.FileLog.Info(fmt.Sprintf("%s读取文件目录失败, err: %v\n", nameKey, e))
			continue
		}

		for _, name := range fileNames {
			filePath := fmt.Sprintf("%s/%s", fetchRule.PageDir, name)
			fmt.Printf("开始解析: %s\n", filePath)
			pushStockTableFile(fetchRule.StockTable.ClassifyId, filePath)
		}
	}
}

// pushStockTableFile reads one saved page, extracts its table with
// AnalysisStockTable and posts it to utils.LIB_ROUTE_CCF_TABLE_HANDLE.
// Every failure is reported through logStockTableIssue and ends processing of
// this file only.
func pushStockTableFile(classifyID int, filePath string) {
	htm, e := ioutil.ReadFile(filePath)
	if e != nil {
		logStockTableIssue(fmt.Sprintf("file: %s, ReadFile err: %v", filePath, e))
		return
	}

	tableContent, tableDate, e := AnalysisStockTable(htm)
	if e != nil {
		logStockTableIssue(fmt.Sprintf("file: %s, AnalysisStockTable err: %v", filePath, e))
		return
	}

	// Request body for the table-handling route.
	params := map[string]interface{}{
		"Table": &HandleTableData{
			ClassifyId:   classifyID,
			FromPage:     filePath,
			TableDate:    tableDate,
			TableContent: tableContent,
		},
		"TerminalCode": utils.TerminalCode,
	}

	result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_TABLE_HANDLE)
	if e != nil {
		// The marshalled params are only used to enrich the log line, so a
		// marshal failure is deliberately ignored here.
		b, _ := json.Marshal(params)
		logStockTableIssue(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", filePath, e, string(b)))
		return
	}

	// Decode into a value rather than through a pointer-to-pointer: a JSON
	// `null` body would otherwise reset the pointer to nil and the Ret check
	// below would panic.
	var resp models.BaseEdbLibResponse
	if e = json.Unmarshal(result, &resp); e != nil {
		logStockTableIssue(fmt.Sprintf("file: %s, json.Unmarshal err: %v", filePath, e))
		return
	}
	if resp.Ret != 200 {
		logStockTableIssue(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", filePath, resp.Msg, resp.ErrMsg))
	}
}

// logStockTableIssue prints msg to stdout and records it in the file log.
func logStockTableIssue(msg string) {
	fmt.Println(msg)
	utils.FileLog.Info(msg)
}