123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- package base_from_ccf
- import (
- "context"
- "encoding/json"
- "eta/eta_data_analysis/models"
- "eta/eta_data_analysis/utils"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "io/ioutil"
- "strings"
- "time"
- )
// HandleTableData carries one table extracted from a saved report page,
// together with the metadata sent alongside it when the table is submitted.
type HandleTableData struct {
	// ClassifyId is the classification ID the table is filed under.
	ClassifyId int `description:"分类ID"`
	// FromPage identifies the page (file path) the table was taken from.
	FromPage string `description:"表格来源"`
	// TableDate is the date associated with the table.
	TableDate time.Time `description:"表格日期"`
	// TableContent is the table's HTML markup.
	TableContent string `description:"表格HTML"`
}
- // TaskStockTable 获取装置表格
- func TaskStockTable(context.Context) (err error) {
- defer func() {
- if err != nil {
- tips := fmt.Sprintf("TaskStockTable ErrMsg: %s", err.Error())
- utils.FileLog.Info(tips)
- fmt.Println(tips)
- }
- }()
- taskNames := []string{"PTA装置", "MEG装置", "PX装置"}
- readLimit := utils.CCFStockFetchNum
- for _, nameKey := range taskNames {
- fetchRule, e := loadDataRule(nameKey)
- if e != nil {
- utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
- continue
- }
- // 解析前N篇报告
- files, e := savePageHtml(nameKey, fetchRule.PageDir, false, readLimit)
- if e != nil {
- utils.FileLog.Info(fmt.Sprintf("%s保存首页失败, err: %v\n", nameKey, e))
- continue
- }
- readCount := 0
- for _, v := range files {
- readCount += 1
- if readCount > readLimit {
- break
- }
- htm, e := ioutil.ReadFile(v)
- if e != nil {
- fmt.Printf("file: %s, ReadFile err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, ReadFile err: %v", v, e))
- continue
- }
- tableContent, tableDate, e := AnalysisStockTable(htm)
- if e != nil {
- fmt.Printf("file: %s, AnalysisStockTable err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, AnalysisStockTable err: %v", v, e))
- continue
- }
- tableItem := new(HandleTableData)
- tableItem.ClassifyId = fetchRule.StockTable.ClassifyId
- tableItem.FromPage = v
- tableItem.TableDate = tableDate
- tableItem.TableContent = tableContent
- // 写入数据库
- params := make(map[string]interface{})
- params["Table"] = tableItem
- params["TerminalCode"] = utils.TerminalCode
- result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_TABLE_HANDLE)
- if e != nil {
- b, _ := json.Marshal(params)
- fmt.Printf("file: %s, postEdbLib err: %v, params: %s\n", v, e, string(b))
- utils.FileLog.Info(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", v, e, string(b)))
- continue
- }
- resp := new(models.BaseEdbLibResponse)
- if e = json.Unmarshal(result, &resp); e != nil {
- fmt.Printf("file: %s, json.Unmarshal err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, json.Unmarshal err: %v", v, e))
- continue
- }
- if resp.Ret != 200 {
- fmt.Printf("file: %s, Msg: %s, ErrMsg: %s\n", v, resp.Msg, resp.ErrMsg)
- utils.FileLog.Info(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", v, resp.Msg, resp.ErrMsg))
- continue
- }
- }
- }
- return
- }
- // AnalysisStockTable 解析装置表格
- func AnalysisStockTable(htm []byte) (tableContent string, tableTime time.Time, err error) {
- doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
- if e != nil {
- err = fmt.Errorf("NewDocumentFromReader err: %v", e)
- return
- }
- // 从收藏按钮往上找table, 取出报告发布日期
- collectEle := doc.Find("#savenews")
- publishTimeTab := collectEle.ParentsFiltered("table").First()
- publishTxt := publishTimeTab.Find("td:first-child").Text()
- //fmt.Println("publishTxt: ", publishTxt)
- publishTime, e := extractReportPublishTime(publishTxt)
- if e != nil {
- err = fmt.Errorf("extractReportPublishTime err: %v", e)
- return
- }
- if publishTime.IsZero() {
- err = fmt.Errorf("发布日期有误")
- return
- }
- //fmt.Println(publishTime)
- tableTime = publishTime
- //publishYear := publishTime.Year()
- //fmt.Println(publishYear)
- // 查找包含关键词的标签
- keyElement := doc.Find("#newscontent")
- table := keyElement.ChildrenFiltered("table").First()
- if table.Length() <= 0 {
- err = fmt.Errorf("表格未找到")
- return
- }
- h, e := table.Html()
- if e != nil {
- err = fmt.Errorf("表格HTML有误, err: %v", e)
- return
- }
- tableContent = fmt.Sprintf("<table>%s</table>", h)
- //tableContent = regexp.MustCompile(`\n`).ReplaceAllString(tableContent, "")
- return
- }
- // ReadStockHistoryFiles 读取历史文件
- func ReadStockHistoryFiles(context.Context) {
- var err error
- defer func() {
- if err != nil {
- tips := fmt.Sprintf("ReadStockHistoryFiles ErrMsg: %s", err.Error())
- utils.FileLog.Info(tips)
- fmt.Println(tips)
- }
- }()
- taskNames := []string{"PTA装置", "MEG装置", "PX装置"}
- for _, nameKey := range taskNames {
- fetchRule, e := loadDataRule(nameKey)
- if e != nil {
- utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
- continue
- }
- filePaths, e := listFiles(fetchRule.PageDir)
- if e != nil {
- utils.FileLog.Info(fmt.Sprintf("%s读取文件目录失败, err: %v\n", nameKey, e))
- continue
- }
- for _, v := range filePaths {
- v = fmt.Sprintf("%s/%s", fetchRule.PageDir, v)
- fmt.Printf("开始解析: %s", v)
- //htm, e := ioutil.ReadFile("static/ccf/oil_daily/原油石化早报(4.18).html")
- htm, e := ioutil.ReadFile(v)
- if e != nil {
- fmt.Printf("file: %s, ReadFile err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, ReadFile err: %v", v, e))
- continue
- }
- tableContent, tableDate, e := AnalysisStockTable(htm)
- if e != nil {
- fmt.Printf("file: %s, AnalysisStockTable err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, AnalysisStockTable err: %v", v, e))
- continue
- }
- tableItem := new(HandleTableData)
- tableItem.ClassifyId = fetchRule.StockTable.ClassifyId
- tableItem.FromPage = v
- tableItem.TableDate = tableDate
- tableItem.TableContent = tableContent
- // 写入数据库
- params := make(map[string]interface{})
- params["Table"] = tableItem
- params["TerminalCode"] = utils.TerminalCode
- result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_TABLE_HANDLE)
- if e != nil {
- b, _ := json.Marshal(params)
- fmt.Printf("file: %s, postEdbLib err: %v, params: %s\n", v, e, string(b))
- utils.FileLog.Info(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", v, e, string(b)))
- continue
- }
- resp := new(models.BaseEdbLibResponse)
- if e = json.Unmarshal(result, &resp); e != nil {
- fmt.Printf("file: %s, json.Unmarshal err: %v\n", v, e)
- utils.FileLog.Info(fmt.Sprintf("file: %s, json.Unmarshal err: %v", v, e))
- continue
- }
- if resp.Ret != 200 {
- fmt.Printf("file: %s, Msg: %s, ErrMsg: %s\n", v, resp.Msg, resp.ErrMsg)
- utils.FileLog.Info(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", v, resp.Msg, resp.ErrMsg))
- continue
- }
- }
- }
- return
- }
|