edb.go 40 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333
  1. package base_from_ccf
  2. import (
  3. "context"
  4. "encoding/json"
  5. "eta/eta_data_analysis/models"
  6. "eta/eta_data_analysis/utils"
  7. "fmt"
  8. "github.com/PuerkitoBio/goquery"
  9. "io/ioutil"
  10. "strconv"
  11. "strings"
  12. "time"
  13. )
  14. // TaskAnalysisHandlers 解析表格的函数
  15. var TaskAnalysisHandlers = map[string]func(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error){
  16. "原油石化早报": AnalysisOilReportEdb,
  17. "PTA周报": AnalysisPTAWeekEdb,
  18. "MEG周报": AnalysisMEGWeekEdb,
  19. "长丝周报": AnalysisChangSiWeekEdb,
  20. "短纤周报": AnalysisDuanXianWeekEdb,
  21. "瓶片周报": AnalysisPingPianWeekEdb,
  22. "切片周报": AnalysisQiePianWeekEdb,
  23. "PX周报": AnalysisPXWeekEdb,
  24. }
  25. // HandleIndexData 指标数据
  26. type HandleIndexData struct {
  27. IndexName string `description:"指标名称"`
  28. IndexCode string `description:"指标编码"`
  29. ClassifyId int `description:"分类ID"`
  30. Unit string `description:"单位"`
  31. Sort int `description:"排序"`
  32. Frequency string `description:"频度"`
  33. TerminalCode string `description:"终端编码"`
  34. DateData map[string]string `description:"日期数据"`
  35. }
  36. // TaskOilDailyEdb 获取原油石化早报指标
  37. func TaskOilDailyEdb(context.Context) (err error) {
  38. defer func() {
  39. if err != nil {
  40. tips := fmt.Sprintf("TaskOilEdbDaily ErrMsg: %s", err.Error())
  41. utils.FileLog.Info(tips)
  42. fmt.Println(tips)
  43. }
  44. }()
  45. nameKey := "原油石化早报"
  46. fetchRule, e := loadDataRule(nameKey)
  47. if e != nil {
  48. err = fmt.Errorf("loadDataRule, err: %v", e)
  49. return
  50. }
  51. // 解析前N篇报告
  52. readLimit := utils.CCFDailyFetchNum
  53. filePaths, e := savePageHtml(nameKey, fetchRule.PageDir, false, readLimit)
  54. if e != nil {
  55. err = fmt.Errorf("savePageHtml, err: %v", e)
  56. return
  57. }
  58. readCount := 0
  59. for _, v := range filePaths {
  60. readCount += 1
  61. if readCount > readLimit {
  62. return
  63. }
  64. htm, e := ioutil.ReadFile(v)
  65. if e != nil {
  66. fmt.Printf("file: %s, ReadFile err: %v\n", v, e)
  67. utils.FileLog.Info(fmt.Sprintf("file: %s, ReadFile err: %v", v, e))
  68. continue
  69. }
  70. handler, ok := TaskAnalysisHandlers[nameKey]
  71. if !ok {
  72. utils.FileLog.Info(fmt.Sprintf("%s无解析函数\n", nameKey))
  73. continue
  74. }
  75. indexes, e := handler(htm, fetchRule)
  76. if e != nil {
  77. fmt.Printf("file: %s, AnalysisOilReportEdb err: %v\n", v, e)
  78. utils.FileLog.Info(fmt.Sprintf("file: %s, AnalysisOilReportEdb err: %v", v, e))
  79. continue
  80. }
  81. // 写入数据库
  82. params := make(map[string]interface{})
  83. params["List"] = indexes
  84. params["TerminalCode"] = utils.TerminalCode
  85. result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE)
  86. if e != nil {
  87. b, _ := json.Marshal(params)
  88. fmt.Printf("file: %s, postEdbLib err: %v, params: %s\n", v, e, string(b))
  89. utils.FileLog.Info(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", v, e, string(b)))
  90. continue
  91. }
  92. resp := new(models.BaseEdbLibResponse)
  93. if e = json.Unmarshal(result, &resp); e != nil {
  94. fmt.Printf("file: %s, json.Unmarshal err: %v\n", v, e)
  95. utils.FileLog.Info(fmt.Sprintf("file: %s, json.Unmarshal err: %v", v, e))
  96. continue
  97. }
  98. if resp.Ret != 200 {
  99. fmt.Printf("file: %s, Msg: %s, ErrMsg: %s\n", v, resp.Msg, resp.ErrMsg)
  100. utils.FileLog.Info(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", v, resp.Msg, resp.ErrMsg))
  101. continue
  102. }
  103. }
  104. return
  105. }
  106. // TaskWeeklyEdb 获取周报指标
  107. func TaskWeeklyEdb(context.Context) (err error) {
  108. defer func() {
  109. if err != nil {
  110. tips := fmt.Sprintf("TaskWeeklyEdb ErrMsg: %s", err.Error())
  111. utils.FileLog.Info(tips)
  112. fmt.Println(tips)
  113. }
  114. }()
  115. taskNames := []string{"PTA周报", "MEG周报", "长丝周报", "短纤周报", "瓶片周报", "切片周报", "PX周报"}
  116. readLimit := utils.CCFWeeklyFetchNum
  117. for _, nameKey := range taskNames {
  118. fmt.Printf("开始获取: %s\n", nameKey)
  119. fetchRule, e := loadDataRule(nameKey)
  120. if e != nil {
  121. utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
  122. continue
  123. }
  124. handler, ok := TaskAnalysisHandlers[nameKey]
  125. if !ok {
  126. //fmt.Printf("%s无解析函数\n", nameKey)
  127. utils.FileLog.Info(fmt.Sprintf("%s无解析函数\n", nameKey))
  128. continue
  129. }
  130. // 解析前N篇报告
  131. files, e := savePageHtml(nameKey, fetchRule.PageDir, false, readLimit)
  132. if e != nil {
  133. //fmt.Printf("%s保存首页失败, err: %v\n", nameKey, e)
  134. utils.FileLog.Info(fmt.Sprintf("%s保存首页失败, err: %v\n", nameKey, e))
  135. continue
  136. }
  137. readCount := 0
  138. for _, v := range files {
  139. readCount += 1
  140. if readCount > readLimit {
  141. break
  142. }
  143. htm, e := ioutil.ReadFile(v)
  144. if e != nil {
  145. //fmt.Printf("file: %s, ReadFile err: %v\n", v, e)
  146. utils.FileLog.Info(fmt.Sprintf("file: %s, ReadFile err: %v", v, e))
  147. continue
  148. }
  149. indexes, e := handler(htm, fetchRule)
  150. if e != nil {
  151. //fmt.Printf("file: %s, AnalysisOilReportEdb err: %v\n", v, e)
  152. utils.FileLog.Info(fmt.Sprintf("file: %s, AnalysisOilReportEdb err: %v", v, e))
  153. continue
  154. }
  155. // 写入数据库
  156. params := make(map[string]interface{})
  157. params["List"] = indexes
  158. params["TerminalCode"] = utils.TerminalCode
  159. result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE)
  160. if e != nil {
  161. b, _ := json.Marshal(params)
  162. //fmt.Printf("file: %s, postEdbLib err: %v, params: %s\n", v, e, string(b))
  163. utils.FileLog.Info(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", v, e, string(b)))
  164. continue
  165. }
  166. resp := new(models.BaseEdbLibResponse)
  167. if e = json.Unmarshal(result, &resp); e != nil {
  168. //fmt.Printf("file: %s, json.Unmarshal err: %v\n", v, e)
  169. utils.FileLog.Info(fmt.Sprintf("file: %s, json.Unmarshal err: %v", v, e))
  170. continue
  171. }
  172. if resp.Ret != 200 {
  173. //fmt.Printf("file: %s, Msg: %s, ErrMsg: %s\n", v, resp.Msg, resp.ErrMsg)
  174. utils.FileLog.Info(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", v, resp.Msg, resp.ErrMsg))
  175. continue
  176. }
  177. }
  178. fmt.Printf("结束获取: %s\n", nameKey)
  179. }
  180. return
  181. }
  182. // AnalysisOilReportEdb 解析原油石化早报中的指标数据
  183. func AnalysisOilReportEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  184. if len(htm) == 0 || fetchRule == nil {
  185. utils.FileLog.Info("htm empty")
  186. return
  187. }
  188. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  189. if e != nil {
  190. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  191. return
  192. }
  193. // 找到表格
  194. keyElement := doc.Find("#newscontent")
  195. tableBody := keyElement.ChildrenFiltered("table").First().ChildrenFiltered("tbody")
  196. if tableBody.Length() <= 0 {
  197. err = fmt.Errorf("表格未找到")
  198. return
  199. }
  200. colDates := make(map[int]string)
  201. colLen := tableBody.Children().First().Find("td").Length()
  202. attemptDates := []string{"2006/1/2", "2006/01/02", "2006-01-02", "2006-1-2", "2006.01.02", "2006.1.2"}
  203. var rows []TableRow
  204. var mergeBegin, mergeRows int
  205. var mergeProduct string
  206. tableBody.Children().Each(func(i int, s *goquery.Selection) {
  207. cells := s.Find("td")
  208. // 从表头取出日期列
  209. // 格式: [产品|市场|日期列(列数不定)|涨跌|单位]
  210. if i == 0 {
  211. cells.Each(func(ii int, ss *goquery.Selection) {
  212. cellTxt := strings.TrimSpace(ss.Text())
  213. if ii > 1 && ii < colLen-2 {
  214. var d time.Time
  215. // 尝试解析日期
  216. for _, a := range attemptDates {
  217. t, e := time.ParseInLocation(a, cellTxt, time.Local)
  218. if e == nil {
  219. d = t
  220. break
  221. }
  222. }
  223. //fmt.Println("colDate: ", d)
  224. if !d.IsZero() {
  225. colDates[ii] = d.Format(utils.FormatDate)
  226. }
  227. }
  228. })
  229. }
  230. // 取指标
  231. if i > 0 {
  232. row := TableRow{
  233. DateData: make(map[string]string),
  234. }
  235. mergedRow := false // 是否为被合并行
  236. cellsLen := cells.Length()
  237. cells.Each(func(ii int, cell *goquery.Selection) {
  238. cellData := filterInvalidVal(cell.Text())
  239. if cellData == "" {
  240. return
  241. }
  242. switch ii {
  243. case 0:
  244. // 被合并行为市场列, 其余为产品列
  245. hasMerge, _ := cell.Attr("rowspan")
  246. if hasMerge != "" {
  247. // 开始合并行
  248. mergeRows, _ = strconv.Atoi(hasMerge)
  249. mergeBegin = i
  250. row.Product = cellData
  251. mergeProduct = row.Product
  252. } else {
  253. // 被合并行的后一行, 重置合并计数
  254. if i >= (mergeBegin + mergeRows) {
  255. mergeBegin = 0
  256. mergeRows = 0
  257. }
  258. // 被合并行, 第一列为市场
  259. if mergeBegin > 0 && mergeRows > 0 && i < (mergeBegin+mergeRows) {
  260. row.Product = mergeProduct
  261. row.Market = cellData
  262. mergedRow = true
  263. }
  264. if mergeBegin == 0 && mergeRows == 0 {
  265. row.Product = cellData
  266. }
  267. }
  268. case 1:
  269. // 被合并行为日期列, 其余为市场列
  270. if mergedRow {
  271. d, ok := colDates[ii+1]
  272. if ok {
  273. row.DateData[d] = formatIntervalData(cellData, "")
  274. }
  275. } else {
  276. row.Market = cellData
  277. }
  278. case cellsLen - 2:
  279. // 忽略涨跌列
  280. case cellsLen - 1:
  281. row.Unit = cellData
  282. default:
  283. // 日期列
  284. if mergedRow {
  285. d, ok := colDates[ii+1]
  286. if ok {
  287. row.DateData[d] = formatIntervalData(cellData, "")
  288. }
  289. } else {
  290. d, ok := colDates[ii]
  291. if ok {
  292. row.DateData[d] = formatIntervalData(cellData, "")
  293. }
  294. }
  295. }
  296. })
  297. rows = append(rows, row)
  298. }
  299. })
  300. // 只取需要的指标
  301. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  302. return
  303. }
  304. // AnalysisPTAWeekEdb 解析PTA周报中的指标数据
  305. func AnalysisPTAWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  306. if len(htm) == 0 || fetchRule == nil {
  307. utils.FileLog.Info("htm empty")
  308. return
  309. }
  310. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  311. if e != nil {
  312. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  313. return
  314. }
  315. burdenTitle, ptaTitle := "负荷", "PTA库存"
  316. // 从收藏按钮往上找table, 取出报告发布日期
  317. collectEle := doc.Find("#savenews")
  318. publishTimeTab := collectEle.ParentsFiltered("table").First()
  319. publishTxt := publishTimeTab.Find("td:first-child").Text()
  320. publishTime, e := extractReportPublishTime(publishTxt)
  321. if e != nil {
  322. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  323. return
  324. }
  325. //publishYear := publishTime.Year()
  326. //fmt.Println(publishTime)
  327. //fmt.Println("年份", publishYear)
  328. // 遍历h2, 找出负荷和PTA库存下第一个table
  329. burdenTable, ptaTable := new(goquery.Selection), new(goquery.Selection)
  330. h2Selections := doc.Find("h2")
  331. h2Selections.Each(func(i int, h2 *goquery.Selection) {
  332. //fmt.Println(i, h2.Text())
  333. if strings.Contains(h2.Text(), burdenTitle) {
  334. burdenTable = h2.NextAllFiltered("table").First()
  335. }
  336. if strings.Contains(h2.Text(), ptaTitle) {
  337. ptaTable = h2.NextAllFiltered("table").First()
  338. }
  339. })
  340. // 负荷
  341. //var rows []TableRow
  342. //var burdenRows []TableRow
  343. //var burdenDataTime string
  344. //burdenTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  345. // // 表头取出日期
  346. // cells := s.Find("td")
  347. // if i == 0 {
  348. // cells.Each(func(ii int, ss *goquery.Selection) {
  349. // cellTxt := strings.TrimSpace(ss.Text())
  350. // if ii == 2 {
  351. // strTime := fmt.Sprintf("%d年%s", publishYear, cellTxt)
  352. // t, e := time.ParseInLocation("2006年01月02日", strTime, time.Local)
  353. // if e != nil {
  354. // err = fmt.Errorf("解析PTA负荷数据日期失败, err: %v", e)
  355. // return
  356. // }
  357. // burdenDataTime = t.Format(utils.FormatDate)
  358. // }
  359. // })
  360. // }
  361. // // 取指标
  362. // if i > 0 {
  363. // row := TableRow{
  364. // DateData: make(map[string]string),
  365. // }
  366. // cells.Each(func(ii int, ss *goquery.Selection) {
  367. // cellTxt := strings.TrimSpace(ss.Text())
  368. // switch ii {
  369. // case 0:
  370. // row.Product = cellTxt
  371. // case 1:
  372. // row.Market = cellTxt
  373. // case 2:
  374. // row.DateData[burdenDataTime] = cellTxt
  375. // }
  376. // })
  377. // //row.Unit = burdenUnit
  378. // burdenRows = append(burdenRows, row)
  379. // }
  380. //})
  381. //rows = append(rows, burdenRows...)
  382. var rows []TableRow
  383. var analysisPars AnalysisNoneMergeTablePars
  384. analysisPars.DocTable = burdenTable
  385. analysisPars.MarketCol.HasCol = true
  386. analysisPars.MarketCol.ColIndex = 1
  387. analysisPars.DateCol.StartIndex = 2
  388. analysisPars.DateCol.EndIndex = 3
  389. analysisPars.DateCol.PublishTime = publishTime
  390. //analysisPars.DateCol.PublishYear = publishYear
  391. analysisPars.DateCol.StrTimeFormat = "%d年%s"
  392. analysisPars.DateCol.TimeFormat = []string{"2006年01月02日", "2006年1月2日"}
  393. burdenRows := analysisNoneMergeTable(analysisPars)
  394. rows = append(rows, burdenRows...)
  395. // PTA库存, 存在特殊格式
  396. ptaRows := make(map[int]TableRow)
  397. ptaTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  398. cells := s.Find("td")
  399. cellLen := cells.Length()
  400. // 判断tr下td的长度, 兼容处理
  401. // td长度为2, 数据日期取发布日期
  402. if cellLen == 2 {
  403. if i == 0 {
  404. cells.Each(func(ii int, ss *goquery.Selection) {
  405. cellTxt := strings.TrimSpace(ss.Text())
  406. row := TableRow{
  407. Product: cellTxt,
  408. DateData: make(map[string]string),
  409. }
  410. ptaRows[ii] = row
  411. })
  412. }
  413. if i > 0 {
  414. cells.Each(func(ii int, ss *goquery.Selection) {
  415. cellTxt := filterInvalidVal(ss.Text())
  416. if cellTxt == "" {
  417. return
  418. }
  419. val, e := calculateDataHalfVal(cellTxt)
  420. if e != nil {
  421. utils.FileLog.Info(fmt.Sprintf("PTA周报-calculateDataHalfVal: cellTxt-%s, err: %v", cellTxt, e))
  422. return
  423. }
  424. ptaRows[ii].DateData[publishTime.Format(utils.FormatDate)] = val
  425. })
  426. }
  427. }
  428. // 大于2时, 内容第一列为日期
  429. if cellLen > 2 {
  430. if i == 0 {
  431. cells.Each(func(ii int, ss *goquery.Selection) {
  432. if ii == 0 {
  433. return
  434. }
  435. cellTxt := strings.TrimSpace(ss.Text())
  436. row := TableRow{
  437. Product: cellTxt,
  438. DateData: make(map[string]string),
  439. }
  440. ptaRows[ii] = row
  441. })
  442. }
  443. if i > 0 {
  444. var dataTime string
  445. cells.Each(func(ii int, ss *goquery.Selection) {
  446. cellTxt := filterInvalidVal(ss.Text())
  447. if cellTxt == "" {
  448. return
  449. }
  450. if ii == 0 {
  451. strTime := fmt.Sprintf("%d/%s", publishTime.Year(), cellTxt)
  452. t, e := time.ParseInLocation("2006/1/2", strTime, time.Local)
  453. if e != nil {
  454. fmt.Printf("time parse err: %v", e)
  455. return
  456. }
  457. // 判断报告是否跨年
  458. if t.AddDate(0, -6, 0).After(publishTime) {
  459. utils.FileLog.Info(fmt.Sprintf("跨年判断-2: ColTime-%v; PublishTime-%v", t, publishTime))
  460. t = t.AddDate(-1, 0, 0)
  461. }
  462. dataTime = t.Format(utils.FormatDate)
  463. return
  464. }
  465. val, e := calculateDataHalfVal(cellTxt)
  466. if e != nil {
  467. fmt.Printf("calculateDataHalfVal err: %v\n", e)
  468. return
  469. }
  470. if dataTime != "" && val != "" {
  471. ptaRows[ii].DateData[dataTime] = val
  472. }
  473. })
  474. }
  475. }
  476. })
  477. for _, v := range ptaRows {
  478. rows = append(rows, v)
  479. }
  480. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  481. return
  482. }
  483. // AnalysisMEGWeekEdb 解析MEG周报中的指标数据
  484. func AnalysisMEGWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  485. if len(htm) == 0 || fetchRule == nil {
  486. utils.FileLog.Info("htm empty")
  487. return
  488. }
  489. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  490. if e != nil {
  491. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  492. return
  493. }
  494. burdenTitle, stockTitle := "CCF指数", "MEG华东港口库存情况"
  495. // 从收藏按钮往上找table, 取出报告发布日期
  496. collectEle := doc.Find("#savenews")
  497. publishTimeTab := collectEle.ParentsFiltered("table").First()
  498. publishTxt := publishTimeTab.Find("td:first-child").Text()
  499. //fmt.Println("publishTxt: ", publishTxt)
  500. publishTime, e := extractReportPublishTime(publishTxt)
  501. if e != nil {
  502. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  503. return
  504. }
  505. //publishYear := publishTime.Year()
  506. //fmt.Println(publishTime)
  507. //fmt.Println(publishYear)
  508. // 遍历h2, 找出对应Table
  509. burdenTable, stockTable := new(goquery.Selection), new(goquery.Selection)
  510. //h2Selections := doc.Find("h2")
  511. doc.Find("h2").Each(func(i int, h2 *goquery.Selection) {
  512. //fmt.Println(i, h2.Text())
  513. if strings.Contains(h2.Text(), burdenTitle) {
  514. burdenTable = h2.NextAllFiltered("table").First()
  515. }
  516. if strings.Contains(h2.Text(), stockTitle) {
  517. stockTable = h2.NextAllFiltered("table").First()
  518. }
  519. })
  520. // 负荷-存在合并行
  521. var rows []TableRow
  522. //var burdenRows []TableRow
  523. {
  524. //var burdenDataTime string
  525. var mergeBegin, mergeRows int
  526. var mergeProduct string
  527. burdenColDate := make(map[int]string) // 日期列key->日期
  528. burdenTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  529. // 表头取出日期
  530. cells := s.Find("td")
  531. if i == 0 {
  532. cells.Each(func(ii int, ss *goquery.Selection) {
  533. cellTxt := strings.TrimSpace(ss.Text())
  534. if cellTxt == "" {
  535. return
  536. }
  537. if ii == 2 || ii == 3 {
  538. //fmt.Println("日期列")
  539. strTime := fmt.Sprintf("%d年%s", publishTime.Year(), cellTxt)
  540. //fmt.Println("日期str", strTime)
  541. t, e := time.ParseInLocation("2006年1月2日", strTime, time.Local)
  542. if e != nil {
  543. utils.FileLog.Info(fmt.Sprintf("MEG周报-日期解析: cellTxt-%s, err: %v", cellTxt, e))
  544. //fmt.Println("e: ", e)
  545. //err = fmt.Errorf("解析MEG负荷数据日期失败, err: %v", e)
  546. return
  547. }
  548. // 判断报告是否跨年
  549. if t.AddDate(0, -6, 0).After(publishTime) {
  550. utils.FileLog.Info(fmt.Sprintf("跨年判断-MEG: ColTime-%v; PublishTime-%v", t, publishTime))
  551. t = t.AddDate(-1, 0, 0)
  552. }
  553. if !t.IsZero() {
  554. burdenColDate[ii] = t.Format(utils.FormatDate)
  555. }
  556. //fmt.Println("日期:", t.Format(utils.FormatDate))
  557. }
  558. })
  559. }
  560. // 取指标
  561. if i > 0 {
  562. row := TableRow{
  563. DateData: make(map[string]string),
  564. }
  565. mergedRow := false // 是否为被合并行
  566. cells.Each(func(ii int, ss *goquery.Selection) {
  567. cellTxt := filterInvalidVal(ss.Text())
  568. if cellTxt == "" {
  569. return
  570. }
  571. switch ii {
  572. case 0:
  573. // 被合并行为市场列, 其余为产品列
  574. hasMerge, _ := ss.Attr("rowspan")
  575. if hasMerge != "" {
  576. // 开始合并行
  577. mergeRows, _ = strconv.Atoi(hasMerge)
  578. mergeBegin = i
  579. row.Product = cellTxt
  580. mergeProduct = row.Product
  581. } else {
  582. // 被合并行的后一行, 重置合并计数
  583. if i >= (mergeBegin + mergeRows) {
  584. mergeBegin = 0
  585. mergeRows = 0
  586. }
  587. // 被合并行第一列为产品
  588. if mergeBegin > 0 && mergeRows > 0 && i < (mergeBegin+mergeRows) {
  589. row.Product = mergeProduct
  590. row.Market = cellTxt
  591. mergedRow = true
  592. //fmt.Println("被合并行: ", i, mergeBegin+mergeRows)
  593. }
  594. if mergeBegin == 0 && mergeRows == 0 {
  595. row.Product = cellTxt
  596. }
  597. }
  598. case 1:
  599. // 被合并行为值列, 其余为市场列
  600. if mergedRow {
  601. d, ok := burdenColDate[ii+1]
  602. if ok {
  603. row.DateData[d] = cellTxt
  604. }
  605. } else {
  606. row.Market = cellTxt
  607. }
  608. case 2:
  609. if mergedRow {
  610. d, ok := burdenColDate[ii+1]
  611. if ok {
  612. row.DateData[d] = cellTxt
  613. }
  614. } else {
  615. d, ok := burdenColDate[ii]
  616. if ok {
  617. row.DateData[d] = cellTxt
  618. }
  619. }
  620. case 3:
  621. if !mergedRow {
  622. d, ok := burdenColDate[ii]
  623. if ok {
  624. row.DateData[d] = cellTxt
  625. }
  626. }
  627. }
  628. })
  629. rows = append(rows, row)
  630. }
  631. })
  632. }
  633. // 库存
  634. //var stockRows []TableRow
  635. //{
  636. // colDate := make(map[int]string) // 日期列key->日期
  637. // stockTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  638. // cells := s.Find("td")
  639. //
  640. // // 表头取出日期
  641. // if i == 0 {
  642. // cells.Each(func(ii int, ss *goquery.Selection) {
  643. // cellTxt := strings.TrimSpace(ss.Text())
  644. // if ii > 0 {
  645. // t, e := time.ParseInLocation("2006/1/2", cellTxt, time.Local)
  646. // if e != nil {
  647. // fmt.Println("e: ", e)
  648. // //err = fmt.Errorf("解析MEG负荷数据日期失败, err: %v", e)
  649. // return
  650. // }
  651. // colDate[ii] = t.Format(utils.FormatDate)
  652. // fmt.Println("日期:", t.Format(utils.FormatDate))
  653. // }
  654. // })
  655. // }
  656. //
  657. // // 取指标
  658. // if i > 0 {
  659. // row := TableRow{
  660. // Product: stockTitle,
  661. // //Unit: stockUnit,
  662. // DateData: make(map[string]string),
  663. // }
  664. // cells.Each(func(ii int, ss *goquery.Selection) {
  665. // cellTxt := strings.TrimSpace(ss.Text())
  666. // switch ii {
  667. // case 0:
  668. // row.Market = cellTxt
  669. // case 1, 2:
  670. // row.DateData[colDate[ii]] = cellTxt
  671. // }
  672. // })
  673. // fmt.Println(row)
  674. // stockRows = append(stockRows, row)
  675. // }
  676. // })
  677. //}
  678. var analysisPars AnalysisNoneMergeTablePars
  679. analysisPars.DocTable = stockTable
  680. analysisPars.DateCol.StartIndex = 1
  681. analysisPars.DateCol.EndIndex = 2
  682. analysisPars.DateCol.PublishTime = publishTime
  683. //analysisPars.DateCol.PublishYear = publishYear
  684. analysisPars.DateCol.StrTimeFormat = ""
  685. analysisPars.DateCol.TimeFormat = []string{"2006/1/2"}
  686. stockRows := analysisNoneMergeTable(analysisPars)
  687. rows = append(rows, stockRows...)
  688. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  689. //fmt.Println(111)
  690. return
  691. }
  692. // AnalysisChangSiWeekEdb 解析长丝周报中的指标数据
  693. func AnalysisChangSiWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  694. if len(htm) == 0 || fetchRule == nil {
  695. utils.FileLog.Info("htm empty")
  696. return
  697. }
  698. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  699. if e != nil {
  700. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  701. return
  702. }
  703. burdenTitle, stockTitle, observeTitle := "负荷指数", "库存指数", "下游观察"
  704. // 从收藏按钮往上找table, 取出报告发布日期
  705. collectEle := doc.Find("#savenews")
  706. publishTimeTab := collectEle.ParentsFiltered("table").First()
  707. publishTxt := publishTimeTab.Find("td:first-child").Text()
  708. //fmt.Println("publishTxt: ", publishTxt)
  709. publishTime, e := extractReportPublishTime(publishTxt)
  710. if e != nil {
  711. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  712. return
  713. }
  714. //publishYear := publishTime.Year()
  715. //fmt.Println(publishTime)
  716. //fmt.Println(publishYear)
  717. // 遍历h2, 找出对应Table
  718. burdenTable, stockTable, observeTable := new(goquery.Selection), new(goquery.Selection), new(goquery.Selection)
  719. //h2Selections := doc.Find("h2")
  720. doc.Find("h2").Each(func(i int, h2 *goquery.Selection) {
  721. //fmt.Println(i, h2.Text())
  722. if strings.Contains(h2.Text(), burdenTitle) {
  723. burdenTable = h2.NextAllFiltered("table").First()
  724. }
  725. if strings.Contains(h2.Text(), stockTitle) {
  726. stockTable = h2.NextAllFiltered("table").First()
  727. }
  728. if strings.Contains(h2.Text(), observeTitle) {
  729. observeTable = h2.NextAllFiltered("table").First()
  730. }
  731. })
  732. // 负荷/下游观察解析
  733. //noneMergeAnalysis := func(docTable *goquery.Selection, unit string) (items []TableRow) {
  734. // colDate := make(map[int]string)
  735. //
  736. // docTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  737. // cells := s.Find("td")
  738. //
  739. // // 表头取出日期
  740. // if i == 0 {
  741. // cells.Each(func(ii int, ss *goquery.Selection) {
  742. // cellTxt := strings.TrimSpace(ss.Text())
  743. // fmt.Println("cellTxt", cellTxt)
  744. // if ii >= 1 && ii <= 3 {
  745. // strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  746. // t, e := time.ParseInLocation("2006.01.02", strTime, time.Local)
  747. // if e != nil {
  748. // fmt.Println("e: ", e)
  749. // //err = fmt.Errorf("解析MEG负荷数据日期失败, err: %v", e)
  750. // return
  751. // }
  752. // colDate[ii] = t.Format(utils.FormatDate)
  753. // fmt.Println("日期:", t.Format(utils.FormatDate))
  754. // }
  755. // })
  756. // }
  757. //
  758. // // 取指标
  759. // if i > 0 {
  760. // row := TableRow{
  761. // //Product: stockTitle,
  762. // Unit: unit,
  763. // DateData: make(map[string]string),
  764. // }
  765. // cells.Each(func(ii int, ss *goquery.Selection) {
  766. // cellTxt := strings.TrimSpace(ss.Text())
  767. // fmt.Println("cellTxt", cellTxt)
  768. // switch ii {
  769. // case 0:
  770. // row.Product = cellTxt
  771. // case 1, 2, 3:
  772. // row.DateData[colDate[ii]] = cellTxt
  773. // }
  774. // })
  775. // //fmt.Println(row)
  776. // items = append(items, row)
  777. // }
  778. // })
  779. // return
  780. //}
  781. // 库存解析-存在合并行
  782. mergeAnalysis := func(docTable *goquery.Selection) (items []TableRow) {
  783. var mergeBegin, mergeRows int
  784. var mergeProduct string
  785. colDate := make(map[int]string) // 日期列key->日期
  786. attemptDates := []string{"2006.01.02", "2006.1.02", "2006.01.2"}
  787. docTable.Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
  788. // 表头取出日期
  789. cells := s.Find("td")
  790. if i == 0 {
  791. cells.Each(func(ii int, ss *goquery.Selection) {
  792. cellTxt := strings.TrimSpace(ss.Text())
  793. fmt.Println("1-cellTxt", cellTxt)
  794. if ii >= 2 && ii <= 4 {
  795. //fmt.Println("日期列")
  796. strTime := fmt.Sprintf("%d.%s", publishTime.Year(), cellTxt)
  797. //fmt.Println("日期str", strTime)
  798. //t, e := time.ParseInLocation("2006.01.02", strTime, time.Local)
  799. //if e != nil {
  800. // utils.FileLog.Info(fmt.Sprintf("长丝周报-日期解析: cellTxt-%s, err: %v", cellTxt, e))
  801. // //fmt.Println("time e: ", e)
  802. // //err = fmt.Errorf("解析MEG负荷数据日期失败, err: %v", e)
  803. // return
  804. //}
  805. var colTime time.Time
  806. for _, f := range attemptDates {
  807. t, e := time.ParseInLocation(f, strTime, time.Local)
  808. if e != nil {
  809. continue
  810. }
  811. colTime = t
  812. break
  813. }
  814. // 判断报告是否跨年
  815. if colTime.AddDate(0, -6, 0).After(publishTime) {
  816. utils.FileLog.Info(fmt.Sprintf("跨年判断-长丝: ColTime-%v; PublishTime-%v", colTime, publishTime))
  817. colTime = colTime.AddDate(-1, 0, 0)
  818. }
  819. if !colTime.IsZero() {
  820. colDate[ii] = colTime.Format(utils.FormatDate)
  821. }
  822. //fmt.Println("日期:", t.Format(utils.FormatDate))
  823. }
  824. })
  825. }
  826. // 取指标
  827. if i > 0 {
  828. row := TableRow{
  829. DateData: make(map[string]string),
  830. }
  831. mergedRow := false // 是否为被合并行
  832. cells.Each(func(ii int, ss *goquery.Selection) {
  833. cellTxt := filterInvalidVal(ss.Text())
  834. fmt.Println("2-cellTxt", cellTxt)
  835. switch ii {
  836. case 0:
  837. // 被合并行为市场列, 其余为产品列
  838. hasMerge, _ := ss.Attr("rowspan")
  839. if hasMerge != "" {
  840. // 开始合并行
  841. mergeRows, _ = strconv.Atoi(hasMerge)
  842. mergeBegin = i
  843. row.Product = cellTxt
  844. mergeProduct = row.Product
  845. } else {
  846. // 被合并行的后一行, 重置合并计数
  847. if i >= (mergeBegin + mergeRows) {
  848. mergeBegin = 0
  849. mergeRows = 0
  850. }
  851. // 被合并行第一列为产品
  852. if mergeBegin > 0 && mergeRows > 0 && i < (mergeBegin+mergeRows) {
  853. row.Product = mergeProduct
  854. row.Market = cellTxt
  855. mergedRow = true
  856. //fmt.Println("被合并行: ", i, mergeBegin+mergeRows)
  857. }
  858. if mergeBegin == 0 && mergeRows == 0 {
  859. row.Product = cellTxt
  860. }
  861. }
  862. case 1:
  863. // 被合并行为值列, 其余为市场列
  864. if mergedRow {
  865. d, ok := colDate[ii+1]
  866. if ok {
  867. row.DateData[d] = cellTxt
  868. }
  869. } else {
  870. row.Market = cellTxt
  871. }
  872. case 2, 3:
  873. if mergedRow {
  874. d, ok := colDate[ii+1]
  875. if ok {
  876. row.DateData[d] = cellTxt
  877. }
  878. } else {
  879. d, ok := colDate[ii]
  880. if ok {
  881. row.DateData[d] = cellTxt
  882. }
  883. }
  884. case 4:
  885. if !mergedRow {
  886. d, ok := colDate[ii]
  887. if ok {
  888. row.DateData[d] = cellTxt
  889. }
  890. }
  891. }
  892. })
  893. items = append(items, row)
  894. }
  895. })
  896. return
  897. }
  898. // 负荷
  899. var rows []TableRow
  900. fmt.Println("blen", burdenTable.Length())
  901. if burdenTable.Length() > 0 {
  902. //items := noneMergeAnalysis(burdenTable, burdenUnit)
  903. //if len(items) > 0 {
  904. // rows = append(rows, items...)
  905. //}
  906. //strTime := fmt.Sprintf("%d.%s", publishYear, cellTxt)
  907. //t, e := time.ParseInLocation("2006.01.02", strTime, time.Local)
  908. var analysisPars AnalysisNoneMergeTablePars
  909. analysisPars.DocTable = burdenTable
  910. analysisPars.DateCol.StartIndex = 1
  911. analysisPars.DateCol.EndIndex = 3
  912. analysisPars.DateCol.PublishTime = publishTime
  913. //analysisPars.DateCol.PublishYear = publishYear
  914. analysisPars.DateCol.StrTimeFormat = "%d.%s"
  915. analysisPars.DateCol.TimeFormat = []string{"2006.01.02"}
  916. burdenRows := analysisNoneMergeTable(analysisPars)
  917. rows = append(rows, burdenRows...)
  918. }
  919. // 下游观察
  920. fmt.Println("olen", observeTable.Length())
  921. if observeTable.Length() > 0 {
  922. //items := noneMergeAnalysis(observeTable, observeUnit)
  923. //if len(items) > 0 {
  924. // rows = append(rows, items...)
  925. //}
  926. var analysisPars AnalysisNoneMergeTablePars
  927. analysisPars.DocTable = observeTable
  928. analysisPars.DateCol.StartIndex = 1
  929. analysisPars.DateCol.EndIndex = 3
  930. analysisPars.DateCol.PublishTime = publishTime
  931. //analysisPars.DateCol.PublishYear = publishYear
  932. analysisPars.DateCol.StrTimeFormat = "%d.%s"
  933. analysisPars.DateCol.TimeFormat = []string{"2006.01.02"}
  934. observeRows := analysisNoneMergeTable(analysisPars)
  935. rows = append(rows, observeRows...)
  936. }
  937. // 下游观察
  938. fmt.Println("slen", stockTable.Length())
  939. if stockTable.Length() > 0 {
  940. //fmt.Println(stockUnit)
  941. items := mergeAnalysis(stockTable)
  942. if len(items) > 0 {
  943. rows = append(rows, items...)
  944. }
  945. }
  946. fmt.Println(rows)
  947. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  948. return
  949. }
  950. // AnalysisDuanXianWeekEdb 解析短纤周报中的指标数据
  951. func AnalysisDuanXianWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  952. if len(htm) == 0 || fetchRule == nil {
  953. utils.FileLog.Info("htm empty")
  954. return
  955. }
  956. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  957. if e != nil {
  958. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  959. return
  960. }
  961. mainTitle := "主要运行指数"
  962. // 从收藏按钮往上找table, 取出报告发布日期
  963. collectEle := doc.Find("#savenews")
  964. publishTimeTab := collectEle.ParentsFiltered("table").First()
  965. publishTxt := publishTimeTab.Find("td:first-child").Text()
  966. //fmt.Println("publishTxt: ", publishTxt)
  967. publishTime, e := extractReportPublishTime(publishTxt)
  968. if e != nil {
  969. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  970. return
  971. }
  972. //publishYear := publishTime.Year()
  973. //fmt.Println(publishTime)
  974. //fmt.Println(publishYear)
  975. // 查找包含文本的<p>元素
  976. mainElement := doc.Find(fmt.Sprintf("p:contains('%s')", mainTitle))
  977. if mainElement.Length() <= 0 {
  978. err = fmt.Errorf("未找到p标签, keyword: %s", mainTitle)
  979. return
  980. }
  981. table := mainElement.NextAllFiltered("table").First()
  982. if table.Length() <= 0 {
  983. err = fmt.Errorf("未找到p标签后的table, keyword: %s", mainTitle)
  984. return
  985. }
  986. var analysisPars AnalysisNoneMergeTablePars
  987. analysisPars.DocTable = table
  988. analysisPars.DateCol.StartIndex = 1
  989. analysisPars.DateCol.EndIndex = 2
  990. analysisPars.DateCol.PublishTime = publishTime
  991. //analysisPars.DateCol.PublishYear = publishYear
  992. analysisPars.DateCol.StrTimeFormat = "%d年%s"
  993. analysisPars.DateCol.TimeFormat = []string{"2006年1月2日"}
  994. rows := analysisNoneMergeTable(analysisPars)
  995. fmt.Println(rows)
  996. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  997. return
  998. }
  999. // AnalysisPingPianWeekEdb 解析瓶片周报中的指标数据
  1000. func AnalysisPingPianWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  1001. if len(htm) == 0 || fetchRule == nil {
  1002. utils.FileLog.Info("htm empty")
  1003. return
  1004. }
  1005. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  1006. if e != nil {
  1007. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  1008. return
  1009. }
  1010. mainTitle := "周均负荷指数"
  1011. // 从收藏按钮往上找table, 取出报告发布日期
  1012. collectEle := doc.Find("#savenews")
  1013. publishTimeTab := collectEle.ParentsFiltered("table").First()
  1014. publishTxt := publishTimeTab.Find("td:first-child").Text()
  1015. //fmt.Println("publishTxt: ", publishTxt)
  1016. publishTime, e := extractReportPublishTime(publishTxt)
  1017. if e != nil {
  1018. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  1019. return
  1020. }
  1021. //publishYear := publishTime.Year()
  1022. //fmt.Println(publishTime)
  1023. //fmt.Println(publishYear)
  1024. // 查找包含文本的<p>元素
  1025. mainElement := doc.Find(fmt.Sprintf("h2:contains('%s')", mainTitle))
  1026. if mainElement.Length() <= 0 {
  1027. err = fmt.Errorf("未找到p标签, keyword: %s", mainTitle)
  1028. return
  1029. }
  1030. table := mainElement.NextAllFiltered("table").First()
  1031. if table.Length() <= 0 {
  1032. err = fmt.Errorf("未找到p标签后的table, keyword: %s", mainTitle)
  1033. return
  1034. }
  1035. var analysisPars AnalysisNoneMergeTablePars
  1036. analysisPars.DocTable = table
  1037. analysisPars.DateCol.StartIndex = 1
  1038. analysisPars.DateCol.EndIndex = 3
  1039. analysisPars.DateCol.PublishTime = publishTime
  1040. //analysisPars.DateCol.PublishYear = publishYear
  1041. analysisPars.DateCol.StrTimeFormat = "%d.%s"
  1042. analysisPars.DateCol.TimeFormat = []string{"2006.1.2"}
  1043. analysisPars.DateCol.SplitLast = true
  1044. analysisPars.DateCol.SplitFlag = "-"
  1045. rows := analysisNoneMergeTable(analysisPars)
  1046. fmt.Println(rows)
  1047. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  1048. return
  1049. }
  1050. // AnalysisQiePianWeekEdb 解析切片周报中的指标数据
  1051. func AnalysisQiePianWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  1052. if len(htm) == 0 || fetchRule == nil {
  1053. utils.FileLog.Info("htm empty")
  1054. return
  1055. }
  1056. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  1057. if e != nil {
  1058. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  1059. return
  1060. }
  1061. mainTitle := "切片纺方面"
  1062. // 从收藏按钮往上找table, 取出报告发布日期
  1063. collectEle := doc.Find("#savenews")
  1064. publishTimeTab := collectEle.ParentsFiltered("table").First()
  1065. publishTxt := publishTimeTab.Find("td:first-child").Text()
  1066. //fmt.Println("publishTxt: ", publishTxt)
  1067. publishTime, e := extractReportPublishTime(publishTxt)
  1068. if e != nil {
  1069. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  1070. return
  1071. }
  1072. //publishYear := publishTime.Year()
  1073. //fmt.Println(publishTime)
  1074. //fmt.Println(publishYear)
  1075. // 查找包含关键词的标签
  1076. mainElement := doc.Find(fmt.Sprintf("h2:contains('%s')", mainTitle))
  1077. if mainElement.Length() <= 0 {
  1078. err = fmt.Errorf("未找到关键词标签, keyword: %s", mainTitle)
  1079. return
  1080. }
  1081. table := mainElement.NextAllFiltered("table").First()
  1082. if table.Length() <= 0 {
  1083. err = fmt.Errorf("未找到p标签后的table, keyword: %s", mainTitle)
  1084. return
  1085. }
  1086. var analysisPars AnalysisNoneMergeTablePars
  1087. analysisPars.DocTable = table
  1088. analysisPars.DateCol.StartIndex = 1
  1089. analysisPars.DateCol.EndIndex = 3
  1090. analysisPars.DateCol.PublishTime = publishTime
  1091. //analysisPars.DateCol.PublishYear = publishYear
  1092. analysisPars.DateCol.StrTimeFormat = ""
  1093. analysisPars.DateCol.TimeFormat = []string{"2006-1-2", "2006/1/2"}
  1094. analysisPars.ValCol.SplitHalfVal = true
  1095. rows := analysisNoneMergeTable(analysisPars)
  1096. fmt.Println(rows)
  1097. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  1098. return
  1099. }
  1100. // AnalysisPXWeekEdb 解析PX周报中的指标数据
  1101. func AnalysisPXWeekEdb(htm []byte, fetchRule *DataRule) (indexes []*HandleIndexData, err error) {
  1102. if len(htm) == 0 || fetchRule == nil {
  1103. utils.FileLog.Info("htm empty")
  1104. return
  1105. }
  1106. doc, e := goquery.NewDocumentFromReader(strings.NewReader(string(htm)))
  1107. if e != nil {
  1108. err = fmt.Errorf("NewDocumentFromReader err: %v", e)
  1109. return
  1110. }
  1111. mainTitle := "负荷指数"
  1112. // 从收藏按钮往上找table, 取出报告发布日期
  1113. collectEle := doc.Find("#savenews")
  1114. publishTimeTab := collectEle.ParentsFiltered("table").First()
  1115. publishTxt := publishTimeTab.Find("td:first-child").Text()
  1116. //fmt.Println("publishTxt: ", publishTxt)
  1117. publishTime, e := extractReportPublishTime(publishTxt)
  1118. if e != nil {
  1119. err = fmt.Errorf("extractReportPublishTime err: %v", e)
  1120. return
  1121. }
  1122. //publishYear := publishTime.Year()
  1123. //fmt.Println(publishTime)
  1124. //fmt.Println(publishYear)
  1125. // 查找包含关键词的标签
  1126. mainElement := doc.Find(fmt.Sprintf("h2:contains('%s')", mainTitle))
  1127. if mainElement.Length() <= 0 {
  1128. err = fmt.Errorf("未找到关键词标签, keyword: %s", mainTitle)
  1129. return
  1130. }
  1131. table := mainElement.NextAllFiltered("table").First()
  1132. if table.Length() <= 0 {
  1133. err = fmt.Errorf("未找到p标签后的table, keyword: %s", mainTitle)
  1134. return
  1135. }
  1136. var analysisPars AnalysisNoneMergeTablePars
  1137. analysisPars.DocTable = table
  1138. analysisPars.DateCol.StartIndex = 1
  1139. analysisPars.DateCol.EndIndex = 3
  1140. analysisPars.DateCol.PublishTime = publishTime
  1141. //analysisPars.DateCol.PublishYear = publishYear
  1142. analysisPars.DateCol.StrTimeFormat = "%d年%s"
  1143. analysisPars.DateCol.TimeFormat = []string{"2006年1月2日"}
  1144. rows := analysisNoneMergeTable(analysisPars)
  1145. fmt.Println(rows)
  1146. indexes = formatTableRow2ValidEdb(rows, fetchRule.EdbMatch)
  1147. return
  1148. }
  1149. // FetchHistoryFiles 获取历史文件
  1150. func FetchHistoryFiles(context.Context) {
  1151. var err error
  1152. defer func() {
  1153. if err != nil {
  1154. tips := fmt.Sprintf("FetchEdbHistoryFiles ErrMsg: %s", err.Error())
  1155. utils.FileLog.Info(tips)
  1156. fmt.Println(tips)
  1157. }
  1158. }()
  1159. taskNames := []string{"原油石化早报", "PTA周报", "MEG周报", "长丝周报", "短纤周报", "瓶片周报", "切片周报", "PX周报", "PTA装置", "MEG装置", "PX装置"}
  1160. //taskNames := []string{"原油石化早报"}
  1161. for _, nameKey := range taskNames {
  1162. fmt.Println("开始获取: ", nameKey)
  1163. fetchRule, e := loadDataRule(nameKey)
  1164. if e != nil {
  1165. err = fmt.Errorf("loadDataRule, err: %v", e)
  1166. return
  1167. }
  1168. _, e = savePageHtml(nameKey, fetchRule.PageDir, true, 0)
  1169. if e != nil {
  1170. err = fmt.Errorf("savePageHtml, err: %v", e)
  1171. return
  1172. }
  1173. fmt.Println("结束获取: ", nameKey)
  1174. }
  1175. return
  1176. }
  1177. // ReadEdbHistoryFiles 读取历史文件
  1178. func ReadEdbHistoryFiles(context.Context) {
  1179. var err error
  1180. defer func() {
  1181. if err != nil {
  1182. tips := fmt.Sprintf("ReadEdbHistoryFiles ErrMsg: %s", err.Error())
  1183. utils.FileLog.Info(tips)
  1184. fmt.Println(tips)
  1185. }
  1186. }()
  1187. taskNames := []string{"原油石化早报", "PTA周报", "MEG周报", "长丝周报", "短纤周报", "瓶片周报", "切片周报", "PX周报"}
  1188. //taskNames := []string{"原油石化早报", "PTA周报", "MEG周报", "长丝周报", "短纤周报", "瓶片周报", "切片周报", "PX周报"}
  1189. for _, nameKey := range taskNames {
  1190. fetchRule, e := loadDataRule(nameKey)
  1191. if e != nil {
  1192. utils.FileLog.Info(fmt.Sprintf("%s无解析规则, err: %v\n", nameKey, e))
  1193. continue
  1194. }
  1195. filePaths, e := listFiles(fetchRule.PageDir)
  1196. if e != nil {
  1197. utils.FileLog.Info(fmt.Sprintf("%s读取文件目录失败, err: %v\n", nameKey, e))
  1198. continue
  1199. }
  1200. for _, v := range filePaths {
  1201. //if k > 0 {
  1202. // break
  1203. //}
  1204. v = fmt.Sprintf("%s/%s", fetchRule.PageDir, v)
  1205. fmt.Printf("开始解析: %s", v)
  1206. //htm, e := ioutil.ReadFile("static/ccf/oil_daily/28-20240604-原油石化早报(6.7).html")
  1207. htm, e := ioutil.ReadFile(v)
  1208. if e != nil {
  1209. fmt.Printf("file: %s, ReadFile err: %v\n", v, e)
  1210. utils.FileLog.Info(fmt.Sprintf("file: %s, ReadFile err: %v", v, e))
  1211. continue
  1212. }
  1213. handler, ok := TaskAnalysisHandlers[nameKey]
  1214. if !ok {
  1215. utils.FileLog.Info(fmt.Sprintf("%s无解析函数\n", nameKey))
  1216. continue
  1217. }
  1218. indexes, e := handler(htm, fetchRule)
  1219. if e != nil {
  1220. fmt.Printf("file: %s, handler err: %v\n", v, e)
  1221. utils.FileLog.Info(fmt.Sprintf("file: %s, handler err: %v", v, e))
  1222. continue
  1223. }
  1224. // 写入数据库
  1225. params := make(map[string]interface{})
  1226. params["List"] = indexes
  1227. params["TerminalCode"] = utils.TerminalCode
  1228. result, e := postEdbLib(params, utils.LIB_ROUTE_CCF_EDB_HANDLE)
  1229. if e != nil {
  1230. b, _ := json.Marshal(params)
  1231. fmt.Printf("file: %s, postEdbLib err: %v, params: %s\n", v, e, string(b))
  1232. utils.FileLog.Info(fmt.Sprintf("file: %s, postEdbLib err: %v, params: %s", v, e, string(b)))
  1233. continue
  1234. }
  1235. resp := new(models.BaseEdbLibResponse)
  1236. if e = json.Unmarshal(result, &resp); e != nil {
  1237. fmt.Printf("file: %s, json.Unmarshal err: %v\n", v, e)
  1238. utils.FileLog.Info(fmt.Sprintf("file: %s, json.Unmarshal err: %v", v, e))
  1239. continue
  1240. }
  1241. if resp.Ret != 200 {
  1242. fmt.Printf("file: %s, Msg: %s, ErrMsg: %s\n", v, resp.Msg, resp.ErrMsg)
  1243. utils.FileLog.Info(fmt.Sprintf("file: %s, Msg: %s, ErrMsg: %s", v, resp.Msg, resp.ErrMsg))
  1244. continue
  1245. }
  1246. }
  1247. }
  1248. return
  1249. }