|
@@ -3,14 +3,18 @@ package base_from_ccf
|
|
|
import (
|
|
|
"bytes"
|
|
|
"compress/gzip"
|
|
|
+ "context"
|
|
|
"encoding/json"
|
|
|
"eta/eta_data_analysis/utils"
|
|
|
"fmt"
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
+ "github.com/chromedp/cdproto/network"
|
|
|
+ "github.com/chromedp/chromedp"
|
|
|
"golang.org/x/net/html/charset"
|
|
|
"golang.org/x/text/encoding/simplifiedchinese"
|
|
|
"golang.org/x/text/transform"
|
|
|
- "io/ioutil"
|
|
|
+ "io"
|
|
|
+ "log"
|
|
|
"net/http"
|
|
|
"net/url"
|
|
|
"os"
|
|
@@ -43,7 +47,7 @@ func postEdbLib(param map[string]interface{}, method string) (result []byte, err
|
|
|
// httpPost HTTP请求
|
|
|
func httpPost(url, postData string, params ...string) ([]byte, error) {
|
|
|
fmt.Println("httpPost Url:" + url)
|
|
|
- body := ioutil.NopCloser(strings.NewReader(postData))
|
|
|
+ body := io.NopCloser(strings.NewReader(postData))
|
|
|
client := &http.Client{}
|
|
|
req, err := http.NewRequest("POST", url, body)
|
|
|
if err != nil {
|
|
@@ -60,8 +64,10 @@ func httpPost(url, postData string, params ...string) ([]byte, error) {
|
|
|
fmt.Println("client.Do err:" + err.Error())
|
|
|
return nil, err
|
|
|
}
|
|
|
- defer resp.Body.Close()
|
|
|
- b, err := ioutil.ReadAll(resp.Body)
|
|
|
+ defer func() {
|
|
|
+ _ = resp.Body.Close()
|
|
|
+ }()
|
|
|
+ b, err := io.ReadAll(resp.Body)
|
|
|
if err != nil {
|
|
|
fmt.Println("httpPost:" + string(b))
|
|
|
}
|
|
@@ -69,7 +75,7 @@ func httpPost(url, postData string, params ...string) ([]byte, error) {
|
|
|
}
|
|
|
|
|
|
// fetchPageHtml 获取网站HTML文本
|
|
|
-func fetchPageHtml(baseUrl string) (respBody []byte, err error) {
|
|
|
+func fetchPageHtml(baseUrl string, fetchNum int) (respBody []byte, err error) {
|
|
|
defer func() {
|
|
|
if err != nil {
|
|
|
tips := fmt.Sprintf("BuildCCFRequest ErrMsg: %s", err.Error())
|
|
@@ -77,26 +83,19 @@ func fetchPageHtml(baseUrl string) (respBody []byte, err error) {
|
|
|
fmt.Println(tips)
|
|
|
}
|
|
|
}()
|
|
|
+ // 查询次数
|
|
|
+ fetchNum++
|
|
|
if baseUrl == "" {
|
|
|
err = fmt.Errorf("CCF请求地址为空")
|
|
|
return
|
|
|
}
|
|
|
|
|
|
- // 读取Cookie
|
|
|
- if utils.CCFCookieFile == "" {
|
|
|
- err = fmt.Errorf("cookie文件未配置")
|
|
|
- return
|
|
|
- }
|
|
|
- cookieByte, e := ioutil.ReadFile(utils.CCFCookieFile)
|
|
|
+ // 获取Cookie
|
|
|
+ strCookie, e := getCookie()
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
|
|
|
return
|
|
|
}
|
|
|
- strCookie := strings.TrimSpace(string(cookieByte))
|
|
|
- if strCookie == "" {
|
|
|
- err = fmt.Errorf("cookie为空")
|
|
|
- return
|
|
|
- }
|
|
|
|
|
|
// 拉取网站内容
|
|
|
cli := new(http.Client)
|
|
@@ -137,7 +136,7 @@ func fetchPageHtml(baseUrl string) (respBody []byte, err error) {
|
|
|
err = fmt.Errorf("gzip NewReader err: %s", e.Error())
|
|
|
return
|
|
|
}
|
|
|
- body, e := ioutil.ReadAll(reader)
|
|
|
+ body, e := io.ReadAll(reader)
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("read body err: %s", e.Error())
|
|
|
return
|
|
@@ -149,12 +148,25 @@ func fetchPageHtml(baseUrl string) (respBody []byte, err error) {
|
|
|
err = fmt.Errorf("utf8 reader err: %s", e.Error())
|
|
|
return
|
|
|
}
|
|
|
- utf8Body, e := ioutil.ReadAll(utf8Reader)
|
|
|
+ utf8Body, e := io.ReadAll(utf8Reader)
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("utf8 body err: %s", e.Error())
|
|
|
return
|
|
|
}
|
|
|
respBody = utf8Body
|
|
|
+
|
|
|
+ isLoginPage := checkIsLoginPage(string(respBody))
|
|
|
+ fmt.Println("是否登录页:", isLoginPage)
|
|
|
+
|
|
|
+ // 如果是登录页,且查询次数少于2次,那么就重新登录后查询
|
|
|
+ if isLoginPage && fetchNum < 2 {
|
|
|
+ _, err = getCookieByChrome()
|
|
|
+ if err != nil {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ return fetchPageHtml(baseUrl, fetchNum)
|
|
|
+ }
|
|
|
+
|
|
|
return
|
|
|
}
|
|
|
|
|
@@ -198,7 +210,7 @@ func loadDataRule(nameKey string) (fetchRule *DataRule, err error) {
|
|
|
err = fmt.Errorf("rule文件不存在")
|
|
|
return
|
|
|
}
|
|
|
- b, e := ioutil.ReadFile(utils.CCFDataRuleFile)
|
|
|
+ b, e := os.ReadFile(utils.CCFDataRuleFile)
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("读取rule文件失败, err: %v", e)
|
|
|
return
|
|
@@ -264,7 +276,7 @@ func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (fil
|
|
|
firstPage := fmt.Sprintf(`%s&cur_pg_num=%d`, baseUrl, 1)
|
|
|
|
|
|
// 首页报告链接
|
|
|
- firstHtml, e := fetchPageHtml(firstPage)
|
|
|
+ firstHtml, e := fetchPageHtml(firstPage, 0)
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("获取首页HTML失败, err: %v", e)
|
|
|
return
|
|
@@ -295,7 +307,7 @@ func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (fil
|
|
|
// 每页28条数据, 需要带上页码*28的偏移量不然始终获取第一页
|
|
|
pageUrl := fmt.Sprintf(`%s&cur_pg_num=%d&cur_row_pos=%d`, baseUrl, i, i*28)
|
|
|
fmt.Println("pageUrl: ", pageUrl)
|
|
|
- pageContents, e := fetchPageHtml(pageUrl)
|
|
|
+ pageContents, e := fetchPageHtml(pageUrl, 0)
|
|
|
if e != nil {
|
|
|
err = fmt.Errorf("获取首页HTML失败, err: %v", e)
|
|
|
return
|
|
@@ -328,7 +340,7 @@ func savePageHtml(nameKey, saveDir string, historyPage bool, reportMax int) (fil
|
|
|
}
|
|
|
fmt.Printf("拉取报告: %s; url: %s\n", v.Title, v.Href)
|
|
|
|
|
|
- htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href))
|
|
|
+ htm, e := fetchPageHtml(fmt.Sprintf("%s%s", CCFReportDetailBaseUrl, v.Href), 0)
|
|
|
if e != nil {
|
|
|
utils.FileLog.Info("获取页面失败, err: %v", e)
|
|
|
continue
|
|
@@ -414,7 +426,9 @@ func writeHTMLToFile(content string, filePath string) error {
|
|
|
if err != nil {
|
|
|
return err
|
|
|
}
|
|
|
- defer file.Close()
|
|
|
+ defer func() {
|
|
|
+ _ = file.Close()
|
|
|
+ }()
|
|
|
|
|
|
// 将HTML内容写入文件
|
|
|
_, err = file.WriteString(content)
|
|
@@ -727,3 +741,134 @@ func formatIntervalData(cellTxt, flag string) string {
|
|
|
|
|
|
return fmt.Sprint(average)
|
|
|
}
|
|
|
+
|
|
|
+// getCookie
|
|
|
+// @Description: 获取cookie
|
|
|
+// @author: Roc
|
|
|
+// @datetime 2024-07-09 14:00:53
|
|
|
+// @return cookieStr string
|
|
|
+// @return err error
|
|
|
+func getCookie() (cookieStr string, err error) {
|
|
|
+ // 读取Cookie
|
|
|
+ if utils.CCFCookieFile == "" {
|
|
|
+ err = fmt.Errorf("cookie文件未配置")
|
|
|
+ return
|
|
|
+ }
|
|
|
+ cookieByte, e := os.ReadFile(utils.CCFCookieFile)
|
|
|
+ if e != nil {
|
|
|
+ err = fmt.Errorf("读取cookie文件失败, err: %s", e.Error())
|
|
|
+ return
|
|
|
+ }
|
|
|
+ cookieStr = strings.TrimSpace(string(cookieByte))
|
|
|
+ if cookieStr == "" {
|
|
|
+ err = fmt.Errorf("cookie为空")
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+// getCookieByChrome
|
|
|
+// @Description: 获取cookie
|
|
|
+// @author: Roc
|
|
|
+// @datetime 2024-07-09 14:00:53
|
|
|
+// @return cookieStr string
|
|
|
+// @return err error
|
|
|
+func getCookieByChrome() (cookieStr string, err error) {
|
|
|
+ // 读取Cookie
|
|
|
+ if utils.CCFUseName == "" {
|
|
|
+ err = fmt.Errorf("CCF账号未设置")
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if utils.CCFPassword == "" {
|
|
|
+ err = fmt.Errorf("CCF密码未设置")
|
|
|
+ return
|
|
|
+ }
|
|
|
+ opts := append(
|
|
|
+ chromedp.DefaultExecAllocatorOptions[:],
|
|
|
+ chromedp.Flag("headless", false),
|
|
|
+ )
|
|
|
+ allocCtx, cancel1 := chromedp.NewExecAllocator(context.Background(), opts...)
|
|
|
+ defer cancel1()
|
|
|
+
|
|
|
+ // 创建chrome实例
|
|
|
+ ctx, cancel2 := chromedp.NewContext(
|
|
|
+ allocCtx,
|
|
|
+ chromedp.WithLogf(log.Printf),
|
|
|
+ )
|
|
|
+ defer cancel2()
|
|
|
+ err = chromedp.Run(ctx,
|
|
|
+ chromedp.Navigate(`https://www.ccf.com.cn/member/member.php`),
|
|
|
+ chromedp.SetValue(`input[name="username"]`, utils.CCFUseName, chromedp.ByQuery),
|
|
|
+ chromedp.SetValue(`input[name="password"]`, utils.CCFPassword, chromedp.ByQuery),
|
|
|
+ chromedp.Sleep(2*time.Second),
|
|
|
+ chromedp.Click(`input[id="imageField"]`, chromedp.ByQuery),
|
|
|
+ chromedp.Sleep(5*time.Second),
|
|
|
+
|
|
|
+ chromedp.Navigate(`https://www.ccf.com.cn/newscenter/detail-410000-2024070600003.shtml`),
|
|
|
+ chromedp.Sleep(2*time.Second),
|
|
|
+ chromedp.ActionFunc(func(ctx context.Context) error {
|
|
|
+ cookies, err := network.GetCookies().Do(ctx)
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ //cookieJson, err := json.Marshal(cookies)
|
|
|
+ //if err != nil {
|
|
|
+ // return err
|
|
|
+ //}
|
|
|
+ //fmt.Println("cookieJson:", string(cookieJson))
|
|
|
+ //utils.FileLog.Info("cookieJson:" + string(cookieJson))
|
|
|
+ for _, v := range cookies {
|
|
|
+ cookieStr = cookieStr + v.Name + "=" + v.Value + ";"
|
|
|
+ }
|
|
|
+ //fmt.Println("header cookie:", cookieStr)
|
|
|
+ //utils.FileLog.Info("header cookie:" + cookieStr)
|
|
|
+
|
|
|
+ tmpFile, tmpErr := os.Create(utils.CCFCookieFile)
|
|
|
+ if tmpErr != nil {
|
|
|
+ fmt.Println("创建cookie文件失败:", tmpErr.Error())
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ if _, err := tmpFile.WriteString(cookieStr); err != nil {
|
|
|
+ fmt.Println("写入cookie到文件失败:", err.Error())
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+ }),
|
|
|
+ )
|
|
|
+
|
|
|
+ //if err != nil {
|
|
|
+ // fmt.Println(err)
|
|
|
+ //}
|
|
|
+
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+// checkIsLoginPage
|
|
|
+// @Description: 校验是否是登录页
|
|
|
+// @author: Roc
|
|
|
+// @datetime 2024-07-09 16:34:17
|
|
|
+// @param bodyStr string
|
|
|
+// @return isLoginPage bool
|
|
|
+func checkIsLoginPage(bodyStr string) (isLoginPage bool) {
|
|
|
+ // 初始化goquery.Document
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
|
|
|
+ if err != nil {
|
|
|
+ log.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 查找name为LoginForm的表单
|
|
|
+ doc.Find("form[name=LoginForm]").Each(func(i int, s *goquery.Selection) {
|
|
|
+ // 如果找到了,打印信息表示这是登录页
|
|
|
+ //fmt.Println("这是一个登录页面")
|
|
|
+ isLoginPage = true
|
|
|
+ return
|
|
|
+ })
|
|
|
+
|
|
|
+ // 如果没有找到,打印信息表示这不是登录页
|
|
|
+ //if doc.Find("form[name=LoginForm]").Length() == 0 {
|
|
|
+ // fmt.Println("这不是一个登录页面")
|
|
|
+ //}
|
|
|
+
|
|
|
+ return
|
|
|
+}
|