Browse Source

涌益生猪爬虫

xyxie 1 year ago
parent
commit
4145df3676
1 changed files with 58 additions and 75 deletions
  1. 58 75
      yongyi_pig/yongyi_pig_manual.py

+ 58 - 75
yongyi_pig/yongyi_pig_manual.py

@@ -1,7 +1,9 @@
 # coding:utf-8
+import os
 from time import sleep
 import datetime
 import openpyxl
+import rarfile as rarfile
 import requests
 from selenium import webdriver
 
@@ -14,74 +16,19 @@ from selenium.webdriver.support.wait import WebDriverWait
 
 from imgcode_ak import image_code
 
-"""
-根据table的id属性和table中的某一个元素定位其在table中的位置
-table包括表头,位置坐标都是从1开始算
-tableId:table的id属性
-queryContent:需要确定位置的内容
-"""
-
-
-def get_table_content(driver, tableId, queryContent):
-    # 按行查询表格的数据,取出的数据是一整行,按空格分隔每一列的数据
-    table_tr_list = driver.find_element(By.ID, tableId).find_elements(By.TAG_NAME, "tr")
-    table_list = []  # 存放table数据
-    for tr in table_tr_list:  # 遍历每一个tr
-        # 将每一个tr的数据根据td查询出来,返回结果为list对象
-        table_td_list = tr.find_elements(By.TAG_NAME, "td")
-        row_list = []
-        print(table_td_list)
-        for td in table_td_list:  # 遍历每一个td
-            row_list.append(td.text)  # 取出表格的数据,并放入行列表里
-        table_list.append(row_list)
-
-    # 循环遍历table数据,确定查询数据的位置
-    # for i in range(len(table_list)):
-    #     for j in range(len(table_list[i])):
-    #         if queryContent == table_list[i][j]:
-    #             print("%r坐标为(%r,%r)" % (queryContent, i + 1, j + 1))
-
-
-# 写入文件
-def write_excel_xlsx(path, sheet_name, value):
-    index = len(value)  # 列表中所含元组的个数,从而确定写入Excel的行数
-    # 打开Excel
-    wb = openpyxl.Workbook()
-    # wb = load_workbook(path)
-    sheet = wb.active  # 获得一个的工作表
-    sheet.title = sheet_name
-    # 设置格式
-    sheet.column_dimensions['B'].width = 115
-    # 按行加入
-    for i in range(index):
-        sheet.append(value[i])
-    # 保存文件
-    print(sheet.values)
-    wb.save(path)
-    print("题目写入数据成功!")
-
-
-def send_file(url, file_path):
-    with open(file_path, 'rb') as file:
-        files = {'file': file}
-        response2 = requests.post(url, files=files)
-    return response2
-
-
-def get_element(my_driver, xpaths):
-    """
-    判断是否存在元素并获取元素对象
-    :param my_driver:
-    :param xpaths: xpaths表达式
-    :return: 元素对象或为空
-    """
-    try:
-        target = my_driver.find_element(By.XPATH, xpaths)
-    except exceptions.NoSuchElementException:
-        return False
-    else:
-        return target
-
+def rename_week_file(new_dir, current_time, rar_name):
+    """
+    Move the extracted weekly data file into new_dir under a dated name.
+
+    Looks one level below rar_name: every sub-directory is listed and the first
+    file whose name contains "周度数据" is renamed to
+    "<new_dir>/<current_time>_week.xlsx". Nothing is renamed when no file
+    matches. Each visited directory entry and file name is printed.
+    :param new_dir: directory that receives the renamed file
+    :param current_time: text used as the prefix of the new file name
+    :param rar_name: directory whose sub-directories are searched
+    :return: None
+    """
+    for entry in os.listdir(rar_name):
+        print(entry)
+        # Build the path from the real entry name and from rar_name: str.title()
+        # re-cases the name, and a bare name would be resolved against the
+        # current working directory instead of rar_name.
+        sub_dir = os.path.join(rar_name, entry)
+        if not os.path.isdir(sub_dir):
+            continue
+        for file_name in os.listdir(sub_dir):
+            print(file_name)
+            if "周度数据" in file_name:
+                new_name = f'{new_dir}/{current_time}_week.xlsx'
+                os.rename(os.path.join(sub_dir, file_name), new_name)
+                return
 
 if __name__ == "__main__":
     # python+selunium定位已打开的浏览器
@@ -102,13 +49,15 @@ if __name__ == "__main__":
     # 后面你只需要python + selenium + webdriver定位到这个已经登录的浏览器进行操作就可以啦
     options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
     # 修改下载地址
-    options.add_argument("--download.default_directory=/Users/xiexiaoyuan/Downloads/")
+    # save_to_dir = '/Users/xiexiaoyuan/Downloads'
+    save_to_dir = r'D:\download\excel'
+    options.add_argument("--download.default_directory="+save_to_dir)
     options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, '
                           'like Gecko) Chrome/118.0.5993.70 Safari/537.36')
     options.add_argument(" window-size=1920,1080")
 
-    s = Service(executable_path='/Users/xiexiaoyuan/chromedriver_mac64_114/chromedriver')
-    # s = Service(executable_path='/Users/xi/Desktop/chromedriver')
+    # s = Service(executable_path='/Users/xiexiaoyuan/chromedriver_mac64_114/chromedriver')
+    s = Service(executable_path='D:\download\chromedriver119-win64\chromedriver.exe')
     driver = webdriver.Chrome(service=s, options=options)
     # driver.maximize_window()
     driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
@@ -131,10 +80,44 @@ if __name__ == "__main__":
     a.click()
 
     # 下载涌溢完整数据库
-    sleep(1)
+    sleep(2)
     b = driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/div[2]/div[3]/a')
     print(b.get_attribute("href"))
     b.click()
-    sleep(10)
-    # WebDriverWait(driver, 10).until(
-    #     EC.element_to_be_clickable((By.XPATH, '/html/body/div[4]/div[1]/div[2]/div[2]/a'))).click()
+    sleep(30)
+
+    # 获取当前时间,并将其格式化为指定的形式
+    current_time = datetime.datetime.now().strftime("%Y-%m-%d")
+    # 查找文件并重命名
+    os.chdir(save_to_dir)
+    files = filter(os.path.isfile, os.listdir(save_to_dir))
+    files = [os.path.join(save_to_dir, f) for f in files]  # add path to each file
+    files.sort(key=lambda x: os.path.getmtime(x))
+    day_file = files[-1]
+    # new_dir = '/Users/xiexiaoyuan/Downloads/yongyi'
+    new_dir = r'D:\data\yongyi'
+    if day_file.title().find("日度") == -1:
+        day_file = files[-2]
+
+    if day_file.title().find("日度") != -1:
+        new_name = f'{new_dir}/{current_time}_day.xlsx'
+        os.rename(day_file.title(), new_name)
+    else:
+        print("未找到日度下载文件")
+
+    week_file = files[-2]
+    if week_file.title().find("周度") == -1:
+        week_file = files[-1]
+    print(week_file.title())
+    if week_file.title().find("周度") != -1:
+        filename = week_file.title()
+        index = filename.find(".Rar")
+        rar_name = filename[:index]
+        # 解压缩
+        rar_file = rarfile.RarFile(filename, 'r')
+        rar_file.extractall(rar_name)
+        rar_file.close()
+
+        rename_week_file(new_dir, current_time, rar_name)
+    else:
+        print("未找到周度下载文件")