
Refactor into a class

liuyuqi-dellpc 3 years ago
commit 3d937a5edd
6 changed files with 170 additions and 133 deletions
  1. .gitignore (+2 -1)
  2. README.md (+1 -1)
  3. crawl_car/__init__.py (+1 -0)
  4. crawl_car/crawl_car.py (+145 -0)
  5. crwal122.py (+0 -131)
  6. run.py (+21 -0)

+ 2 - 1
.gitignore

@@ -1,2 +1,3 @@
 /.ipynb_checkpoints
-/data
+/data
+*.pyc

+ 1 - 1
README.md

@@ -11,7 +11,7 @@ source venv/bin/activate
 pip install -r requirements.txt
 
 
-python crwal122.py
+python run.py
 
 
 ```

+ 1 - 0
crawl_car/__init__.py

@@ -0,0 +1 @@
+from .crawl_car import *

+ 145 - 0
crawl_car/crawl_car.py

@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/09/07 16:38:41
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   Driving-test registration crawler
+'''
+
+import os
+import sys
+import re
+import time
+import logging
+from selenium import webdriver
+import pandas as pd
+
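+# Minimal enum shim: attribute access falls through to tuple.index, so
+# BrowserType.FIREFOX == 0, BrowserType.CHROME == 1, and so on.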
+class Enum(tuple):
+    __getattr__ = tuple.index
+
+BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])
+
+class CrawlCar():
+    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
+        self.__site = site
+        self.__save_folder = save_folder
+        self.__chapter_list = []
+
+        if not os.path.exists(self.__save_folder):
+            os.mkdir(self.__save_folder)
+
+        if BrowserType.FIREFOX == browser:
+            self.__browser = webdriver.Firefox()
+        elif BrowserType.CHROME == browser:
+            option = webdriver.ChromeOptions()
+            option.add_argument("lang=zh_CN.UTF-8")
+            option.add_argument(
+                "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
+            # option.add_argument("--headless")
+
+            # disable image loading to speed up crawling
+            prefs = {
+                'profile.default_content_setting_values.images': 2
+            }
+            option.add_experimental_option('prefs', prefs)
+            self.__browser = webdriver.Chrome(
+                executable_path=driver, options=option)
+            # clear the navigator.webdriver flag so the site cannot detect Selenium
+            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+                "source": """
+                    Object.defineProperty(navigator, 'webdriver', {
+                    get: () => undefined
+                    })
+                """})
+        elif BrowserType.IE == browser:
+            self.__browser = webdriver.Ie(driver)
+        elif BrowserType.SAFARI == browser:
+            self.__browser = webdriver.Safari(driver)
+        elif BrowserType.PHANTOMJS == browser:
+            self.__browser = webdriver.PhantomJS(driver)
+        else:
+            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
+        logging.basicConfig(
+            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s', level=logging.INFO)
+
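+    # NOTE: relying on __del__ is best-effort; Python gives no guarantee when
+    # (or whether) it runs, so the browser may briefly outlive the object.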
+    def __del__(self):
+        self.__browser.quit()
+
+    def getLink(self):
+        self.__browser.implicitly_wait(10)
+        self.__browser.get(self.__site)  # load the page
+        link_path = self.__save_folder + "/link.csv"
+        if not os.path.exists(link_path):
+            link = []  # collected [month, href] pairs
+            for i in range(5):
+                monthData = self.__browser.find_elements_by_css_selector(
+                    "#querylist li a")
+                # collect every month link on this page
+                for el in monthData:
+                    print(el.text, el.get_attribute("href"))
+                    link.append([el.text, el.get_attribute("href")])
+                # find the next-page button and click it
+                self.__browser.find_element_by_xpath(
+                    '//*[@id="pppagination"]/ul/li[2]/a').click()
+                # cookie = getCookie()
+                # print(cookie)
+            link = pd.DataFrame(link)
+            link.to_csv(link_path, header=False)
+        else:
+            link = pd.read_csv(link_path, names=["month", "link"])
+        return link
+
+    def download(self, url, save_path):
+        pass
+
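+    # Returns cookies as a list of "name=value;" strings, ready to be joined
+    # into a single Cookie request header.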
+    def getCookie(self):
+        cookie = self.__browser.get_cookies()
+        cookie_dict = []
+        for c in cookie:
+            ck = "{0}={1};".format(c['name'], c['value'])
+            cookie_dict.append(ck)
+        return cookie_dict
+
+    def crawl(self):
+        link_path = self.__save_folder + "/link.csv"
+        link = pd.read_csv(link_path, names=["month", "link"])
+        for i in range(len(link)):
+            link1 = link.loc[i]["link"]    # link URL
+            month1 = link.loc[i]["month"]  # month label
+            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
+                self.__browser.implicitly_wait(10)
+                self.__browser.get(link1)
+                # work out how many records and pages there are
+                try:
+                    text = self.__browser.find_element_by_xpath(
+                        '//*[@id="pagination"]/span').text  # may raise NoSuchElementException
+                    # sample pager text: "共2391条记录 1/120页" (2391 records in total, page 1/120)
+                    pagesize = re.split(
+                        "[/页]", re.search("/.*页  ", text).group())[1]
+                    reportData = pd.DataFrame(
+                        columns=["date", "place", "course1", "course2", "course3", "course4"])
+                    for i in range(int(pagesize)):
+                        # scrape this page's table rows
+                        trlist = self.__browser.find_elements_by_tag_name("tr")
+                        for row in trlist:
+                            tdlist = row.find_elements_by_tag_name("td")
+                            tmp = []
+                            for col in tdlist:
+                                tmp.append(col.text)
+                            reportData = reportData.append(
+                                pd.Series(tmp, index=reportData.columns), ignore_index=True)  # append one row
+                        
+                        # click next page; quirk: the link is span/a[2] on page 1 but span/a[3] from page 2 on
+                        if i > 0:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[3]').click()
+                        else:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[2]').click()
+                    reportData.to_csv(self.__save_folder +
+                                      "/report" + month1 + ".csv", header=False)
+                except Exception as e:
+                    print(e)
+
+    def start(self):
+        self.getLink()  # ensure link.csv exists before crawling
+        self.crawl()

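The pager-parsing line in crawl() is worth a sanity check, since the pattern hard-codes two trailing spaces after 页. A minimal sketch against the sample text quoted in the comment:

```python
import re

# sample pager text: "共2391条记录 1/120页" ("2391 records in total, page 1/120");
# the pattern requires two trailing spaces after 页
text = "共2391条记录 1/120页  "
pagesize = re.split("[/页]", re.search("/.*页  ", text).group())[1]
print(pagesize)  # -> "120"
```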
+ 0 - 131
crwal122.py

@@ -1,131 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-@Contact :   liuyuqi.gov@msn.cn
-@Time    :   2020/09/06 01:38:09
-@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
-@Desc    :   Crawl driving-test registration data from https://sh.122.gov.cn
-'''
-
-import pandas as pd
-import numpy as np
-import requests
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver import ActionChains
-
-import os
-import re
-import sys
-import time
-
-base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
-chormepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
-phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"
-
-link = []
-res = []
-save_path = r"download"
-link_path = r"data/link.csv"
-report_path = r"data/report.csv"
-# if not os.path.exists(save_path):
-#     os.mkdir(save_path)
-
-option = webdriver.ChromeOptions()
-option.add_argument("lang=zh_CN.UTF-8")
-option.add_argument("User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
-option.add_argument("--headless")
-
-# disable image loading
-prefs = {
-    'profile.default_content_setting_values.images': 2
-}
-option.add_experimental_option('prefs', prefs)
-
-# driver = webdriver.PhantomJS(executable_path=phantomjspath)
-driver = webdriver.Chrome(executable_path=chormepath, options=option)
-# driver.maximize_window()
-
-
-def getLink():
-    driver.implicitly_wait(10)
-    driver.get(base)  # load the page
-    if not os.path.exists(link_path):
-        for i in range(5):
-            monthData = driver.find_elements_by_css_selector(
-                "#querylist li a")
-            # collect every month link on this page
-            for i in monthData:
-                print(i.text, i.get_attribute("href"))
-                link.append([i.text, i.get_attribute("href")])
-            # find the next-page button and click it
-            driver.find_element_by_xpath(
-                '//*[@id="pppagination"]/ul/li[2]/a').click()
-            # cookie = getCookie()
-            # print(cookie)
-        link = pd.DataFrame(link)
-        link.to_csv(link_path, header=False)
-    else:
-        link = pd.read_csv(link_path, names=["month", "link"])
-    return link
-
-
-def download(url, save_path):
-    try:
-        with open(save_path, "wb") as file:
-            file.write(requests.get(url).raw)
-    except Exception as e:
-        print(e)
-
-
-def getCookie():
-    cookie = driver.get_cookies()
-    cookie_dict = []
-    for c in cookie:
-        ck = "{0}={1};".format(c['name'], c['value'])
-        cookie_dict.append(ck)
-    return cookie_dict
-
-
-def crawl():
-    global link
-    link = pd.read_csv(link_path, names=["month", "link"])
-    for i in range(len(link)):
-        link1 = link.loc[i]["link"]    # link URL
-        month1 = link.loc[i]["month"]  # month label
-        if not os.path.exists("/data/report" + month1 + ".csv"):
-            driver.implicitly_wait(10)
-            driver.get(link1)
-            # work out how many records and pages there are
-            try:
-                text = driver.find_element_by_xpath(
-                    '//*[@id="pagination"]/span').text  # may raise NoSuchElementException
-                # sample pager text: "共2391条记录 1/120页" (2391 records in total, page 1/120)
-                pagesize = re.split("[/页]", re.search("/.*页  ", text).group())[1]
-                reportData = pd.DataFrame(
-                    columns=["date", "place", "course1", "course2", "course3", "course4"])
-                for i in range(int(pagesize)):
-                    # scrape this page's table rows
-                    trlist = driver.find_elements_by_tag_name("tr")
-                    for row in trlist:
-                        tdlist = row.find_elements_by_tag_name("td")
-                        tmp = []
-                        for col in tdlist:
-                            tmp.append(col.text)
-                        reportData = reportData.append(
-                            pd.Series(tmp, index=reportData.columns), ignore_index=True)  # append one row
-                # click next page; quirk: the link is span/a[2] on page 1 but span/a[3] from page 2 on
-                    if i > 0:
-                        driver.find_element_by_xpath(
-                            '//*[@id="pagination"]/span/a[3]').click()
-                    else:
-                        driver.find_element_by_xpath(
-                            '//*[@id="pagination"]/span/a[2]').click()
-            except Exception as e:
-                print(e)
-            reportData.to_csv("data/report" + month1 + ".csv", header=False)
-    driver.close()
-
-if __name__ == "__main__":
-    crawl()

+ 21 - 0
run.py

@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/09/07 16:39:11
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+'''
+import crawl_car as cc
+
+site = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
+chromepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
+
+if __name__ == '__main__':
+    crawler = cc.CrawlCar(
+        site=site,
+        save_folder='./data', 
+        browser=cc.BrowserType.CHROME,
+        driver=chromepath  # path to the driver executable; not needed for Firefox
+    )
+    crawler.start()
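
For comparison, a minimal sketch of the same entry point driving the Firefox branch of CrawlCar, assuming geckodriver is already on PATH (the constructor calls webdriver.Firefox() with no arguments, so no driver path is passed):

```python
import crawl_car as cc

site = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"

if __name__ == '__main__':
    # BrowserType.FIREFOX is the default; geckodriver on PATH is an assumption here
    crawler = cc.CrawlCar(site=site, save_folder='./data',
                          browser=cc.BrowserType.FIREFOX)
    crawler.start()
```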