@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact : liuyuqi.gov@msn.cn
+@Time : 2020/09/07 16:38:41
+@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc : Crawler for driving-test registration data
+'''
+
+import os
+import sys
+import re
+import time
+import logging
+from selenium import webdriver
+import pandas as pd
+
+
+class Enum(tuple):
+    '''Lightweight enum: attribute access resolves to the member's index in the tuple.'''
+    __getattr__ = tuple.index
+
+
+BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])
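+# For illustration: BrowserType.FIREFOX == 0, BrowserType.CHROME == 1, and so on.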
+
+
+class CrawlCar():
+
+    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
+        self.__site = site
+        self.__save_folder = save_folder
+        self.__chapter_list = []
+
+        if not os.path.exists(self.__save_folder):
+            os.mkdir(self.__save_folder)
+
+        if BrowserType.FIREFOX == browser:
+            self.__browser = webdriver.Firefox()
+        elif BrowserType.CHROME == browser:
+            option = webdriver.ChromeOptions()
+            option.add_argument("lang=zh_CN.UTF-8")
+            option.add_argument(
+                "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
+            # option.add_argument("--headless")
+
+            # do not load images
+            prefs = {
+                'profile.default_content_setting_values.images': 2
+            }
+            option.add_experimental_option('prefs', prefs)
+            self.__browser = webdriver.Chrome(
+                executable_path=driver, options=option)
+            # clear window.navigator.webdriver so the page cannot detect Selenium
+            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+                "source": """
+                Object.defineProperty(navigator, 'webdriver', {
+                    get: () => undefined
+                })
+                """})
+        elif BrowserType.IE == browser:
+            self.__browser = webdriver.Ie(driver)
+        elif BrowserType.SAFARI == browser:
+            self.__browser = webdriver.Safari(driver)
+        elif BrowserType.PHANTOMJS == browser:
+            self.__browser = webdriver.PhantomJS(driver)
+        else:
+            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
+        logging.basicConfig(
+            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s', level=logging.INFO)
+
+    def __del__(self):
+        self.__browser.quit()
+
+    def getLink(self):
+        '''Collect (month, link) pairs from the listing page and cache them in link.csv.'''
+        self.__browser.implicitly_wait(10)
+        self.__browser.get(self.__site)  # load the page
+        link_path = self.__save_folder + "/link.csv"
+        if not os.path.exists(link_path):
+            link = []
+            for i in range(5):
+                monthData = self.__browser.find_elements_by_css_selector(
+                    "#querylist li a")
+                # collect every month shown on this page
+                for item in monthData:
+                    print(item.text, item.get_attribute("href"))
+                    link.append([item.text, item.get_attribute("href")])
+                # locate the "next" button and click it
+                self.__browser.find_element_by_xpath(
+                    '//*[@id="pppagination"]/ul/li[2]/a').click()
+            # cookie = getCookie()
+            # print(cookie)
+            link = pd.DataFrame(link)
+            link.to_csv(link_path, header=False)
+        else:
+            link = pd.read_csv(link_path, names=["month", "link"])
+        return link
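+
+    # Note on the cached file (inferred from getLink above): link.csv is written
+    # without a header, one row per month (index, month text, href), and is read
+    # back with names=["month", "link"].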
+
+    def download(self, url, save_path):
+        pass
+
+    def getCookie(self):
+        '''Return the browser cookies as a list of "name=value;" strings.'''
+        cookie = self.__browser.get_cookies()
+        cookie_dict = []
+        for c in cookie:
+            ck = "{0}={1};".format(c['name'], c['value'])
+            cookie_dict.append(ck)
+        return cookie_dict
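+
+    # Usage sketch (illustrative, not from the original code): the pieces can be
+    # joined into a single Cookie header value, e.g. "".join(crawler.getCookie()).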
+
+    def crawl(self):
+        '''For every cached month link, scrape the paginated report table into a CSV.'''
+        link_path = self.__save_folder + "/link.csv"
+        link = pd.read_csv(link_path, names=["month", "link"])
+        for i in range(len(link)):
+            link1 = link.loc[i]["link"]    # link
+            month1 = link.loc[i]["month"]  # month
+            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
+                self.__browser.implicitly_wait(10)
+                self.__browser.get(link1)
+                reportData = pd.DataFrame(
+                    columns=["date", "place", "course1", "course2", "course3", "course4"])
+                # work out how many records and pages there are
+                try:
+                    text = self.__browser.find_element_by_xpath(
+                        '//*[@id="pagination"]/span').text  # may raise if the element is missing
+                    # sample text: 共2391条记录 1/120页 (2391 records in total, page 1/120)
+                    pagesize = re.split(
+                        "[/页]", re.search("/.*页 ", text).group())[1]
+                    for page in range(int(pagesize)):
+                        # read the table rows on the current page
+                        trlist = self.__browser.find_elements_by_tag_name("tr")
+                        for row in trlist:
+                            tdlist = row.find_elements_by_tag_name("td")
+                            tmp = []
+                            for col in tdlist:
+                                tmp.append(col.text)
+                            if len(tmp) != len(reportData.columns):
+                                continue  # skip header or malformed rows
+                            reportData = reportData.append(
+                                pd.Series(tmp, index=reportData.columns), ignore_index=True)  # append one row
+
+                        # click "next": on the first page the button is span/a[2],
+                        # from the second page onwards it is span/a[3]
+                        if page > 0:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[3]').click()
+                        else:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[2]').click()
+                except Exception as e:
+                    print(e)
+                reportData.to_csv(self.__save_folder +
+                                  "/report" + month1 + ".csv", header=False)
+
+    def start(self):
+        self.crawl()
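+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only): the site URL and the chromedriver
+    # path below are placeholders, not values taken from the original project.
+    crawler = CrawlCar("https://example.com/jiakao", save_folder="data",
+                       browser=BrowserType.CHROME, driver="./chromedriver")
+    crawler.getLink()
+    crawler.start()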