@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact : liuyuqi.gov@msn.cn
+@Time : 2020/09/07 16:38:41
+@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc : Crawler for driving-test registration data
+'''
+
+import os
+import sys
+import re
+import time
+import logging
+from selenium import webdriver
+import pandas as pd
+
+
+class Enum(tuple):
+    '''Lightweight enum: attribute access resolves to the member's index in the tuple.'''
+    __getattr__ = tuple.index
+
+
+BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])
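+# For illustration: BrowserType.FIREFOX == 0, BrowserType.CHROME == 1, and so on.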
+
+
+class CrawlCar():
+
+    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
+        self.__site = site
+        self.__save_folder = save_folder
+        self.__chapter_list = []
+
+        if not os.path.exists(self.__save_folder):
+            os.mkdir(self.__save_folder)
+
+        if BrowserType.FIREFOX == browser:
+            self.__browser = webdriver.Firefox()
+        elif BrowserType.CHROME == browser:
+            option = webdriver.ChromeOptions()
+            option.add_argument("lang=zh_CN.UTF-8")
+            option.add_argument(
+                "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
+            # option.add_argument("--headless")
+
+            # do not load images
+            prefs = {
+                'profile.default_content_setting_values.images': 2
+            }
+            option.add_experimental_option('prefs', prefs)
+            self.__browser = webdriver.Chrome(
+                executable_path=driver, options=option)
+            # clear window.navigator.webdriver so the page cannot detect Selenium
+            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+                "source": """
+                Object.defineProperty(navigator, 'webdriver', {
+                    get: () => undefined
+                })
+                """})
+        elif BrowserType.IE == browser:
+            self.__browser = webdriver.Ie(driver)
+        elif BrowserType.SAFARI == browser:
+            self.__browser = webdriver.Safari(driver)
+        elif BrowserType.PHANTOMJS == browser:
+            self.__browser = webdriver.PhantomJS(driver)
+        else:
+            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
+        logging.basicConfig(
+            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s', level=logging.INFO)
+
+    def __del__(self):
+        self.__browser.quit()
+
+    def getLink(self):
+        '''Collect (month, link) pairs from the listing page and cache them in link.csv.'''
+        self.__browser.implicitly_wait(10)
+        self.__browser.get(self.__site)  # load the page
+        link_path = self.__save_folder + "/link.csv"
+        if not os.path.exists(link_path):
+            link = []
+            for i in range(5):
+                monthData = self.__browser.find_elements_by_css_selector(
+                    "#querylist li a")
+                # collect every month shown on this page
+                for item in monthData:
+                    print(item.text, item.get_attribute("href"))
+                    link.append([item.text, item.get_attribute("href")])
+                # locate the "next" button and click it
+                self.__browser.find_element_by_xpath(
+                    '//*[@id="pppagination"]/ul/li[2]/a').click()
+            # cookie = getCookie()
+            # print(cookie)
+            link = pd.DataFrame(link)
+            link.to_csv(link_path, header=False)
+        else:
+            link = pd.read_csv(link_path, names=["month", "link"])
+        return link
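+
+    # Note on the cached file (inferred from getLink above): link.csv is written
+    # without a header, one row per month (index, month text, href), and is read
+    # back with names=["month", "link"].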
+
+    def download(self, url, save_path):
+        pass
+
+    def getCookie(self):
+        '''Return the browser cookies as a list of "name=value;" strings.'''
+        cookie = self.__browser.get_cookies()
+        cookie_dict = []
+        for c in cookie:
+            ck = "{0}={1};".format(c['name'], c['value'])
+            cookie_dict.append(ck)
+        return cookie_dict
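+
+    # Usage sketch (illustrative, not from the original code): the pieces can be
+    # joined into a single Cookie header value, e.g. "".join(crawler.getCookie()).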
+
+    def crawl(self):
+        '''For every cached month link, scrape the paginated report table into a CSV.'''
+        link_path = self.__save_folder + "/link.csv"
+        link = pd.read_csv(link_path, names=["month", "link"])
+        for i in range(len(link)):
+            link1 = link.loc[i]["link"]    # link
+            month1 = link.loc[i]["month"]  # month
+            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
+                self.__browser.implicitly_wait(10)
+                self.__browser.get(link1)
+                reportData = pd.DataFrame(
+                    columns=["date", "place", "course1", "course2", "course3", "course4"])
+                # work out how many records and pages there are
+                try:
+                    text = self.__browser.find_element_by_xpath(
+                        '//*[@id="pagination"]/span').text  # may raise if the element is missing
+                    # sample text: 共2391条记录 1/120页 (2391 records in total, page 1/120)
+                    pagesize = re.split(
+                        "[/页]", re.search("/.*页 ", text).group())[1]
+                    for page in range(int(pagesize)):
+                        # read the table rows on the current page
+                        trlist = self.__browser.find_elements_by_tag_name("tr")
+                        for row in trlist:
+                            tdlist = row.find_elements_by_tag_name("td")
+                            tmp = []
+                            for col in tdlist:
+                                tmp.append(col.text)
+                            if len(tmp) != len(reportData.columns):
+                                continue  # skip header or malformed rows
+                            reportData = reportData.append(
+                                pd.Series(tmp, index=reportData.columns), ignore_index=True)  # append one row
+
+                        # click "next": on the first page the button is span/a[2],
+                        # from the second page onwards it is span/a[3]
+                        if page > 0:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[3]').click()
+                        else:
+                            self.__browser.find_element_by_xpath(
+                                '//*[@id="pagination"]/span/a[2]').click()
+                except Exception as e:
+                    print(e)
+                reportData.to_csv(self.__save_folder +
+                                  "/report" + month1 + ".csv", header=False)
+
+    def start(self):
+        self.crawl()
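+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only): the site URL and the chromedriver
+    # path below are placeholders, not values taken from the original project.
+    crawler = CrawlCar("https://example.com/jiakao", save_folder="data",
+                       browser=BrowserType.CHROME, driver="./chromedriver")
+    crawler.getLink()
+    crawler.start()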