crawl_car.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/07 16:38:41
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Driving-test registration crawler
'''
import os
import re
import logging

import pandas as pd
from selenium import webdriver


class Enum(tuple):
    __getattr__ = tuple.index


BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])
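
# Attribute access on this Enum resolves to the member's tuple index, so
# BrowserType.FIREFOX == 0, BrowserType.CHROME == 1, and so on; the browser
# checks in CrawlCar.__init__ below are therefore plain integer comparisons.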


class CrawlCar():

    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
        self.__site = site
        self.__save_folder = save_folder
        self.__chapter_list = []
        if not os.path.exists(self.__save_folder):
            os.mkdir(self.__save_folder)
        if BrowserType.FIREFOX == browser:
            self.__browser = webdriver.Firefox()
        elif BrowserType.CHROME == browser:
            option = webdriver.ChromeOptions()
            option.add_argument("lang=zh_CN.UTF-8")
            option.add_argument(
                "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
            # option.add_argument("--headless")
            # do not load images
            prefs = {
                'profile.default_content_setting_values.images': 2
            }
            option.add_experimental_option('prefs', prefs)
            self.__browser = webdriver.Chrome(
                executable_path=driver, options=option)
            # hide the window.navigator.webdriver flag so pages cannot
            # detect Selenium through it
            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
                """})
        elif BrowserType.IE == browser:
            self.__browser = webdriver.Ie(driver)
        elif BrowserType.SAFARI == browser:
            self.__browser = webdriver.Safari(driver)
        elif BrowserType.PHANTOMJS == browser:
            self.__browser = webdriver.PhantomJS(driver)
        else:
            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
        logging.basicConfig(
            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s', level=logging.INFO)

    def __del__(self):
        self.__browser.quit()

    def getLink(self):
        '''Collect the per-month report links and cache them in link.csv.'''
        self.__browser.implicitly_wait(10)
        self.__browser.get(self.__site)  # load the page
        link_path = self.__save_folder + "/link.csv"
        if not os.path.exists(link_path):
            link = []  # was referenced below without being initialized
            for _ in range(5):
                # collect every month link on the current page
                monthData = self.__browser.find_elements_by_css_selector(
                    "#querylist li a")
                for item in monthData:
                    print(item.text, item.get_attribute("href"))
                    link.append([item.text, item.get_attribute("href")])
                # find the "next page" button and click it
                self.__browser.find_element_by_xpath(
                    '//*[@id="pppagination"]/ul/li[2]/a').click()
            # cookie = self.getCookie()
            # print(cookie)
            link = pd.DataFrame(link)
            link.to_csv(link_path, header=False)
        else:
            link = pd.read_csv(link_path, names=["month", "link"])
        return link

    def download(self, url, save_path):
        '''Not implemented yet.'''
        pass

    def getCookie(self):
        cookie = self.__browser.get_cookies()
        cookie_dict = []
        for c in cookie:
            ck = "{0}={1};".format(c['name'], c['value'])
            cookie_dict.append(ck)
        return cookie_dict
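
    # getCookie() returns entries like ["JSESSIONID=abc123;", "route=xyz;"]
    # (values illustrative); "".join(...) turns them into a single Cookie
    # header if pages are later fetched with a plain HTTP client.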

    def crawl(self):
        '''Scrape the report table for every month listed in link.csv.'''
        link_path = self.__save_folder + "/link.csv"
        link = pd.read_csv(link_path, names=["month", "link"])
        for i in range(len(link)):
            link1 = link.loc[i]["link"]    # report link
            month1 = link.loc[i]["month"]  # month label
            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
                self.__browser.implicitly_wait(10)
                self.__browser.get(link1)
                # created before the try block so the to_csv call below
                # still sees it if the page lookup fails
                reportData = pd.DataFrame(
                    columns=["date", "place", "course1", "course2", "course3", "course4"])
                # work out how many records and pages there are
                try:
                    text = self.__browser.find_element_by_xpath(
                        '//*[@id="pagination"]/span').text  # may raise NoSuchElementException
                    # e.g. "共2391条记录 1/120页" ("2391 records, page 1 of 120")
                    pagesize = re.split(
                        "[/页]", re.search("/.*页 ", text).group())[1]
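                    # Worked example (illustrative values): with
                    # text == "共2391条记录 1/120页 ",
                    # re.search("/.*页 ", text).group() -> "/120页 ",
                    # re.split("[/页]", "/120页 ") -> ['', '120', ' '],
                    # so pagesize == '120'. Note the pattern expects a
                    # trailing space after 页.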
                    for page in range(int(pagesize)):
                        # grab every table row on the current page
                        trlist = self.__browser.find_elements_by_tag_name("tr")
                        for row in trlist:
                            tdlist = row.find_elements_by_tag_name("td")
                            tmp = []
                            for col in tdlist:
                                tmp.append(col.text)
                            reportData = reportData.append(
                                pd.Series(tmp, index=reportData.columns),
                                ignore_index=True)  # append one row
                        # Click "next page": on the first page the button is
                        # span/a[2], from the second page on it is span/a[3].
                        if page > 0:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[3]').click()
                        else:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[2]').click()
                except Exception as e:
                    print(e)
                reportData.to_csv(self.__save_folder +
                                  "/report" + month1 + ".csv", header=False)

    def merge(self):
        '''Merge the per-month report csv files into a single res.csv.'''
        df = pd.DataFrame()
        for parent, dirnames, filenames in os.walk(self.__save_folder):
            for filename in filenames:
                if filename.startswith("report"):
                    df1 = pd.read_csv(os.path.join(parent, filename))
                    df = pd.concat([df, df1])
        # drop repeated header rows ("日期" means "date") scraped from the tables
        df = df.drop(df[df["日期"] == "日期"].index).reset_index()
        df.to_csv(self.__save_folder + "/res.csv", header=False)  # "/" was missing

    def start(self):
        self.crawl()
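

# Minimal usage sketch. The site URL and chromedriver path below are
# placeholders, not values from the original project.
if __name__ == "__main__":
    crawler = CrawlCar(
        site="https://example.com/jiakao",  # hypothetical target URL
        save_folder="data",
        browser=BrowserType.CHROME,
        driver="./chromedriver",            # hypothetical driver path
    )
    crawler.getLink()  # cache month links to data/link.csv
    crawler.start()    # scrape each month's report table
    crawler.merge()    # combine per-month csvs into data/res.csv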