crwal122.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2020/09/06 01:38:09
  6. @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
  7. @Desc : 爬取 https://sh.122.gov.cn 驾照考试报名数据
  8. '''
  9. import pandas as pd
  10. import numpy as np
  11. import requests
  12. from selenium import webdriver
  13. from selenium.common.exceptions import NoSuchElementException
  14. from selenium.webdriver.common.keys import Keys
  15. from selenium.webdriver import ActionChains
  16. import os
  17. import re
  18. import sys
  19. import time
# Target page: Shanghai driving-test registration statistics (month 202007 of license region 沪A).
base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
# Local paths to the browser drivers (PhantomJS path kept for the commented-out fallback below).
chormepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"
link = []   # accumulates [month_text, href] pairs scraped by getLink()
res = []
save_path = r"download"
link_path = r"data/link.csv"      # cache of scraped month links
report_path = r"data/report.csv"  # NOTE(review): unused below; per-month files are written instead
# if not os.path.exists(save_path):
# os.mkdir(save_path)
option = webdriver.ChromeOptions()
option.add_argument("lang=zh_CN.UTF-8")
option.add_argument("User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# Disable image loading to speed up page fetches.
prefs = {
    'profile.default_content_setting_values.images': 2
}
option.add_experimental_option('prefs', prefs)
# driver = webdriver.PhantomJS(executable_path=phantomjspath)
driver = webdriver.Chrome(executable_path=chormepath, options=option)
# driver.maximize_window()
  42. def getLink():
  43. driver.implicitly_wait(10)
  44. driver.get(base) # 加载页面
  45. if not os.path.exists(link_path):
  46. for i in range(5):
  47. monthData = driver.find_elements_by_css_selector(
  48. "#querylist li a")
  49. # # 获取本页所有月份
  50. for i in monthData:
  51. print(i.text, i.get_attribute("href"))
  52. link.append([i.text, i.get_attribute("href")])
  53. # 获取下一步按钮,点击
  54. driver.find_element_by_xpath(
  55. '//*[@id="pppagination"]/ul/li[2]/a').click()
  56. # cookie = getCookie()
  57. # print(cookie)
  58. link = pd.DataFrame(link)
  59. link.to_csv(link_path, header=False)
  60. else:
  61. link = pd.read_csv(link_path, names=["month", "link"])
  62. return link
  63. def download(url, save_path):
  64. try:
  65. with open(save_path, "wb") as file:
  66. file.write(requests.get(url).raw)
  67. except Exception as e:
  68. print(e)
  69. def getCookie():
  70. cookie = driver.get_cookies()
  71. cookie_dict = []
  72. for c in cookie:
  73. ck = "{0}={1};".format(c['name'], c['value'])
  74. cookie_dict.append(ck)
  75. return cookie_dict
  76. def crawl():
  77. global link
  78. link = pd.read_csv(link_path, names=["month", "link"])
  79. for i in range(len(link)):
  80. link1 = link.loc[i]["link"] # 链接
  81. month1 = link.loc[i]["month"] # 月份
  82. if not os.path.exists("/data/report" + month1 + ".csv"):
  83. driver.implicitly_wait(10)
  84. driver.get(link1)
  85. # # 找出多少条,多少页
  86. try:
  87. text = driver.find_element_by_xpath(
  88. '//*[@id="pagination"]/span').text # 有异常
  89. # 共2391条记录 1/120页
  90. pagesize = re.split("[/页]", re.search("/.*页 ", text).group())[1]
  91. reportData = pd.DataFrame(
  92. columns=["date", "place", "course1", "course2", "course3", "course4"])
  93. for i in range(int(pagesize)):
  94. # 找出本页table
  95. trlist = driver.find_elements_by_tag_name("tr")
  96. for row in trlist:
  97. tdlist = row.find_elements_by_tag_name("td")
  98. tmp = []
  99. for col in tdlist:
  100. tmp.append(col.text)
  101. reportData = reportData.append(
  102. pd.Series(tmp, index=reportData.columns), ignore_index=True) # 增加一行
  103. # 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
  104. if i > 0:
  105. driver.find_element_by_xpath(
  106. '//*[@id="pagination"]/span/a[3]').click()
  107. else:
  108. driver.find_element_by_xpath(
  109. '//*[@id="pagination"]/span/a[2]').click()
  110. except Exception as e:
  111. print(e)
  112. reportData.to_csv("data/report" + month1 + ".csv", header=False)
  113. driver.close()
if __name__ == "__main__":
    # Entry point: scrape all cached month links into per-month CSVs.
    crawl()