liuyuqi-dellpc 3 years ago
parent
commit
1c44037984
6 changed files with 236 additions and 0 deletions
  1. 1 0
      .gitignore
  2. 28 0
      README.md
  3. 131 0
      crwal122.py
  4. 59 0
      data/link.csv
  5. 13 0
      docs/驾考报名爬虫.md
  6. 4 0
      requirements.txt

+ 1 - 0
.gitignore

@@ -1 +1,2 @@
 /.ipynb_checkpoints
+/data

+ 28 - 0
README.md

@@ -0,0 +1,28 @@
+##  car-nalysis
+
+二手车分析,驾考分析。
+
+### usage
+
+```
+cd my_project_dir
+virtualenv -p /opt/python/bin/python3 venv
+source venv/bin/activate
+pip install -r requirements.txt
+
+
+python crwal122.py
+
+
+```
+
+预估时间:每个月的驾考报名数据大约 120 页,一年 12 个月,共 5 年,每页大约 5 s。总耗时约 120 × 12 × 5 × 5 = 36000 s = 10 小时,具体耗时以实际运行为准。
+
+
+
+
+
+
+
+
+

+ 131 - 0
crwal122.py

@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/09/06 01:38:09
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   爬取 https://sh.122.gov.cn 驾照考试报名数据
+'''
+
+import pandas as pd
+import numpy as np
+import requests
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver import ActionChains
+
+import os
+import re
+import sys
+import time
+
+base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"  # index page loaded by getLink()
+chormepath = r"D:/Program-Files/browser-driver/chromedriver.exe"  # chromedriver binary handed to webdriver.Chrome below (machine-specific path)
+phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"  # only used by the commented-out PhantomJS line below
+
+link = []  # module-level name; crawl() rebinds it to a DataFrame read from link_path
+res = []  # not referenced anywhere else in the visible code
+save_path = r"download"  # only used by the commented-out mkdir below
+link_path = r"data/link.csv"  # CSV of (index, month, link) rows read by getLink() and crawl()
+report_path = r"data/report.csv"  # not referenced elsewhere; crawl() builds its own per-month paths
+# if not os.path.exists(save_path):
+#     os.mkdir(save_path)
+
+option = webdriver.ChromeOptions()
+option.add_argument("lang=zh_CN.UTF-8")  # NOTE(review): no leading "--" unlike "--headless" below; confirm Chrome honours this
+option.add_argument("User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")  # NOTE(review): presumably meant as "--user-agent=..."; confirm before changing, a mobile UA may alter the page markup
+option.add_argument("--headless")
+
+# Disable image loading
+prefs = {
+    'profile.default_content_setting_values.images': 2
+}
+option.add_experimental_option('prefs', prefs)
+
+# driver = webdriver.PhantomJS(executable_path=phantomjspath)
+driver = webdriver.Chrome(executable_path=chormepath, options=option)  # browser is started at import time and shared by every function below
+# driver.maximize_window()
+
+
+def getLink():
+    driver.implicitly_wait(10)
+    driver.get(base)  # 加载页面
+    if not os.path.exists(link_path):
+        for i in range(5):
+            monthData = driver.find_elements_by_css_selector(
+                "#querylist li a")
+            # # 获取本页所有月份
+            for i in monthData:
+                print(i.text, i.get_attribute("href"))
+                link.append([i.text, i.get_attribute("href")])
+            # 获取下一步按钮,点击
+            driver.find_element_by_xpath(
+                '//*[@id="pppagination"]/ul/li[2]/a').click()
+            # cookie = getCookie()
+            # print(cookie)
+        link = pd.DataFrame(link)
+        link.to_csv(link_path, header=False)
+    else:
+        link = pd.read_csv(link_path, names=["month", "link"])
+    return link
+
+
+def download(url, save_path):
+    try:
+        with open(save_path, "wb") as file:
+            file.write(requests.get(url).raw)
+    except Exception as e:
+        print(e)
+
+
+def getCookie():
+    cookie = driver.get_cookies()
+    cookie_dict = []
+    for c in cookie:
+        ck = "{0}={1};".format(c['name'], c['value'])
+        cookie_dict.append(ck)
+    return cookie_dict
+
+
+def crawl():
+    global link
+    link = pd.read_csv(link_path, names=["month", "link"])
+    for i in range(len(link)):
+        link1 = link.loc[i]["link"]    # 链接
+        month1 = link.loc[i]["month"]  # 月份
+        if not os.path.exists("/data/report" + month1 + ".csv"):
+            driver.implicitly_wait(10)
+            driver.get(link1)
+            # # 找出多少条,多少页
+            try:
+                text = driver.find_element_by_xpath(
+                    '//*[@id="pagination"]/span').text  # 有异常
+                # 共2391条记录 1/120页
+                pagesize = re.split("[/页]", re.search("/.*页  ", text).group())[1]
+                reportData = pd.DataFrame(
+                    columns=["date", "place", "course1", "course2", "course3", "course4"])
+                for i in range(int(pagesize)):
+                    # 找出本页table
+                    trlist = driver.find_elements_by_tag_name("tr")
+                    for row in trlist:
+                        tdlist = row.find_elements_by_tag_name("td")
+                        tmp = []
+                        for col in tdlist:
+                            tmp.append(col.text)
+                        reportData = reportData.append(
+                            pd.Series(tmp, index=reportData.columns), ignore_index=True)  # 增加一行
+                # 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
+                    if i > 0:
+                        driver.find_element_by_xpath(
+                            '//*[@id="pagination"]/span/a[3]').click()
+                    else:
+                        driver.find_element_by_xpath(
+                            '//*[@id="pagination"]/span/a[2]').click()
+            except Exception as e:
+                print(e)
+            reportData.to_csv("data/report" + month1 + ".csv", header=False)
+    driver.close()
+
+if __name__ == "__main__":  # run the scraper only when executed as a script
+    crawl()  # reads link_path directly; getLink() is not called on this path

+ 59 - 0
data/link.csv

@@ -0,0 +1,59 @@
+0,2020年08月,https://staticsh.122.gov.cn/group4/M02/E2/B3/ynomOF9NYj6AVCUiAAAl8fFIDqo30.html
+1,2020年07月,https://staticsh.122.gov.cn/group4/M06/09/D7/ynomOF8kg9GANDqNAAAjq_LKpV866.html
+2,2020年06月,https://staticsh.122.gov.cn/group4/M06/3F/E1/ynomOF77pWWABcRlAAAiupZMUgY31.html
+3,2020年05月,https://staticsh.122.gov.cn/group4/M06/8E/D4/ynomOF7UGHmAFiINAAAh9YXpBDQ75.html
+4,2020年04月,https://staticsh.122.gov.cn/group4/M04/E3/5A/ynomOF6rOguABPm-AAAenCITgB838.html
+5,2020年03月,https://staticsh.122.gov.cn/group4/M01/F6/2E/ynomOF6DrSKAQnc-AAAGdBHzqBg84.html
+6,2020年01月,https://staticsh.122.gov.cn/group4/M00/EA/C1/ynomOF40k0eAOLUoAAAfvRFc1sM97.html
+7,2019年12月,https://staticsh.122.gov.cn/group4/M05/93/08/ynomOF4LtN2AHlmLAAAjHatTLUc36.html
+8,2019年11月,https://staticsh.122.gov.cn/group4/M03/D9/C9/ynomOF3i1nGANvAfAAAffFwQ0zY18.html
+9,2019年10月,https://staticsh.122.gov.cn/group4/M01/3C/6B/ynomOF27SYaAIf_0AAAe6lN78OE30.html
+10,2019年09月,https://staticsh.122.gov.cn/group4/M01/82/3A/ynomOF2SaxiAXgohAAAh4Ps7JFw92.html
+11,2019年08月,https://staticsh.122.gov.cn/group4/M00/0B/6D/ynomOF1q3iqAJlIXAAAgj3KyTjw83.html
+12,2019年07月,https://staticsh.122.gov.cn/group4/M05/83/91/ynomOF1B_72AXvNVAAAhUV1Bs2Q72.html
+13,2019年06月,https://staticsh.122.gov.cn/group4/M06/0B/54/ynomOF0ZIVCAWKixAAAdstDqxdk02.html
+14,2019年05月,https://staticsh.122.gov.cn/group4/M00/A1/84/ynomOFzxlGOAWsHkAAAdCdBf9gk30.html
+15,2019年04月,https://staticsh.122.gov.cn/group4/M07/86/90/ynomOFznvXaAPfo4AAAfOvOU1m002.html
+16,2019年03月,https://staticsh.122.gov.cn/group4/M01/86/88/ynomOFznvXKAA1iVAAAepK7sA0s96.html
+17,2019年02月,https://staticsh.122.gov.cn/group4/M00/86/7D/ynomOFznvWyAAletAAAb3iEQWik18.html
+18,2019年01月,https://staticsh.122.gov.cn/group4/M01/86/73/ynomOFznvWWAbiN3AAAeSo81REk10.html
+19,2018年12月,https://staticsh.122.gov.cn/group4/M02/86/6A/ynomOFznvV-ANfAHAAAe4f5Mk8g14.html
+20,2018年11月,https://staticsh.122.gov.cn/group4/M00/86/63/ynomOFznvViAbhUEAAAeiiv37Yo28.html
+21,2018年10月,https://staticsh.122.gov.cn/group4/M07/86/5F/ynomOFznvUyAAY7mAAAc2pxxb-A98.html
+22,2018年09月,https://staticsh.122.gov.cn/group4/M05/9F/C9/ynomOFwcnj-AA8bJAAAkJv4JZ_I98.html
+23,2018年08月,https://staticsh.122.gov.cn/group4/M03/9F/D2/ynomOFwcnkeAdJR6AAAn5Kaick487.html
+24,2018年07月,https://staticsh.122.gov.cn/group4/M02/30/CA/ynomOFvlQxWAa-ANAAAnbxrXjM472.html
+25,2018年06月,https://staticsh.122.gov.cn/group4/M03/30/CF/ynomOFvlQxmAIHldAAAlIeCBnvc88.html
+26,2018年05月,https://staticsh.122.gov.cn/group4/M02/30/D4/ynomOFvlQx2APNF_AAAnu0LDe4051.html
+27,2018年04月,https://staticsh.122.gov.cn/group4/M05/30/E7/ynomOFvlQyiAWPAIAAAmuU4SnvE95.html
+28,2018年03月,https://staticsh.122.gov.cn/group4/M02/30/F6/ynomOFvlQzKAYlN0AAAjO6iB-Dc65.html
+29,2018年02月,https://staticsh.122.gov.cn/group4/M03/31/01/ynomOFvlQzuAKiLwAAAec7niS3882.html
+30,2018年01月,https://staticsh.122.gov.cn/group4/M02/31/10/ynomOFvlQ0WAQZhbAAAjh3VYbGk13.html
+31,2017年12月,https://staticsh.122.gov.cn/group4/M03/31/23/ynomOFvlQ1CAAk8YAAAifrWIcmI84.html
+32,2017年11月,https://staticsh.122.gov.cn/group4/M03/31/2F/ynomOFvlQ1mAde8vAAAjnMT95Tk04.html
+33,2017年10月,https://staticsh.122.gov.cn/group4/M06/AF/BF/ynomOFrEM-qAXpZzAAAfVyXOhD030.html
+34,2017年09月,https://staticsh.122.gov.cn/group4/M00/AF/D5/ynomOFrEM_aAc801AAAiU26XD8Y08.html
+35,2017年08月,https://staticsh.122.gov.cn/group4/M07/AF/DC/ynomOFrEM_uAE2cXAAAjaaqJQrk39.html
+36,2017年07月,https://staticsh.122.gov.cn/group4/M03/AF/E7/ynomOFrEM_-AOr-cAAAiAhLZj7c59.html
+37,2017年06月,https://staticsh.122.gov.cn/group4/M03/AF/F0/ynomOFrENAOAQ4j4AAAioXtw1oc69.html
+38,2017年05月,https://staticsh.122.gov.cn/group4/M03/AF/FD/ynomOFrENAiAEeUpAAAihwWQJRs16.html
+39,2017年04月,https://staticsh.122.gov.cn/group4/M05/B0/1A/ynomOFrENBSAKorRAAAg_OxHFis77.html
+40,2017年03月,https://staticsh.122.gov.cn/group2/M01/45/E9/ynomNFnPO7CALrYHAAAjaJ1_lrA67.html
+41,2017年02月,https://staticsh.122.gov.cn/group2/M03/45/DE/ynomNFnPO6iAY1VeAAAgAKstNIs99.html
+42,2017年01月,https://staticsh.122.gov.cn/group2/M01/45/D8/ynomNFnPO6OAcdekAAAg_FGiAao74.html
+43,2016年12月,https://staticsh.122.gov.cn/group2/M01/45/C6/ynomNFnPO5KAfOp4AAAin7QVFRk11.html
+44,2016年11月,https://staticsh.122.gov.cn/group2/M00/45/C5/ynomNFnPO5CALYx6AAAik9NhyAs66.html
+45,2016年10月,https://staticsh.122.gov.cn/group1/M03/D3/6E/ynomM1nPO-CASjODAAAfWBJVlko59.html
+46,2016年09月,https://staticsh.122.gov.cn/group2/M03/A8/6A/ynomNFi5A9OAE1dJAAAiH8QM4hY33.html
+47,2016年08月,https://staticsh.122.gov.cn/group1/M03/9F/1A/ynomM1i5A9-AUufeAAAjrSxI4-o11.html
+48,2016年07月,https://staticsh.122.gov.cn/group1/M01/9F/1D/ynomM1i5A-WAanzuAAAiT-b13S022.html
+49,2016年06月,https://staticsh.122.gov.cn/group1/M01/9F/23/ynomM1i5A-yAaBJiAAAiS1EBqms74.html
+50,2016年05月,https://staticsh.122.gov.cn/group1/M02/9F/26/ynomM1i5A_CAd-10AAAigaQkrkI96.html
+51,2016年03月,https://staticsh.122.gov.cn/group1/M02/9F/30/ynomM1i5BACATLl-AAAizCgLLI016.html
+52,2016年02月,https://staticsh.122.gov.cn/group2/M01/8E/1C/ynomNFilPjGANvdFAAAd_4G966s16.html
+53,2015年12月,https://staticsh.122.gov.cn/group1/M02/52/6E/ynomM1aTbDaAXgexAAAjpggpFos42.html
+54,2015年11月,https://staticsh.122.gov.cn/group1/M02/AA/AC/ynomM1ZxAOeAR5AbAAAhrZKO4t855.html
+55,2015年10月,https://staticsh.122.gov.cn/group1/M00/4F/21/ynomM1ZMHgiAF0D2AAAffFhwxbA42.html
+56,2015年09月,https://staticsh.122.gov.cn/group1/M01/3B/4B/ynomM1aOJ5WAY6FLAAAh-wnaw2s53.html
+57,2015年08月,https://staticsh.122.gov.cn/group1/M01/3B/45/ynomM1aOJ32Ac8iJAAAhb5u3Z9028.html
+58,2015年07月,https://staticsh.122.gov.cn/group1/M03/3B/43/ynomM1aOJ3SAacaFAAAi90MLQ-086.html

+ 13 - 0
docs/驾考报名爬虫.md

@@ -0,0 +1,13 @@
+## 爬虫说明 
+
+(1)打开首页:
+https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003
+
+(2)先点击下一页,循环获取所有月份链接
+
+
+(3)点击所有月份链接
+
+https://staticsh.122.gov.cn/group4/M02/E2/B3/ynomOF9NYj6AVCUiAAAl8fFIDqo30.html
+
+(4)对每月份,点击下一页,循环每一页

+ 4 - 0
requirements.txt

@@ -0,0 +1,4 @@
+requests
+pandas
+numpy
+selenium==3.141.0