|
@@ -0,0 +1,74 @@
|
|
|
|
+#!/usr/bin/env python
|
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
|
+'''
|
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
|
+@Time : 2022/05/25 18:06:22
|
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
|
+@Desc :
|
|
|
|
+'''
|
|
|
|
+
|
|
|
|
+import os
|
|
|
|
+import sys
|
|
|
|
+import json
|
|
|
|
+import re
|
|
|
|
+import time
|
|
|
|
+import datetime
|
|
|
|
+from contextlib import closing
|
|
|
|
+from crawl_mrdx import api
|
|
|
|
+from crawl_mrdx.libs.json_conf import JsonConf
|
|
|
|
+import requests
|
|
|
|
+import utils.DownloadProgress as DownloadProgress
|
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
|
+import random
|
|
|
|
+import utils.user_agent as user_agent
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class CrawlMrdx():
    """Crawler that downloads the daily PDF pages of the mrdx newspaper.

    State (current/end date) is persisted through a JSON config file, so
    an interrupted crawl resumes from the last completed day.
    """

    def __init__(self):
        # Load persisted crawl state; 'currentDate' marks the last day
        # already processed (format "YYYYMMDD").
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        self.currentDate = self.conf.get('currentDate')

    def downNews(self, url, fileName):
        """Stream-download one PDF from *url* into *fileName*.

        The download is skipped when the local file already exists and its
        size matches the server-reported Content-Length. Progress is shown
        via DownloadProgress. Parent directories are created as needed.
        """
        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
            chunkSize = 1024
            # .get() guards against a missing Content-Length header,
            # which previously raised KeyError; 0 forces a re-download.
            contentSize = int(response.headers.get("content-length", 0))
            if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
                print("跳过" + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=contentSize, unit="KB",
                    chunk_size=chunkSize, run_status="downloading",
                    fin_status="downloaded")
                # exist_ok avoids a race when several pool workers create
                # the same day-directory concurrently.
                os.makedirs(os.path.dirname(fileName), exist_ok=True)
                with open(fileName, "wb") as file:
                    for data in response.iter_content(chunk_size=chunkSize):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        """Walk forward one day at a time from currentDate until endDate,
        downloading every page (1-16) of each day's issue.

        Downloads are fanned out to a thread pool; the pool is now used as
        a context manager so all downloads finish before the elapsed time
        is reported (previously the pool was never shut down).
        """
        start_time = time.time()
        os.makedirs("data", exist_ok=True)
        # At most 10 concurrent downloads.
        with ThreadPoolExecutor(max_workers=10) as pool:
            index = 1
            while True:
                # Date being processed: currentDate + index days.
                day = (datetime.datetime.strptime(
                    self.currentDate, "%Y%m%d").date()
                    + datetime.timedelta(days=index)).strftime("%Y%m%d")
                for page in range(1, 17):
                    # %02d zero-pads properly: the old "0%s" produced
                    # "010.pdf".."016.pdf" for pages >= 10.
                    fileName = r"./data/%s/%02d.pdf" % (day, page)
                    if os.path.exists(fileName):
                        print("跳过" + fileName)
                    else:
                        url = api.pdfUrl % (day, page)
                        # Probe the URL first; the timeout keeps an
                        # unresponsive server from hanging the crawl.
                        response = requests.head(url, timeout=10)
                        if response.status_code == 200:
                            pool.submit(self.downNews, url, fileName)
                            time.sleep(random.randint(1, 2))  # polite crawling
                # Persist progress so a restart resumes from this day.
                self.jsonConf.set({"currentDate": day})
                if day == self.conf.get('endDate'):
                    break
                index += 1
        print("last time: {} s".format(time.time() - start_time))
|