#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2022/05/25 18:06:22
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Crawl daily newspaper PDF pages over a configured date range.
'''
import os
import time
import datetime
import random
from contextlib import closing
from concurrent.futures import ThreadPoolExecutor

import requests

from crawl_mrdx import api
from crawl_mrdx.libs.json_conf import JsonConf
import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent


class CrawlMrdx():

    def __init__(self):
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        self.start_date = self.conf.get('startDate')
        self.end_date = self.conf.get('endDate')

    def update(self):
        '''update the app (not implemented yet)'''
        pass

    def downNews(self, url, fileName):
        '''download a single news PDF
        :param url: news url
        :param fileName: saved file name
        '''
        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
            chunkSize = 1024
            # guard against a missing content-length header
            contentSize = int(response.headers.get("content-length", 0))
            if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
                print("skip " + fileName)  # already fully downloaded
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=contentSize, unit="KB", chunk_size=chunkSize,
                    run_status="downloading", fin_status="downloaded")
                if not os.path.exists(os.path.dirname(fileName)):
                    os.makedirs(os.path.dirname(fileName))
                with open(fileName, "wb") as file:
                    for data in response.iter_content(chunk_size=chunkSize):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self, start: str, end: str):
        '''crawl news between two dates (inclusive)
        :param start: start date, e.g. "20220501"
        :param end: end date, e.g. "20220525"
        '''
        start_time = time.time()  # measure total run time
        if not os.path.exists("data"):
            os.makedirs("data")
        pool = ThreadPoolExecutor(max_workers=10)  # at most 10 concurrent downloads
        start_date = datetime.datetime.strptime(start, "%Y%m%d")
        end_date = datetime.datetime.strptime(end, "%Y%m%d")
        current_date = start_date
        while current_date <= end_date:
            current_date_str = current_date.strftime("%Y%m%d")
            print(current_date_str)
            for j in range(1, 17):  # each issue has up to 16 pages
                fileName = r"./data/%s/%02d.pdf" % (current_date_str, j)  # zero-padded page number
                if os.path.exists(fileName):
                    print("skip " + fileName)  # already downloaded
                else:
                    url = api.pdfUrl % (current_date_str, j)
                    # check that the link is valid before queuing the download
                    response = requests.head(url, headers=user_agent.getheaders())
                    if response.status_code == 200:
                        pool.submit(self.downNews, url, fileName)
                        time.sleep(random.randint(1, 2))  # be polite: throttle requests
            self.jsonConf.set({"currentDate": current_date_str})
            if current_date_str == self.conf.get('endDate'):
                break
            current_date += datetime.timedelta(days=1)
        pool.shutdown(wait=True)  # wait for queued downloads before reporting elapsed time
        print("elapsed time: {} s".format(time.time() - start_time))