#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author  : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time    : 2019/11/11 04:40:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc    : Download daily-paper PDF files from mrdx.cn by a date-based URL
           pattern, walking backwards one day at a time and probing each
           page until the request is no longer valid.
"""
import datetime
import os
import random
import time
from contextlib import closing

import requests

import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent


def get_link():
    """Placeholder for future link-extraction logic (not implemented)."""
    pass


def downNews(url, fileName):
    """Stream one PDF from *url* into ./data/<fileName>.

    Skips the download when a local file already exists with exactly the
    server-reported Content-Length; otherwise streams the body in 1 KiB
    chunks while updating a progress bar.

    :param url: fully-qualified URL of the PDF to fetch
    :param fileName: relative path (e.g. "20191110/01.pdf") under ./data/
    """
    # timeout guards against a stalled server hanging the crawler forever
    with closing(requests.get(url=url, headers=user_agent.getheaders(),
                              stream=True, timeout=30)) as response:
        chunkSize = 1024
        # NOTE(review): assumes the server always sends Content-Length;
        # a chunked response without it would raise KeyError here.
        contentSize = int(response.headers["content-length"])
        fileD = "./data/" + fileName
        if os.path.exists(fileD) and os.path.getsize(fileD) == contentSize:
            # Already fully downloaded — skip ("跳过" = "skipping").
            print("跳过" + fileName)
        else:
            progress = DownloadProgress.DownloadProgress(
                fileName, total=contentSize, unit="KB",
                chunk_size=chunkSize,
                run_status="downloading", fin_status="downloaded")
            # Create the per-date subdirectory on first use.
            if not os.path.exists(os.path.dirname(fileD)):
                os.makedirs(os.path.dirname(fileD))
            with open(fileD, "wb") as file:
                for data in response.iter_content(chunk_size=chunkSize):
                    file.write(data)
                    progress.refresh(count=len(data))


def crawl():
    """Walk backwards day by day (up to ~10 years) and fetch every page.

    For each date, pages 01..16 are probed with a cheap HEAD request first
    so only PDFs that actually exist are streamed down.
    """
    for i in range(1, 3650):
        day = (datetime.date.today() + datetime.timedelta(-i)).strftime("%Y%m%d")
        for page in range(1, 17):
            # Pages are zero-padded to two digits (01.pdf .. 16.pdf).
            # The original "0%s" produced "010.pdf" for page >= 10,
            # silently missing pages 10-16.
            url = r"http://mrdx.cn/PDF/%s/%02d.pdf" % (day, page)
            # Send the same User-Agent as the download itself so the
            # HEAD probe sees the same response the GET would.
            response = requests.head(
                url, headers=user_agent.getheaders(), timeout=10)
            if response.status_code == 200:
                fileName = r"%s/%02d.pdf" % (day, page)
                downNews(url, fileName)
                time.sleep(random.randint(1, 2))  # polite crawl delay


if __name__ == "__main__":
    start_time = time.time()
    if not os.path.exists("data"):
        os.makedirs("data")
    crawl()
    print("last time: {} s".format(time.time() - start_time))