#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time    : 2019/11/11 04:40:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc    : Download PDF files following a date/page URL pattern, stopping
           (per link) when the request is no longer valid.
'''
import os
import sys
import json
import re
import time
import datetime
from contextlib import closing
import requests
import utils.DownloadProgress as DownloadProgress
from concurrent.futures import ThreadPoolExecutor
import random
import utils.user_agent as user_agent

# Timeout (seconds) for every HTTP request so a stalled server cannot
# hang a worker thread indefinitely.
REQUEST_TIMEOUT = 30


def get_link():
    pass


def downNews(url, fileName):
    """Stream-download one PDF from *url* into *fileName*.

    Skips the download when the local file already exists with the same
    size as reported by the server's Content-Length header; otherwise
    writes the file in 1 KiB chunks while updating a progress display.
    """
    with closing(requests.get(url=url, headers=user_agent.getheaders(),
                              stream=True, timeout=REQUEST_TIMEOUT)) as response:
        chunkSize = 1024
        contentSize = int(response.headers["content-length"])
        if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
            print("跳过" + fileName)
        else:
            progress = DownloadProgress.DownloadProgress(
                fileName, total=contentSize, unit="KB", chunk_size=chunkSize,
                run_status="downloading", fin_status="downloaded")
            # exist_ok avoids the check-then-create race between worker threads.
            os.makedirs(os.path.dirname(fileName), exist_ok=True)
            with open(fileName, "wb") as file:
                for data in response.iter_content(chunk_size=chunkSize):
                    file.write(data)
                    progress.refresh(count=len(data))


def crawl():
    """Walk backwards day by day from 2019-11-11 (up to ~10 years) and fetch
    pages 1-16 of each day's paper, dispatching downloads to a thread pool.
    """
    # The context manager shuts the pool down and waits for every queued
    # download before crawl() returns (the original never joined the pool).
    with ThreadPoolExecutor(max_workers=10) as pool:
        for i in range(1, 3650):
            # yestday = (datetime.date.today() + datetime.timedelta(-i)).strftime("%Y%m%d")
            yestday = (datetime.datetime.strptime("2019-11-11", "%Y-%m-%d").date()
                       + datetime.timedelta(-i)).strftime("%Y%m%d")
            for j in range(1, 17):
                # NOTE(review): "0%s" yields "010.pdf".."016.pdf" for j >= 10;
                # if the server names those pages "10.pdf" use "%02d" instead —
                # confirm against the remote naming scheme before changing.
                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
                if os.path.exists(fileName):
                    print("跳过" + fileName)
                else:
                    url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
                    # Probe with HEAD first (same spoofed headers as the GET)
                    # so dead links are never queued for download.
                    response = requests.head(url, headers=user_agent.getheaders(),
                                             timeout=REQUEST_TIMEOUT)
                    if response.status_code == 200:
                        pool.submit(downNews, url, fileName)
                    # time.sleep(random.randint(1, 2))  # polite crawling


if __name__ == "__main__":
    start_time = time.time()
    os.makedirs("data", exist_ok=True)
    crawl()
    print("last time: {} s".format(time.time() - start_time))