123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/11/11 04:40:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc : 按照规则下载pdf文件,直到请求无效停止
'''
import datetime
import json
import os
import random
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import requests

import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent
def get_link():
    """Placeholder for link discovery; not implemented yet (returns None)."""
    pass
def downNews(url, fileName):
    """Download one PDF from *url* into ./data/<fileName>.

    Skips the download when the local file already exists with the same
    size as reported by the server's Content-Length header; otherwise
    streams the body to disk chunk by chunk with a progress display.

    :param url: direct URL of the PDF to fetch.
    :param fileName: path relative to ./data/ to write the file to.
    """
    # Stream so large PDFs are never held fully in memory; closing()
    # guarantees the connection is released even if writing fails.
    # timeout prevents a dead server from hanging the crawler forever.
    with closing(requests.get(url=url, headers=user_agent.getheaders(),
                              stream=True, timeout=30)) as response:
        chunkSize = 1024
        # .get() guards against a missing Content-Length header
        # (the original raised KeyError in that case).
        contentSize = int(response.headers.get("content-length", 0))
        fileD = "./data/" + fileName
        # Guard clause: already fully downloaded -> skip.
        if os.path.exists(fileD) and os.path.getsize(fileD) == contentSize:
            print("跳过" + fileName)
            return
        progress = DownloadProgress.DownloadProgress(
            fileName, total=contentSize, unit="KB",
            chunk_size=chunkSize, run_status="downloading",
            fin_status="downloaded")
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(os.path.dirname(fileD), exist_ok=True)
        with open(fileD, "wb") as file:
            for data in response.iter_content(chunk_size=chunkSize):
                file.write(data)
                progress.refresh(count=len(data))
def crawl(days=3649):
    """Walk backwards day by day and fetch up to 16 PDF pages per issue.

    Bug fix: the original referenced ``i`` but the ``for i in
    range(1, 3650)`` loop was commented out, so calling crawl() raised
    NameError. The loop is restored and its span is exposed as a
    backward-compatible parameter.

    :param days: how many past days to scan (default matches the
        original commented-out ``range(1, 3650)``).
    """
    for i in range(1, days + 1):
        yestday = (datetime.date.today() +
                   datetime.timedelta(-i)).strftime("%Y%m%d")
        for j in range(1, 17):
            url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
            # Cheap HEAD request to check link validity before downloading;
            # timeout keeps one dead URL from stalling the whole crawl.
            response = requests.head(url, timeout=10)
            if response.status_code == 200:
                fileName = r"%s/0%s.pdf" % (yestday, j)
                downNews(url, fileName)
                time.sleep(random.randint(1, 2))  # 文明爬虫
if __name__ == "__main__":
    start_time = time.time()
    # exist_ok=True replaces the explicit existence check (no race window).
    os.makedirs("data", exist_ok=True)
    crawl()
    # Report total elapsed wall-clock time in seconds.
    print("last time: {} s".format(time.time() - start_time))
|