main.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/11/11 04:40:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Download PDF files according to the naming rule, stopping once a request becomes invalid.
'''
import os
import time
import datetime
import random
from contextlib import closing

import requests

import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent
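
# NOTE: the two utils modules above live elsewhere in this repository and are
# not shown in this file. The script only relies on the following interfaces
# (an assumption inferred from the call sites below, not the modules' code):
#   user_agent.getheaders()  -> dict of HTTP headers, presumably with a
#                               randomized User-Agent, passed to requests
#   DownloadProgress.DownloadProgress(name, total=..., unit=..., chunk_size=...,
#                                     run_status=..., fin_status=...)
#       .refresh(count=n)    -> advance the progress display by n bytes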
def get_link():
    # Placeholder, never implemented.
    pass


def downNews(url, fileName):
    # Stream the PDF to ./data/<fileName>, reporting download progress.
    with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
        chunkSize = 1024
        contentSize = int(response.headers["content-length"])
        fileD = "./data/" + fileName
        if os.path.exists(fileD) and os.path.getsize(fileD) == contentSize:
            # Already fully downloaded: skip it.
            print("skipping " + fileName)
        else:
            progress = DownloadProgress.DownloadProgress(
                fileName, total=contentSize, unit="KB",
                chunk_size=chunkSize, run_status="downloading", fin_status="downloaded")
            if not os.path.exists(os.path.dirname(fileD)):
                os.makedirs(os.path.dirname(fileD))
            with open(fileD, "wb") as file:
                for data in response.iter_content(chunk_size=chunkSize):
                    file.write(data)
                    progress.refresh(count=len(data))
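
# Example call (hypothetical values, matching the pattern crawl() generates):
#   downNews("http://mrdx.cn/PDF/20191110/01.pdf", "20191110/01.pdf")
# would save the file to ./data/20191110/01.pdf.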

def crawl():
    # Walk backwards one day at a time, covering roughly ten years of issues.
    for i in range(1, 3650):
        day = (datetime.date.today() +
               datetime.timedelta(-i)).strftime("%Y%m%d")
        for j in range(1, 17):
            # Pages are numbered 01..16, so zero-pad to two digits
            # (the original "0%s" % j produced "010.pdf" for pages >= 10).
            url = r"http://mrdx.cn/PDF/%s/%02d.pdf" % (day, j)
            # Probe the link before downloading.
            response = requests.head(url)
            if response.status_code == 200:
                fileName = r"%s/%02d.pdf" % (day, j)
                downNews(url, fileName)
                time.sleep(random.randint(1, 2))  # be a polite crawler
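
# Note: despite the docstring, an invalid request does not stop the crawl; the
# HEAD probe simply skips dates/pages that are not present on the server.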

if __name__ == "__main__":
    start_time = time.time()
    if not os.path.exists("data"):
        os.makedirs("data")
    crawl()
    print("elapsed time: {} s".format(time.time() - start_time))
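
# Usage: run `python main.py` from the repository root; downloaded issues
# accumulate under ./data/<YYYYMMDD>/<NN>.pdf.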