main.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/11/11 04:40:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Download PDF files according to the naming rule, stopping once a request becomes invalid.
'''
import os
import time
import datetime
import random
from contextlib import closing

import requests

import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent
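
# NOTE: the two utils modules above live elsewhere in this repository and are
# not shown in this file. The script only relies on the following interfaces
# (an assumption inferred from the call sites below, not the modules' code):
#   user_agent.getheaders()  -> dict of HTTP headers, presumably with a
#                               randomized User-Agent, passed to requests
#   DownloadProgress.DownloadProgress(name, total=..., unit=..., chunk_size=...,
#                                     run_status=..., fin_status=...)
#       .refresh(count=n)    -> advance the progress display by n bytes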
def get_link():
    # Placeholder, never implemented.
    pass


def downNews(url, fileName):
    # Stream the PDF to ./data/<fileName>, reporting download progress.
    with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
        chunkSize = 1024
        contentSize = int(response.headers["content-length"])
        fileD = "./data/" + fileName
        if os.path.exists(fileD) and os.path.getsize(fileD) == contentSize:
            # Already fully downloaded: skip it.
            print("skipping " + fileName)
        else:
            progress = DownloadProgress.DownloadProgress(
                fileName, total=contentSize, unit="KB",
                chunk_size=chunkSize, run_status="downloading", fin_status="downloaded")
            if not os.path.exists(os.path.dirname(fileD)):
                os.makedirs(os.path.dirname(fileD))
            with open(fileD, "wb") as file:
                for data in response.iter_content(chunk_size=chunkSize):
                    file.write(data)
                    progress.refresh(count=len(data))
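
# Example call (hypothetical values, matching the pattern crawl() generates):
#   downNews("http://mrdx.cn/PDF/20191110/01.pdf", "20191110/01.pdf")
# would save the file to ./data/20191110/01.pdf.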

def crawl():
    # Walk backwards one day at a time, covering roughly ten years of issues.
    for i in range(1, 3650):
        day = (datetime.date.today() +
               datetime.timedelta(-i)).strftime("%Y%m%d")
        for j in range(1, 17):
            # Pages are numbered 01..16, so zero-pad to two digits
            # (the original "0%s" % j produced "010.pdf" for pages >= 10).
            url = r"http://mrdx.cn/PDF/%s/%02d.pdf" % (day, j)
            # Probe the link before downloading.
            response = requests.head(url)
            if response.status_code == 200:
                fileName = r"%s/%02d.pdf" % (day, j)
                downNews(url, fileName)
                time.sleep(random.randint(1, 2))  # be a polite crawler
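
# Note: despite the docstring, an invalid request does not stop the crawl; the
# HEAD probe simply skips dates/pages that are not present on the server.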

if __name__ == "__main__":
    start_time = time.time()
    if not os.path.exists("data"):
        os.makedirs("data")
    crawl()
    print("elapsed time: {} s".format(time.time() - start_time))
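
# Usage: run `python main.py` from the repository root; downloaded issues
# accumulate under ./data/<YYYYMMDD>/<NN>.pdf.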