crawl_mrdx.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2022/05/25 18:06:22
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :
'''
import os
import sys
import json
import re
import time
import datetime
import random
from contextlib import closing
from concurrent.futures import ThreadPoolExecutor

import requests

from crawl_mrdx import api
from crawl_mrdx.libs.json_conf import JsonConf
import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent


class CrawlMrdx():

    def __init__(self):
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        self.start_date = self.conf.get('startDate')
        self.end_date = self.conf.get('endDate')

    def update(self):
        '''update app'''
        pass
    def downNews(self, url, fileName):
        '''download news
        :param url: news url
        :param fileName: saved file name
        '''
        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
            chunkSize = 1024
            contentSize = int(response.headers["content-length"])
            if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
                print("skip " + fileName)  # already downloaded and complete
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=contentSize, unit="KB", chunk_size=chunkSize,
                    run_status="downloading", fin_status="downloaded")
                if not os.path.exists(os.path.dirname(fileName)):
                    os.makedirs(os.path.dirname(fileName))
                with open(fileName, "wb") as file:
                    for data in response.iter_content(chunk_size=chunkSize):
                        file.write(data)
                        progress.refresh(count=len(data))
    def crawl(self, start: str, end: str):
        '''crawl news
        :param start: start date (format %Y%m%d)
        :param end: end date (format %Y%m%d)
        '''
        start_time = time.time()  # record start time to report total duration
        if not os.path.exists("data"):
            os.makedirs("data")
        pool = ThreadPoolExecutor(max_workers=10)  # thread pool running at most 10 download tasks
        if start is None:
            start = self.conf.get('startDate')
        if end is None:
            end = self.conf.get('endDate')
        start_date = datetime.datetime.strptime(start, "%Y%m%d")
        end_date = datetime.datetime.strptime(end, "%Y%m%d")
        current_date = start_date
        while current_date <= end_date:
            current_date_str = current_date.strftime("%Y%m%d")
            print(current_date_str)
            for j in range(1, 17):
                fileName = r"./data/%s/0%s.pdf" % (current_date_str, j)
                if os.path.exists(fileName):
                    print("skip " + fileName)
                else:
                    url = api.pdfUrl % (current_date_str, j)
                    # check that the link is reachable before queueing the download
                    response = requests.head(url)
                    if response.status_code == 200:
                        # downNews(url, fileName)
                        future1 = pool.submit(self.downNews, url, fileName)
                        time.sleep(random.randint(1, 2))  # polite crawling: pause between requests
            self.jsonConf.set({"currentDate": current_date_str})
            if current_date_str == self.conf.get('endDate'):
                break
            current_date += datetime.timedelta(days=1)
        print("elapsed time: {} s".format(time.time() - start_time))
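

# Minimal usage sketch (assumption: this entry point is not part of the original
# module). Passing None for both dates falls back to the startDate/endDate values
# read from the JSON config by JsonConf; explicit dates must use the %Y%m%d format.
if __name__ == "__main__":
    crawler = CrawlMrdx()
    crawler.crawl(None, None)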