crawl_mrdx.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2022/05/25 18:06:22
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :
'''
import os
import time
import datetime
import random
from contextlib import closing
from concurrent.futures import ThreadPoolExecutor

import requests

from crawl_mrdx import api
from crawl_mrdx.libs.json_conf import JsonConf
import utils.DownloadProgress as DownloadProgress
import utils.user_agent as user_agent


class CrawlMrdx():

    def __init__(self):
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        self.start_date = self.conf.get('startDate')
        self.end_date = self.conf.get('endDate')
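
    # NOTE: the config file layout below is an assumption inferred from the keys
    # this class reads and writes; the actual file shipped with the project may differ:
    #   {"startDate": "20220501", "endDate": "20220525", "currentDate": "20220510"}
    # crawl() writes "currentDate" back after each day so an interrupted run can resume.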

    def update(self):
        '''update app'''
        pass

    def downNews(self, url, fileName):
        '''download a single news PDF
        :param url: news url
        :param fileName: path of the saved file
        '''
        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
            chunkSize = 1024
            contentSize = int(response.headers["content-length"])
            # skip files that are already fully downloaded
            if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
                print("skip " + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=contentSize, unit="KB", chunk_size=chunkSize,
                    run_status="downloading", fin_status="downloaded")
                if not os.path.exists(os.path.dirname(fileName)):
                    os.makedirs(os.path.dirname(fileName))
                with open(fileName, "wb") as file:
                    for data in response.iter_content(chunk_size=chunkSize):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self, start: str, end: str):
        '''crawl news
        :param start: start date, e.g. "20220501"
        :param end: end date, e.g. "20220525"
        '''
        start_time = time.time()  # for reporting total elapsed time
        if not os.path.exists("data"):
            os.makedirs("data")
        pool = ThreadPoolExecutor(max_workers=10)  # at most 10 concurrent downloads
        start_date = datetime.datetime.strptime(start, "%Y%m%d")
        end_date = datetime.datetime.strptime(end, "%Y%m%d")
        current_date = start_date
        while current_date <= end_date:
            current_date_str = current_date.strftime("%Y%m%d")
            print(current_date_str)
            # each daily issue has 16 pages, saved as 01.pdf .. 16.pdf
            for j in range(1, 17):
                fileName = r"./data/%s/%02d.pdf" % (current_date_str, j)
                if os.path.exists(fileName):
                    print("skip " + fileName)
                else:
                    url = api.pdfUrl % (current_date_str, j)
                    # verify the link is valid before queueing the download
                    response = requests.head(url, headers=user_agent.getheaders())
                    if response.status_code == 200:
                        pool.submit(self.downNews, url, fileName)
                        time.sleep(random.randint(1, 2))  # polite crawling: throttle requests
            # persist progress so an interrupted run can resume
            self.jsonConf.set({"currentDate": current_date_str})
            if current_date_str == self.conf.get('endDate'):
                break
            current_date += datetime.timedelta(days=1)
        pool.shutdown(wait=True)  # wait for queued downloads to finish
        print("elapsed time: {} s".format(time.time() - start_time))
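

# Minimal usage sketch (an assumption: the project may ship its own CLI entry
# point; the fallback dates below are purely illustrative):
if __name__ == "__main__":
    crawler = CrawlMrdx()
    # crawl every issue between the configured start and end dates (inclusive)
    crawler.crawl(crawler.start_date or "20220501", crawler.end_date or "20220525")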