middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
from logging import getLogger

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests

from .utils import user_agent_list

class SpidernoticesSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class SpidernoticesDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class SeleniumMiddleware(object):
    """
    Render dynamic pages with Selenium.
    1. process_request: decide from request.url whether the page needs rendering.
    """

    def __init__(self, timeout=10, service_args=None):
        self.logger = getLogger(__name__)
        # PhantomJS is used as the headless browser (note: webdriver.PhantomJS
        # is only available in Selenium 3.x; it was removed in Selenium 4).
        self.browser = webdriver.PhantomJS(service_args=service_args or [])
        self.timeout = timeout
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        if request.url.find(r'data.eastmoney.com/notices/stock/') != -1:
            # Per-stock announcement list; at this point AJAX has only loaded the first page.
            self.logger.debug('Selenium is Starting')
            try:
                self.browser.get(request.url)
                # time.sleep(3)
                self.wait.until(
                    EC.presence_of_element_located((By.ID, 'PageCont'))
                )
                return HtmlResponse(url=request.url, body=self.browser.page_source,
                                    request=request, encoding='utf-8')
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
        else:
            return None
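
# Note: PhantomJS support was removed in Selenium 4. A rough, untested sketch of
# swapping in headless Chrome inside __init__ (the name `opts` is illustrative,
# not part of the original project) would be:
#
#     from selenium.webdriver.chrome.options import Options
#     opts = Options()
#     opts.add_argument('--headless')
#     self.browser = webdriver.Chrome(options=opts)
#
# The rest of the middleware (the explicit wait and the HtmlResponse construction)
# would stay the same.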

class ProxyIpMiddleware(object):
    """
    Use a proxy in ip:port format, fetched from 47.103.1.245:5010.
    """

    def process_request(self, request, spider):
        ip = requests.get('http://47.103.1.245:5010/get/').json()
        if ip.get("proxy"):
            request.meta["proxy"] = "http://" + ip["proxy"]
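
# The pool at 47.103.1.245:5010 is assumed to return JSON of the form
# {"proxy": "<ip>:<port>", ...}; that schema is inferred from how ip["proxy"]
# is used above, not documented here. If the pool is unreachable, requests.get()
# raises and the request fails before a proxy is set.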

class RandomUserAgent(UserAgentMiddleware):
    """
    Set a random User-Agent on each request.
    """

    def process_request(self, request, spider):
        # user_agent_list is expected to be a plain list of User-Agent strings
        # defined in utils.py.
        ua = random.choice(user_agent_list)
        request.headers.setdefault('User-Agent', ua)
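
# These middlewares only take effect once they are enabled in the project's
# settings.py. A minimal sketch is shown below; the module path
# 'spidernotices.middlewares' and the priority values are assumptions based on
# the class names in this file, not copied from the actual settings:
#
#     DOWNLOADER_MIDDLEWARES = {
#         'spidernotices.middlewares.RandomUserAgent': 543,
#         'spidernotices.middlewares.ProxyIpMiddleware': 544,
#         'spidernotices.middlewares.SeleniumMiddleware': 545,
#     }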