docker.py

# -*- coding: utf-8 -*-
import scrapy


class DockerSpider(scrapy.Spider):
    name = 'docker'
    # Search terms; '区块链' means "blockchain" and '人工智能' means "artificial intelligence"
    keyword = ['docker', 'hadoop', '区块链', 'android', '人工智能', 'python']
    # allowed_domains takes bare host names, not full URLs
    allowed_domains = ['ipac.library.sh.cn']
    start_urls = ['http://ipac.library.sh.cn/ipac20/ipac.jsp']
    def parse(self, response):
        # Follow each link found inside a .question element
        for href in response.css('.question a::attr(href)').extract():
            full_url = response.urljoin(href)
            yield scrapy.Request(full_url, callback=self.parse_question)
    def start_requests(self):
        # Issue one basic-search request per keyword
        reqs = []
        for kw in self.keyword:
            req = scrapy.Request(
                "http://ipac.library.sh.cn/ipac20/ipac.jsp?session=F4H74295589B7.106"
                "&menu=search&aspect=basic_search&npp=10&ipp=20&profile=sl&ri=1"
                "&source=172.16.103.188%40%21shcl&index=.TW&x=0&y=0"
                "&aspect=basic_search&term=%s" % kw)
            reqs.append(req)
        return reqs
    def parse_question(self, response):
        # Extract the page title text from the first <h1>
        yield {
            'title': response.css('h1::text').extract_first(),
        }
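
Assuming this file sits in the spiders/ directory of a Scrapy project, the spider can be run and its scraped items written to JSON with:

scrapy crawl docker -o questions.json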