# -*- coding: utf-8 -*-
"""Scrapy spider that pages through search results on the Shanghai Library
OPAC (ipac.library.sh.cn) and extracts the title from each detail page."""
import scrapy


class DockerSpider(scrapy.Spider):
    """Crawl OPAC search-result pages and yield one title per hit."""

    name = 'docker'
    # Search terms of interest.
    # NOTE(review): currently unused — start_requests interpolates a page
    # index into `term=%s`, not these keywords; confirm intended behavior.
    keyword = ['docker', 'hadoop', '区块链', 'android', '人工智能', 'python']
    # Fixed: allowed_domains must list bare domains, not URLs with a scheme,
    # otherwise Scrapy's offsite middleware drops every request.
    allowed_domains = ['ipac.library.sh.cn']
    start_urls = ['http://ipac.library.sh.cn/ipac20/ipac.jsp']

    def start_requests(self):
        """Generate the initial search requests, one per result-page index.

        Fixed two defects: the method was misspelled `start_request` (so
        Scrapy never invoked it), and the body appended to / returned the
        undefined name `reqs` while declaring `res` (NameError at runtime).
        """
        requests = []
        for i in range(1, 10):
            url = (
                "http://ipac.library.sh.cn/ipac20/ipac.jsp?session=F4H74295589B7.106"
                "&menu=search&aspect=basic_search&npp=10&ipp=20&profile=sl&ri=1"
                "&source=172.16.103.188%40%21shcl&index=.TW&x=0&y=0"
                "&aspect=basic_search&term=%s" % i
            )
            requests.append(scrapy.Request(url))
        return requests

    def parse(self, response):
        """Follow each `.question` element to its detail page."""
        # NOTE(review): `href.extract()` serializes the whole matched node,
        # not an href attribute — this likely wants
        # `.question a::attr(href)`; verify against the live page markup.
        for href in response.css('.question'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        """Extract the page title from a detail page.

        Fixed: the original dict literal repeated the key 'title' four
        times with identical values; duplicate keys collapse to one entry,
        so three of the four expressions were dead code.
        """
        yield {
            'title': response.css('h1').extract()[0],
        }