# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import quote


class DockerSpider(scrapy.Spider):
    name = 'docker'
    # Search keywords: docker, hadoop, blockchain ('区块链'), android,
    # artificial intelligence ('人工智能'), python
    keyword = ['docker', 'hadoop', '区块链', 'android', '人工智能', 'python']
    # allowed_domains expects bare domain names, not full URLs
    allowed_domains = ['ipac.library.sh.cn']
    start_urls = ['http://ipac.library.sh.cn/ipac20/ipac.jsp']

    def parse(self, response):
        # Pull the href attribute from each result link (assumes the
        # '.question' nodes are anchor elements) and follow it.
        for href in response.css('.question::attr(href)').extract():
            full_url = response.urljoin(href)
            yield scrapy.Request(full_url, callback=self.parse_question)

    def start_requests(self):
        # Scrapy calls start_requests() (plural) to seed the crawl; build
        # one catalog search request per keyword, percent-encoding the term
        # so the Chinese keywords survive URL transport.
        reqs = []
        for kw in self.keyword:
            url = ("http://ipac.library.sh.cn/ipac20/ipac.jsp"
                   "?session=F4H74295589B7.106&menu=search&aspect=basic_search"
                   "&npp=10&ipp=20&profile=sl&ri=1"
                   "&source=172.16.103.188%40%21shcl&index=.TW&x=0&y=0"
                   "&aspect=basic_search&term=" + quote(kw))
            reqs.append(scrapy.Request(url))
        return reqs

    def parse_question(self, response):
        # Emit one item per detail page with the record's title text.
        yield {
            'title': response.css('h1::text').extract_first(),
        }
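
Assuming this file sits under `spiders/` in an otherwise standard Scrapy project, the crawl can be started from the project root with Scrapy's CLI; the `-o` flag is Scrapy's built-in feed export and writes the yielded items to a file (`titles.json` here is just an illustrative name):

    scrapy crawl docker -o titles.json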