|
@@ -5,15 +5,24 @@ from boto.beanstalk.response import Response
|
|
|
|
|
|
class DockerSpider(scrapy.Spider):
|
|
class DockerSpider(scrapy.Spider):
|
|
# Name under which this spider is identified.
name = 'docker'
# Search keywords; not referenced by any method visible in this section.
keyword = ['docker', 'hadoop', '区块链', 'android', '人工智能', 'python']
# Bare host name only: Scrapy's offsite filtering expects domain names and
# does not accept entries that carry a scheme such as "http://".
allowed_domains = ['ipac.library.sh.cn']
# Initial URL(s) requested when the spider starts.
start_urls = ['http://ipac.library.sh.cn/ipac20/ipac.jsp']
|
|
|
|
|
|
def parse(self, response):
    """Yield one follow-up request per ``.question`` match in *response*.

    Each match is serialised with ``extract()``, resolved against the
    response URL with ``urljoin``, and requested with ``parse_question``
    as the callback.
    """
    # NOTE(review): `.question` selects elements, so extract() presumably
    # returns markup rather than an href -- confirm the intended selector.
    matches = response.css('.question')
    for node in matches:
        target = response.urljoin(node.extract())
        follow_up = scrapy.Request(target, callback=self.parse_question)
        yield follow_up
|
|
|
|
+
|
|
def start_request(self):
    """Build the catalogue search requests for the terms 1 through 9.

    Returns:
        A list of nine ``scrapy.Request`` objects, one per search term,
        in ascending term order.
    """
    # NOTE(review): Scrapy's start hook is named `start_requests` (plural)
    # and nothing in the visible code calls this method -- confirm the
    # intended name before relying on it.
    # NOTE(review): the session id in the query string is hard-coded and is
    # presumably short-lived -- confirm.
    #
    # The term is appended by concatenation instead of the `%` operator:
    # the query string contains the percent-escapes "%40%21", which `%`
    # formatting would read as a conversion spec and silently corrupt.
    search_url = (
        "http://ipac.library.sh.cn/ipac20/ipac.jsp"
        "?session=F4H74295589B7.106&menu=search&aspect=basic_search"
        "&npp=10&ipp=20&profile=sl&ri=1"
        "&source=172.16.103.188%40%21shcl"
        "&index=.TW&x=0&y=0&aspect=basic_search&term="
    )
    # The original created the list as `res` but appended to and returned
    # `reqs`, which raised NameError on the first iteration.
    reqs = [scrapy.Request(search_url + str(i)) for i in range(1, 10)]
    return reqs
|
|
|
|
+
|
|
def parse_question(self,response):
|
|
def parse_question(self,response):
|
|
yield{
|
|
yield{
|
|
'title':response.css('h1').extract()[0],
|
|
'title':response.css('h1').extract()[0],
|
|
@@ -21,4 +30,6 @@ class DockerSpider(scrapy.Spider):
|
|
'title':response.css('h1').extract()[0],
|
|
'title':response.css('h1').extract()[0],
|
|
'title':response.css('h1').extract()[0]
|
|
'title':response.css('h1').extract()[0]
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|