liuyuqi-dellpc committed 6 years ago
commit 3f2c659c19
4 changed files with 59 additions and 6 deletions
  1. shlib/items.py (+17, -1)
  2. shlib/pipelines.py (+19, -0)
  3. shlib/settings.py (+9, -4)
  4. shlib/spiders/docker.py (+14, -1)

+ 17 - 1
shlib/items.py

@@ -6,9 +6,25 @@
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 import scrapy
+from scrapy import Field
 
 
 class ShlibItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    # book title
+    book_name = Field()
+    # book description
+    book_desc = Field()

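For orientation, a minimal sketch of how these fields are used downstream; the values are made up, and the dict-like behaviour of ShlibItem is what the pipeline below relies on:

    from shlib.items import ShlibItem

    # an item behaves like a dict once its fields are set (example values only)
    item = ShlibItem()
    item['book_name'] = u'Example title'
    item['book_desc'] = u'Example description'
    print(dict(item))   # {'book_name': 'Example title', 'book_desc': 'Example description'}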
+ 19 - 0
shlib/pipelines.py

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import MySQLdb
 
 # Define your item pipelines here
 #
@@ -7,5 +8,23 @@
 
 
 class ShlibPipeline(object):
+    # Persist the scraped data (database insert; could also write to a file, send mail, etc.)
     def process_item(self, item, spider):
+        DBKWARGS = spider.settings.get('DBKWARGS')
+        con = MySQLdb.connect(**DBKWARGS)
+        cur = con.cursor()
+        # table and column names are assumed to match the ShlibItem fields
+        sql = ("insert into book(book_name,book_desc) "
+            "values(%s,%s)")
+        lis = (item.get('book_name'), item.get('book_desc'))
+        try:
+            cur.execute(sql,lis)
+        except Exception as e:
+            print("Insert error:", e)
+            con.rollback()
+        else:
+            con.commit()
+        cur.close()
+        con.close()
         return item

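The pipeline opens a fresh MySQL connection for every item and assumes a book table already exists. A rough one-off sketch for creating that table with the DBKWARGS defined in settings.py; the table and column names are assumptions chosen to match ShlibItem, not something this commit defines:

    import MySQLdb

    from shlib.settings import DBKWARGS

    # create the assumed book table before the first crawl
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS book ("
        " id INT AUTO_INCREMENT PRIMARY KEY,"
        " book_name VARCHAR(255),"
        " book_desc TEXT"
        ") DEFAULT CHARSET=utf8"
    )
    con.commit()
    cur.close()
    con.close()

Connecting once per item also adds avoidable overhead; Scrapy's open_spider() and close_spider() pipeline hooks are the usual place to open one connection for the whole crawl and close it at the end.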
+ 9 - 4
shlib/settings.py

@@ -14,13 +14,18 @@ BOT_NAME = 'shlib'
 SPIDER_MODULES = ['shlib.spiders']
 NEWSPIDER_MODULE = 'shlib.spiders'
 
+# Database connection parameters
+DBKWARGS={'db':'ippool','user':'root', 'passwd':'toor',
+    'host':'localhost','use_unicode':True, 'charset':'utf8'}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'shlib (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
 
+LOG_FILE = "shlib.log"
+
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
@@ -64,9 +69,9 @@ ROBOTSTXT_OBEY = True
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'shlib.pipelines.ShlibPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'shlib.pipelines.ShlibPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
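A quick sanity check of the DBKWARGS above, before starting a crawl, is to open a throwaway connection with exactly those parameters (a sketch, not part of the commit):

    import MySQLdb

    from shlib.settings import DBKWARGS

    con = MySQLdb.connect(**DBKWARGS)   # raises OperationalError if the parameters are wrong
    print("MySQL server:", con.get_server_info())
    con.close()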

+ 14 - 1
shlib/spiders/docker.py

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import scrapy
+from shlib.items import ShlibItem
 
 
 class DockerSpider(scrapy.Spider):
@@ -8,4 +9,16 @@ class DockerSpider(scrapy.Spider):
     start_urls = ['http://ipac.library.sh.cn/']
 
     def parse(self, response):
-        pass
+        # follow each result link; '.question' is kept from the original selector,
+        # but the href attribute of the <a> is extracted instead of the whole element
+        for href in response.css('.question a::attr(href)').extract():
+            full_url = response.urljoin(href)
+            yield scrapy.Request(full_url, callback=self.parse_question)
+
+    def start_requests(self):
+        # explicit equivalent of Scrapy's default start_requests
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse)
+
+    def parse_question(self, response):
+        item = ShlibItem()
+        # 'h1' comes from the original code; the duplicate dict keys are collapsed into one field
+        item['book_name'] = response.css('h1::text').extract_first()
+        yield item
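With ITEM_PIPELINES enabled in settings.py, the spider runs with scrapy crawl docker; a rough programmatic equivalent using Scrapy's standard crawler API:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # run the 'docker' spider with the project settings so that
    # DBKWARGS and ITEM_PIPELINES from settings.py are picked up
    process = CrawlerProcess(get_project_settings())
    process.crawl('docker')
    process.start()   # blocks until the crawl finishes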