liuyuqi-dellpc committed 6 years ago
commit 3f2c659c19
4 changed files with 59 additions and 6 deletions
  1. shlib/items.py (+17, -1)
  2. shlib/pipelines.py (+19, -0)
  3. shlib/settings.py (+9, -4)
  4. shlib/spiders/docker.py (+14, -1)

+ 17 - 1
shlib/items.py

@@ -6,9 +6,25 @@
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 import scrapy
+from scrapy import Field
 
 
 class ShlibItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    # book title
+    book_name = Field()
+    # book description
+    book_desc = Field()

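For orientation, a minimal sketch of how these fields are used downstream; the values are made up, and the dict-like behaviour of ShlibItem is what the pipeline below relies on:

    from shlib.items import ShlibItem

    # an item behaves like a dict once its fields are set (example values only)
    item = ShlibItem()
    item['book_name'] = u'Example title'
    item['book_desc'] = u'Example description'
    print(dict(item))   # {'book_name': 'Example title', 'book_desc': 'Example description'}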
+ 19 - 0
shlib/pipelines.py

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import MySQLdb
 
 # Define your item pipelines here
 #
@@ -7,5 +8,23 @@
 
 
 class ShlibPipeline(object):
+    # Persist the scraped data (database insert; could also write to a file, send mail, etc.)
     def process_item(self, item, spider):
+        DBKWARGS = spider.settings.get('DBKWARGS')
+        con = MySQLdb.connect(**DBKWARGS)
+        cur = con.cursor()
+        # table and column names are assumed to match the ShlibItem fields
+        sql = ("insert into book(book_name,book_desc) "
+            "values(%s,%s)")
+        lis = (item.get('book_name'), item.get('book_desc'))
+        try:
+            cur.execute(sql,lis)
+        except Exception as e:
+            print("Insert error:", e)
+            con.rollback()
+        else:
+            con.commit()
+        cur.close()
+        con.close()
         return item

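The pipeline opens a fresh MySQL connection for every item and assumes a book table already exists. A rough one-off sketch for creating that table with the DBKWARGS defined in settings.py; the table and column names are assumptions chosen to match ShlibItem, not something this commit defines:

    import MySQLdb

    from shlib.settings import DBKWARGS

    # create the assumed book table before the first crawl
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS book ("
        " id INT AUTO_INCREMENT PRIMARY KEY,"
        " book_name VARCHAR(255),"
        " book_desc TEXT"
        ") DEFAULT CHARSET=utf8"
    )
    con.commit()
    cur.close()
    con.close()

Connecting once per item also adds avoidable overhead; Scrapy's open_spider() and close_spider() pipeline hooks are the usual place to open one connection for the whole crawl and close it at the end.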
+ 9 - 4
shlib/settings.py

@@ -14,13 +14,18 @@ BOT_NAME = 'shlib'
 SPIDER_MODULES = ['shlib.spiders']
 NEWSPIDER_MODULE = 'shlib.spiders'
 
+# Database connection parameters
+DBKWARGS={'db':'ippool','user':'root', 'passwd':'toor',
+    'host':'localhost','use_unicode':True, 'charset':'utf8'}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'shlib (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
 
+LOG_FILE = "shlib.log"
+
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
@@ -64,9 +69,9 @@ ROBOTSTXT_OBEY = True
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'shlib.pipelines.ShlibPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'shlib.pipelines.ShlibPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
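A quick sanity check of the DBKWARGS above, before starting a crawl, is to open a throwaway connection with exactly those parameters (a sketch, not part of the commit):

    import MySQLdb

    from shlib.settings import DBKWARGS

    con = MySQLdb.connect(**DBKWARGS)   # raises OperationalError if the parameters are wrong
    print("MySQL server:", con.get_server_info())
    con.close()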

+ 14 - 1
shlib/spiders/docker.py

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import scrapy
+from shlib.items import ShlibItem
 
 
 class DockerSpider(scrapy.Spider):
@@ -8,4 +9,16 @@ class DockerSpider(scrapy.Spider):
     start_urls = ['http://ipac.library.sh.cn/']
 
     def parse(self, response):
-        pass
+        # follow each result link; '.question' is kept from the original selector,
+        # but the href attribute of the <a> is extracted instead of the whole element
+        for href in response.css('.question a::attr(href)').extract():
+            full_url = response.urljoin(href)
+            yield scrapy.Request(full_url, callback=self.parse_question)
+
+    def start_requests(self):
+        # explicit equivalent of Scrapy's default start_requests
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse)
+
+    def parse_question(self, response):
+        item = ShlibItem()
+        # 'h1' comes from the original code; the duplicate dict keys are collapsed into one field
+        item['book_name'] = response.css('h1::text').extract_first()
+        yield item
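With ITEM_PIPELINES enabled in settings.py, the spider runs with scrapy crawl docker; a rough programmatic equivalent using Scrapy's standard crawler API:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # run the 'docker' spider with the project settings so that
    # DBKWARGS and ITEM_PIPELINES from settings.py are picked up
    process = CrawlerProcess(get_project_settings())
    process.crawl('docker')
    process.start()   # blocks until the crawl finishes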