liuyuqi-dellpc · commit c5612a9805 · 6 years ago
5 changed files with 31 additions and 22 deletions
  1. requirements.txt (+1 −0)
  2. README.md (+5 −1)
  3. shlib/items.py (+9 −15)
  4. shlib/pipelines.py (+3 −4)
  5. shlib/spiders/docker.py (+13 −2)

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+Scrapy==1.3.3

+ 5 - 1
README.md

@@ -10,4 +10,8 @@ http://ipac.library.sh.cn/ipac20/ipac.jsp?session=1M973O24348D8.1202&profile=sl&
 web search, with results pushed by email.
 
 # Run
-scrapy crawl docker
+pip install Scrapy MySQL-python
+Configure the database connection (DBKWARGS) in settings.py; a sketch follows this diff.
+
+scrapy crawl docker
+
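
The "configure the database in settings.py" step refers to the DBKWARGS dict that ShlibPipeline reads via spider.settings.get('DBKWARGS') and passes straight to MySQLdb.connect(). A minimal sketch of that setting, assuming a local MySQL server and placeholder credentials (none of these values come from the repo):

    # settings.py -- keyword arguments forwarded to MySQLdb.connect(**DBKWARGS)
    DBKWARGS = {
        'host': 'localhost',        # assumption: MySQL runs locally
        'user': 'root',             # placeholder credentials
        'passwd': 'your_password',
        'db': 'shlib',              # assumption: database name
        'charset': 'utf8',          # titles and addresses contain Chinese text
    }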

+ 9 - 15
shlib/items.py

@@ -13,18 +13,12 @@ class ShlibItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     # book title
-    book_name=Field()
-    
-    book_desc=Field()
-    
-    book_name=Field()
-    
-    book_name=Field()
-    
-    book_name=Field()
-    
-    book_name=Field()
-    
-    book_name=Field()
-    
-    book_name=Field()
+    book_id=Field()
+    bookname=Field()
+    url=Field()
+    desc=Field()
+    address=Field()
+    booknum=Field()
+    status=Field()
+    type=Field()
+    barcode=Field()
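
For reference, a hedged sketch of how a parse callback could fill these fields; the CSS selectors are placeholders (the ipac.jsp markup is not part of this commit), only the field names come from the item above:

    from shlib.items import ShlibItem

    def build_item(response):
        """Map one catalogue detail page onto a ShlibItem (placeholder selectors)."""
        item = ShlibItem()
        item['bookname'] = response.css('h1::text').extract_first()    # placeholder selector
        item['url'] = response.url
        item['desc'] = response.css('.summary::text').extract_first()  # placeholder selector
        # book_id, address, booknum, status, type and barcode would be filled the
        # same way once the holdings-table selectors are known.
        return item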

+ 3 - 4
shlib/pipelines.py

@@ -13,10 +13,9 @@ class ShlibPipeline(object):
         DBKWARGS = spider.settings.get('DBKWARGS')
         con = MySQLdb.connect(**DBKWARGS)
         cur = con.cursor()
-        sql = ("insert into proxy(IP,PORT,TYPE,POSITION,SPEED,LAST_CHECK_TIME) "
-            "values(%s,%s,%s,%s,%s,%s)")
-        lis = (item['IP'],item['PORT'],item['TYPE'],item['POSITION'],item['SPEED'],
-            item['LAST_CHECK_TIME'])
+        sql = ("insert into bookinfo(`bookid`, `bookname`, `url`, `desc`, `address`, `booknum`, `status`, `type`, `barcode`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
+        lis = (item['book_id'],item['bookname'],item['url'],item['desc'],item['address'],
+               item['booknum'],item['status'],item['type'],item['barcode'])
         try:
             cur.execute(sql,lis)
         except Exception,e:
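
The INSERT above assumes a bookinfo table with those nine columns. The repo does not ship a schema, so the following is only a sketch of a table that would accept the statement; all column types are assumptions:

    # create_bookinfo.py -- one-off helper, reusing the same DBKWARGS dict as the pipeline
    import MySQLdb

    CREATE_BOOKINFO = """
    CREATE TABLE IF NOT EXISTS bookinfo (
        `bookid`   VARCHAR(64),
        `bookname` VARCHAR(255),
        `url`      VARCHAR(512),
        `desc`     TEXT,
        `address`  VARCHAR(255),
        `booknum`  VARCHAR(64),
        `status`   VARCHAR(64),
        `type`     VARCHAR(64),
        `barcode`  VARCHAR(64)
    )
    """

    def create_table(dbkwargs):
        """Create the bookinfo table expected by ShlibPipeline."""
        con = MySQLdb.connect(**dbkwargs)
        try:
            con.cursor().execute(CREATE_BOOKINFO)
            con.commit()
        finally:
            con.close()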

+ 13 - 2
shlib/spiders/docker.py

@@ -5,15 +5,24 @@ from boto.beanstalk.response import Response
 
 class DockerSpider(scrapy.Spider):
     name = 'docker'
+    keyword=['docker','hadoop','区块链','android','人工智能','python']
     allowed_domains = ['ipac.library.sh.cn']
-    start_urls = ['http://http://ipac.library.sh.cn/']
+    start_urls = ['http://ipac.library.sh.cn/ipac20/ipac.jsp']
 
     def parse(self, response):
         for href in response.css('.question'):
             full_url=response.urljoin(href.extract())
             yield scrapy.Request(full_url,callback=self.parse_question)
+    
     def start_requests(self):
         res=[]
+        
+        for i in range(1,10):
+            req=scrapy.Request("http://ipac.library.sh.cn/ipac20/ipac.jsp?session=F4H74295589B7.106&menu=search&aspect=basic_search&npp=10&ipp=20&profile=sl&ri=1&source=172.16.103.188%40%21shcl&index=.TW&x=0&y=0&aspect=basic_search&term=%s"%i)
+            res.append(req)
+        
+        return res
+    
     def parse_question(self,response):
         yield{
               'title':response.css('h1').extract()[0],
@@ -21,4 +30,6 @@ class DockerSpider(scrapy.Spider):
               'title':response.css('h1').extract()[0],
               'title':response.css('h1').extract()[0]
               }
-        
+
+
+
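
The commit also adds a keyword list ('docker', 'hadoop', ...) that nothing uses yet: start_requests() builds its search URLs from the loop counter i. As a hedged sketch of the apparent intent (an assumption, not code from the repo), the same catalogue search could be driven by those keywords instead. Python 2 style, matching the spider's existing syntax; the session and ri query parameters from the hard-coded URL are dropped on the assumption that the catalogue accepts a plain search request:

    import urllib

    import scrapy

    SEARCH_URL = ("http://ipac.library.sh.cn/ipac20/ipac.jsp"
                  "?menu=search&aspect=basic_search&npp=10&ipp=20"
                  "&profile=sl&index=.TW&term=%s")

    def keyword_requests(keywords, callback):
        """One request per search keyword; keywords are utf-8 byte strings in the
        source file, so urllib.quote() can percent-encode them directly."""
        for kw in keywords:
            yield scrapy.Request(SEARCH_URL % urllib.quote(kw), callback=callback)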