
Add meizitu crawler

liuyuqi-dellpc, 4 years ago, commit e1fb44bb84

+ 34 - 4
README.md

@@ -1,14 +1,44 @@
 # xiaohua-crawl
 
-1. First crawl the data. The data is stored in the data folder.
-2. Then process the data with python and download the images.
-3. Qualitative analysis with the sas tool.
+This project currently contains two sub-projects: a xiaohuar.com (campus belle) crawler and a meizitu crawler. Crawl results will be published over time and updated periodically.
+
+1. First crawl the data. The data is stored in the data folder.
+2. Then process the data with Python and download the images.
+3. Analyze the results qualitatively with SAS.
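+
+Step 2 for the xiaohua data is implemented in xiaohua/main.py, which reads data/result.csv and downloads each photo listed there.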
+
+
+## Usage
+
+xiaohua project:
+
+```
+cd /d C:/Users/dell/Desktop/xiaohua-crawl
+pip install -r requirements.txt
+python xiaohua/main.py
+```
+
+meizi project:
+
+```
+> cd /d C:/Users/dell/Desktop/xiaohua-crawl/meizi
+> sudo easy_install virtualenv
+> virtualenv venv
+> source venv/bin/activate
+> python setup.py --requires | xargs pip install
+```
+
+* meizitu: `python run.py crawl meizitu`
+* coser: `scrapy crawl coser -o items.csv -t csv`
+* WorldCosplay: `python worldcosplay.py 53056`
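+
+run.py is a thin wrapper around scrapy.cmdline.execute, so `python run.py crawl meizitu` is equivalent to running `scrapy crawl meizitu` from the meizi directory. Downloaded images are written by ImageDownloadPipeline into one subfolder per spider under IMAGES_STORE (/tmp/images by default, see fun/settings.py).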
 
 ## Results
+
 1. Xiaohua photos
 
 Crawled many photos of campus belles, each saved as university-name, e.g. 上海师范大学天华学院-周雯琳.
 
 2. Xiaohua contact directory
 
-Crawled detailed profiles of many campus belles, including name, age, measurements, school, and interests, and compiled them into an address-book format.
+Crawled detailed profiles of many campus belles, including name, age, measurements, school, and interests, and compiled them into an address-book format.

+ 0 - 0
meizi/fun/__init__.py


+ 32 - 0
meizi/fun/items.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import MapCompose, TakeFirst, Join
+
+
+class MeizituItem(scrapy.Item):
+    url = scrapy.Field()
+    name = scrapy.Field()
+    tags = scrapy.Field()
+    image_urls = scrapy.Field()
+    images = scrapy.Field()
+
+
+class CoserItem(scrapy.Item):
+    url = scrapy.Field()
+    name = scrapy.Field()
+    info = scrapy.Field()
+    image_urls = scrapy.Field()
+    images = scrapy.Field()
+
+
+class MyItemLoader(ItemLoader):
+    # strip whitespace on input, keep only the first extracted value on output
+    default_input_processor = MapCompose(lambda s: s.strip())
+    default_output_processor = TakeFirst()
+    # output processor for a 'description' field, which neither item above defines
+    description_out = Join()

+ 52 - 0
meizi/fun/pipelines.py

@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import requests
+from fun import settings
+import os
+
+
+class ImageDownloadPipeline(object):
+    def process_item(self, item, spider):
+        if 'image_urls' in item:
+            images = []
+            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
+
+            request_data = {'allow_redirects': False,
+             'auth': None,
+             'cert': None,
+             'data': {},
+             'files': {},
+             'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
+             'method': 'get',
+             'params': {},
+             'proxies': {},
+             'stream': True,
+             'timeout': 30,
+             'url': '',
+             'verify': True}
+
+            if not os.path.exists(dir_path):
+                os.makedirs(dir_path)
+            for image_url in item['image_urls']:
+                request_data['url'] = image_url
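+                # derive the file name from the URL path segments (scheme and host dropped)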
+                us = image_url.split('/')[3:]
+                image_file_name = '_'.join(us)
+                file_path = '%s/%s' % (dir_path, image_file_name)
+                images.append(file_path)
+                if os.path.exists(file_path):
+                    continue
+
+                with open(file_path, 'wb') as handle:
+                    response = requests.request(**request_data)
+                    for block in response.iter_content(1024):
+                        if not block:
+                            break
+
+                        handle.write(block)
+
+            item['images'] = images
+        return item
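
A minimal sketch of driving this pipeline by hand, outside a Scrapy crawl (assumes the fun package is importable; the spider stub and image URL are placeholders):

```python
# sketch: exercise ImageDownloadPipeline without a running spider
from types import SimpleNamespace
from fun.pipelines import ImageDownloadPipeline

spider = SimpleNamespace(name='meizitu')                 # stub: the pipeline only reads spider.name
item = {'image_urls': ['http://example.com/pic/1.jpg']}  # placeholder URL

pipeline = ImageDownloadPipeline()
item = pipeline.process_item(item, spider)
print(item['images'])  # local file paths recorded by the pipeline
```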

+ 23 - 0
meizi/fun/settings.py

@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for fun project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'fun'
+
+SPIDER_MODULES = ['fun.spiders']
+NEWSPIDER_MODULE = 'fun.spiders'
+
+ITEM_PIPELINES = {'fun.pipelines.ImageDownloadPipeline': 1}
+
+IMAGES_STORE = '/tmp/images'
+
+
+DOWNLOAD_DELAY = 0.25    # 250 ms of delay
+
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

+ 4 - 0
meizi/fun/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 33 - 0
meizi/fun/spiders/coser.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+from scrapy.selector import Selector
+import scrapy
+from scrapy.loader import ItemLoader
+from fun.items import CoserItem
+
+
+class CoserSpider(scrapy.Spider):
+    name = "coser"
+    allowed_domains = ["bcy.net"]
+    start_urls = (
+        'http://bcy.net/cn125101',
+        'http://bcy.net/cn126487',
+        'http://bcy.net/cn126173'
+    )
+
+    def parse(self, response):
+        sel = Selector(response)
+
+        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
+            link = 'http://bcy.net%s' % link
+            request = scrapy.Request(link, callback=self.parse_item)
+            yield request
+
+    def parse_item(self, response):
+        l = ItemLoader(item=CoserItem(), response=response)
+        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
+        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
+        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
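+        # strip the '/w650' width suffix to request the full-size image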
+        urls = [url.replace('/w650', '') for url in urls]
+        l.add_value('image_urls', urls)
+        l.add_value('url', response.url)
+        return l.load_item()

+ 22 - 0
meizi/fun/spiders/coser2.py

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.loader import ItemLoader
+from fun.items import CoserItem
+
+
+class CoserSpider(scrapy.Spider):
+    name = "coser2"
+    allowed_domains = ["bcy.net"]
+    start_urls = (
+        'http://bcy.net/coser/detail/9495/130440',
+    )
+
+    def parse(self, response):
+        l = ItemLoader(item=CoserItem(), response=response)
+        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
+        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
+        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
+        urls = [url.replace('/w650', '') for url in urls]
+        l.add_value('image_urls', urls)
+        l.add_value('url', response.url)
+        return l.load_item()

+ 36 - 0
meizi/fun/spiders/meizitu.py

@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+from scrapy.selector import Selector
+import scrapy
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import Identity
+from fun.items import MeizituItem
+
+
+class MeizituSpider(scrapy.Spider):
+    name = "meizitu"
+    allowed_domains = ["meizitu.com"]
+    start_urls = (
+        'http://www.meizitu.com/',
+    )
+
+    def parse(self, response):
+        sel = Selector(response)
+        for link in sel.xpath('//h2/a/@href').extract():
+            request = scrapy.Request(link, callback=self.parse_item)
+            yield request
+
+        pages = sel.xpath("//div[@class='navigation']/div[@id='wp_page_numbers']/ul/li/a/@href").extract()
+        print('pages: %s' % pages)
+        if len(pages) > 2:
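+            # pages[-2] is taken to be the next-page link in the pager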
+            page_link = pages[-2]
+            page_link = page_link.replace('/a/', '')
+            request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
+            yield request
+
+    def parse_item(self, response):
+        l = ItemLoader(item=MeizituItem(), response=response)
+        l.add_xpath('name', '//h2/a/text()')
+        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
+        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
+
+        l.add_value('url', response.url)
+        return l.load_item()

+ 0 - 0
meizi/fun/test.py


+ 4 - 0
meizi/run.py

@@ -0,0 +1,4 @@
+#!/usr/bin/python
+
+from scrapy.cmdline import execute
+execute()

+ 11 - 0
meizi/scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = fun.settings
+
+[deploy]
+url = http://localhost:6800/
+project = fun

+ 10 - 0
meizi/setup.py

@@ -0,0 +1,10 @@
+# Automatically created by: scrapy deploy
+
+from setuptools import setup, find_packages
+
+setup(
+    name='fun_crawler',
+    version='1.0',
+    packages=find_packages(),
+    entry_points={'scrapy': ['settings = fun.settings']}, requires=['requests', 'scrapy']
+)

+ 54 - 0
meizi/worldcosplay.py

@@ -0,0 +1,54 @@
+# coding=utf-8
+import json
+import os
+from sys import argv
+from urllib import request
+
+
+def main(member_id, page=1, index=0):
+    url = 'http://worldcosplay.net/en/api/member/photos?member_id=%s&page=%s&limit=100000&rows=16&p3_photo_list=1' % (member_id, page)
+    r = request.urlopen(url)
+
+    if r.getcode() == 200:
+        data = json.loads(r.read())
+        if data['has_error'] != 0:
+            print('The API returned an error')
+            exit(1)
+
+        photo_data_list = data['list']
+        if not photo_data_list:
+            print('No more photos. Stopped at page %s after downloading %s images' % (page, index))
+            exit(0)
+        for photo_data in photo_data_list:
+            url = photo_data['photo']['sq300_url']
+            subject = photo_data['photo']['subject']
+            url = url.replace('/sq300', '')      # drop the thumbnail segment to get the full-size image
+            subject = subject.replace('/', '_')  # '/' is not allowed in file names
+
+            if not os.path.exists(member_id):
+                os.makedirs(member_id)
+
+            filename = '%s/%s_%s_%s.jpg' % (member_id, member_id, index, subject)
+            try:
+                request.urlretrieve(url=url, filename=filename)
+                print('Downloaded %s images' % (index + 1))
+                index += 1
+            except Exception:
+                print('Failed to download this image: %s' % url)
+
+        page += 1
+        main(member_id, page=page, index=index)
+
+    else:
+        print('Request failed')
+        exit(1)
+
+
+if __name__ == '__main__':
+    if len(argv) < 2:
+        print('Please pass a coser ID, e.g. 53056')
+        exit(1)
+    member_id = argv[1]
+    main(member_id)

+ 0 - 0
requirements.txt


+ 37 - 32
xiaohua/main.py

@@ -1,4 +1,4 @@
-#coding=utf-8
+# coding=utf-8
 '''
 Created on 2017-07-15
 @version: python3.6
@@ -10,46 +10,51 @@ from time import sleep
 import random
 from urllib import request
 
-project_dir="C:/Users/dell/Desktop/xiaohua-crawl"
-img_dir=project_dir+"/images"
-data_dir=project_dir+"/data"
-
-def downloadImg(imgUrl,fileName):
-    try:  
-        headers = { 
-        'User-Agent' : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
-        'Referer':'http://www.xiaohuar.com'
-             }
-        req=request.Request(url=imgUrl)
+project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
+img_dir = project_dir+"/images"
+data_dir = project_dir+"/data"
+
+
+def downloadImg(imgUrl, fileName):
+    try:
+        headers = {
+            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
+            'Referer': 'http://www.xiaohuar.com'
+        }
+        req = request.Request(url=imgUrl)
         for i in headers:
-            req.add_header(i,headers[i])
-        res=request.urlopen(req)
+            req.add_header(i, headers[i])
+        res = request.urlopen(req)
         with open(img_dir+"/"+fileName+imgUrl[-4:], "wb") as code:
             code.write(res.read())
 #         sleep(random.randint(1,5))
-    except Exception as err:  
-        print(err)  
-    finally:  
-        print("pic:"+ fileName+".jpg")
+    except Exception as err:
+        print(err)
+    finally:
+        print("pic:" + fileName+".jpg")
+
 
 def __init__():
-    if(os.path.exists(img_dir)!=True):
-                os.mkdir(img_dir)
-    if(os.path.exists(data_dir)!=True):
-                os.mkdir(data_dir)
+    if not os.path.exists(img_dir):
+        os.mkdir(img_dir)
+    if not os.path.exists(data_dir):
+        os.mkdir(data_dir)
+
 
 def main():
-    file=data_dir+"/result.csv"
+    file = data_dir+"/result.csv"
     with open(file, 'r') as f:
-#         data=csv.reader(f, csv.excel_tab)
-        data=csv.reader(f)
+        # data = csv.reader(f, csv.excel_tab)
+        data = csv.reader(f)
         for row in data:
-            imgUrl=""
-            fileName=""
+            imgUrl = ""
+            fileName = ""
             for i in range(len(row)):
-                fileName=row[4]+"-"+row[3]
-                imgUrl="http://www.xiaohuar.com"+row[2]
-            
-            downloadImg(imgUrl,fileName)
+                fileName = row[4]+"-"+row[3]
+                imgUrl = "http://www.xiaohuar.com"+row[2]
+
+            downloadImg(imgUrl, fileName)
+
+
 __init__()
-main()
+main()