
Add meizitu crawler

liuyuqi-dellpc, 4 years ago, commit e1fb44bb84

+ 34 - 4
README.md

@@ -1,14 +1,44 @@
 # xiaohua-crawl
 
-1. First crawl the data. The data is stored in the data folder.
-2. Then process the data with python and download the images.
-3. Qualitative analysis with the sas tool.
+This project currently contains two sub-projects: a xiaohuar.com (campus belle) crawler and a meizitu crawler. Crawl results will be published over time and updated periodically.
+
+1. First crawl the data. The data is stored in the data folder.
+2. Then process the data with Python and download the images.
+3. Analyze the results qualitatively with SAS.
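+
+Step 2 for the xiaohua data is implemented in xiaohua/main.py, which reads data/result.csv and downloads each photo listed there.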
+
+
+## Usage
+
+xiaohua project:
+
+```
+cd /d C:/Users/dell/Desktop/xiaohua-crawl
+pip install -r requirements.txt
+python xiaohua/main.py
+```
+
+meizi project:
+
+```
+> cd /d C:/Users/dell/Desktop/xiaohua-crawl/meizi
+> sudo easy_install virtualenv
+> virtualenv venv
+> source venv/bin/activate
+> python setup.py --requires | xargs pip install
+```
+
+* meizitu: `python run.py crawl meizitu`
+* coser: `scrapy crawl coser -o items.csv -t csv`
+* WorldCosplay: `python worldcosplay.py 53056`
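+
+run.py is a thin wrapper around scrapy.cmdline.execute, so `python run.py crawl meizitu` is equivalent to running `scrapy crawl meizitu` from the meizi directory. Downloaded images are written by ImageDownloadPipeline into one subfolder per spider under IMAGES_STORE (/tmp/images by default, see fun/settings.py).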
 
 ## Results
+
 1. Xiaohua photos
 
 Crawled many photos of campus belles, each saved as university-name, e.g. 上海师范大学天华学院-周雯琳.
 
 2. Xiaohua contact directory
 
-Crawled detailed profiles of many campus belles, including name, age, measurements, school, and interests, and compiled them into an address-book format.
+Crawled detailed profiles of many campus belles, including name, age, measurements, school, and interests, and compiled them into an address-book format.

+ 0 - 0
meizi/fun/__init__.py


+ 32 - 0
meizi/fun/items.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import MapCompose, TakeFirst, Join
+
+
+class MeizituItem(scrapy.Item):
+    url = scrapy.Field()
+    name = scrapy.Field()
+    tags = scrapy.Field()
+    image_urls = scrapy.Field()
+    images = scrapy.Field()
+
+
+class CoserItem(scrapy.Item):
+    url = scrapy.Field()
+    name = scrapy.Field()
+    info = scrapy.Field()
+    image_urls = scrapy.Field()
+    images = scrapy.Field()
+
+
+class MyItemLoader(ItemLoader):
+    # strip whitespace on input, keep only the first extracted value on output
+    default_input_processor = MapCompose(lambda s: s.strip())
+    default_output_processor = TakeFirst()
+    # output processor for a 'description' field, which neither item above defines
+    description_out = Join()

+ 52 - 0
meizi/fun/pipelines.py

@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import requests
+from fun import settings
+import os
+
+
+class ImageDownloadPipeline(object):
+    def process_item(self, item, spider):
+        if 'image_urls' in item:
+            images = []
+            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
+
+            request_data = {'allow_redirects': False,
+             'auth': None,
+             'cert': None,
+             'data': {},
+             'files': {},
+             'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
+             'method': 'get',
+             'params': {},
+             'proxies': {},
+             'stream': True,
+             'timeout': 30,
+             'url': '',
+             'verify': True}
+
+            if not os.path.exists(dir_path):
+                os.makedirs(dir_path)
+            for image_url in item['image_urls']:
+                request_data['url'] = image_url
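+                # derive the file name from the URL path segments (scheme and host dropped)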
+                us = image_url.split('/')[3:]
+                image_file_name = '_'.join(us)
+                file_path = '%s/%s' % (dir_path, image_file_name)
+                images.append(file_path)
+                if os.path.exists(file_path):
+                    continue
+
+                with open(file_path, 'wb') as handle:
+                    response = requests.request(**request_data)
+                    for block in response.iter_content(1024):
+                        if not block:
+                            break
+
+                        handle.write(block)
+
+            item['images'] = images
+        return item
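
A minimal sketch of driving this pipeline by hand, outside a Scrapy crawl (assumes the fun package is importable; the spider stub and image URL are placeholders):

```python
# sketch: exercise ImageDownloadPipeline without a running spider
from types import SimpleNamespace
from fun.pipelines import ImageDownloadPipeline

spider = SimpleNamespace(name='meizitu')                 # stub: the pipeline only reads spider.name
item = {'image_urls': ['http://example.com/pic/1.jpg']}  # placeholder URL

pipeline = ImageDownloadPipeline()
item = pipeline.process_item(item, spider)
print(item['images'])  # local file paths recorded by the pipeline
```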

+ 23 - 0
meizi/fun/settings.py

@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for fun project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'fun'
+
+SPIDER_MODULES = ['fun.spiders']
+NEWSPIDER_MODULE = 'fun.spiders'
+
+ITEM_PIPELINES = {'fun.pipelines.ImageDownloadPipeline': 1}
+
+IMAGES_STORE = '/tmp/images'
+
+
+DOWNLOAD_DELAY = 0.25    # 250 ms of delay
+
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

+ 4 - 0
meizi/fun/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 33 - 0
meizi/fun/spiders/coser.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+from scrapy.selector import Selector
+import scrapy
+from scrapy.loader import ItemLoader
+from fun.items import CoserItem
+
+
+class CoserSpider(scrapy.Spider):
+    name = "coser"
+    allowed_domains = ["bcy.net"]
+    start_urls = (
+        'http://bcy.net/cn125101',
+        'http://bcy.net/cn126487',
+        'http://bcy.net/cn126173'
+    )
+
+    def parse(self, response):
+        sel = Selector(response)
+
+        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
+            link = 'http://bcy.net%s' % link
+            request = scrapy.Request(link, callback=self.parse_item)
+            yield request
+
+    def parse_item(self, response):
+        l = ItemLoader(item=CoserItem(), response=response)
+        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
+        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
+        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
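+        # strip the '/w650' width suffix to request the full-size image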
+        urls = [url.replace('/w650', '') for url in urls]
+        l.add_value('image_urls', urls)
+        l.add_value('url', response.url)
+        return l.load_item()

+ 22 - 0
meizi/fun/spiders/coser2.py

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.loader import ItemLoader
+from fun.items import CoserItem
+
+
+class CoserSpider(scrapy.Spider):
+    name = "coser2"
+    allowed_domains = ["bcy.net"]
+    start_urls = (
+        'http://bcy.net/coser/detail/9495/130440',
+    )
+
+    def parse(self, response):
+        l = ItemLoader(item=CoserItem(), response=response)
+        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
+        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
+        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
+        urls = [url.replace('/w650', '') for url in urls]
+        l.add_value('image_urls', urls)
+        l.add_value('url', response.url)
+        return l.load_item()

+ 36 - 0
meizi/fun/spiders/meizitu.py

@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+from scrapy.selector import Selector
+import scrapy
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import Identity
+from fun.items import MeizituItem
+
+
+class MeizituSpider(scrapy.Spider):
+    name = "meizitu"
+    allowed_domains = ["meizitu.com"]
+    start_urls = (
+        'http://www.meizitu.com/',
+    )
+
+    def parse(self, response):
+        sel = Selector(response)
+        for link in sel.xpath('//h2/a/@href').extract():
+            request = scrapy.Request(link, callback=self.parse_item)
+            yield request
+
+        pages = sel.xpath("//div[@class='navigation']/div[@id='wp_page_numbers']/ul/li/a/@href").extract()
+        print('pages: %s' % pages)
+        if len(pages) > 2:
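+            # pages[-2] is taken to be the next-page link in the pager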
+            page_link = pages[-2]
+            page_link = page_link.replace('/a/', '')
+            request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
+            yield request
+
+    def parse_item(self, response):
+        l = ItemLoader(item=MeizituItem(), response=response)
+        l.add_xpath('name', '//h2/a/text()')
+        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
+        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
+
+        l.add_value('url', response.url)
+        return l.load_item()

+ 0 - 0
meizi/fun/test.py


+ 4 - 0
meizi/run.py

@@ -0,0 +1,4 @@
+#!/usr/bin/python
+
+from scrapy.cmdline import execute
+execute()

+ 11 - 0
meizi/scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = fun.settings
+
+[deploy]
+url = http://localhost:6800/
+project = fun

+ 10 - 0
meizi/setup.py

@@ -0,0 +1,10 @@
+# Automatically created by: scrapy deploy
+
+from setuptools import setup, find_packages
+
+setup(
+    name='fun_crawler',
+    version='1.0',
+    packages=find_packages(),
+    entry_points={'scrapy': ['settings = fun.settings']}, requires=['requests', 'scrapy']
+)

+ 54 - 0
meizi/worldcosplay.py

@@ -0,0 +1,54 @@
+# coding=utf-8
+import json
+import os
+from sys import argv
+from urllib import request
+
+
+def main(member_id, page=1, index=0):
+    url = 'http://worldcosplay.net/en/api/member/photos?member_id=%s&page=%s&limit=100000&rows=16&p3_photo_list=1' % (member_id, page)
+    r = request.urlopen(url)
+
+    if r.getcode() == 200:
+        data = json.loads(r.read())
+        if data['has_error'] != 0:
+            print('The API returned an error')
+            exit(1)
+
+        photo_data_list = data['list']
+        if not photo_data_list:
+            print('No more photos. Stopped at page %s after downloading %s images' % (page, index))
+            exit(0)
+        for photo_data in photo_data_list:
+            url = photo_data['photo']['sq300_url']
+            subject = photo_data['photo']['subject']
+            url = url.replace('/sq300', '')      # drop the thumbnail segment to get the full-size image
+            subject = subject.replace('/', '_')  # '/' is not allowed in file names
+
+            if not os.path.exists(member_id):
+                os.makedirs(member_id)
+
+            filename = '%s/%s_%s_%s.jpg' % (member_id, member_id, index, subject)
+            try:
+                request.urlretrieve(url=url, filename=filename)
+                print('Downloaded %s images' % (index + 1))
+                index += 1
+            except Exception:
+                print('Failed to download this image: %s' % url)
+
+        page += 1
+        main(member_id, page=page, index=index)
+
+    else:
+        print('Request failed')
+        exit(1)
+
+
+if __name__ == '__main__':
+    if len(argv) < 2:
+        print('Please pass a coser ID, e.g. 53056')
+        exit(1)
+    member_id = argv[1]
+    main(member_id)

+ 0 - 0
requirements.txt


+ 37 - 32
xiaohua/main.py

@@ -1,4 +1,4 @@
-#coding=utf-8
+# coding=utf-8
 '''
 Created on 2017-07-15
 @version: python3.6
@@ -10,46 +10,51 @@ from time import sleep
 import random
 from urllib import request
 
-project_dir="C:/Users/dell/Desktop/xiaohua-crawl"
-img_dir=project_dir+"/images"
-data_dir=project_dir+"/data"
-
-def downloadImg(imgUrl,fileName):
-    try:  
-        headers = { 
-        'User-Agent' : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
-        'Referer':'http://www.xiaohuar.com'
-             }
-        req=request.Request(url=imgUrl)
+project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
+img_dir = project_dir+"/images"
+data_dir = project_dir+"/data"
+
+
+def downloadImg(imgUrl, fileName):
+    try:
+        headers = {
+            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
+            'Referer': 'http://www.xiaohuar.com'
+        }
+        req = request.Request(url=imgUrl)
         for i in headers:
-            req.add_header(i,headers[i])
-        res=request.urlopen(req)
+            req.add_header(i, headers[i])
+        res = request.urlopen(req)
         with open(img_dir+"/"+fileName+imgUrl[-4:], "wb") as code:
             code.write(res.read())
 #         sleep(random.randint(1,5))
-    except Exception as err:  
-        print(err)  
-    finally:  
-        print("pic:"+ fileName+".jpg")
+    except Exception as err:
+        print(err)
+    finally:
+        print("pic:" + fileName+".jpg")
+
 
 def __init__():
-    if(os.path.exists(img_dir)!=True):
-                os.mkdir(img_dir)
-    if(os.path.exists(data_dir)!=True):
-                os.mkdir(data_dir)
+    if not os.path.exists(img_dir):
+        os.mkdir(img_dir)
+    if not os.path.exists(data_dir):
+        os.mkdir(data_dir)
+
 
 def main():
-    file=data_dir+"/result.csv"
+    file = data_dir+"/result.csv"
     with open(file, 'r') as f:
-#         data=csv.reader(f, csv.excel_tab)
-        data=csv.reader(f)
+        # data = csv.reader(f, csv.excel_tab)
+        data = csv.reader(f)
         for row in data:
-            imgUrl=""
-            fileName=""
+            imgUrl = ""
+            fileName = ""
             for i in range(len(row)):
-                fileName=row[4]+"-"+row[3]
-                imgUrl="http://www.xiaohuar.com"+row[2]
-            
-            downloadImg(imgUrl,fileName)
+                fileName = row[4]+"-"+row[3]
+                imgUrl = "http://www.xiaohuar.com"+row[2]
+
+            downloadImg(imgUrl, fileName)
+
+
 __init__()
-main()
+main()