# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests

from fun import settings


class ImageDownloadPipeline(object):

    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            # Store downloads under IMAGES_STORE/<spider name>/.
            dir_path = os.path.join(settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Base keyword arguments for requests.request(); 'url' is
            # filled in per image. stream=True downloads each file in
            # chunks instead of reading the whole body into memory.
            request_data = {
                'allow_redirects': False,
                'auth': None,
                'cert': None,
                'data': {},
                'files': {},
                'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
                'method': 'get',
                'params': {},
                'proxies': {},
                'stream': True,
                'timeout': 30,
                'url': '',
                'verify': True,
            }
            for image_url in item['image_urls']:
                request_data['url'] = image_url
                # Build a flat file name from the URL path, e.g.
                # http://host/a/b/c.jpg -> a_b_c.jpg.
                path_parts = image_url.split('/')[3:]
                image_file_name = '_'.join(path_parts)
                file_path = os.path.join(dir_path, image_file_name)
                images.append(file_path)
                # Skip files that were already downloaded.
                if os.path.exists(file_path):
                    continue
                # Fetch before opening the file so a failed request does
                # not leave an empty file behind.
                with requests.request(**request_data) as response:
                    response.raise_for_status()
                    with open(file_path, 'wb') as handle:
                        for block in response.iter_content(1024):
                            if not block:
                                break
                            handle.write(block)
            item['images'] = images
        return item
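
# The header comment says to register this pipeline in ITEM_PIPELINES.
# A minimal sketch of that registration, assuming the project package
# is `fun` (taken from the import above) and this file lives at
# fun/pipelines.py:
#
#     # fun/settings.py
#     ITEM_PIPELINES = {
#         'fun.pipelines.ImageDownloadPipeline': 300,  # 0-1000; lower runs first
#     }
#
#     # Directory the pipeline reads as settings.IMAGES_STORE; the
#     # path here is a placeholder.
#     IMAGES_STORE = '/tmp/images'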
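#
# The pipeline expects items that carry an `image_urls` field and it
# fills in `images`. A hypothetical item definition with those two
# fields (the class name FunItem is an assumption, not from the source):
#
#     # fun/items.py
#     import scrapy
#
#     class FunItem(scrapy.Item):
#         image_urls = scrapy.Field()  # set by the spider
#         images = scrapy.Field()      # filled in by ImageDownloadPipeline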