pipelines.py

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
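
# A minimal sketch of the settings.py entries this pipeline relies on. The
# dotted pipeline path assumes the Scrapy project package is named `fun`
# (as the `from fun import settings` import below suggests); the priority
# 300 and the store path are placeholders:
#
#   ITEM_PIPELINES = {
#       'fun.pipelines.ImageDownloadPipeline': 300,
#   }
#   IMAGES_STORE = '/path/to/images'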
import os

import requests

from fun import settings


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            # One sub-directory per spider under the configured image store.
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            # Keyword arguments for requests.request(); 'url' is filled in
            # per image below, and stream=True lets us download in chunks.
            request_data = {'allow_redirects': False,
                            'auth': None,
                            'cert': None,
                            'data': {},
                            'files': {},
                            'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
                            'method': 'get',
                            'params': {},
                            'proxies': {},
                            'stream': True,
                            'timeout': 30,
                            'url': '',
                            'verify': True}
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                request_data['url'] = image_url
                # Build a file name from the URL path components, e.g.
                # http://host/a/b.jpg -> a_b.jpg
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue  # already downloaded on a previous run
                # Issue the request before opening the file so a failed
                # request does not leave an empty file behind (which the
                # exists() check above would then skip forever).
                response = requests.request(**request_data)
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            item['images'] = images
        return item
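
# For reference, a minimal sketch of the item this pipeline expects: it
# reads `image_urls` and writes `images`. The class name is hypothetical;
# the project's actual items.py is not shown here.
#
#   import scrapy
#
#   class FunItem(scrapy.Item):
#       image_urls = scrapy.Field()
#       images = scrapy.Field()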