emotions.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2024/08/21 18:45:54
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : meme stickers (表情包)
'''
import os

import bs4
import httpx
from concurrent.futures import ThreadPoolExecutor


class Emotions(object):
    """Crawl meme sticker images from fabiaoqing.com."""

    _url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
    }

    def __init__(self, params: dict, debug=False):
        self.params = params
        self.sess = httpx.Client(headers=self.header)
        self.app_path = params["app_path"]
        # Downloaded images are saved under <app_path>/data/.
        self.data_path = os.path.join(self.app_path, 'data')
        os.makedirs(self.data_path, exist_ok=True)
        self.pool = ThreadPoolExecutor(params['threads'])

    def run(self):
        """Submit one download task per listing page, inclusive of page_end."""
        page_start = self.params["page_start"]
        page_end = self.params["page_end"]
        for i in range(page_start, page_end + 1):
            url = self._url.format(page=i)
            self.pool.submit(self.get_page, url)

    def get_page(self, url):
        """Fetch one listing page and download every lazy-loaded image on it."""
        response = self.sess.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        img_list = soup.find_all('img', class_='ui image lazy')
        for img in img_list:
            # The real image URL is in data-original (the page lazy-loads
            # images); the title attribute doubles as the file name.
            image = img.get('data-original')
            title = img.get('title')
            if not image or not title:
                continue
            print('Downloading image:', title)
            file_path = os.path.join(
                self.data_path, title + os.path.splitext(image)[-1])
            try:
                with open(file_path, 'wb') as f:
                    f.write(self.sess.get(image).content)
            except OSError:
                # Titles can contain characters that are illegal in file
                # names (or be over the length limit); skip such images.
                print('invalid file name, skipped:', title)
                continue
        print('Page finished:', url)

    def __del__(self):
        # Wait for all queued downloads to complete before tearing down.
        self.pool.shutdown(wait=True)
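

# --- Usage sketch (not part of the original file) ---
# A minimal, hypothetical entry point showing how Emotions would be driven.
# The param keys (app_path, threads, page_start, page_end) are the ones read
# in __init__/run above; the concrete values here are assumptions.
if __name__ == '__main__':
    params = {
        'app_path': os.getcwd(),  # images land in <app_path>/data/
        'threads': 8,             # worker threads in the pool
        'page_start': 1,          # first listing page to crawl
        'page_end': 5,            # last listing page (inclusive)
    }
    Emotions(params).run()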