12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2024/08/21 18:45:54
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    : 表情包 (emotion-image crawler)
'''
import os
import re
# import requests
import httpx
import bs4
from concurrent.futures import ThreadPoolExecutor
class Emotions(object):
    """Crawl emotion images ("表情包") from fabiaoqing.com and save them under <app_path>/data.

    Expected ``params`` keys: ``app_path`` (base directory), ``threads``
    (worker count), ``page_start`` and ``page_end`` (inclusive page range).
    """

    # Listing-page URL template; {page} is the 1-based page number.
    _url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
    }

    def __init__(self, params: dict, debug: bool = False):
        """Create the HTTP session, the output directory and the worker pool.

        :param params: configuration dict (see class docstring for keys)
        :param debug:  reserved debug flag, stored but otherwise unused
        """
        self.params = params
        self.debug = debug
        self.sess = httpx.Client(headers=self.header)
        self.app_path = params["app_path"]
        self.data_path = os.path.join(self.app_path, 'data')
        # makedirs(exist_ok=True) is idempotent — the original os.mkdir
        # raised FileExistsError on a second run and raced between threads.
        os.makedirs(self.data_path, exist_ok=True)
        self.pool = ThreadPoolExecutor(params['threads'])

    @staticmethod
    def _safe_filename(name: str) -> str:
        """Return *name* with filesystem-invalid characters collapsed to '_'.

        Image titles routinely contain '/', ':', '?', '*' etc., which made
        open() raise OSError and (previously) abort the whole page.
        """
        cleaned = re.sub(r'[\\/:*?"<>|\r\n]+', '_', name).strip()
        return cleaned or 'untitled'

    def run(self):
        """Submit one download task per listing page in [page_start, page_end]."""
        page_start = self.params["page_start"]
        page_end = self.params["page_end"]
        for i in range(page_start, page_end + 1):
            url = self._url.format(page=i)
            self.pool.submit(self.get_page, url)

    def get_page(self, url):
        """Download every emotion image found on one listing page.

        :param url: fully formatted listing-page URL
        """
        response = self.sess.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        img_list = soup.find_all('img', class_='ui image lazy')
        for img in img_list:
            image = img.get('data-original')
            title = img.get('title')
            if not image or not title:
                # Placeholder <img> tags may lack a real source or title;
                # without this guard os.path.splitext(None) raises TypeError.
                continue
            print('下载图片: ', title)
            filename = self._safe_filename(title) + os.path.splitext(image)[-1]
            try:
                # Fetch first so a download error never leaves an empty file;
                # a fresh local name also avoids shadowing the loop variable.
                content = self.sess.get(image).content
                with open(os.path.join(self.data_path, filename), 'wb') as f:
                    f.write(content)
            except OSError:
                # One bad file must not abort the remaining images on the
                # page (the original `break` dropped everything after it).
                print('length failed')
                continue
        print('下载完毕: ', url)

    def __del__(self):
        """Best-effort cleanup: drain the worker pool, then close the HTTP client."""
        try:
            self.pool.shutdown(wait=True)
            self.sess.close()  # the client leaked its connection pool before
        except Exception:
            # Never raise from a destructor during interpreter shutdown.
            pass
|