#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2024/08/21 18:45:54
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    : Emoticon (sticker) pack crawler
'''
import os

import bs4
import httpx
from concurrent.futures import ThreadPoolExecutor


class Emotions(object):
    """Crawl emoticon images from fabiaoqing.com."""

    _url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
    }

    def __init__(self, params: dict, debug=False):
        self.params = params
        self.sess = httpx.Client(headers=self.header)
        self.app_path = params["app_path"]
        # Create the output directory if it does not exist yet.
        self.data_path = os.path.join(self.app_path, 'data')
        os.makedirs(self.data_path, exist_ok=True)
        self.pool = ThreadPoolExecutor(params['threads'])

    def run(self):
        """Submit one crawl task per list page to the thread pool."""
        page_start = self.params["page_start"]
        page_end = self.params["page_end"]
        for i in range(page_start, page_end + 1):
            url = self._url.format(page=i)
            self.pool.submit(self.get_page, url)

    def get_page(self, url):
        """Parse one list page and download every image found on it."""
        response = self.sess.get(url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        img_list = soup.find_all('img', class_='ui image lazy')
        for img in img_list:
            image = img.get('data-original')
            title = img.get('title')
            if not image or not title:
                continue
            print('Downloading image:', title)
            file_path = os.path.join(self.data_path, title + os.path.splitext(image)[-1])
            try:
                content = self.sess.get(image).content
                with open(file_path, 'wb') as f:
                    f.write(content)
            except OSError:
                # Skip images whose title yields an invalid or over-long filename.
                print('Failed to save:', title)
                continue
        print('Finished page:', url)

    def __del__(self):
        # Wait for all pending download tasks before the object is destroyed.
        self.pool.shutdown(wait=True)
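

# --- Usage sketch (illustrative, not part of the original script) ---
# A minimal example of driving the crawler. The params keys ("app_path",
# "threads", "page_start", "page_end") are inferred from the constructor
# above; the concrete values here are placeholders and assumptions.
if __name__ == '__main__':
    params = {
        "app_path": ".",   # assumed: directory under which data/ is created
        "threads": 4,      # assumed: size of the download thread pool
        "page_start": 1,   # assumed: first list page to crawl
        "page_end": 2,     # assumed: last list page to crawl (inclusive)
    }
    Emotions(params).run()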