liuyuqi-dellpc committed cebc329cbd 4 months ago
5 changed files with 73 additions and 0 deletions
  1. README.md (+8 -0)
  2. crawl_emotions/__init__.py (+1 -0)
  3. crawl_emotions/emotions.py (+54 -0)
  4. main.py (+6 -0)
  5. requirements.txt (+4 -0)

README.md (+8 -0)

@@ -0,0 +1,8 @@
+# crawl_emotions
+
+A crawler for downloading emoticon (表情包) images.
+
+## License
+
+
+## Reference

crawl_emotions/__init__.py (+1 -0)

@@ -0,0 +1 @@
+from .emotions import Emotions

crawl_emotions/emotions.py (+54 -0)

@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/08/21 18:45:54
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   emoticon (表情包) image crawler
+'''
+import os
+import bs4
+import httpx
+from concurrent.futures import ThreadPoolExecutor
+
+class Emotions(object):
+    """Crawl emoticon images from fabiaoqing.com, one thread per listing page."""
+    _url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
+    header = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
+    }
+
+    def __init__(self):
+        # httpx has no Session class; the requests.Session equivalent is httpx.Client
+        self.sess = httpx.Client()
+        self.sess.headers.update(self.header)
+        self.path = os.path.dirname(os.path.abspath(__file__))
+        if not os.path.exists(self.path + '/data'):
+            os.mkdir(self.path + '/data')
+        self.data_path = self.path + '/data/'
+        self.pool = ThreadPoolExecutor(10)
+
+    def run(self):
+        for i in range(1, 4328 + 1):
+            url = self._url.format(page=i)
+            self.pool.submit(self.get_page, url)
+
+    def get_page(self, url):
+        response = self.sess.get(url)
+        soup = bs4.BeautifulSoup(response.text, 'lxml')
+        img_list = soup.find_all('img', class_='ui image lazy')
+        for img in img_list:
+            image = img.get('data-original')
+            title = img.get('title')
+            print('downloading image: ', title)
+            try:
+                with open(self.data_path + title + os.path.splitext(image)[-1], 'wb') as f:
+                    # download via the shared client; the original called the commented-out requests module
+                    f.write(self.sess.get(image).content)
+            except OSError:
+                print('failed to save: ', title)
+                break
+        print('page finished: ', url)
+
+    def __del__(self):
+        self.pool.shutdown(wait=True)

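The save path in `get_page` is built directly from the scraped `title`, and titles on the listing pages can contain characters that are not legal in file names, which is what the `OSError` handler is catching. A minimal sketch of a filename-sanitizing helper, assuming one wanted to avoid the failure instead of skipping the image (`sanitize_filename` is hypothetical and not part of this commit):

```python
import re

def sanitize_filename(title: str, max_len: int = 100) -> str:
    """Replace characters that are illegal in file names (e.g. \\ / : * ? " < > | on Windows)
    and cap the length so very long titles do not exceed path limits."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
    return cleaned[:max_len] or 'untitled'

# usage inside get_page (sketch):
#   with open(self.data_path + sanitize_filename(title) + ext, 'wb') as f:
```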
main.py (+6 -0)

@@ -0,0 +1,6 @@
+
+from crawl_emotions import Emotions
+
+if __name__ == '__main__':
+    emo = Emotions()
+    emo.run()

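`run()` submits the 4,328 page tasks to the thread pool and returns immediately; finishing the queued work is left to the pool, with `__del__` calling `shutdown(wait=True)`. A sketch of an alternative entry point that blocks explicitly until every page has been processed (`crawl_all` is hypothetical and not part of this commit):

```python
from concurrent.futures import wait

from crawl_emotions import Emotions

def crawl_all(pages: int = 4328) -> None:
    emo = Emotions()
    # collect the futures so the caller can block until every page task is done,
    # instead of depending on destructor/shutdown ordering at interpreter exit
    futures = [
        emo.pool.submit(emo.get_page, emo._url.format(page=i))
        for i in range(1, pages + 1)
    ]
    wait(futures)

if __name__ == '__main__':
    crawl_all()
```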
requirements.txt (+4 -0)

@@ -0,0 +1,4 @@
+requests
+lxml
+beautifulsoup4
+httpx