fish 4 months ago
parent
commit
c7a1609a7c
4 changed files with 9 additions and 9 deletions
  1. 1 0
      .gitignore
  2. 1 1
      README.md
  3. 6 7
      crawl_emotions/emotions.py
  4. 1 1
      main.py

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+*.pyc

+ 1 - 1
README.md

@@ -1,6 +1,6 @@
 # crawl_emotions
 
-表情包爬取工具
+表情包爬取工具。爬取速度不能过快,否则会触发 Cloudflare 限制。
 
 ## License
 

+ 6 - 7
crawl_emotions/emotions.py

@@ -6,26 +6,25 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   表情包
 '''
-import os,sys,re
+import os
 # import requests
 import httpx
-import bs4,csv,lxml
+import bs4
 from concurrent.futures import ThreadPoolExecutor
 
 class Emotions(object):
-    """"""
+    """ crawl emotions """
     _url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
     header= {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
     }
     def __init__(self):
-        self.sess = httpx.Session()
-        self.sess.headers.update(self.header)
+        self.sess = httpx.Client(headers=self.header)
         self.path = os.path.dirname(os.path.abspath(__file__))
         if not os.path.exists(self.path+'/data'):
             os.mkdir(self.path+'/data')
         self.data_path = self.path+'/data/'
-        self.pool = ThreadPoolExecutor(10)
+        self.pool = ThreadPoolExecutor(2)
 
     def run(self):
         for i in range(1, 4328+1):
@@ -42,7 +41,7 @@ class Emotions(object):
             print('下载图片: ', title)
             try:
                 with open(self.data_path + title + os.path.splitext(image)[-1], 'wb') as f:
-                    img = requests.get(image).content
+                    img = self.sess.get(image).content
                     f.write(img)
             except OSError:
                 print('length  failed')

+ 1 - 1
main.py

@@ -2,5 +2,5 @@
 from crawl_emotions import Emotions
 
 if __name__=='__main__':
-    emo=Emotions()
+    emo= Emotions()
     emo.run()