2 Commits c7a1609a7c ... 5a8a80aeee

Author SHA1 Message Date
  liuyuqi-dellpc 5a8a80aeee refactor: improve data handling and directory management in emotions crawler 3 months ago
  liuyuqi-dellpc f05abb0a54 feat: add command-line argument parsing with argparse 3 months ago

+ 5 - 0
.env.example

@@ -0,0 +1,5 @@
+
+page_start=1
+page_end=4328
+
+threads=1

+ 1 - 0
.gitignore

@@ -1 +1,2 @@
 *.pyc
 *.pyc
+.env

+ 11 - 0
README.md

@@ -2,6 +2,17 @@
 
 
 表情包爬取工具,速度不能快,会触发 Cloudflare 限制。
 表情包爬取工具,速度不能快,会触发 Cloudflare 限制。
 
 
+
+表情来源于: https://fabiaoqing.com/
+
+
+## Develop
+
+先配置 .env
+```
+python main.py
+```
+
 ## License
 ## License
 
 
 
 

+ 16 - 1
crawl_emotions/__init__.py

@@ -1 +1,16 @@
-from .emotions import Emotions
+from .emotions import Emotions
+import os,sys,re,json,time
+from .options import parser_args
+
+def main(argv=None):
+    """Main entry point of the program"""
+    try:
+        args = parser_args()
+        emotions = Emotions(args)
+        try:
+            instances: None = emotions.run()
+            print("运行中的服务器列表:")
+        except Exception as e:
+            print(e)
+    except KeyboardInterrupt:
+        sys.exit('\nERROR: Interrupted by user')

+ 12 - 10
crawl_emotions/emotions.py

@@ -18,16 +18,19 @@ class Emotions(object):
     header= {
     header= {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
     }
     }
-    def __init__(self):
+    def __init__(self,  params: dict, debug=False):
+        self.params = params
         self.sess = httpx.Client(headers=self.header)
         self.sess = httpx.Client(headers=self.header)
-        self.path = os.path.dirname(os.path.abspath(__file__))
-        if not os.path.exists(self.path+'/data'):
-            os.mkdir(self.path+'/data')
-        self.data_path = self.path+'/data/'
-        self.pool = ThreadPoolExecutor(2)
+        self.app_path=params["app_path"]
+        if not os.path.exists(self.app_path+'/data'):
+            os.mkdir(self.app_path+'/data')
+        self.data_path = self.app_path+'/data/'
+        self.pool = ThreadPoolExecutor(params['threads'])
 
 
     def run(self):
     def run(self):
-        for i in range(1, 4328+1):
+        page_start=self.params["page_start"]
+        page_end=self.params["page_end"]
+        for i in range(page_start, page_end+1):
             url = self._url.format(page=i)
             url = self._url.format(page=i)
             self.pool.submit(self.get_page, url)
             self.pool.submit(self.get_page, url)
 
 
@@ -48,6 +51,5 @@ class Emotions(object):
                 break
                 break
         print('下载完毕: ', url)
         print('下载完毕: ', url)
     
     
-        def __del__(self):
-            self.pool.shutdown(wait=True)
-            
+    def __del__(self):
+        self.pool.shutdown(wait=True)

+ 90 - 0
crawl_emotions/options.py

@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/11/01 00:01:04
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   命令行参数,或配置文件
+"""
+
+import argparse
+import os
+import shlex
+import dotenv
+from collections import OrderedDict
+from .utils.str_util import preferredencoding
+from .utils.frozen_dir import get_app_path
+
+def parser_args(overrideArguments=None):
+    """解析参数"""
+
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('-c', '--config', help='config file', default='config.ini')
+    # argparser.add_argument(
+    #     'command',
+    #     help='command: ',
+    #     choices=['clone', 'push', 'delete', 'list', 'serve', 'server'],
+    # )
+    argparser.add_argument('-l', '--log', help='log file', default='log.txt')
+    argparser.add_argument('-d', '--debug', help='debug mode', action='store_true')
+    argparser.add_argument('-user', '--user', help='set a user')
+    argparser.add_argument(
+        '-p',
+        '--platform',
+        help='set a platform',
+        choices=['github', 'gitee', 'gitlab', 'gogs', 'gitea', 'bitbucket', 'coding'],
+        default='github',
+    )
+    argparser.add_argument('-token', '--token', help='set a token')
+    argparser.add_argument('-host', '--host', help='set a host')
+    argparser.add_argument(
+        '-repo_path', '--repo_path', help='set a repo'
+    )  # , default=os.getcwd())
+    argparser.add_argument('-repo_name', '--repo_name', help='set a repo name')
+    argparser.add_argument('-repo_id', '--repo_id', help='set a repo id')
+    args = argparser.parse_args()
+
+    # remove None
+    command_line_conf = OrderedDict(
+        {k: v for k, v in args.__dict__.items() if v is not None}
+    )
+
+    system_conf = user_conf = custom_conf = OrderedDict()
+    user_conf = _read_user_conf()
+
+    if args.config:
+        custom_conf = _read_custom_conf(args.config)
+
+    system_conf.update(user_conf)
+    system_conf.update(command_line_conf)
+    # if args.command == None and args.extractor == None:
+    #     raise 'Error, please input cmd and extractor params11'
+    app_path = get_app_path()
+    system_conf["app_path"] = app_path
+    return system_conf
+
+
+def _read_custom_conf(config_path: str) -> OrderedDict:
+    """读取自定义配置文件 config.yaml"""
+
+    def compat_shlex_split(s, comments=False, posix=True):
+        if isinstance(s, str):
+            s = s.encode('utf-8')
+        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))
+
+    try:
+        with open(config_path, 'r', encoding=preferredencoding()) as f:
+            contents = f.read()
+            res = compat_shlex_split(contents, comments=True)
+    except Exception as e:
+        return []
+    return res
+
+
+def _read_user_conf() -> OrderedDict:
+    """读取用户配置文件: .env 文件"""
+    user_conf = OrderedDict()
+    dotenv_path = '.env'
+    if os.path.exists(dotenv_path):
+        user_conf = dotenv.dotenv_values(dotenv_path)
+    return OrderedDict(user_conf)

+ 0 - 0
crawl_emotions/utils/__init__.py


+ 18 - 0
crawl_emotions/utils/frozen_dir.py

@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/04/12
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+"""
+import sys  
+import os  
+   
+def get_app_path() -> str:  
+    """Returns the base application path."""  
+    if hasattr(sys, 'frozen'):  
+        # Handles PyInstaller  
+        return os.path.dirname(sys.executable)  #使用 pyinstaller 打包后的 exe 目录
+    # return os.path.dirname(os.path.dirname(os.path.dirname(__file__))) # 没打包前的py目录
+    return sys.path[0]

+ 71 - 0
crawl_emotions/utils/str_util.py

@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/10/31 17:06:37
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   字符串工具类
+"""
+import argparse
+import locale
+import re
+import sys
+
+
+def compat_register_utf8():
+    """win 兼容utf-8编码"""
+    if sys.platform == 'win32':
+        from codecs import register, lookup
+
+        register(lambda name: lookup('utf-8') if name == 'cp65001' else None)
+
+
+def preferredencoding():
+    """Get preferred encoding.
+
+    Returns the best encoding scheme for the system, based on
+    locale.getpreferredencoding() and some further tweaks.
+    """
+    try:
+        pref = locale.getpreferredencoding()
+        'TEST'.encode(pref)
+    except Exception:
+        pref = 'UTF-8'
+
+    return pref
+
+
+def SpCharReplace(char):
+    """特殊字符替换"""
+    temp = str(char)
+    for i in temp:
+        if '<' == i:
+            char = char.replace('<', '《')
+        if '>' == i:
+            char = char.replace('>', '》')
+        if "'" == i:
+            char = char.replace("'", '')  # 处理单引号
+        if '\\' == i:
+            char = char.replace('\\', '')  # 处理反斜杠\
+        if '"' == i:
+            char = char.replace('"', '`')  # 处理双引号"
+        if '&' == i:
+            char = char.replace('&', '-')  # 处理&号"
+        if '|' == i:
+            char = char.replace('|', '')  # 处理&号
+        if '@' == i:
+            char = char.replace('@', '.')  # 处理@号
+        if '%' == i:
+            char = char.replace('%', '`')  # 处理单引号
+        if '*' == i:
+            char = char.replace('*', '`')  # 处理反斜杠\
+        if '("' == i:
+            char = char.replace('"', '`')  # 处理双引号"
+        if ')"' == i:
+            char = char.replace(')"', '`')
+        if '-' == i:
+            char = char.replace('-', '`')  # 处理&号
+        if 'ÐÂÎÅ' == i:
+            char = char.replace('ÐÂÎÅ', '`')  # 处理ÐÂÎÅ
+        # 在后面扩展其他特殊字符
+    return char

+ 10 - 3
main.py

@@ -1,6 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/09/19 09:08:32
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+'''
 
 
-from crawl_emotions import Emotions
+from crawl_emotions import main
 
 
 if __name__=='__main__':
 if __name__=='__main__':
-    emo= Emotions()
-    emo.run()
+    main()