liuyuqi-dellpc 8 months ago
commit
b8a8230a12

+ 0 - 0
.env


+ 63 - 0
.github/workflows/build.yml

@@ -0,0 +1,63 @@
+name: Publish Installers
+
+on:
+  workflow_dispatch: ~
+  push:
+    branches: [master]
+    tags: [v*]
+
+jobs:
+  build:
+    name: Build ${{ matrix.os }} Package
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: 
+        os: [windows-2019, ubuntu-20.04]
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Set Release Version
+        id: get_version
+        shell: bash
+        run: |
+          echo "::set-output name=hash::$(git rev-parse --short HEAD)"
+          echo "::set-output name=date::$(date +%Y%m%d)"
+          echo "::set-output name=url::$(git remote get-url origin)"
+          
+      - name: Set Up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+          cache: pip
+          cache-dependency-path: pyproject.toml
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip wheel setuptools poetry
+          poetry config virtualenvs.in-project true
+          poetry install
+
+      - name: Build Package
+        run: |
+          poetry run python -m PyInstaller crawl_beian.spec
+
+      - name: Set Up ossutil
+        uses: yizhoumo/setup-ossutil@v1
+        with:
+          endpoint: oss-cn-qingdao.aliyuncs.com
+          access-key-id: ${{ secrets.OSS_KEY_ID }}
+          access-key-secret: ${{ secrets.OSS_KEY_SECRET }}
+          
+      - name: Upload to Aliyun OSS
+        run: |
+          ossutil cp -r dist/ oss://yoqi-software/develop/crawl_beian/${{ steps.get_version.outputs.date }}-${{ steps.get_version.outputs.hash }}/
+      
+      - name: Notify DingTalk
+        uses: leafney/dingtalk-action@v1
+        if: always()
+        env:
+          DINGTALK_ACCESS_TOKEN: ${{ secrets.DINGTALK_ACCESS_TOKEN }}
+        with:
+          msgtype: link
+          title: 'crawl_beian build finished'
+          text: 'Please download from Aliyun OSS. [git.yoqi.me]'
+          msg_url: '${{ steps.get_version.outputs.url }}'
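
For reference, the upload path is keyed by the date and short commit hash produced in the Set Release Version step. A minimal local sketch in Python (an assumption: it runs inside the git checkout) that reproduces the same tag:

    import datetime
    import subprocess

    def version_tag() -> str:
        """Rebuild the <date>-<hash> tag used for the OSS upload path."""
        short_hash = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, check=True,
        ).stdout.strip()
        date = datetime.date.today().strftime("%Y%m%d")
        return f"{date}-{short_hash}"  # e.g. 20240715-b8a8230

    print(version_tag())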

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+*.csv
+*.pyc
+*.pdf
+*.xlsx
+data/**/*.html
+build/
+dist/
+*.log

+ 12 - 0
README.md

@@ -0,0 +1,12 @@
+# crawl_beian
+
+Analysis of large language model algorithm registration (算法备案) filings.
+
+
+## License
+
+
+
+## Reference
+
+
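
A minimal quick-start sketch to accompany the README (hypothetical usage; it assumes the package layout added in this commit):

    # Run from the repository root; output lands in data/model_data.xlsx.
    from crawl_beian import Beian

    Beian().crawl()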

+ 1 - 0
crawl_beian/__init__.py

@@ -0,0 +1 @@
+from .beian import Beian

+ 57 - 0
crawl_beian/beian.py

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/07/15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   beian
+"""
+import os
+
+import requests
+import pandas as pd
+from lxml import etree
+
+class Beian(object):
+    """Crawler for LLM algorithm filing (算法备案) records."""
+
+    source_url = "https://www.aifun.cc/beian"
+    _headers = {
+        'Referer': 'https://www.aifun.cc',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
+    }
+    def __init__(self):
+        self.sess = requests.Session()
+        self.sess.headers.update(self._headers)
+        # keep the frame per-instance; a class-level DataFrame would be shared
+        self.modelData = pd.DataFrame(
+            columns=['序号', '算法名称', '主体名称', '公示日期', '备案编号', '应用产品', '主要用途'])
+
+    def crawl(self) -> None:
+        """Fetch the filing table (cached to data/beian.html) and export it to Excel."""
+        if not os.path.exists("data/beian.html"):
+            res = self.sess.get(self.source_url)
+            with open("data/beian.html", "w", encoding="utf-8") as file:
+                file.write(res.text)
+        with open("data/beian.html", "r", encoding="utf-8") as file:
+            soup = etree.HTML(file.read())
+            num = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[1]')
+            names = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[2]')
+            companies = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[3]')
+            dates = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[4]')
+            beian_no = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[5]')
+            apps = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[6]')
+            desc = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[7]')
+
+            for i in range(len(names)):
+                self.modelData.loc[i] = [
+                    (num[i].text or '') if i < len(num) else '',
+                    (names[i].text or '') if i < len(names) else '',
+                    (companies[i].text or '') if i < len(companies) else '',
+                    (dates[i].text or '') if i < len(dates) else '',
+                    (beian_no[i].text or '') if i < len(beian_no) else '',
+                    (apps[i].text or '') if i < len(apps) else '',
+                    (desc[i].text or '') if i < len(desc) else ''
+                ]
+        # self.modelData.to_csv("data/model_data.csv", encoding="utf-8")
+        self.modelData.to_excel("data/model_data.xlsx", index=False)
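
The crawl above queries each column with a separate XPath expression and re-aligns them by index. A hedged alternative sketch that walks rows instead, so a missing cell cannot shift a whole column (it assumes the same tablepress-1 markup):

    from lxml import etree

    def parse_rows(html: str) -> list[list[str]]:
        """Collect the seven cells of each filing row, padding short rows."""
        tree = etree.HTML(html)
        rows = []
        for tr in tree.xpath('//*[@id="tablepress-1"]/tbody/tr'):
            cells = [(td.text or '').strip() for td in tr.xpath('./td')]
            rows.append((cells + [''] * 7)[:7])
        return rows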

+ 52 - 0
crawl_beian/options.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/06/22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   command line params or config from file
+"""
+
+import argparse
+from collections import OrderedDict
+
+from .utils.frozen_dir import get_app_path
+
+def parse_args():
+    """
+    parse command line params
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
+            crawl_beian
+            command line examples:
+            python main.py crawl --extractor cninfo
+            python main.py download --extractor cninfo
+            python main.py crawl --extractor sse
+                                     ''', epilog="With the description above you can start right away. Good luck!")
+    # group = parser.add_mutually_exclusive_group()
+    parser.add_argument('command', nargs='?', help='command: crawl, download', choices=['crawl', 'download', 'help', 'version'], default='help')
+    parser.add_argument('--extractor', help='extractor: cninfo, sse', choices=['cninfo', 'sse'], default='cninfo')
+    args = parser.parse_args()
+
+    # drop None values so later config layers can override cleanly
+    command_line_conf = OrderedDict(
+        {k: v for k, v in args.__dict__.items() if v is not None}
+    )
+    # separate dicts: chained assignment would alias all three names to one object
+    system_conf = OrderedDict()
+    user_conf = OrderedDict()
+    custom_conf = OrderedDict()
+    system_conf.update(command_line_conf)
+
+    app_path = get_app_path()
+    system_conf["app_path"] = app_path
+    return system_conf
+
+def _read_custom_conf(config_path: str) -> OrderedDict:
+    """read custom config file (stub; returns an empty mapping for now)"""
+    return OrderedDict()
+
+def _read_user_conf() -> OrderedDict:
+    """read user config file (stub; returns an empty mapping for now)"""
+    return OrderedDict()
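
A short usage sketch for parse_args (a hypothetical session): the returned config is a plain OrderedDict, so callers read keys rather than argparse attributes:

    from crawl_beian.options import parse_args

    conf = parse_args()        # e.g. invoked as: python main.py crawl
    print(conf["command"])     # 'crawl'
    print(conf["extractor"])   # 'cninfo' (the default)
    print(conf["app_path"])    # resolved by utils.frozen_dir.get_app_path()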

+ 0 - 0
crawl_beian/utils/__init__.py


+ 18 - 0
crawl_beian/utils/frozen_dir.py

@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/04/12
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+"""
+import os
+import sys
+
+def get_app_path() -> str:
+    """Return the base application path."""
+    if getattr(sys, 'frozen', False):
+        # PyInstaller bundle: use the directory of the built executable
+        return os.path.dirname(sys.executable)
+    # return os.path.dirname(os.path.dirname(os.path.dirname(__file__)))  # source tree before packaging
+    return sys.path[0]
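
A quick sanity-check sketch: in a plain interpreter run this prints the startup script's directory, while inside a PyInstaller bundle it prints the executable's directory:

    from crawl_beian.utils.frozen_dir import get_app_path

    print(get_app_path())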

+ 0 - 0
data/.gitkeep


+ 63 - 0
docs/统计分析.ipynb

@@ -0,0 +1,63 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 大模型算备分析\n",
+    "\n",
+    "基础分析\n",
+    "作图\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
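
The notebook's code cells are still empty. A hedged starter sketch for the basic-analysis and plotting cells (it assumes data/model_data.xlsx was produced by Beian.crawl and that matplotlib is installed):

    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.read_excel("data/model_data.xlsx")
    df["公示日期"] = pd.to_datetime(df["公示日期"], errors="coerce")

    # Filings per month, as a quick trend plot.
    dated = df.dropna(subset=["公示日期"])
    per_month = dated.groupby(dated["公示日期"].dt.to_period("M")).size()
    per_month.plot(kind="bar", title="Algorithm filings per month")
    plt.tight_layout()
    plt.show()

    # Top filers by number of registered algorithms.
    print(df["主体名称"].value_counts().head(10))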

+ 34 - 0
main.py

@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/07/15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+"""
+from crawl_beian import Beian
+
+if __name__ == '__main__':
+    # The argparse dispatch (see options.parse_args) is not wired in yet;
+    # for now the entry point simply runs the filing crawler.
+    beian = Beian()
+    beian.crawl()
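
If the argparse dispatch from options.py is wired back in later, a hypothetical entry point could look like this (command names follow the choices declared in options.parse_args):

    import sys

    from crawl_beian import Beian
    from crawl_beian.options import parse_args

    def main() -> None:
        conf = parse_args()
        if conf["command"] == "crawl":
            Beian().crawl()
        elif conf["command"] == "version":
            print("0.1.0")
        else:  # 'download' and 'help' are not implemented yet
            print("usage: python main.py crawl")
            sys.exit(1)

    if __name__ == "__main__":
        main()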

+ 14 - 0
pyproject.toml

@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "crawl-beian"
+version = "0.1.0"
+description = ""
+authors = ["liuyuqi-dellpc <liuyuqi.gov@msn.cn>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+# runtime deps imported by crawl_beian (version bounds are assumptions; pin as needed)
+requests = "^2.31"
+pandas = "^2.1"
+lxml = "^5.1"
+openpyxl = "^3.1"
+
+[tool.poetry.group.dev.dependencies]
+# needed by the CI build step (poetry run python -m PyInstaller ...)
+pyinstaller = "^6.3"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"