liuyuqi-dellpc 8 months ago
commit
b8a8230a12

+ 0 - 0
.env


+ 63 - 0
.github/workflows/build.yml

@@ -0,0 +1,63 @@
+name: Publish Installers
+
+on:
+  workflow_dispatch: ~
+  push:
+    branches: [master]
+    tags: [v*]
+
+jobs:
+  build:
+    name: Build ${{ matrix.os }} Package
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: 
+        os: [windows-2019, ubuntu-20.04]
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Set Release Version
+        id: get_version
+        shell: bash
+        run: |
+          echo "::set-output name=hash::$(git rev-parse --short HEAD)"
+          echo "::set-output name=date::$(date +%Y%m%d)"
+          echo "::set-output name=url::$(git remote get-url origin)"
+          
+      - name: Set Up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+          cache: pip
+          cache-dependency-path: pyproject.toml
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip wheel setuptools poetry
+          poetry config virtualenvs.in-project true
+          poetry install
+
+      - name: Build Package
+        run: |
+          poetry run python -m PyInstaller crawl_beian.spec
+
+      - name: Set Up ossutil
+        uses: yizhoumo/setup-ossutil@v1
+        with:
+          endpoint: oss-cn-qingdao.aliyuncs.com
+          access-key-id: ${{ secrets.OSS_KEY_ID }}
+          access-key-secret: ${{ secrets.OSS_KEY_SECRET }}
+          
+      - name: Upload to Aliyun OSS
+        run: |
+          ossutil cp -r dist/ oss://yoqi-software/develop/crawl_beian/${{ steps.get_version.outputs.date }}-${{ steps.get_version.outputs.hash }}/
+      
+      - name: Notify DingTalk
+        uses: leafney/dingtalk-action@v1
+        if: always()
+        env:
+          DINGTALK_ACCESS_TOKEN: ${{ secrets.DINGTALK_ACCESS_TOKEN }}
+        with:
+          msgtype: link
+          title: 'crawl_beian build finished'
+          text: 'Please download from Aliyun OSS. [git.yoqi.me]'
+          msg_url: '${{ steps.get_version.outputs.url }}'
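
For reference, the upload path is keyed by the date and short commit hash produced in the Set Release Version step. A minimal local sketch in Python (an assumption: it runs inside the git checkout) that reproduces the same tag:

    import datetime
    import subprocess

    def version_tag() -> str:
        """Rebuild the <date>-<hash> tag used for the OSS upload path."""
        short_hash = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, check=True,
        ).stdout.strip()
        date = datetime.date.today().strftime("%Y%m%d")
        return f"{date}-{short_hash}"  # e.g. 20240715-b8a8230

    print(version_tag())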

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+*.csv
+*.pyc
+*.pdf
+*.xlsx
+data/**/*.html
+build/
+dist/
+*.log

+ 12 - 0
README.md

@@ -0,0 +1,12 @@
+# crawl_beian
+
+Analysis of large language model algorithm registration (算法备案) filings.
+
+
+## License
+
+
+
+## Reference
+
+
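
A minimal quick-start sketch to accompany the README (hypothetical usage; it assumes the package layout added in this commit):

    # Run from the repository root; output lands in data/model_data.xlsx.
    from crawl_beian import Beian

    Beian().crawl()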

+ 1 - 0
crawl_beian/__init__.py

@@ -0,0 +1 @@
+from .beian import Beian

+ 57 - 0
crawl_beian/beian.py

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/07/15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   beian
+"""
+import os
+
+import requests
+import pandas as pd
+from lxml import etree
+
+class Beian(object):
+    """Crawler for LLM algorithm filing (算法备案) records."""
+
+    source_url = "https://www.aifun.cc/beian"
+    _headers = {
+        'Referer': 'https://www.aifun.cc',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
+    }
+    def __init__(self):
+        self.sess = requests.Session()
+        self.sess.headers.update(self._headers)
+        # keep the frame per-instance; a class-level DataFrame would be shared
+        self.modelData = pd.DataFrame(
+            columns=['序号', '算法名称', '主体名称', '公示日期', '备案编号', '应用产品', '主要用途'])
+
+    def crawl(self) -> None:
+        """Fetch the filing table (cached to data/beian.html) and export it to Excel."""
+        if not os.path.exists("data/beian.html"):
+            res = self.sess.get(self.source_url)
+            with open("data/beian.html", "w", encoding="utf-8") as file:
+                file.write(res.text)
+        with open("data/beian.html", "r", encoding="utf-8") as file:
+            soup = etree.HTML(file.read())
+            num = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[1]')
+            names = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[2]')
+            companies = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[3]')
+            dates = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[4]')
+            beian_no = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[5]')
+            apps = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[6]')
+            desc = soup.xpath(r'//*[@id="tablepress-1"]/tbody/tr/td[7]')
+
+            for i in range(len(names)):
+                self.modelData.loc[i] = [
+                    (num[i].text or '') if i < len(num) else '',
+                    (names[i].text or '') if i < len(names) else '',
+                    (companies[i].text or '') if i < len(companies) else '',
+                    (dates[i].text or '') if i < len(dates) else '',
+                    (beian_no[i].text or '') if i < len(beian_no) else '',
+                    (apps[i].text or '') if i < len(apps) else '',
+                    (desc[i].text or '') if i < len(desc) else ''
+                ]
+        # self.modelData.to_csv("data/model_data.csv", encoding="utf-8")
+        self.modelData.to_excel("data/model_data.xlsx", index=False)
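
The crawl above queries each column with a separate XPath expression and re-aligns them by index. A hedged alternative sketch that walks rows instead, so a missing cell cannot shift a whole column (it assumes the same tablepress-1 markup):

    from lxml import etree

    def parse_rows(html: str) -> list[list[str]]:
        """Collect the seven cells of each filing row, padding short rows."""
        tree = etree.HTML(html)
        rows = []
        for tr in tree.xpath('//*[@id="tablepress-1"]/tbody/tr'):
            cells = [(td.text or '').strip() for td in tr.xpath('./td')]
            rows.append((cells + [''] * 7)[:7])
        return rows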

+ 52 - 0
crawl_beian/options.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/06/22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   command line params or config from file
+"""
+
+import argparse
+from collections import OrderedDict
+
+from .utils.frozen_dir import get_app_path
+
+def parse_args():
+    """
+    parse command line params
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
+            crawl_beian
+            command line examples:
+            python main.py crawl --extractor cninfo
+            python main.py download --extractor cninfo
+            python main.py crawl --extractor sse
+                                     ''', epilog="With the description above you can start right away. Good luck!")
+    # group = parser.add_mutually_exclusive_group()
+    parser.add_argument('command', nargs='?', help='command: crawl, download', choices=['crawl', 'download', 'help', 'version'], default='help')
+    parser.add_argument('--extractor', help='extractor: cninfo, sse', choices=['cninfo', 'sse'], default='cninfo')
+    args = parser.parse_args()
+
+    # drop None values so later config layers can override cleanly
+    command_line_conf = OrderedDict(
+        {k: v for k, v in args.__dict__.items() if v is not None}
+    )
+    # separate dicts: chained assignment would alias all three names to one object
+    system_conf = OrderedDict()
+    user_conf = OrderedDict()
+    custom_conf = OrderedDict()
+    system_conf.update(command_line_conf)
+
+    app_path = get_app_path()
+    system_conf["app_path"] = app_path
+    return system_conf
+
+def _read_custom_conf(config_path: str) -> OrderedDict:
+    """read custom config file (stub; returns an empty mapping for now)"""
+    return OrderedDict()
+
+def _read_user_conf() -> OrderedDict:
+    """read user config file (stub; returns an empty mapping for now)"""
+    return OrderedDict()
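
A short usage sketch for parse_args (a hypothetical session): the returned config is a plain OrderedDict, so callers read keys rather than argparse attributes:

    from crawl_beian.options import parse_args

    conf = parse_args()        # e.g. invoked as: python main.py crawl
    print(conf["command"])     # 'crawl'
    print(conf["extractor"])   # 'cninfo' (the default)
    print(conf["app_path"])    # resolved by utils.frozen_dir.get_app_path()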

+ 0 - 0
crawl_beian/utils/__init__.py


+ 18 - 0
crawl_beian/utils/frozen_dir.py

@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/04/12
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+"""
+import os
+import sys
+
+def get_app_path() -> str:
+    """Return the base application path."""
+    if getattr(sys, 'frozen', False):
+        # PyInstaller bundle: use the directory of the built executable
+        return os.path.dirname(sys.executable)
+    # return os.path.dirname(os.path.dirname(os.path.dirname(__file__)))  # source tree before packaging
+    return sys.path[0]
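
A quick sanity-check sketch: in a plain interpreter run this prints the startup script's directory, while inside a PyInstaller bundle it prints the executable's directory:

    from crawl_beian.utils.frozen_dir import get_app_path

    print(get_app_path())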

+ 0 - 0
data/.gitkeep


+ 63 - 0
docs/统计分析.ipynb

@@ -0,0 +1,63 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 大模型算备分析\n",
+    "\n",
+    "基础分析\n",
+    "作图\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
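
The notebook's code cells are still empty. A hedged starter sketch for the basic-analysis and plotting cells (it assumes data/model_data.xlsx was produced by Beian.crawl and that matplotlib is installed):

    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.read_excel("data/model_data.xlsx")
    df["公示日期"] = pd.to_datetime(df["公示日期"], errors="coerce")

    # Filings per month, as a quick trend plot.
    dated = df.dropna(subset=["公示日期"])
    per_month = dated.groupby(dated["公示日期"].dt.to_period("M")).size()
    per_month.plot(kind="bar", title="Algorithm filings per month")
    plt.tight_layout()
    plt.show()

    # Top filers by number of registered algorithms.
    print(df["主体名称"].value_counts().head(10))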

+ 34 - 0
main.py

@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2024/07/15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+"""
+from crawl_beian import Beian
+
+if __name__ == '__main__':
+    # The argparse dispatch (see options.parse_args) is not wired in yet;
+    # for now the entry point simply runs the filing crawler.
+    beian = Beian()
+    beian.crawl()
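
If the argparse dispatch from options.py is wired back in later, a hypothetical entry point could look like this (command names follow the choices declared in options.parse_args):

    import sys

    from crawl_beian import Beian
    from crawl_beian.options import parse_args

    def main() -> None:
        conf = parse_args()
        if conf["command"] == "crawl":
            Beian().crawl()
        elif conf["command"] == "version":
            print("0.1.0")
        else:  # 'download' and 'help' are not implemented yet
            print("usage: python main.py crawl")
            sys.exit(1)

    if __name__ == "__main__":
        main()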

+ 14 - 0
pyproject.toml

@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "crawl-beian"
+version = "0.1.0"
+description = ""
+authors = ["liuyuqi-dellpc <liuyuqi.gov@msn.cn>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+# runtime deps imported by crawl_beian (version bounds are assumptions; pin as needed)
+requests = "^2.31"
+pandas = "^2.1"
+lxml = "^5.1"
+openpyxl = "^3.1"
+
+[tool.poetry.group.dev.dependencies]
+# needed by the CI build step (poetry run python -m PyInstaller ...)
+pyinstaller = "^6.3"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"