|
@@ -0,0 +1,57 @@
|
|
|
|
+#!/usr/bin/env python
|
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
|
+"""
|
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
|
+@Time : 2024/07/15
|
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
|
+@Desc : Crawl the algorithm filing (beian) table from aifun.cc and export it to an Excel file.
|
|
|
|
+"""
|
|
|
|
+import requests
|
|
|
|
+import os,sys,re
|
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
|
+import pandas as pd
|
|
|
|
+from lxml import etree
|
|
|
|
+
|
|
|
|
class Beian(object):
    """Scrape the filing ("beian") table from aifun.cc and export it to Excel.

    The page at ``source_url`` is downloaded once and cached on disk as
    ``data/beian.html``; later runs parse the cached copy instead of hitting
    the network again.  The rows of the HTML table with id ``tablepress-1``
    are collected into ``modelData`` and written to ``data/model_data.xlsx``.
    """

    source_url = r"https://www.aifun.cc/beian"
    _headers = {
        'Referer': 'https://www.aifun.cc',
        # Adjacent literals are concatenated by the parser.  A backslash
        # continuation *inside* the string would embed the next line's
        # indentation in the header value.
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    # Output column labels, in the same order as the first seven cells of a
    # table row.
    _columns = ['序号', '算法名称', '主体名称', '公示日期', '备案编号', '应用产品', '主要用途']
    _data_dir = "data"
    _cache_path = "data/beian.html"
    _excel_path = "data/model_data.xlsx"
    _row_xpath = r'//*[@id="tablepress-1"]/tbody/tr'
    # Seconds to wait for the server before giving up.
    _timeout = 30
    # Empty class-level frame kept so ``Beian.modelData`` still resolves; each
    # instance gets its own frame in ``__init__`` / ``crawl``.
    modelData = pd.DataFrame(columns=_columns)

    def __init__(self):
        """Create an HTTP session that sends the class-level headers."""
        self.sess = requests.Session()
        self.sess.headers.update(self._headers)
        # Per-instance frame, so crawls on different instances do not share
        # (and overwrite) one class-level DataFrame.
        self.modelData = pd.DataFrame(columns=self._columns)

    @staticmethod
    def _pad_row(cells, width):
        """Return ``cells`` truncated or right-padded with ``''`` to ``width`` items."""
        row = list(cells[:width])
        row.extend([''] * (width - len(row)))
        return row

    def _extract_rows(self, tree):
        """Return one list of cell strings per table row found in ``tree``.

        Cells are read row by row, so a row with missing cells is padded on
        its own instead of shifting the values of the following rows.  Rows
        that contain no ``<td>`` at all are skipped.
        """
        if tree is None:
            return []
        width = len(self._columns)
        rows = []
        for tr in tree.xpath(self._row_xpath):
            # itertext() also picks up text nested in child tags (links,
            # spans), which a plain ``.text`` would drop or report as None.
            cells = ["".join(td.itertext()) for td in tr.xpath('./td')]
            if cells:
                rows.append(self._pad_row(cells, width))
        return rows

    def crawl(self) -> None:
        """Fetch (or reuse the cached) page, parse the table and write the Excel file.

        Raises:
            requests.HTTPError: if the download returns an error status; in
                that case nothing is written to the cache.
        """
        # Both the cache and the Excel output live here; open() does not
        # create missing directories.
        os.makedirs(self._data_dir, exist_ok=True)

        if not os.path.exists(self._cache_path):
            res = self.sess.get(self.source_url, timeout=self._timeout)
            # Do not cache an error page: it would be reused on every later run.
            res.raise_for_status()
            with open(self._cache_path, "w", encoding="utf-8") as file:
                file.write(res.text)

        with open(self._cache_path, "r", encoding="utf-8") as file:
            tree = etree.HTML(file.read())

        # Build the frame in one step; this also discards rows left over from
        # a previous crawl on the same instance.
        self.modelData = pd.DataFrame(self._extract_rows(tree), columns=self._columns)
        self.modelData.to_excel(self._excel_path, index=False)
|