|
@@ -10,15 +10,15 @@ import requests
|
|
|
import re
|
|
|
import openpyxl
|
|
|
import time
|
|
|
-import os,sys,re,csv
|
|
|
+import os,re
|
|
|
import pandas as pd
|
|
|
-from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
class Cninfo(object):
|
|
|
'''
|
|
|
深圳证券
|
|
|
'''
|
|
|
- years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]
|
|
|
+ years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023 ]
|
|
|
host = "http://www.cninfo.com.cn"
|
|
|
headers = {
|
|
|
"Accept": "*/*",
|
|
@@ -36,8 +36,12 @@ class Cninfo(object):
|
|
|
def __init__(self):
|
|
|
self.sess = requests.Session()
|
|
|
self.pool = ThreadPoolExecutor(max_workers=10)
|
|
|
+ self.api={
|
|
|
+ 'hisAnnouncement' : f'{self.host}/new/hisAnnouncement/query', # 查询接口
|
|
|
+ 'szse_stock': f'{self.host}/new/data/szse_stock.json' # 股票代码与orgid对应关系
|
|
|
+ }
|
|
|
|
|
|
- def get_report(self, page_num:int, date:str):
|
|
|
+ def get_report(self, page_num:int, date:str, category):
|
|
|
'''
|
|
|
获取公告数据
|
|
|
|
|
@@ -48,7 +52,6 @@ class Cninfo(object):
|
|
|
plate: sz;sh, 表示沪深两市
|
|
|
seDate:查询时间
|
|
|
'''
|
|
|
-
|
|
|
data = {
|
|
|
"pageNum": page_num,
|
|
|
"pageSize": 30,
|
|
@@ -57,94 +60,112 @@ class Cninfo(object):
|
|
|
"plate": "sz;sh",
|
|
|
"searchkey": "",
|
|
|
"secid": "",
|
|
|
- "category": "category_gddh_szsh",
|
|
|
+ "category": category,
|
|
|
"trade": "",
|
|
|
"seDate": date,
|
|
|
"sortName": "code",
|
|
|
"sortType": "asc",
|
|
|
- "isHLtitle": "false"
|
|
|
+ "isHLtitle": "false"
|
|
|
}
|
|
|
- response = self.sess.post(f'{self.host}/new/hisAnnouncement/query', data=data)
|
|
|
+ response = self.sess.post(self.api['hisAnnouncement'], data=data)
|
|
|
return response
|
|
|
|
|
|
-
|
|
|
def downlaod_report(self, date):
|
|
|
+ ''' 循环下载公告数据(按月) '''
|
|
|
all_results = []
|
|
|
page_num = 1
|
|
|
- response_test = self.get_report(page_num, date)
|
|
|
- data_test = response_test.json()
|
|
|
- total_pages = data_test["totalpages"]
|
|
|
- max_retries = 3 #最大重试次数
|
|
|
- retry_count = 0 #当前重试次数
|
|
|
- while page_num <= total_pages:
|
|
|
- response = None
|
|
|
- # 重试机制
|
|
|
- while retry_count <= max_retries:
|
|
|
- # 发送请求
|
|
|
- try:
|
|
|
- response = self.get_report(page_num, date)
|
|
|
- response.raise_for_status()
|
|
|
- break
|
|
|
- except requests.exceptions.RequestException as e:
|
|
|
- print(f"出现错误!: {e}")
|
|
|
- print(f"5秒后重试...")
|
|
|
- time.sleep(5)
|
|
|
- retry_count += 1
|
|
|
+ # 获取总页数
|
|
|
+ category_map = {
|
|
|
+ "年报": "category_ndbg_szsh",
|
|
|
+ "半年报": "category_bndbg_szsh",
|
|
|
+ "一季报": "category_yjdbg_szsh",
|
|
|
+ "三季报": "category_sjdbg_szsh",
|
|
|
+ "业绩预告": "category_yjygjxz_szsh",
|
|
|
+ "权益分派": "category_qyfpxzcs_szsh",
|
|
|
+ "董事会": "category_dshgg_szsh",
|
|
|
+ "监事会": "category_jshgg_szsh",
|
|
|
+ "股东大会": "category_gddh_szsh",
|
|
|
+ "日常经营": "category_rcjy_szsh",
|
|
|
+ "公司治理": "category_gszl_szsh",
|
|
|
+ "中介报告": "category_zj_szsh",
|
|
|
+ "首发": "category_sf_szsh",
|
|
|
+ "增发": "category_zf_szsh",
|
|
|
+ "股权激励": "category_gqjl_szsh",
|
|
|
+ "配股": "category_pg_szsh",
|
|
|
+ "解禁": "category_jj_szsh",
|
|
|
+ "公司债": "category_gszq_szsh",
|
|
|
+ "可转债": "category_kzzq_szsh",
|
|
|
+ "其他融资": "category_qtrz_szsh",
|
|
|
+ "股权变动": "category_gqbd_szsh",
|
|
|
+ "补充更正": "category_bcgz_szsh",
|
|
|
+ "澄清致歉": "category_cqdq_szsh",
|
|
|
+ "风险提示": "category_fxts_szsh",
|
|
|
+ "特别处理和退市": "category_tbclts_szsh",
|
|
|
+ "退市整理期": "category_tszlq_szsh"
|
|
|
+ }
|
|
|
+ for key, value in category_map.items():
|
|
|
+ print(f"正在下载 {date} {key} 数据...")
|
|
|
+ response_test = self.get_report(page_num, date, value)
|
|
|
+ data_test = response_test.json()
|
|
|
+ total_pages = data_test["totalpages"]
|
|
|
+ max_retries = 3 # 最大重试次数
|
|
|
+ retry_count = 0 # 当前重试次数
|
|
|
+ while page_num <= total_pages:
|
|
|
+ response = None
|
|
|
+ # 重试机制
|
|
|
+ while retry_count <= max_retries:
|
|
|
+ # 发送请求
|
|
|
+ try:
|
|
|
+ response = self.get_report(page_num, date, value)
|
|
|
+ response.raise_for_status()
|
|
|
+ break
|
|
|
+ except requests.exceptions.RequestException as e:
|
|
|
+ print(f"出现错误!: {e}")
|
|
|
+ print(f"5秒后重试...")
|
|
|
+ time.sleep(5)
|
|
|
+ retry_count += 1
|
|
|
|
|
|
- if retry_count > max_retries:
|
|
|
- print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
|
|
|
- page_num += 1
|
|
|
- retry_count = 0
|
|
|
- continue
|
|
|
- else:
|
|
|
- # 解析数据
|
|
|
- try:
|
|
|
- data = response.json()
|
|
|
- # per = (counter/sum)
|
|
|
- # if per <1:
|
|
|
- # print(f"\r当前年份下载进度 {per*100:.2f} %",end='')
|
|
|
- # else:
|
|
|
- # print(f"\r下载完成,正在保存……", end='')
|
|
|
- # 尝试解析公告数据,如果解析失败则重试
|
|
|
- retry_count = 0
|
|
|
- while True:
|
|
|
- try:
|
|
|
- if data["announcements"] is None:
|
|
|
- raise Exception("公告数据为空")
|
|
|
- else:
|
|
|
- all_results.extend(data["announcements"])
|
|
|
- break
|
|
|
- except (TypeError, KeyError) as e:
|
|
|
- print(f"解析公告数据失败: {e}")
|
|
|
- print(f"5秒后重试...")
|
|
|
- time.sleep(5)
|
|
|
- retry_count += 1
|
|
|
- if retry_count > max_retries:
|
|
|
- raise Exception("达到最大重试次数,跳过此页")
|
|
|
- continue
|
|
|
+ if retry_count > max_retries:
|
|
|
+ print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
|
|
|
page_num += 1
|
|
|
- # counter +=1
|
|
|
- except (ValueError, KeyError) as e:
|
|
|
- print(f"解析响应数据失败: {e}")
|
|
|
- print(f"5秒后重试...")
|
|
|
- time.sleep(5)
|
|
|
- retry_count += 1
|
|
|
- if retry_count > max_retries:
|
|
|
- raise Exception("达到最大重试次数,跳过此页")
|
|
|
+ retry_count = 0
|
|
|
continue
|
|
|
+ else:
|
|
|
+ # 解析数据
|
|
|
+ try:
|
|
|
+ data = response.json()
|
|
|
+ # 尝试解析公告数据,如果解析失败则重试
|
|
|
+ retry_count = 0
|
|
|
+ while True:
|
|
|
+ try:
|
|
|
+ if data["announcements"] is None:
|
|
|
+ raise Exception("公告数据为空")
|
|
|
+ else:
|
|
|
+ all_results.extend(data["announcements"])
|
|
|
+ break
|
|
|
+ except (TypeError, KeyError) as e:
|
|
|
+ print(f"解析公告数据失败: {e}")
|
|
|
+ print(f"5秒后重试...")
|
|
|
+ time.sleep(5)
|
|
|
+ retry_count += 1
|
|
|
+ if retry_count > max_retries:
|
|
|
+ raise Exception("达到最大重试次数,跳过此页")
|
|
|
+ continue
|
|
|
+ page_num += 1
|
|
|
+ except (ValueError, KeyError) as e:
|
|
|
+ print(f"解析响应数据失败: {e}")
|
|
|
+ print(f"5秒后重试...")
|
|
|
+ time.sleep(5)
|
|
|
+ retry_count += 1
|
|
|
+ if retry_count > max_retries:
|
|
|
+ raise Exception("达到最大重试次数,跳过此页")
|
|
|
+ continue
|
|
|
return all_results
|
|
|
|
|
|
def run(self):
|
|
|
- years =[ 2018, 2019, 2020, 2021, 2022, 2023 ]
|
|
|
- # counter = 1 # 计数器
|
|
|
- sum = 0
|
|
|
- for year in years:
|
|
|
+ for year in self.years:
|
|
|
if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
|
|
|
continue
|
|
|
- date_count = f"{year}-01-01~{year}-12-31"
|
|
|
- response = self.get_report(1, date_count)
|
|
|
- data = response.json()
|
|
|
- sum = data['totalpages']
|
|
|
all_results = []
|
|
|
time_segments = [
|
|
|
f"{year}-01-01~{year}-01-31",
|
|
@@ -230,14 +251,13 @@ class Cninfo(object):
|
|
|
# data/公司名称/{年份}-标题
|
|
|
file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
|
|
|
for i in range(len(urls)):
|
|
|
+ # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
|
|
|
self.pool.submit(self.download_file, urls[i],file_names[i] )
|
|
|
- # wait(self.pool, return_when=ALL_COMPLETED)
|
|
|
print(f'----{year}年下载完成')
|
|
|
|
|
|
def download_file(self, url, file_path):
|
|
|
''' download file
|
|
|
'''
|
|
|
- # download from urls
|
|
|
if not os.path.exists(file_path):
|
|
|
res = self.sess.get(url)
|
|
|
if res.status_code == 200:
|