@@ -0,0 +1,274 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact : liuyuqi.gov@msn.cn
+@Time    : 2023/12/03 16:11:15
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    : Download annual financial reports of listed companies
+'''
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import openpyxl
+import pandas as pd
+import requests
+
+class Cninfo(object):
+    '''
+    Shenzhen securities disclosure site (cninfo.com.cn).
+    '''
+    years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
+    host = "http://www.cninfo.com.cn"
+    headers = {
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "Content-Length": "195",
+        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "Origin": "http://www.cninfo.com.cn",
+        "Proxy-Connection": "keep-alive",
+        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_gddh_szsh",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
+        "X-Requested-With": "XMLHttpRequest"
+    }
+
+    def __init__(self):
+        self.sess = requests.Session()
+        self.pool = ThreadPoolExecutor(max_workers=10)
+        self.api = {
+            'hisAnnouncement': f'{self.host}/new/hisAnnouncement/query',  # announcement query endpoint
+            'szse_stock': f'{self.host}/new/data/szse_stock.json'  # mapping between stock codes and orgId
+        }
+
+    def get_report(self, page_num: int, date: str, category: str):
+        '''
+        Fetch one page of announcement metadata.
+
+        params:
+            page_num: page number
+            date: query period, formatted as 2021-01-01~2021-12-31
+            category: announcement category code, e.g. category_ndbg_szsh (see category_map in download_report)
+
+        Notes on the request payload:
+            plate: "sz;sh" means both the Shenzhen and Shanghai markets
+            seDate: the query period
+        '''
+        data = {
+            "pageNum": page_num,
+            "pageSize": 30,
+            "column": "szse",
+            "tabName": "fulltext",
+            "plate": "sz;sh",
+            "searchkey": "",
+            "secid": "",
+            "category": category,
+            "trade": "",
+            "seDate": date,
+            "sortName": "code",
+            "sortType": "asc",
+            "isHLtitle": "false"
+        }
+        response = self.sess.post(self.api['hisAnnouncement'], data=data)
+        return response
+
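+    # A rough sketch of the JSON this endpoint returns, inferred from how the rest of
+    # this script consumes it (only the fields actually accessed below are listed; the
+    # real payload contains more):
+    #   {
+    #     "totalpages": 12,
+    #     "announcements": [
+    #       {"secCode": "...", "secName": "...", "announcementTitle": "...",
+    #        "adjunctUrl": "finalpage/2018-01-30/1204372527.PDF"},
+    #       ...
+    #     ]
+    #   }
+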
+    def download_report(self, date):
+        ''' Fetch announcement metadata for one period (called month by month). '''
+        all_results = []
+        # mapping from human-readable category names to cninfo category codes
+        category_map = {
+            "年报": "category_ndbg_szsh",
+            "半年报": "category_bndbg_szsh",
+            "一季报": "category_yjdbg_szsh",
+            "三季报": "category_sjdbg_szsh",
+            "业绩预告": "category_yjygjxz_szsh",
+            "权益分派": "category_qyfpxzcs_szsh",
+            "董事会": "category_dshgg_szsh",
+            "监事会": "category_jshgg_szsh",
+            "股东大会": "category_gddh_szsh",
+            "日常经营": "category_rcjy_szsh",
+            "公司治理": "category_gszl_szsh",
+            "中介报告": "category_zj_szsh",
+            "首发": "category_sf_szsh",
+            "增发": "category_zf_szsh",
+            "股权激励": "category_gqjl_szsh",
+            "配股": "category_pg_szsh",
+            "解禁": "category_jj_szsh",
+            "公司债": "category_gszq_szsh",
+            "可转债": "category_kzzq_szsh",
+            "其他融资": "category_qtrz_szsh",
+            "股权变动": "category_gqbd_szsh",
+            "补充更正": "category_bcgz_szsh",
+            "澄清致歉": "category_cqdq_szsh",
+            "风险提示": "category_fxts_szsh",
+            "特别处理和退市": "category_tbclts_szsh",
+            "退市整理期": "category_tszlq_szsh"
+        }
+        max_retries = 3  # maximum number of retries per page
+        for key, value in category_map.items():
+            print(f"Fetching {date} {key} data...")
+            page_num = 1
+            # the first request only determines the total number of pages for this category
+            response_test = self.get_report(page_num, date, value)
+            data_test = response_test.json()
+            total_pages = data_test["totalpages"]
+            retry_count = 0  # current retry count
+            while page_num <= total_pages:
+                response = None
+                # retry loop for the HTTP request
+                while retry_count <= max_retries:
+                    try:
+                        response = self.get_report(page_num, date, value)
+                        response.raise_for_status()
+                        break
+                    except requests.exceptions.RequestException as e:
+                        print(f"Request failed: {e}")
+                        print("Retrying in 5 seconds...")
+                        time.sleep(5)
+                        retry_count += 1
+
+                if retry_count > max_retries:
+                    print(f"Still failing after {max_retries} retries, skipping page {page_num}.")
+                    page_num += 1
+                    retry_count = 0
+                    continue
+
+                # parse the response body
+                try:
+                    data = response.json()
+                    if data.get("announcements"):
+                        all_results.extend(data["announcements"])
+                    else:
+                        # a None/empty announcement list means there is nothing usable on this page
+                        print(f"No announcements returned for page {page_num}.")
+                    page_num += 1
+                    retry_count = 0
+                except (ValueError, KeyError) as e:
+                    print(f"Failed to parse the response: {e}")
+                    print("Retrying in 5 seconds...")
+                    time.sleep(5)
+                    retry_count += 1
+                    if retry_count > max_retries:
+                        print(f"Still failing after {max_retries} retries, skipping page {page_num}.")
+                        page_num += 1
+                        retry_count = 0
+                    continue
+        return all_results
+
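+    # Usage sketch (hypothetical values): collect all announcement metadata for one month
+    #   results = Cninfo().download_report("2021-01-01~2021-01-31")
+    #   print(len(results))
+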
+    def run(self):
+        for year in self.years:
+            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+                continue
+            all_results = []
+            # query month by month to keep each result set small
+            time_segments = [
+                f"{year}-01-01~{year}-01-31",
+                f"{year}-02-01~{year}-02-28",
+                f"{year}-03-01~{year}-03-31",
+                f"{year}-04-01~{year}-04-30",
+                f"{year}-05-01~{year}-05-31",
+                f"{year}-06-01~{year}-06-30",
+                f"{year}-07-01~{year}-07-31",
+                f"{year}-08-01~{year}-08-31",
+                f"{year}-09-01~{year}-09-30",
+                f"{year}-10-01~{year}-10-31",
+                f"{year}-11-01~{year}-11-30",
+                f"{year}-12-01~{year}-12-31",
+            ]
+            for segment in time_segments:
+                results = self.download_report(segment)
+                all_results.extend(results)
+
+            workbook = openpyxl.Workbook()
+            worksheet = workbook.active
+            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+
+            # parse the search results and append them to the Excel sheet
+            for item in all_results:
+                company_code = item["secCode"]
+                company_name = item["secName"]
+                title = item["announcementTitle"].strip()
+                # strip markup and special characters, then re-wrap the title
+                title = re.sub(r"<.*?>", "", title)
+                title = title.replace(":", "")
+                title = f"《{title}》"
+
+                adjunct_url = item["adjunctUrl"]
+                # use a separate variable for the regex match so the loop variable `year` is not overwritten
+                match = re.search(r"\d{4}", title)
+                tmp_year = match.group() if match else ""
+                announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
+
+                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+            # Note: the workbook is saved next to this script by default; to save elsewhere,
+            # create the target folder yourself and change the path here
+            workbook.save(f"股东大会公告链接_{year}.xlsx")
+
+            print(f"----{year} finished")
+
+
+    def remove_duplicates(self):
+        ''' Remove duplicate announcement links from each yearly workbook. '''
+        for year in self.years:
+            file_path = f'股东大会公告链接_{year}.xlsx'
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                df.drop_duplicates(subset=['年报链接'], keep='first', inplace=True)
+                df.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)
+
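+    # e.g. remove_duplicates() reads 股东大会公告链接_2020.xlsx and writes
+    # 股东大会公告链接_2020_rep.xlsx, keeping only the first row per distinct 年报链接 value.
+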
+    def download(self):
+        ''' Read all links from the deduplicated workbooks and download the PDFs. '''
+        for year in self.years:
+            file_path = f'股东大会公告链接_{year}_rep.xlsx'
+            print(f'process file: {year}')
+            # columns: 公司代码 公司简称 标题 年份 年报链接
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                urls = df['年报链接'].tolist()
+                titles = df['标题'].tolist()
+                company_names = df['公司简称'].tolist()
+                # strip characters that are not allowed in file names (\ / : * ? " < > |, tabs, newlines, spaces)
+                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+                years = df['年份'].tolist()
+                company_codes = df['公司代码'].tolist()
+                print(f'size: {len(company_codes)}')
+                for i in range(len(company_codes)):
+                    if not os.path.exists(f'data/{company_names[i]}'):
+                        os.makedirs(f'data/{company_names[i]}')
+                # target path: data/<company name>/<year>-<title>.pdf
+                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
+                for i in range(len(urls)):
+                    # e.g. http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
+                    self.pool.submit(self.download_file, urls[i], file_names[i])
+                print(f'----{year} downloaded')
+
+    def download_file(self, url, file_path):
+        ''' Download a single file to file_path, skipping files that already exist. '''
+        if not os.path.exists(file_path):
+            res = self.sess.get(url)
+            if res.status_code == 200:
+                with open(file_path, 'wb') as f:
+                    f.write(res.content)
+                print(f'downloaded: {file_path}')
+            time.sleep(1)
+        else:
+            print(f'file: {file_path} already exists')
+
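+    # download_file is submitted to the ThreadPoolExecutor (self.pool) from download(),
+    # so up to 10 PDFs are fetched concurrently.
+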
+if __name__ == "__main__":
+    cninfo = Cninfo()
+    cninfo.run()
+    # cninfo.download()
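+
+# A rough end-to-end usage sketch (the steps mirror the methods above; treat it as an
+# assumption about the intended workflow, not part of the original script):
+#   cninfo = Cninfo()
+#   cninfo.run()                # collect announcement links into 股东大会公告链接_<year>.xlsx
+#   cninfo.remove_duplicates()  # deduplicate links into 股东大会公告链接_<year>_rep.xlsx
+#   cninfo.download()           # download the listed PDFs into data/<company>/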