Browse Source

更新请求,支持下载年报

liuyuqi-dellpc 1 year ago
parent
commit
4f6cf288bd
1 changed file with 97 additions and 77 deletions
  1. 97 77
      crawl_sse/cninfo.py

+ 97 - 77
crawl_sse/cninfo.py

@@ -10,15 +10,15 @@ import requests
 import re
 import openpyxl
 import time
-import os,sys,re,csv
+import os,re
 import pandas as pd
-from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
+from concurrent.futures import ThreadPoolExecutor
 
 class Cninfo(object):
     ''' 
     深圳证券
     '''
-    years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]
+    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -36,8 +36,12 @@ class Cninfo(object):
     def __init__(self):
         self.sess = requests.Session()
         self.pool = ThreadPoolExecutor(max_workers=10)
+        self.api={
+            'hisAnnouncement' : f'{self.host}/new/hisAnnouncement/query', # 查询接口
+            'szse_stock': f'{self.host}/new/data/szse_stock.json'  # 股票代码与orgid对应关系
+        }
 
-    def get_report(self, page_num:int, date:str):
+    def get_report(self, page_num:int, date:str, category):
         '''
         获取公告数据
 
@@ -48,7 +52,6 @@ class Cninfo(object):
         plate: sz;sh, 表示沪深两市
         seDate:查询时间
         '''
-
         data = {
             "pageNum": page_num,
             "pageSize": 30,
@@ -57,94 +60,112 @@ class Cninfo(object):
             "plate": "sz;sh",
             "searchkey": "",
             "secid": "",
-            "category": "category_gddh_szsh",
+            "category": category,
             "trade": "",
             "seDate": date,
             "sortName": "code",
             "sortType": "asc",
-            "isHLtitle": "false"
+            "isHLtitle": "false" #
         }
-        response = self.sess.post(f'{self.host}/new/hisAnnouncement/query', data=data)
+        response = self.sess.post(self.api['hisAnnouncement'], data=data)
         return response
 
-
     def downlaod_report(self, date):
+        ''' 循环下载公告数据(按月) '''
         all_results = []
         page_num = 1
-        response_test = self.get_report(page_num, date)
-        data_test = response_test.json()
-        total_pages = data_test["totalpages"]
-        max_retries = 3 #最大重试次数
-        retry_count = 0 #当前重试次数
-        while page_num <= total_pages:
-            response = None
-            # 重试机制
-            while retry_count <= max_retries:
-                # 发送请求
-                try:
-                    response = self.get_report(page_num, date)
-                    response.raise_for_status()
-                    break
-                except requests.exceptions.RequestException as e:
-                    print(f"出现错误!: {e}")
-                    print(f"5秒后重试...")
-                    time.sleep(5)
-                    retry_count += 1
+        # 获取总页数
+        category_map = {
+            "年报": "category_ndbg_szsh",
+            "半年报": "category_bndbg_szsh",
+            "一季报": "category_yjdbg_szsh",
+            "三季报": "category_sjdbg_szsh",
+            "业绩预告": "category_yjygjxz_szsh",
+            "权益分派": "category_qyfpxzcs_szsh",
+            "董事会": "category_dshgg_szsh",
+            "监事会": "category_jshgg_szsh",
+            "股东大会": "category_gddh_szsh",
+            "日常经营": "category_rcjy_szsh",
+            "公司治理": "category_gszl_szsh",
+            "中介报告": "category_zj_szsh",
+            "首发": "category_sf_szsh",
+            "增发": "category_zf_szsh",
+            "股权激励": "category_gqjl_szsh",
+            "配股": "category_pg_szsh",
+            "解禁": "category_jj_szsh",
+            "公司债": "category_gszq_szsh",
+            "可转债": "category_kzzq_szsh",
+            "其他融资": "category_qtrz_szsh",
+            "股权变动": "category_gqbd_szsh",
+            "补充更正": "category_bcgz_szsh",
+            "澄清致歉": "category_cqdq_szsh",
+            "风险提示": "category_fxts_szsh",
+            "特别处理和退市": "category_tbclts_szsh",
+            "退市整理期": "category_tszlq_szsh"
+        }
+        for key, value in category_map.items():
+            print(f"正在下载 {date} {key} 数据...")
+            response_test = self.get_report(page_num, date, value)
+            data_test = response_test.json()
+            total_pages = data_test["totalpages"]
+            max_retries = 3 # 最大重试次数
+            retry_count = 0 # 当前重试次数
+            while page_num <= total_pages:
+                response = None
+                # 重试机制
+                while retry_count <= max_retries:
+                    # 发送请求
+                    try:
+                        response = self.get_report(page_num, date, value)
+                        response.raise_for_status()
+                        break
+                    except requests.exceptions.RequestException as e:
+                        print(f"出现错误!: {e}")
+                        print(f"5秒后重试...")
+                        time.sleep(5)
+                        retry_count += 1
 
-            if retry_count > max_retries:
-                print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
-                page_num += 1
-                retry_count = 0
-                continue
-            else:
-                # 解析数据
-                try:
-                    data = response.json()
-                    # per = (counter/sum)
-                    # if  per <1:
-                    #     print(f"\r当前年份下载进度 {per*100:.2f} %",end='')
-                    # else:
-                    #     print(f"\r下载完成,正在保存……", end='')
-                    # 尝试解析公告数据,如果解析失败则重试
-                    retry_count = 0
-                    while True:
-                        try:
-                            if data["announcements"] is None:
-                                raise Exception("公告数据为空")
-                            else:
-                                all_results.extend(data["announcements"])
-                            break
-                        except (TypeError, KeyError) as e:
-                            print(f"解析公告数据失败: {e}")
-                            print(f"5秒后重试...")
-                            time.sleep(5)
-                            retry_count += 1
-                            if retry_count > max_retries:
-                                raise Exception("达到最大重试次数,跳过此页")
-                            continue
+                if retry_count > max_retries:
+                    print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
                     page_num += 1
-                    # counter +=1
-                except (ValueError, KeyError) as e:
-                    print(f"解析响应数据失败: {e}")
-                    print(f"5秒后重试...")
-                    time.sleep(5)
-                    retry_count += 1
-                    if retry_count > max_retries:
-                        raise Exception("达到最大重试次数,跳过此页")
+                    retry_count = 0
                     continue
+                else:
+                    # 解析数据
+                    try:
+                        data = response.json()
+                        # 尝试解析公告数据,如果解析失败则重试
+                        retry_count = 0
+                        while True:
+                            try:
+                                if data["announcements"] is None:
+                                    raise Exception("公告数据为空")
+                                else:
+                                    all_results.extend(data["announcements"])
+                                break
+                            except (TypeError, KeyError) as e:
+                                print(f"解析公告数据失败: {e}")
+                                print(f"5秒后重试...")
+                                time.sleep(5)
+                                retry_count += 1
+                                if retry_count > max_retries:
+                                    raise Exception("达到最大重试次数,跳过此页")
+                                continue
+                        page_num += 1
+                    except (ValueError, KeyError) as e:
+                        print(f"解析响应数据失败: {e}")
+                        print(f"5秒后重试...")
+                        time.sleep(5)
+                        retry_count += 1
+                        if retry_count > max_retries:
+                            raise Exception("达到最大重试次数,跳过此页")
+                        continue
         return all_results
 
     def run(self):
-        years =[ 2018, 2019, 2020, 2021, 2022, 2023 ]
-        # counter = 1  # 计数器
-        sum = 0
-        for year in years:
+        for year in self.years:
             if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
                 continue
-            date_count = f"{year}-01-01~{year}-12-31"
-            response = self.get_report(1, date_count)
-            data = response.json()
-            sum = data['totalpages']
             all_results = []
             time_segments = [
                 f"{year}-01-01~{year}-01-31",
@@ -230,14 +251,13 @@ class Cninfo(object):
                 # data/公司名称/{年份}-标题
                 file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
                 for i in range(len(urls)):
+                    # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
                     self.pool.submit(self.download_file, urls[i],file_names[i] )
-                # wait(self.pool, return_when=ALL_COMPLETED)
                 print(f'----{year}年下载完成')
     
     def download_file(self, url, file_path):
         ''' download file 
         '''
-        # download from urls
         if not os.path.exists(file_path):
                     res = self.sess.get(url)
                     if res.status_code == 200: