
Finish listed-company announcement download

liuyuqi-dellpc · 5 months ago · commit 14f3c5aba1
10 changed files with 423 additions and 28 deletions
  1. .gitignore (+1 -0)
  2. README.md (+15 -1)
  3. crawl_sse/__init__.py (+2 -2)
  4. crawl_sse/cninfo.py (+254 -0)
  5. crawl_sse/pdf_2_txt.py (+104 -0)
  6. crawl_sse/sse.py (+2 -2)
  7. docs/cninfo.http (+20 -0)
  8. docs/上市公司分析.ipynb (+18 -19)
  9. main.py (+6 -3)
  10. requirements.txt (+1 -1)

+ 1 - 0
.gitignore

@@ -1,3 +1,4 @@
 *.csv
 *.pyc
 *.pdf
+*.xlsx

+ 15 - 1
README.md

@@ -1,5 +1,6 @@
 # crawl_sse
 
+Download listed-company data:
 
 ```
 virtualenv .venv
@@ -7,6 +8,19 @@ source .venv/bin/activate
 
 pip install -r requirements.txt
 
-python main.py
+python main.py company
+
+```
+
+Download listed-company annual reports:
+```
+# fetch the announcement list and save it locally (one xlsx file per year)
+python main.py nianbao
+
+# deduplicate the list in the ipynb notebook (docs/上市公司分析.ipynb)
+
+
+# download the annual reports
+python main.py nianbao --download
 
 ```

+ 2 - 2
crawl_sse/__init__.py

@@ -1,2 +1,2 @@
-from .sse import Sse
-
+# from .sse import Sse
+from .cninfo import Cninfo

+ 254 - 0
crawl_sse/cninfo.py

@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/12/03 16:11:15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Listed-company annual report download
+'''
+import requests
+import re
+import openpyxl
+import time
+import os, sys, re, csv, calendar
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
+
+class Cninfo(object):
+    ''' 
+    cninfo announcement crawler (Shenzhen / Shanghai listed companies)
+    '''
+    years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]
+    host = "http://www.cninfo.com.cn"
+    headers = {
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "Content-Length": "195",
+        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "Origin": "http://www.cninfo.com.cn",
+        "Proxy-Connection": "keep-alive",
+        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_gddh_szsh",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
+        "X-Requested-With": "XMLHttpRequest"
+    }
+
+    def __init__(self):
+        self.sess = requests.Session()
+        self.pool = ThreadPoolExecutor(max_workers=10)
+
+    def get_report(self, page_num:int, date:str):
+        '''
+        Fetch one page of announcement search results.
+
+        params:
+            page_num: page number
+            date: query period, formatted as 2021-01-01~2021-12-31
+
+        plate: sz;sh selects both the Shenzhen and Shanghai markets
+        seDate: query period
+        '''
+
+        data = {
+            "pageNum": page_num,
+            "pageSize": 30,
+            "column": "szse",
+            "tabName": "fulltext",
+            "plate": "sz;sh",
+            "searchkey": "",
+            "secid": "",
+            "category": "category_gddh_szsh",
+            "trade": "",
+            "seDate": date,
+            "sortName": "code",
+            "sortType": "asc",
+            "isHLtitle": "false"
+        }
+        response = self.sess.post(f'{self.host}/new/hisAnnouncement/query', data=data)
+        return response
+
+
+    def download_report(self, date):
+        all_results = []
+        page_num = 1
+        response_test = self.get_report(page_num, date)
+        data_test = response_test.json()
+        total_pages = data_test["totalpages"]
+        max_retries = 3  # maximum number of retries
+        retry_count = 0  # current retry count
+        while page_num <= total_pages:
+            response = None
+            # retry loop
+            while retry_count <= max_retries:
+                # send the request
+                try:
+                    response = self.get_report(page_num, date)
+                    response.raise_for_status()
+                    break
+                except requests.exceptions.RequestException as e:
+                    print(f"出现错误!: {e}")
+                    print(f"5秒后重试...")
+                    time.sleep(5)
+                    retry_count += 1
+
+            if retry_count > max_retries:
+                print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
+                page_num += 1
+                retry_count = 0
+                continue
+            else:
+                # parse the response data
+                try:
+                    data = response.json()
+                    # per = (counter/sum)
+                    # if  per <1:
+                    #     print(f"\r当前年份下载进度 {per*100:.2f} %",end='')
+                    # else:
+                    #     print(f"\r下载完成,正在保存……", end='')
+                    # try to parse the announcement list; retry if parsing fails
+                    retry_count = 0
+                    while True:
+                        try:
+                            if data["announcements"] is None:
+                                raise Exception("公告数据为空")
+                            else:
+                                all_results.extend(data["announcements"])
+                            break
+                        except (TypeError, KeyError) as e:
+                            print(f"解析公告数据失败: {e}")
+                            print(f"5秒后重试...")
+                            time.sleep(5)
+                            retry_count += 1
+                            if retry_count > max_retries:
+                                raise Exception("达到最大重试次数,跳过此页")
+                            continue
+                    page_num += 1
+                    # counter +=1
+                except (ValueError, KeyError) as e:
+                    print(f"解析响应数据失败: {e}")
+                    print(f"5秒后重试...")
+                    time.sleep(5)
+                    retry_count += 1
+                    if retry_count > max_retries:
+                        raise Exception("达到最大重试次数,跳过此页")
+                    continue
+        return all_results
+
+    def run(self):
+        years = self.years
+        # counter = 1  # 计数器
+        sum = 0
+        for year in years:
+            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+                continue
+            date_count = f"{year}-01-01~{year}-12-31"
+            response = self.get_report(1, date_count)
+            data = response.json()
+            sum = data['totalpages']
+            all_results = []
+            time_segments = [
+                f"{year}-01-01~{year}-01-31",
+                f"{year}-02-01~{year}-02-29" if calendar.isleap(year) else f"{year}-02-01~{year}-02-28",
+                f"{year}-03-01~{year}-03-31",
+                f"{year}-04-01~{year}-04-30",
+                f"{year}-05-01~{year}-05-31",
+                f"{year}-06-01~{year}-06-30",
+                f"{year}-07-01~{year}-07-31",
+                f"{year}-08-01~{year}-08-31",
+                f"{year}-09-01~{year}-09-30",
+                f"{year}-10-01~{year}-10-31",
+                f"{year}-11-01~{year}-11-30",
+                f"{year}-12-01~{year}-12-31",
+            ]
+            for segment in time_segments:
+                results = self.download_report(segment)
+                all_results.extend(results)
+
+            workbook = openpyxl.Workbook()
+            worksheet = workbook.active
+            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+
+            # parse the search results and append them to the Excel sheet
+            for item in all_results:
+                company_code = item["secCode"]
+                company_name = item["secName"]
+                title = item["announcementTitle"].strip()
+                # strip unwanted markup and special symbols, then re-wrap the title
+                title = re.sub(r"<.*?>", "", title)
+                title = title.replace(":", "")
+                title = f"《{title}》"
+
+                adjunct_url = item["adjunctUrl"]
+                # year taken from the announcement title (None if the title has no year)
+                match = re.search(r"\d{4}", title)
+                tmp_year = match.group() if match else None
+                announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
+
+                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+            # Note: the workbook is saved next to the code by default; change this path
+            # (and create the folder yourself) if the files should go elsewhere
+            workbook.save(f"股东大会公告链接_{year}.xlsx")
+
+            print(f"----{year}年下载完成")
+    
+    
+    def remove_dump(self):
+        ''' deduplicate each yearly link file by announcement URL '''
+        for year in self.years:
+            file_path = f'股东大会公告链接_{year}.xlsx'
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                df.drop_duplicates(subset=['年报链接'], keep='first', inplace=True)
+                df.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)
+
+    def download(self):
+        ''' read all links from the deduplicated xlsx files and download the PDFs '''
+        for year in self.years:
+            # read the deduplicated xlsx produced by remove_dump() / the notebook
+            file_path = f'股东大会公告链接_{year}_rep.xlsx'
+            print(f'process file:{year}')
+            # expected columns: 公司代码 公司简称 标题 年份 年报链接
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                urls = df['年报链接'].tolist()
+                # get all title
+                titles = df['标题'].tolist()
+                # get all company name
+                company_names = df['公司简称'].tolist()
+                # strip characters that are not allowed in file names (\ / : * ? " < > |, tabs, newlines, spaces)
+                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+                # get all year
+                years = df['年份'].tolist()
+                # get all company code
+                company_codes = df['公司代码'].tolist()
+                print(f'size: {len(company_codes)}')
+                for i in range(len(company_codes)):
+                    if not os.path.exists(f'data/{company_names[i]}'):
+                        os.makedirs(f'data/{company_names[i]}')
+                # data/公司名称/{年份}-标题
+                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
+                futures = [self.pool.submit(self.download_file, urls[i], file_names[i]) for i in range(len(urls))]
+                # block until every download task for this year has finished
+                wait(futures, return_when=ALL_COMPLETED)
+                print(f'----{year}年下载完成')
+    
+    def download_file(self, url, file_path):
+        ''' download a single announcement PDF if it is not already on disk '''
+        if not os.path.exists(file_path):
+            res = self.sess.get(url)
+            if res.status_code == 200:
+                with open(file_path, 'wb') as f:
+                    f.write(res.content)
+                    print(f'下载: {file_path} 完成')
+            time.sleep(1)
+        else:
+            print(f'file:{file_path} already exists')
+
+if __name__ == "__main__":
+    cninfo = Cninfo()
+    # cninfo.run()
+    cninfo.download()
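
The new `Cninfo` class splits the work into three steps: collect the yearly announcement lists, deduplicate them, then fetch the PDFs. A minimal usage sketch, assuming it is run from the repository root (where the per-year xlsx files and the `data/` folder are created):

```python
from crawl_sse import Cninfo

cninfo = Cninfo()
cninfo.run()          # writes 股东大会公告链接_{year}.xlsx for each year 2018-2023
cninfo.remove_dump()  # drops duplicate 年报链接 rows -> 股东大会公告链接_{year}_rep.xlsx
cninfo.download()     # saves each PDF as data/公司简称/年份-标题.pdf
```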

+ 104 - 0
crawl_sse/pdf_2_txt.py

@@ -0,0 +1,104 @@
+
+import pandas as pd
+import requests
+import os
+import multiprocessing
+import pdfplumber
+import logging
+import re
+
+# logging configuration
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+# download helper
+def download_pdf(pdf_url, pdf_file_path):
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'}
+        with requests.get(pdf_url, headers=headers, stream=True, timeout=10) as r:
+            r.raise_for_status()
+            with open(pdf_file_path, 'wb') as f:
+                f.write(r.content)
+    except requests.exceptions.RequestException as e:
+        logging.error(f"下载PDF文件失败:{e}")
+        return False
+    else:
+        return True
+
+# convert one announcement PDF to a TXT file
+def convert(code, name, title, pdf_url, pdf_dir, txt_dir, flag_pdf):
+    pdf_file_path = os.path.join(pdf_dir, re.sub(r'[\\/:*?"<>|]', '',f"{code:06}_{name}_{title}_{pdf_url}.pdf"))
+    txt_file_path = os.path.join(txt_dir, re.sub(r'[\\/:*?"<>|]', '', f"{code:06}_{name}_{title}_{pdf_url}.txt"))
+
+    try:
+        # download the PDF if it is not already present
+        if not os.path.exists(pdf_file_path):
+            retry_count = 3
+            while retry_count > 0:
+                if download_pdf(pdf_url, pdf_file_path):
+                    break
+                else:
+                    retry_count -= 1
+            if retry_count == 0:
+                logging.error(f"下载失败:{pdf_url}")
+                return
+
+        # convert the PDF to a TXT file
+        with pdfplumber.open(pdf_file_path) as pdf:
+            with open(txt_file_path, 'w', encoding='utf-8') as f:
+                for page in pdf.pages:
+                    # extract_text() can return None for image-only pages
+                    text = page.extract_text() or ""
+                    f.write(text)
+
+        logging.info(f"{txt_file_path} 已保存.")
+
+    except Exception as e:
+        logging.error(f"处理 {code:06}_{name}_{pdf_url}时出错: {e}")
+    else:
+        # delete the converted PDF to save disk space
+        if flag_pdf:
+            os.remove(pdf_file_path)
+            logging.info(f"{pdf_file_path} 已被删除.")
+
+
+
+def main(file_name,pdf_dir,txt_dir,flag_pdf):
+    print("程序开始运行,请耐心等待……")
+    # read the Excel file with the announcement links
+    try:
+        df = pd.read_excel(file_name)
+    except Exception as e:
+        logging.error(f"读取失败,请检查路径是否设置正确,建议输入绝对路径 {e}")
+        return
+    try:
+        os.makedirs(pdf_dir, exist_ok=True)
+        os.makedirs(txt_dir, exist_ok=True)
+    except Exception as e:
+        logging.error(f"创建文件夹失败!请检查文件夹是否为只读! {e}")
+        return
+
+    # iterate over the rows as (code, name, title, url) tuples
+    content_dict = ((row['公司代码'], row['公司简称'], row['标题'], row['年报链接']) for _, row in df.iterrows())
+
+    # download the PDFs and convert them to TXT in a process pool
+    with multiprocessing.Pool() as pool:
+        for code, name, title, pdf_url in content_dict:
+            txt_file_name = f"{code:06}_{name}_{title}_{pdf_url}.txt"
+            txt_file_path = os.path.join(txt_dir, txt_file_name)
+            if os.path.exists(txt_file_path):
+                logging.info(f"{txt_file_name} 已存在,跳过.")
+            else:
+                pool.apply_async(convert, args=(code, name, title, pdf_url, pdf_dir, txt_dir, flag_pdf))
+
+        pool.close()
+        pool.join()
+
+
+if __name__ == '__main__':
+    # whether to delete each PDF after conversion (True = delete, False = keep)
+    flag_pdf = False
+    year = 2018
+    file_name = r"F:\juchao_link\股东大会公告链接_2018_去重.xlsx"
+    pdf_dir = r'F:\股东大会公告2018pdf'
+    txt_dir = r'F:\股东大会公告2018txt'
+    main(file_name, pdf_dir, txt_dir, flag_pdf)
+    print(f"{year}年年报处理完毕,若报错,请检查后重新运行")
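
The `__main__` block above hard-codes the author's local `F:\` paths; a hedged sketch of calling `main()` with relative placeholder paths instead (the directory names here are assumptions, not part of the repo):

```python
from crawl_sse.pdf_2_txt import main

# placeholder paths: the deduplicated link file and two output folders
main(file_name="股东大会公告链接_2018_rep.xlsx",
     pdf_dir="pdf_2018",
     txt_dir="txt_2018",
     flag_pdf=False)   # keep the downloaded PDFs after conversion
```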

+ 2 - 2
crawl_sse/sse.py

@@ -21,7 +21,7 @@ from selenium.webdriver.support import expected_conditions as EC
 import selenium.common.exceptions
 
 class Sse(object):
-    
+
     _host = r'http://www.sse.com.cn'
     _headers = {
         'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/',
@@ -86,7 +86,7 @@ class Sse(object):
                     print(f'error:{e}')
         except Exception as e:
             print(f'error:{e}')
-    
+
     def get_diqu_data(self):
         ''' 获取地区数据
          '''

+ 20 - 0
docs/cninfo.http

@@ -0,0 +1,20 @@
+
+
+### query the announcement list (the endpoint expects these fields as form-encoded POST data, see crawl_sse/cninfo.py)
+POST http://www.cninfo.com.cn/new/hisAnnouncement/query
+
+{
+    "pageNum": 1,
+    "pageSize": 30,
+    "column": "szse",
+    "tabName": "fulltext",
+    "plate": "sz;sh",
+    "searchkey": "",
+    "secid": "",
+    "category": "category_gddh_szsh",
+    "trade": "",
+    "seDate": "2021-01-01~2021-12-31",
+    "sortName": "code",
+    "sortType": "asc",
+    "isHLtitle": "false"
+}
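
For quick testing outside an `.http` client, the same query can be reproduced with `requests`; this is a small sketch that posts the fields above as form data, the way `crawl_sse/cninfo.py` does:

```python
import requests

payload = {
    "pageNum": 1, "pageSize": 30, "column": "szse", "tabName": "fulltext",
    "plate": "sz;sh", "searchkey": "", "secid": "",
    "category": "category_gddh_szsh", "trade": "",
    "seDate": "2021-01-01~2021-12-31",
    "sortName": "code", "sortType": "asc", "isHLtitle": "false",
}
resp = requests.post("http://www.cninfo.com.cn/new/hisAnnouncement/query", data=payload)
resp.raise_for_status()
result = resp.json()
# totalpages and announcements are the fields the crawler reads
print(result["totalpages"], len(result["announcements"] or []))
```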

+ 18 - 19
上市公司分析.ipynb → docs/上市公司分析.ipynb

@@ -39,32 +39,31 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "Deduplicate the downloaded announcement data for each year"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import pandas as pd\n",
+    "import os,sys,re\n",
+    "\n",
+    "years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]\n",
+    "for year in years:\n",
+    "    file_path = f'股东大会公告链接_{year}.xlsx'\n",
+    "    if os.path.exists(file_path):\n",
+    "        df_2018 = pd.read_excel(file_path)\n",
+    "        # drop duplicates based on the values in the df_2018['年报链接'] column\n",
+    "        df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)\n",
+    "        df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)\n",
+    "\n"
+   ]
   }
  ],
  "metadata": {

+ 6 - 3
main.py

@@ -7,8 +7,11 @@
 @Desc    :   enter point
 '''
 
-from crawl_sse import Sse
+# from crawl_sse import Sse
+from crawl_sse import Cninfo
 
 if __name__=='__main__':
-    sse = Sse()
-    sse.crawl()
+    # sse = Sse()
+    # sse.crawl()
+    cninfo = Cninfo()
+    cninfo.download()
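
The README now documents `python main.py company`, `python main.py nianbao` and `--download`, but main.py still calls `cninfo.download()` unconditionally. A hedged sketch of how the dispatch could look (the subcommand names come from the README; none of this argument parsing exists in the commit):

```python
import argparse

from crawl_sse import Cninfo
# from crawl_sse import Sse  # older SSE listed-company crawler, currently disabled

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='crawl_sse entry point')
    parser.add_argument('task', choices=['company', 'nianbao'])
    parser.add_argument('--download', action='store_true',
                        help='download the PDFs listed in the deduplicated xlsx files')
    args = parser.parse_args()

    if args.task == 'nianbao':
        cninfo = Cninfo()
        if args.download:
            cninfo.download()    # fetch the PDFs from the *_rep.xlsx lists
        else:
            cninfo.run()         # build the per-year announcement lists
            # deduplication: docs/上市公司分析.ipynb or cninfo.remove_dump()
    else:
        pass                     # Sse().crawl() once the SSE crawler is re-enabled
```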

+ 1 - 1
requirements.txt

@@ -4,6 +4,6 @@ bs4
 lxml==4.9.3
 selenium==4.11.2
 webdriver_manager==4.0.1
-
+pdfplumber