
Finish listed-company announcement download

liuyuqi-dellpc · 5 months ago · commit 14f3c5aba1
10 changed files with 423 additions and 28 deletions
  1. .gitignore (+1 -0)
  2. README.md (+15 -1)
  3. crawl_sse/__init__.py (+2 -2)
  4. crawl_sse/cninfo.py (+254 -0)
  5. crawl_sse/pdf_2_txt.py (+104 -0)
  6. crawl_sse/sse.py (+2 -2)
  7. docs/cninfo.http (+20 -0)
  8. docs/上市公司分析.ipynb (+18 -19)
  9. main.py (+6 -3)
  10. requirements.txt (+1 -1)

+ 1 - 0
.gitignore

@@ -1,3 +1,4 @@
 *.csv
 *.pyc
 *.pdf
+*.xlsx

+ 15 - 1
README.md

@@ -1,5 +1,6 @@
 # crawl_sse
 
+Download listed-company data:
 
 ```
 virtualenv .venv
@@ -7,6 +8,19 @@ source .venv/bin/activate
 
 pip install -r requirements.txt
 
-python main.py
+python main.py company
+
+```
+
+Download listed-company annual reports:
+```
+# fetch the announcement list and save it locally (one xlsx file per year)
+python main.py nianbao
+
+# deduplicate the list in the ipynb notebook (docs/上市公司分析.ipynb)
+
+
+# download the annual reports
+python main.py nianbao --download
 
 ```

+ 2 - 2
crawl_sse/__init__.py

@@ -1,2 +1,2 @@
-from .sse import Sse
-
+# from .sse import Sse
+from .cninfo import Cninfo

+ 254 - 0
crawl_sse/cninfo.py

@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/12/03 16:11:15
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Listed-company annual report download
+'''
+import requests
+import re
+import openpyxl
+import time
+import os, sys, re, csv, calendar
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
+
+class Cninfo(object):
+    ''' 
+    cninfo announcement crawler (Shenzhen / Shanghai listed companies)
+    '''
+    years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]
+    host = "http://www.cninfo.com.cn"
+    headers = {
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "Content-Length": "195",
+        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "Origin": "http://www.cninfo.com.cn",
+        "Proxy-Connection": "keep-alive",
+        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_gddh_szsh",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
+        "X-Requested-With": "XMLHttpRequest"
+    }
+
+    def __init__(self):
+        self.sess = requests.Session()
+        self.pool = ThreadPoolExecutor(max_workers=10)
+
+    def get_report(self, page_num:int, date:str):
+        '''
+        Fetch one page of announcement search results.
+
+        params:
+            page_num: page number
+            date: query period, formatted as 2021-01-01~2021-12-31
+
+        plate: sz;sh selects both the Shenzhen and Shanghai markets
+        seDate: query period
+        '''
+
+        data = {
+            "pageNum": page_num,
+            "pageSize": 30,
+            "column": "szse",
+            "tabName": "fulltext",
+            "plate": "sz;sh",
+            "searchkey": "",
+            "secid": "",
+            "category": "category_gddh_szsh",
+            "trade": "",
+            "seDate": date,
+            "sortName": "code",
+            "sortType": "asc",
+            "isHLtitle": "false"
+        }
+        response = self.sess.post(f'{self.host}/new/hisAnnouncement/query', data=data)
+        return response
+
+
+    def download_report(self, date):
+        all_results = []
+        page_num = 1
+        response_test = self.get_report(page_num, date)
+        data_test = response_test.json()
+        total_pages = data_test["totalpages"]
+        max_retries = 3  # maximum number of retries
+        retry_count = 0  # current retry count
+        while page_num <= total_pages:
+            response = None
+            # retry loop
+            while retry_count <= max_retries:
+                # send the request
+                try:
+                    response = self.get_report(page_num, date)
+                    response.raise_for_status()
+                    break
+                except requests.exceptions.RequestException as e:
+                    print(f"出现错误!: {e}")
+                    print(f"5秒后重试...")
+                    time.sleep(5)
+                    retry_count += 1
+
+            if retry_count > max_retries:
+                print(f"{max_retries} 次重试后均失败. 跳过第 {page_num}页.")
+                page_num += 1
+                retry_count = 0
+                continue
+            else:
+                # parse the response data
+                try:
+                    data = response.json()
+                    # per = (counter/sum)
+                    # if  per <1:
+                    #     print(f"\r当前年份下载进度 {per*100:.2f} %",end='')
+                    # else:
+                    #     print(f"\r下载完成,正在保存……", end='')
+                    # try to parse the announcement list; retry if parsing fails
+                    retry_count = 0
+                    while True:
+                        try:
+                            if data["announcements"] is None:
+                                raise Exception("公告数据为空")
+                            else:
+                                all_results.extend(data["announcements"])
+                            break
+                        except (TypeError, KeyError) as e:
+                            print(f"解析公告数据失败: {e}")
+                            print(f"5秒后重试...")
+                            time.sleep(5)
+                            retry_count += 1
+                            if retry_count > max_retries:
+                                raise Exception("达到最大重试次数,跳过此页")
+                            continue
+                    page_num += 1
+                    # counter +=1
+                except (ValueError, KeyError) as e:
+                    print(f"解析响应数据失败: {e}")
+                    print(f"5秒后重试...")
+                    time.sleep(5)
+                    retry_count += 1
+                    if retry_count > max_retries:
+                        raise Exception("达到最大重试次数,跳过此页")
+                    continue
+        return all_results
+
+    def run(self):
+        years = self.years
+        # counter = 1  # 计数器
+        sum = 0
+        for year in years:
+            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+                continue
+            date_count = f"{year}-01-01~{year}-12-31"
+            response = self.get_report(1, date_count)
+            data = response.json()
+            sum = data['totalpages']
+            all_results = []
+            time_segments = [
+                f"{year}-01-01~{year}-01-31",
+                f"{year}-02-01~{year}-02-29" if calendar.isleap(year) else f"{year}-02-01~{year}-02-28",
+                f"{year}-03-01~{year}-03-31",
+                f"{year}-04-01~{year}-04-30",
+                f"{year}-05-01~{year}-05-31",
+                f"{year}-06-01~{year}-06-30",
+                f"{year}-07-01~{year}-07-31",
+                f"{year}-08-01~{year}-08-31",
+                f"{year}-09-01~{year}-09-30",
+                f"{year}-10-01~{year}-10-31",
+                f"{year}-11-01~{year}-11-30",
+                f"{year}-12-01~{year}-12-31",
+            ]
+            for segment in time_segments:
+                results = self.download_report(segment)
+                all_results.extend(results)
+
+            workbook = openpyxl.Workbook()
+            worksheet = workbook.active
+            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+
+            # parse the search results and append them to the Excel sheet
+            for item in all_results:
+                company_code = item["secCode"]
+                company_name = item["secName"]
+                title = item["announcementTitle"].strip()
+                # strip unwanted markup and special symbols, then re-wrap the title
+                title = re.sub(r"<.*?>", "", title)
+                title = title.replace(":", "")
+                title = f"《{title}》"
+
+                adjunct_url = item["adjunctUrl"]
+                # year taken from the announcement title (None if the title has no year)
+                match = re.search(r"\d{4}", title)
+                tmp_year = match.group() if match else None
+                announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
+
+                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+            # Note: the workbook is saved next to the code by default; change this path
+            # (and create the folder yourself) if the files should go elsewhere
+            workbook.save(f"股东大会公告链接_{year}.xlsx")
+
+            print(f"----{year}年下载完成")
+    
+    
+    def remove_dump(self):
+        ''' deduplicate each yearly link file by announcement URL '''
+        for year in self.years:
+            file_path = f'股东大会公告链接_{year}.xlsx'
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                df.drop_duplicates(subset=['年报链接'], keep='first', inplace=True)
+                df.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)
+
+    def download(self):
+        ''' read all links from the deduplicated xlsx files and download the PDFs '''
+        for year in self.years:
+            # read the deduplicated xlsx produced by remove_dump() / the notebook
+            file_path = f'股东大会公告链接_{year}_rep.xlsx'
+            print(f'process file:{year}')
+            # expected columns: 公司代码 公司简称 标题 年份 年报链接
+            if os.path.exists(file_path):
+                df = pd.read_excel(file_path)
+                urls = df['年报链接'].tolist()
+                # get all title
+                titles = df['标题'].tolist()
+                # get all company name
+                company_names = df['公司简称'].tolist()
+                # strip characters that are not allowed in file names (\ / : * ? " < > |, tabs, newlines, spaces)
+                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+                # get all year
+                years = df['年份'].tolist()
+                # get all company code
+                company_codes = df['公司代码'].tolist()
+                print(f'size: {len(company_codes)}')
+                for i in range(len(company_codes)):
+                    if not os.path.exists(f'data/{company_names[i]}'):
+                        os.makedirs(f'data/{company_names[i]}')
+                # data/公司名称/{年份}-标题
+                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
+                futures = [self.pool.submit(self.download_file, urls[i], file_names[i]) for i in range(len(urls))]
+                # block until every download task for this year has finished
+                wait(futures, return_when=ALL_COMPLETED)
+                print(f'----{year}年下载完成')
+    
+    def download_file(self, url, file_path):
+        ''' download a single announcement PDF if it is not already on disk '''
+        if not os.path.exists(file_path):
+            res = self.sess.get(url)
+            if res.status_code == 200:
+                with open(file_path, 'wb') as f:
+                    f.write(res.content)
+                    print(f'下载: {file_path} 完成')
+            time.sleep(1)
+        else:
+            print(f'file:{file_path} already exists')
+
+if __name__ == "__main__":
+    cninfo = Cninfo()
+    # cninfo.run()
+    cninfo.download()
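
The new `Cninfo` class splits the work into three steps: collect the yearly announcement lists, deduplicate them, then fetch the PDFs. A minimal usage sketch, assuming it is run from the repository root (where the per-year xlsx files and the `data/` folder are created):

```python
from crawl_sse import Cninfo

cninfo = Cninfo()
cninfo.run()          # writes 股东大会公告链接_{year}.xlsx for each year 2018-2023
cninfo.remove_dump()  # drops duplicate 年报链接 rows -> 股东大会公告链接_{year}_rep.xlsx
cninfo.download()     # saves each PDF as data/公司简称/年份-标题.pdf
```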

+ 104 - 0
crawl_sse/pdf_2_txt.py

@@ -0,0 +1,104 @@
+
+import pandas as pd
+import requests
+import os
+import multiprocessing
+import pdfplumber
+import logging
+import re
+
+# logging configuration
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+# download helper
+def download_pdf(pdf_url, pdf_file_path):
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'}
+        with requests.get(pdf_url, headers=headers, stream=True, timeout=10) as r:
+            r.raise_for_status()
+            with open(pdf_file_path, 'wb') as f:
+                f.write(r.content)
+    except requests.exceptions.RequestException as e:
+        logging.error(f"下载PDF文件失败:{e}")
+        return False
+    else:
+        return True
+
+# convert one announcement PDF to a TXT file
+def convert(code, name, title, pdf_url, pdf_dir, txt_dir, flag_pdf):
+    pdf_file_path = os.path.join(pdf_dir, re.sub(r'[\\/:*?"<>|]', '',f"{code:06}_{name}_{title}_{pdf_url}.pdf"))
+    txt_file_path = os.path.join(txt_dir, re.sub(r'[\\/:*?"<>|]', '', f"{code:06}_{name}_{title}_{pdf_url}.txt"))
+
+    try:
+        # download the PDF if it is not already present
+        if not os.path.exists(pdf_file_path):
+            retry_count = 3
+            while retry_count > 0:
+                if download_pdf(pdf_url, pdf_file_path):
+                    break
+                else:
+                    retry_count -= 1
+            if retry_count == 0:
+                logging.error(f"下载失败:{pdf_url}")
+                return
+
+        # convert the PDF to a TXT file
+        with pdfplumber.open(pdf_file_path) as pdf:
+            with open(txt_file_path, 'w', encoding='utf-8') as f:
+                for page in pdf.pages:
+                    # extract_text() can return None for image-only pages
+                    text = page.extract_text() or ""
+                    f.write(text)
+
+        logging.info(f"{txt_file_path} 已保存.")
+
+    except Exception as e:
+        logging.error(f"处理 {code:06}_{name}_{pdf_url}时出错: {e}")
+    else:
+        # delete the converted PDF to save disk space
+        if flag_pdf:
+            os.remove(pdf_file_path)
+            logging.info(f"{pdf_file_path} 已被删除.")
+
+
+
+def main(file_name,pdf_dir,txt_dir,flag_pdf):
+    print("程序开始运行,请耐心等待……")
+    # read the Excel file with the announcement links
+    try:
+        df = pd.read_excel(file_name)
+    except Exception as e:
+        logging.error(f"读取失败,请检查路径是否设置正确,建议输入绝对路径 {e}")
+        return
+    try:
+        os.makedirs(pdf_dir, exist_ok=True)
+        os.makedirs(txt_dir, exist_ok=True)
+    except Exception as e:
+        logging.error(f"创建文件夹失败!请检查文件夹是否为只读! {e}")
+        return
+
+    # iterate over the rows as (code, name, title, url) tuples
+    content_dict = ((row['公司代码'], row['公司简称'], row['标题'], row['年报链接']) for _, row in df.iterrows())
+
+    # download the PDFs and convert them to TXT in a process pool
+    with multiprocessing.Pool() as pool:
+        for code, name, title, pdf_url in content_dict:
+            txt_file_name = f"{code:06}_{name}_{title}_{pdf_url}.txt"
+            txt_file_path = os.path.join(txt_dir, txt_file_name)
+            if os.path.exists(txt_file_path):
+                logging.info(f"{txt_file_name} 已存在,跳过.")
+            else:
+                pool.apply_async(convert, args=(code, name, title, pdf_url, pdf_dir, txt_dir, flag_pdf))
+
+        pool.close()
+        pool.join()
+
+
+if __name__ == '__main__':
+    # whether to delete each PDF after conversion (True = delete, False = keep)
+    flag_pdf = False
+    year = 2018
+    file_name = r"F:\juchao_link\股东大会公告链接_2018_去重.xlsx"
+    pdf_dir = r'F:\股东大会公告2018pdf'
+    txt_dir = r'F:\股东大会公告2018txt'
+    main(file_name, pdf_dir, txt_dir, flag_pdf)
+    print(f"{year}年年报处理完毕,若报错,请检查后重新运行")
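
The `__main__` block above hard-codes the author's local `F:\` paths; a hedged sketch of calling `main()` with relative placeholder paths instead (the directory names here are assumptions, not part of the repo):

```python
from crawl_sse.pdf_2_txt import main

# placeholder paths: the deduplicated link file and two output folders
main(file_name="股东大会公告链接_2018_rep.xlsx",
     pdf_dir="pdf_2018",
     txt_dir="txt_2018",
     flag_pdf=False)   # keep the downloaded PDFs after conversion
```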

+ 2 - 2
crawl_sse/sse.py

@@ -21,7 +21,7 @@ from selenium.webdriver.support import expected_conditions as EC
 import selenium.common.exceptions
 
 class Sse(object):
-    
+
     _host = r'http://www.sse.com.cn'
     _headers = {
         'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/',
@@ -86,7 +86,7 @@ class Sse(object):
                     print(f'error:{e}')
         except Exception as e:
             print(f'error:{e}')
-    
+
     def get_diqu_data(self):
         ''' 获取地区数据
          '''

+ 20 - 0
docs/cninfo.http

@@ -0,0 +1,20 @@
+
+
+### query the announcement list (the endpoint expects these fields as form-encoded POST data, see crawl_sse/cninfo.py)
+POST http://www.cninfo.com.cn/new/hisAnnouncement/query
+
+{
+    "pageNum": 1,
+    "pageSize": 30,
+    "column": "szse",
+    "tabName": "fulltext",
+    "plate": "sz;sh",
+    "searchkey": "",
+    "secid": "",
+    "category": "category_gddh_szsh",
+    "trade": "",
+    "seDate": "2021-01-01~2021-12-31",
+    "sortName": "code",
+    "sortType": "asc",
+    "isHLtitle": "false"
+}
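
For quick testing outside an `.http` client, the same query can be reproduced with `requests`; this is a small sketch that posts the fields above as form data, the way `crawl_sse/cninfo.py` does:

```python
import requests

payload = {
    "pageNum": 1, "pageSize": 30, "column": "szse", "tabName": "fulltext",
    "plate": "sz;sh", "searchkey": "", "secid": "",
    "category": "category_gddh_szsh", "trade": "",
    "seDate": "2021-01-01~2021-12-31",
    "sortName": "code", "sortType": "asc", "isHLtitle": "false",
}
resp = requests.post("http://www.cninfo.com.cn/new/hisAnnouncement/query", data=payload)
resp.raise_for_status()
result = resp.json()
# totalpages and announcements are the fields the crawler reads
print(result["totalpages"], len(result["announcements"] or []))
```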

+ 18 - 19
上市公司分析.ipynb → docs/上市公司分析.ipynb

@@ -39,32 +39,31 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "Deduplicate the downloaded announcement data for each year"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import pandas as pd\n",
+    "import os,sys,re\n",
+    "\n",
+    "years = [ 2018, 2019, 2020, 2021, 2022, 2023 ]\n",
+    "for year in years:\n",
+    "    file_path = f'股东大会公告链接_{year}.xlsx'\n",
+    "    if os.path.exists(file_path):\n",
+    "        df_2018 = pd.read_excel(file_path)\n",
+    "        # drop duplicates based on the values in the df_2018['年报链接'] column\n",
+    "        df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)\n",
+    "        df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)\n",
+    "\n"
+   ]
   }
  ],
  "metadata": {

+ 6 - 3
main.py

@@ -7,8 +7,11 @@
 @Desc    :   enter point
 '''
 
-from crawl_sse import Sse
+# from crawl_sse import Sse
+from crawl_sse import Cninfo
 
 if __name__=='__main__':
-    sse = Sse()
-    sse.crawl()
+    # sse = Sse()
+    # sse.crawl()
+    cninfo = Cninfo()
+    cninfo.download()
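
The README now documents `python main.py company`, `python main.py nianbao` and `--download`, but main.py still calls `cninfo.download()` unconditionally. A hedged sketch of how the dispatch could look (the subcommand names come from the README; none of this argument parsing exists in the commit):

```python
import argparse

from crawl_sse import Cninfo
# from crawl_sse import Sse  # older SSE listed-company crawler, currently disabled

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='crawl_sse entry point')
    parser.add_argument('task', choices=['company', 'nianbao'])
    parser.add_argument('--download', action='store_true',
                        help='download the PDFs listed in the deduplicated xlsx files')
    args = parser.parse_args()

    if args.task == 'nianbao':
        cninfo = Cninfo()
        if args.download:
            cninfo.download()    # fetch the PDFs from the *_rep.xlsx lists
        else:
            cninfo.run()         # build the per-year announcement lists
            # deduplication: docs/上市公司分析.ipynb or cninfo.remove_dump()
    else:
        pass                     # Sse().crawl() once the SSE crawler is re-enabled
```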

+ 1 - 1
requirements.txt

@@ -4,6 +4,6 @@ bs4
 lxml==4.9.3
 selenium==4.11.2
 webdriver_manager==4.0.1
-
+pdfplumber