fish committed 3 months ago
commit 0837f58893
5 changed files with 122 additions and 87 deletions
  1. .env (+1, -1)
  2. Dockerfile (+7, -3)
  3. README.md (+12, -0)
  4. crawl_sse/cninfo.py (+94, -79)
  5. docker-compose.yml (+8, -4)

+ 1 - 1
.env

@@ -1,2 +1,2 @@
-year = []
+year = [ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
 cookie = 
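
The commit fills in the previously empty `year` list in .env; how the application consumes it is not part of this diff. A minimal sketch of one way to read it, assuming python-dotenv and an `ast.literal_eval` of the raw string (both assumptions):

```python
# hypothetical sketch -- the repo's real .env parsing is not shown in this commit
import ast
import os

from dotenv import load_dotenv  # assumes python-dotenv is a dependency

load_dotenv()  # load .env into the process environment
# the raw value is the literal string "[ 2025, 2024, ... ]"
years = ast.literal_eval(os.getenv("year", "[]"))  # -> list[int], [] if unset
```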

+ 7 - 3
Dockerfile

@@ -1,5 +1,5 @@
 # For more information, please refer to https://aka.ms/vscode-docker-python
-FROM python:3-slim
+FROM python:3.12-slim
 
 # Keeps Python from generating .pyc files in the container
 ENV PYTHONDONTWRITEBYTECODE=1
@@ -8,8 +8,11 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 
 # Install pip requirements
-COPY requirements.txt .
-RUN python -m pip install -r requirements.txt
+COPY pyproject.toml .
+RUN python -m pip install "poetry==1.8.2"
+RUN poetry config virtualenvs.create false
+# --no-root: install only the dependencies; the project source is copied in below
+RUN poetry install --no-root
 
 WORKDIR /app
 COPY . /app
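
The image now resolves dependencies with Poetry from pyproject.toml, which is not included in this commit. A hypothetical minimal file that would satisfy the Dockerfile above; the dependency list is inferred from the imports visible in cninfo.py and is an assumption:

```toml
# hypothetical pyproject.toml -- not part of this commit
[tool.poetry]
name = "crawl_sse"
version = "1.0.1"
description = "cninfo/SSE announcement crawler"
authors = ["fish"]

[tool.poetry.dependencies]
python = "^3.12"
openpyxl = "*"   # assumed from cninfo.py
pandas = "*"     # assumed from cninfo.py
requests = "*"   # assumed HTTP client

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```

Without a committed poetry.lock, builds are not reproducible; pinning one alongside pyproject.toml would make the `poetry install` layer deterministic.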

+ 12 - 0
README.md

@@ -26,8 +26,20 @@ Package, ship and run with docker:
 ```
 # docker run -d -p 9515:9515 -v $(pwd):/app mcr.microsoft.com/msedge/msedgedriver
 
+# images
+crawl_sse
+jianboy/crawl_sse:1.0.1
+sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1
+
+# download
 docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download --extractor cninfo
 
+# crawl
+docker run -it --rm -v $(pwd):/app/data crawl_sse python main.py crawl --extractor sse
+
+# download
+docker run -it --rm -v $(pwd)/data:/app/data sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1 python main.py download --extractor cninfo
+
 ```
 
 

+ 94 - 79
crawl_sse/cninfo.py

@@ -18,7 +18,7 @@ class Cninfo(object):
     ''' 
     巨潮资讯
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
+    years =[ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,69 +162,64 @@ class Cninfo(object):
                         continue
         return all_results
 
-    def crawl(self):
-        ''' Main function:
-         download the shareholder meeting announcement links and save them as xlsx
-           '''
-        for year in self.years:
-            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
-                continue
-            all_results = []
-            time_segments = [
-                f"{year}-01-01~{year}-01-31",
-                f"{year}-02-01~{year}-02-28",
-                f"{year}-03-01~{year}-03-31",
-                f"{year}-04-01~{year}-04-30",
-                f"{year}-05-01~{year}-05-30",
-                f"{year}-06-01~{year}-06-30",
-                f"{year}-07-01~{year}-07-31",
-                f"{year}-08-01~{year}-08-31",
-                f"{year}-09-01~{year}-09-30",
-                f"{year}-10-01~{year}-10-31",
-                f"{year}-11-01~{year}-11-30",
-                f"{year}-12-01~{year}-12-31",
-            ]
-            for i in time_segments:
-                results = self.downlaod_report(i)
-                all_results.extend(results)
+    def _crawl_report(self, year: int):
+        ''' Crawl one year's shareholder meeting announcement links and save them to xlsx. '''
+        if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+            return
+        all_results = []
+        # one search segment per month; calendar.monthrange gives the correct last
+        # day of each month (31-day months, leap years). Requires `import calendar`
+        # at the top of the module.
+        time_segments = [
+            f"{year}-{month:02d}-01~{year}-{month:02d}-{calendar.monthrange(year, month)[1]}"
+            for month in range(1, 13)
+        ]
+        for segment in time_segments:
+            results = self.downlaod_report(segment)
+            all_results.extend(results)
 
-            workbook = openpyxl.Workbook()
-            worksheet = workbook.active
-            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.active
+        worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
 
-            # Parse the search results and append them to the Excel sheet
-            for item in all_results:
-                company_code = item["secCode"]
-                company_name = item["secName"]
-                title = item["announcementTitle"].strip()
-                # Strip unwanted markup and special characters, then rebuild the title
-                title = re.sub(r"<.*?>", "", title)
-                title = title.replace(":", "")
-                title = f"《{title}》"
+        # Parse the search results and append them to the Excel sheet
+        for item in all_results:
+            company_code = item["secCode"]
+            company_name = item["secName"]
+            title = item["announcementTitle"].strip()
+            # Strip unwanted markup and special characters, then rebuild the title
+            title = re.sub(r"<.*?>", "", title)
+            title = title.replace(":", "")
+            title = f"《{title}》"
 
-                adjunct_url = item["adjunctUrl"]
-                year = re.search(r"\d{4}", title)
-                if year:
-                    tmp_year = year.group()
-                else:
-                    tmp_year = year
-                announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+            adjunct_url = item["adjunctUrl"]
+            # Pull the four-digit year out of the title. Use a separate name so the
+            # `year` parameter is not shadowed; shadowing it broke the save filename.
+            match = re.search(r"\d{4}", title)
+            tmp_year = match.group() if match else None
+            announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
+
+            worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+        # Note: the xlsx is saved next to the code by default; create a folder and
+        # adjust the path here if it should go elsewhere.
+        workbook.save(f"股东大会公告链接_{year}.xlsx")
 
-                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
-            # Note: saved next to the code by default; create a folder and adjust the path here to change it
-            workbook.save(f"股东大会公告链接_{year}.xlsx")
+        print(f"----{year}年获取完成")
+        self._remove_dump(year)
+        print(f"--------去重-----")
 
-            print(f"----{year}年获取完成")
+    def crawl(self):
+        ''' Main function: crawl the shareholder meeting announcement links for
+        every configured year and save them as xlsx. '''
+        for year in self.years:
+            self._crawl_report(year)
-        self._remove_dump()
     
-    def _remove_dump(self):
+    def _remove_dump(self, year: int):
         ''' Deduplicate '''
-        for year in self.years:
-            file_path = f'股东大会公告链接_{year}.xlsx'
-            if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
-                df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        if os.path.exists(file_path):
+            df = pd.read_excel(file_path)
+            df.drop_duplicates(subset=['年报链接'], keep='first', inplace=True)
+            df.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)
 
     def download(self):
         ''' read all links and download them '''
@@ -234,30 +229,34 @@
             print(f'process file:{year}')
             # if the file exists; expected columns: 公司代码 公司简称 标题 年份 年报链接
             if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df = pd.read_excel(file_path)
-                urls = df['年报链接'].tolist()
-                # get all title
-                titles = df['标题'].tolist()
-                # get all company name
-                company_names = df['公司简称'].tolist()
-                # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n ans space
-                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
-                # get all year
-                years = df['年份'].tolist()
-                # get all company code
-                company_codes = df['公司代码'].tolist()
-                print(len(company_codes),f'size: {len(company_codes)}')
-                for i in range(len(company_codes)):
-                    if not os.path.exists(f'data/{company_names[i]}'):
-                        os.makedirs(f'data/{company_names[i]}')
-                # data/公司名称/{年份}-标题
-                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
-                for i in range(len(urls)):
-                    # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
-                    self.pool.submit(self._download_file, urls[i],file_names[i] )
-                print(f'----{year}年下载完成')
+                self._download(year)
+            else:
+                print(f'file: {file_path} does not exist; crawling it first')
+                self._crawl_report(year)
+                self._download(year)
+        self.pool.shutdown(wait=True)
     
+    def _download(self, year: int):
+        ''' Download every announcement PDF listed in one year's xlsx. '''
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        df = pd.read_excel(file_path)
+        urls = df['年报链接'].tolist()
+        titles = df['标题'].tolist()
+        company_names = df['公司简称'].tolist()
+        # strip characters that are illegal in file names: / \ : * ? " < > | plus whitespace
+        company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+        years = df['年份'].tolist()
+        company_codes = df['公司代码'].tolist()
+        print(f'size: {len(company_codes)}')
+        # one folder per company: data/<company>/<year>-<title>.pdf
+        for i in range(len(company_codes)):
+            os.makedirs(f'data/{company_names[i]}', exist_ok=True)
+        file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
+        for i in range(len(urls)):
+            # e.g. http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
+            self.pool.submit(self._download_file, urls[i], file_names[i])
+        print(f'----{year}年下载完成')
+
     def _download_file(self, url, file_path):
         ''' download file 
         '''
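
The hunk above ends at _download_file's docstring, so its body is outside this diff. For orientation, a minimal sketch of what such a method could look like, assuming the class uses requests together with its own headers attribute (both assumptions; the real implementation may differ):

```python
# hypothetical sketch of _download_file -- the actual body is not shown in this commit
def _download_file(self, url, file_path):
    ''' download file '''
    if os.path.exists(file_path):
        return  # skip files already on disk so reruns are cheap
    resp = requests.get(url, headers=self.headers, timeout=60)
    resp.raise_for_status()  # surface HTTP errors through the thread pool future
    with open(file_path, 'wb') as f:
        f.write(resp.content)
```

After the refactor, the per-year pipeline is also callable in isolation, e.g. (hypothetical driver, mirroring what main.py presumably wires up):

```python
c = Cninfo()
c.crawl()     # writes 股东大会公告链接_<year>.xlsx plus a deduplicated _rep copy per year
c.download()  # reads those files and fetches the PDFs into data/<company>/
```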

+ 8 - 4
docker-compose.yml

@@ -2,7 +2,11 @@ version: '3.4'
 
 services:
   crawlsse:
-    image: crawlsse
-    build:
-      context: .
-      dockerfile: ./Dockerfile
+    image: sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1
+    container_name: crawlsse
+    volumes:
+      - ./data:/app/data
+      - ./logs:/app/logs
+    environment:
+      - TZ=Asia/Shanghai
+    command: ["python", "main.py", "crawl", "--extractor", "cninfo"]
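
With this compose file the crawler runs from the pre-built registry image instead of a local build. Assuming the image is pullable, it can be started and observed with the standard commands:

```
docker compose up -d
docker logs -f crawlsse
```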