@@ -18,7 +18,7 @@ class Cninfo(object):
     '''
     巨潮资讯
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
+    years =[ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,69 +162,74 @@ class Cninfo(object):
                 continue
         return all_results

-    def crawl(self):
-        ''' 主函数
-        下载股东大会公告链接,保存为xlsx
-        '''
-        for year in self.years:
-            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
-                continue
-            all_results = []
-            time_segments = [
-                f"{year}-01-01~{year}-01-31",
-                f"{year}-02-01~{year}-02-28",
-                f"{year}-03-01~{year}-03-31",
-                f"{year}-04-01~{year}-04-30",
-                f"{year}-05-01~{year}-05-30",
-                f"{year}-06-01~{year}-06-30",
-                f"{year}-07-01~{year}-07-31",
-                f"{year}-08-01~{year}-08-31",
-                f"{year}-09-01~{year}-09-30",
-                f"{year}-10-01~{year}-10-31",
-                f"{year}-11-01~{year}-11-30",
-                f"{year}-12-01~{year}-12-31",
-            ]
-            for i in time_segments:
-                results = self.downlaod_report(i)
-                all_results.extend(results)
+    def _crawl_report(self, year:int):
+        ''' 下载年报 '''
+        if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+            return
+        all_results = []
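+        # query month by month: each segment below is fed to downlaod_report in turn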
+        time_segments = [
+            f"{year}-01-01~{year}-01-31",
+            f"{year}-02-01~{year}-02-28",
+            f"{year}-03-01~{year}-03-31",
+            f"{year}-04-01~{year}-04-30",
+            f"{year}-05-01~{year}-05-31",
+            f"{year}-06-01~{year}-06-30",
+            f"{year}-07-01~{year}-07-31",
+            f"{year}-08-01~{year}-08-31",
+            f"{year}-09-01~{year}-09-30",
+            f"{year}-10-01~{year}-10-31",
+            f"{year}-11-01~{year}-11-30",
+            f"{year}-12-01~{year}-12-31",
+        ]
+        for i in time_segments:
+            results = self.downlaod_report(i)
+            all_results.extend(results)

-            workbook = openpyxl.Workbook()
-            worksheet = workbook.active
-            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.active
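+        # header row: company code, short name, announcement title, year, link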
+        worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

-            # 解析搜索结果并添加到Excel表格中
-            for item in all_results:
-                company_code = item["secCode"]
-                company_name = item["secName"]
-                title = item["announcementTitle"].strip()
-                # 剔除不需要的样式和特殊符号,并重新组合标题
-                title = re.sub(r"<.*?>", "", title)
-                title = title.replace(":", "")
-                title = f"《{title}》"
+        # 解析搜索结果并添加到Excel表格中
+        for item in all_results:
+            company_code = item["secCode"]
+            company_name = item["secName"]
+            title = item["announcementTitle"].strip()
+            # 剔除不需要的样式和特殊符号,并重新组合标题
+            title = re.sub(r"<.*?>", "", title)
+            title = title.replace(":", "")
+            title = f"《{title}》"

-                adjunct_url = item["adjunctUrl"]
-                year = re.search(r"\d{4}", title)
-                if year:
-                    tmp_year = year.group()
-                else:
-                    tmp_year = year
-                announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+            adjunct_url = item["adjunctUrl"]
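+            # try to pull a four-digit year from the title for the 年份 column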
+            year_match = re.search(r"\d{4}", title)
+            if year_match:
+                tmp_year = year_match.group()
+            else:
+                tmp_year = None
+            announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+
+            worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+        #注意:年报默认保存在代码同级目录下,如需调整请修改此处的路径,请自行创建文件夹并填入路径
+        workbook.save(f"股东大会公告链接_{year}.xlsx")

-                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
-            #注意:年报默认保存在代码同级目录下,如需调整请修改此处的路径,请自行创建文件夹并填入路径
-            workbook.save(f"股东大会公告链接_{year}.xlsx")
+        print(f"----{year}年获取完成")
+        self._remove_dump(year)
+        print(f"--------去重-----")

-            print(f"----{year}年获取完成")
+    def crawl(self):
+        ''' 主函数
+        下载股东大会公告链接,保存为xlsx
+        '''
+        for year in self.years:
+            self._crawl_report(year)
-        self._remove_dump()

-    def _remove_dump(self):
+    def _remove_dump(self, year:int):
         ''' 去重 '''
-        for year in self.years:
-            file_path = f'股东大会公告链接_{year}.xlsx'
-            if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
-                df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        if os.path.exists(file_path):
+            df_2018 = pd.read_excel(file_path)
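+            # drop rows that repeat the same 年报链接, keeping the first occurrence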
+            df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
+            df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)

     def download(self):
         ''' read all link and download it '''
@@ -234,30 +239,40 @@ class Cninfo(object):
             print(f'process file:{year}')
             # if the file is exist 公司代码 公司简称 标题 年份 年报链接
             if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df = pd.read_excel(file_path)
-                urls = df['年报链接'].tolist()
-                # get all title
-                titles = df['标题'].tolist()
-                # get all company name
-                company_names = df['公司简称'].tolist()
-                # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n ans space
-                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
-                # get all year
-                years = df['年份'].tolist()
-                # get all company code
-                company_codes = df['公司代码'].tolist()
-                print(len(company_codes),f'size: {len(company_codes)}')
-                for i in range(len(company_codes)):
-                    if not os.path.exists(f'data/{company_names[i]}'):
-                        os.makedirs(f'data/{company_names[i]}')
-                # data/公司名称/{年份}-标题
-                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
-                for i in range(len(urls)):
-                    # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
-                    self.pool.submit(self._download_file, urls[i],file_names[i] )
-                print(f'----{year}年下载完成')
+                self._download(year)
+            else:
+                print(f'file:{file_path} does not exist')
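+                # the link sheet for this year is missing, so crawl it first and then download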
+                self._crawl_report(year)
+                self._download(year)
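+        # wait for all queued download tasks to finish before returning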
+        self.pool.shutdown(wait=True)

+    def _download(self, year:int):
+        ''' 下载年报 '''
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        df = pd.read_excel(file_path)
+        urls = df['年报链接'].tolist()
+        # get all title
+        titles = df['标题'].tolist()
+        # get all company name
+        company_names = df['公司简称'].tolist()
+        # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n and space
+        company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+        # get all year
+        years = df['年份'].tolist()
+        # get all company code
+        company_codes = df['公司代码'].tolist()
+        print(len(company_codes),f'size: {len(company_codes)}')
+        for i in range(len(company_codes)):
+            if not os.path.exists(f'data/{company_names[i]}'):
+                os.makedirs(f'data/{company_names[i]}')
+        # data/公司名称/{年份}-标题
+        file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
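+        # submit one download task per link; each PDF is saved as data/<公司简称>/<年份>-<标题>.pdf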
+        for i in range(len(urls)):
+            # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
+            self.pool.submit(self._download_file, urls[i],file_names[i] )
+        print(f'----{year}年下载完成')
+
     def _download_file(self, url, file_path):
         ''' download file
         '''