fish committed 3 months ago
commit 0837f58893
5 changed files with 122 additions and 87 deletions
  1. .env (+1, -1)
  2. Dockerfile (+7, -3)
  3. README.md (+12, -0)
  4. crawl_sse/cninfo.py (+94, -79)
  5. docker-compose.yml (+8, -4)

+ 1 - 1
.env

@@ -1,2 +1,2 @@
-year = []
+year = [ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
 cookie = 
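
The commit fills in the previously empty `year` list in .env; how the application consumes it is not part of this diff. A minimal sketch of one way to read it, assuming python-dotenv and an `ast.literal_eval` of the raw string (both assumptions):

```python
# hypothetical sketch -- the repo's real .env parsing is not shown in this commit
import ast
import os

from dotenv import load_dotenv  # assumes python-dotenv is a dependency

load_dotenv()  # load .env into the process environment
# the raw value is the literal string "[ 2025, 2024, ... ]"
years = ast.literal_eval(os.getenv("year", "[]"))  # -> list[int], [] if unset
```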

+ 7 - 3
Dockerfile

@@ -1,5 +1,5 @@
 # For more information, please refer to https://aka.ms/vscode-docker-python
-FROM python:3-slim
+FROM python:3.12-slim
 
 # Keeps Python from generating .pyc files in the container
 ENV PYTHONDONTWRITEBYTECODE=1
@@ -8,8 +8,11 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 
 # Install pip requirements
-COPY requirements.txt .
-RUN python -m pip install -r requirements.txt
+COPY pyproject.toml .
+RUN python -m pip install "poetry==1.8.2"
+RUN poetry config virtualenvs.create false
+# --no-root: install only the dependencies; the project source is copied in below
+RUN poetry install --no-root
 
 WORKDIR /app
 COPY . /app
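
The image now resolves dependencies with Poetry from pyproject.toml, which is not included in this commit. A hypothetical minimal file that would satisfy the Dockerfile above; the dependency list is inferred from the imports visible in cninfo.py and is an assumption:

```toml
# hypothetical pyproject.toml -- not part of this commit
[tool.poetry]
name = "crawl_sse"
version = "1.0.1"
description = "cninfo/SSE announcement crawler"
authors = ["fish"]

[tool.poetry.dependencies]
python = "^3.12"
openpyxl = "*"   # assumed from cninfo.py
pandas = "*"     # assumed from cninfo.py
requests = "*"   # assumed HTTP client

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```

Without a committed poetry.lock, builds are not reproducible; pinning one alongside pyproject.toml would make the `poetry install` layer deterministic.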

+ 12 - 0
README.md

@@ -26,8 +26,20 @@ Package, ship and run with docker:
 ```
 # docker run -d -p 9515:9515 -v $(pwd):/app mcr.microsoft.com/msedge/msedgedriver
 
+# images
+crawl_sse
+jianboy/crawl_sse:1.0.1
+sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1
+
+# download
 docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download --extractor cninfo
 
+# crawl
+docker run -it --rm -v $(pwd):/app/data crawl_sse python main.py crawl --extractor sse
+
+# download
+docker run -it --rm -v $(pwd)/data:/app/data sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1 python main.py download --extractor cninfo
+
 ```
 
 

+ 94 - 79
crawl_sse/cninfo.py

@@ -18,7 +18,7 @@ class Cninfo(object):
     ''' 
     巨潮资讯
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
+    years =[ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,69 +162,64 @@ class Cninfo(object):
                         continue
         return all_results
 
-    def crawl(self):
-        ''' Main function:
-         download the shareholder meeting announcement links and save them as xlsx
-           '''
-        for year in self.years:
-            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
-                continue
-            all_results = []
-            time_segments = [
-                f"{year}-01-01~{year}-01-31",
-                f"{year}-02-01~{year}-02-28",
-                f"{year}-03-01~{year}-03-31",
-                f"{year}-04-01~{year}-04-30",
-                f"{year}-05-01~{year}-05-30",
-                f"{year}-06-01~{year}-06-30",
-                f"{year}-07-01~{year}-07-31",
-                f"{year}-08-01~{year}-08-31",
-                f"{year}-09-01~{year}-09-30",
-                f"{year}-10-01~{year}-10-31",
-                f"{year}-11-01~{year}-11-30",
-                f"{year}-12-01~{year}-12-31",
-            ]
-            for i in time_segments:
-                results = self.downlaod_report(i)
-                all_results.extend(results)
+    def _crawl_report(self, year: int):
+        ''' Crawl one year's shareholder meeting announcement links and save them to xlsx. '''
+        if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+            return
+        all_results = []
+        # one search segment per month; calendar.monthrange gives the correct last
+        # day of each month (31-day months, leap years). Requires `import calendar`
+        # at the top of the module.
+        time_segments = [
+            f"{year}-{month:02d}-01~{year}-{month:02d}-{calendar.monthrange(year, month)[1]}"
+            for month in range(1, 13)
+        ]
+        for segment in time_segments:
+            results = self.downlaod_report(segment)
+            all_results.extend(results)
 
-            workbook = openpyxl.Workbook()
-            worksheet = workbook.active
-            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.active
+        worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
 
-            # Parse the search results and append them to the Excel sheet
-            for item in all_results:
-                company_code = item["secCode"]
-                company_name = item["secName"]
-                title = item["announcementTitle"].strip()
-                # Strip unwanted markup and special characters, then rebuild the title
-                title = re.sub(r"<.*?>", "", title)
-                title = title.replace(":", "")
-                title = f"《{title}》"
+        # Parse the search results and append them to the Excel sheet
+        for item in all_results:
+            company_code = item["secCode"]
+            company_name = item["secName"]
+            title = item["announcementTitle"].strip()
+            # Strip unwanted markup and special characters, then rebuild the title
+            title = re.sub(r"<.*?>", "", title)
+            title = title.replace(":", "")
+            title = f"《{title}》"
 
-                adjunct_url = item["adjunctUrl"]
-                year = re.search(r"\d{4}", title)
-                if year:
-                    tmp_year = year.group()
-                else:
-                    tmp_year = year
-                announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+            adjunct_url = item["adjunctUrl"]
+            # Pull the four-digit year out of the title. Use a separate name so the
+            # `year` parameter is not shadowed; shadowing it broke the save filename.
+            match = re.search(r"\d{4}", title)
+            tmp_year = match.group() if match else None
+            announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
+
+            worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+        # Note: the xlsx is saved next to the code by default; create a folder and
+        # adjust the path here if it should go elsewhere.
+        workbook.save(f"股东大会公告链接_{year}.xlsx")
 
-                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
-            # Note: saved next to the code by default; create a folder and adjust the path here to change it
-            workbook.save(f"股东大会公告链接_{year}.xlsx")
+        print(f"----{year}年获取完成")
+        self._remove_dump(year)
+        print(f"--------去重-----")
 
-            print(f"----{year}年获取完成")
+    def crawl(self):
+        ''' Main function: crawl the shareholder meeting announcement links for
+        every configured year and save them as xlsx. '''
+        for year in self.years:
+            self._crawl_report(year)
-        self._remove_dump()
     
-    def _remove_dump(self):
+    def _remove_dump(self, year: int):
         ''' Deduplicate '''
-        for year in self.years:
-            file_path = f'股东大会公告链接_{year}.xlsx'
-            if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
-                df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        if os.path.exists(file_path):
+            df = pd.read_excel(file_path)
+            df.drop_duplicates(subset=['年报链接'], keep='first', inplace=True)
+            df.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)
 
     def download(self):
         ''' read all links and download them '''
@@ -234,30 +229,34 @@
             print(f'process file:{year}')
             # if the file exists; expected columns: 公司代码 公司简称 标题 年份 年报链接
             if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df = pd.read_excel(file_path)
-                urls = df['年报链接'].tolist()
-                # get all title
-                titles = df['标题'].tolist()
-                # get all company name
-                company_names = df['公司简称'].tolist()
-                # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n ans space
-                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
-                # get all year
-                years = df['年份'].tolist()
-                # get all company code
-                company_codes = df['公司代码'].tolist()
-                print(len(company_codes),f'size: {len(company_codes)}')
-                for i in range(len(company_codes)):
-                    if not os.path.exists(f'data/{company_names[i]}'):
-                        os.makedirs(f'data/{company_names[i]}')
-                # data/公司名称/{年份}-标题
-                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
-                for i in range(len(urls)):
-                    # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
-                    self.pool.submit(self._download_file, urls[i],file_names[i] )
-                print(f'----{year}年下载完成')
+                self._download(year)
+            else:
+                print(f'file: {file_path} does not exist; crawling it first')
+                self._crawl_report(year)
+                self._download(year)
+        self.pool.shutdown(wait=True)
     
+    def _download(self, year: int):
+        ''' Download every announcement PDF listed in one year's xlsx. '''
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        df = pd.read_excel(file_path)
+        urls = df['年报链接'].tolist()
+        titles = df['标题'].tolist()
+        company_names = df['公司简称'].tolist()
+        # strip characters that are illegal in file names: / \ : * ? " < > | plus whitespace
+        company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+        years = df['年份'].tolist()
+        company_codes = df['公司代码'].tolist()
+        print(f'size: {len(company_codes)}')
+        # one folder per company: data/<company>/<year>-<title>.pdf
+        for i in range(len(company_codes)):
+            os.makedirs(f'data/{company_names[i]}', exist_ok=True)
+        file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
+        for i in range(len(urls)):
+            # e.g. http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
+            self.pool.submit(self._download_file, urls[i], file_names[i])
+        print(f'----{year}年下载完成')
+
     def _download_file(self, url, file_path):
         ''' download file 
         '''
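
The hunk above ends at _download_file's docstring, so its body is outside this diff. For orientation, a minimal sketch of what such a method could look like, assuming the class uses requests together with its own headers attribute (both assumptions; the real implementation may differ):

```python
# hypothetical sketch of _download_file -- the actual body is not shown in this commit
def _download_file(self, url, file_path):
    ''' download file '''
    if os.path.exists(file_path):
        return  # skip files already on disk so reruns are cheap
    resp = requests.get(url, headers=self.headers, timeout=60)
    resp.raise_for_status()  # surface HTTP errors through the thread pool future
    with open(file_path, 'wb') as f:
        f.write(resp.content)
```

After the refactor, the per-year pipeline is also callable in isolation, e.g. (hypothetical driver, mirroring what main.py presumably wires up):

```python
c = Cninfo()
c.crawl()     # writes 股东大会公告链接_<year>.xlsx plus a deduplicated _rep copy per year
c.download()  # reads those files and fetches the PDFs into data/<company>/
```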

+ 8 - 4
docker-compose.yml

@@ -2,7 +2,11 @@ version: '3.4'
 
 services:
   crawlsse:
-    image: crawlsse
-    build:
-      context: .
-      dockerfile: ./Dockerfile
+    image: sift-docker.pkg.coding.net/flutter-team/dev-container/crawl_sse:1.0.1
+    container_name: crawlsse
+    volumes:
+      - ./data:/app/data
+      - ./logs:/app/logs
+    environment:
+      - TZ=Asia/Shanghai
+    command: ["python", "main.py", "crawl", "--extractor", "cninfo"]
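
With this compose file the crawler runs from the pre-built registry image instead of a local build. Assuming the image is pullable, it can be started and observed with the standard commands:

```
docker compose up -d
docker logs -f crawlsse
```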