liuyuqi-dellpc committed 570ef9c436 · 9 months ago
6 changed files with 27 additions and 44 deletions
  1. README.md (+6 −15)
  2. crawl_sse/__init__.py (+1 −1)
  3. crawl_sse/cninfo.py (+7 −9)
  4. crawl_sse/options.py (+1 −10)
  5. crawl_sse/sse.py (+0 −3)
  6. main.py (+12 −6)

+ 6 - 15
README.md

@@ -10,30 +10,21 @@
 
 ## Develop
 
-Download listed company data:
+Extract listed company data:
 ```
 poetry shell
-python main.py company
+python main.py crawl --extractor sse
 
-```
-
-Download listed company annual reports:
-```
-# fetch the list and save it to csv
-python main.py nianbao
-
-# deduplicate with the ipynb notebook
-
-
-# download the annual reports
-python main.py nianbao --download
+python main.py crawl --extractor cninfo
+# download listed company annual reports:
+python main.py download --extractor cninfo
 
 ```
 
Package, deliver, and run with docker:
 
 ```
-docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download
+docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download --extractor cninfo
 
 ```
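For completeness: the image referenced above has to exist locally or in a registry before that `docker run` works. A minimal sketch, assuming a Dockerfile at the repository root (the tag mirrors the run command):

```
docker build -t jianboy/crawl_sse:1.0.1 .
```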
 

+ 1 - 1
crawl_sse/__init__.py

@@ -1,2 +1,2 @@
-# from .sse import Sse
+from .sse import Sse
 from .cninfo import Cninfo
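Re-exporting `Sse` here is what lets main.py's `from crawl_sse import Sse` resolve now that the sse extractor is wired into the CLI (see the main.py hunk below).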

+ 7 - 9
crawl_sse/cninfo.py

@@ -18,7 +18,7 @@ class Cninfo(object):
     ''' 
     Juchao Information Network (cninfo.com.cn)
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023 ]
+    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,7 +162,10 @@ class Cninfo(object):
                         continue
         return all_results
 
-    def run(self):
+    def crawl(self):
+        ''' Main entry point:
+        download shareholder meeting announcement links and save them as xlsx.
+        '''
         for year in self.years:
             if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
                 continue
@@ -212,9 +215,9 @@ class Cninfo(object):
             workbook.save(f"股东大会公告链接_{year}.xlsx")
 
             print(f"----{year}年获取完成")
+        self._remove_dump()
     
-    
-    def remove_dump(self):
+    def _remove_dump(self):
         ''' Deduplicate '''
         for year in self.years:
             file_path = f'股东大会公告链接_{year}.xlsx'
@@ -267,8 +270,3 @@ class Cninfo(object):
                     time.sleep(1)
         else:
             print(f'file:{file_path} is exist')
-
-if __name__ == "__main__":
-    cninfo = Cninfo()
-    cninfo.run()
-    # cninfo.download()
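With the `__main__` block removed, `Cninfo` is now driven entirely from main.py. A minimal usage sketch of the resulting flow, assuming only the methods visible in this diff (note that deduplication now runs automatically at the end of `crawl()`):

```python
from crawl_sse import Cninfo

cninfo = Cninfo()
cninfo.crawl()     # per year: fetch announcement links into an xlsx workbook,
                   # then _remove_dump() deduplicates the saved workbooks
cninfo.download()  # fetch the documents listed in the saved workbooks
```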

+ 1 - 10
crawl_sse/options.py

@@ -18,16 +18,7 @@ def parse_args():
     """
     parser = argparse.ArgumentParser(description='search domain')
     parser.add_argument('command',  help='command: generate, search', choices=['generate','search', 'help','version'] , default='help')
-    parser.add_argument('--export_all', action='store_true', help='export all domain')
-    parser.add_argument(
-        "--input", help="set input domain list file,eg: domain.txt", type=str, default="domain.txt")
-    parser.add_argument(
-        "--output", help="set output domain result list file,eg: result.txt", type=str, default="result.txt")
-        
-    parser.add_argument('--lang', choices=['zh', 'en'], default='en',help='language')
-    parser.add_argument('--domain', default='com',help='input some domain, plilt with ","')
-    parser.add_argument('--keyword', default='', help='input some keyword, spilt with ","')
-    parser.add_argument('--position', default='prefix',choices=['prefix', 'suffix'], help='choose generate str positon')
+    parser.add_argument('--extractor', help='extractor: cninfo, sse', choices=['cninfo','sse'], default='cninfo')
     args = parser.parse_args()
 
     # remove None
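One apparent leftover this hunk does not touch: the positional `command` argument still has `choices=['generate','search','help','version']` (and the parser description still says 'search domain'), while main.py below now dispatches on `crawl` and `download`, so argparse would reject `python main.py crawl`. A hedged sketch of the follow-up alignment; the `nargs='?'` is my addition so that `default='help'` can actually apply (argparse ignores `default` on a required positional):

```python
parser = argparse.ArgumentParser(description='crawl SSE and cninfo listed-company data')
# nargs='?' makes the positional optional, so default='help' takes effect
parser.add_argument('command', nargs='?', default='help',
                    help='command: crawl, download',
                    choices=['crawl', 'download', 'help', 'version'])
parser.add_argument('--extractor', default='cninfo',
                    help='extractor: cninfo, sse',
                    choices=['cninfo', 'sse'])
```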

+ 0 - 3
crawl_sse/sse.py

@@ -138,6 +138,3 @@ class Sse(object):
             writer = csv.writer(f)
             writer.writerow(['地区名称','股票代码','名称'])
             writer.writerows(self.diqu_date)
-
-if __name__=='__main__':
-    pass

+ 12 - 6
main.py

@@ -5,6 +5,7 @@
 @Time    :   2023/12/03 03:09:35
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   enter point
+Multiple functions, organized to run via command-line arguments.
 '''
 
 from crawl_sse import Sse
@@ -14,12 +15,17 @@ import sys
 
 if __name__=='__main__':
     args = parse_args()
-    if args['command'] == 'generate':
-        sse = Sse()
-        sse.crawl()
-    elif args['command'] == 'search':
-        cninfo =Cninfo()
-        cninfo.download()
+    if args['command'] == 'crawl':
+        if args['extractor'] == 'sse':
+            sse = Sse()
+            sse.crawl()
+        elif args['extractor'] == 'cninfo':
+            cninfo = Cninfo()
+            cninfo.crawl()
+    elif args['command'] == 'download':
+        if args['extractor'] == 'cninfo':
+            cninfo = Cninfo()
+            cninfo.download()
     elif args['command'] == 'help':
         pass
     elif args['command'] == 'version':
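Since both extractors now expose `crawl()` and only `Cninfo` implements `download()`, the if/elif ladder could be collapsed into table-driven dispatch. A sketch, not part of this commit; the `parse_args` import path is an assumption from the repository layout:

```python
from crawl_sse import Sse, Cninfo
from crawl_sse.options import parse_args  # assumed import path

EXTRACTORS = {'sse': Sse, 'cninfo': Cninfo}

if __name__ == '__main__':
    args = parse_args()
    extractor = EXTRACTORS[args['extractor']]()
    command = args['command']
    if command in ('crawl', 'download') and hasattr(extractor, command):
        getattr(extractor, command)()  # e.g. Cninfo().crawl()
    else:
        print(f"unsupported: {command} --extractor {args['extractor']}")
```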