liuyuqi-dellpc committed 570ef9c436 · 9 months ago
6 changed files with 27 additions and 44 deletions
  1. README.md (+6 −15)
  2. crawl_sse/__init__.py (+1 −1)
  3. crawl_sse/cninfo.py (+7 −9)
  4. crawl_sse/options.py (+1 −10)
  5. crawl_sse/sse.py (+0 −3)
  6. main.py (+12 −6)

+ 6 - 15
README.md

@@ -10,30 +10,21 @@
 
 ## Develop
 
-Download listed company data:
+Extract listed company data:
 ```
 poetry shell
-python main.py company
+python main.py crawl --extractor sse
 
-```
-
-Download listed company annual reports:
-```
-# fetch the list and save it to csv
-python main.py nianbao
-
-# deduplicate with the ipynb notebook
-
-
-# download the annual reports
-python main.py nianbao --download
+python main.py crawl --extractor cninfo
+# download listed company annual reports:
+python main.py download --extractor cninfo
 
 ```
 
Package, deliver, and run with docker:
 
 ```
-docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download
+docker run -it --rm -v /data/crawl_sse:/app jianboy/crawl_sse:1.0.1 download --extractor cninfo
 
 ```
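For completeness: the image referenced above has to exist locally or in a registry before that `docker run` works. A minimal sketch, assuming a Dockerfile at the repository root (the tag mirrors the run command):

```
docker build -t jianboy/crawl_sse:1.0.1 .
```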
 

+ 1 - 1
crawl_sse/__init__.py

@@ -1,2 +1,2 @@
-# from .sse import Sse
+from .sse import Sse
 from .cninfo import Cninfo
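Re-exporting `Sse` here is what lets main.py's `from crawl_sse import Sse` resolve now that the sse extractor is wired into the CLI (see the main.py hunk below).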

+ 7 - 9
crawl_sse/cninfo.py

@@ -18,7 +18,7 @@ class Cninfo(object):
     ''' 
     Juchao Information Network (cninfo.com.cn)
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023 ]
+    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,7 +162,10 @@ class Cninfo(object):
                         continue
         return all_results
 
-    def run(self):
+    def crawl(self):
+        ''' Main entry point:
+        download shareholder meeting announcement links and save them as xlsx.
+        '''
         for year in self.years:
             if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
                 continue
@@ -212,9 +215,9 @@ class Cninfo(object):
             workbook.save(f"股东大会公告链接_{year}.xlsx")
 
             print(f"----{year}年获取完成")
+        self._remove_dump()
     
-    
-    def remove_dump(self):
+    def _remove_dump(self):
         ''' Deduplicate '''
         for year in self.years:
             file_path = f'股东大会公告链接_{year}.xlsx'
@@ -267,8 +270,3 @@ class Cninfo(object):
                     time.sleep(1)
         else:
             print(f'file:{file_path} is exist')
-
-if __name__ == "__main__":
-    cninfo = Cninfo()
-    cninfo.run()
-    # cninfo.download()
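With the `__main__` block removed, `Cninfo` is now driven entirely from main.py. A minimal usage sketch of the resulting flow, assuming only the methods visible in this diff (note that deduplication now runs automatically at the end of `crawl()`):

```python
from crawl_sse import Cninfo

cninfo = Cninfo()
cninfo.crawl()     # per year: fetch announcement links into an xlsx workbook,
                   # then _remove_dump() deduplicates the saved workbooks
cninfo.download()  # fetch the documents listed in the saved workbooks
```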

+ 1 - 10
crawl_sse/options.py

@@ -18,16 +18,7 @@ def parse_args():
     """
     parser = argparse.ArgumentParser(description='search domain')
     parser.add_argument('command',  help='command: generate, search', choices=['generate','search', 'help','version'] , default='help')
-    parser.add_argument('--export_all', action='store_true', help='export all domain')
-    parser.add_argument(
-        "--input", help="set input domain list file,eg: domain.txt", type=str, default="domain.txt")
-    parser.add_argument(
-        "--output", help="set output domain result list file,eg: result.txt", type=str, default="result.txt")
-        
-    parser.add_argument('--lang', choices=['zh', 'en'], default='en',help='language')
-    parser.add_argument('--domain', default='com',help='input some domain, plilt with ","')
-    parser.add_argument('--keyword', default='', help='input some keyword, spilt with ","')
-    parser.add_argument('--position', default='prefix',choices=['prefix', 'suffix'], help='choose generate str positon')
+    parser.add_argument('--extractor', help='extractor: cninfo, sse', choices=['cninfo','sse'], default='cninfo')
     args = parser.parse_args()
 
     # remove None
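One apparent leftover this hunk does not touch: the positional `command` argument still has `choices=['generate','search','help','version']` (and the parser description still says 'search domain'), while main.py below now dispatches on `crawl` and `download`, so argparse would reject `python main.py crawl`. A hedged sketch of the follow-up alignment; the `nargs='?'` is my addition so that `default='help'` can actually apply (argparse ignores `default` on a required positional):

```python
parser = argparse.ArgumentParser(description='crawl SSE and cninfo listed-company data')
# nargs='?' makes the positional optional, so default='help' takes effect
parser.add_argument('command', nargs='?', default='help',
                    help='command: crawl, download',
                    choices=['crawl', 'download', 'help', 'version'])
parser.add_argument('--extractor', default='cninfo',
                    help='extractor: cninfo, sse',
                    choices=['cninfo', 'sse'])
```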

+ 0 - 3
crawl_sse/sse.py

@@ -138,6 +138,3 @@ class Sse(object):
             writer = csv.writer(f)
             writer.writerow(['地区名称','股票代码','名称'])
             writer.writerows(self.diqu_date)
-
-if __name__=='__main__':
-    pass

+ 12 - 6
main.py

@@ -5,6 +5,7 @@
 @Time    :   2023/12/03 03:09:35
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   enter point
+Multiple functions, organized to run via command-line arguments.
 '''
 
 from crawl_sse import Sse
@@ -14,12 +15,17 @@ import sys
 
 if __name__=='__main__':
     args = parse_args()
-    if args['command'] == 'generate':
-        sse = Sse()
-        sse.crawl()
-    elif args['command'] == 'search':
-        cninfo =Cninfo()
-        cninfo.download()
+    if args['command'] == 'crawl':
+        if args['extractor'] == 'sse':
+            sse = Sse()
+            sse.crawl()
+        elif args['extractor'] == 'cninfo':
+            cninfo = Cninfo()
+            cninfo.crawl()
+    elif args['command'] == 'download':
+        if args['extractor'] == 'cninfo':
+            cninfo = Cninfo()
+            cninfo.download()
     elif args['command'] == 'help':
         pass
     elif args['command'] == 'version':
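Since both extractors now expose `crawl()` and only `Cninfo` implements `download()`, the if/elif ladder could be collapsed into table-driven dispatch. A sketch, not part of this commit; the `parse_args` import path is an assumption from the repository layout:

```python
from crawl_sse import Sse, Cninfo
from crawl_sse.options import parse_args  # assumed import path

EXTRACTORS = {'sse': Sse, 'cninfo': Cninfo}

if __name__ == '__main__':
    args = parse_args()
    extractor = EXTRACTORS[args['extractor']]()
    command = args['command']
    if command in ('crawl', 'download') and hasattr(extractor, command):
        getattr(extractor, command)()  # e.g. Cninfo().crawl()
    else:
        print(f"unsupported: {command} --extractor {args['extractor']}")
```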