@@ -18,7 +18,7 @@ class Cninfo(object):
     '''
     巨潮资讯
     '''
-    years =[ 2010,2011,2012,2013,2014,2015,2016,2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ]
+    years =[ 2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010 ]
     host = "http://www.cninfo.com.cn"
     headers = {
         "Accept": "*/*",
@@ -162,69 +162,74 @@ class Cninfo(object):
                 continue
         return all_results

-    def crawl(self):
-        ''' 主函数
-        下载股东大会公告链接,保存为xlsx
-        '''
-        for year in self.years:
-            if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
-                continue
-            all_results = []
-            time_segments = [
-                f"{year}-01-01~{year}-01-31",
-                f"{year}-02-01~{year}-02-28",
-                f"{year}-03-01~{year}-03-31",
-                f"{year}-04-01~{year}-04-30",
-                f"{year}-05-01~{year}-05-30",
-                f"{year}-06-01~{year}-06-30",
-                f"{year}-07-01~{year}-07-31",
-                f"{year}-08-01~{year}-08-31",
-                f"{year}-09-01~{year}-09-30",
-                f"{year}-10-01~{year}-10-31",
-                f"{year}-11-01~{year}-11-30",
-                f"{year}-12-01~{year}-12-31",
-            ]
-            for i in time_segments:
-                results = self.downlaod_report(i)
-                all_results.extend(results)
+    def _crawl_report(self, year:int):
+        ''' 下载年报 '''
+        if os.path.exists(f'股东大会公告链接_{year}.xlsx'):
+            return
+        all_results = []
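+        # query month by month: each segment below is fed to downlaod_report in turn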
+        time_segments = [
+            f"{year}-01-01~{year}-01-31",
+            f"{year}-02-01~{year}-02-28",
+            f"{year}-03-01~{year}-03-31",
+            f"{year}-04-01~{year}-04-30",
+            f"{year}-05-01~{year}-05-31",
+            f"{year}-06-01~{year}-06-30",
+            f"{year}-07-01~{year}-07-31",
+            f"{year}-08-01~{year}-08-31",
+            f"{year}-09-01~{year}-09-30",
+            f"{year}-10-01~{year}-10-31",
+            f"{year}-11-01~{year}-11-30",
+            f"{year}-12-01~{year}-12-31",
+        ]
+        for i in time_segments:
+            results = self.downlaod_report(i)
+            all_results.extend(results)

-            workbook = openpyxl.Workbook()
-            worksheet = workbook.active
-            worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
+        workbook = openpyxl.Workbook()
+        worksheet = workbook.active
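+        # header row: company code, short name, announcement title, year, link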
+        worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

-            # 解析搜索结果并添加到Excel表格中
-            for item in all_results:
-                company_code = item["secCode"]
-                company_name = item["secName"]
-                title = item["announcementTitle"].strip()
-                # 剔除不需要的样式和特殊符号,并重新组合标题
-                title = re.sub(r"<.*?>", "", title)
-                title = title.replace(":", "")
-                title = f"《{title}》"
+        # 解析搜索结果并添加到Excel表格中
+        for item in all_results:
+            company_code = item["secCode"]
+            company_name = item["secName"]
+            title = item["announcementTitle"].strip()
+            # 剔除不需要的样式和特殊符号,并重新组合标题
+            title = re.sub(r"<.*?>", "", title)
+            title = title.replace(":", "")
+            title = f"《{title}》"

-                adjunct_url = item["adjunctUrl"]
-                year = re.search(r"\d{4}", title)
-                if year:
-                    tmp_year = year.group()
-                else:
-                    tmp_year = year
-                announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+            adjunct_url = item["adjunctUrl"]
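+            # try to pull a four-digit year from the title for the 年份 column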
+            year_match = re.search(r"\d{4}", title)
+            if year_match:
+                tmp_year = year_match.group()
+            else:
+                tmp_year = None
+            announcement_url=f"http://static.cninfo.com.cn/{adjunct_url}"
+
+            worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
+        #注意:年报默认保存在代码同级目录下,如需调整请修改此处的路径,请自行创建文件夹并填入路径
+        workbook.save(f"股东大会公告链接_{year}.xlsx")

-                worksheet.append([company_code, company_name, title, tmp_year, announcement_url])
-            #注意:年报默认保存在代码同级目录下,如需调整请修改此处的路径,请自行创建文件夹并填入路径
-            workbook.save(f"股东大会公告链接_{year}.xlsx")
+        print(f"----{year}年获取完成")
+        self._remove_dump(year)
+        print(f"--------去重-----")

-            print(f"----{year}年获取完成")
+    def crawl(self):
+        ''' 主函数
+        下载股东大会公告链接,保存为xlsx
+        '''
+        for year in self.years:
+            self._crawl_report(year)
-        self._remove_dump()

-    def _remove_dump(self):
+    def _remove_dump(self, year:int):
         ''' 去重 '''
-        for year in self.years:
-            file_path = f'股东大会公告链接_{year}.xlsx'
-            if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
-                df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        if os.path.exists(file_path):
+            df_2018 = pd.read_excel(file_path)
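+            # drop rows that repeat the same 年报链接, keeping the first occurrence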
+            df_2018.drop_duplicates(subset=['年报链接'],keep='first',inplace=True)
+            df_2018.to_excel(f'股东大会公告链接_{year}_rep.xlsx',index=False)

     def download(self):
         ''' read all link and download it '''
@@ -234,30 +239,40 @@ class Cninfo(object):
             print(f'process file:{year}')
             # if the file is exist 公司代码 公司简称 标题 年份 年报链接
             if os.path.exists(file_path):
-                df_2018 = pd.read_excel(file_path)
-                df = pd.read_excel(file_path)
-                urls = df['年报链接'].tolist()
-                # get all title
-                titles = df['标题'].tolist()
-                # get all company name
-                company_names = df['公司简称'].tolist()
-                # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n ans space
-                company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
-                # get all year
-                years = df['年份'].tolist()
-                # get all company code
-                company_codes = df['公司代码'].tolist()
-                print(len(company_codes),f'size: {len(company_codes)}')
-                for i in range(len(company_codes)):
-                    if not os.path.exists(f'data/{company_names[i]}'):
-                        os.makedirs(f'data/{company_names[i]}')
-                # data/公司名称/{年份}-标题
-                file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
-                for i in range(len(urls)):
-                    # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
-                    self.pool.submit(self._download_file, urls[i],file_names[i] )
-                print(f'----{year}年下载完成')
+                self._download(year)
+            else:
+                print(f'file:{file_path} does not exist')
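+                # the link sheet for this year is missing, so crawl it first and then download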
+                self._crawl_report(year)
+                self._download(year)
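+        # wait for all queued download tasks to finish before returning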
+        self.pool.shutdown(wait=True)

+    def _download(self, year:int):
+        ''' 下载年报 '''
+        file_path = f'股东大会公告链接_{year}.xlsx'
+        df = pd.read_excel(file_path)
+        urls = df['年报链接'].tolist()
+        # get all title
+        titles = df['标题'].tolist()
+        # get all company name
+        company_names = df['公司简称'].tolist()
+        # company_names remove / and \ and * and ? and : and " and < and > and | and \t and \n and space
+        company_names = [re.sub(r'[\\/:*?"<>|\t\n\s]', '', i) for i in company_names]
+        # get all year
+        years = df['年份'].tolist()
+        # get all company code
+        company_codes = df['公司代码'].tolist()
+        print(len(company_codes),f'size: {len(company_codes)}')
+        for i in range(len(company_codes)):
+            if not os.path.exists(f'data/{company_names[i]}'):
+                os.makedirs(f'data/{company_names[i]}')
+        # data/公司名称/{年份}-标题
+        file_names = [f'data/{company_names[i]}/{years[i]}-{titles[i]}.pdf' for i in range(len(company_codes))]
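+        # submit one download task per link; each PDF is saved as data/<公司简称>/<年份>-<标题>.pdf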
+        for i in range(len(urls)):
+            # http://static.cninfo.com.cn/finalpage/2018-01-30/1204372527.PDF
+            self.pool.submit(self._download_file, urls[i],file_names[i] )
+        print(f'----{year}年下载完成')
+
     def _download_file(self, url, file_path):
         ''' download file
         '''