{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 上市公司分析报告\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据合并整理\n",
    "\n",
    "将地区分类数据和行业分类数据按股票代码合并为一个表,按地区名称、行业名称排序后保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Region listing; columns per the original note: 地区名称, 股票代码, 名称\n",
    "df_sse_diqu = pd.read_csv('sse_diqu.csv')\n",
    "# Industry listing; columns per the original note: 行业名称, 行业代码, 股票代码, 名称\n",
    "df_sse_hangye = pd.read_csv('sse_hangye.csv')\n",
    "\n",
    "# Left-join the industry rows onto the region rows by stock code, order the\n",
    "# result by region then industry, and keep the region table's 名称 column\n",
    "# (pandas suffixes the clashing 名称 columns as 名称_x / 名称_y).\n",
    "# Only the four columns selected below are written to the output file.\n",
    "df_sse_diqu_hangye = (\n",
    "    df_sse_diqu\n",
    "    .merge(df_sse_hangye, on='股票代码', how='left')\n",
    "    .sort_values(by=['地区名称', '行业名称'])\n",
    "    .rename(columns={'名称_x': '名称'})\n",
    "    .loc[:, ['地区名称', '行业名称', '股票代码', '名称']]\n",
    ")\n",
    "df_sse_diqu_hangye.to_csv('sse_diqu_hangye.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对 2018–2023 年各年度的股东大会公告链接数据,按 `年报链接` 列进行去重"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import sys\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# For every year whose workbook exists, keep only the first row for each\n",
    "# distinct 年报链接 value and save the result to a sibling *_rep.xlsx file.\n",
    "years = list(range(2018, 2024))\n",
    "for year in years:\n",
    "    file_path = f'股东大会公告链接_{year}.xlsx'\n",
    "    if not os.path.exists(file_path):\n",
    "        # No workbook for this year: nothing is read or written.\n",
    "        continue\n",
    "    df_links = pd.read_excel(file_path)\n",
    "    df_links_unique = df_links.drop_duplicates(subset=['年报链接'], keep='first')\n",
    "    df_links_unique.to_excel(f'股东大会公告链接_{year}_rep.xlsx', index=False)\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}