
Complete data fetching and saving

liuyuqi-dellpc, 1 year ago
commit d88d189807
3 changed files with 143 additions and 53 deletions
  1. .gitignore (+1, -0)
  2. crawl_sse/sse.py (+65, -53)
  3. 上市公司分析.ipynb (+77, -0)

+ 1 - 0
.gitignore

@@ -1,2 +1,3 @@
 *.csv
 *.pyc
+*.pdf

+ 65 - 53
crawl_sse/sse.py

@@ -40,12 +40,12 @@ class Sse(object):
 
     def init_browser(self):
         chrome_option = webdriver.EdgeOptions()
+        chrome_option.add_argument('--headless')
         chrome_option.add_experimental_option("detach", True)
         chrome_option.add_experimental_option('excludeSwitches', ['enable-logging'])
         service = webdriver.EdgeService(EdgeChromiumDriverManager().install())
         self.driver = webdriver.Edge(service = service, options=chrome_option)
-        
-
+    
     def crawl(self):
         self.get_hangye_data()
         self.get_diqu_data()
@@ -55,70 +55,82 @@ class Sse(object):
     def get_hangye_data(self):
        ''' Fetch industry classification data '''
         url_hangyes = f'{self._host}/assortment/stock/areatrade/trade/'
-        # resp = self.sess.get(url_hangyes)
-        self.driver.get(url_hangyes)
         try:
-            pass
-        except Exception as e:
-            pass
-        finally:
-            pass
-
-        # wait = WebDriverWait(self.driver, 5)
-        # selenium.common.exceptions.InvalidSelectorException: Message: invalid selector: The result of the xpath expression "/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()" is: [object Text]. It should be an element.
-        # wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()')))
-        time.sleep(5)
-        
-        html = self.driver.page_source
-        soup = etree.HTML(html)
-        hangye_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()')
-        hangye_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/@href')
-        hangye_codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[2]/text()')
-        
-        for i in range(len(hangye_names_url)):
-            self.driver.get(f'{self._host}{hangye_names_url[i]}')
-            time.sleep(5)
+            print(f'driver url:{url_hangyes}')
+            self.driver.get(url_hangyes)
+            wait = WebDriverWait(self.driver, 10)
+            # Wait for the table element itself: an XPath ending in /text() selects a text node,
+            # not an element, and presence_of_element_located would raise InvalidSelectorException.
+            wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table')))
+            time.sleep(2)
             html = self.driver.page_source
             soup = etree.HTML(html)
-            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
-            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
-            for j in range(len(codes)):
-                self.hangye_date.append([hangye_names[i], hangye_codes[i], codes[j], names[j]])
-            time.sleep(random.randint(1,3))
-    
-    def save_to_csv(self):
-        with open('sse_hangye.csv','w',newline='',encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerow(['行业名称','行业代码','股票代码','名称'])
-            writer.writerows(self.hangye_date)
-        
-        with open('sse_diqu.csv','w',newline='',encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerow(['地区名称','股票代码','名称'])
-            writer.writerows(self.diqu_date)
+            hangye_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()')
+            hangye_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/@href')
+            hangye_codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[2]/text()')
+            for i in range(len(hangye_names_url)):
+                print(f'driver get url:{self._host}{hangye_names_url[i]}')
+                try:
+                    self.driver.get(f'{self._host}{hangye_names_url[i]}')
+
+                    wait = WebDriverWait(self.driver, 10)
+                    # wait for the industry detail table to load
+                    wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table')))
+                    time.sleep(1)
+                    html = self.driver.page_source
+                    soup = etree.HTML(html)
+                    codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+                    names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+                    for j in range(len(codes)):
+                        self.hangye_date.append([hangye_names[i], hangye_codes[i], codes[j], names[j]])
+                except Exception as e:
+                    print(f'error:{e}')
+        except Exception as e:
+            print(f'error:{e}')
     
     def get_diqu_data(self):
        ''' Fetch region classification data '''
         url_diqus = f'{self._host}/assortment/stock/areatrade/area/'
-        self.driver.get(url_diqus)
-
-        time.sleep(5)
+        print(f'driver url:{url_diqus}')
+        try:
+            self.driver.get(url_diqus)
+        except Exception as e:
+            print(f'error:{e}')
+            return
+        wait = WebDriverWait(self.driver, 10)
+        # wait for the region table to load
+        wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table')))
+        time.sleep(1)
         html = self.driver.page_source
         soup = etree.HTML(html)
         diqu_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
         diqu_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/@href')
 
         for i in range(len(diqu_names_url)):
-            self.driver.get(f'{self._host}{diqu_names_url[i]}')
-            time.sleep(5)
-            html = self.driver.page_source
-            soup = etree.HTML(html)
-            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
-            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
-            for j in range(len(codes)):
-                self.diqu_date.append([diqu_names[i], codes[j], names[j]])
-            time.sleep(random.randint(1,3))
+            try:
+                print(f'driver get url:{self._host}{diqu_names_url[i]}')
+                self.driver.get(f'{self._host}{diqu_names_url[i]}')
 
+                wait = WebDriverWait(self.driver, 10)
+                # wait for the region detail table to load
+                wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table')))
+                time.sleep(1)
+                html = self.driver.page_source
+                soup = etree.HTML(html)
+                codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+                names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+                for j in range(len(codes)):
+                    self.diqu_date.append([diqu_names[i], codes[j], names[j]])
+            except Exception as e:
+                print(f'error:{e}')
 
-    
+    def save_to_csv(self):
+        with open('sse_hangye.csv','w',newline='',encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['行业名称','行业代码','股票代码','名称'])
+            writer.writerows(self.hangye_date)
+        
+        with open('sse_diqu.csv','w',newline='',encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['地区名称','股票代码','名称'])
+            writer.writerows(self.diqu_date)
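
A minimal driver sketch for the refactored crawler, not part of this commit: it assumes `Sse()` takes no constructor arguments and that `crawl()` does not already call `save_to_csv()` internally, since neither detail is visible in this diff.

```python
# Hypothetical usage sketch only -- not in the repository.
from crawl_sse.sse import Sse

if __name__ == '__main__':
    sse = Sse()             # assumption: no-arg constructor
    sse.init_browser()      # headless Edge via EdgeChromiumDriverManager
    try:
        sse.crawl()         # fills sse.hangye_date and sse.diqu_date
        sse.save_to_csv()   # writes sse_hangye.csv and sse_diqu.csv
    finally:
        sse.driver.quit()   # always close the browser, even on failure
```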

+ 77 - 0
上市公司分析.ipynb

@@ -0,0 +1,77 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Listed Company Analysis Report\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Merge and tidy the data\n",
+    "\n",
+    "Combine the region-classified data and the industry-classified data into a single table, sort it, and save it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df_sse_diqu = pd.read_csv('sse_diqu.csv')  # 地区名称\t股票代码\t名称\n",
+    "df_sse_hangye = pd.read_csv('sse_hangye.csv') # 行业名称\t行业代码\t股票代码\t名称\n",
+    "\n",
+    "# Merge into one table: 行业名称, 行业代码, 地区名称, 股票代码, 名称\n",
+    "# sort by 地区名称, then 行业名称\n",
+    "df_sse_diqu_hangye = pd.merge(df_sse_diqu, df_sse_hangye, on='股票代码', how='left')\n",
+    "df_sse_diqu_hangye = df_sse_diqu_hangye.sort_values(by=['地区名称', '行业名称'])\n",
+    "df_sse_diqu_hangye['名称'] = df_sse_diqu_hangye['名称_x']\n",
+    "df_sse_diqu_hangye = df_sse_diqu_hangye.drop(['名称_x', '名称_y'], axis=1)\n",
+    "df_sse_diqu_hangye = df_sse_diqu_hangye[['地区名称', '行业名称', '股票代码', '名称']]\n",
+    "df_sse_diqu_hangye.to_csv('sse_diqu_hangye.csv', index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
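
Because the notebook merges with `how='left'`, any stock that appears in `sse_diqu.csv` but not in `sse_hangye.csv` ends up with an empty 行业名称 in the merged table. A small sanity-check sketch (not part of the commit; it assumes the merged `sse_diqu_hangye.csv` has already been written):

```python
# Sanity-check sketch: count rows the left merge could not match to an industry.
import pandas as pd

df = pd.read_csv('sse_diqu_hangye.csv', dtype={'股票代码': str})
unmatched = df[df['行业名称'].isna()]
print(f'{len(unmatched)} of {len(df)} rows have no industry match')
print(unmatched.head())
```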