commit 291523ea66 by liuyuqi-dellpc, 1 year ago
6 changed files with 163 additions and 0 deletions
  1. .gitignore (+2 -0)
  2. README.md (+12 -0)
  3. crawl_sse/__init__.py (+2 -0)
  4. crawl_sse/sse.py (+124 -0)
  5. main.py (+14 -0)
  6. requirements.txt (+9 -0)

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+*.csv
+*.pyc

+ 12 - 0
README.md

@@ -0,0 +1,12 @@
+# crawl_sse
+
+Crawl listed-company data from the Shanghai Stock Exchange (http://www.sse.com.cn), grouped by industry and by region. Results are written to `sse_hangye.csv` and `sse_diqu.csv`.
+
+```
+virtualenv .venv
+source .venv/bin/activate
+
+pip install -r requirements.txt
+
+python main.py
+```

+ 2 - 0
crawl_sse/__init__.py

@@ -0,0 +1,2 @@
+from .sse import Sse
+

+ 124 - 0
crawl_sse/sse.py

@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/12/03 03:09:20
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Listed companies on the Shanghai Stock Exchange (SSE)
+
+http://www.sse.com.cn/assortment/stock/areatrade/area/
+
+'''
+import requests
+from lxml import etree
+import csv, time, random
+from webdriver_manager.microsoft import EdgeChromiumDriverManager
+from selenium import webdriver
+
+class Sse(object):
+    
+    _host = r'http://www.sse.com.cn'
+    _headers = {
+        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
+    }
+
+    def __init__(self):
+        self.sess = requests.Session()
+        self.sess.headers.update(self._headers)
+        self.hangye_data = []
+        self.diqu_data = []
+
+        self.driver = None
+        self.init_browser()
+
+    def init_browser(self):
+        ''' Launch Microsoft Edge, letting webdriver-manager fetch the driver binary. '''
+        edge_options = webdriver.EdgeOptions()
+        edge_options.add_experimental_option('detach', True)
+        edge_options.add_experimental_option('excludeSwitches', ['enable-logging'])
+        service = webdriver.EdgeService(EdgeChromiumDriverManager().install())
+        self.driver = webdriver.Edge(service=service, options=edge_options)
+
+    def crawl(self):
+        ''' Crawl industry and region data, then write both CSV files. '''
+        self.get_hangye_data()
+        self.get_diqu_data()
+        self.save_to_csv()
+        self.driver.quit()
+
+    def get_hangye_data(self):
+        ''' Fetch listed companies grouped by industry (hangye). '''
+        url_hangyes = f'{self._host}/assortment/stock/areatrade/trade/'
+        self.driver.get(url_hangyes)
+        # An explicit WebDriverWait on the XPath below raises InvalidSelectorException:
+        # an expression ending in /text() returns a text node, not an element.
+        # Fall back to a fixed delay while the dynamic table renders.
+        time.sleep(5)
+
+        html = self.driver.page_source
+        soup = etree.HTML(html)
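+        # Listing table: column 1 holds the industry name and detail-page link, column 2 the industry code.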
+        hangye_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()')
+        hangye_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/@href')
+        hangye_codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[2]/text()')
+        
+        for i in range(len(hangye_names_url)):
+            self.driver.get(f'{self._host}{hangye_names_url[i]}')
+            time.sleep(5)
+            html = self.driver.page_source
+            soup = etree.HTML(html)
+            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+            for j in range(len(codes)):
+                self.hangye_data.append([hangye_names[i], hangye_codes[i], codes[j], names[j]])
+            time.sleep(random.randint(1,3))
+    
+    def save_to_csv(self):
+        ''' Write industry and region results to UTF-8 CSV files. '''
+        with open('sse_hangye.csv', 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['Industry Name', 'Industry Code', 'Stock Code', 'Company Name'])
+            writer.writerows(self.hangye_data)
+
+        with open('sse_diqu.csv', 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['Region Name', 'Stock Code', 'Company Name'])
+            writer.writerows(self.diqu_data)
+    
+    def get_diqu_data(self):
+        ''' Fetch listed companies grouped by region (diqu). '''
+        url_diqus = f'{self._host}/assortment/stock/areatrade/area/'
+        self.driver.get(url_diqus)
+
+        time.sleep(5)
+        html = self.driver.page_source
+        soup = etree.HTML(html)
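+        # Region listing table: column 1 holds the region name and its detail-page link.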
+        diqu_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+        diqu_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/@href')
+
+        for i in range(len(diqu_names_url)):
+            self.driver.get(f'{self._host}{diqu_names_url[i]}')
+            time.sleep(5)
+            html = self.driver.page_source
+            soup = etree.HTML(html)
+            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+            for j in range(len(codes)):
+                self.diqu_data.append([diqu_names[i], codes[j], names[j]])
+            time.sleep(random.randint(1,3))
+

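The fixed `time.sleep(5)` calls in `sse.py` stand in for the explicit wait the author abandoned: `presence_of_element_located` raises `InvalidSelectorException` when the XPath ends in `/text()`, because that expression yields a text node rather than an element. Below is a minimal sketch of an explicit wait that targets the `<a>` elements themselves (the same table XPath with the trailing `/text()` dropped); the 15-second timeout is an assumption:

```
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Same first-column XPath as in sse.py, minus the trailing /text() step.
ROW_LINK_XPATH = '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a'

def wait_for_table(driver, timeout=15):
    ''' Block until the listing table's first-column links are rendered. '''
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, ROW_LINK_XPATH)))
```

Once the wait returns, `driver.page_source` can be parsed with `lxml` exactly as `get_hangye_data` does now.
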
+ 14 - 0
main.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/12/03 03:09:35
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+'''
+
+from crawl_sse import Sse
+
+if __name__ == '__main__':
+    sse = Sse()
+    sse.crawl()

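After `python main.py` completes, the crawl leaves `sse_hangye.csv` and `sse_diqu.csv` in the working directory. A quick sanity check of the output with pandas (an assumption here; pandas is not in requirements.txt):

```
import pandas as pd

# Read everything as strings so stock codes are not coerced to integers.
hangye = pd.read_csv('sse_hangye.csv', dtype=str)
diqu = pd.read_csv('sse_diqu.csv', dtype=str)
print(len(hangye), 'industry rows;', len(diqu), 'region rows')
print(hangye.head())
```
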
+ 9 - 0
requirements.txt

@@ -0,0 +1,9 @@
+requests==2.31.0
+python-dotenv==0.21.1
+bs4
+lxml==4.9.3
+selenium==4.11.2
+webdriver_manager==4.0.1