@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact : liuyuqi.gov@msn.cn
+@Time    : 2023/12/03 03:09:20
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    : Listed companies on the Shanghai Stock Exchange, scraped by
+           industry and by region from:
+
+           http://www.sse.com.cn/assortment/stock/areatrade/area/
+'''
+import csv
+import random
+import time
+
+import requests
+from lxml import etree
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from webdriver_manager.microsoft import EdgeChromiumDriverManager
+
+class Sse(object):
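+    ''' Scrape the SSE listed-company pages: collect the industry and
+    region tables and save them to CSV. '''
+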
+    _host = 'http://www.sse.com.cn'
+    _headers = {
+        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/',
+        # Implicit string concatenation: the old backslash continuation
+        # embedded the next line's indentation inside the User-Agent value.
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
+    }
+
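+    # NOTE: _headers is attached to a requests.Session in __init__, but every
+    # page fetch below goes through Selenium; the session itself is otherwise
+    # unused.
+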
+    def __init__(self):
+        self.sess = requests.Session()
+        self.sess.headers.update(self._headers)
+        # Accumulators for the scraped rows, written out by save_to_csv().
+        self.hangye_data = []  # [industry name, industry code, stock code, stock name]
+        self.diqu_data = []    # [region name, stock code, stock name]
+
+        self.driver = None
+        self.init_browser()
+
+    def init_browser(self):
+        edge_options = webdriver.EdgeOptions()
+        # Keep the browser window open after the script finishes.
+        edge_options.add_experimental_option('detach', True)
+        edge_options.add_experimental_option('excludeSwitches', ['enable-logging'])
+        service = webdriver.EdgeService(EdgeChromiumDriverManager().install())
+        self.driver = webdriver.Edge(service=service, options=edge_options)
+
+    def crawl(self):
+        self.get_hangye_data()
+        self.get_diqu_data()
+        self.save_to_csv()
+        # quit() shuts down the whole WebDriver session; close() would only
+        # close the current window.
+        self.driver.quit()
+
+    def get_hangye_data(self):
+        ''' Fetch the industry (hangye) listings. '''
+        url_hangyes = f'{self._host}/assortment/stock/areatrade/trade/'
+        self.driver.get(url_hangyes)
+
+        # Wait for the industry table to render. The XPath must select the
+        # <a> element itself: appending /text() makes the expression return
+        # a text node, and Selenium raises InvalidSelectorException.
+        wait = WebDriverWait(self.driver, 10)
+        wait.until(EC.presence_of_element_located(
+            (By.XPATH, '/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a')))
+
+        html = self.driver.page_source
+        soup = etree.HTML(html)
+        # NOTE: these absolute XPaths are tied to the current page layout and
+        # will break if the SSE markup changes.
+        hangye_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/text()')
+        hangye_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[1]/a/@href')
+        hangye_codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody[2]/tr/td[2]/text()')
+
+        # Visit each industry page and collect its stock codes and names.
+        for i in range(len(hangye_names_url)):
+            self.driver.get(f'{self._host}{hangye_names_url[i]}')
+            time.sleep(5)
+            html = self.driver.page_source
+            soup = etree.HTML(html)
+            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+            for j in range(len(codes)):
+                self.hangye_data.append([hangye_names[i], hangye_codes[i], codes[j], names[j]])
+            # Randomized delay between requests to avoid hammering the site.
+            time.sleep(random.randint(1, 3))
+
+    def save_to_csv(self):
+        with open('sse_hangye.csv', 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['industry_name', 'industry_code', 'stock_code', 'stock_name'])
+            writer.writerows(self.hangye_data)
+
+        with open('sse_diqu.csv', 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['region_name', 'stock_code', 'stock_name'])
+            writer.writerows(self.diqu_data)
+
+    def get_diqu_data(self):
+        ''' Fetch the region (diqu) listings. '''
+        url_diqus = f'{self._host}/assortment/stock/areatrade/area/'
+        self.driver.get(url_diqus)
+
+        time.sleep(5)
+        html = self.driver.page_source
+        soup = etree.HTML(html)
+        diqu_names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+        diqu_names_url = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/@href')
+
+        # Visit each region page and collect its stock codes and names.
+        for i in range(len(diqu_names_url)):
+            self.driver.get(f'{self._host}{diqu_names_url[i]}')
+            time.sleep(5)
+            html = self.driver.page_source
+            soup = etree.HTML(html)
+            codes = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[1]/a/text()')
+            names = soup.xpath('/html/body/div[8]/div/div[2]/div/div[1]/div[1]/table/tbody/tr/td[2]/text()')
+            for j in range(len(codes)):
+                self.diqu_data.append([diqu_names[i], codes[j], names[j]])
+            # Randomized delay between requests to avoid hammering the site.
+            time.sleep(random.randint(1, 3))
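+
+# Minimal usage sketch (an assumption, not part of the original patch): run
+# the full crawl; results land in sse_hangye.csv and sse_diqu.csv in the
+# current working directory.
+if __name__ == '__main__':
+    sse = Sse()
+    sse.crawl()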