|
@@ -0,0 +1,99 @@
|
|
|
+
|
|
|
'''
Created on 2017-07-01

@version: python3.6

@author: liuyuqi
'''
|
|
|
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from nt import chdir  # NOTE(review): `nt` is the Windows-internal os backend; os.chdir is the portable spelling
|
|
|
+
|
|
|
+
|
|
|
+urls = []
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+citys = ['pudongxinqu']
|
|
|
+
|
|
|
+workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
|
|
|
+resultFile = "lianjia.csv"
|
|
|
+
|
|
|
+
|
|
|
class Lianjia(object):
    """Scraper for second-hand housing listings on sh.lianjia.com.

    ``getUrls`` collects every paginated listing URL for the districts in the
    module-level ``citys`` list into the module-level ``urls`` list; ``mSpider``
    then fetches each page and writes one comma-separated row per property to
    the module-level ``resultFile``.
    """

    def __init__(self):
        super(Lianjia, self).__init__()
        # One shared session so the browser-like headers below are sent on
        # every request made through it.
        self.session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'http://sh.lianjia.com/ershoufang/',
        }
        self.session.headers.update(headers)

    def getUrls(self):
        """Populate the module-level ``urls`` list with every listing-page URL
        for each district slug in ``citys``."""
        for city in citys:
            url = 'http://sh.lianjia.com/ershoufang/%s/' % city
            res = self.session.get(url)
            # Re-decode the body as UTF-8 regardless of the detected encoding.
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            page = soup.findAll('div', {'class': 'c-pagination'})
            pages = [p.strip() for p in page[0].text.split('\n')]
            # The total page count sits third-from-last or second-from-last in
            # the pagination widget, depending on how many cells it rendered.
            if len(pages) > 3:
                total_pages = int(pages[-3])
            else:
                total_pages = int(pages[-2])

            for page_no in range(1, total_pages + 1):
                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (city, page_no))

            # Random pause between districts to avoid hammering the server.
            time.sleep(random.randint(2, 10))

    def mSpider(self):
        """Scrape every collected listing page and write one CSV row per
        property to ``resultFile``.

        Row layout: name, room_type, size, region, loucheng (floor),
        chaoxiang (orientation), price, price_union, builtdate.
        """
        self.getUrls()

        # BUG FIX: a context manager guarantees the file is closed even when a
        # request or parse step raises mid-crawl.
        with open(resultFile, 'w', encoding='utf-8') as out:
            for url in urls:
                # BUG FIX: use the shared session (with the browser headers
                # configured in __init__) instead of a bare requests.get.
                res = self.session.get(url)
                res = res.text.encode(res.encoding).decode('utf-8')
                soup = BeautifulSoup(res, 'html.parser')

                for entry in soup.find_all(name='div', attrs={'class': 'info'}):
                    # The 'where' spans hold: estate name, room layout, size.
                    where_spans = entry.find_all('div', {'class': 'where'})[0].find_all('span')
                    name = where_spans[0].text
                    room_type = where_spans[1].text
                    size = where_spans[2].text[:-3]  # strip the unit suffix

                    # The 'con' div holds region / floor / orientation / year
                    # built, one item per line.
                    info = [s.strip() for s in entry.find_all('div', {'class': 'con'})[0].text.split('\n')]
                    region = info[1]
                    loucheng = info[2][2:]
                    chaoxiang = info[5][2:]
                    builtdate = info[-3][2:]

                    price = entry.find('div', {'class': 'price'}).text.strip()[:-1]
                    price_union = entry.find('div', {'class': 'price-pre'}).text.strip()[:-3]

                    out.write(','.join(
                        (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
|
|
|
+
|
|
|
+
|
|
|
# Move into the working directory so resultFile is written there.
# BUG FIX: use the portable os.chdir instead of the Windows-only
# `from nt import chdir` the script previously relied on.
os.chdir(workSpace)
jia = Lianjia()
jia.mSpider()
# Echo every listing-page URL that was crawled.
print(urls)
|