'''
Created on 2017-07-01
@version: python3.6
@author: liuyuqi
'''
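
# Scrape second-hand housing (ershoufang) listings from sh.lianjia.com for the
# districts in `citys` and write one CSV row per listing to `resultFile`.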
import os
import random
import time

import requests
from bs4 import BeautifulSoup
urls = []  # listing-page URLs collected by getUrls()
citys = ['pudongxinqu']  # district slugs to crawl (Pudong New Area)
workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
resultFile = "lianjia.csv"


class Lianjia(object):
    def __init__(self):
        super(Lianjia, self).__init__()
        self.session = requests.Session()
        # Send browser-like headers so the site does not reject the crawler.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'http://sh.lianjia.com/ershoufang/',
        }
        self.session.headers.update(headers)

    def getUrls(self):
        # Collect every listing-page URL for each configured district.
        for city in citys:
            url = 'http://sh.lianjia.com/ershoufang/%s/' % city
            res = self.session.get(url)
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            # The total page count sits in the pagination widget.
            page = soup.findAll('div', {'class': 'c-pagination'})
            pages = [p.strip() for p in page[0].text.split('\n')]
            if len(pages) > 3:
                total_pages = int(pages[-3])
            else:
                total_pages = int(pages[-2])
            for j in range(1, total_pages + 1):
                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (city, j))
            # Random pause between districts to avoid hammering the server.
            time.sleep(random.randint(2, 10))

    def mSpider(self):
        # Build the full URL list first, then scrape each listing page.
        self.getUrls()
        file = open(resultFile, 'w', encoding='utf-8')
        for url in urls:
            res = self.session.get(url)
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            # Each listing sits in a div.info block.
            find_all = soup.find_all(name='div', attrs={'class': 'info'})
            for item in find_all:
                title = item.find('a')['title']  # listing title (not written to the CSV)
                # Estate name, layout and size are the three <span>s inside div.where.
                where = item.find_all('div', {'class': 'where'})[0].find_all('span')
                name = where[0].text
                room_type = where[1].text
                size = where[2].text[:-3]  # strip the trailing area unit
                # Region, floor, orientation and build date come from div.con.
                info = [s.strip() for s in item.find_all('div', {'class': 'con'})[0].text.split('\n')]
                region = info[1]
                loucheng = info[2][2:]   # floor
                chaoxiang = info[5][2:]  # orientation
                builtdate = info[-3][2:]
                # Total price and per-square-metre price, with unit suffixes stripped.
                price = item.find('div', {'class': 'price'}).text.strip()[:-1]
                price_union = item.find('div', {'class': 'price-pre'}).text.strip()[:-3]
                file.write(','.join(
                    (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
        file.close()
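

# Run the spider: switch to the workspace so lianjia.csv is written there.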
if __name__ == '__main__':
    os.chdir(workSpace)
    jia = Lianjia()
    jia.mSpider()
    print(urls)