# coding=utf-8
# Origin: lyq/crawl_secondhand
'''
Created on 2017-07-01
@version: python3.6
@author: liuyuqi
'''
from bs4 import BeautifulSoup

# Lianjia second-hand housing ("ershoufang") listing URL for "pudongxinqu".
url = 'http://sh.lianjia.com/ershoufang/pudongxinqu'

# The page is not downloaded at run time; a locally saved copy is parsed
# instead. Presumably that copy was produced once with the disabled snippet
# below (it needs `requests`, which this file does not import):
# res=requests.get(url)
# res=res.text.encode(res.encoding).decode('utf-8')
# file = open("resultFile.txt",'w',encoding = 'utf-8')
# file.write(res)
with open("resultFile.txt", 'r', encoding='UTF-8') as file:
    # Load the whole saved document; the handle is closed when the
    # `with` block exits, whether or not the read succeeds.
    res = file.read()

# Parse the saved HTML source with bs4, using Python's built-in
# 'html.parser' backend.
soup = BeautifulSoup(res, 'html.parser')

# Collect the pagination container(s). `find_all` is used instead of the
# legacy `findAll` alias so the lookup matches the other queries in this
# script.
page = soup.find_all('div', {'class': 'c-pagination'})
if not page:
    # Same exception type as the bare `page[0]` would raise, but with a
    # message that says what is actually missing.
    raise IndexError(
        "no <div class='c-pagination'> element found in the parsed page"
    )

# One whitespace-stripped entry per text line of the first pagination block;
# the total page count for the listing is read from these entries.
pages = [i.strip() for i in page[0].text.split('\n')]

# The page count is taken by position from the end of the list.
# NOTE(review): presumably the trailing entries are a "next page" control
# and/or blank lines - confirm the offsets against the real markup.
if len(pages) > 3:
    total_pages = int(pages[-3])
else:
    total_pages = int(pages[-2])
# print(total_pages)
# Every <div class="info"> element in the page.
# NOTE(review): presumably one per listing card - confirm against the markup.
find_all = soup.find_all('div', class_='info')
# print(len(find_all))

# Only the second match (index 1) is inspected below.
res2 = find_all[1]
title = res2.find('a')['title']
print(res2)

# The "info-row" children of that element, looked up once and reused.
info_rows = res2.find_all('div', class_='info-row')
# Residential community name of the listing: first <span> of the second row.
name = info_rows[1].find_all('span')[0].text
# Room layout of the listing: second <span> of the first row.
room_type = info_rows[0].find_all('span')[1].text
# Floor area of the listing (disabled):
# size = info_rows[0].find_all('span')[2].text[:-3]
print(room_type)