12345678910111213141516171819202122232425262728293031323334353637 |
# coding=utf-8
'''
Created on July 1, 2017
@version: python3.6
@author: liuyuqi
'''
from bs4 import BeautifulSoup

# Lianjia second-hand housing listing URL. It is not requested by this
# script, which parses a locally saved copy of the page instead.
url = 'http://sh.lianjia.com/ershoufang/pudongxinqu'

# Disabled download step, kept for reference. Presumably this is how
# resultFile.txt was produced -- confirm before relying on it.
# res = requests.get(url)
# res = res.text.encode(res.encoding).decode('utf-8')
# file = open("resultFile.txt", 'w', encoding='utf-8')
# file.write(res)

# Read the saved HTML; the context manager closes the file even on error.
with open("resultFile.txt", 'r', encoding='UTF-8') as file:
    res = file.read()

# Parse the saved page source with bs4's built-in HTML parser.
soup = BeautifulSoup(res, 'html.parser')

# Work out how many result pages the listing has from the pagination widget.
page = soup.find_all('div', {'class': 'c-pagination'})
if page:
    # One entry per text line of the pagination block, whitespace stripped.
    pages = [i.strip() for i in page[0].text.split('\n')]
    # The page count sits third from the end when there are more than three
    # entries, otherwise second from the end.
    total_pages = int(pages[-3] if len(pages) > 3 else pages[-2])
else:
    # No pagination widget in the document: assume a single page of results
    # instead of failing with an IndexError.
    pages = []
    total_pages = 1
# print(total_pages)

# Every listing on the page is rendered inside a <div class="info">.
listings = soup.find_all(name='div', attrs={'class': 'info'})
# print(len(listings))
if len(listings) < 2:
    raise IndexError(
        "expected at least two <div class='info'> elements in resultFile.txt, "
        "found %d" % len(listings)
    )

# Inspect the second listing as a sample.
res2 = listings[1]
title = res2.find('a')['title']
print(res2)

# Look the detail rows up once and reuse them for each field.
info_rows = res2.find_all('div', {'class': 'info-row'})
name = info_rows[1].find_all('span')[0].text  # residential community name
room_type = info_rows[0].find_all('span')[1].text  # layout (room type)
# size = info_rows[0].find_all('span')[2].text[:-3]  # floor area
print(room_type)
|