# test.py (1.3 KB)
  1. # coding=utf-8
  2. '''
  3. Created on 2017年7月1日
  4. @vsersion:python3.6
  5. @author: liuyuqi
  6. '''
  7. from bs4 import BeautifulSoup
  8. url = 'http://sh.lianjia.com/ershoufang/pudongxinqu'
  9. # res=requests.get(url)
  10. # res=res.text.encode(res.encoding).decode('utf-8')
  11. # file = open("resultFile.txt",'w',encoding = 'utf-8')
  12. # file.write(res)
  13. file = open("resultFile.txt", 'r', encoding='UTF-8')
  14. try:
  15. res = file.read()
  16. finally:
  17. file.close()
  18. soup = BeautifulSoup(res, 'html.parser') # 使用bs4模块,对响应的链接源代码进行html解析
  19. page = soup.findAll('div', {'class': 'c-pagination'})
  20. pages = [i.strip() for i in page[0].text.split('\n')] # 抓取出每个区域的二手房链接中所有的页数
  21. if len(pages) > 3:
  22. total_pages = int(pages[-3])
  23. else:
  24. total_pages = int(pages[-2])
  25. # print(total_pages)
  26. find_all = soup.find_all(name='div', attrs={'class': 'info'})
  27. # print(len(find_all))
  28. res2 = find_all[1]
  29. title = res2.find('a')['title']
  30. print(res2)
  31. name = res2.find_all('div', {'class': 'info-row'})[1].find_all('span')[0].text # 每套二手房的小区名称
  32. room_type = res2.find_all('div', {'class': 'info-row'})[0].find_all('span')[1].text # 每套二手房的户型
  33. # size = res2.find_all('div',{'class':'info-row'})[0].find_all('span')[2].text[:-3] # 每套二手房的面积
  34. print(room_type)