getData.py

# coding=utf-8
'''
Created on 2017-07-01
@version: python3.6
@author: liuyuqi
'''
import random
import time
# Third-party modules used by the crawler
import requests
from bs4 import BeautifulSoup
# Use the portable os.chdir rather than the Windows-internal nt module
from os import chdir

# Empty list that will collect all of the crawl URLs
urls = []
# Shanghai districts to crawl
# citys = ['pudongxinqu','minhang','baoshan','xuhui','putuo','yangpu','changning','songjiang',
# 'jiading','huangpu','jinan','zhabei','hongkou','qingpu','fengxian','jinshan','chongming']
citys = ['pudongxinqu']
workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
resultFile = "lianjia.csv"


class Lianjia(object):
    def __init__(self):
        super(Lianjia, self).__init__()
        self.session = requests.Session()
        # Send a browser-like User-Agent and Referer so the site serves normal pages
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'http://sh.lianjia.com/ershoufang/',
        }
        self.session.headers.update(headers)

    def getUrls(self):
        # Build the complete list of listing-page URLs for each district
        for i in citys:
            url = 'http://sh.lianjia.com/ershoufang/%s/' % i
            res = self.session.get(url)  # send the GET request
            res = res.text.encode(res.encoding).decode('utf-8')  # re-decode, otherwise the text is garbled
            soup = BeautifulSoup(res, 'html.parser')  # parse the response HTML with bs4
            page = soup.find_all('div', {'class': 'c-pagination'})  # content under the given tag and class
            pages = [p.strip() for p in page[0].text.split('\n')]  # pull out the page count for this district's listings
            if len(pages) > 3:
                total_pages = int(pages[-3])
            else:
                total_pages = int(pages[-2])
            for j in range(1, total_pages + 1):  # assemble every page URL that needs to be crawled
                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (i, j))
            # Sleep for a random 2-10 seconds between districts
            time.sleep(random.randint(2, 10))

    def mSpider(self):
        # Collect all the URLs first
        self.getUrls()
        # Create the CSV file that the data will be saved to
        file = open(resultFile, 'w', encoding='utf-8')
        for url in urls:
            # Fetch each page with the session (so the headers from __init__ are sent)
            # and collect every matching listing block into find_all
            res = self.session.get(url)
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            find_all = soup.find_all(name='div', attrs={'class': 'info'})
            for i in range(len(find_all)):
                # Extract each required field of the listing
                title = find_all[i].find('a')['title']  # headline of the listing (not written to the CSV)
                res2 = find_all[i]
                name = res2.find_all('div', {'class': 'where'})[0].find_all('span')[0].text  # residential-complex name
                room_type = res2.find_all('div', {'class': 'where'})[0].find_all('span')[1].text  # floor plan
                size = res2.find_all('div', {'class': 'where'})[0].find_all('span')[2].text[:-3]  # floor area
                # Use a list comprehension to strip leading/trailing whitespace from each line
                info = [item.strip() for item in res2.find_all('div', {'class': 'con'})[0].text.split('\n')]
                region = info[1]  # district the listing belongs to
                loucheng = info[2][2:]  # floor
                chaoxiang = info[5][2:]  # orientation
                builtdate = info[-3][2:]  # construction date
                # Total price of the listing (trailing unit character stripped)
                price = find_all[i].find('div', {'class': 'price'}).text.strip()[:-1]
                # Price per square metre (trailing unit characters stripped)
                price_union = find_all[i].find('div', {'class': 'price-pre'}).text.strip()[:-3]
                # print(name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)
                # Write the fields above as one CSV row
                file.write(','.join(
                    (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
        # Close the file (otherwise the buffered data never reaches the CSV)
        file.close()


if __name__ == '__main__':
    chdir(workSpace)
    jia = Lianjia()
    jia.mSpider()
    print(urls)
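
The crawler writes lianjia.csv without a header row, one listing per line, in the field order used in mSpider. Below is a minimal sketch of loading that output for later analysis, assuming pandas is available; the column names are illustrative labels mirroring the write order and are not defined anywhere by the script itself.

import pandas as pd

# Assumed column labels, matching the order written in mSpider
columns = ['name', 'room_type', 'size', 'region', 'loucheng',
           'chaoxiang', 'price', 'price_union', 'builtdate']
df = pd.read_csv('lianjia.csv', names=columns, encoding='utf-8')
print(df.head())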