getData.py 4.4 KB

# coding=utf-8
'''
Created on 2017-07-01
@version: python3.6
@author: liuyuqi
'''
import random
import time

# modules used for crawling and HTML parsing
import requests
from bs4 import BeautifulSoup
from os import chdir

class Lianjia(object):

    def __init__(self):
        super(Lianjia, self).__init__()
        self.session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'http://sh.lianjia.com/ershoufang/',
        }
        self.urls = []
        self.session.headers.update(headers)
    def getUrls(self, citys):
        # build the full crawl URL list, one district at a time
        for city in citys:
            url = 'http://sh.lianjia.com/ershoufang/%s/' % city
            res = self.session.get(url)  # send the GET request
            res = res.text.encode(res.encoding).decode('utf-8')  # re-encode to UTF-8, otherwise the text comes back garbled
            soup = BeautifulSoup(res, 'html.parser')  # parse the response HTML with bs4
            page = soup.findAll('div', {'class': 'c-pagination'})  # findAll grabs the pagination block by tag and class
            pages = [i.strip() for i in page[0].text.split('\n')]  # pull the page numbers out of this district's listing index
            if len(pages) > 3:
                total_pages = int(pages[-3])
            else:
                total_pages = int(pages[-2])
            for j in list(range(1, total_pages + 1)):  # assemble every listing-page URL that needs to be crawled
                self.urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (city, j))
            # sleep a random 2-10 s between districts
            time.sleep(random.randint(2, 10))
    def mSpider(self, citys):
        # collect all listing-page URLs
        self.getUrls(citys)
        # create the csv file the scraped data will be saved to
        file = open('链家二手房.csv', 'w', encoding='utf-8')
        for url in self.urls:  # fetch each page and collect every matching listing block in find_all
            res = requests.get(url)
            res = res.text.encode(res.encoding).decode('utf-8')
            soup = BeautifulSoup(res, 'html.parser')
            find_all = soup.find_all(name='div', attrs={'class': 'info'})
            for i in list(range(len(find_all))):  # pull the individual fields out of each listing
                title = find_all[i].find('a')['title']  # listing headline
                res2 = find_all[i]
                name = res2.find_all('div', {'class': 'where'})[0].find_all('span')[0].text  # residential-complex name
                room_type = res2.find_all('div', {'class': 'where'})[0].find_all('span')[1].text  # floor plan
                size = res2.find_all('div', {'class': 'where'})[0].find_all('span')[2].text[:-3]  # floor area
                # strip leading/trailing whitespace with a list comprehension
                info = [i.strip() for i in res2.find_all('div', {'class': 'con'})[0].text.split('\n')]
                region = info[1]  # district the listing is in
                loucheng = info[2][2:]  # floor
                chaoxiang = info[5][2:]  # orientation
                builtdate = info[-3][2:]  # year built
                # total price of the listing
                price = find_all[i].find('div', {'class': 'price'}).text.strip()[:-1]
                # price per square metre
                price_union = find_all[i].find('div', {'class': 'price-pre'}).text.strip()[:-3]
                # print(name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)
                # write the fields above as one row of the csv file
                file.write(','.join(
                    (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
        # close the file (otherwise the data never gets flushed to the csv file)
        file.close()

if __name__ == "__main__":
    workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
    chdir(workSpace)
    # the Shanghai district names to crawl
    citys = ['pudongxinqu', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu', 'changning', 'songjiang',
             'jiading', 'huangpu', 'jinan', 'zhabei', 'hongkou', 'qingpu', 'fengxian', 'jinshan', 'chongming']
    lianjia = Lianjia()
    lianjia.mSpider(citys)