# lianjia.py — Lianjia (Chongqing) village & second-hand-house spider
# -*- coding: utf-8 -*-
import json
import re

import scrapy
from scrapy import Selector

from house_spider.items import LianjiaHouseItem, LianjiaVillageItem
  7. class LianjiaSpider(scrapy.Spider):
  8. name = 'lianjia'
  9. allowed_domains = ['cq.lianjia.com']
  10. start_urls = ['cq.lianjia.com']
  11. def __init__(self, **kwargs):
  12. super().__init__(**kwargs)
  13. self.base_url = 'https://cq.lianjia.com'
  14. def start_requests(self):
  15. request_url = 'https://cq.lianjia.com/xiaoqu/'
  16. yield scrapy.Request(url=request_url, callback=self.parse_district_links)
  17. def parse_district_links(self, response):
  18. """提取地区链接"""
  19. sel = Selector(response)
  20. links = sel.css("div[data-role='ershoufang'] div:first-child a::attr(href)").extract()
  21. for link in links:
  22. url = self.base_url + link
  23. yield scrapy.Request(url=url, callback=self.parse_bizcircle_links)
  24. def parse_bizcircle_links(self, response):
  25. """提取商圈链接"""
  26. sel = Selector(response)
  27. links = sel.css("div[data-role='ershoufang'] div:nth-child(2) a::attr(href)").extract()
  28. for link in links:
  29. url = self.base_url + link
  30. yield scrapy.Request(url=url, callback=self.parse_village_list, meta={"ref": url})
  31. def parse_village_list(self, response):
  32. """提取小区链接"""
  33. sel = Selector(response)
  34. links = sel.css(".listContent .xiaoquListItem .img::attr(href)").extract()
  35. for link in links:
  36. yield scrapy.Request(url=link, callback=self.parse_village_detail)
  37. # page
  38. page_data = sel.css(".house-lst-page-box::attr(page-data)").extract_first()
  39. page_data = json.loads(page_data)
  40. if page_data['curPage'] < page_data['totalPage']:
  41. url = response.meta["ref"] + 'pg' + str(page_data['curPage'] + 1)
  42. yield scrapy.Request(url=url, callback=self.parse_village_list, meta=response.meta)
  43. def parse_village_detail(self, response):
  44. """提取小区详情"""
  45. village_url = response.url
  46. sel = Selector(response)
  47. zone = sel.css('.xiaoquDetailbreadCrumbs .l-txt a::text').extract()
  48. latitude = 0
  49. longitude = 0
  50. try:
  51. html = response.body.decode().replace('\r', '')
  52. local = html[html.find('resblockPosition:'):html.find('resblockName') - 1]
  53. m = re.search('(\d.*\d),(\d.*\d)', local)
  54. longitude = m.group(1)
  55. latitude = m.group(2)
  56. except Exception:
  57. pass
  58. item = LianjiaVillageItem()
  59. item['id'] = village_url.replace(self.base_url + '/xiaoqu/', '').replace('/', '')
  60. item['name'] = sel.css('.detailHeader .detailTitle::text').extract_first()
  61. item['address'] = sel.css('.detailHeader .detailDesc::text').extract_first()
  62. item['latitude'] = latitude
  63. item['longitude'] = longitude
  64. item['zone'] = ','.join(zone)
  65. item['year'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(1) .xiaoquInfoContent::text').extract_first()
  66. item['build_type'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(2) .xiaoquInfoContent::text').extract_first()
  67. item['property_costs'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(3) .xiaoquInfoContent::text').extract_first()
  68. item['property_company'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(4) .xiaoquInfoContent::text').extract_first()
  69. item['developers'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(5) .xiaoquInfoContent::text').extract_first()
  70. item['buildings'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(6) .xiaoquInfoContent::text').extract_first()
  71. item['total_house'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(7) .xiaoquInfoContent::text').extract_first()
  72. print(item['name'])
  73. yield item
  74. # 小区房源 https://cq.lianjia.com/ershoufang/c3620038190566370/
  75. url = self.base_url + "/ershoufang/c" + item['id'] + "/"
  76. yield scrapy.Request(url=url, callback=self.parse_house_list, meta={"ref": url})
  77. def parse_house_list(self, response):
  78. """提取房源链接"""
  79. sel = Selector(response)
  80. # 链家有时小区查询不到数据
  81. total = sel.css('.resultDes .total span::text').extract_first()
  82. total = int(total)
  83. if total > 0:
  84. # 提取房源链接
  85. links = sel.css(".sellListContent li .info .title a::attr(href)").extract()
  86. for link in links:
  87. yield scrapy.Request(url=link, callback=self.parse_house_detail)
  88. # 链接分页
  89. page_data = sel.css(".house-lst-page-box::attr(page-data)").extract_first()
  90. page_data = json.loads(page_data)
  91. if page_data['curPage'] == 1 and page_data['totalPage'] > 1:
  92. price = response.url.replace(self.base_url + '/ershoufang/', '')
  93. for x in range(2, page_data['totalPage'] + 1, 1):
  94. url = self.base_url + '/ershoufang/' + 'pg' + str(x) + price
  95. yield scrapy.Request(url=url, callback=self.parse_house_list)
  96. def parse_house_detail(self, response):
  97. """提取房源信息"""
  98. sel = Selector(response)
  99. item = LianjiaHouseItem()
  100. item['房屋Id'] = response.url.replace(self.base_url + '/ershoufang/', '').replace('.html', '')
  101. item['标题'] = sel.css('.title-wrapper .title .main::text').extract_first()
  102. item['售价'] = sel.css('.overview .content .price .total::text').extract_first()
  103. item['小区'] = sel.css('.overview .content .aroundInfo .communityName a.info::text').extract_first()
  104. item['小区ID'] = sel.css('.overview .content .aroundInfo .communityName a.info::attr(href)').extract_first().replace('/xiaoqu/', '').replace('/', '')
  105. item['房屋户型'] = sel.css('#introduction .base .content ul li:nth-child(1)::text').extract_first()
  106. item['所在楼层'] = sel.css('#introduction .base .content ul li:nth-child(2)::text').extract_first()
  107. item['建筑面积'] = sel.css('#introduction .base .content ul li:nth-child(3)::text').extract_first()
  108. item['户型结构'] = sel.css('#introduction .base .content ul li:nth-child(4)::text').extract_first()
  109. item['套内面积'] = sel.css('#introduction .base .content ul li:nth-child(5)::text').extract_first()
  110. item['建筑类型'] = sel.css('#introduction .base .content ul li:nth-child(6)::text').extract_first()
  111. item['房屋朝向'] = sel.css('#introduction .base .content ul li:nth-child(7)::text').extract_first()
  112. item['建筑结构'] = sel.css('#introduction .base .content ul li:nth-child(8)::text').extract_first()
  113. item['装修情况'] = sel.css('#introduction .base .content ul li:nth-child(9)::text').extract_first()
  114. item['梯户比例'] = sel.css('#introduction .base .content ul li:nth-child(10)::text').extract_first()
  115. item['配备电梯'] = sel.css('#introduction .base .content ul li:nth-child(11)::text').extract_first()
  116. item['产权年限'] = sel.css('#introduction .base .content ul li:nth-child(12)::text').extract_first()
  117. item['挂牌时间'] = sel.css('#introduction .transaction .content ul li:nth-child(1) span:nth-child(2)::text').extract_first()
  118. item['交易权属'] = sel.css('#introduction .transaction .content ul li:nth-child(2) span:nth-child(2)::text').extract_first()
  119. item['上次交易'] = sel.css('#introduction .transaction .content ul li:nth-child(3) span:nth-child(2)::text').extract_first()
  120. item['房屋用途'] = sel.css('#introduction .transaction .content ul li:nth-child(4) span:nth-child(2)::text').extract_first()
  121. item['房屋年限'] = sel.css('#introduction .transaction .content ul li:nth-child(5) span:nth-child(2)::text').extract_first()
  122. item['产权所属'] = sel.css('#introduction .transaction .content ul li:nth-child(6) span:nth-child(2)::text').extract_first()
  123. item['抵押信息'] = sel.css('#introduction .transaction .content ul li:nth-child(7) span:nth-child(2)::attr(title)').extract_first()
  124. item['房本备件'] = sel.css('#introduction .transaction .content ul li:nth-child(8) span:nth-child(2)::text').extract_first()
  125. yield item