1 year ago · 0abc08bd54
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,37 @@
 
															+# 58

														
 
															+58_username="jianboy"

														
 
															+58_token="xx"

														
 
															+58_private=true

														
 
															+

														
 
															+# 168

														
 
															+168_host="https://168.com"

														
 
															+168_username="jianboy"

														
 
															+168_token="glpat-xx-m1"

														
 
															+168_private=true

														
 
															+

														
 
															+# dangdang

														
 
															+dangdang_host = "https://git.yoqi.me"

														
 
															+dangdang_username="xx"

														
 
															+dangdang_token="xx"

														
 
															+dangdang_private=true

														
 
															+

														
 
															+#jingdong

														
 
															+jingdong_username="xx"

														
 
															+jingdong_token=""

														
 
															+jingdong_private=true

														
 
															+

														
 
															+# paipai

														
 
															+paipai_username="xx"

														
 
															+paipai_token="xx"

														
 
															+paipai_private=true

														
 
															+

														
 
															+# xianyu

														
 
															+xianyu_username="xx"

														
 
															+xianyu_token="xx"

														
 
															+xianyu_project="flutter-team"

														
 
															+xianyu_private=true

														
 
															+

														
 
															+# zhuagnzhuan

														
 
															+zhuagnzhuan_username=""

														
 
															+zhuagnzhuan_token=""

														
 
															+

														
--- a/README.md
+++ b/README.md
@@ -3,15 +3,17 @@
 
															 [![Version](https://img.shields.io/badge/version-v1.1.0-brightgreen)](https://git.yoqi.me/lyq/crawl_secondhand)
														
 
															 [![.Python](https://img.shields.io/badge/Python-v3.8.5-brightgreen?style=plastic)](https://git.yoqi.me/lyq/crawl_secondhand)
														
 
															-二手商品监控。
														
 
															+二手商品监控，二手车，二手房，租房，电子产品。
														
 
															 1、指定商品定时爬取，有新商品时发送邮件通知。
														
 
															 2、比价，有新用户发布同样的商品时发送邮件通知。
														
 
															-## Usage
														
 
															-
														
 
															+链家房源爬虫，通过小区信息爬取所有房源，基于 Scrapy。
														
 
															+## Usage
														
 
															+    在 settings.py 中配置 MongoDB
														
 
															+    run run.py
														
 
															 **Web版本**
														
--- a/crawl_house.py
+++ b/crawl_house.py
@@ -0,0 +1,31 @@
 
															+#!/usr/bin/env python
														
 
															+# -*- encoding: utf-8 -*-
														
 
															+'''
														
 
															+@Contact :   liuyuqi.gov@msn.cn
														
 
															+@Time    :   2023/09/22 17:42:53
														
 
															+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
														
 
															+@Desc    :   
														
 
															+'''
														
 
															+
														
 
															+from scrapy import cmdline
														
 
															+from scrapy.utils.project import get_project_settings
														
 
															+from scrapy.scheduler import Scheduler, CrawlerProcess, CrawlerRunner
														
 
															+from scrapy.utils.project import reactor, defer
														
 
															+from twisted.internet import CrawlerRunner
														
 
															+from scrapy.crawler import CrawlerRunner
														
 
															+from scrapy.utils.log import configure_logging
														
 
															+
														
 
															+import os,crawl_house
														
 
															+
														
 
															+configure_loging()
														
 
															+runner = CrawlerRunner(get_project_settings())
														
 
															+
														
 
															+@defer.inlineCallbacks
														
 
															+def crawl():
														
 
															+    # yield runner.crawl(crawl_house)
														
 
															+    cmdline.execute('scrapy genspider -o houses.csv houses.com'.split())
														
 
															+    reactor.stop()
														
 
															+
														
 
															+if __name__=='__main__':
														
 
															+    crawl()
														
 
															+    reactor.run()
														
--- a/crawl_house/__init__.py
+++ b/crawl_house/__init__.py
--- a/crawl_house/items.py
+++ b/crawl_house/items.py
@@ -0,0 +1,60 @@
 
															+# -*- coding: utf-8 -*-
														
 
															+
														
 
															+# Define here the models for your scraped items
														
 
															+#
														
 
															+# See documentation in:
														
 
															+# https://doc.scrapy.org/en/latest/topics/items.html
														
 
															+
														
 
															+import scrapy
														
 
															+
														
 
															+
														
 
class HouseSpiderItem(scrapy.Item):
    """Placeholder item that declares no fields."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
														
 
															+
														
 
															+
														
 
class LianjiaVillageItem(scrapy.Item):
    """Item describing one Lianjia residential community."""
    # Lianjia residential community
    # Name of the MongoDB collection the save pipeline writes this item to.
    collection = 'lianjia_village'
    id = scrapy.Field()
    name = scrapy.Field()
    zone = scrapy.Field()
    address = scrapy.Field()
    latitude = scrapy.Field()
    longitude = scrapy.Field()
    year = scrapy.Field()
    build_type = scrapy.Field()
    property_costs = scrapy.Field()
    property_company = scrapy.Field()
    developers = scrapy.Field()
    buildings = scrapy.Field()
    total_house = scrapy.Field()
														
 
															+
														
 
class LianjiaHouseItem(scrapy.Item):
    """Item describing one Lianjia house listing; field names are in Chinese."""
    # Name of the MongoDB collection the save pipeline writes this item to.
    collection = 'lianjia_House'
    房屋Id = scrapy.Field()  # house ID
    标题 = scrapy.Field()  # title
    售价 = scrapy.Field()  # asking price
    小区 = scrapy.Field()  # residential community
    小区ID = scrapy.Field()  # residential community ID
    房屋户型 = scrapy.Field()  # room layout
    所在楼层 = scrapy.Field()  # floor
    建筑面积 = scrapy.Field()  # gross floor area
    户型结构 = scrapy.Field()  # layout structure
    套内面积 = scrapy.Field()  # interior floor area
    建筑类型 = scrapy.Field()  # building type
    房屋朝向 = scrapy.Field()  # orientation
    建筑结构 = scrapy.Field()  # building structure
    装修情况 = scrapy.Field()  # renovation status
    梯户比例 = scrapy.Field()  # elevator-to-household ratio
    配备电梯 = scrapy.Field()  # elevator available
    产权年限 = scrapy.Field()  # property-right term
    挂牌时间 = scrapy.Field()  # listing date
    交易权属 = scrapy.Field()  # transaction ownership type
    上次交易 = scrapy.Field()  # last transaction
    房屋用途 = scrapy.Field()  # house usage
    房屋年限 = scrapy.Field()  # years of ownership
    产权所属 = scrapy.Field()  # ownership
    抵押信息 = scrapy.Field()  # mortgage information
    房本备件 = scrapy.Field()  # title-deed documents
														
--- a/crawl_house/middlewares.py
+++ b/crawl_house/middlewares.py
@@ -0,0 +1,103 @@
 
															+# -*- coding: utf-8 -*-
														
 
															+
														
 
															+# Define here the models for your spider middleware
														
 
															+#
														
 
															+# See documentation in:
														
 
															+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
														
 
															+
														
 
															+from scrapy import signals
														
 
															+
														
 
															+
														
 
															+class HouseSpiderSpiderMiddleware(object):
														
 
															+    # Not all methods need to be defined. If a method is not defined,
														
 
															+    # scrapy acts as if the spider middleware does not modify the
														
 
															+    # passed objects.
														
 
															+
														
 
															+    @classmethod
														
 
															+    def from_crawler(cls, crawler):
														
 
															+        # This method is used by Scrapy to create your spiders.
														
 
															+        s = cls()
														
 
															+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
														
 
															+        return s
														
 
															+
														
 
															+    def process_spider_input(self, response, spider):
														
 
															+        # Called for each response that goes through the spider
														
 
															+        # middleware and into the spider.
														
 
															+
														
 
															+        # Should return None or raise an exception.
														
 
															+        return None
														
 
															+
														
 
															+    def process_spider_output(self, response, result, spider):
														
 
															+        # Called with the results returned from the Spider, after
														
 
															+        # it has processed the response.
														
 
															+
														
 
															+        # Must return an iterable of Request, dict or Item objects.
														
 
															+        for i in result:
														
 
															+            yield i
														
 
															+
														
 
															+    def process_spider_exception(self, response, exception, spider):
														
 
															+        # Called when a spider or process_spider_input() method
														
 
															+        # (from other spider middleware) raises an exception.
														
 
															+
														
 
															+        # Should return either None or an iterable of Response, dict
														
 
															+        # or Item objects.
														
 
															+        pass
														
 
															+
														
 
															+    def process_start_requests(self, start_requests, spider):
														
 
															+        # Called with the start requests of the spider, and works
														
 
															+        # similarly to the process_spider_output() method, except
														
 
															+        # that it doesn’t have a response associated.
														
 
															+
														
 
															+        # Must return only requests (not items).
														
 
															+        for r in start_requests:
														
 
															+            yield r
														
 
															+
														
 
															+    def spider_opened(self, spider):
														
 
															+        spider.logger.info('Spider opened: %s' % spider.name)
														
 
															+
														
 
															+
														
 
															+class HouseSpiderDownloaderMiddleware(object):
														
 
															+    # Not all methods need to be defined. If a method is not defined,
														
 
															+    # scrapy acts as if the downloader middleware does not modify the
														
 
															+    # passed objects.
														
 
															+
														
 
															+    @classmethod
														
 
															+    def from_crawler(cls, crawler):
														
 
															+        # This method is used by Scrapy to create your spiders.
														
 
															+        s = cls()
														
 
															+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
														
 
															+        return s
														
 
															+
														
 
															+    def process_request(self, request, spider):
														
 
															+        # Called for each request that goes through the downloader
														
 
															+        # middleware.
														
 
															+
														
 
															+        # Must either:
														
 
															+        # - return None: continue processing this request
														
 
															+        # - or return a Response object
														
 
															+        # - or return a Request object
														
 
															+        # - or raise IgnoreRequest: process_exception() methods of
														
 
															+        #   installed downloader middleware will be called
														
 
															+        return None
														
 
															+
														
 
															+    def process_response(self, request, response, spider):
														
 
															+        # Called with the response returned from the downloader.
														
 
															+
														
 
															+        # Must either;
														
 
															+        # - return a Response object
														
 
															+        # - return a Request object
														
 
															+        # - or raise IgnoreRequest
														
 
															+        return response
														
 
															+
														
 
															+    def process_exception(self, request, exception, spider):
														
 
															+        # Called when a download handler or a process_request()
														
 
															+        # (from other downloader middleware) raises an exception.
														
 
															+
														
 
															+        # Must either:
														
 
															+        # - return None: continue processing this exception
														
 
															+        # - return a Response object: stops process_exception() chain
														
 
															+        # - return a Request object: stops process_exception() chain
														
 
															+        pass
														
 
															+
														
 
															+    def spider_opened(self, spider):
														
 
															+        spider.logger.info('Spider opened: %s' % spider.name)
														
--- a/crawl_house/pipelines.py
+++ b/crawl_house/pipelines.py
@@ -0,0 +1,25 @@
 
															+# -*- coding: utf-8 -*-
														
 
															+
														
 
															+# Define your item pipelines here
														
 
															+#
														
 
															+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
														
 
															+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
														
 
import pymongo

try:
    # Legacy global settings object. Newer Scrapy releases no longer ship the
    # scrapy.conf module, so tolerate its absence instead of failing on import.
    from scrapy.conf import settings
except ImportError:
    settings = None
														
 
															+
														
 
															+
														
 
															+class HouseSpiderPipeline(object):
														
 
															+    def process_item(self, item, spider):
														
 
															+        return item
														
 
															+
														
 
															+
														
 
															+class LianjiaVillageSavePipeline(object):
														
 
															+    def __init__(self):
														
 
															+        pass
														
 
															+
														
 
															+    def process_item(self, item, spider):
														
 
															+        if spider.name == 'lianjia':
														
 
															+            client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
														
 
															+            db = client['house']
														
 
															+            coll = db[item.collection]
														
 
															+            coll.insert(dict(item))
														
--- a/crawl_house/run.py
+++ b/crawl_house/run.py
@@ -0,0 +1,3 @@
 
															+from scrapy import cmdline
														
 
															+
														
 
															+cmdline.execute("scrapy crawl lianjia".split())
														
--- a/crawl_house/settings.py
+++ b/crawl_house/settings.py
@@ -0,0 +1,94 @@
 
															+# -*- coding: utf-8 -*-
														
 
															+
														
 
															+# Scrapy settings for house_spider project
														
 
															+#
														
 
															+# For simplicity, this file contains only settings considered important or
														
 
															+# commonly used. You can find more settings consulting the documentation:
														
 
															+#
														
 
															+#     https://doc.scrapy.org/en/latest/topics/settings.html
														
 
															+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
														
 
															+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
														
 
															+
														
 
															+BOT_NAME = 'house_spider'
														
 
															+
														
 
															+SPIDER_MODULES = ['house_spider.spiders']
														
 
															+NEWSPIDER_MODULE = 'house_spider.spiders'
														
 
															+
														
 
															+
														
 
															+# Crawl responsibly by identifying yourself (and your website) on the user-agent
														
 
															+#USER_AGENT = 'house_spider (+http://www.yourdomain.com)'
														
 
															+
														
 
															+# Obey robots.txt rules
														
 
															+ROBOTSTXT_OBEY = True
														
 
															+
														
 
															+# Configure maximum concurrent requests performed by Scrapy (default: 16)
														
 
															+#CONCURRENT_REQUESTS = 32
														
 
															+
														
 
															+# Configure a delay for requests for the same website (default: 0)
														
 
															+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
														
 
															+# See also autothrottle settings and docs
														
 
															+#DOWNLOAD_DELAY = 3
														
 
															+# The download delay setting will honor only one of:
														
 
															+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
														
 
															+#CONCURRENT_REQUESTS_PER_IP = 16
														
 
															+
														
 
															+# Disable cookies (enabled by default)
														
 
															+#COOKIES_ENABLED = False
														
 
															+
														
 
															+# Disable Telnet Console (enabled by default)
														
 
															+#TELNETCONSOLE_ENABLED = False
														
 
															+
														
 
															+# Override the default request headers:
														
 
															+#DEFAULT_REQUEST_HEADERS = {
														
 
															+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
														
 
															+#   'Accept-Language': 'en',
														
 
															+#}
														
 
															+
														
 
															+# Enable or disable spider middlewares
														
 
															+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
														
 
															+#SPIDER_MIDDLEWARES = {
														
 
															+#    'house_spider.middlewares.HouseSpiderSpiderMiddleware': 543,
														
 
															+#}
														
 
															+
														
 
															+# Enable or disable downloader middlewares
														
 
															+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
														
 
															+#DOWNLOADER_MIDDLEWARES = {
														
 
															+#    'house_spider.middlewares.HouseSpiderDownloaderMiddleware': 543,
														
 
															+#}
														
 
															+
														
 
															+# Enable or disable extensions
														
 
															+# See https://doc.scrapy.org/en/latest/topics/extensions.html
														
 
															+#EXTENSIONS = {
														
 
															+#    'scrapy.extensions.telnet.TelnetConsole': None,
														
 
															+#}
														
 
															+
														
 
															+# Configure item pipelines
														
 
															+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
														
 
															+ITEM_PIPELINES = {
														
 
															+    'house_spider.pipelines.HouseSpiderPipeline': 300,
														
 
															+    'house_spider.pipelines.LianjiaVillageSavePipeline': 301,
														
 
															+}
														
 
															+
														
 
															+# Enable and configure the AutoThrottle extension (disabled by default)
														
 
															+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
														
 
															+#AUTOTHROTTLE_ENABLED = True
														
 
															+# The initial download delay
														
 
															+#AUTOTHROTTLE_START_DELAY = 5
														
 
															+# The maximum download delay to be set in case of high latencies
														
 
															+#AUTOTHROTTLE_MAX_DELAY = 60
														
 
															+# The average number of requests Scrapy should be sending in parallel to
														
 
															+# each remote server
														
 
															+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
														
 
															+# Enable showing throttling stats for every response received:
														
 
															+#AUTOTHROTTLE_DEBUG = False
														
 
															+
														
 
															+# Enable and configure HTTP caching (disabled by default)
														
 
															+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
														
 
															+#HTTPCACHE_ENABLED = True
														
 
															+#HTTPCACHE_EXPIRATION_SECS = 0
														
 
															+#HTTPCACHE_DIR = 'httpcache'
														
 
															+#HTTPCACHE_IGNORE_HTTP_CODES = []
														
 
															+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
														
 
															+
														
 
# Connection parameters the MongoDB save pipeline passes to pymongo.MongoClient.
MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017  # port number
														
--- a/crawl_house/spiders/__init__.py
+++ b/crawl_house/spiders/__init__.py
@@ -0,0 +1,4 @@
 
															+# This package will contain the spiders of your Scrapy project
														
 
															+#
														
 
															+# Please refer to the documentation for information on how to create and manage
														
 
															+# your spiders.
														
--- a/crawl_house/spiders/beike.py
+++ b/crawl_house/spiders/beike.py
@@ -0,0 +1,24 @@
 
															+#!/usr/bin/env python
														
 
															+# -*- encoding: utf-8 -*-
														
 
															+'''
														
 
															+@Contact :   liuyuqi.gov@msn.cn
														
 
															+@Time    :   2023/09/22 17:45:53
														
 
															+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
														
 
															+@Desc    :   贝壳网
														
 
															+'''
														
 
															+
														
 
															+import scrapy
														
 
															+
														
 
															+class BeikeSpider(scrapy.Spider):
														
 
															+    name = 'beike'
														
 
															+    allowed_domains = ['cq.ke.com']
														
 
															+    start_urls = ['http://cq.ke.com/']
														
 
															+
														
 
															+    def parse(self, response):
														
 
															+        pass
														
 
															+    def start_requests(self):
														
 
															+        url = self.base_url + 'xiaoqu/'
														
 
															+        yield scrapy.Request(url=url, callback=self.parse_district_links)
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        pass
														
--- a/crawl_house/spiders/lianjia.py
+++ b/crawl_house/spiders/lianjia.py
@@ -0,0 +1,142 @@
 
															+# -*- coding: utf-8 -*-
														
 
import json
import re

import scrapy
from scrapy import Selector

try:
    from crawl_house.items import LianjiaVillageItem, LianjiaHouseItem
except ImportError:
    # This file lives in the crawl_house package; the house_spider path is
    # kept as a fallback for layouts that still use that package name.
    from house_spider.items import LianjiaVillageItem, LianjiaHouseItem
														
 
															+
														
 
															+
														
 
															+class LianjiaSpider(scrapy.Spider):
														
 
															+    name = 'lianjia'
														
 
															+    allowed_domains = ['cq.lianjia.com']
														
 
															+    start_urls = ['cq.lianjia.com']
														
 
															+
														
 
    def __init__(self, **kwargs):
        """Initialise the spider and record the site root used to build URLs."""
        super().__init__(**kwargs)
        # Scheme and host without a trailing slash; site-relative paths are
        # appended to it by the parse callbacks.
        self.base_url = 'https://cq.lianjia.com'
														
 
															+
														
 
															+    def start_requests(self):
														
 
															+        request_url = 'https://cq.lianjia.com/xiaoqu/'
														
 
															+        yield scrapy.Request(url=request_url, callback=self.parse_district_links)
														
 
															+
														
 
															+    def parse_district_links(self, response):
														
 
															+        """提取地区链接"""
														
 
															+        sel = Selector(response)
														
 
															+        links = sel.css("div[data-role='ershoufang'] div:first-child a::attr(href)").extract()
														
 
															+        for link in links:
														
 
															+            url = self.base_url + link
														
 
															+            yield scrapy.Request(url=url, callback=self.parse_bizcircle_links)
														
 
															+
														
 
															+    def parse_bizcircle_links(self, response):
														
 
															+        """提取商圈链接"""
														
 
															+        sel = Selector(response)
														
 
															+        links = sel.css("div[data-role='ershoufang'] div:nth-child(2) a::attr(href)").extract()
														
 
															+        for link in links:
														
 
															+            url = self.base_url + link
														
 
															+            yield scrapy.Request(url=url, callback=self.parse_village_list, meta={"ref": url})
														
 
															+
														
 
															+    def parse_village_list(self, response):
														
 
															+        """提取小区链接"""
														
 
															+        sel = Selector(response)
														
 
															+        links = sel.css(".listContent .xiaoquListItem .img::attr(href)").extract()
														
 
															+        for link in links:
														
 
															+            yield scrapy.Request(url=link, callback=self.parse_village_detail)
														
 
															+
														
 
															+        # page
														
 
															+        page_data = sel.css(".house-lst-page-box::attr(page-data)").extract_first()
														
 
															+        page_data = json.loads(page_data)
														
 
															+        if page_data['curPage'] < page_data['totalPage']:
														
 
															+            url = response.meta["ref"] + 'pg' + str(page_data['curPage'] + 1)
														
 
															+            yield scrapy.Request(url=url, callback=self.parse_village_list, meta=response.meta)
														
 
															+
														
 
															+    def parse_village_detail(self, response):
														
 
															+        """提取小区详情"""
														
 
															+        village_url = response.url
														
 
															+        sel = Selector(response)
														
 
															+        zone = sel.css('.xiaoquDetailbreadCrumbs .l-txt a::text').extract()
														
 
															+        latitude = 0
														
 
															+        longitude = 0
														
 
															+        try:
														
 
															+            html = response.body.decode().replace('\r', '')
														
 
															+            local = html[html.find('resblockPosition:'):html.find('resblockName') - 1]
														
 
															+            m = re.search('(\d.*\d),(\d.*\d)', local)
														
 
															+            longitude = m.group(1)
														
 
															+            latitude = m.group(2)
														
 
															+        except Exception:
														
 
															+            pass
														
 
															+
														
 
															+        item = LianjiaVillageItem()
														
 
															+        item['id'] = village_url.replace(self.base_url + '/xiaoqu/', '').replace('/', '')
														
 
															+        item['name'] = sel.css('.detailHeader .detailTitle::text').extract_first()
														
 
															+        item['address'] = sel.css('.detailHeader .detailDesc::text').extract_first()
														
 
															+        item['latitude'] = latitude
														
 
															+        item['longitude'] = longitude
														
 
															+        item['zone'] = ','.join(zone)
														
 
															+        item['year'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(1) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['build_type'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(2) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['property_costs'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(3) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['property_company'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(4) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['developers'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(5) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['buildings'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(6) .xiaoquInfoContent::text').extract_first()
														
 
															+        item['total_house'] = sel.css('.xiaoquInfo .xiaoquInfoItem:nth-child(7) .xiaoquInfoContent::text').extract_first()
														
 
															+
														
 
															+        print(item['name'])
														
 
															+        yield item
														
 
															+
														
 
															+        # 小区房源 https://cq.lianjia.com/ershoufang/c3620038190566370/
														
 
															+        url = self.base_url + "/ershoufang/c" + item['id'] + "/"
														
 
															+        yield scrapy.Request(url=url, callback=self.parse_house_list, meta={"ref": url})
														
 
															+
														
 
															+    def parse_house_list(self, response):
														
 
															+        """提取房源链接"""
														
 
															+        sel = Selector(response)
														
 
															+        # 链家有时小区查询不到数据
														
 
															+        total = sel.css('.resultDes .total span::text').extract_first()
														
 
															+        total = int(total)
														
 
															+        if total > 0:
														
 
															+            # 提取房源链接
														
 
															+            links = sel.css(".sellListContent li .info .title a::attr(href)").extract()
														
 
															+            for link in links:
														
 
															+                yield scrapy.Request(url=link, callback=self.parse_house_detail)
														
 
															+            # 链接分页
														
 
															+            page_data = sel.css(".house-lst-page-box::attr(page-data)").extract_first()
														
 
															+            page_data = json.loads(page_data)
														
 
															+            if page_data['curPage'] == 1 and page_data['totalPage'] > 1:
														
 
															+                price = response.url.replace(self.base_url + '/ershoufang/', '')
														
 
															+                for x in range(2, page_data['totalPage'] + 1, 1):
														
 
															+                    url = self.base_url + '/ershoufang/' + 'pg' + str(x) + price
														
 
															+                    yield scrapy.Request(url=url, callback=self.parse_house_list)
														
 
															+
														
 
															+    def parse_house_detail(self, response):
														
 
															+        """提取房源信息"""
														
 
															+        sel = Selector(response)
														
 
															+
														
 
															+        item = LianjiaHouseItem()
														
 
															+        item['房屋Id'] = response.url.replace(self.base_url + '/ershoufang/', '').replace('.html', '')
														
 
															+        item['标题'] = sel.css('.title-wrapper .title .main::text').extract_first()
														
 
															+        item['售价'] = sel.css('.overview .content .price .total::text').extract_first()
														
 
															+        item['小区'] = sel.css('.overview .content .aroundInfo .communityName a.info::text').extract_first()
														
 
															+        item['小区ID'] = sel.css('.overview .content .aroundInfo .communityName a.info::attr(href)').extract_first().replace('/xiaoqu/', '').replace('/', '')
														
 
															+        item['房屋户型'] = sel.css('#introduction .base .content ul li:nth-child(1)::text').extract_first()
														
 
															+        item['所在楼层'] = sel.css('#introduction .base .content ul li:nth-child(2)::text').extract_first()
														
 
															+        item['建筑面积'] = sel.css('#introduction .base .content ul li:nth-child(3)::text').extract_first()
														
 
															+        item['户型结构'] = sel.css('#introduction .base .content ul li:nth-child(4)::text').extract_first()
														
 
															+        item['套内面积'] = sel.css('#introduction .base .content ul li:nth-child(5)::text').extract_first()
														
 
															+        item['建筑类型'] = sel.css('#introduction .base .content ul li:nth-child(6)::text').extract_first()
														
 
															+        item['房屋朝向'] = sel.css('#introduction .base .content ul li:nth-child(7)::text').extract_first()
														
 
															+        item['建筑结构'] = sel.css('#introduction .base .content ul li:nth-child(8)::text').extract_first()
														
 
															+        item['装修情况'] = sel.css('#introduction .base .content ul li:nth-child(9)::text').extract_first()
														
 
															+        item['梯户比例'] = sel.css('#introduction .base .content ul li:nth-child(10)::text').extract_first()
														
 
															+        item['配备电梯'] = sel.css('#introduction .base .content ul li:nth-child(11)::text').extract_first()
														
 
															+        item['产权年限'] = sel.css('#introduction .base .content ul li:nth-child(12)::text').extract_first()
														
 
															+        item['挂牌时间'] = sel.css('#introduction .transaction .content ul li:nth-child(1) span:nth-child(2)::text').extract_first()
														
 
															+        item['交易权属'] = sel.css('#introduction .transaction .content ul li:nth-child(2) span:nth-child(2)::text').extract_first()
														
 
															+        item['上次交易'] = sel.css('#introduction .transaction .content ul li:nth-child(3) span:nth-child(2)::text').extract_first()
														
 
															+        item['房屋用途'] = sel.css('#introduction .transaction .content ul li:nth-child(4) span:nth-child(2)::text').extract_first()
														
 
															+        item['房屋年限'] = sel.css('#introduction .transaction .content ul li:nth-child(5) span:nth-child(2)::text').extract_first()
														
 
															+        item['产权所属'] = sel.css('#introduction .transaction .content ul li:nth-child(6) span:nth-child(2)::text').extract_first()
														
 
															+        item['抵押信息'] = sel.css('#introduction .transaction .content ul li:nth-child(7) span:nth-child(2)::attr(title)').extract_first()
														
 
															+        item['房本备件'] = sel.css('#introduction .transaction .content ul li:nth-child(8) span:nth-child(2)::text').extract_first()
														
 
															+
														
 
															+        yield item
														
--- a/crawl_secondhand/extractor/lianjia/getData.py
+++ b/crawl_secondhand/extractor/lianjia/getData.py
@@ -0,0 +1,99 @@
 
															+# coding=utf-8
														
 
															+'''
														
 
															+Created on 2017年7月1日
														
 
															+@version:python3.6
														
 
															+@author: liuyuqi
														
 
															+'''
														
 
															+import random
														
 
															+import time
														
 
															+
														
 
															+# 导入开发模块
														
 
															+import requests
														
 
															+from bs4 import BeautifulSoup
														
 
															+from nt import chdir
														
 
															+
														
 
															+# 定义空列表，用于创建所有的爬虫链接
														
 
															+urls = []
														
 
															+# 指定爬虫所需的上海各个区域名称
														
 
															+# citys = ['pudongxinqu','minhang','baoshan','xuhui','putuo','yangpu','changning','songjiang',
														
 
															+#          'jiading','huangpu','jinan','zhabei','hongkou','qingpu','fengxian','jinshan','chongming']
														
 
															+citys = ['pudongxinqu']
														
 
															+
														
 
															+workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
														
 
															+resultFile = "lianjia.csv"
														
 
															+
														
 
															+
														
 
															+class Lianjia(object):
														
 
															+    def __init__(self):
														
 
															+        super(Lianjia, self).__init__()
														
 
															+        self.session = requests.Session()
														
 
															+        headers = {
														
 
															+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
														
 
															+            'Referer': 'http://sh.lianjia.com/ershoufang/',
														
 
															+        }
														
 
															+        self.session.headers.update(headers)
														
 
															+
														
 
															+    def getUrls(self):
														
 
															+        # 基于for循环，构造完整的爬虫链接
														
 
															+        for i in citys:
														
 
															+            url = 'http://sh.lianjia.com/ershoufang/%s/' % i
														
 
															+            res = self.session.get(url)  # 发送get请求
														
 
															+            res = res.text.encode(res.encoding).decode('utf-8')  # 需要转码，否则会有问题
														
 
															+            soup = BeautifulSoup(res, 'html.parser')  # 使用bs4模块，对响应的链接源代码进行html解析
														
 
															+            page = soup.findAll('div', {'class': 'c-pagination'})  # 使用finalAll方法，获取指定标签和属性下的内容
														
 
															+            pages = [i.strip() for i in page[0].text.split('\n')]  # 抓取出每个区域的二手房链接中所有的页数
														
 
															+            if len(pages) > 3:
														
 
															+                total_pages = int(pages[-3])
														
 
															+            else:
														
 
															+                total_pages = int(pages[-2])
														
 
															+
														
 
															+            for j in list(range(1, total_pages + 1)):  # 拼接所有需要爬虫的链接
														
 
															+                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (i, j))
														
 
															+            #         随机睡眠2-10s
														
 
															+        time.sleep(random.randint(2, 10))
														
 
															+
														
 
															+    def mSpider(self):
														
 
															+        #         获取所有url
														
 
															+        self.getUrls()
														
 
															+        # 创建csv文件，用于后面的保存数据
														
 
															+        file = open(resultFile, 'w', encoding='utf-8')
														
 
															+
														
 
															+        for url in urls:  # 基于for循环，抓取出所有满足条件的标签和属性列表，存放在find_all中
														
 
															+            res = requests.get(url)
														
 
															+            res = res.text.encode(res.encoding).decode('utf-8')
														
 
															+            soup = BeautifulSoup(res, 'html.parser')
														
 
															+            find_all = soup.find_all(name='div', attrs={'class': 'info'})
														
 
															+
														
 
															+            for i in list(range(len(find_all))):  # 基于for循环，抓取出所需的各个字段信息
														
 
															+                title = find_all[i].find('a')['title']  # 每套二手房的标语
														
 
															+
														
 
															+                res2 = find_all[i]
														
 
															+                name = res2.find_all('div', {'class': 'where'})[0].find_all('span')[0].text  # 每套二手房的小区名称
														
 
															+                room_type = res2.find_all('div', {'class': 'where'})[0].find_all('span')[1].text  # 每套二手房的户型
														
 
															+                size = res2.find_all('div', {'class': 'where'})[0].find_all('span')[2].text[:-3]  # 每套二手房的面积
														
 
															+
														
 
															+                # 采用列表解析式，删除字符串的首位空格
														
 
															+                info = [i.strip() for i in res2.find_all('div', {'class': 'con'})[0].text.split('\n')]
														
 
															+                region = info[1]  # 每套二手房所属的区域
														
 
															+                loucheng = info[2][2:]  # 每套二手房所在的楼层
														
 
															+                chaoxiang = info[5][2:]  # 每套二手房的朝向
														
 
															+                builtdate = info[-3][2:]  # 每套二手房的建筑时间
														
 
															+
														
 
															+                # 每套二手房的总价
														
 
															+                price = find_all[i].find('div', {'class': 'price'}).text.strip()[:-1]
														
 
															+                # 每套二手房的平方米售价
														
 
															+                price_union = find_all[i].find('div', {'class': 'price-pre'}).text.strip()[:-3]
														
 
															+
														
 
															+                # print(name,room_type,size,region,loucheng,chaoxiang,price,price_union,builtdate)
														
 
															+                # 将上面的各字段信息值写入并保存到csv文件中
														
 
															+                file.write(','.join(
														
 
															+                    (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
														
 
															+
														
 
															+        # 关闭文件（否则数据不会写入到csv文件中）
														
 
															+        file.close()
														
 
															+
														
 
															+
														
 
															+chdir(workSpace)
														
 
															+jia = Lianjia()
														
 
															+jia.mSpider()
														
 
															+print(urls)
														
--- a/crawl_secondhand/extractor/lianjia/gethouse.py
+++ b/crawl_secondhand/extractor/lianjia/gethouse.py
@@ -0,0 +1,83 @@
 
															+# coding=utf-8
														
 
															+'''
														
 
															+Created on 2017年5月16日
														
 
															+@version:python3.6
														
 
															+@author: liuyuqi
														
 
															+'''
														
 
															+# 导入开发模块
														
 
															+import requests
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															+# 定义空列表，用于创建所有的爬虫链接
														
 
															+urls = []
														
 
															+# 指定爬虫所需的上海各个区域名称
														
 
															+citys1 = ['pudongxinqu', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu', 'changning', 'songjiang',
														
 
															+          'jiading', 'huangpu', 'jinan', 'zhabei', 'hongkou', 'qingpu', 'fengxian', 'jinshan', 'chongming']
														
 
															+
														
 
															+citys = ['pudongxinqu']
														
 
															+
														
 
															+data = {"user": "user", "password": "pass"}
														
 
															+headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
														
 
															+           "Accept-Encoding": "gzip",
														
 
															+           "Accept-Language": "zh-CN,zh;q=0.8",
														
 
															+           "Referer": "http://www.example.com/",
														
 
															+           "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
														
 
															+           }
														
 
															+# 基于for循环，构造完整的爬虫链接
														
 
															+for i in citys:
														
 
															+    url = 'http://sh.lianjia.com/ershoufang/%s/' % i
														
 
															+    res = requests.get(url, headers=headers)
														
 
															+    res = res.text.encode(res.encoding).decode('utf-8')  # 需要转码，否则会有问题
														
 
															+    soup = BeautifulSoup(res, 'html.parser')  # 使用bs4模块，对响应的链接源代码进行html解析
														
 
															+    page = soup.findAll('div', {'class': 'c-pagination'})  # 使用finalAll方法，获取指定标签和属性下的内容
														
 
															+    pages = [i.strip() for i in page[0].text.split('\n')]  # 抓取出每个区域的二手房链接中所有的页数
														
 
															+    # print(pages)
														
 
															+    if len(pages) > 3:
														
 
															+        total_pages = int(pages[-3])
														
 
															+    else:
														
 
															+        total_pages = int(pages[-2])
														
 
															+
														
 
															+    for j in list(range(1, total_pages + 1)):  # 拼接所有需要爬虫的链接
														
 
															+        urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (i, j))
														
 
															+        # print(urls)
														
 
															+urlss = []
														
 
															+for i in range(0, 1):
														
 
															+    urlss.append(urls[i])
														
 
															+# print(urls)
														
 
															+# exit()
														
 
															+##############写文件###################
														
 
															+# 创建csv文件，用于后面的保存数据
														
 
															+file = open('lianjia.csv', 'w', encoding='utf-8')
														
 
															+
														
 
															+for url in urls:  # 基于for循环，抓取出所有满足条件的标签和属性列表，存放在find_all中
														
 
															+    res = requests.get(url)
														
 
															+    res = res.text.encode(res.encoding).decode('utf-8')
														
 
															+    soup = BeautifulSoup(res, 'html.parser')
														
 
															+    find_all = soup.find_all(name='div', attrs={'class': 'info-panel'})
														
 
															+
														
 
															+    for i in list(range(len(find_all))):  # 基于for循环，抓取出所需的各个字段信息
														
 
															+        title = find_all[i].find('a')['title']  # 每套二手房的标语
														
 
															+
														
 
															+        res2 = find_all[i]
														
 
															+        name = res2.find_all('div', {'class': 'where'})[0].find_all('span')[0].text  # 每套二手房的小区名称
														
 
															+        room_type = res2.find_all('div', {'class': 'where'})[0].find_all('span')[1].text  # 每套二手房的户型
														
 
															+        size = res2.find_all('div', {'class': 'where'})[0].find_all('span')[2].text[:-3]  # 每套二手房的面积
														
 
															+
														
 
															+        # 采用列表解析式，删除字符串的首位空格
														
 
															+        info = [i.strip() for i in res2.find_all('div', {'class': 'con'})[0].text.split('\n')]
														
 
															+        region = info[1]  # 每套二手房所属的区域
														
 
															+        loucheng = info[2][2:]  # 每套二手房所在的楼层
														
 
															+        chaoxiang = info[5][2:]  # 每套二手房的朝向
														
 
															+        builtdate = info[-3][2:]  # 每套二手房的建筑时间
														
 
															+
														
 
															+        # 每套二手房的总价
														
 
															+        price = find_all[i].find('div', {'class': 'price'}).text.strip()[:-1]
														
 
															+        # 每套二手房的平方米售价
														
 
															+        price_union = find_all[i].find('div', {'class': 'price-pre'}).text.strip()[:-3]
														
 
															+
														
 
															+        # print(name,room_type,size,region,loucheng,chaoxiang,price,price_union,builtdate)
														
 
															+        # 将上面的各字段信息值写入并保存到csv文件中
														
 
															+        file.write(','.join((name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
														
 
															+
														
 
															+# 关闭文件（否则数据不会写入到csv文件中）
														
 
															+file.close()
														
--- a/crawl_secondhand/options.py
+++ b/crawl_secondhand/options.py
@@ -0,0 +1,81 @@
 
															+#!/usr/bin/env python

														
 
															+# -*- encoding: utf-8 -*-

														
 
															+'''

														
 
															+@Contact :   liuyuqi.gov@msn.cn

														
 
															+@Time    :   2024/07/30 18:46:33

														
 
															+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.

														
 
															+@Desc    :   read command line params and config file

														
 
															+'''

														
 
															+

														
 
															+import argparse

														
 
															+import os

														
 
															+import shlex

														
 
															+import dotenv

														
 
															+from collections import OrderedDict

														
 
															+from .utils.str_util import preferredencoding

														
 
															+

														
 
															+def parser_args(overrideArguments=None):

														
 
															+    """解析参数"""

														
 
															+    argparser = argparse.ArgumentParser()

														
 
															+    argparser.add_argument('-c', '--config', help='config file', default='config.ini')

														
 
															+    argparser.add_argument(

														
 
															+        'command',

														
 
															+        help='command: ',

														
 
															+        choices=['create', 'clone', 'push', 'delete', 'pull'],

														
 
															+    )

														
 
															+    argparser.add_argument('-d', '--debug', help='debug mode', action='store_true')

														
 
															+    argparser.add_argument(

														
 
															+        '-p',

														
 
															+        '--platform',

														
 
															+        help='set a platform',

														
 
															+        choices=['github', 'gitee', 'gitlab', 'gogs', 'gitea', 'bitbucket', 'coding'],

														
 
															+        default='github',

														
 
															+    )

														
 
															+    argparser.add_argument('-token', '--token', help='set a token')

														
 
															+    argparser.add_argument(

														
 
															+        '-repo_path', '--repo_path', help='set a repo'

														
 
															+    )  # , default=os.getcwd())

														
 
															+    args = argparser.parse_args()

														
 
															+

														
 
															+    # remove None

														
 
															+    command_line_conf = OrderedDict(

														
 
															+        {k: v for k, v in args.__dict__.items() if v is not None}

														
 
															+    )

														
 
															+

														
 
															+    system_conf = user_conf = custom_conf = OrderedDict()

														
 
															+    user_conf = _read_user_conf()

														
 
															+

														
 
															+    if args.config:

														
 
															+        custom_conf = _read_custom_conf(args.config)

														
 
															+

														
 
															+    system_conf.update(user_conf)

														
 
															+    system_conf.update(command_line_conf)

														
 
															+    if args.command == None and args.extractor == None:

														
 
															+        raise 'Error, please input cmd and extractor params11'

														
 
															+    return system_conf

														
 
															+

														
 
															+

														
 
															+def _read_custom_conf(config_path: str) -> OrderedDict:

														
 
															+    """读取自定义配置文件 config.yaml"""

														
 
															+

														
 
															+    def compat_shlex_split(s, comments=False, posix=True):

														
 
															+        if isinstance(s, str):

														
 
															+            s = s.encode('utf-8')

														
 
															+        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))

														
 
															+

														
 
															+    try:

														
 
															+        with open(config_path, 'r', encoding=preferredencoding()) as f:

														
 
															+            contents = f.read()

														
 
															+            res = compat_shlex_split(contents, comments=True)

														
 
															+    except Exception as e:

														
 
															+        return []

														
 
															+    return res

														
 
															+

														
 
															+

														
 
															+def _read_user_conf() -> OrderedDict:

														
 
															+    """读取用户配置文件: .env 文件"""

														
 
															+    user_conf = OrderedDict()

														
 
															+    dotenv_path = '.env'

														
 
															+    if os.path.exists(dotenv_path):

														
 
															+        user_conf = dotenv.dotenv_values(dotenv_path)

														
 
															+    return OrderedDict(user_conf)

														
--- a/crawl_secondhand/paipai.py
+++ b/crawl_secondhand/paipai.py
@@ -1,5 +0,0 @@
 
															-from .secondhand import Secondhand
														
 
															-class Paipai(Secondhand):
														
 
															-    ''' 京东拍拍二手商品 '''
														
 
															-    def __init__(self):
														
 
															-        pass
														
--- a/crawl_secondhand/utils/__init__.py
+++ b/crawl_secondhand/utils/__init__.py
--- a/crawl_secondhand/utils/str_util.py
+++ b/crawl_secondhand/utils/str_util.py
@@ -0,0 +1,71 @@
 
															+#!/usr/bin/env python

														
 
															+# -*- encoding: utf-8 -*-

														
 
															+"""

														
 
															+@Contact :   liuyuqi.gov@msn.cn

														
 
															+@Time    :   2023/10/31 17:06:37

														
 
															+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.

														
 
															+@Desc    :   字符串工具类

														
 
															+"""

														
 
															+import argparse

														
 
															+import locale

														
 
															+import re

														
 
															+import sys

														
 
															+

														
 
															+

														
 
															+def compat_register_utf8():

														
 
															+    """win 兼容utf-8编码"""

														
 
															+    if sys.platform == 'win32':

														
 
															+        from codecs import register, lookup

														
 
															+

														
 
															+        register(lambda name: lookup('utf-8') if name == 'cp65001' else None)

														
 
															+

														
 
															+

														
 
															+def preferredencoding():

														
 
															+    """Get preferred encoding.

														
 
															+

														
 
															+    Returns the best encoding scheme for the system, based on

														
 
															+    locale.getpreferredencoding() and some further tweaks.

														
 
															+    """

														
 
															+    try:

														
 
															+        pref = locale.getpreferredencoding()

														
 
															+        'TEST'.encode(pref)

														
 
															+    except Exception:

														
 
															+        pref = 'UTF-8'

														
 
															+

														
 
															+    return pref

														
 
															+

														
 
															+

														
 
															+def SpCharReplace(char):

														
 
															+    """特殊字符替换"""

														
 
															+    temp = str(char)

														
 
															+    for i in temp:

														
 
															+        if '<' == i:

														
 
															+            char = char.replace('<', '《')

														
 
															+        if '>' == i:

														
 
															+            char = char.replace('>', '》')

														
 
															+        if "'" == i:

														
 
															+            char = char.replace("'", '')  # 处理单引号

														
 
															+        if '\\' == i:

														
 
															+            char = char.replace('\\', '')  # 处理反斜杠\

														
 
															+        if '"' == i:

														
 
															+            char = char.replace('"', '`')  # 处理双引号"

														
 
															+        if '&' == i:

														
 
															+            char = char.replace('&', '-')  # 处理&号"

														
 
															+        if '|' == i:

														
 
															+            char = char.replace('|', '')  # 处理&号

														
 
															+        if '@' == i:

														
 
															+            char = char.replace('@', '.')  # 处理@号

														
 
															+        if '%' == i:

														
 
															+            char = char.replace('%', '`')  # 处理单引号

														
 
															+        if '*' == i:

														
 
															+            char = char.replace('*', '`')  # 处理反斜杠\

														
 
															+        if '("' == i:

														
 
															+            char = char.replace('"', '`')  # 处理双引号"

														
 
															+        if ')"' == i:

														
 
															+            char = char.replace(')"', '`')

														
 
															+        if '-' == i:

														
 
															+            char = char.replace('-', '`')  # 处理&号

														
 
															+        if 'ÐÂÎÅ' == i:

														
 
															+            char = char.replace('ÐÂÎÅ', '`')  # 处理ÐÂÎÅ

														
 
															+        # 在后面扩展其他特殊字符

														
 
															+    return char

														
--- a/docs/58.http
+++ b/docs/58.http
@@ -0,0 +1,8 @@
 
															+### 二手车
														
 
															+
														
 
															+
														
 
															+### 二手房
														
 
															+
														
 
															+
														
 
															+### 二手商品
														
 
															+
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,12 @@ requests
 
															 flask
														
 
															 dotenv
														
 
															 pandas
														
 
															+
														
 
															+
														
 
															+Scrapy==2.9.0
														
 
															+PyMySQL==0.9.3
														
 
															+pymongo==3.10.1
														
 
															+urllib3==1.23
														
 
															+requests==2.19.1
														
 
															+
														
 
															+
														
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
 
															+# Automatically created by: scrapy startproject
														
 
															+#
														
 
															+# For more information about the [deploy] section see:
														
 
															+# https://scrapyd.readthedocs.io/en/latest/deploy.html
														
 
															+
														
 
															+[settings]
														
 
															+default = house_spider.settings
														
 
															+
														
 
															+[deploy]
														
 
															+#url = http://localhost:6800/
														
 
															+project = house_spider
														
--- a/test/compare-lxml-beautiful.py
+++ b/test/compare-lxml-beautiful.py
@@ -0,0 +1,31 @@
 
															+# coding=utf-8
														
 
															+'''
														
 
															+lxml和bs4解析html对比
														
 
															+例子，通过两种方法，把百度所有产品打印出来。
														
 
															+Created on 2017年7月3日
														
 
															+@vsersion:python3.6
														
 
															+@author: liuyuqi
														
 
															+'''
														
 
															+import requests
														
 
															+from bs4 import BeautifulSoup
														
 
															+from lxml import etree
														
 
															+
														
 
															+url = "https://www.baidu.com/more/"
														
 
															+res = requests.get(url)
														
 
															+html = res.text.encode(res.encoding).decode('utf-8')
														
 
															+
														
 
															+# 使用beautiful解析
														
 
															+soup = BeautifulSoup(html, 'lxml')
														
 
															+titles = soup.findAll('div', {'class': 'con'})
														
 
															+print(len(titles))
														
 
															+for title in titles:
														
 
															+    print(soup.find_all('a')[1].text)  # 不好抓取
														
 
															+
														
 
															+# 使用lxml解析
														
 
															+# //*[@id="content"]/div[1]/div[2]/a
														
 
															+# //*[@id="content"]/div[2]/div[2]/a
														
 
															+
														
 
															+selector = etree.HTML(html)
														
 
															+titles = selector.xpath('//*[@id="content"]/div/div/a/text()')
														
 
															+for title in titles:
														
 
															+    print(title)
														
--- a/test/test.py
+++ b/test/test.py
@@ -0,0 +1,37 @@
 
															+# coding=utf-8
														
 
															+'''
														
 
															+Created on 2017年7月1日
														
 
															+@vsersion:python3.6
														
 
															+@author: liuyuqi
														
 
															+'''
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															+url = 'http://sh.lianjia.com/ershoufang/pudongxinqu'
														
 
															+
														
 
															+# res=requests.get(url)
														
 
															+# res=res.text.encode(res.encoding).decode('utf-8')
														
 
															+# file = open("resultFile.txt",'w',encoding = 'utf-8')
														
 
															+# file.write(res)
														
 
															+file = open("resultFile.txt", 'r', encoding='UTF-8')
														
 
															+try:
														
 
															+    res = file.read()
														
 
															+finally:
														
 
															+    file.close()
														
 
															+
														
 
															+soup = BeautifulSoup(res, 'html.parser')  # 使用bs4模块，对响应的链接源代码进行html解析
														
 
															+page = soup.findAll('div', {'class': 'c-pagination'})
														
 
															+pages = [i.strip() for i in page[0].text.split('\n')]  # 抓取出每个区域的二手房链接中所有的页数
														
 
															+if len(pages) > 3:
														
 
															+    total_pages = int(pages[-3])
														
 
															+else:
														
 
															+    total_pages = int(pages[-2])
														
 
															+# print(total_pages)
														
 
															+find_all = soup.find_all(name='div', attrs={'class': 'info'})
														
 
															+# print(len(find_all))
														
 
															+res2 = find_all[1]
														
 
															+title = res2.find('a')['title']
														
 
															+print(res2)
														
 
															+name = res2.find_all('div', {'class': 'info-row'})[1].find_all('span')[0].text  # 每套二手房的小区名称
														
 
															+room_type = res2.find_all('div', {'class': 'info-row'})[0].find_all('span')[1].text  # 每套二手房的户型
														
 
															+# size = res2.find_all('div',{'class':'info-row'})[0].find_all('span')[2].text[:-3] # 每套二手房的面积
														
 
															+print(room_type)
														
--- a/test/testPost.py
+++ b/test/testPost.py
@@ -0,0 +1,40 @@
 
															+'''
														
 
															+Created on 2017年5月16日
														
 
															+@vsersion:python3.6
														
 
															+@author: liuyuqi
														
 
															+'''
														
 
															+##登录
														
 
															+# -*- coding:utf-8 -*-
														
 
															+
														
 
															+import requests
														
 
															+
														
 
															+s = requests.Session()
														
 
															+url1 = 'http://www.exanple.com/login'  # 登陆地址
														
 
															+url2 = "http://www.example.com/main"  # 需要登陆才能访问的页面地址
														
 
															+data = {"user": "user", "password": "pass"}
														
 
															+headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
														
 
															+           "Accept-Encoding": "gzip",
														
 
															+           "Accept-Language": "zh-CN,zh;q=0.8",
														
 
															+           "Referer": "http://www.example.com/",
														
 
															+           "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
														
 
															+           }
														
 
															+res1 = s.post(url1, data=data)  # 登录
														
 
															+
														
 
															+res2 = s.post(url2)  # 抓取
														
 
															+res3 = requests.get(url2, cookies=res1.cookies, headers=headers)
														
 
															+
														
 
															+print(res2.content)  # 获得二进制响应内容
														
 
															+print(res2.raw)  # 获得原始响应内容,需要stream=True
														
 
															+print(res2.raw.read(50))
														
 
															+print(type(res2.text))  # 返回解码成unicode的内容
														
 
															+print(res2.url)
														
 
															+print(res2.history)  # 追踪重定向
														
 
															+print(res2.cookies)
														
 
															+print(res2.cookies['example_cookie_name'])
														
 
															+print(res2.headers)
														
 
															+print(res2.headers['Content-Type'])
														
 
															+print(res2.headers.get('content-type'))
														
 
															+print(res2.json)  # 讲返回内容编码为json
														
 
															+print(res2.encoding)  # 返回内容编码
														
 
															+print(res2.status_code)  # 返回http状态码
														
 
															+print(res2.raise_for_status())  # 返回错误状态码