Browse Source

add beike

liuyuqi-dellpc 1 year ago
parent
commit
9ede119569
8 changed files with 442 additions and 4 deletions
  1. 14 4
      README.md
  2. 74 0
      analysis/贝壳网房产分析.ipynb
  3. 6 0
      beike/__init__.py
  4. 20 0
      beike/api.py
  5. 154 0
      beike/beike.py
  6. 119 0
      beike/phone.py
  7. 24 0
      gui.py
  8. 31 0
      main.py

+ 14 - 4
README.md

@@ -1,7 +1,17 @@
-# 链家爬虫
-### 简介 
-    链家房源爬虫,通过小区信息爬取所有房源,基于scrapy
-### 用法
+# 房产数据分析
+
+对 链家,贝壳等网站,新房,二手房,租房等信息获取分析。
+
+## Usage
+
+链家:
+
     setting.py中配置MongoDB
     run run.py
 
+贝壳:
+
+```
+python main.py --website beike
+```
+

+ 74 - 0
analysis/贝壳网房产分析.ipynb

@@ -0,0 +1,74 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 贝壳网房产分析\n",
+    "\n",
+    "分析新房,二手房,租房交易"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os,sys,re\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "# import seaborn as sns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 数据预处理\n",
+    "def data_preprocessing():\n",
+    "    df=pd.read_csv('data.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

+ 6 - 0
beike/__init__.py

@@ -0,0 +1,6 @@
+from .beike import Beike
+
+
+def main(city='bj', save_type='csv'):
+    ''' Build a Beike crawler for one city and run it.
+
+    city      -- city sub-domain code passed to Beike (e.g. 'bj', 'cd').
+    save_type -- output format forwarded to Beike.
+
+    Beike() requires a city and run() expects a mapping of cities, so both
+    are supplied explicitly here instead of being called without arguments.
+    '''
+    beike = Beike(city, save_type)
+    beike.run({city: city})

+ 20 - 0
beike/api.py

@@ -0,0 +1,20 @@
+
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/29 20:02:10
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   接口
+'''
+
+
+# phone
+
+
+# pc
+
+# https://bj.ke.com/ershoufang/city/area/xiaoqu/ershoufang/
+# URL template for the desktop second-hand listing pages; the value must be a
+# string literal, a bare URL is a syntax error.
+_host = 'https://jian.ke.com/ershoufang/city/area/xiaoqu/ershoufang/'
+
+

+ 154 - 0
beike/beike.py

@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/29 22:03:35
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+import requests
+from lxml import etree
+import csv,os,sys,re
+import xlwt
+import pandas as pd
+import argparse
+import logging
+
+class Beike(object):
+    ''' Crawler for housing listings on ke.com (Beike). '''
+
+    # Column order shared by the scraped rows and every export format.
+    COLUMNS = ('title', 'price', 'path', 'sub', 'home', 'area', 'toward', 'house_type')
+    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
+
+    def __init__(self,city:str,save_type:str='csv'):
+        ''' Store the crawl settings and set up the HTTP session and logger.
+
+        city      -- default city sub-domain code used when run() gets none.
+        save_type -- 'csv' or 'excel'; selects the export done by run().
+        '''
+        self.city=city
+        self.save_type=save_type
+        self.sess=requests.session()
+        self.logger=logging.getLogger(__name__)
+        self.logger.setLevel(logging.INFO)
+        self.formatter=logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        self.ch=logging.StreamHandler()
+        self.ch.setLevel(logging.INFO)
+        self.ch.setFormatter(self.formatter)
+        # The logger is shared per module name; attach a handler only once so
+        # creating several Beike objects does not duplicate every log line.
+        if not self.logger.handlers:
+            self.logger.addHandler(self.ch)
+
+        # Parse an empty argument list: this yields the defaults without
+        # touching sys.argv, which belongs to the calling program.
+        self.args=self.get_args([])
+
+    def get_proxy(self):
+        ''' Return a proxy to use (not implemented yet). '''
+        pass
+
+    def set_proxy(self):
+        ''' Install a proxy on the session (not implemented yet). '''
+        pass
+
+    def get_args(self, argv=None):
+        ''' Parse crawler options and return the resulting namespace.
+
+        argv -- list of argument strings; None makes argparse read
+                sys.argv[1:]. Unknown options are ignored so the parser can
+                coexist with the options of an outer program.
+        '''
+        parser=argparse.ArgumentParser(description='贝壳网数据')
+        parser.add_argument('-c','--city',type=str,default=getattr(self,'city',None),help='城市')
+        parser.add_argument('-p','--page',type=int,default=1,help='页码')
+        args,_unknown=parser.parse_known_args(argv)
+        return args
+
+    def run(self,city:dict=None, type='ershoufang', max_page:int=100):
+        ''' Crawl every requested city page by page and export the rows.
+
+        city     -- mapping of display name -> city code, or a single city
+                    code string; None falls back to the constructor's city.
+        type     -- listing kind; only 'zufang' has a fetcher so far.
+        max_page -- highest page number to request per city.
+
+        Returns the list of collected rows (empty when nothing was fetched).
+        '''
+        if city is None:
+            city=self.city
+        citys={city:city} if isinstance(city,str) else dict(city)
+
+        fetchers={'zufang':self.get_zufang}
+        fetch=fetchers.get(type)
+        if fetch is None:
+            self.logger.warning('crawl type %r is not implemented yet', type)
+            return []
+
+        rows=[]
+        for name,code in citys.items():
+            for page in range(1,max_page+1):
+                page_rows=fetch(code,page)
+                if not page_rows:
+                    # An empty page means the listing ran out (or was blocked).
+                    break
+                rows.extend(page_rows)
+            self.logger.info('%s: %d rows collected so far', name, len(rows))
+
+        if rows:
+            if self.save_type=='excel':
+                self.save_xls(rows,'house.xls')
+            elif self.save_type=='csv':
+                self.save(pd.DataFrame(rows,columns=list(self.COLUMNS)),'data.csv')
+            else:
+                self.logger.warning('save type %r is not supported, nothing written', self.save_type)
+        return rows
+
+    @staticmethod
+    def _first_text(nodes):
+        ''' Return the first xpath text result stripped, or '' when empty. '''
+        return nodes[0].strip() if nodes else ''
+
+    def get_zufang(self,city:str,page:int=1):
+        ''' Fetch one page of rental listings and return it as a list of rows.
+
+        city -- city sub-domain code, e.g. 'cd' for https://cd.zu.ke.com.
+        page -- 1-based page number.
+
+        Each row is a list of strings ordered like COLUMNS. Returns [] when
+        the response is not HTTP 200 or contains no listing.
+        '''
+        url=f'https://{city}.zu.ke.com/zufang/pg{page}/#contentList'
+        headers={'User-Agent':self.USER_AGENT}
+        # Session cookies are credentials and must not live in source code;
+        # supply one through the BEIKE_COOKIE environment variable if needed.
+        cookie=os.environ.get('BEIKE_COOKIE')
+        if cookie:
+            headers['Cookie']=cookie
+
+        resp=self.sess.get(url=url,headers=headers,timeout=10)
+        if resp.status_code!=200:
+            self.logger.warning('GET %s returned HTTP %s', url, resp.status_code)
+            return []
+
+        # Parse the raw bytes with an HTML parser; the page is not valid XML.
+        parser=etree.HTMLParser(encoding='utf-8')
+        tree=etree.HTML(resp.content,parser=parser)
+        if tree is None:
+            return []
+
+        first=self._first_text
+        list_box=[]
+        for item in tree.xpath('//*[@id="content"]/div[1]/div[1]/div'):
+            title=first(item.xpath('./div/p[1]/a/text()'))
+            if not title:
+                # No title link under this div: presumably not a listing card.
+                continue
+            # When the first text node of the description contains '精选', the
+            # area / orientation / layout text nodes sit one position later.
+            jx=first(item.xpath('./div/p[2]/text()[1]'))
+            offset=6 if '精选' in jx else 5
+            list_box.append([
+                title,
+                first(item.xpath('./div/span/em/text()')),
+                first(item.xpath('./div/p[2]/a[1]/text()')),
+                first(item.xpath('./div/p[2]/a[2]/text()')),
+                first(item.xpath('./div/p[2]/a[3]/text()')),
+                first(item.xpath(f'./div/p[2]/text()[{offset}]')),
+                first(item.xpath(f'./div/p[2]/text()[{offset+1}]')),
+                first(item.xpath(f'./div/p[2]/text()[{offset+2}]')),
+            ])
+
+        self.logger.info('%s page %s: %d listings', city, page, len(list_box))
+        return list_box
+
+    @staticmethod
+    def save_xls(rows, filename:str='house.xls'):
+        ''' Write rows to an .xls workbook with COLUMNS as the header row.
+
+        rows     -- iterable of rows, each an iterable of cell values.
+        filename -- target path; an existing file is overwritten.
+        '''
+        wb=xlwt.Workbook()
+        sheet=wb.add_sheet('data')
+        for index,title in enumerate(Beike.COLUMNS):
+            sheet.write(0,index,title)
+        for i,item in enumerate(rows,start=1):
+            for j,data in enumerate(item):
+                sheet.write(i,j,data)
+        wb.save(filename)
+
+    def get_ershoufang(self):
+        ''' Fetch second-hand housing data (not implemented yet). '''
+        pass
+
+    def get_xinfang(self):
+        ''' Fetch new-home data (not implemented yet). '''
+        pass
+
+    @staticmethod
+    def save( df:pd.DataFrame, filename:str):
+        ''' Append df to filename, creating the file with a header if needed.
+
+        Files ending in .xls/.xlsx are written as Excel: to_excel cannot
+        append, so existing content is read back and rewritten together with
+        the new rows. Any other name is written as CSV in append mode.
+        '''
+        exists=os.path.exists(filename)
+        if filename.lower().endswith(('.xls','.xlsx')):
+            if exists:
+                df=pd.concat([pd.read_excel(filename),df],ignore_index=True)
+            df.to_excel(filename,index=False)
+        elif exists:
+            # Header and BOM are already in the file; add only the data rows.
+            df.to_csv(filename,index=False,header=False,mode='a',encoding='utf-8')
+        else:
+            df.to_csv(filename,index=False,encoding='utf-8-sig')
+
+if __name__=='__main__':
+    # Display name -> city sub-domain code.
+    citys = {
+        '成都':'cd',
+        '上海':'sh',
+        '绵阳':'mianyang',
+        "吉安":"jian"
+        }
+    # The constructor requires a city code; run() receives the full mapping.
+    beike=Beike('cd')
+    beike.run(citys, "ershoufang")

+ 119 - 0
beike/phone.py

@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/29 19:20:39
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   手机接口 
+'''
+import base64
+import hashlib
+import json
+import random
+import re
+import time
+
+import requests
+
+class KePhone(object):
+    ''' Client for the Beike (ke.com) mobile-app listing API. '''
+
+    def __init__(self):
+        # Rows handed to insert_guapai(); one list per listing.
+        self.guapai_rows = []
+
+    @staticmethod
+    def return_no(text):
+        ''' Return the first number found in text as a string, '' if none.
+
+        Thousands separators are removed first, so '1,250.5万' gives '1250.5'.
+        NOTE(review): the original helper is not part of this file; confirm
+        that callers expect the number as text rather than as a float.
+        '''
+        match = re.search(r'\d+(?:\.\d+)?', str(text).replace(',', ''))
+        return match.group() if match else ''
+
+    def insert_guapai(self, sql_data):
+        ''' Record one listing row.
+
+        NOTE(review): the database insert this name refers to is not defined
+        in this file; rows are kept in memory so the crawl can run. Override
+        this method to persist them.
+        '''
+        self.guapai_rows.append(sql_data)
+
+    def get_guapai_data(self, xiaoqu_id):
+        ''' Fetch the listings of one community and record each of them.
+
+        xiaoqu_id -- value sent as the 'condition' search parameter.
+
+        Every listing that has a 'houseCode' becomes one row (house code,
+        title, description, total price, community name, the basicList and
+        infoList values, mobile URL) that is passed to insert_guapai().
+        Returns the list of rows built from this response.
+        '''
+        guapai_url = 'https://app.api.ke.com/house/ershoufang/searchv5'
+        guapai_data = {
+            'fullFilters': '1',
+            'containerType': '2',
+            'limitCount': '20',
+            'condition': xiaoqu_id,
+            'cityId': '**********',
+            'limitOffset': '0'
+        }
+        # Header values are placeholders; capture real ones from the app.
+        headers = {
+            "x-req-id": "**********",
+            "Page-Schema": "ershou%2Flist",
+            "Referer": "ershoulistsearch",
+            "Cookie": "lianjia_udid=********;"
+                      "lianjia_ssid=**********;"
+                      "lianjia_uuid=**********",
+            "Lianjia-City-Id": "**********",
+            "User-Agent": "Beike2.31.0;Android MuMu; Android 6.0.1",
+            "Lianjia-Channel": "**********",
+            "Lianjia-Device-Id": "**********",
+            "Lianjia-Version": "2.31.0",
+            "Lianjia-Im-Version": "2.34.0",
+            "Lianjia-Recommend-Allowable": "1",
+            "Authorization": self.generateAuthorization(guapai_url, guapai_data),
+            "ip": "**********",
+            "wifi_name": "**********",
+            "lat": "**********",
+            "lng": "**********",
+            "Host": "app.api.ke.com",
+            "Connection": "Keep-Alive",
+            "Accept-Encoding": "gzip"
+        }
+        guapai_res = requests.get(guapai_url, headers=headers, params=guapai_data, timeout=10)
+        guapai_json = json.loads(guapai_res.text)
+
+        rows = []
+        if guapai_json.get('errno') != 0:
+            return rows
+
+        for i in guapai_json['data']['list']:
+            if 'houseCode' not in i:
+                continue
+            house_code = i['houseCode']
+            # house code, title, description, total price (digits only), community
+            sql_data = [
+                house_code,
+                i['title'],
+                i['desc'],
+                self.return_no(i['priceStr']),
+                i['communityName'],
+            ]
+            for basic_info in i.get('basicList', []):
+                sql_data.append(basic_info['value'])
+            for infoList_info in i.get('infoList', []):
+                infoList_data = infoList_info['value']
+                if infoList_info['name'] == '单价:':
+                    # Unit price: keep only the number.
+                    infoList_data = self.return_no(infoList_data)
+                sql_data.append(infoList_data)
+            # Mobile page URL of the listing (used for push notifications).
+            sql_data.append('https://m.ke.com/tj/ershoufang/' + house_code + '.html')
+            self.insert_guapai(sql_data)
+            rows.append(sql_data)
+
+        # Pause a random 2-10 seconds after a successful response.
+        time.sleep(random.randint(2, 10))
+        return rows
+
+    # Source: https://github.com/ShiJianYingxiang/origin/blob/master/fang_beike/fang_beike/spiders/ershou_viewer.py
+    def generateAuthorization(self, url, url_parm):
+        ''' Build the Authorization header value for a request.
+
+        url      -- unused; kept for interface compatibility.
+        url_parm -- dict of query parameters.
+
+        The parameters are sorted by key, joined as 'key=value' without a
+        separator, prefixed with the secret key and hashed with SHA-1; the
+        result is base64('<secret_id>:<hex digest>'). Values are converted
+        with str() so numeric parameters do not raise TypeError.
+        '''
+        secret_key = "d5e343d453aecca8b14b2dc687c381ca"
+        secret_id = "20180111_android"
+        url_parm_sort = sorted(url_parm.items(), key=lambda x: x[0])
+        p2 = secret_key + "".join(str(k) + "=" + str(v) for k, v in url_parm_sort)
+        v3 = hashlib.sha1(p2.encode('utf-8')).hexdigest()
+        v4 = secret_id + ":" + v3
+        return base64.b64encode(v4.encode("utf-8")).decode()
+
+
+if __name__ == "__main__":
+    pass

+ 24 - 0
gui.py

@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/30 01:35:43
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   gui
+'''
+
+# The package name is case-sensitive: it is PyQt5, not pyqt5.
+from PyQt5 import QtWidgets, QtGui, QtCore
+
+class HouseGui(QtWidgets.QWidget):
+    ''' Main window of the housing tool. '''
+
+    def __init__(self):
+        super(HouseGui, self).__init__()
+        self.init_ui()
+
+    def init_ui(self):
+        ''' Set size, title and icon of the window, then show it. '''
+        self.resize(800, 600)
+        self.setWindowTitle('house')
+        self.setWindowIcon(QtGui.QIcon('house.ico'))
+        self.show()
+
+if __name__=='__main__':
+    import sys
+
+    # A QApplication must exist before any widget is created.
+    app = QtWidgets.QApplication(sys.argv)
+    window = HouseGui()
+    sys.exit(app.exec_())

+ 31 - 0
main.py

@@ -0,0 +1,31 @@
+
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/30 01:26:35
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   enter point
+'''
+
+import argparse
+from beike import Beike
+
+if __name__=='__main__':
+    # Command-line entry point: pick the website, city and output format.
+    parser = argparse.ArgumentParser(description='beike')
+    parser.add_argument('--version', '-v', action='version', version='%(prog)s 1.0')
+    parser.add_argument('--website', '-web', type=str, default='beike', help='crawl which website, default is beike,lianjia')
+    parser.add_argument('--city', '-c', type=str, default='bj', help='crawl which city, default is bj')
+    # The default has to be one of the documented formats; 'xiaoqu' was not.
+    parser.add_argument('--save_type', '-save', type=str, default='csv', help='save which type, default is csv. csv,mysql,excel')
+    args=parser.parse_args()
+    if args.website == 'beike':
+        beike = Beike(args.city, args.save_type)
+        # run() expects a mapping of cities; pass the selected one explicitly.
+        beike.run({args.city: args.city})
+    elif args.website in ('lianjia', 'fangtianxia', 'anjuke', '58'):
+        print('%s is not supported yet' % args.website)
+    else:
+        parser.error('unknown website: %s' % args.website)