#!/usr/bin/env python # -*- encoding: utf-8 -*- ''' @Contact : liuyuqi.gov@msn.cn @Time : 2023/05/29 22:03:35 @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved. @Desc : ''' import requests from lxml import etree import csv,os,sys,re import xlwt import pandas as pd import argparse import logging class Beike(object): ''' 贝壳网数据 ''' def __init__(self,city:str,save_type:str='csv'): ''' 初始化 ''' self.sess=requests.session() self.logger=logging.getLogger(__name__) self.logger.setLevel(logging.INFO) self.formatter=logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') self.ch=logging.StreamHandler() self.ch.setLevel(logging.INFO) self.ch.setFormatter(self.formatter) self.logger.addHandler(self.ch) self.args=self.get_args() def get_proxy(self): ''' 获取代理 ''' pass def set_proxy(self): pass def get_args(self): ''' 获取参数 ''' parser=argparse.ArgumentParser(description='贝壳网数据') parser.add_argument('-c','--city',type=str,help='城市') parser.add_argument('-p','--page',type=int,default=1,help='页码') def run(self,city:dict, type='ershoufang'): for city in citys: for i in range(1,101): pass def get_zufang(self,city:str,page:int=1): ''' 获取租房数据 ''' headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', 'Cookie': 'select_city=510100; lianjia_uuid=db980944-7c31-4bcc-8aae-78cdd9ba138d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22181a93da768e3e-047b67455463da-26021a51-1327104-181a93da769efd%22%2C%22%24device_id%22%3A%22181a93da768e3e-047b67455463da-26021a51-1327104-181a93da769efd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E4%BB%98%E8%B4%B9%E5%B9%BF%E5%91%8A%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Fother.php%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E8%B4%9D%E5%A3%B3%E6%89%BE%E6%88%BF%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wychengdu%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; GUARANTEE_BANNER_SHOW=true; login_ucid=2000000245569411; lianjia_ssid=712b2a49-fc10-4962-8fea-379234735b80; lianjia_token=2.0013a6ea83788090ec020bc3b2663bffe3; lianjia_token_secure=2.0013a6ea83788090ec020bc3b2663bffe3; security_ticket=R2sd09nxk8Sm81pfJkvABUssf/StkYIdArkOZN0QqdFQvHLLFF7LBqExRYZbcQYf29gNoQKxH9O7MpAHGu2v73TQnJFksR9KIw/NJ+itit98LyB2Ncs/DgCZmgm3w0ypGjqFK49ik+KFpzJtvr4ukZOE94RSX5+eLymj/Vb+D18=; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiYTRkYjMxYmRhYTM3YTU1ZWI3NDg5OGFiNTVhNTk3ZGQxY2ZhMDkyNTg2YWYwM2NiZDlhMWMwNzM3YmQ2ZmI1ZmZiODA4YmQ4ZTVkMWU4NDNjMjU2M2FhMGZhMzlkM2U4MjkzYTE4Y2U4NjVkNGU3Nzc5NTM5ZmFkNzlmMTE3ZGVkZmFlZDdkNmQxNjc4ZTA0ZDhiOTNiNzhhMDEyM2Q1Y2M0N2QzMDRlYjhkYjliOGM4MGQ2ZTdiMjc0N2UxMzViNjRkYjcwNjA5ZmVkMmIyMjhjMzc2NDQyN2E5YTkzNTY0MjkwYjU2NGE1OGZmYjcxNjMzZDAwZjMxNDBkY2YwNlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjODFiYmNhYVwifSIsInIiOiJodHRwczovL2NkLnp1LmtlLmNvbS96dWZhbmciLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='} resp = requests.get(url=url,headers=headers) #xpath解析 parser = etree.HTMLParser(encoding='utf-8') #转换树对象 tree = etree.XML(resp.text,parser=parser) price = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/span/em/text()') #print(price) addres = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[1]/text()') #print(addres) access = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[2]/text()') #print(access) area = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[3]/text()') #print(area) #拿到一整列的房源标题 titles = tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/text()') #print(titles) prices = tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/span/em/text()\n') #print(prices) #获取你需要的信息的所有板块 all_list = tree.xpath('//*[@id="content"]/div[1]/div[1]/div') #print(all) #遍历子板块信 for all in all_list: title = all.xpath('./div/p[1]/a/text()') price = all.xpath('./div/span/em/text()') path = all.xpath('./div/p[2]/a[1]/text()') sub = all.xpath('./div/p[2]/a[2]/text()') home = all.xpath('./div/p[2]/a[3]/text()') jx = all.xpath('./div/p[2]/text()[1]')[0] if '精选' in jx: area = all.xpath('./div/p[2]/text()[6]') toward = all.xpath('./div/p[2]/text()[7]') house_type = all.xpath('./div/p[2]/text()[8]') else: area = all.xpath('./div/p[2]/text()[5]') toward = all.xpath('./div/p[2]/text()[6]') house_type = all.xpath('./div/p[2]/text()[7]') list_box.append([title, price, path, sub, home,area,toward,house_type]) for i in list_box: i[0][0] = i[0][0].strip() i[1][0] = i[1][0].strip() i[2][0] = i[2][0].strip() i[4][0] = i[4][0].strip() i[5][0] = i[5][0].strip() i[6][0] = i[6][0].strip() i[7][0] = i[7][0].strip() for i in list_box: print(i) #设置页 加到表头 '''pages = range(1,3) for page in pages: url = f'https://cd.zu.ke.com/zufang/pg{page}/#contentList''' #存数字方式之一 #存到csv文件,逗号分隔符文件 # writer = csv.writer(open('data1.csv','w',encoding='utf-8'))#打开,未创建文件 # writer.writerow(['title', 'price', 'path', 'sub', 'home','area','toward','house_type']) # writer.writerows(list_box) # 存储数据方式二 # 写入excel文件 wb = xlwt.Workbook()#建立工作布 sheet = wb.add_sheet(('data'))#建立表 titles = ('title', 'price', 'path', 'sub', 'home','area','toward','house_type') for index,title in enumerate(titles):# enumerate 输出两个维度的数据 1、数据系列的索引值,2、数据本身 sheet.write(0,index,title)# 参数-:行索引 参数二:列索引 参数三:数据本身 for i , item in enumerate(list_box):# i:行索引 item:一行数据 for j ,data in enumerate(item):# j:列索引 data:一个数据 sheet.write(i+1,j,data) wb.save('house.xls') def get_ershoufang(self): ''' 获取二手房数据 ''' pass def get_xinfang(self): ''' 获取新房数据 ''' pass @staticmethod def save( df:pd.DataFrame, filename:str): ''' 保存数据 ''' if os.path.exists(filename): df.to_excel(filename, index=False, header=False, mode='a', encoding='utf-8-sig') else: df.to_excel(filename, index=False, encoding='utf-8-sig') if __name__=='__main__': citys = { '成都':'cd', '上海':'sh', '绵阳':'mianyang', "吉安":"jian" } beike=Beike() beike.run(citys, "ershoufang")