
Upload files to 'lianjia'

天问 · 4 years ago · commit 9d189eafd1
3 changed files with 167 additions and 0 deletions
  1. lianjia/compare-lxml-beautiful.py (+31 -0)
  2. lianjia/getData.py (+99 -0)
  3. lianjia/test.py (+37 -0)

+ 31 - 0
lianjia/compare-lxml-beautiful.py

@@ -0,0 +1,31 @@
+# coding=utf-8
+'''
+Compare lxml and bs4 (BeautifulSoup) for parsing HTML.
+Example: print every Baidu product using both approaches.
+Created on 2017-7-3
+@version: python3.6
+@author: liuyuqi
+'''
+import requests
+from bs4 import BeautifulSoup
+from lxml import etree
+
+url = "https://www.baidu.com/more/"
+res = requests.get(url)
+html = res.text.encode(res.encoding).decode('utf-8')  # re-encode with the guessed charset, then decode as UTF-8 to undo mojibake
+
+# Parse with BeautifulSoup
+soup = BeautifulSoup(html, 'lxml')
+titles = soup.findAll('div', {'class': 'con'})
+print(len(titles))
+for title in titles:
+    for a in title.find_all('a'):  # iterate the links inside each block; hard to reach cleanly with bs4 here
+        print(a.text)
+
+# Parse with lxml (XPath)
+# //*[@id="content"]/div[1]/div[2]/a
+# //*[@id="content"]/div[2]/div[2]/a
+
+selector = etree.HTML(html)
+titles = selector.xpath('//*[@id="content"]/div/div/a/text()')
+for title in titles:
+    print(title)
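
A side note on the encoding round-trip above: requests lets you override the detected charset directly, which avoids the encode/decode dance. A minimal sketch, assuming the Baidu page really is served as UTF-8:

import requests
from lxml import etree

res = requests.get("https://www.baidu.com/more/")
res.encoding = 'utf-8'  # override the guessed charset (assumption: the page is UTF-8)
selector = etree.HTML(res.text)
for title in selector.xpath('//*[@id="content"]/div/div/a/text()'):
    print(title)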

+ 99 - 0
lianjia/getData.py

@@ -0,0 +1,99 @@
+# coding=utf-8
+'''
+Lianjia second-hand housing crawler for Shanghai: builds the listing-page URLs per district and writes each listing's fields to a CSV file.
+Created on 2017-7-1
+@version: python3.6
+@author: liuyuqi
+'''
+import random
+import time
+
+# Third-party modules used by the crawler
+import requests
+from bs4 import BeautifulSoup
+from os import chdir  # use os.chdir, not the Windows-only nt module
+
+# Empty list that will collect every listing-page URL to crawl
+urls = []
+# Shanghai district slugs to crawl (only Pudong is enabled below)
+# citys = ['pudongxinqu','minhang','baoshan','xuhui','putuo','yangpu','changning','songjiang',
+#          'jiading','huangpu','jinan','zhabei','hongkou','qingpu','fengxian','jinshan','chongming']
+citys = ['pudongxinqu']
+
+workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
+resultFile = "lianjia.csv"
+
+
+class Lianjia(object):
+    def __init__(self):
+        super(Lianjia, self).__init__()
+        self.session = requests.Session()
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
+            'Referer': 'http://sh.lianjia.com/ershoufang/',
+        }
+        self.session.headers.update(headers)
+
+    def getUrls(self):
+        # Build the full list of listing-page URLs for each district
+        for i in citys:
+            url = 'http://sh.lianjia.com/ershoufang/%s/' % i
+            res = self.session.get(url)  # send the GET request
+            res = res.text.encode(res.encoding).decode('utf-8')  # re-encode/decode to avoid mojibake
+            soup = BeautifulSoup(res, 'html.parser')  # parse the response HTML with bs4
+            page = soup.findAll('div', {'class': 'c-pagination'})  # findAll grabs the pagination block
+            pages = [i.strip() for i in page[0].text.split('\n')]  # pull the page numbers out of the pagination text
+            if len(pages) > 3:
+                total_pages = int(pages[-3])
+            else:
+                total_pages = int(pages[-2])
+
+            for j in range(1, total_pages + 1):  # build the URL of every listing page in this district
+                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (i, j))
+            # Sleep 2-10 s at random between districts to throttle requests
+            time.sleep(random.randint(2, 10))
+
+    def mSpider(self):
+        # Collect all listing-page URLs first
+        self.getUrls()
+        # Open the CSV file that will hold the scraped rows
+        file = open(resultFile, 'w', encoding='utf-8')
+
+        for url in urls:  # scrape every listing block that matches the selector below
+            res = self.session.get(url)  # reuse the session so the custom headers are sent
+            res = res.text.encode(res.encoding).decode('utf-8')
+            soup = BeautifulSoup(res, 'html.parser')
+            find_all = soup.find_all(name='div', attrs={'class': 'info'})
+
+            for i in list(range(len(find_all))):  # pull the individual fields out of each listing block
+                title = find_all[i].find('a')['title']  # listing headline
+
+                res2 = find_all[i]
+                name = res2.find_all('div', {'class': 'where'})[0].find_all('span')[0].text  # residential-complex name
+                room_type = res2.find_all('div', {'class': 'where'})[0].find_all('span')[1].text  # floor plan
+                size = res2.find_all('div', {'class': 'where'})[0].find_all('span')[2].text[:-3]  # floor area
+
+                # List comprehension strips leading/trailing whitespace from each line
+                info = [i.strip() for i in res2.find_all('div', {'class': 'con'})[0].text.split('\n')]
+                region = info[1]  # district the listing belongs to
+                loucheng = info[2][2:]  # floor
+                chaoxiang = info[5][2:]  # orientation
+                builtdate = info[-3][2:]  # year built
+
+                # Total price of the listing
+                price = find_all[i].find('div', {'class': 'price'}).text.strip()[:-1]
+                # Price per square metre
+                price_union = find_all[i].find('div', {'class': 'price-pre'}).text.strip()[:-3]
+
+                # print(name,room_type,size,region,loucheng,chaoxiang,price,price_union,builtdate)
+                # Write the fields as one CSV row
+                file.write(','.join(
+                    (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)) + '\n')
+
+        # Close the file so the buffered rows are flushed to the CSV
+        file.close()
+
+
+if __name__ == '__main__':
+    chdir(workSpace)
+    jia = Lianjia()
+    jia.mSpider()
+    print(urls)
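
getData.py writes lianjia.csv without a header row, so any downstream analysis has to supply the column names itself. A minimal sketch of loading the output, assuming pandas is available and using the column order from the file.write call above:

import pandas as pd

# The scraper writes rows without a header, so pass the names explicitly
columns = ['name', 'room_type', 'size', 'region', 'loucheng',
           'chaoxiang', 'price', 'price_union', 'builtdate']
df = pd.read_csv('lianjia.csv', names=columns, encoding='utf-8')
print(df.head())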

+ 37 - 0
lianjia/test.py

@@ -0,0 +1,37 @@
+# coding=utf-8
+'''
+Parsing test against a cached Lianjia listing page saved in resultFile.txt.
+Created on 2017-7-1
+@version: python3.6
+@author: liuyuqi
+'''
+from bs4 import BeautifulSoup
+
+url = 'http://sh.lianjia.com/ershoufang/pudongxinqu'
+
+# res=requests.get(url)
+# res=res.text.encode(res.encoding).decode('utf-8')
+# file = open("resultFile.txt",'w',encoding = 'utf-8')
+# file.write(res)
+file = open("resultFile.txt", 'r', encoding='UTF-8')
+try:
+    res = file.read()
+finally:
+    file.close()
+
+soup = BeautifulSoup(res, 'html.parser')  # parse the cached HTML with bs4
+page = soup.findAll('div', {'class': 'c-pagination'})
+pages = [i.strip() for i in page[0].text.split('\n')]  # pull the page numbers out of the pagination text
+if len(pages) > 3:
+    total_pages = int(pages[-3])
+else:
+    total_pages = int(pages[-2])
+# print(total_pages)
+find_all = soup.find_all(name='div', attrs={'class': 'info'})
+# print(len(find_all))
+res2 = find_all[1]
+title = res2.find('a')['title']
+print(res2)
+name = res2.find_all('div', {'class': 'info-row'})[1].find_all('span')[0].text  # residential-complex name
+room_type = res2.find_all('div', {'class': 'info-row'})[0].find_all('span')[1].text  # floor plan
+# size = res2.find_all('div',{'class':'info-row'})[0].find_all('span')[2].text[:-3]  # floor area
+print(room_type)