3 years ago · 8a534e8ace
--- a/lianjia/getData.py
+++ b/lianjia/getData.py
@@ -12,17 +12,6 @@ import requests
 
				 from bs4 import BeautifulSoup
			
 
				 from nt import chdir
			
 
				 
			
 
				-# 定义空列表，用于创建所有的爬虫链接
			
 
				-urls = []
			
 
				-# 指定爬虫所需的上海各个区域名称
			
 
				-# citys = ['pudongxinqu','minhang','baoshan','xuhui','putuo','yangpu','changning','songjiang',
			
 
				-#          'jiading','huangpu','jinan','zhabei','hongkou','qingpu','fengxian','jinshan','chongming']
			
 
				-citys = ['pudongxinqu']
			
 
				-
			
 
				-workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
			
 
				-resultFile = "lianjia.csv"
			
 
				-
			
 
				-
			
 
				 class Lianjia(object):
			
 
				     def __init__(self):
			
 
				         super(Lianjia, self).__init__()
			
@@ -31,12 +20,13 @@ class Lianjia(object):
 
				             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
			
 
				             'Referer': 'http://sh.lianjia.com/ershoufang/',
			
 
				         }
			
 
				+        self.urls=[]
			
 
				         self.session.headers.update(headers)
			
 
				 
			
 
				     def getUrls(self):
			
 
				         # 基于for循环，构造完整的爬虫链接
			
 
				-        for i in citys:
			
 
				-            url = 'http://sh.lianjia.com/ershoufang/%s/' % i
			
 
				+        for city in citys:
			
 
				+            url = 'http://sh.lianjia.com/ershoufang/%s/' % city
			
 
				             res = self.session.get(url)  # 发送get请求
			
 
				             res = res.text.encode(res.encoding).decode('utf-8')  # 需要转码，否则会有问题
			
 
				             soup = BeautifulSoup(res, 'html.parser')  # 使用bs4模块，对响应的链接源代码进行html解析
			
@@ -48,7 +38,7 @@ class Lianjia(object):
 
				                 total_pages = int(pages[-2])
			
 
				 
			
 
				             for j in list(range(1, total_pages + 1)):  # 拼接所有需要爬虫的链接
			
 
				-                urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (i, j))
			
 
				+                self.urls.append('http://sh.lianjia.com/ershoufang/%s/d%s' % (city, j))
			
 
				             #         随机睡眠2-10s
			
 
				         time.sleep(random.randint(2, 10))
			
 
				 
			
@@ -56,9 +46,9 @@ class Lianjia(object):
 
				         #         获取所有url
			
 
				         self.getUrls()
			
 
				         # 创建csv文件，用于后面的保存数据
			
 
				-        file = open(resultFile, 'w', encoding='utf-8')
			
 
				+        file = open('链家二手房.csv', 'w', encoding='utf-8')
			
 
				 
			
 
				-        for url in urls:  # 基于for循环，抓取出所有满足条件的标签和属性列表，存放在find_all中
			
 
				+        for url in self.urls:  # 基于for循环，抓取出所有满足条件的标签和属性列表，存放在find_all中
			
 
				             res = requests.get(url)
			
 
				             res = res.text.encode(res.encoding).decode('utf-8')
			
 
				             soup = BeautifulSoup(res, 'html.parser')
			
@@ -92,8 +82,11 @@ class Lianjia(object):
 
				         # 关闭文件（否则数据不会写入到csv文件中）
			
 
				         file.close()
			
 
				 
			
 
				-
			
 
				-chdir(workSpace)
			
 
				-jia = Lianjia()
			
 
				-jia.mSpider()
			
 
				-print(urls)
			
 
				+if __name__ == "__main__":
			
 
				+    workSpace = "E:\\data\\workspace\\Python_Tools\\lianjianalysis"
			
 
				+    chdir(workSpace)
			
 
				+    # 指定爬虫所需的上海各个区域名称
			
 
				+    citys = ['pudongxinqu','minhang','baoshan','xuhui','putuo','yangpu','changning','songjiang',
			
 
				+            'jiading','huangpu','jinan','zhabei','hongkou','qingpu','fengxian','jinshan','chongming']
			
 
				+    lianjia=Lianjia()
			
 
				+    lianjia.mSpider(citys)
			
--- a/lianjia/normal.test.R
+++ b/lianjia/normal.test.R
@@ -0,0 +1,58 @@
 
				+norm.test <- function(x, breaks = 20, alpha = 0.05,
				+                      plot = TRUE){
				+  # Check a numeric vector for normality: optional diagnostic plots
				+  # (histogram with density curves, Q-Q plot, P-P plot) followed by a
				+  # hypothesis test. Prints a one-line verdict and returns the test object.
				+  #
				+  # Args:
				+  #   x:      numeric vector; missing values are dropped before use.
				+  #   breaks: number of break points used for the histogram.
				+  #   alpha:  significance level the p-value is compared against.
				+  #   plot:   draw the diagnostic plots when TRUE.
				+  #
				+  # Returns:
				+  #   The "htest" object from shapiro.test() (n <= 5000) or ks.test().
				+  x <- sort(x[!is.na(x)])
				+  n <- length(x)
				+  if (n < 3) stop('norm.test needs at least 3 non-missing values')
				+  mu <- mean(x)
				+  sigma <- sd(x)
				+
				+  if (isTRUE(plot)) {
				+    # Combine several plots in one device; restore the settings even if
				+    # one of the plotting calls fails.
				+    opar <- par(no.readonly = TRUE)
				+    on.exit(par(opar), add = TRUE)
				+    layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE),
				+           widths = c(2,2), heights = c(2,2))
				+    # Histogram
				+    hist(x, freq = FALSE,
				+         breaks = seq(min(x), max(x), length.out = breaks),
				+         main = 'x的直方图', ylab = '核密度值')
				+    # Kernel density curve
				+    lines(density(x), col = 'red', lty = 1, lwd = 2)
				+    # Fitted normal density (x is already sorted, so the line is monotone in x)
				+    lines(x, dnorm(x, mu, sigma),
				+          col = 'blue', lty = 2, lwd = 2.5)
				+    # Legend
				+    legend('topright',
				+           legend = c('核密度曲线','正态分布曲线'),
				+           col = c('red','blue'), lty = c(1,2),
				+           lwd = c(2,2.5), bty = 'n')
				+    # Q-Q plot
				+    qqnorm(x, xlab = '实际分布', ylab = '正态分布',
				+           main = 'x的Q-Q图', col = 'blue')
				+    qqline(x)
				+    # P-P plot: fitted normal CDF against the empirical CDF
				+    P <- pnorm(x, mu, sigma)
				+    cdf <- ecdf(x)(x)
				+    plot(cdf, P, xlab = '实际分布', ylab = '正态分布',
				+         main = 'x的P-P图', xlim = c(0,1),
				+         ylim = c(0,1), col = 'blue')
				+    abline(a = 0, b = 1)
				+  }
				+
				+  # Quantitative test: shapiro.test() only accepts up to 5000 values, so
				+  # larger samples fall back to Kolmogorov-Smirnov.
				+  if (n <= 5000) {
				+    result <- shapiro.test(x)
				+  } else {
				+    # Compare against the fitted normal, not the standard normal N(0, 1).
				+    # NOTE(review): the parameters are estimated from the same data, so
				+    # this p-value is only approximate (conservative).
				+    result <- ks.test(x, 'pnorm', mu, sigma)
				+  }
				+
				+  # Report against the alpha that was actually used, not a fixed 0.05.
				+  if (result$p.value > alpha) {
				+    print(paste('定量结果为：', 'x服从正态分布，',
				+                'P值 =', round(result$p.value, 5), '>', alpha))
				+  } else {
				+    print(paste('定量结果为：', 'x不服从正态分布，',
				+                'P值 =', round(result$p.value, 5), '<=', alpha))
				+  }
				+  result
				+}
			
--- a/lianjia/链家二手房分析.R
+++ b/lianjia/链家二手房分析.R
@@ -0,0 +1,149 @@
 
				+# 导入开发所需的扩展包
			
 
				+library(dplyr)
			
 
				+library(Hmisc)
			
 
				+library(ggplot2)
			
 
				+library(caret)
			
 
				+
			
 
				+house <- read.csv(file.choose(), stringsAsFactors = FALSE)
			
 
				+dim(house)
			
 
				+str(house)
			
 
				+summary(house)
			
 
				+
			
 
				+load('.RData')
			
 
				+# 数据探索
			
 
				+
			
 
				+# 户型分布
			
 
				+type_freq <- data.frame(table(house$户型))
			
 
				+type_p <- ggplot(data = type_freq, mapping = aes(x = reorder(Var1, -Freq),y = Freq)) + geom_bar(stat = 'identity', fill = 'steelblue') + theme(axis.text.x  = element_text(angle = 30, vjust = 0.5)) + xlab('户型') + ylab('套数')
			
 
				+type_p
			
 
				+
			
 
				+# 把低于一千套的房型设置为其他
			
 
				+type <- c('2室2厅','2室1厅','3室2厅','1室1厅','3室1厅','4室2厅','1室0厅','2室0厅')
			
 
				+house$type.new <- ifelse(house$户型 %in% type, house$户型,'其他')
			
 
				+type_freq <- data.frame(table(house$type.new))
			
 
				+type_p <- ggplot(data = type_freq, mapping = aes(x = reorder(Var1, -Freq),y = Freq)) + geom_bar(stat = 'identity', fill = 'steelblue') + theme(axis.text.x  = element_text(angle = 30, vjust = 0.5)) + xlab('户型') + ylab('套数')
			
 
				+type_p
			
 
				+
			
 
				+# 面积的正态性检验
			
 
				+norm.test(house$面积)
			
 
				+
			
 
				+# 房价的正态性检验
			
 
				+norm.test(house$价格.W.)
			
 
				+
			
 
				+# 楼层分布
			
 
				+unique(house$楼层)
			
 
				+
			
 
				+# 把楼层分为低区、中区和高区三种
			
 
				+house$floow <- ifelse(substring(house$楼层,1,2) %in% c('低区','中区','高区'), substring(house$楼层,1,2),'低区')
			
 
				+
			
 
				+# 各楼层类型百分比分布
			
 
				+percent <- paste(round(prop.table(table(house$floow))*100,2),'%',sep = '')
			
 
				+df <- data.frame(table(house$floow))
			
 
				+df <- cbind(df, percent)
			
 
				+df
			
 
				+
			
 
				+# 上海各区房价均价
			
 
				+avg_price <- aggregate(house$单价.平方米., by = list(house$区域), mean)
			
 
				+
			
 
				+p <- ggplot(data = avg_price, mapping = aes(x = reorder(Group.1, -x), y = x, group = 1)) + geom_area(fill = 'lightgreen') + geom_line(colour = 'steelblue', size = 2) + geom_point() + xlab('') + ylab('均价')
			
 
				+p
			
 
				+
			
 
				+# 房屋建筑时间缺失严重，我们按区域分组，使用众数填充
			
 
				+house$建筑时间[house$建筑时间 == ''] <- NA
			
 
				+# 自定义众数函数
			
 
				+stat.mode <- function(x, rm.na = TRUE){
				+  # Return the most frequent value of x as a character string.
				+  #
				+  # Args:
				+  #   x:     vector to summarise.
				+  #   rm.na: when TRUE, missing values are ignored; when FALSE, NA is
				+  #          counted like any other value and can itself be the mode.
				+  #
				+  # Returns:
				+  #   A length-one character vector; NA_character_ when x has no
				+  #   countable values. Ties go to the first level in table() order.
				+  y <- if (isTRUE(rm.na)) x[!is.na(x)] else x
				+  counts <- table(y, useNA = 'ifany')
				+  if (length(counts) == 0) {
				+    return(NA_character_)
				+  }
				+  res <- names(counts)[which.max(counts)]
				+  return(res)
				+}
			
 
				+
			
 
				+# 自定义函数，实现分组替补
			
 
				+my.impute <- function(data, category.col = NULL,
				+                      miss.col = NULL, method = stat.mode){
				+  # Fill missing values of one column group by group.
				+  #
				+  # Args:
				+  #   data:         data frame to process.
				+  #   category.col: name (or index) of the grouping column.
				+  #   miss.col:     name (or index) of the column whose NAs are filled.
				+  #   method:       passed to impute() as the fill rule.
				+  #
				+  # Returns:
				+  #   data with miss.col imputed; row order is unchanged. Rows whose
				+  #   category is NA are left untouched.
				+  if (is.null(category.col) || is.null(miss.col)) {
				+    stop('category.col and miss.col must both be given')
				+  }
				+  groups <- data[, category.col]
				+  for (i in as.character(unique(groups[!is.na(groups)]))) {
				+    # Write each group's result back to that group's own rows. Gluing the
				+    # groups together and assigning once only lines up when the rows are
				+    # already contiguous by group.
				+    rows <- which(groups == i)
				+    data[rows, miss.col] <- as.vector(impute(data[rows, miss.col], method))
				+  }
				+  return(data)
				+}
			
 
				+
			
 
				+final_house <- subset(my.impute(house, '区域', '建筑时间'),select = c(区域,type.new,floow,面积,价格.W.,单价.平方米.,建筑时间))
			
 
				+final_house <- transform(final_house, builtdate2now = 2016-as.integer(substring(as.character(建筑时间),1,4)))
			
 
				+final_house <- subset(final_house, select = -建筑时间)
			
 
				+
			
 
				+# 使用k-means聚类，探究上海的各个区域可以划分为几类
			
 
				+
			
 
				+# 自定义函数
			
 
				+tot.wssplot <- function(data, nc, seed=1234){
				+  # Draw an elbow plot of the k-means total within-cluster sum of squares
				+  # for 1..nc clusters.
				+  #
				+  # Args:
				+  #   data: numeric data frame or matrix.
				+  #   nc:   largest number of clusters to try (>= 1).
				+  #   seed: random seed set before every kmeans() call.
				+  #
				+  # Returns:
				+  #   Invisibly, the numeric vector of length nc that was plotted.
				+  if (nc < 1) stop('nc must be at least 1')
				+  # With a single cluster this is the total sum of squares
				+  tot.wss <- (nrow(data)-1)*sum(apply(data,2,var))
				+  # seq_len(nc)[-1] is empty for nc == 1, whereas 2:nc would count down
				+  for (k in seq_len(nc)[-1]){
				+    # Fix the seed so every run picks the same starting centers
				+    set.seed(seed)
				+    tot.wss[k] <- kmeans(data, centers=k, iter.max = 100)$tot.withinss
				+  }
				+  plot(seq_len(nc), tot.wss, type="b", xlab="Number of Clusters",
				+       ylab="Within groups sum of squares",col = 'blue',
				+       lwd = 2, main = 'Choose best Clusters')
				+  invisible(tot.wss)
				+}
			
 
				+
			
 
				+
			
 
				+# Standardise the three numeric columns so none dominates the distance
				+standrad <- data.frame(scale(final_house[,c('面积','价格.W.','单价.平方米.')]))
				+myplot <- tot.wssplot(standrad, nc = 15)
				+
				+# Judging from the elbow plot, the data splits into roughly 5 clusters
				+set.seed(1234)
				+clust <- kmeans(x = standrad, centers = 5, iter.max = 100)
				+table(clust$cluster)
				+
				+# Mean area, price and unit price per cluster. Columns are picked by name:
				+# positions 3:5 are floow, area and price, so the character column floow
				+# was averaged and the unit price was left out.
				+aggregate(final_house[,c('面积','价格.W.','单价.平方米.')], list(clust$cluster), mean)
				+
				+# District distribution within each cluster
				+table(house$区域,clust$cluster)
				+
				+# Mean area per layout type
				+aggregate(final_house$面积, list(final_house$type.new), mean)
				+
				+# Scatter plot of area against unit price, coloured by cluster. The full
				+# data frame is passed because the 3:5 slice has no unit-price column.
				+p <- ggplot(data = final_house, mapping = aes(x = 面积,y = 单价.平方米., color = factor(clust$cluster)))
				+p <- p + geom_point(pch = 20, size = 3)
				+p + scale_colour_manual(values = c("red","blue", "green", "black", "orange"))
			
 
				+
			
 
				+
			
 
				+# 构造楼层和聚类结果的哑变量
			
 
				+# 将几个离散变量转换为因子，目的便于下面一次性处理哑变量
			
 
				+final_house$cluster <- factor(clust$cluster)
			
 
				+final_house$floow <- factor(final_house$floow)
			
 
				+final_house$type.new <- factor(final_house$type.new)
			
 
				+# 筛选出所有因子型变量
			
 
				+factors <- names(final_house)[sapply(final_house, class) == 'factor']
			
 
				+# 将因子型变量转换成公式formula的右半边形式
			
 
				+formula <- f <- as.formula(paste('~', paste(factors, collapse = '+')))
			
 
				+dummy <- dummyVars(formula = formula, data = final_house)
			
 
				+pred <- predict(dummy, newdata = final_house)
			
 
				+head(pred)
			
 
				+# 将哑变量规整到final_house数据集中
			
 
				+final_house2 <- cbind(final_house,pred)
			
 
				+# 筛选出需要建模的数据
			
 
				+model.data <- subset(final_house2,select = -c(1,2,3,8,17,18,24))
			
 
				+# 直接对数据进行线性回归建模
			
 
				+fit1 <- lm(价格.W. ~ .,data = model.data)
			
 
				+summary(fit1)
			
 
				+
			
 
				+library(car)
			
 
				+# Box-Cox转换
			
 
				+powerTransform(fit1)
			
 
				+
			
 
				+fit2 <- lm(log(价格.W.) ~ .,data = model.data)
			
 
				+
			
 
				+# 使用plot方法完成模型定性的诊断
			
 
				+opar <- par(no.readonly = TRUE)
			
 
				+par(mfrow = c(2,2))
			
 
				+plot(fit2)
			
 
				+par(opar)