R09228001 楊宇翔 台大地理碩二

# Start from an empty workspace and work inside the week-3 data folder.
rm(list = ls())
setwd("~/Desktop/110-1/110-1 data visualization/week3_1007")
library(readxl)

# Read the four spreadsheets and stack them row-wise into one table.
s1 <- read_excel("s1.xls")
s2 <- read_excel("s2.xls")
s3 <- read_excel("s3.xls")
s4 <- read_excel("s4.xls")
d <- rbind(s1, s2, s3, s4)

# install.packages("dplyr")
library(dplyr)

# Give columns 17-19 and 25 short names so they can be selected below.
colnames(d)[c(17:19, 25)] <- c("房間數", "廳數", "衛浴數", "停車位面積")

# Keep the district column plus the variables used for the correlation matrix.
df <- select(
  d,
  鄉鎮市區,
  土地移轉總面積平方公尺,
  交易年月日,
  建築完成年月,
  建物移轉總面積平方公尺,
  房間數,
  廳數,
  衛浴數,
  總價元,
  單價元平方公尺,
  停車位面積,
  車位總價元,
  主建物面積,
  附屬建物面積,
  陽台面積
)

# Drop the first row and the first (district) column, coerce every remaining
# column to numeric, discard rows with any NA, and correlate what is left.
da <- as.data.frame(df[-1, -1])
df1 <- lapply(da, as.numeric)
df2 <- as.data.frame(df1)
df3 <- na.omit(df2)
R <- cor(df3)

[1] Following #2 of the in-class practice: for each building type (建物型態), which district of Taipei City is the hotspot of these outlier objects?

# Pull district, total price, building floor area and building type, turning
# the two measurements into numbers and the type into a factor.
g <- d %>%
  select(鄉鎮市區, 總價元, 建物移轉總面積平方公尺, 建物型態) %>%
  mutate(
    建物移轉總面積平方公尺 = as.numeric(建物移轉總面積平方公尺),
    總價元 = as.numeric(總價元),
    建物型態 = as.factor(建物型態)
  )

# Rows with any missing value are removed; columns get short English names.
gf <- setNames(na.omit(g), c("region", "price", "area", "type"))

Select several building types (建物型態) to identify the hotspots in Taipei.

library(ggplot2)
# Price against area with a 90% normal data ellipse per building type.
# This plot is not printed; it is built only so the computed point and
# ellipse coordinates can be extracted from it.
p <- ggplot(gf, aes(x = area, y = price)) +
  geom_point() +
  stat_ellipse(type = "norm", level = .9) +
  facet_wrap(~type) +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14))

build <- ggplot_build(p)$data
points <- build[[1]] # layer 1: the geom_point data
ell <- build[[2]]    # layer 2: the stat_ellipse outline

library(sp)
# For each facet panel, test all of its points against that panel's ellipse in
# one vectorised call. TRUE means point.in.polygon() returned a non-zero code.
check <- logical(nrow(points))
point_panel <- as.character(points$PANEL)
ell_panel <- as.character(ell$PANEL)
for (panel in unique(point_panel)) {
  in_panel <- point_panel == panel
  ring <- ell_panel == panel
  check[in_panel] <- as.logical(point.in.polygon(
    points$x[in_panel], points$y[in_panel],
    ell$x[ring], ell$y[ring]
  ))
}

# The layer data is assumed to be in the same row order as `gf`; make sure at
# least the row counts agree before binding the columns side by side.
stopifnot(nrow(points) == nrow(gf))
dat <- data.frame(
  points[1:2],
  gf.type = gf$type,
  gf.region = gf$region,
  check
)
head(dat)
##        x        y                    gf.type gf.region check
## 1   0.00  1600000                       其他    中正區  TRUE
## 2 109.36 11850000      公寓(5樓含以下無電梯)    文山區  TRUE
## 3 109.42 10000000      公寓(5樓含以下無電梯)    文山區  TRUE
## 4 163.53 21000000 住宅大樓(11層含以上有電梯)    文山區  TRUE
## 5 204.79 67203400     華廈(10層含以下有電梯)    文山區 FALSE
## 6  88.70 18250000 住宅大樓(11層含以上有電梯)    中正區  TRUE
options(scipen = 999) # print axis values without scientific notation

# Same scatter, coloured by whether each point fell inside its ellipse.
# Facet on the column stored in `dat` rather than reaching back into `gf`.
ggplot(dat, aes(x = x, y = y, color = check)) +
  geom_point() +
  stat_ellipse(type = "norm", level = .9, color = "black") +
  facet_wrap(~gf.type, scales = "free", ncol = 3) +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14)) +
  labs(
    title = "Relation between floor area & total transaction price by building type",
    x = "floor area",
    y = "total transaction price"
  )

Pie charts of outliers by district of Taipei City

# Keep the points that fell outside their ellipse and count them per
# district and building type.
o <- dat %>% filter(check == FALSE)
ot <- as.data.frame(table(o[, c("gf.region", "gf.type")]))
head(ot)
##   gf.region      gf.type Freq
## 1    北投區 辦公商業大樓    0
## 2    大安區 辦公商業大樓    2
## 3    大同區 辦公商業大樓    0
## 4    南港區 辦公商業大樓    0
## 5    內湖區 辦公商業大樓    3
## 6    士林區 辦公商業大樓    0
#reorder(gf$region,Freq)
#bar chart
# Drop the trailing 13 rows of the count table, as the original index
# expression did.
# NOTE(review): 13 rows is one more than the 12 districts looped over
# elsewhere in this script -- confirm whether 12 was intended.
dot <- head(ot, -13L)

# One pie per building type; slices show each district's share of outliers.
# Columns are referenced by bare name so ggplot2 reads them from `dot`.
ggplot(dot, aes(x = "", y = Freq, fill = gf.region)) +
  geom_bar(stat = "identity", position = "fill", width = 1) +
  coord_polar("y", start = 0) +
  facet_wrap(~gf.type, ncol = 4) +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14)) +
  labs(
    title = "Piecharts of outliers of relation between area and price by Taipei region",
    x = "Taipei city region",
    y = "outliers count"
  ) +
  scale_fill_brewer(palette = "Set3")

interpretation:

The geographical variation of outliers is most distinguishable for 住宅大樓, where the hotspot is 中山區, followed by 大安區 and 信義區. For 公寓, the hotspot is 北投區. For 套房, the hotspot is 中正區. For 華廈, the hotspot is 大安區, followed by 內湖區. For 商業辦公大樓, the hotspot is 中山區. For 透天屋, the hotspot is 北投區.

[2] Plotting heat map to identify highly-correlated variables in real estate transaction data.

In Taipei City, main building area, floor area, and total price are highly correlated with one another, and parking lot area and parking lot price are highly correlated.

# Dendrogram of the variables, using 1 - correlation as the distance.
par(family = "STKaiti")
var_tree <- hclust(as.dist(1 - R))
plot(var_tree)

# install.packages("corrplot")
library(corrplot)

# Heat map of the same matrix, ordered by hierarchical clustering with four
# cluster rectangles drawn on top.
par(family = "STKaiti")
heat_cols <- colorRampPalette(c("darkblue", "white", "darkred"))(200)
corrplot(
  R,
  method = "color",
  order = "hclust",
  addrect = 4,
  tl.col = "gray30",
  cl.lim = c(0, 1),
  col = heat_cols
)

[3] Are these highly-correlated variables the same across districts of Taipei City?

These highly-correlated variables are NOT the same across districts of Taipei City. Some districts show correlation patterns that differ from those of other districts.

library(dplyr)
library(corrplot)

# Rebuild the numeric table, this time with the district attached as the
# first column (named `region`), then drop rows that have any missing value.
df1 <- lapply(da, as.numeric)
df2 <- as.data.frame(df1)
df2 <- cbind(region = df$鄉鎮市區[-1], df2)
df3 <- na.omit(df2)

# Split into a list with one data frame per district.
df3_list <- split(df3, df3$region)
d1 <- df3_list[[1]]

# District labels, retained under their original names.
dt <- as.data.frame(table(df3$region))
n <- dt$Var1

# One correlation matrix and one heat map per district. Iterating over the
# list itself, instead of a fixed 1:12, keeps the loop valid when a district
# has no complete rows and is therefore absent from the split.
R_list <- vector("list", length(df3_list))
names(R_list) <- names(df3_list)
for (i in seq_along(df3_list)) {
  R_list[[i]] <- cor(df3_list[[i]][, -1])
  par(family = "STKaiti")
  corrplot(
    R_list[[i]],
    method = "color",
    order = "hclust",
    addrect = 4,
    tl.col = "gray30",
    col = colorRampPalette(c("darkgreen", "white", "orange"))(200),
    title = names(df3_list)[i]
  )
}

[4] Plotting the following graph.

library(dplyr)
colnames(d)
##  [1] "鄉鎮市區"               "交易標的"               "土地位置建物門牌"      
##  [4] "土地移轉總面積平方公尺" "都市土地使用分區"       "非都市土地使用分區"    
##  [7] "非都市土地使用編定"     "交易年月日"             "交易筆棟數"            
## [10] "移轉層次"               "總樓層數"               "建物型態"              
## [13] "主要用途"               "主要建材"               "建築完成年月"          
## [16] "建物移轉總面積平方公尺" "房間數"                 "廳數"                  
## [19] "衛浴數"                 "建物現況格局-隔間"      "有無管理組織"          
## [22] "總價元"                 "單價元平方公尺"         "車位類別"              
## [25] "停車位面積"             "車位總價元"             "備註"                  
## [28] "編號"                   "主建物面積"             "附屬建物面積"          
## [31] "陽台面積"               "電梯"                   "移轉編號"
dy <- d %>% select(鄉鎮市區, 都市土地使用分區, 有無管理組織)

# Keep rows whose zoning is one of the three categories. `%in%` tests set
# membership for every row; `==` against a length-3 vector would recycle the
# vector and keep a row only when its position happened to line up.
dy <- dy %>% filter(都市土地使用分區 %in% c("商", "住", "其他"))

# Derive the three plotting factors in one vectorised pass:
#   region - district grouped into north / south / east / middle / west
#            (any district not listed falls into "west")
#   use    - zoning recoded to residence / business / others
#   guard  - "yes" when the management-organisation column is "有", else "no"
# Missing districts or management values stay NA instead of stopping the run.
dy <- dy %>%
  mutate(
    region = case_when(
      is.na(鄉鎮市區) ~ NA_character_,
      鄉鎮市區 %in% c("士林區", "北投區") ~ "north",
      鄉鎮市區 == "文山區" ~ "south",
      鄉鎮市區 %in% c("內湖區", "南港區") ~ "east",
      鄉鎮市區 %in% c("大安區", "信義區", "松山區") ~ "middle",
      TRUE ~ "west"
    ),
    use = case_when(
      都市土地使用分區 == "住" ~ "residence",
      都市土地使用分區 == "商" ~ "business",
      TRUE ~ "others"
    ),
    guard = if_else(有無管理組織 == "有", "yes", "no")
  )

# Only the derived columns are needed for the plots.
dr <- dy[, c("region", "use", "guard")]

#install.packages("vcd")
library(vcd)
#Mosaic Plots

mosaic(guard ~ region + use, highlighting_fill = c("yellow", "purple"), data = dr)

# Residual-based Shadings
mosaic(guard ~ region + use, data = dr, gp = shading_hcl)

[5] Explain the meaning of the graph.

Interpretation:

This is a mosaic plot for categorical data. The color in this plot represents Pearson residuals. Pearson residuals are used in a Chi-Square Test of Independence to analyze the difference between observed cell counts and expected cell counts in a contingency table. The formula is (obs - exp) / sqrt(exp). Observations that are not fit well by the model have highly positive or negative Pearson residuals.

Using the Pearson residual plot, we want to check three categorical variables: main use, region, and building security. The main concern is whether, within each main use, the regions of Taipei differ in the ratio of real estate transactions that do and do not include building security.

According to the chart, blue means a highly positive Pearson residual and red means a highly negative Pearson residual, which means there is a big difference between the two groups. Grey means a low Pearson residual, which means the observation fits well and there is no big difference between the two groups. In this chart, the parking lot in the middle area, the business building in the south area, and the factory building in the western area have low Pearson residuals, which means that whether or not there is building security is near the Taipei mean.