Interpretation:
I generated two density plots with the ggplot2 package below. The first plot shows the overall picture, and the second zooms in on the overlapping area. The two building types have different peaks and distributions, but their distributions overlap in part of the price range.
# Setup: read the four Excel files s1.xls ... s4.xls and stack them into `d`.
# The workspace is deliberately NOT cleared with rm(list = ls()): when the
# script is sourced, that call silently destroys whatever objects the caller
# already had in the global environment.
# NOTE(review): setwd() is kept because later code may rely on the working
# directory; prefer project-relative paths once that has been confirmed.
setwd("~/Desktop/110-1/110-1 data visualization/week20930")
library(readxl)
s1 <- read_excel("s1.xls")
s2 <- read_excel("s2.xls")
s3 <- read_excel("s3.xls")
s4 <- read_excel("s4.xls")
# Row-bind the four sheets; this requires them to share the same columns.
d <- rbind(s1, s2, s3, s4)
#install.packages("dplyr")
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the address, building-type and parking-space price columns.
df <- select(d, 土地位置建物門牌, 建物型態, 車位總價元)
# Convert the price column to numeric so it can be plotted and tested.
df[["車位總價元"]] <- as.numeric(df[["車位總價元"]])
# Restrict the comparison to the two elevator-building types of interest.
# (A per-type mean could be appended with:
#  %>% group_by(建物型態) %>% summarise(mean(車位總價元)))
gb <- filter(
  df,
  建物型態 %in% c("華廈(10層含以下有電梯)", "住宅大樓(11層含以上有電梯)")
)
# Print large numbers in fixed notation rather than scientific notation.
options(scipen = 999)
library(ggplot2)
# Overlaid, semi-transparent density curves of parking-space price,
# one curve per building type.
ggplot(data = gb, mapping = aes(x = 車位總價元, fill = 建物型態)) +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14)) +
  geom_density(alpha = 0.4)

# Same density plot with the x scale limited to 1 .. 10,000,000.
# Setting scale limits drops observations outside the range before the
# density is estimated.
ggplot(data = gb, mapping = aes(x = 車位總價元, fill = 建物型態)) +
  scale_x_continuous(limits = c(1, 1e7)) +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14)) +
  geom_density(alpha = 0.4)

1.2 Using t.test() to compare the difference in mean prices and summarize the statistical results.
Interpretation:
In the Welch two-sample t-test, the t value is -23.487, df is 14557 and the p-value is < 0.00000000000000022. Because the p-value is < 0.05, the result is statistically significant and we can reject H0. Accordingly, the true difference in means is not equal to 0. In this scenario, the mean parking prices of 華廈(10層含以下有電梯) and 住宅大樓(11層含以上有電梯) are statistically different.
# Split the filtered data by building type and compare mean parking-space
# prices. t.test() with two vectors and default arguments runs the Welch
# (unequal-variance) two-sample test.
m <- filter(gb, 建物型態 == "華廈(10層含以下有電梯)")
t <- filter(gb, 建物型態 == "住宅大樓(11層含以上有電梯)")
t.test(m$車位總價元, t$車位總價元)
##
## Welch Two Sample t-test
##
## data: m$車位總價元 and t$車位總價元
## t = -23.487, df = 14557, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -744909.9 -630151.5
## sample estimates:
## mean of x mean of y
## 479550.2 1167080.9
2. Using boxplot to compare the unit price (單價元) among different shifting level (移轉層次) and discuss whether the object with higher floor could have higher transaction prices?
Note: you should describe the procedure of how you handle the column of shifting level (移轉層次)
Interpretation:
In general, the higher the floor of the object, the higher its unit price. In other words, objects on higher floors tend to have higher transaction prices. Within this overall picture, the 1st floor has a relatively higher unit price than the 2nd–4th floors.
# Frequency table of the raw transfer-level labels, kept for inspection.
t1 <- as.data.frame(table(d$移轉層次))

# Drop rows with a missing transfer level or a missing unit price. Each mask
# is computed on the frame it is applied to, so row positions always line up
# (the second mask used to be computed on `d` and applied to `d2`).
d2 <- d[!is.na(d$移轉層次), ]
d2 <- d2[!is.na(d2$單價元平方公尺), ]

# Underground: every label containing the literal text "地下".
# An exact (fixed) match is used instead of approximate matching so that
# near-miss labels are not pulled into this group.
# This is a rough split only; combined labels are not classified further.
under <- grep("地下", d2$移轉層次, fixed = TRUE)
u <- d2[under, ]
u$floor <- rep(0, nrow(u))

# Above ground: everything that is not underground. setdiff() is used because
# d2[-under, ] would select no rows at all when `under` is empty.
# Again a rough split: only single-floor labels are classified below.
up <- d2[setdiff(seq_len(nrow(d2)), under), ]

# Lookup from single-floor label to floor group:
#   1 = floors 1-2, 2 = 3-4, 3 = 5-9, 4 = 10-14, 5 = 15-19,
#   6 = 20-24, 7 = 25-29, 8 = 30.
# Both spellings of the 21st floor are accepted: the regular "一" and the
# look-alike bopomofo "ㄧ" that the original comparison used.
floor_group <- c(
  "一層" = 1, "二層" = 1,
  "三層" = 2, "四層" = 2,
  "五層" = 3, "六層" = 3, "七層" = 3, "八層" = 3, "九層" = 3,
  "十層" = 4, "十一層" = 4, "十二層" = 4, "十三層" = 4, "十四層" = 4,
  "十五層" = 5, "十六層" = 5, "十七層" = 5, "十八層" = 5, "十九層" = 5,
  "二十層" = 6, "二十一層" = 6, "二十ㄧ層" = 6, "二十二層" = 6,
  "二十三層" = 6, "二十四層" = 6,
  "二十五層" = 7, "二十六層" = 7, "二十七層" = 7, "二十八層" = 7,
  "二十九層" = 7,
  "三十層" = 8
)

# Vectorised classification; any label not in the lookup (combined floors,
# other text) gets the sentinel -999 so it can be removed later.
up$floor <- unname(floor_group[up$移轉層次])
up$floor[is.na(up$floor)] <- -999
table(up$floor)
##
## -999 1 2 3 4 5 6 7 8
## 5506 4284 5616 7003 2900 592 137 66 3
# Discard the above-ground rows that could not be classified (floor == -999).
# A logical mask is used because up[-which(cond), ] returns zero rows when
# nothing matches the condition.
up2 <- up[!(up$floor %in% -999), ]

# Recombine underground and classified above-ground rows, then keep only the
# floor group and the unit price.
data <- rbind(u, up2)
library(dplyr)
data2 <- data %>% select(floor, 單價元平方公尺)
data3 <- data2[!is.na(data2$單價元平方公尺), ]
data3$單價元平方公尺 <- as.numeric(data3$單價元平方公尺)
data3$floor <- as.factor(data3$floor)

# Relabel each floor group with the lowest floor it contains
# (-1 stands for underground). The lookup is vectorised, so it also works
# when data3 has zero rows, where a 1:nrow() loop would fail.
group_start <- c(
  "0" = -1, "1" = 1, "2" = 3, "3" = 5, "4" = 10,
  "5" = 15, "6" = 20, "7" = 25, "8" = 30
)
data3$fl <- unname(group_start[as.character(data3$floor)])
# Factor levels are sorted numerically, which gives the x-axis order.
data3$fl <- as.factor(data3$fl)
# Box plot of unit price per floor group; each box is filled by its group.
ggplot(data = data3, mapping = aes(x = fl, y = 單價元平方公尺, fill = fl)) +
  geom_boxplot() +
  labs(x = "樓層數(轉移層次)", fill = "樓層數(轉移層次)") +
  ggtitle("樓層數(轉移層次)與每單位不動產交易價格關係") +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14))

3. Using graphics to characterize the objects of which the unit price is outlier and compare among different districts of Taipei City.
Hint: Describe the characteristics of the objects with very high unit prices? And are these characteristics the same across Taipei City?
Interpretation:
I found that “construction type” (建物型態) may be a characteristic of the unit-price outliers (extremely high values). I compared the outlier group (742 items) with all the non-outlier items (20996 items), and made a boxplot to show the distribution of the data in each construction type. As shown in the boxplot below, two construction types have the highest values among all types. The highest one is HOUSE (透天厝), and the second is store (店面(店鋪)). To sum up, in my own analysis, I found that “construction type” (建物型態) may be related to the unit price of property transactions, especially for HOUSE (透天厝) and store (店面(店鋪)).
# Remove rows with a missing unit price first.
q <- d[!is.na(d$單價元平方公尺), ]
q$單價元平方公尺 <- as.numeric(q$單價元平方公尺)
# as.numeric() warns that NAs were introduced by coercion, so rows whose
# price text could not be converted are dropped here as well.
q <- q[!is.na(q$單價元平方公尺), ]
# Exploratory checks, left disabled:
#summary(q$單價元平方公尺)
#boxplot(q$單價元平方公尺)
#par(family='STKaiti')
#hist(q$單價元平方公尺)

# Keep the descriptive columns used to characterise outliers and drop any row
# with a missing value in one of them.
p <- q %>%
  select(
    鄉鎮市區, 都市土地使用分區, 交易年月日, 主要用途, 建物型態,
    主要建材, 建築完成年月, 有無管理組織, 單價元平方公尺, 總樓層數
  ) %>%
  na.omit()

# Find the outliers: the values boxplot.stats() reports in its $out component.
out <- boxplot.stats(p$單價元平方公尺)$out
# A logical mask splits the rows. Negative indexing (p[-out_ind, ]) would
# return zero non-outlier rows whenever there are no outliers at all.
is_outlier <- p$單價元平方公尺 %in% out
out_ind <- which(is_outlier)
ot <- p[is_outlier, ]
ot$class <- rep("outlier", nrow(ot))
not <- p[!is_outlier, ]
not$class <- rep("non-outlier", nrow(not))
# Outlier rows come first, followed by the non-outlier rows.
df <- rbind(ot, not)
colnames(df)
## [1] "鄉鎮市區" "都市土地使用分區" "交易年月日" "主要用途"
## [5] "建物型態" "主要建材" "建築完成年月" "有無管理組織"
## [9] "單價元平方公尺" "總樓層數" "class"
# Box plot of unit price by building type, with outlier and non-outlier
# rows drawn as separately filled boxes.
ggplot(data = df, mapping = aes(x = 建物型態, y = 單價元平方公尺, fill = class)) +
  geom_boxplot() +
  ggtitle("建物型態別離群值") +
  theme(text = element_text(family = "黑體-繁 中黑", size = 14))

Interpretation :
Having found that “construction type” (建物型態) may be related to the unit price of property transactions, especially for HOUSE (透天厝) and store (店面(店鋪)), the next question is: does a geographical difference exist? The answer is YES.
First, I looked at the 1st graph, which covers all construction types, and found that in “文山區” the outliers overlap the non-outliers; therefore, construction type should be separated out when discussing the outliers, which supports my previous finding.
Secondly, I generated 2 graphs, for houses and stores respectively. Among the unit-price outliers (extremely high values), for the house type, 中山區、士林區、北投區 have the highest unit prices, in that order. For the store type, 信義區、內湖區、大安區、中正區 have the highest unit prices, in that order. The two types have outliers in different districts. In my personal experience, 中山區、士林區、北投區 are more residential areas, while 信義區、內湖區、大安區、中正區 are more business areas.