R09228001 楊宇翔

Question 1: In Tokyo, Japan, did regions with high COVID-19 cases lead to high suicide rate ?

Question 2: Did the relationship of COVID-19 and suicide differ among centre Tokyo and Peripheral Tokyo ?

Part 1. Data

##############  Tokyo COVID-19 感染人數 ######################

rm(list=ls()) #clear all
setwd("~/Downloads")
infect=read.csv("130001_tokyo_covid19_positive_cases_by_municipality.csv")
infect$公表_年月=as.Date(infect$公表_年月日)

library(lubridate)
library(dplyr)

infect=infect[-which(infect$市区町村名==""),]
infect.m=infect %>% 
  group_by(yr = year(公表_年月日), mon = month(公表_年月日),place=市区町村名,code=全国地方公共団体コード) %>% 
  summarise(infect_m = sum(陽性者数))

infect.m$date=paste0(infect.m$yr,"-",infect.m$mon,"-01")
infect.m$date=as.Date(infect.m$date)
#infect.m$date=as.Date(paste0(infect.m$yr,infect.m$mon,"01"))

data=infect.m[,-c(1:2)]

library(tidyr)
data_wide = spread(data, date, infect_m)

data_wide$region[which(grepl("区",data_wide$place))]="centre"
data_wide$region[-which(grepl("区",data_wide$place))]="periphery"
d1=data_wide
d1$code=as.character(d1$code)
library(sf)
jp=st_read("~/Desktop/碩論_日本青少年自殺空間變異/data/japan_population_shp/jp1741_bothsex.shp")
## Reading layer `jp1741_bothsex' from data source 
##   `/Users/yangyuxiang_1/Desktop/碩論_日本青少年自殺空間變異/data/japan_population_shp/jp1741_bothsex.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1741 features and 30 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 122.9339 ymin: 24.04562 xmax: 153.9867 ymax: 45.52638
## Geodetic CRS:  JGD2011
jp2=jp[which(jp$prfct__=="東京都"),c(4,3,7,9)]
d1=left_join(d1,st_drop_geometry(jp2[,c(1,3,4)]),by="code")
d1[,c(3:24)]=(d1[,c(3:24)]/d1$all_age)*10^5 #每十萬人有多少感染人數
d1=d1[,-c(1,26,27)]
jp3=left_join(jp2,d1,by="code")

df <- st_drop_geometry(jp3)
########### from wide to long data ###########
library(tidyr)
long1 <- df %>% gather(date, covid_19, -c(code,cmplt_k,cmplt_n,all_age,region))
long1$date=as.Date(long1$date)



##############  自殺人數 ######################
setwd("~/Desktop/碩論_日本青少年自殺空間變異/data/suicide_count")

library(readxl)
d.tokyo=read_excel("tokyo_202003to202110.xlsx",col_names = TRUE,sheet = 1)
c.tokyo=left_join(jp3[,c(1,4)],d.tokyo,by="code")
d=c.tokyo
d[,c(9:28)]=(d[,c(9:28)]/d$all_age)*10^5#每十萬人有多少自殺
d0=d[,-c(2:6)]

d0=st_drop_geometry(d0)
########### from wide to long data ###########
library(tidyr)
long2 <-d0 %>% gather(date,suicide, -c(code,complete_kanji,complete_en))
long2$yr=substr(long2$date,1,4)
long2$mon=substr(long2$date,5,6)
long2$date=paste0(long2$yr,"-",long2$mon,"-01")
long2$date=as.Date(long2$date)
long2=long2[,c(1,4,5)]
join_long=left_join(long1,long2,by=c("code","date"))



#library("writexl")
#write_xlsx(join_long,"~/Desktop/碩論_日本青少年自殺空間變異/data/suicide_count/tokyo_longjoin.xlsx")
d=join_long

#時間縮短
d2=join_long[which(d$date<"2021-03-02"),]

#group 到類別
library(dplyr)
d3=d %>% group_by(region,date) %>% summarise_all(mean)
d3=d3[-c(21,22,43,44),-c(3,4,5)]


d=as.data.frame(d)
d3=as.data.frame(d3)

Type 1 Data: All data with head()

head(d)
##     code        cmplt_k              cmplt_n all_age region       date
## 1 131016 東京都千代田区  Tokyo-to,Chiyoda-ku   67216 centre 2020-03-01
## 2 131024   東京都中央区     Tokyo-to,Chuo-ku  170583 centre 2020-03-01
## 3 131032     東京都港区   Tokyo-to,Minato-ku  259036 centre 2020-03-01
## 4 131041   東京都新宿区 Tokyo-to,Shinjuku-ku  345231 centre 2020-03-01
## 5 131059   東京都文京区   Tokyo-to,Bunkyo-ku  226574 centre 2020-03-01
## 6 131067   東京都台東区    Tokyo-to,Taito-ku  203647 centre 2020-03-01
##    covid_19   suicide
## 1  4.463223 0.0000000
## 2 11.138273 1.1724498
## 3 15.055822 1.9302336
## 4  6.372545 2.3172890
## 5  1.765428 0.4413569
## 6  7.365687 0.9820916

Type 2 Data : Data aggregated to centre/peripheral head()

head(d3)
##   region       date  all_age    covid_19  suicide
## 1 centre 2020-03-01 416207.1    4.323384 1.210717
## 2 centre 2020-04-01 416207.1  657.550987 0.901172
## 3 centre 2020-05-01 416207.1 1515.700282 1.200989
## 4 centre 2020-06-01 416207.1 1729.220383 1.266975
## 5 centre 2020-07-01 416207.1 2823.836784 1.624201
## 6 centre 2020-08-01 416207.1 5401.435378 1.447878

Part 2. Tokyo Map

tokyo =jp3[-c(54:62),]
plot(tokyo["region"],main="Centre & Perpheral of Tokyo,Japan")

p_popup1 <- paste0("<strong>Municipals: </strong>", tokyo$cmplt_k)
p_popup2 <- paste0("<strong>Regions: </strong>", tokyo$region)
popup=rbind(p_popup1,p_popup2)

library(leaflet)
?colorFactor
tokyo$region=as.factor(tokyo$region)
factpal <- colorFactor(topo.colors(2), tokyo$region)
library(leaflet) 
leaflet(tokyo) %>%
  addPolygons(
    stroke = T, # remove polygon borders,
     color = "white",
    fillColor = ~factpal(region), # set fill color with function from above and value
    fillOpacity =0.8, smoothFactor = 0.5, # make it nicer
  popup = p_popup1 )  %>%
  addTiles()
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +no_defs).
## Need '+proj=longlat +datum=WGS84'

Part 3. Correlation Plot

#時間縮短
d2=join_long[which(d$date<"2021-10-02"),]

library(ggplot2)
g1=ggplot(d2,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  ggtitle("Scatterplot of COVID-19 and suicide")+ 
  geom_smooth(method="lm", size=1, color="black")

g1

g1.1=ggplot(d2,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  ggtitle("Scatterplot of COVID-19 and suicide")+ 
  geom_smooth(method="lm", size=1, color="black")+
  facet_wrap(~region,ncol=4)

g1.1

Part 4. Time Grouping correlation

g2=ggplot(d2,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  facet_wrap(~date,ncol=4)+
  ggtitle("Scatterplot of COVID-19 and suicide")
g2

g3=ggplot(d2,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  geom_path()+
  theme_bw()+
  theme(text=element_text(family="黑體-繁 中黑", size=14))
library(directlabels)
direct.label(g3,method = "extreme.grid")

g4=ggplot(data = d2,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  geom_path()+
  theme_bw()+
  geom_text(aes(label = date),colour = "gray",size = 2.5,hjust = 0,vjust = 0)
g4

Part 5. Animation of time grouping correlation

d$date2=as.character(d$date)
d$date2=as.numeric(gsub("-","",d$date2))

#### ploty animation ########
library(plotly)
p <- plot_ly(d,x = ~covid_19 , y = ~suicide)#,color=~region)
add_markers(p,
            size = ~all_age,
            color = ~region,
            text = ~region,hoverinfo = "text",
            id = ~region,
            frame = ~ date2,
            showlenged = TRUE)

Part 6. Simplfied Result

#group 到類別
library(dplyr)
d3=d2 %>% group_by(region,date) %>% summarise_all(mean)


g1=ggplot(d3,aes(x=covid_19,y=suicide,col=region))+
  geom_point()

g1

g1.1=ggplot(d3,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  ggtitle("Scatterplot of COVID-19 and suicide")+ 
  geom_smooth(method="lm", size=1, color="black")+
  facet_wrap(~region,ncol=4)

g1.1

g2=ggplot(d3,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  facet_wrap(~date,ncol=4)
g2

g3=ggplot(d3,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  geom_path()+
  theme_bw()+
  theme(text=element_text(family="黑體-繁 中黑", size=14))


library(directlabels)
direct.label(g3,method = "extreme.grid")

g4=ggplot(data = d3,aes(x=covid_19,y=suicide,col=region))+
  geom_point()+
  geom_path()+
  theme_bw()+
  geom_text(aes(label = date),colour = "gray",size = 2.5,hjust = 0,vjust = 0)
g4

d3$date2=as.character(d3$date)
d3$date2=as.numeric(gsub("-","",d3$date2))
d3$date2
##  [1] 20200301 20200401 20200501 20200601 20200701 20200801 20200901 20201001
##  [9] 20201101 20201201 20210101 20210201 20210301 20210401 20210501 20210601
## [17] 20210701 20210801 20210901 20211001 20200301 20200401 20200501 20200601
## [25] 20200701 20200801 20200901 20201001 20201101 20201201 20210101 20210201
## [33] 20210301 20210401 20210501 20210601 20210701 20210801 20210901 20211001
#### ploty animation ########
library(plotly)
p <- plot_ly(d3,x = ~covid_19 , y = ~suicide)#,color=~region)
add_markers(p,
            size = ~all_age,
            color = ~region,
            text = ~region,hoverinfo = "text",
            id = ~region,
            frame = ~ date2,
            showlenged = TRUE)

Part 7. Explanation :

Instruction:

A data story of correlation over time: Using appropriate data to set up grouping and complementary variables for representing temporal variations. The assignment should include: Visualization outputs: (interactive) charts/plots Text descriptions (describe the data and plots, your observations, and possible explanations)

My Interpretation :

In the big picture, I found out that the COVID-19 cases and suicide rate are negatively related which is very surprise to me. I originally thought that the place with higher infected rate will caused uneasiness and panic leading to excessive suicide rate. However, the data analysis proved me wrong.

As the difference between central and peripheral Tokyo, I found out that in most of time, COVID-19 infected rate is higher in central Tokyo. However the suicide rate change over different stage of virus outbreak. In simplified aggregated analysis, suicide rate in central Tokyo is higher than peripheral in most of the time except for 2020-Sep and 2021-Feb. However, If we see the unaggregated analysis, peripheral Tokyo has higher fluctuation of suicide rate than central Tokyo.

As for possible mechanism of this phenomen, I have not found the reasonable explanation for this. However, I think this phenomen found out by data analysis is interesting and worth dig deeper.

The End

Thank you for reading