์บ๋ฆฌํฌ๋์ ๋ํ๊ต์ "Bayesian Statistics: Techniques and Models"์ ์ด์ํ๊ธฐ ์ํ ํ๋ก์ ํธ ๊ฒฐ๊ณผ๋ฌผ์ด๋ค.
<R Code>
##########################
# setting & loading data #
##########################
set.seed(777)
library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(rjags)
theme_set(theme_light() +
theme(plot.title = element_text(face = 'bold', colour = 'grey10'),
plot.subtitle = element_text(colour = 'grey25'),
panel.grid.major = element_line(colour = 'grey90', size = 1),
panel.grid.minor = element_line(colour = 'grey80', size = 0.5, linetype = 'dashed'),
legend.position = 'top',
legend.spacing.x = unit(0.125, 'cm'),
legend.background = element_rect(fill = NULL, linetype = 'dotted'),
strip.background = element_blank(),
strip.text = element_text(face = 'bold', colour = 'grey25', size = 11.25)))
data_air = na.omit(airquality) %>%
select(-Day)
data_air$Month = as.factor(data_air$Month)
ggpairs(data_air)
##########################
# classical linear model #
##########################
mod_lm = lm(Ozone ~ ., data = data_air)
summary(mod_lm)
###########################################
# model 1: full model using original data #
###########################################
X = model.matrix(mod_lm) %>%
as.data.frame() %>%
select(-1)
y = data_air['Ozone']
names(y) = 'y'
data_jags = as.list(bind_cols(y, X))
mod_string = " model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0 + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i] + b[4]*Month6[i] + b[5]*Month7[i] + b[6]*Month8[i] + b[7]*Month9[i]
}
b0 ~ dnorm(0.0, 1.0/1.0e6)
for (j in 1:7) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
} "
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
params = c('b0', 'b', 'sig')
update(mod, 1e6)
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
gelman.diag(mod_sim)
dic_1 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
#######################################################
# model 2: random intercept model using original data #
#######################################################
data_jags = as.list(data_air)
names(data_jags)[1] = 'y'
mod_string = "model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0[Month[i]] + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i]
}
for (j in 1:3) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
for (k in min(Month):max(Month)) {
b0[k] ~ dnorm(mu_b0, prec_b0)
}
mu_b0 ~ dnorm(0.0, 1.0/1.0e6)
prec_b0 ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
tau = sqrt(1.0/prec_b0)
}"
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
update(mod, 1e6)
params = c('b0', 'b', 'mu_b0', 'sig', 'tau')
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
mod_comb_sim = do.call(rbind, mod_sim)
gelman.diag(mod_sim)
dic_2 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
####################################################
# model 3: model without Month using original data #
####################################################
data_jags = as.list(data_air %>%
select(-Month))
names(data_jags)[1] = 'y'
mod_string = " model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0 + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i]
}
b0 ~ dnorm(0.0, 1.0/1.0e6)
for (j in 1:3) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
} "
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
params = c('b0', 'b', 'sig')
update(mod, 1e6)
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
gelman.diag(mod_sim)
dic_3 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
##########################################################
# model 4: full model using square-root transformed data #
##########################################################
data_air_tf = with(data_air,
data.frame(sqrt_Ozone = sqrt(Ozone),
sqrt_Solar.R = sqrt(Solar.R),
sqrt_Wind = sqrt(Wind),
sqrt_Temp = sqrt(Temp),
Month = Month))
ggpairs(data_air_tf)
mod_lm = lm(sqrt_Ozone ~ ., data = data_air_tf)
summary(mod_lm)
X = model.matrix(mod_lm) %>%
as.data.frame() %>%
select(-1)
y = data_air_tf['sqrt_Ozone']
names(y) = 'y'
data_jags = as.list(bind_cols(y, X))
mod_string = " model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0 + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i] + b[4]*Month6[i] + b[5]*Month7[i] + b[6]*Month8[i] + b[7]*Month9[i]
}
b0 ~ dnorm(0.0, 1.0/1.0e6)
for (j in 1:7) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
} "
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
params = c('b0', 'b', 'sig')
update(mod, 1e6)
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
gelman.diag(mod_sim)
dic_4 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
######################################################################
# model 5: random intercept model using square-root transformed data #
######################################################################
data_jags = as.list(data_air_tf)
names(data_jags)[1] = 'y'
mod_string = " model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0[Month[i]] + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i]
}
for (j in 1:3) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
for (k in min(Month):max(Month)) {
b0[k] ~ dnorm(mu_b0, prec_b0)
}
mu_b0 ~ dnorm(0.0, 1.0/1.0e6)
prec_b0 ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
tau = sqrt(1.0/prec_b0)
} "
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
update(mod, 1e6)
params = c('b0', 'b', 'mu_b0', 'sig', 'tau')
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
gelman.diag(mod_sim)
dic_5 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
###################################################################
# model 6: model without Month using square-root transformed data #
###################################################################
data_jags = as.list(data_air_tf %>%
select(-Month))
names(data_jags)[1] = 'y'
mod_string = " model {
for (i in 1:length(y)) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b0 + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i]
}
b0 ~ dnorm(0.0, 1.0/1.0e6)
for (j in 1:3) {
b[j] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
sig = sqrt(1.0/prec)
} "
mod = jags.model(textConnection(mod_string),
data = data_jags,
n.chains = 3)
params = c('b0', 'b', 'sig')
update(mod, 1e6)
mod_sim = coda.samples(model = mod,
variable.names = params,
n.iter = 1e6)
mod_comb_sim = do.call(rbind, mod_sim)
gelman.diag(mod_sim)
dic_6 = dic.samples(mod, n.iter = 1e6)
summary(mod_sim)
#####################
# residual analysis #
#####################
print(dic_1) # Mean deviance: 988.7 / penalty 9.111 / Penalized deviance: 997.8
print(dic_2) # Mean deviance: 989.4 / penalty 6.766 / Penalized deviance: 996.1
print(dic_3) # Mean deviance: 993.7 / penalty 5.034 / Penalized deviance: 998.7
print(dic_4) # Mean deviance: 388.5 / penalty 9.1 / Penalized deviance: 397.6
print(dic_5) # Mean deviance: 388.3 / penalty 8.97 / Penalized deviance: 397.3
print(dic_6) # Mean deviance: 391.5 / penalty 5.038 / Penalized deviance: 396.5
# selected model: bayesian linear model 06 (square root transformation & without Month)
y = data_air_tf$sqrt_Ozone
pred_y = rep(NA, nrow(data_air_tf))
coef = colMeans(mod_comb_sim)
for (i in 1:nrow(data_air)) {
pred_y[i] = coef[1]*data_air_tf[i, 'sqrt_Solar.R'] + coef[2]*data_air_tf[i, 'sqrt_Wind'] + coef[3]*data_air_tf[i, 'sqrt_Temp'] + coef[4]
}
df_resid = data.frame(idx = 1:nrow(data_air_tf),
std_resid = scale(y - pred_y),
resid = y - pred_y,
pred_y = pred_y,
actual_y = y)
df_resid %>%
ggplot(aes(idx, std_resid)) +
geom_point(shape = 1) +
geom_hline(yintercept = 0, linetype = 'longdash', colour = 'blue') +
labs(x = 'Index', y = 'Standardized Residuals',
title = 'Residuals vs Index')
df_resid %>%
ggplot(aes(sample = std_resid)) +
stat_qq(shape = 1) +
geom_qq_line(colour = 'blue', linetype = 'longdash') +
labs(x = 'Normal Theoretical Quantiles', y = 'Standardized Residuals',
title = 'Normal Q-Q')
df_resid %>%
ggplot(aes(pred_y, std_resid)) +
geom_point(shape = 1) +
geom_hline(yintercept = 0, linetype = 'longdash', colour = 'blue') +
labs(x = 'Fitted Values for Square Root of Ozone', y = 'Standardized Residuals',
title = 'Residuals vs Fitted')
df_resid %>%
ggplot(aes(actual_y, pred_y)) +
geom_point(shape = 1) +
geom_abline(slope = 1, intercept = 0, linetype = 'longdash', colour = 'blue') +
labs(x = 'Actual Values for Square Root of Ozone', y = 'Fitted Values for Square Root of Ozone',
title = 'Fitted vs Actual')
'Statistics > Bayesian Statistics' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
ํผํฉ ๋ชจ๋ธ(Mixture model) (0) | 2020.09.09 |
---|---|
์์์ ํธ ๋ชจ๋ธ(Random intercept model) (0) | 2020.09.07 |
๊ณ์ธต์ ๋ชจ๋ธ(Hierarchical model) (0) | 2020.08.22 |
๋ฒ ์ด์ง์ ํฌ์์ก ํ๊ท(Bayesian poisson regression) (0) | 2020.08.21 |
๋ฒ ์ด์ง์ ๋ก์ง์คํฑ ํ๊ท(Bayesian logistic regression) (0) | 2020.08.19 |