๋ณธ๋ฌธ ๋ฐ”๋กœ๊ฐ€๊ธฐ

Statistics/Bayesian Statistics

Bayesian linear model for New York air quality measurements

์บ˜๋ฆฌํฌ๋‹ˆ์•„ ๋Œ€ํ•™๊ต์˜ "Bayesian Statistics: Techniques and Models"์„ ์ด์ˆ˜ํ•˜๊ธฐ ์œ„ํ•œ ํ”„๋กœ์ ํŠธ ๊ฒฐ๊ณผ๋ฌผ์ด๋‹ค.

 

 

<R Code>

 

##########################
# setting & loading data #
##########################

set.seed(777)

library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(rjags)

theme_set(theme_light() + 
            theme(plot.title = element_text(face = 'bold', colour = 'grey10'), 
                  plot.subtitle = element_text(colour = 'grey25'), 
                  panel.grid.major = element_line(colour = 'grey90', size = 1), 
                  panel.grid.minor = element_line(colour = 'grey80', size = 0.5, linetype = 'dashed'), 
                  legend.position = 'top', 
                  legend.spacing.x = unit(0.125, 'cm'), 
                  legend.background = element_rect(fill = NULL, linetype = 'dotted'), 
                  strip.background = element_blank(), 
                  strip.text = element_text(face = 'bold', colour = 'grey25', size = 11.25)))

data_air = na.omit(airquality) %>% 
  select(-Day)

data_air$Month = as.factor(data_air$Month)

ggpairs(data_air)

##########################
# classical linear model #
##########################

mod_lm = lm(Ozone ~ ., data = data_air)

summary(mod_lm)

###########################################
# model 1: full model using original data #
###########################################

X = model.matrix(mod_lm) %>%
  as.data.frame() %>%
  select(-1)

y = data_air['Ozone']

names(y) = 'y'

data_jags = as.list(bind_cols(y, X))

mod_string = " model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0 + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i] + b[4]*Month6[i] + b[5]*Month7[i] + b[6]*Month8[i] + b[7]*Month9[i]
  }

  b0 ~ dnorm(0.0, 1.0/1.0e6)

  for (j in 1:7) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)
} "

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

params = c('b0', 'b', 'sig')

update(mod, 1e6)

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

gelman.diag(mod_sim)

dic_1 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

#######################################################
# model 2: random intercept model using original data #
#######################################################

data_jags = as.list(data_air)

names(data_jags)[1] = 'y'

mod_string = "model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0[Month[i]] + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i]
  }

  for (j in 1:3) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)

  for (k in min(Month):max(Month)) {
    b0[k] ~ dnorm(mu_b0, prec_b0)
  }

  mu_b0 ~ dnorm(0.0, 1.0/1.0e6)
  prec_b0 ~ dgamma(5.0/2.0, 5.0*10.0/2.0)

  tau = sqrt(1.0/prec_b0)
}"

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

update(mod, 1e6)

params = c('b0', 'b', 'mu_b0', 'sig', 'tau')

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

mod_comb_sim = do.call(rbind, mod_sim)

gelman.diag(mod_sim)

dic_2 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

####################################################
# model 3: model without Month using original data #
####################################################

data_jags = as.list(data_air %>%
                      select(-Month))

names(data_jags)[1] = 'y'

mod_string = " model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0 + b[1]*Solar.R[i] + b[2]*Wind[i] + b[3]*Temp[i]
  }

  b0 ~ dnorm(0.0, 1.0/1.0e6)

  for (j in 1:3) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)
} "

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

params = c('b0', 'b', 'sig')

update(mod, 1e6)

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

gelman.diag(mod_sim)

dic_3 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

##########################################################
# model 4: full model using square-root transformed data #  
##########################################################

data_air_tf = with(data_air,
                   data.frame(sqrt_Ozone = sqrt(Ozone),
                              sqrt_Solar.R = sqrt(Solar.R),
                              sqrt_Wind = sqrt(Wind),
                              sqrt_Temp = sqrt(Temp),
                              Month = Month))

ggpairs(data_air_tf)

mod_lm = lm(sqrt_Ozone ~ ., data = data_air_tf)

summary(mod_lm)

X = model.matrix(mod_lm) %>%
  as.data.frame() %>%
  select(-1)

y = data_air_tf['sqrt_Ozone']
names(y) = 'y'

data_jags = as.list(bind_cols(y, X))

mod_string = " model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0 + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i] + b[4]*Month6[i] + b[5]*Month7[i] + b[6]*Month8[i] + b[7]*Month9[i]
  }

  b0 ~ dnorm(0.0, 1.0/1.0e6)

  for (j in 1:7) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)
} "

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

params = c('b0', 'b', 'sig')

update(mod, 1e6)

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

gelman.diag(mod_sim)

dic_4 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

######################################################################
# model 5: random intercept model using square-root transformed data #  
######################################################################

data_jags = as.list(data_air_tf)
names(data_jags)[1] = 'y'

mod_string = " model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0[Month[i]] + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i]
  }

  for (j in 1:3) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)

  for (k in min(Month):max(Month)) {
    b0[k] ~ dnorm(mu_b0, prec_b0)
  }

  mu_b0 ~ dnorm(0.0, 1.0/1.0e6)
  prec_b0 ~ dgamma(5.0/2.0, 5.0*10.0/2.0)

  tau = sqrt(1.0/prec_b0)
} "

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

update(mod, 1e6)

params = c('b0', 'b', 'mu_b0', 'sig', 'tau')

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

gelman.diag(mod_sim)

dic_5 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

###################################################################
# model 6: model without Month using square-root transformed data #  
###################################################################

data_jags = as.list(data_air_tf %>% 
                      select(-Month))

names(data_jags)[1] = 'y'

mod_string = " model {
  for (i in 1:length(y)) {
    y[i] ~ dnorm(mu[i], prec)
    mu[i] = b0 + b[1]*sqrt_Solar.R[i] + b[2]*sqrt_Wind[i] + b[3]*sqrt_Temp[i]
  }

  b0 ~ dnorm(0.0, 1.0/1.0e6)

  for (j in 1:3) {
    b[j] ~ dnorm(0.0, 1.0/1.0e6)
  }

  prec ~ dgamma(5.0/2.0, 5.0*10.0/2.0)
  sig = sqrt(1.0/prec)
} "

mod = jags.model(textConnection(mod_string),
                 data = data_jags,
                 n.chains = 3)

params = c('b0', 'b', 'sig')

update(mod, 1e6)

mod_sim = coda.samples(model = mod,
                       variable.names = params,
                       n.iter = 1e6)

mod_comb_sim = do.call(rbind, mod_sim)

gelman.diag(mod_sim)

dic_6 = dic.samples(mod, n.iter = 1e6)

summary(mod_sim)

#####################
# residual analysis #
#####################

print(dic_1) # Mean deviance:  988.7 / penalty 9.111 / Penalized deviance: 997.8 
print(dic_2) # Mean deviance:  989.4 / penalty 6.766 / Penalized deviance: 996.1 
print(dic_3) # Mean deviance:  993.7 / penalty 5.034 / Penalized deviance: 998.7 
print(dic_4) # Mean deviance:  388.5 / penalty 9.1 / Penalized deviance: 397.6 
print(dic_5) # Mean deviance:  388.3 / penalty 8.97 / Penalized deviance: 397.3 
print(dic_6) # Mean deviance:  391.5 / penalty 5.038 / Penalized deviance: 396.5 

# selected model: bayesian linear model 06 (square root transformation & without Month)

y = data_air_tf$sqrt_Ozone
pred_y = rep(NA, nrow(data_air_tf))

coef = colMeans(mod_comb_sim)

for (i in 1:nrow(data_air)) {
  pred_y[i] = coef[1]*data_air_tf[i, 'sqrt_Solar.R'] + coef[2]*data_air_tf[i, 'sqrt_Wind'] + coef[3]*data_air_tf[i, 'sqrt_Temp'] + coef[4]
}

df_resid = data.frame(idx = 1:nrow(data_air_tf), 
                      std_resid = scale(y - pred_y), 
                      resid = y - pred_y, 
                      pred_y = pred_y, 
                      actual_y = y)

df_resid %>% 
  ggplot(aes(idx, std_resid)) + 
  geom_point(shape = 1) + 
  geom_hline(yintercept = 0, linetype = 'longdash', colour = 'blue') + 
  labs(x = 'Index', y = 'Standardized Residuals', 
       title = 'Residuals vs Index')

df_resid %>% 
  ggplot(aes(sample = std_resid)) + 
  stat_qq(shape = 1) + 
  geom_qq_line(colour = 'blue', linetype = 'longdash') + 
  labs(x = 'Normal Theoretical Quantiles', y = 'Standardized Residuals', 
       title = 'Normal Q-Q')

df_resid %>% 
  ggplot(aes(pred_y, std_resid)) + 
  geom_point(shape = 1) + 
  geom_hline(yintercept = 0, linetype = 'longdash', colour = 'blue') + 
  labs(x = 'Fitted Values for Square Root of Ozone', y = 'Standardized Residuals', 
       title = 'Residuals vs Fitted')

df_resid %>% 
  ggplot(aes(actual_y, pred_y)) + 
  geom_point(shape = 1) + 
  geom_abline(slope = 1, intercept = 0, linetype = 'longdash', colour = 'blue') + 
  labs(x = 'Actual Values for Square Root of Ozone', y = 'Fitted Values for Square Root of Ozone', 
       title = 'Fitted vs Actual')