본문 바로가기

Project

2020. 09. KAERI, 진동체 활용 충돌체 예측 대회

< EDA & Feature Engineering (R Code) >

 

#######################################
# library & setting for visualization #
#######################################

library(dplyr)
library(tidyr)
library(cdata)
library(stringr)
library(data.table)
library(ggplot2)
library(scales)
library(viridis)
library(digest)

# Project-specific tweaks layered on top of theme_minimal(): bold dark title,
# dashed major grid lines only, legend on top, and plain bold facet labels.
project_theme_tweaks <- theme(
  plot.title = element_text(face = 'bold', colour = 'grey10'),
  plot.subtitle = element_text(colour = 'grey25'),
  panel.grid.major = element_line(colour = 'grey50', size = 0.25, linetype = 'dashed'),
  panel.grid.minor = element_blank(),
  legend.position = 'top',
  legend.spacing.x = unit(0.125, 'cm'),
  legend.background = element_rect(fill = NULL, linetype = 'dotted'),
  strip.background = element_blank(),
  strip.text = element_text(face = 'bold', colour = 'grey25', size = 11.25)
)

# Make the combined theme the default for every plot drawn afterwards.
theme_set(theme_minimal() + project_theme_tweaks)

# Colour palette: two accent colours plus a dark grey.
colour_list <- c('#AE5CFF', '#5CFFA5', 'grey25')

##############################
# loading & checking dataset #
##############################

# Directory that holds the competition CSV files. Kept in one place so the
# script can be pointed at another location by changing a single value.
data_dir <- 'C:/Users/user/Desktop/Project/KAERI/Data'

# Raw inputs: per-sample sensor readings (features), per-id targets, and the
# sample submission, which is used as the (placeholder) target table for test.
df_train_feature <- fread(file.path(data_dir, 'train_features.csv'))
df_train_target <- fread(file.path(data_dir, 'train_target.csv'))
df_test_feature <- fread(file.path(data_dir, 'test_features.csv'))
df_test_target <- fread(file.path(data_dir, 'sample_submission.csv'))

# Shift every id up by one. Later code builds per-id tables indexed by
# 1:length(unique(id)), which only lines up if ids start at 1.
# NOTE(review): presumably the raw ids are 0-based and contiguous across
# train and test - confirm against the data files.
df_train_feature$id <- df_train_feature$id + 1
df_train_target$id <- df_train_target$id + 1
df_test_feature$id <- df_test_feature$id + 1
df_test_target$id <- df_test_target$id + 1

#################
# preprocessing #
#################

# Attach the per-id targets to every sensor row. The join key is spelled out
# so the result does not depend on whichever column names the two tables
# happen to share, and so dplyr does not have to guess (and report) the key.
df_train <- df_train_feature %>%
  left_join(df_train_target, by = 'id') %>%
  as.data.frame()

df_test <- df_test_feature %>%
  left_join(df_test_target, by = 'id') %>%
  as.data.frame()

# Train rows first, then test rows, in one table for feature engineering.
df_total <- bind_rows(df_train, df_test)

#######
# eda #
#######

# Number of sensor rows recorded for each training id.
as.data.frame(count(group_by(df_train, id)))

# Number of training ids for each distinct (X, Y, M, V) combination.
as.data.frame(count(group_by(df_train_target, X, Y, M, V)))

# Number of training ids per level of each individual target.
count(group_by(df_train_target, X))

count(group_by(df_train_target, Y))

count(group_by(df_train_target, M))

count(group_by(df_train_target, V))

# Distinct values taken by each target in the joined training table.
unique(df_train[['X']])
unique(df_train[['Y']])
unique(df_train[['M']])
unique(df_train[['V']])

# S1

# Draws the signal of one sensor over Time, one line per id, with one facet
# panel per distinct value of a target column.
#
# df     : data frame with columns Time, id, the sensor column and the target.
# sensor : name of the sensor column to plot on the y axis (e.g. 'S1').
# target : name of the target column used for faceting (e.g. 'X').
# Returns the ggplot object; at top level it is drawn by auto-printing.
plot_sensor_by_target <- function(df, sensor, target) {
  ggplot(df, aes(x = Time, y = .data[[sensor]], group = id)) +
    geom_line() +
    facet_wrap(target) +
    labs(y = sensor)  # keep the plain column name as the axis title
}

# S1

plot_sensor_by_target(df_train, 'S1', 'X')
plot_sensor_by_target(df_train, 'S1', 'Y')
plot_sensor_by_target(df_train, 'S1', 'M')
plot_sensor_by_target(df_train, 'S1', 'V')

# S2

plot_sensor_by_target(df_train, 'S2', 'X')
plot_sensor_by_target(df_train, 'S2', 'Y')
plot_sensor_by_target(df_train, 'S2', 'M')
plot_sensor_by_target(df_train, 'S2', 'V')

# S3

plot_sensor_by_target(df_train, 'S3', 'X')
plot_sensor_by_target(df_train, 'S3', 'Y')
plot_sensor_by_target(df_train, 'S3', 'M')
plot_sensor_by_target(df_train, 'S3', 'V')

# S4

plot_sensor_by_target(df_train, 'S4', 'X')
plot_sensor_by_target(df_train, 'S4', 'Y')
plot_sensor_by_target(df_train, 'S4', 'M')
plot_sensor_by_target(df_train, 'S4', 'V')

#######################
# feature engineering #
#######################

# initial time of S1, S2, S3, S4

# Builds a per-id table holding the Time of the first non-zero sample of
# `sensor`.
#
# df     : data frame with columns id, Time and the sensor column; rows are
#          taken in their existing order within each id.
# sensor : name of the sensor column (e.g. 'S1').
# Returns a data.frame with columns `id` (sorted unique ids) and
# `init_time_<sensor>`; ids whose signal never leaves zero get NA.
#
# The whole table is scanned once instead of re-filtering it for every
# sample of every id, and missing readings are skipped rather than raising an
# error inside `if`.
first_nonzero_time <- function(df, sensor) {
  # which() drops NA comparisons, so only genuinely non-zero samples remain.
  active <- df[which(df[[sensor]] != 0), c('id', 'Time')]
  # First remaining row of each id = earliest non-zero sample in row order.
  first_active <- active[!duplicated(active$id), ]

  ids <- sort(unique(df$id))
  out <- data.frame(id = ids)
  out[[paste0('init_time_', sensor)]] <- first_active$Time[match(ids, first_active$id)]
  out
}

df_init_time_S1 <- first_nonzero_time(df_total, 'S1')
write.csv(df_init_time_S1, 'df_init_time_S1.csv')

df_init_time_S2 <- first_nonzero_time(df_total, 'S2')
write.csv(df_init_time_S2, 'df_init_time_S2.csv')

df_init_time_S3 <- first_nonzero_time(df_total, 'S3')
write.csv(df_init_time_S3, 'df_init_time_S3.csv')

df_init_time_S4 <- first_nonzero_time(df_total, 'S4')
write.csv(df_init_time_S4, 'df_init_time_S4.csv')

# standard dev. of S1, S2, S3, S4

# One row per id: sample standard deviation of each sensor signal.
df_sd_S1 <- ungroup(summarise(group_by(df_total, id), sd_S1 = sd(S1)))

df_sd_S2 <- ungroup(summarise(group_by(df_total, id), sd_S2 = sd(S2)))

df_sd_S3 <- ungroup(summarise(group_by(df_total, id), sd_S3 = sd(S3)))

df_sd_S4 <- ungroup(summarise(group_by(df_total, id), sd_S4 = sd(S4)))

# mean of S1, S2, S3, S4

# One row per id: arithmetic mean of each sensor signal.
df_mean_S1 <- ungroup(summarise(group_by(df_total, id), mean_S1 = mean(S1)))

df_mean_S2 <- ungroup(summarise(group_by(df_total, id), mean_S2 = mean(S2)))

df_mean_S3 <- ungroup(summarise(group_by(df_total, id), mean_S3 = mean(S3)))

df_mean_S4 <- ungroup(summarise(group_by(df_total, id), mean_S4 = mean(S4)))

# max of S1, S2, S3, S4

# One row per id: largest value reached by each sensor signal.
df_max_S1 <- ungroup(summarise(group_by(df_total, id), max_S1 = max(S1)))

df_max_S2 <- ungroup(summarise(group_by(df_total, id), max_S2 = max(S2)))

df_max_S3 <- ungroup(summarise(group_by(df_total, id), max_S3 = max(S3)))

df_max_S4 <- ungroup(summarise(group_by(df_total, id), max_S4 = max(S4)))

# min of S1, S2, S3, S4

# One row per id: smallest value reached by each sensor signal.
df_min_S1 <- ungroup(summarise(group_by(df_total, id), min_S1 = min(S1)))

df_min_S2 <- ungroup(summarise(group_by(df_total, id), min_S2 = min(S2)))

df_min_S3 <- ungroup(summarise(group_by(df_total, id), min_S3 = min(S3)))

df_min_S4 <- ungroup(summarise(group_by(df_total, id), min_S4 = min(S4)))

# range of S1, S2, S3, S4

# One row per id: spread (maximum minus minimum) of each sensor signal.
# range() returns c(min, max), so diff(range(.)) is max - min.
df_range_S1 <- ungroup(summarise(group_by(df_total, id), range_S1 = diff(range(S1))))

df_range_S2 <- ungroup(summarise(group_by(df_total, id), range_S2 = diff(range(S2))))

df_range_S3 <- ungroup(summarise(group_by(df_total, id), range_S3 = diff(range(S3))))

df_range_S4 <- ungroup(summarise(group_by(df_total, id), range_S4 = diff(range(S4))))

# num. of increasing points of S1, S2, S3, S4

# Counts, for every id, the sample-to-sample steps of `sensor` that satisfy
# `is_step`.
#
# df       : data frame with an `id` column and the sensor column; rows are
#            assumed to be in time order within each id.
# sensor   : name of the sensor column (e.g. 'S1').
# is_step  : predicate applied to the first differences of the signal.
# out_name : name of the count column in the result.
# Returns a data.frame with columns `id` (ascending) and `out_name`.
#
# The previous implementation tested the sign of lag(S), i.e. of the previous
# sample's value, so it counted positive / negative samples instead of
# increasing / decreasing points. Differencing the signal gives the intended
# quantity. Steps touching a missing reading are ignored (na.rm = TRUE).
count_steps_by_id <- function(df, sensor, is_step, out_name) {
  signal_by_id <- split(df[[sensor]], df$id)
  step_counts <- vapply(
    signal_by_id,
    function(signal) sum(is_step(diff(signal)), na.rm = TRUE),
    numeric(1)
  )

  out <- data.frame(id = as.numeric(names(signal_by_id)))
  out[[out_name]] <- unname(step_counts)
  out
}

is_increase <- function(step) step > 0
is_decrease <- function(step) step < 0

# num. of increasing points of S1, S2, S3, S4

df_num_incre_S1 <- count_steps_by_id(df_total, 'S1', is_increase, 'num_incre_S1')
write.csv(df_num_incre_S1, 'df_num_incre_S1.csv')

df_num_incre_S2 <- count_steps_by_id(df_total, 'S2', is_increase, 'num_incre_S2')
write.csv(df_num_incre_S2, 'df_num_incre_S2.csv')

df_num_incre_S3 <- count_steps_by_id(df_total, 'S3', is_increase, 'num_incre_S3')
write.csv(df_num_incre_S3, 'df_num_incre_S3.csv')

df_num_incre_S4 <- count_steps_by_id(df_total, 'S4', is_increase, 'num_incre_S4')
write.csv(df_num_incre_S4, 'df_num_incre_S4.csv')

# num. of decreasing points of S1, S2, S3, S4

df_num_decre_S1 <- count_steps_by_id(df_total, 'S1', is_decrease, 'num_decre_S1')
write.csv(df_num_decre_S1, 'df_num_decre_S1.csv')

df_num_decre_S2 <- count_steps_by_id(df_total, 'S2', is_decrease, 'num_decre_S2')
write.csv(df_num_decre_S2, 'df_num_decre_S2.csv')

df_num_decre_S3 <- count_steps_by_id(df_total, 'S3', is_decrease, 'num_decre_S3')
write.csv(df_num_decre_S3, 'df_num_decre_S3.csv')

df_num_decre_S4 <- count_steps_by_id(df_total, 'S4', is_decrease, 'num_decre_S4')
write.csv(df_num_decre_S4, 'df_num_decre_S4.csv')

# num. of increasing points divided by num. of decreasing points of S1, S2, S3, S4
# Both count tables come from the same split of df_total, so their rows are
# in the same id order. An id with no decreasing step yields Inf (or NaN when
# it has no increasing step either).

df_prop_S1 <- data.frame(id = df_num_incre_S1$id,
                         prop_S1 = df_num_incre_S1$num_incre_S1 / df_num_decre_S1$num_decre_S1)

df_prop_S2 <- data.frame(id = df_num_incre_S2$id,
                         prop_S2 = df_num_incre_S2$num_incre_S2 / df_num_decre_S2$num_decre_S2)

df_prop_S3 <- data.frame(id = df_num_incre_S3$id,
                         prop_S3 = df_num_incre_S3$num_incre_S3 / df_num_decre_S3$num_decre_S3)

df_prop_S4 <- data.frame(id = df_num_incre_S4$id,
                         prop_S4 = df_num_incre_S4$num_incre_S4 / df_num_decre_S4$num_decre_S4)

# num. of turning points of S1, S2, S3, S4

# Counts, for every id, how often the direction of `sensor` changes between
# consecutive sample-to-sample steps.
#
# df       : data frame with an `id` column and the sensor column; rows are
#            assumed to be in time order within each id.
# sensor   : name of the sensor column (e.g. 'S1').
# out_name : name of the count column in the result.
# Returns a data.frame with columns `id` (ascending) and `out_name`.
#
# Each step is classified as rising (+1), falling (-1) or flat (0) and every
# change between neighbouring classes is counted, which is the same
# three-state rule the previous loop used. The previous loop, however,
# classified lag(S) - the previous sample's value - so it counted sign changes
# of the signal itself rather than turning points; it also iterated over
# 3:length(...), which runs backwards for series shorter than three samples.
count_turns_by_id <- function(df, sensor, out_name) {
  signal_by_id <- split(df[[sensor]], df$id)
  turn_counts <- vapply(
    signal_by_id,
    function(signal) {
      direction <- sign(diff(signal))
      # Compare each step with the next one; empty for fewer than 3 samples.
      sum(head(direction, -1) != tail(direction, -1), na.rm = TRUE)
    },
    numeric(1)
  )

  out <- data.frame(id = as.numeric(names(signal_by_id)))
  out[[out_name]] <- unname(turn_counts)
  out
}

df_turn_S1 <- count_turns_by_id(df_total, 'S1', 'turn_S1')
write.csv(df_turn_S1, 'df_turn_S1.csv')

df_turn_S2 <- count_turns_by_id(df_total, 'S2', 'turn_S2')
write.csv(df_turn_S2, 'df_turn_S2.csv')

df_turn_S3 <- count_turns_by_id(df_total, 'S3', 'turn_S3')
write.csv(df_turn_S3, 'df_turn_S3.csv')

df_turn_S4 <- count_turns_by_id(df_total, 'S4', 'turn_S4')
write.csv(df_turn_S4, 'df_turn_S4.csv')

 

< Modeling (R Code) >

 

#######################################
# library & setting for visualization #
#######################################

library(dplyr)
library(tidyr)
library(cdata)
library(stringr)
library(data.table)
library(ggplot2)
library(scales)
library(viridis)
library(digest)
library(corrplot)
library(lightgbm)
library(xgboost)
library(randomForest)
library(e1071)

##############################
# loading & checking dataset #
##############################

# Fix the RNG seed so the train/validation split and the random forests are
# reproducible from run to run.
set.seed(777)

# Directory that holds the competition CSV files. Kept in one place so the
# script can be pointed at another location by changing a single value.
data_dir <- 'C:/Users/user/Desktop/Project/KAERI/Data'

# Raw inputs: per-sample sensor readings (features), per-id targets, and the
# sample submission, which is used as the (placeholder) target table for test.
df_train_feature <- fread(file.path(data_dir, 'train_features.csv'))
df_train_target <- fread(file.path(data_dir, 'train_target.csv'))
df_test_feature <- fread(file.path(data_dir, 'test_features.csv'))
df_test_target <- fread(file.path(data_dir, 'sample_submission.csv'))

# Shift every id up by one so ids line up with 1-based row positions used
# later in the script.
# NOTE(review): presumably the raw ids are 0-based and contiguous across
# train and test - confirm against the data files.
df_train_feature$id <- df_train_feature$id + 1
df_train_target$id <- df_train_target$id + 1
df_test_feature$id <- df_test_feature$id + 1
df_test_target$id <- df_test_target$id + 1

#################
# preprocessing #
#################

# Attach the per-id targets to every sensor row. The join key is spelled out
# so the result does not depend on whichever column names the two tables
# happen to share, and so dplyr does not have to guess (and report) the key.
df_train <- df_train_feature %>%
  left_join(df_train_target, by = 'id') %>%
  as.data.frame()

df_test <- df_test_feature %>%
  left_join(df_test_target, by = 'id') %>%
  as.data.frame()

# Train rows first, then test rows, in one table for feature engineering.
df_total <- bind_rows(df_train, df_test)

#######################
# feature engineering #
#######################

# Folder holding the precomputed per-id feature CSVs (df_init_time_*,
# df_num_incre_*, df_num_decre_*, df_turn_*).
# NOTE(review): the feature-engineering script writes these files with
# write.csv() relative to its working directory - make sure they end up here.
feature_dir <- 'C:/Users/user/Desktop/Project/KAERI/Data'

# Reads one precomputed feature table and drops V1, the unnamed row-name
# column that write.csv() put in front of the data.
read_feature <- function(name) {
  fread(file.path(feature_dir, paste0(name, '.csv'))) %>%
    select(-V1)
}

# Summarises one column of `df` per id with `fun` and returns an ungrouped
# table with columns `id` and `out_name`.
summarise_by_id <- function(df, column, fun, out_name) {
  out <- df %>%
    group_by(id) %>%
    summarise(value = fun(.data[[column]])) %>%
    ungroup()
  names(out)[names(out) == 'value'] <- out_name
  out
}

# Spread of a signal: largest minus smallest value.
value_range <- function(x) max(x) - min(x)

# Ratio of increasing to decreasing counts for one sensor. The two tables are
# matched on id instead of relying on both being in 1..N row order. An id
# with a zero decreasing count yields Inf (or NaN for 0 / 0).
make_prop <- function(df_incre, df_decre, sensor) {
  counts <- inner_join(df_incre, df_decre, by = 'id')
  out <- data.frame(id = counts$id)
  out[[paste0('prop_', sensor)]] <-
    counts[[paste0('num_incre_', sensor)]] / counts[[paste0('num_decre_', sensor)]]
  out
}

# S1
df_init_time_S1 <- read_feature('df_init_time_S1')
df_num_decre_S1 <- read_feature('df_num_decre_S1')
df_num_incre_S1 <- read_feature('df_num_incre_S1')
df_turn_S1 <- read_feature('df_turn_S1')
df_sd_S1 <- summarise_by_id(df_total, 'S1', sd, 'sd_S1')
df_mean_S1 <- summarise_by_id(df_total, 'S1', mean, 'mean_S1')
df_max_S1 <- summarise_by_id(df_total, 'S1', max, 'max_S1')
df_min_S1 <- summarise_by_id(df_total, 'S1', min, 'min_S1')
df_range_S1 <- summarise_by_id(df_total, 'S1', value_range, 'range_S1')
df_prop_S1 <- make_prop(df_num_incre_S1, df_num_decre_S1, 'S1')

# S2
df_init_time_S2 <- read_feature('df_init_time_S2')
df_num_decre_S2 <- read_feature('df_num_decre_S2')
df_num_incre_S2 <- read_feature('df_num_incre_S2')
df_turn_S2 <- read_feature('df_turn_S2')
df_sd_S2 <- summarise_by_id(df_total, 'S2', sd, 'sd_S2')
df_mean_S2 <- summarise_by_id(df_total, 'S2', mean, 'mean_S2')
df_max_S2 <- summarise_by_id(df_total, 'S2', max, 'max_S2')
df_min_S2 <- summarise_by_id(df_total, 'S2', min, 'min_S2')
df_range_S2 <- summarise_by_id(df_total, 'S2', value_range, 'range_S2')
df_prop_S2 <- make_prop(df_num_incre_S2, df_num_decre_S2, 'S2')

# S3
df_init_time_S3 <- read_feature('df_init_time_S3')
df_num_decre_S3 <- read_feature('df_num_decre_S3')
df_num_incre_S3 <- read_feature('df_num_incre_S3')
df_turn_S3 <- read_feature('df_turn_S3')
df_sd_S3 <- summarise_by_id(df_total, 'S3', sd, 'sd_S3')
df_mean_S3 <- summarise_by_id(df_total, 'S3', mean, 'mean_S3')
df_max_S3 <- summarise_by_id(df_total, 'S3', max, 'max_S3')
df_min_S3 <- summarise_by_id(df_total, 'S3', min, 'min_S3')
df_range_S3 <- summarise_by_id(df_total, 'S3', value_range, 'range_S3')
df_prop_S3 <- make_prop(df_num_incre_S3, df_num_decre_S3, 'S3')

# S4
df_init_time_S4 <- read_feature('df_init_time_S4')
df_num_decre_S4 <- read_feature('df_num_decre_S4')
df_num_incre_S4 <- read_feature('df_num_incre_S4')
df_turn_S4 <- read_feature('df_turn_S4')
df_sd_S4 <- summarise_by_id(df_total, 'S4', sd, 'sd_S4')
df_mean_S4 <- summarise_by_id(df_total, 'S4', mean, 'mean_S4')
df_max_S4 <- summarise_by_id(df_total, 'S4', max, 'max_S4')
df_min_S4 <- summarise_by_id(df_total, 'S4', min, 'min_S4')
df_range_S4 <- summarise_by_id(df_total, 'S4', value_range, 'range_S4')
df_prop_S4 <- make_prop(df_num_incre_S4, df_num_decre_S4, 'S4')

# The per-id max / min tables (df_max_S*, df_min_S*) are already built earlier
# in this script from the same df_total, so they are not recomputed here.

# Feature tables in the column order of the final design table. Every table
# has one row per id.
# NOTE(review): the df_mean_S* tables are computed above but were never part
# of this join; that is kept as-is - confirm whether it is intentional.
feature_tables <- list(
  df_init_time_S1, df_init_time_S2, df_init_time_S3, df_init_time_S4,
  df_num_decre_S1, df_num_decre_S2, df_num_decre_S3, df_num_decre_S4,
  df_num_incre_S1, df_num_incre_S2, df_num_incre_S3, df_num_incre_S4,
  df_prop_S1, df_prop_S2, df_prop_S3, df_prop_S4,
  df_range_S1, df_range_S2, df_range_S3, df_range_S4,
  df_sd_S1, df_sd_S2, df_sd_S3, df_sd_S4,
  df_min_S1, df_min_S2, df_min_S3, df_min_S4,
  df_max_S1, df_max_S2, df_max_S3, df_max_S4,
  df_turn_S1, df_turn_S2, df_turn_S3, df_turn_S4
)

# Left-join everything onto the first table, always on the explicit id key.
df_total_x <- Reduce(
  function(joined, feature_table) left_join(joined, feature_table, by = 'id'),
  feature_tables
)

###########################
# correlation of features #
###########################

# Correlation matrix of the engineered features and the four targets,
# restricted to the training ids (right join onto the target table) and drawn
# as an upper-triangle corrplot. The join key is explicit so it cannot pick up
# any other shared column name.
df_total_x %>%
  right_join(df_train_target, by = 'id') %>%
  cor() %>%
  corrplot(type = 'upper')

########################################
# splitting total into train and valid #
########################################

num_train <- nrow(df_train_target)

# The split below picks rows of the feature table and of the target table by
# the same positions, and treats the first num_train feature rows as the
# training ids. Fail early if the two tables are not aligned that way.
stopifnot(all(df_total_x$id[seq_len(num_train)] == df_train_target$id))

# Random 75 % / 25 % split of the training ids into train and validation.
id_train <- sample(seq_len(num_train), num_train * 0.75)
id_valid <- setdiff(seq_len(num_train), id_train)

# The id column is only a row identifier, so it is removed from the
# predictor tables; otherwise the models can split on it.
df_train_x <- df_total_x[id_train, ] %>% select(-id)
df_train_y <- df_train_target[id_train, ]
df_valid_x <- df_total_x[id_valid, ] %>% select(-id)
df_valid_y <- df_train_target[id_valid, ]

# Everything after the training rows belongs to the test set.
df_test_x <- df_total_x[(num_train + 1):nrow(df_total_x), ] %>% select(-id)

########################
# function of modeling #
########################

# Fits a LightGBM gradient-boosted tree regressor.
#
# x       : predictor table; converted to a matrix before training.
# target  : response values, one per row of `x`.
# nrounds : number of boosting rounds.
# Returns the fitted LightGBM model; the reported metric is MAE.
train_lgbm <- function(x, target, nrounds) {
  lgb_params <- list(boosting_type = 'gbdt',
                     objective = 'regression',
                     metric = 'mae')

  train_set <- lgb.Dataset(data = as.matrix(x),
                           label = as.matrix(target))

  lightgbm(params = lgb_params,
           data = train_set,
           nrounds = nrounds)
}



# Fits an XGBoost regressor with squared-error loss.
#
# x       : predictor table; converted with data.matrix() before training.
# target  : numeric response, one value per row of `x`.
# nrounds : number of boosting rounds.
# Returns the fitted xgboost model; the evaluation metric is MAE.
train_xgb <- function(x, target, nrounds) {
  xgboost(data = data.matrix(x),
          label = target,
          nrounds = nrounds,
          # 'reg:linear' is the deprecated alias of this same objective.
          objective = 'reg:squarederror',
          eval_metric = 'mae')
}

# Fits a random forest with the package defaults.
#
# x      : predictor table.
# target : response values, one per row of `x`.
# Returns the fitted randomForest model.
train_rf <- function(x, target) {
  randomForest(x = x, y = target)
}

##############
# mega model #
##############

# X

# Base learners for target X, fitted on the training split
# (250 boosting rounds for LightGBM and XGBoost).
model_X_lgbm <- train_lgbm(x = df_train_x, target = df_train_y[['X']], nrounds = 250)
model_X_xgb <- train_xgb(x = df_train_x, target = df_train_y[['X']], nrounds = 250)
model_X_rf <- train_rf(x = df_train_x, target = df_train_y[['X']])

# Their predictions on the validation split.
pred_X_lgbm <- predict(model_X_lgbm, as.matrix(df_valid_x))
pred_X_xgb <- predict(model_X_xgb, as.matrix(df_valid_x))
pred_X_rf <- predict(model_X_rf, df_valid_x)

# Y

# Base learners for target Y, fitted on the training split
# (250 boosting rounds for LightGBM and XGBoost).
model_Y_lgbm <- train_lgbm(x = df_train_x, target = df_train_y[['Y']], nrounds = 250)
model_Y_xgb <- train_xgb(x = df_train_x, target = df_train_y[['Y']], nrounds = 250)
model_Y_rf <- train_rf(x = df_train_x, target = df_train_y[['Y']])

# Their predictions on the validation split.
pred_Y_lgbm <- predict(model_Y_lgbm, as.matrix(df_valid_x))
pred_Y_xgb <- predict(model_Y_xgb, as.matrix(df_valid_x))
pred_Y_rf <- predict(model_Y_rf, df_valid_x)

# M

# Base learners for target M, fitted on the training split
# (250 boosting rounds for LightGBM and XGBoost).
model_M_lgbm <- train_lgbm(x = df_train_x, target = df_train_y[['M']], nrounds = 250)
model_M_xgb <- train_xgb(x = df_train_x, target = df_train_y[['M']], nrounds = 250)
model_M_rf <- train_rf(x = df_train_x, target = df_train_y[['M']])

# Their predictions on the validation split.
pred_M_lgbm <- predict(model_M_lgbm, as.matrix(df_valid_x))
pred_M_xgb <- predict(model_M_xgb, as.matrix(df_valid_x))
pred_M_rf <- predict(model_M_rf, df_valid_x)

# V

# Base learners for target V, fitted on the training split
# (250 boosting rounds for LightGBM and XGBoost).
model_V_lgbm <- train_lgbm(x = df_train_x, target = df_train_y[['V']], nrounds = 250)
model_V_xgb <- train_xgb(x = df_train_x, target = df_train_y[['V']], nrounds = 250)
model_V_rf <- train_rf(x = df_train_x, target = df_train_y[['V']])

# Their predictions on the validation split.
pred_V_lgbm <- predict(model_V_lgbm, as.matrix(df_valid_x))
pred_V_xgb <- predict(model_V_xgb, as.matrix(df_valid_x))
pred_V_rf <- predict(model_V_rf, df_valid_x)

# mega dataset

# Stacking table: one column per base-model prediction (12 in total), one row
# per validation id. Column names are the names of the prediction vectors.
df_mega <- data.frame(
  pred_X_lgbm = pred_X_lgbm, pred_X_xgb = pred_X_xgb, pred_X_rf = pred_X_rf,
  pred_Y_lgbm = pred_Y_lgbm, pred_Y_xgb = pred_Y_xgb, pred_Y_rf = pred_Y_rf,
  pred_M_lgbm = pred_M_lgbm, pred_M_xgb = pred_M_xgb, pred_M_rf = pred_M_rf,
  pred_V_lgbm = pred_V_lgbm, pred_V_xgb = pred_V_xgb, pred_V_rf = pred_V_rf
)

# mega model

# Second-level (stacked) XGBoost models: each target is predicted from all 12
# base-model predictions, trained on the validation split's true targets.
# X and M use 25 boosting rounds, Y and V use 50.
mega_model_X <- train_xgb(x = df_mega, target = df_valid_y[['X']], nrounds = 25)
mega_model_Y <- train_xgb(x = df_mega, target = df_valid_y[['Y']], nrounds = 50)
mega_model_M <- train_xgb(x = df_mega, target = df_valid_y[['M']], nrounds = 25)
mega_model_V <- train_xgb(x = df_mega, target = df_valid_y[['V']], nrounds = 50)

##############
# submission #
##############

# Base-model predictions on the test features. The variable names are reused
# on purpose: data.frame() below derives the column names from them, and the
# stacked models expect the same column names they were trained on.
pred_X_lgbm <- predict(model_X_lgbm, as.matrix(df_test_x))
pred_X_xgb <- predict(model_X_xgb, as.matrix(df_test_x))
pred_X_rf <- predict(model_X_rf, df_test_x)

pred_Y_lgbm <- predict(model_Y_lgbm, as.matrix(df_test_x))
pred_Y_xgb <- predict(model_Y_xgb, as.matrix(df_test_x))
pred_Y_rf <- predict(model_Y_rf, df_test_x)

pred_M_lgbm <- predict(model_M_lgbm, as.matrix(df_test_x))
pred_M_xgb <- predict(model_M_xgb, as.matrix(df_test_x))
pred_M_rf <- predict(model_M_rf, df_test_x)

pred_V_lgbm <- predict(model_V_lgbm, as.matrix(df_test_x))
pred_V_xgb <- predict(model_V_xgb, as.matrix(df_test_x))
pred_V_rf <- predict(model_V_rf, df_test_x)

# Stacking table for the test set (same 12 columns, same order).
df_mega <- data.frame(pred_X_lgbm, pred_X_xgb, pred_X_rf,
                      pred_Y_lgbm, pred_Y_xgb, pred_Y_rf,
                      pred_M_lgbm, pred_M_xgb, pred_M_rf,
                      pred_V_lgbm, pred_V_xgb, pred_V_rf)

# Final predictions from the stacked models.
df_test_target$X <- predict(mega_model_X, as.matrix(df_mega))
df_test_target$Y <- predict(mega_model_Y, as.matrix(df_mega))
df_test_target$M <- predict(mega_model_M, as.matrix(df_mega))
df_test_target$V <- predict(mega_model_V, as.matrix(df_mega))

# Build the file to submit from a copy of the prediction table:
#  * undo the +1 shift applied to id right after loading, so the ids match the
#    ones in the sample submission that was read in;
#  * do not write row names, which would add an extra unnamed first column
#    that the sample submission does not have.
df_submission <- as.data.frame(df_test_target)
df_submission$id <- df_submission$id - 1

write.csv(df_submission, 'sample_submission.csv', row.names = FALSE)