# < EDA & Feature Engineering (R Code) >
#######################################
# library & setting for visualization #
#######################################
library(dplyr)
library(tidyr)
library(cdata)
library(stringr)
library(data.table)
library(ggplot2)
library(scales)
library(viridis)
library(digest)
# Build the shared plot theme once, then install it as the session default:
# minimal base theme, dashed grey major grid lines, no minor grid, legend on
# top with a dotted frame, and bold grey facet strip labels.
plot_theme <- theme_minimal() +
  theme(
    plot.title = element_text(face = 'bold', colour = 'grey10'),
    plot.subtitle = element_text(colour = 'grey25'),
    panel.grid.major = element_line(colour = 'grey50', size = 0.25, linetype = 'dashed'),
    panel.grid.minor = element_blank(),
    legend.position = 'top',
    legend.spacing.x = unit(0.125, 'cm'),
    legend.background = element_rect(fill = NULL, linetype = 'dotted'),
    strip.background = element_blank(),
    strip.text = element_text(face = 'bold', colour = 'grey25', size = 11.25)
  )
theme_set(plot_theme)
# Colour palette: two accent colours and a dark grey.
colour_list <- c('#AE5CFF', '#5CFFA5', 'grey25')
##############################
# loading & checking dataset #
##############################
# Directory that holds the competition CSV files. Kept in one place so the
# script can be pointed at another location without editing every path.
data_dir <- 'C:/Users/user/Desktop/Project/KAERI/Data'
df_train_feature <- fread(file.path(data_dir, 'train_features.csv'))
df_train_target <- fread(file.path(data_dir, 'train_target.csv'))
df_test_feature <- fread(file.path(data_dir, 'test_features.csv'))
# The sample submission supplies the id / X / Y / M / V layout for the test set.
df_test_target <- fread(file.path(data_dir, 'sample_submission.csv'))
# Shift every id by +1 in all four tables so they stay consistent with each
# other (presumably the raw ids are 0-based and the code below wants 1-based
# ids - TODO confirm against the raw files).
df_train_feature$id <- df_train_feature$id + 1
df_train_target$id <- df_train_target$id + 1
df_test_feature$id <- df_test_feature$id + 1
df_test_target$id <- df_test_target$id + 1
#################
# preprocessing #
#################
# Attach the targets to every feature row. The join key is spelled out so the
# result cannot silently change if the two tables ever share another column
# name, and so dplyr does not have to guess (and report) the key.
df_train <- df_train_feature %>%
  left_join(df_train_target, by = 'id') %>%
  as.data.frame()
df_test <- df_test_feature %>%
  left_join(df_test_target, by = 'id') %>%
  as.data.frame()
# Train rows first, test rows after them.
df_total <- bind_rows(df_train, df_test)
#######
# eda #
#######
# Number of rows per id in the joined training table.
df_train %>%
  count(id) %>%
  as.data.frame()
# Number of target rows for every distinct (X, Y, M, V) combination.
df_train_target %>%
  count(X, Y, M, V) %>%
  as.data.frame()
# Frequency of each individual target value.
count(group_by(df_train_target, X))
count(group_by(df_train_target, Y))
count(group_by(df_train_target, M))
count(group_by(df_train_target, V))
# Distinct target values present in the training table.
unique(df_train[['X']])
unique(df_train[['Y']])
unique(df_train[['M']])
unique(df_train[['V']])
# S1
# S1 against Time, one line per id, faceted by each target variable in turn.
ggplot(df_train, aes(x = Time, y = S1, group = id)) +
  geom_line() +
  facet_wrap(vars(X))
ggplot(df_train, aes(x = Time, y = S1, group = id)) +
  geom_line() +
  facet_wrap(vars(Y))
ggplot(df_train, aes(x = Time, y = S1, group = id)) +
  geom_line() +
  facet_wrap(vars(M))
ggplot(df_train, aes(x = Time, y = S1, group = id)) +
  geom_line() +
  facet_wrap(vars(V))
# S2
# S2 against Time, one line per id, faceted by each target variable in turn.
ggplot(df_train, aes(x = Time, y = S2, group = id)) +
  geom_line() +
  facet_wrap(vars(X))
ggplot(df_train, aes(x = Time, y = S2, group = id)) +
  geom_line() +
  facet_wrap(vars(Y))
ggplot(df_train, aes(x = Time, y = S2, group = id)) +
  geom_line() +
  facet_wrap(vars(M))
ggplot(df_train, aes(x = Time, y = S2, group = id)) +
  geom_line() +
  facet_wrap(vars(V))
# S3
# S3 against Time, one line per id, faceted by each target variable in turn.
ggplot(df_train, aes(x = Time, y = S3, group = id)) +
  geom_line() +
  facet_wrap(vars(X))
ggplot(df_train, aes(x = Time, y = S3, group = id)) +
  geom_line() +
  facet_wrap(vars(Y))
ggplot(df_train, aes(x = Time, y = S3, group = id)) +
  geom_line() +
  facet_wrap(vars(M))
ggplot(df_train, aes(x = Time, y = S3, group = id)) +
  geom_line() +
  facet_wrap(vars(V))
# S4
# S4 against Time, one line per id, faceted by each target variable in turn.
ggplot(df_train, aes(x = Time, y = S4, group = id)) +
  geom_line() +
  facet_wrap(vars(X))
ggplot(df_train, aes(x = Time, y = S4, group = id)) +
  geom_line() +
  facet_wrap(vars(Y))
ggplot(df_train, aes(x = Time, y = S4, group = id)) +
  geom_line() +
  facet_wrap(vars(M))
ggplot(df_train, aes(x = Time, y = S4, group = id)) +
  geom_line() +
  facet_wrap(vars(V))
#######################
# feature engineering #
#######################
# initial time of S1, S2, S3, S4
# For every id, record the Time of the first sample whose sensor value is
# non-zero (NA when the sensor never leaves zero) and save one CSV per sensor.
# A grouped summarise replaces the nested loops, which re-filtered df_total
# for every single sample and assumed the ids were exactly 1..N.

# Time stamp of the first non-zero entry of `signal`; NA if there is none.
first_nonzero_time <- function(signal, time) {
  time[which(signal != 0)[1]]
}

df_init_time_S1 <- df_total %>%
  group_by(id) %>%
  summarise(init_time_S1 = first_nonzero_time(S1, Time)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_init_time_S1, 'df_init_time_S1.csv')

df_init_time_S2 <- df_total %>%
  group_by(id) %>%
  summarise(init_time_S2 = first_nonzero_time(S2, Time)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_init_time_S2, 'df_init_time_S2.csv')

df_init_time_S3 <- df_total %>%
  group_by(id) %>%
  summarise(init_time_S3 = first_nonzero_time(S3, Time)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_init_time_S3, 'df_init_time_S3.csv')

df_init_time_S4 <- df_total %>%
  group_by(id) %>%
  summarise(init_time_S4 = first_nonzero_time(S4, Time)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_init_time_S4, 'df_init_time_S4.csv')
# standard dev. of S1, S2, S3, S4
# One row per id holding the standard deviation of that id's sensor signal.
signals_by_id <- group_by(df_total, id)
df_sd_S1 <- ungroup(summarise(signals_by_id, sd_S1 = sd(S1)))
df_sd_S2 <- ungroup(summarise(signals_by_id, sd_S2 = sd(S2)))
df_sd_S3 <- ungroup(summarise(signals_by_id, sd_S3 = sd(S3)))
df_sd_S4 <- ungroup(summarise(signals_by_id, sd_S4 = sd(S4)))
# mean of S1, S2, S3, S4
# One row per id holding the mean of that id's sensor signal.
signals_by_id <- group_by(df_total, id)
df_mean_S1 <- ungroup(summarise(signals_by_id, mean_S1 = mean(S1)))
df_mean_S2 <- ungroup(summarise(signals_by_id, mean_S2 = mean(S2)))
df_mean_S3 <- ungroup(summarise(signals_by_id, mean_S3 = mean(S3)))
df_mean_S4 <- ungroup(summarise(signals_by_id, mean_S4 = mean(S4)))
# max of S1, S2, S3, S4
# One row per id holding the largest value of that id's sensor signal.
signals_by_id <- group_by(df_total, id)
df_max_S1 <- ungroup(summarise(signals_by_id, max_S1 = max(S1)))
df_max_S2 <- ungroup(summarise(signals_by_id, max_S2 = max(S2)))
df_max_S3 <- ungroup(summarise(signals_by_id, max_S3 = max(S3)))
df_max_S4 <- ungroup(summarise(signals_by_id, max_S4 = max(S4)))
# min of S1, S2, S3, S4
# One row per id holding the smallest value of that id's sensor signal.
signals_by_id <- group_by(df_total, id)
df_min_S1 <- ungroup(summarise(signals_by_id, min_S1 = min(S1)))
df_min_S2 <- ungroup(summarise(signals_by_id, min_S2 = min(S2)))
df_min_S3 <- ungroup(summarise(signals_by_id, min_S3 = min(S3)))
df_min_S4 <- ungroup(summarise(signals_by_id, min_S4 = min(S4)))
# range of S1, S2, S3, S4
# One row per id holding max - min of that id's sensor signal
# (diff(range(x)) is the same quantity as max(x) - min(x)).
signals_by_id <- group_by(df_total, id)
df_range_S1 <- ungroup(summarise(signals_by_id, range_S1 = diff(range(S1))))
df_range_S2 <- ungroup(summarise(signals_by_id, range_S2 = diff(range(S2))))
df_range_S3 <- ungroup(summarise(signals_by_id, range_S3 = diff(range(S3))))
df_range_S4 <- ungroup(summarise(signals_by_id, range_S4 = diff(range(S4))))
# num. of increasing points of S1, S2, S3, S4
# For every id, count the samples (all but the last one of the series, i.e.
# the lagged signal) whose value is strictly positive, then save one CSV per
# sensor. A grouped summarise replaces the per-id loop, which re-filtered
# df_total twice per id and assumed the ids were exactly 1..N.
# NOTE(review): the feature is labelled "increasing points", but the logic
# (kept unchanged here) counts positive *values*, not positive differences
# between consecutive samples - confirm the intent before changing it, since
# the saved CSVs feed the modeling script.

# Number of strictly positive entries among all but the last sample.
count_positive <- function(signal) {
  sum(head(signal, -1) > 0, na.rm = TRUE)
}

df_num_incre_S1 <- df_total %>%
  group_by(id) %>%
  summarise(num_incre_S1 = count_positive(S1)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_incre_S1, 'df_num_incre_S1.csv')

df_num_incre_S2 <- df_total %>%
  group_by(id) %>%
  summarise(num_incre_S2 = count_positive(S2)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_incre_S2, 'df_num_incre_S2.csv')

df_num_incre_S3 <- df_total %>%
  group_by(id) %>%
  summarise(num_incre_S3 = count_positive(S3)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_incre_S3, 'df_num_incre_S3.csv')

df_num_incre_S4 <- df_total %>%
  group_by(id) %>%
  summarise(num_incre_S4 = count_positive(S4)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_incre_S4, 'df_num_incre_S4.csv')
# num. of decreasing points of S1, S2, S3, S4
# For every id, count the samples (all but the last one of the series, i.e.
# the lagged signal) whose value is strictly negative, then save one CSV per
# sensor. A grouped summarise replaces the per-id loop, which re-filtered
# df_total twice per id and assumed the ids were exactly 1..N.
# NOTE(review): the feature is labelled "decreasing points", but the logic
# (kept unchanged here) counts negative *values*, not negative differences
# between consecutive samples - confirm the intent before changing it, since
# the saved CSVs feed the modeling script.

# Number of strictly negative entries among all but the last sample.
count_negative <- function(signal) {
  sum(head(signal, -1) < 0, na.rm = TRUE)
}

df_num_decre_S1 <- df_total %>%
  group_by(id) %>%
  summarise(num_decre_S1 = count_negative(S1)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_decre_S1, 'df_num_decre_S1.csv')

df_num_decre_S2 <- df_total %>%
  group_by(id) %>%
  summarise(num_decre_S2 = count_negative(S2)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_decre_S2, 'df_num_decre_S2.csv')

df_num_decre_S3 <- df_total %>%
  group_by(id) %>%
  summarise(num_decre_S3 = count_negative(S3)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_decre_S3, 'df_num_decre_S3.csv')

df_num_decre_S4 <- df_total %>%
  group_by(id) %>%
  summarise(num_decre_S4 = count_negative(S4)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_num_decre_S4, 'df_num_decre_S4.csv')
# num. of increasing points divided by num. of decreasing points of S1, S2, S3, S4
# The ids are taken from the count tables themselves instead of an assumed
# 1..N sequence, and the two tables are checked to line up row by row, so a
# length mismatch cannot be hidden by data.frame() recycling.
# A zero denominator yields Inf (or NaN for 0 / 0), as in plain R division.
stopifnot(
  nrow(df_num_incre_S1) == nrow(df_num_decre_S1),
  all(df_num_incre_S1$id == df_num_decre_S1$id),
  nrow(df_num_incre_S2) == nrow(df_num_decre_S2),
  all(df_num_incre_S2$id == df_num_decre_S2$id),
  nrow(df_num_incre_S3) == nrow(df_num_decre_S3),
  all(df_num_incre_S3$id == df_num_decre_S3$id),
  nrow(df_num_incre_S4) == nrow(df_num_decre_S4),
  all(df_num_incre_S4$id == df_num_decre_S4$id)
)
df_prop_S1 <- data.frame(id = df_num_incre_S1$id,
                         prop_S1 = df_num_incre_S1$num_incre_S1 / df_num_decre_S1$num_decre_S1)
df_prop_S2 <- data.frame(id = df_num_incre_S2$id,
                         prop_S2 = df_num_incre_S2$num_incre_S2 / df_num_decre_S2$num_decre_S2)
df_prop_S3 <- data.frame(id = df_num_incre_S3$id,
                         prop_S3 = df_num_incre_S3$num_incre_S3 / df_num_decre_S3$num_decre_S3)
df_prop_S4 <- data.frame(id = df_num_incre_S4$id,
                         prop_S4 = df_num_incre_S4$num_incre_S4 / df_num_decre_S4$num_decre_S4)
# num. of turning points of S1, S2, S3, S4
# For every id, count how often the sign class (positive / negative / zero)
# changes between consecutive samples, looking at all but the last sample of
# the series (the lagged signal), then save one CSV per sensor.
# Zero is its own class, so positive -> zero -> positive counts as two changes.
# A grouped summarise replaces the nested loops, which re-filtered df_total
# per id, assumed ids were exactly 1..N, and iterated 3:length(x), a sequence
# that runs backwards when a series has fewer than three samples.

# Number of sign-class changes between neighbouring samples, last one excluded.
count_sign_changes <- function(signal) {
  sign_class <- sign(head(signal, -1))
  sum(diff(sign_class) != 0)
}

df_turn_S1 <- df_total %>%
  group_by(id) %>%
  summarise(turn_S1 = count_sign_changes(S1)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_turn_S1, 'df_turn_S1.csv')

df_turn_S2 <- df_total %>%
  group_by(id) %>%
  summarise(turn_S2 = count_sign_changes(S2)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_turn_S2, 'df_turn_S2.csv')

df_turn_S3 <- df_total %>%
  group_by(id) %>%
  summarise(turn_S3 = count_sign_changes(S3)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_turn_S3, 'df_turn_S3.csv')

df_turn_S4 <- df_total %>%
  group_by(id) %>%
  summarise(turn_S4 = count_sign_changes(S4)) %>%
  ungroup() %>%
  as.data.frame()
write.csv(df_turn_S4, 'df_turn_S4.csv')
# < Modeling (R Code) >
#######################################
# library & setting for visualization #
#######################################
library(dplyr)
library(tidyr)
library(cdata)
library(stringr)
library(data.table)
library(ggplot2)
library(scales)
library(viridis)
library(digest)
library(corrplot)
library(lightgbm)
library(xgboost)
library(randomForest)
library(e1071)
##############################
# loading & checking dataset #
##############################
# Fix the RNG seed so the sampling and model fitting below are repeatable.
set.seed(777)
# Directory that holds the competition CSV files. Kept in one place so the
# script can be pointed at another location without editing every path.
data_dir <- 'C:/Users/user/Desktop/Project/KAERI/Data'
df_train_feature <- fread(file.path(data_dir, 'train_features.csv'))
df_train_target <- fread(file.path(data_dir, 'train_target.csv'))
df_test_feature <- fread(file.path(data_dir, 'test_features.csv'))
# The sample submission supplies the id / X / Y / M / V layout for the test set.
df_test_target <- fread(file.path(data_dir, 'sample_submission.csv'))
# Shift every id by +1 in all four tables so they stay consistent with each
# other (presumably the raw ids are 0-based and the code below wants 1-based
# ids - TODO confirm against the raw files).
df_train_feature$id <- df_train_feature$id + 1
df_train_target$id <- df_train_target$id + 1
df_test_feature$id <- df_test_feature$id + 1
df_test_target$id <- df_test_target$id + 1
#################
# preprocessing #
#################
# Attach the targets to every feature row. The join key is spelled out so the
# result cannot silently change if the two tables ever share another column
# name, and so dplyr does not have to guess (and report) the key.
df_train <- df_train_feature %>%
  left_join(df_train_target, by = 'id') %>%
  as.data.frame()
df_test <- df_test_feature %>%
  left_join(df_test_target, by = 'id') %>%
  as.data.frame()
# Train rows first, test rows after them.
df_total <- bind_rows(df_train, df_test)
#######################
# feature engineering #
#######################
# ---- S1 features ----
# Loop-derived S1 features are read back from previously saved CSV files.
# V1 is the name fread gives to the unnamed first column (presumably the
# row-name column written by write.csv - confirm against the files).
df_init_time_S1 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_init_time_S1.csv') %>%
  select(-V1)
df_num_decre_S1 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_decre_S1.csv') %>%
  select(-V1)
df_num_incre_S1 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_incre_S1.csv') %>%
  select(-V1)
df_turn_S1 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_turn_S1.csv') %>%
  select(-V1)
# Per-id summary statistics of S1, recomputed from the raw signal.
df_sd_S1 <- df_total %>%
  group_by(id) %>%
  summarise(sd_S1 = sd(S1)) %>%
  ungroup()
df_mean_S1 <- df_total %>%
  group_by(id) %>%
  summarise(mean_S1 = mean(S1)) %>%
  ungroup()
df_max_S1 <- df_total %>%
  group_by(id) %>%
  summarise(max_S1 = max(S1)) %>%
  ungroup()
df_min_S1 <- df_total %>%
  group_by(id) %>%
  summarise(min_S1 = min(S1)) %>%
  ungroup()
df_range_S1 <- df_total %>%
  group_by(id) %>%
  summarise(range_S1 = max(S1) - min(S1)) %>%
  ungroup()
# Ratio of the two counts. Ids come from the loaded count table rather than an
# assumed 1..N sequence over df_total, and the two tables are checked to line
# up, so a row-count mismatch cannot be hidden by data.frame() recycling.
stopifnot(nrow(df_num_incre_S1) == nrow(df_num_decre_S1),
          all(df_num_incre_S1$id == df_num_decre_S1$id))
df_prop_S1 <- data.frame(id = df_num_incre_S1$id,
                         prop_S1 = df_num_incre_S1$num_incre_S1 / df_num_decre_S1$num_decre_S1)
# ---- S2 features ----
# Loop-derived S2 features are read back from previously saved CSV files.
# V1 is the name fread gives to the unnamed first column (presumably the
# row-name column written by write.csv - confirm against the files).
df_init_time_S2 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_init_time_S2.csv') %>%
  select(-V1)
df_num_decre_S2 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_decre_S2.csv') %>%
  select(-V1)
df_num_incre_S2 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_incre_S2.csv') %>%
  select(-V1)
df_turn_S2 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_turn_S2.csv') %>%
  select(-V1)
# Per-id summary statistics of S2, recomputed from the raw signal.
df_sd_S2 <- df_total %>%
  group_by(id) %>%
  summarise(sd_S2 = sd(S2)) %>%
  ungroup()
df_mean_S2 <- df_total %>%
  group_by(id) %>%
  summarise(mean_S2 = mean(S2)) %>%
  ungroup()
df_max_S2 <- df_total %>%
  group_by(id) %>%
  summarise(max_S2 = max(S2)) %>%
  ungroup()
df_min_S2 <- df_total %>%
  group_by(id) %>%
  summarise(min_S2 = min(S2)) %>%
  ungroup()
df_range_S2 <- df_total %>%
  group_by(id) %>%
  summarise(range_S2 = max(S2) - min(S2)) %>%
  ungroup()
# Ratio of the two counts. Ids come from the loaded count table rather than an
# assumed 1..N sequence over df_total, and the two tables are checked to line
# up, so a row-count mismatch cannot be hidden by data.frame() recycling.
stopifnot(nrow(df_num_incre_S2) == nrow(df_num_decre_S2),
          all(df_num_incre_S2$id == df_num_decre_S2$id))
df_prop_S2 <- data.frame(id = df_num_incre_S2$id,
                         prop_S2 = df_num_incre_S2$num_incre_S2 / df_num_decre_S2$num_decre_S2)
# ---- S3 features ----
# Loop-derived S3 features are read back from previously saved CSV files.
# V1 is the name fread gives to the unnamed first column (presumably the
# row-name column written by write.csv - confirm against the files).
df_init_time_S3 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_init_time_S3.csv') %>%
  select(-V1)
df_num_decre_S3 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_decre_S3.csv') %>%
  select(-V1)
df_num_incre_S3 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_incre_S3.csv') %>%
  select(-V1)
df_turn_S3 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_turn_S3.csv') %>%
  select(-V1)
# Per-id summary statistics of S3, recomputed from the raw signal.
df_sd_S3 <- df_total %>%
  group_by(id) %>%
  summarise(sd_S3 = sd(S3)) %>%
  ungroup()
df_mean_S3 <- df_total %>%
  group_by(id) %>%
  summarise(mean_S3 = mean(S3)) %>%
  ungroup()
df_max_S3 <- df_total %>%
  group_by(id) %>%
  summarise(max_S3 = max(S3)) %>%
  ungroup()
df_min_S3 <- df_total %>%
  group_by(id) %>%
  summarise(min_S3 = min(S3)) %>%
  ungroup()
df_range_S3 <- df_total %>%
  group_by(id) %>%
  summarise(range_S3 = max(S3) - min(S3)) %>%
  ungroup()
# Ratio of the two counts. Ids come from the loaded count table rather than an
# assumed 1..N sequence over df_total, and the two tables are checked to line
# up, so a row-count mismatch cannot be hidden by data.frame() recycling.
stopifnot(nrow(df_num_incre_S3) == nrow(df_num_decre_S3),
          all(df_num_incre_S3$id == df_num_decre_S3$id))
df_prop_S3 <- data.frame(id = df_num_incre_S3$id,
                         prop_S3 = df_num_incre_S3$num_incre_S3 / df_num_decre_S3$num_decre_S3)
# ---- S4 features ----
# Loop-derived S4 features are read back from previously saved CSV files.
# V1 is the name fread gives to the unnamed first column (presumably the
# row-name column written by write.csv - confirm against the files).
df_init_time_S4 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_init_time_S4.csv') %>%
  select(-V1)
df_num_decre_S4 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_decre_S4.csv') %>%
  select(-V1)
df_num_incre_S4 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_num_incre_S4.csv') %>%
  select(-V1)
df_turn_S4 <- fread('C:/Users/user/Desktop/Project/KAERI/Data/df_turn_S4.csv') %>%
  select(-V1)
# Per-id summary statistics of S4, recomputed from the raw signal.
df_sd_S4 <- df_total %>%
  group_by(id) %>%
  summarise(sd_S4 = sd(S4)) %>%
  ungroup()
df_mean_S4 <- df_total %>%
  group_by(id) %>%
  summarise(mean_S4 = mean(S4)) %>%
  ungroup()
df_max_S4 <- df_total %>%
  group_by(id) %>%
  summarise(max_S4 = max(S4)) %>%
  ungroup()
df_min_S4 <- df_total %>%
  group_by(id) %>%
  summarise(min_S4 = min(S4)) %>%
  ungroup()
df_range_S4 <- df_total %>%
  group_by(id) %>%
  summarise(range_S4 = max(S4) - min(S4)) %>%
  ungroup()
# Ratio of the two counts. Ids come from the loaded count table rather than an
# assumed 1..N sequence over df_total, and the two tables are checked to line
# up, so a row-count mismatch cannot be hidden by data.frame() recycling.
stopifnot(nrow(df_num_incre_S4) == nrow(df_num_decre_S4),
          all(df_num_incre_S4$id == df_num_decre_S4$id))
df_prop_S4 <- data.frame(id = df_num_incre_S4$id,
                         prop_S4 = df_num_incre_S4$num_incre_S4 / df_num_decre_S4$num_decre_S4)
# (Removed) df_max_S1..S4 and df_min_S1..S4 were recomputed here with exactly
# the same group_by(id) / summarise(max or min) expressions that the per-sensor
# sections above already evaluate on the same df_total, so this second pass
# only repeated eight full scans and reassigned identical tables.
# Assemble one feature row per id by left-joining every per-id feature table
# onto the first one, in the listed order (which fixes the column order).
# The join key is explicit, so dplyr neither guesses it nor prints a
# "Joining, by = ..." message for each of the 35 joins.
# NOTE(review): df_mean_S1..S4 are computed above but are not part of this
# list - confirm that leaving the mean features out is intentional.
feature_tables <- list(
  df_init_time_S1, df_init_time_S2, df_init_time_S3, df_init_time_S4,
  df_num_decre_S1, df_num_decre_S2, df_num_decre_S3, df_num_decre_S4,
  df_num_incre_S1, df_num_incre_S2, df_num_incre_S3, df_num_incre_S4,
  df_prop_S1, df_prop_S2, df_prop_S3, df_prop_S4,
  df_range_S1, df_range_S2, df_range_S3, df_range_S4,
  df_sd_S1, df_sd_S2, df_sd_S3, df_sd_S4,
  df_min_S1, df_min_S2, df_min_S3, df_min_S4,
  df_max_S1, df_max_S2, df_max_S3, df_max_S4,
  df_turn_S1, df_turn_S2, df_turn_S3, df_turn_S4
)
df_total_x <- Reduce(
  function(joined, next_table) left_join(joined, next_table, by = 'id'),
  feature_tables
)
###########################
# correlation of features #
###########################
# Correlation matrix of the engineered features together with the targets.
# right_join keeps only the ids present in df_train_target; the key is given
# explicitly so the join does not depend on which column names happen to match.
# Only the upper triangle of the matrix is drawn.
df_total_x %>%
  right_join(df_train_target, by = 'id') %>%
  cor() %>%
  corrplot(type = 'upper')
########################################
# splitting total into train and valid #
########################################
# Random 75 / 25 split of the training rows into train and validation parts.
# Rows are addressed by position, which relies on df_total_x holding the
# training ids first, in the same order as df_train_target.
num_train <- nrow(df_train_target)
id_train <- sample(seq_len(num_train), floor(num_train * 0.75))
id_valid <- setdiff(seq_len(num_train), id_train)
# The id column is a row identifier, not a property of the signal, so it is
# removed before the tables are handed to the models. Previously it was passed
# along as a predictor, letting the models split on row order; the test ids
# all lie after the training ids, where such splits carry no information.
df_features <- df_total_x %>%
  select(-id)
df_train_x <- df_features[id_train, ]
df_train_y <- df_train_target[id_train, ]
df_valid_x <- df_features[id_valid, ]
df_valid_y <- df_train_target[id_valid, ]
# Every row after the training block belongs to the test set.
df_test_x <- df_features[(num_train + 1):nrow(df_features), ]
########################
# function of modeling #
########################
# Fit a LightGBM regression model ('gbdt' boosting, 'mae' metric).
#   x       feature table, converted with as.matrix()
#   target  response values, converted with as.matrix()
#   nrounds number of boosting rounds
# Returns the model object produced by lightgbm().
train_lgbm <- function(x, target, nrounds) {
  train_set <- lgb.Dataset(
    data = as.matrix(x),
    label = as.matrix(target)
  )
  lightgbm(
    params = list(boosting_type = 'gbdt',
                  objective = 'regression',
                  metric = 'mae'),
    data = train_set,
    nrounds = nrounds
  )
}
# Fit an XGBoost squared-error regression model evaluated with MAE.
#   x       feature table, converted with data.matrix()
#   target  numeric response vector
#   nrounds number of boosting rounds
# Returns the model object produced by xgboost().
train_xgb <- function(x, target, nrounds) {
  # 'reg:squarederror' is the current name of the squared-error objective;
  # the former alias 'reg:linear' is deprecated and raises a warning.
  xgboost(
    data = data.matrix(x),
    label = target,
    nrounds = nrounds,
    objective = 'reg:squarederror',
    eval_metric = 'mae'
  )
}
# Fit a random forest with randomForest()'s default settings.
#   x       predictor table
#   target  response vector
# Returns the fitted randomForest object.
train_rf <- function(x, target) {
  randomForest(x = x, y = target)
}
##############
# mega model #
##############
# X
# Fit the three base learners for target X on the training split, then score
# the validation split (fitting order kept: lightgbm, xgboost, random forest).
valid_matrix <- as.matrix(df_valid_x)
model_X_lgbm <- train_lgbm(x = df_train_x, target = df_train_y$X, nrounds = 250)
model_X_xgb <- train_xgb(x = df_train_x, target = df_train_y$X, nrounds = 250)
model_X_rf <- train_rf(x = df_train_x, target = df_train_y$X)
pred_X_lgbm <- predict(model_X_lgbm, valid_matrix)
pred_X_xgb <- predict(model_X_xgb, valid_matrix)
pred_X_rf <- predict(model_X_rf, df_valid_x)
# Y
# Fit the three base learners for target Y on the training split, then score
# the validation split (fitting order kept: lightgbm, xgboost, random forest).
valid_matrix <- as.matrix(df_valid_x)
model_Y_lgbm <- train_lgbm(x = df_train_x, target = df_train_y$Y, nrounds = 250)
model_Y_xgb <- train_xgb(x = df_train_x, target = df_train_y$Y, nrounds = 250)
model_Y_rf <- train_rf(x = df_train_x, target = df_train_y$Y)
pred_Y_lgbm <- predict(model_Y_lgbm, valid_matrix)
pred_Y_xgb <- predict(model_Y_xgb, valid_matrix)
pred_Y_rf <- predict(model_Y_rf, df_valid_x)
# M
# Fit the three base learners for target M on the training split, then score
# the validation split (fitting order kept: lightgbm, xgboost, random forest).
valid_matrix <- as.matrix(df_valid_x)
model_M_lgbm <- train_lgbm(x = df_train_x, target = df_train_y$M, nrounds = 250)
model_M_xgb <- train_xgb(x = df_train_x, target = df_train_y$M, nrounds = 250)
model_M_rf <- train_rf(x = df_train_x, target = df_train_y$M)
pred_M_lgbm <- predict(model_M_lgbm, valid_matrix)
pred_M_xgb <- predict(model_M_xgb, valid_matrix)
pred_M_rf <- predict(model_M_rf, df_valid_x)
# V
# Fit the three base learners for target V on the training split, then score
# the validation split (fitting order kept: lightgbm, xgboost, random forest).
valid_matrix <- as.matrix(df_valid_x)
model_V_lgbm <- train_lgbm(x = df_train_x, target = df_train_y$V, nrounds = 250)
model_V_xgb <- train_xgb(x = df_train_x, target = df_train_y$V, nrounds = 250)
model_V_rf <- train_rf(x = df_train_x, target = df_train_y$V)
pred_V_lgbm <- predict(model_V_lgbm, valid_matrix)
pred_V_xgb <- predict(model_V_xgb, valid_matrix)
pred_V_rf <- predict(model_V_rf, df_valid_x)
# mega dataset
# Second-level inputs: the twelve validation-split predictions of the base
# learners (3 learners x 4 targets). Column names equal the variable names.
df_mega <- data.frame(
  pred_X_lgbm = pred_X_lgbm, pred_X_xgb = pred_X_xgb, pred_X_rf = pred_X_rf,
  pred_Y_lgbm = pred_Y_lgbm, pred_Y_xgb = pred_Y_xgb, pred_Y_rf = pred_Y_rf,
  pred_M_lgbm = pred_M_lgbm, pred_M_xgb = pred_M_xgb, pred_M_rf = pred_M_rf,
  pred_V_lgbm = pred_V_lgbm, pred_V_xgb = pred_V_xgb, pred_V_rf = pred_V_rf
)
# mega model
# One XGBoost model per target, trained on the base predictions against the
# validation-split targets.
mega_model_X <- train_xgb(x = df_mega, target = df_valid_y$X, nrounds = 25)
mega_model_Y <- train_xgb(x = df_mega, target = df_valid_y$Y, nrounds = 50)
mega_model_M <- train_xgb(x = df_mega, target = df_valid_y$M, nrounds = 25)
mega_model_V <- train_xgb(x = df_mega, target = df_valid_y$V, nrounds = 50)
##############
# submission #
##############
# Score the test rows with every base learner. The pred_* variables are
# reused on purpose: the stacking table must carry the same column names the
# second-level models were trained on.
test_matrix <- as.matrix(df_test_x)
pred_X_lgbm <- predict(model_X_lgbm, test_matrix)
pred_X_xgb <- predict(model_X_xgb, test_matrix)
pred_X_rf <- predict(model_X_rf, df_test_x)
pred_Y_lgbm <- predict(model_Y_lgbm, test_matrix)
pred_Y_xgb <- predict(model_Y_xgb, test_matrix)
pred_Y_rf <- predict(model_Y_rf, df_test_x)
pred_M_lgbm <- predict(model_M_lgbm, test_matrix)
pred_M_xgb <- predict(model_M_xgb, test_matrix)
pred_M_rf <- predict(model_M_rf, df_test_x)
pred_V_lgbm <- predict(model_V_lgbm, test_matrix)
pred_V_xgb <- predict(model_V_xgb, test_matrix)
pred_V_rf <- predict(model_V_rf, df_test_x)
df_mega <- data.frame(pred_X_lgbm, pred_X_xgb, pred_X_rf,
                      pred_Y_lgbm, pred_Y_xgb, pred_Y_rf,
                      pred_M_lgbm, pred_M_xgb, pred_M_rf,
                      pred_V_lgbm, pred_V_xgb, pred_V_rf)
# Final predictions from the second-level models.
mega_matrix <- as.matrix(df_mega)
df_test_target$X <- predict(mega_model_X, mega_matrix)
df_test_target$Y <- predict(mega_model_Y, mega_matrix)
df_test_target$M <- predict(mega_model_M, mega_matrix)
df_test_target$V <- predict(mega_model_V, mega_matrix)
# Write the submission in the layout of the file that was read in:
#  * undo the +1 shift applied to id right after loading, so the ids match
#    the sample submission again;
#  * row.names = FALSE, otherwise write.csv prepends an unnamed row-number
#    column that the sample file does not have.
# This also keeps the file stable across reruns if the working directory is
# the data directory (the output name equals the input file name).
df_submission <- df_test_target
df_submission$id <- df_submission$id - 1
write.csv(df_submission, 'sample_submission.csv', row.names = FALSE)