2018. 11. Term Project, 항만 물류 데이터를 활용한 예측 모델링

< Proposal Presentation >

< Final Presentation >

< EDA 1 (R Code) >

library(dplyr)
library(imputeTS)
library(ggplot2)

###################
# Loading dataset #
###################

dataset <- read.csv('C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/Dataset.csv')
dataset$date <- as.Date(dataset$date)

temp <- data.frame(date=seq(as.Date('1964-05-04'), as.Date('2018-02-12'), by=1))
dataset <- merge(dataset, temp, by='date', all=T)
rm(list='temp')

dataset <- dataset %>% filter(format(date, '%Y') >= 2003)

#################
# Check dataset #
#################

# for(i in 2:5) {
#   statsNA(dataset[, i])
# }
# 
# summary(dataset)

#####################################
# Excepting the weekends in dataset #
#####################################

dataset <- dataset[!(weekdays(dataset$date) %in% c('토요일', '일요일')), ]

for(i in 2:5) {
  statsNA(dataset[, i])
}

summary(dataset)

#############################################
# Time series plot about P3A_03 (Unit: Day) #
#############################################

dataset$index <- 1:nrow(dataset)

ggplot(dataset, aes(index, P3A_03)) + 
  geom_line() + 
  labs(x='Index', y='P3A_03 ($/Day)') + 
  theme_bw()

############################################
# Autocorrelation about P3A_03 (Unit: Day) #
############################################

temp_rate <- dataset[, 'P3A_03']
length_row <- length(temp_rate)

for (i in 1:3500) {
  temp <- c(rep(NA, i), temp_rate[1:(length_row - i)])
  temp_rate <- cbind(temp_rate, temp)
}

temp_rate <- as.data.frame(temp_rate)
names(temp_rate) <- c('rate', paste('lag_', 1:(ncol(temp_rate) - 1), sep=''))

temp_cor <- NULL

for (i in 1:3500) {
  temp_cor <- c(temp_cor, cor(temp_rate[, 1], temp_rate[, i + 1], use="complete.obs"))
}

temp_cor <- as.data.frame(cbind(c(1:3500), temp_cor))

ggplot(temp_cor, aes(V1, temp_cor)) +
  geom_bar(stat='identity') +
  labs(x='Lag', y='Correlation') +
  theme_bw()

rm(list=c('temp_cor', 'temp_rate','i', 'length_row', 'temp'))

#############################################
# Autocorrelation about P3A_03 (Unit: Week) #
#############################################

dataset$week <- rep(NA, nrow(dataset))
dataset$week[1:3] <- 1

temp_week <- NULL

for(i in 1:(length(4:(nrow(dataset)) - 1)/5)) {
  temp_week <- c(temp_week, rep(i + 1, 5))
}

dataset$week[4:(nrow(dataset) - 1)] <- temp_week
dataset$week[nrow(dataset)] <- 790

temp_dataset <- dataset %>%
  group_by(week) %>%
  summarise(mean_rate=mean(P3A_03, na.rm=T))

length_row <- nrow(temp_dataset)

for (i in 1:750) {
  temp <- c(rep(NA, i), temp_dataset$mean_rate[1:(length_row - i)])
  temp_dataset <- cbind(temp_dataset, temp)
}

names(temp_dataset) <- c('index', 'mean_rate', paste('lag_', 1:(ncol(temp_dataset) - 2), sep=''))

temp_cor <- NULL

for (i in 1:750) {
  temp_cor <- c(temp_cor, cor(temp_dataset[, 2], temp_dataset[, i + 2], use="complete.obs"))
}

temp_cor <- as.data.frame(cbind(c(1:750), temp_cor))

ggplot(temp_cor, aes(V1, temp_cor)) +
  geom_bar(stat='identity') +
  labs(x='Lag', y='Correlation') +
  theme_bw()

rm(list=c('temp_cor', 'temp_dataset', 'i', 'length_row', 'temp', 'temp_week'))

##############################################
# Autocorrelation about P3A_03 (Unit: Month) #
##############################################

dataset$year <- format(dataset$date, format='%Y')
dataset$month <- format(dataset$date, format='%m')

temp_dataset <- dataset %>% 
  group_by(year, month) %>% 
  summarise(mean_rate=mean(P3A_03, na.rm=T))
temp_dataset <- as.data.frame(temp_dataset)

length_row <- nrow(temp_dataset)

for (i in 1:180) {
  temp <- c(rep(NA, i), temp_dataset$mean_rate[1:(length_row - i)])
  temp_dataset <- cbind(temp_dataset, temp)
}

names(temp_dataset) <- c('year', 'month', 'mean_rate', paste('lag_', 1:(ncol(temp_dataset) - 3), sep=''))

temp_cor <- NULL

for (i in 1:180) {
  temp_cor <- c(temp_cor, cor(temp_dataset[, 3], temp_dataset[, i + 3], use="complete.obs"))
}

temp_cor <- as.data.frame(cbind(c(1:length(temp_cor)), temp_cor))

ggplot(temp_cor, aes(V1, temp_cor)) +
  geom_bar(stat='identity') +
  labs(x='Lag', y='Correlation') +
  theme_bw()

rm(list=c('temp_cor', 'temp_dataset', 'i', 'length_row', 'temp'))

< EDA 2 (R Code) >

library(dplyr)
library(reshape)
library(ggplot2)
library(ggcorrplot)
library(scales) 
library(gridExtra)
library(PerformanceAnalytics)
library(xlsx)

BCI <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/FreightIndex.xlsx", sheetName="BCI")
BDI <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/FreightIndex.xlsx", sheetName="BDI")
BPI <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/FreightIndex.xlsx", sheetName="BPI")
BSI <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/FreightIndex.xlsx", sheetName="BSI")
BHSI <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/FreightIndex.xlsx", sheetName="BHSI")

LIBOR <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/LIBOR.xlsx", sheetName="LIBOR")
won.dollar <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/Exchange_Won-Dollar.xlsx", sheetName="data")
bunker <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/BunkerPrices.xlsx", sheetName="data")
freight.rate <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/TargetVariableCandidates.xlsx", sheetName="data")
timecharter.rate <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/TimecharterRate.xlsx", sheetName="data")
earning.panamax <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/Earning_Panamax.xlsx", sheetName="data")
price.panamax <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/Price_Panamax.xlsx", sheetName="data")
GDP <- read.xlsx("C:/Users/user/Desktop/TermProject(데이터마이닝)/Data/GDP.xlsx", sheetName="data")

## Preprocessing
BPI$date <- as.Date(substr(as.character(BPI$date), 1, 10))
BHSI$date <- as.Date(BHSI$date, format= "%d/%m/%Y")

index <- BCI %>% 
  merge(BDI, by="date", all=T) %>% 
  merge(BPI, by="date", all=T) %>% 
  merge(BSI, by="date", all=T) %>% 
  merge(BHSI, by="date", all=T)

index$year <- format(index$date, "%Y")
index$month <- format(index$date, "%m")

BCI_year.month <- index %>% 
  select(year, month, BCI) %>% 
  group_by(year, month) %>% 
  summarise(mean_BCI=mean(BCI, na.rm=T), sd_BCI=sd(BCI, na.rm=T), coef.of.var_BCI=sd_BCI/mean_BCI)
BDI_year.month <- index %>% 
  select(year, month, BDI) %>% 
  group_by(year, month) %>% 
  summarise(mean_BDI=mean(BDI, na.rm=T), sd_BDI=sd(BDI, na.rm=T), coef.of.var_BDI=sd_BDI/mean_BDI)
BPI_year.month <- index %>% 
  select(year, month, BPI) %>% 
  group_by(year, month) %>% 
  summarise(mean_BPI=mean(BPI, na.rm=T), sd_BPI=sd(BPI, na.rm=T), coef.of.var_BPI=sd_BPI/mean_BPI)
BSI_year.month <- index %>% 
  select(year, month, BSI) %>% 
  group_by(year, month) %>% 
  summarise(mean_BSI=mean(BSI, na.rm=T), sd_BSI=sd(BSI, na.rm=T), coef.of.var_BSI=sd_BSI/mean_BSI)
BHSI_year.month <- index %>% 
  select(year, month, BHSI) %>% 
  group_by(year, month) %>% 
  summarise(mean_BHSI=mean(BHSI, na.rm=T), sd_BHSI=sd(BHSI, na.rm=T), coef.of.var_BHSI=sd_BHSI/mean_BHSI)

index_year.month <- BCI_year.month %>% 
  merge(BPI_year.month, by=c("year", "month"), all=T) %>% 
  merge(BDI_year.month, by=c("year", "month"), all=T) %>% 
  merge(BSI_year.month, by=c("year", "month"), all=T) %>% 
  merge(BHSI_year.month, by=c("year", "month"), all=T)

BCI$diff_1[!is.na(BCI[2])] <- c(NA, na.omit(BCI)[2:nrow(na.omit(BCI)), 2] - na.omit(BCI)[1:(nrow(na.omit(BCI)) - 1), 2])
BDI$diff_1[!is.na(BDI[2])] <- c(NA, na.omit(BDI)[2:nrow(na.omit(BDI)), 2] - na.omit(BDI)[1:(nrow(na.omit(BDI)) - 1), 2])
BPI$diff_1[!is.na(BPI[2])] <- c(NA, na.omit(BPI)[2:nrow(na.omit(BPI)), 2] - na.omit(BPI)[1:(nrow(na.omit(BPI)) - 1), 2])
BSI$diff_1[!is.na(BSI[2])] <- c(NA, na.omit(BSI)[2:nrow(na.omit(BSI)), 2] - na.omit(BSI)[1:(nrow(na.omit(BSI)) - 1), 2])
BHSI$diff_1[!is.na(BHSI[2])] <- c(NA, na.omit(BHSI)[2:nrow(na.omit(BHSI)), 2] - na.omit(BHSI)[1:(nrow(na.omit(BHSI)) - 1), 2])

BCI$diff_2[!is.na(BCI[2])] <- c(NA, NA, na.omit(BCI[2])[3:nrow(na.omit(BCI[2])), ] - na.omit(BCI[2])[1:(nrow(na.omit(BCI[2])) - 2), ])
BDI$diff_2[!is.na(BDI[2])] <- c(NA, NA, na.omit(BDI[2])[3:nrow(na.omit(BDI[2])), ] - na.omit(BDI[2])[1:(nrow(na.omit(BDI[2])) - 2), ])
BPI$diff_2[!is.na(BPI[2])] <- c(NA, NA, na.omit(BPI[2])[3:nrow(na.omit(BPI[2])), ] - na.omit(BPI[2])[1:(nrow(na.omit(BPI[2])) - 2), ])
BSI$diff_2[!is.na(BSI[2])] <- c(NA, NA, na.omit(BSI[2])[3:nrow(na.omit(BSI[2])), ] - na.omit(BSI[2])[1:(nrow(na.omit(BSI[2])) - 2), ])
BHSI$diff_2[!is.na(BHSI[2])] <- c(NA, NA, na.omit(BHSI[2])[3:nrow(na.omit(BHSI[2])), ] - na.omit(BHSI[2])[1:(nrow(na.omit(BHSI)[2] - 2)), ])

BCI$diff_3[!is.na(BCI[2])] <- c(NA, NA, NA, na.omit(BCI[2])[4:nrow(na.omit(BCI[2])), ] - na.omit(BCI[2])[1:(nrow(na.omit(BCI[2])) - 3), ])
BDI$diff_3[!is.na(BDI[2])] <- c(NA, NA, NA, na.omit(BDI[2])[4:nrow(na.omit(BDI[2])), ] - na.omit(BDI[2])[1:(nrow(na.omit(BDI[2])) - 3), ])
BPI$diff_3[!is.na(BPI[2])] <- c(NA, NA, NA, na.omit(BPI[2])[4:nrow(na.omit(BPI[2])), ] - na.omit(BPI[2])[1:(nrow(na.omit(BPI[2])) - 3), ])
BSI$diff_3[!is.na(BSI[2])] <- c(NA, NA, NA, na.omit(BSI[2])[4:nrow(na.omit(BSI[2])), ] - na.omit(BSI[2])[1:(nrow(na.omit(BSI[2])) - 3), ])
BHSI$diff_3[!is.na(BHSI[2])] <- c(NA, NA, NA, na.omit(BHSI[2])[4:nrow(na.omit(BHSI[2])), ] - na.omit(BHSI[2])[1:(nrow(na.omit(BHSI[2])) - 3), ])

BPI$month <- format(BPI$date, format="%m")
BPI$year <- format(BPI$date, format="%Y")
BPI$day <- format(BPI$date, format="%d")

LIBOR$date <- as.Date(LIBOR$date, format= "%d.%m.%Y")
LIBOR$LIBOR <- LIBOR$LIBOR

won.dollar$won.dollar <- as.character(won.dollar$won.dollar)
won.dollar <- won.dollar %>% 
  filter(won.dollar != "") %>% 
  select(date, won.dollar)
won.dollar$date <- as.Date(won.dollar$date, format="%Y/%m/%d")
won.dollar$won.dollar <- as.numeric(won.dollar$won.dollar)

bunker <- bunker[!(is.na(bunker$price_MDO) & is.na(bunker$price_380cst)), 1:ncol(bunker)]
bunker$price_MDO[which.max(bunker$price_MDO)] <- 1030

a <- bunker[1:2]
names(a)[2] <- "price"
a$type <- "380cst"
b <- bunker[c(1, 3)]
names(b)[2] <- "price"
b$type <- "MDO"
bunker_total <- rbind(a, b)

rate_BHSI <- freight.rate[1:2]
rate_BPI <- freight.rate[3:4]
rate_BSI <- freight.rate[5:6]

rate_BHSI$date <- as.Date(rate_BHSI$date, format="%d/%m/%Y")
rate_BHSI <- na.omit(rate_BHSI)
names(rate_BPI)[1] <- "date"
rate_BPI$date <- as.Date(rate_BPI$date, format="%Y-%m-%d")
rate_BPI <- na.omit(rate_BPI)
names(rate_BSI)[1] <- "date"
rate_BSI <- na.omit(rate_BSI)

names(timecharter.rate) <- gsub("X", "", names(timecharter.rate))

timecharter.rate_total <- data.frame(date=as.Date(NA), rate=NA, type=NA)

for(i in 2:18) {
  temp_df <- timecharter.rate[c(1, i)]
  temp_type <- names(timecharter.rate)[i]
  names(temp_df)[2] <- "rate"
  temp_df$type <- temp_type
  timecharter.rate_total <- rbind(timecharter.rate_total, temp_df)
}

timecharter.rate_total <- na.omit(timecharter.rate_total)

a <- earning.panamax[c(1, 2)]
names(a)[2] <- "earning"
a$route <- names(earning.panamax)[2]
b <- earning.panamax[c(1, 3)]
names(b)[2] <- "earning"
b$route <- names(earning.panamax)[3]
c <- earning.panamax[c(1, 4)]
names(c)[2] <- "earning"
c$route <- names(earning.panamax)[4]

earning.panamax_total <- rbind(a, b, c)
earning.panamax_total <- na.omit(earning.panamax_total)

GDP_cor <- GDP
GDP_cor <- na.omit(GDP_Original)
GDP_cor$year <- format(GDP_cor$year, format="%Y")


names(GDP)[2:3] <- c("Japan", "Korea")
GDP[2:3] <- data.frame(apply(GDP[2:3], 2, function(x) round(x, 2)))
GDP[1] <- format(GDP[1], format="%Y")

GDP_melted <- melt(GDP, id="year")
names(GDP_melted)[2:3] <- c("country", "percent")
GDP_melted$year <- as.numeric(GDP$year)

names(price.panamax) <- gsub("X", "", names(price.panamax))
price.panamax_melted <- melt(price.panamax, id="date")
names(price.panamax_melted)[2:3] <- c("type", "price")

price.panamax_melted$type <- as.character(price.panamax_melted$type)
price.panamax_melted$type[price.panamax_melted$type == "93.96K"] <- "93-96K"
price.panamax_melted$type[price.panamax_melted$type == "80.82K"] <- "80-82K"

## Visualization

# About all Feight indexes
ggplot(index) + 
  geom_line(aes(date, BCI, colour="BCI"), size=2) + 
  geom_line(aes(date, BDI, colour="BDI"), size=2) + 
  geom_line(aes(date, BPI, colour="BPI"), size=2) + 
  geom_line(aes(date, BSI, colour="BSI"), size=2) + 
  geom_line(aes(date, BHSI, colour="BHSI"), size=2) + 
  scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="Index", colour="Index") + 
  theme_bw()

ggcorrplot(cor(index[sapply(index, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")

# # Relationgship between Freight indexes
# 
# p1 <- ggplot(index, aes(BCI, BDI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p2 <- ggplot(index, aes(BCI, BPI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p3 <- ggplot(index, aes(BCI, BSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p4 <- ggplot(index, aes(BSI, BHSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p5 <- ggplot(index, aes(BDI, BPI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p6 <- ggplot(index, aes(BDI, BSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p7 <- ggplot(index, aes(BDI, BHSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p8 <- ggplot(index, aes(BPI, BSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p9 <- ggplot(index, aes(BPI, BHSI)) + 
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# p10 <- ggplot(index, aes(BSI, BHSI)) +
#   geom_point(colour="orange", alpha=0.2) + 
#   geom_smooth(method="lm", colour="red") + 
#   theme_bw()
# 
# grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)

# Cv of Freight indexes

# ggplot(index_year.month, aes(month, coef.of.var_BCI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Cv of BCI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(month, coef.of.var_BDI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Cv of BDI", colour="Year") + 
#   theme_bw()

temp <- index_year.month %>% filter(year >= 2013 & year < 2018) %>% select(month, coef.of.var_BPI, year)
temp$month <- as.numeric(temp$month)

ggplot(temp, aes(month, coef.of.var_BPI)) + 
  geom_line(aes(group=year, colour=year), size=2) + 
  geom_point(aes(month, coef.of.var_BPI, colour=year)) + 
  geom_rect(xmin=4, xmax=6, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) + 
  geom_rect(xmin=8, xmax=9, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) + 
  geom_rect(xmin=6, xmax=8, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) + 
  geom_rect(xmin=1, xmax=2, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) + 
  stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) + 
  stat_summary(fun.y="mean", geom="point", colour="black", size=2.5) + 
  scale_x_continuous(breaks=c(1:12)) + 
  labs(x="Month", y="Cv of BPI", colour="Year") + 
  theme_bw() + 
  theme(panel.grid.minor.x = element_blank())

# ggplot(index_year.month, aes(month, coef.of.var_BSI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Cv of BSI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(month, coef.of.var_BHSI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Cv of BHSI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, coef.of.var_BCI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Cv of BCI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, coef.of.var_BDI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Cv of BDI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, coef.of.var_BPI)) +
#   geom_boxplot(aes()) + 
#   geom_point(aes(year, coef.of.var_BPI, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Cv of BPI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, coef.of.var_BSI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Cv of BSI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, coef.of.var_BHSI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Cv of BHSI", colour="Month") + 
#   theme_bw()

# Mean of Feight indexes

# ggplot(index_year.month, aes(month, mean_BCI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Mean of BCI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(month, mean_BDI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Mean of BDI", colour="Year") + 
#   theme_bw()

temp <- index_year.month %>% filter(year >= 2013 & year < 2018) %>% select(month, mean_BPI, year)
temp$month <- as.numeric(temp$month)

ggplot(temp, aes(month, mean_BPI)) + 
  geom_line(aes(group=year, colour=year), size=2) + 
  geom_point(aes(month, mean_BPI, colour=year)) + 
  geom_rect(xmin=1, xmax=2, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) + 
  geom_rect(xmin=2, xmax=3, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) + 
  geom_rect(xmin=4, xmax=6, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) + 
  geom_rect(xmin=6, xmax=7, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) + 
  geom_rect(xmin=9, xmax=10, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
  stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) + 
  stat_summary(fun.y="mean", geom="point", colour="black", size=2.5) + 
  scale_x_continuous(breaks=c(1:12)) + 
  labs(x="Month", y="Mean of BPI ($/Day)", colour="Year") + 
  theme_bw() + 
  theme(panel.grid.minor.x = element_blank())

# ggplot(index_year.month, aes(month, mean_BSI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Mean of BSI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(month, mean_BHSI)) +
#   geom_point(aes(colour=year)) + 
#   geom_line(aes(group=year, colour=year)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Mean of BHSI", colour="Year") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, mean_BCI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Mean of BCI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, mean_BDI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Month", y="Mean of BDI", colour="Month") + 
#   theme_bw()

ggplot(index_year.month, aes(year, mean_BPI)) +
  geom_boxplot(aes()) + 
  geom_point(aes(year, mean_BPI, colour=month)) + 
  stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
  labs(x="Year", y="Mean of BPI", colour="Month") + 
  theme_bw()

# ggplot(index_year.month, aes(year, mean_BSI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Mean of BSI", colour="Month") + 
#   theme_bw()

# ggplot(index_year.month, aes(year, mean_BHSI)) +
#   geom_point(aes(colour=month)) + 
#   geom_line(aes(group=month, colour=month)) + 
#   stat_summary(fun.y="mean", geom="point", colour="red", size=5) + 
#   labs(x="Year", y="Mean of BHSI", colour="Month") + 
#   theme_bw()

# About LIBOR
ggplot(LIBOR, aes(date, LIBOR)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("5 years"), labels=date_format("%Y")) + 
  labs(x="Year") + 
  theme_bw()

# About Won per Dollar
ggplot(won.dollar, aes(date, won.dollar)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("5 years"), labels=date_format("%Y")) + 
  labs(x="Year", y="Won per Dollar") + 
  theme_bw()

# About all Bunker prices
ggplot(bunker_total, aes(date, price, group=type, colour=type)) + 
  geom_line(size=2) + 
  theme_bw() + 
  scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="Price ($/tonne)", colour="Type")

ggcorrplot(cor(bunker[sapply(bunker, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")

# About Freight rates
ggplot(rate_BHSI) + 
  geom_density(aes(BHSI_HS6, fill="HS6"), alpha=0.25, size=0) + 
  geom_density(data=rate_BPI, aes(BPI_P3A_03, fill="P3A_03"), alpha=0.25, size=0) + 
  geom_density(data=rate_BSI, aes(BSI_S1B_58, fill="S1B_58"), alpha=0.25, size=0) + 
  labs(x="Freight Rate ($/Day)", y="Density", fill="Route") + 
  theme_bw()

p1 <- ggplot(rate_BHSI, aes(date, BHSI_HS6)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="HS6 ($/Day)") + 
  theme_bw()

p2 <- ggplot(rate_BPI, aes(date, BPI_P3A_03)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("4 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="P3A_03 ($/Day)") + 
  theme_bw()

p3 <- ggplot(rate_BSI, aes(date, BSI_S1B_58)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="S1B_58 ($/Day)") + 
  theme_bw()

grid.arrange(p1, p2, p3, ncol=3)

# About Timecharter Rates
ggplot(timecharter.rate_total, aes(date, rate, group=type, colour=type)) + 
  geom_line() + 
  scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="Timcharter Rate ($/Day)", colour="Type") + 
  theme_bw()

ggcorrplot(cor(timecharter.rate[sapply(timecharter.rate, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")

# About Earning of Panamax
ggplot(earning.panamax_total, aes(date, earning, group=route, colour=route)) + 
  geom_line(size=2, alpha=0.5) + 
  scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) + 
  labs(x="Year", y="Earning of Panamax ($/Day)", colour="Route") + 
  theme_bw()

ggcorrplot(cor(earning.panamax[sapply(earning.panamax, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")

# About GDP
ggplot(GDP_melted, aes(year, percent, group=country, colour=country)) + 
  geom_line(size=2) + 
  labs(x="Year", y="GDP (%)", colour="Country") + 
  theme_bw()

temp <- GDP_cor %>% 
  filter(year >= 2010) %>% 
  select(year, GDP_japan, GDP_korea)

ggplot(GDP_cor, aes(GDP_japan, GDP_korea)) + 
  geom_label(aes(label = year)) + 
  geom_smooth(method="lm", se=F, colour="red", size=2) + 
  geom_abline(slope=1, intercept=0, linetype="dashed") + 
  labs(y="GDP of Korea (%)", x="GDP of Japan (%)") + 
  theme_bw()

# About Price of Panamax
ggplot(price.panamax_melted, aes(date, price, group=type, colour=type)) + 
  geom_line(size=2) + 
  labs(x="Date", y="Price ($ millon)", colour="Type") + 
  scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) + 
  theme_bw()

ggcorrplot(cor(price.panamax[sapply(price.panamax, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")

## Summary of Data
summary(rate_BHSI)
round(sd(rate_BHSI$BHSI_HS6)/mean(rate_BHSI$BHSI_HS6), 2)
summary(rate_BPI)
round(sd(rate_BPI$BPI_P3A_03)/mean(rate_BPI$BPI_P3A_03), 2)
summary(rate_BSI)
round(sd(rate_BSI$BSI_S1B_58)/mean(rate_BSI$BSI_S1B_58), 2)

## Binding Dataset

# Target Variable
rate_BPI

# Predictor
BPI <- BPI[1:2]
LIBOR
names(won.dollar)[2] <- "won.per.dollar"
GDP

# Binding Dataset
# dataset <- rate_BPI %>% 
#   merge(BPI, by="date", all=T) %>% 
#   merge(LIBOR, by="date", all=T) %>% 
#   merge(won.dollar, by="date", all=T) %>% 
#   merge(bunker, by="date", all=T) %>% 
#   merge(timecharter.rate, by="date", all=T) %>% 
#   merge(earning.panamax, by="date", all=T) %>% 
#   merge(price.panamax, by="date", all=T)

dataset <- rate_BPI %>% 
  merge(BPI, by="date", all=T) %>% 
  merge(LIBOR, by="date", all=T) %>% 
  merge(won.dollar, by="date", all=T)

# write(dataset, file="Dataset.csv")

# dataset$year <- as.integer(format(dataset$date, "%Y"))
# dataset$month <- as.integer(format(dataset$date, "%m"))
# 
# min(GDP$year)
# max(GDP$year)
# 2018 - 1968
# 
# for(i in 0:50) {
#   dataset$GDP_japan[dataset$year==(1968+i)] <- GDP[1+i, 2]
#   dataset$GDP_korea[dataset$year==(1968+i)] <- GDP[1+i, 3]
# }

# Visualization

# Correlation about Dataset
ggcorrplot(round(cor(dataset[, -c(1, 39, 40)], use="pairwise.complete.obs"), 1), type="upper", lab=T, outline.col="white")

# Checking relationship between some columns

# BPI_P3A_03 vs LIBOR
ggplot(dataset, aes(LIBOR, BPI_P3A_03)) + 
  geom_point() + 
  geom_smooth(method="lm") + 
  scale_x_continuous(breaks=c(1:6), limits=c(0, 6)) + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)") + 
  theme_bw()

min(na.omit(dataset[c("LIBOR", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("LIBOR", "BPI_P3A_03", "year")])$year)

temp <- dataset %>% filter(year >= 2002 & year <= 2018) %>% select(LIBOR, BPI_P3A_03, year, month)

ggplot(temp, aes(LIBOR, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(method="lm", aes(group=year)) + 
  scale_x_continuous(breaks=c(1:6), limits=c(0, 6)) + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", colour="Year") + 
  theme_bw()

temp <- dataset %>% filter(year >= 2013) %>% select(LIBOR, BPI_P3A_03, year, month)

ggplot(temp, aes(LIBOR, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(method="lm", aes(group=year)) + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", colour="Year") + 
  theme_bw()

# BPI_P3A_03 vs won.per.dollar
ggplot(dataset, aes(won.per.dollar, BPI_P3A_03)) + 
  geom_point() + 
  geom_smooth(method="lm") + 
  scale_x_continuous(limits=c(750, 1600)) + 
  labs(y="P3A_03 ($/Day)", x="Won per Dollar") + 
  theme_bw()

min(na.omit(dataset[c("won.per.dollar", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("won.per.dollar", "BPI_P3A_03", "year")])$year)

temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(won.per.dollar, BPI_P3A_03, year, month)

ggplot(temp, aes(won.per.dollar, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_x_continuous(limits=c(750, 1600)) + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Won per Dollar", colour="Year") + 
  theme_bw()

temp <- dataset %>% filter(year >= 2013) %>% select(won.per.dollar, BPI_P3A_03, year, month)

ggplot(temp, aes(won.per.dollar, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Won per Dollar", colour="Year") + 
  theme_bw()

# BPI_P3A_03 vs price_380cst
ggplot(dataset, aes(price_380cst, BPI_P3A_03)) + 
  geom_point() + 
  geom_smooth(method="lm") + 
  labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)") + 
  theme_bw()

min(na.omit(dataset[c("price_380cst", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("price_380cst", "BPI_P3A_03", "year")])$year)

temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(price_380cst, BPI_P3A_03, year, month)

ggplot(temp, aes(price_380cst, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)", colour="Year") + 
  theme_bw()

temp <- dataset %>% filter(year >= 2013) %>% select(price_380cst, BPI_P3A_03, year, month)

ggplot(temp, aes(price_380cst, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)", colour="Year") + 
  theme_bw()

# BPI_P3A_04 vs price_MDO
ggplot(dataset, aes(price_MDO, BPI_P3A_03)) + 
  geom_point() + 
  geom_smooth(method="lm") + 
  labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)") + 
  theme_bw()

temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(price_MDO, BPI_P3A_03, year, month)

ggplot(temp, aes(price_MDO, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)", colour="Year") + 
  theme_bw()

temp <- dataset %>% filter(year >= 2013) %>% select(price_MDO, BPI_P3A_03, year, month)

ggplot(temp, aes(price_MDO, BPI_P3A_03, colour=year)) + 
  geom_point() + 
  geom_smooth(aes(group=year), method="lm") + 
  scale_color_gradient(low="blue", high="red") + 
  labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)", colour="Year") + 
  theme_bw()

# BPI_P3A_04 vs GDP_japan
ggplot(dataset, aes(GDP_japan, BPI_P3A_03)) + 
  geom_point() + 
  stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) + 
  stat_summary(fun.y="mean", geom="point", colour="red", size=2.5) + 
  labs(x="GDP of Japan (%)", y="BPI_P3A ($/Day)") + 
  theme_bw() 

# BPI_P3A_04 vs GDP_korea
ggplot(dataset, aes(GDP_korea, BPI_P3A_03)) + 
  geom_point() + 
  stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) + 
  stat_summary(fun.y="mean", geom="point", colour="red", size=2.5) + 
  labs(x="GDP of Korea (%)", y="BPI_P3A ($/Day)") + 
  theme_bw()

< Modeling (R Code) >

# Preprocessing
library(dplyr)

# Preprocessing for time series data
library(imputeTS)
library(nlme)
library(forecast)

# Visualization
library(ggplot2)
library(gridExtra)
library(scales)

# Tree model
library(randomForest)
library(xgboost)
library(gbm)

# LSTM model
library(keras)

###################
# Loading dataset #
###################

dataset <- read.csv('C:/Users/user/Desktop/Data_for_Campus_Life/3-2/Term_Project(데이터마이닝)/Data/Dataset.csv')
dataset$date <- as.Date(dataset$date)

temp <- data.frame(date=seq(as.Date('2003-01-01'), as.Date('2018-02-12'), by=1))
dataset <- merge(dataset, temp, by='date', all=T) %>% filter(format(date, '%Y') >= 2011) # Year from 2011 to 2018 is focused
rm(list='temp')

#################
# Check dataset #
#################

na <- c(sum(is.na(dataset$P3A_03)), sum(is.na(dataset$BPI)), sum(is.na(dataset$LIBOR)), sum(is.na(dataset$won.per.dollar)))
temp <- data.frame(column=names(dataset)[-1], na)

p1 <- ggplot(temp, aes(column, na)) + 
  geom_bar(stat='identity') + 
  scale_y_continuous(limits=c(0, 1000)) + 
  labs(x='Column', y='Count of NA') + 
  theme_bw()

for (i in 2:5) {
  statsNA(dataset[, i])
}

summary(dataset)

dataset <- dataset[!(weekdays(dataset$date) %in% c('토요일', '일요일')), ]

summary(dataset)

for (i in 2:5) {
  statsNA(dataset[, i])
}

na <- c(sum(is.na(dataset$P3A_03)), sum(is.na(dataset$BPI)), sum(is.na(dataset$LIBOR)), sum(is.na(dataset$won.per.dollar)))
temp <- data.frame(column=names(dataset)[-1], na)

p2 <- ggplot(temp, aes(column, na)) + 
  geom_bar(stat='identity') + 
  scale_y_continuous(limits=c(0, 1000)) + 
  labs(x='Column', y='Count of NA') + 
  theme_bw()

grid.arrange(p1, p2)

ggplot(dataset, aes(date, P3A_03)) + 
  geom_line() + 
  geom_smooth() +
  labs(x='Date', y='Rate ($/Day)') + 
  theme_bw() # About 3 years cycle

rm(list=c('i', 'na', 'p1', 'p2'))

###########################
# Change time unit (Week) #
###########################

##############
# About 2012 #
##############

# head(dataset %>%
#   mutate(day=weekdays(date)))
# 
# temp <- NULL
# 
# for(i in 1:(nrow(dataset)/5)) {
#   temp <- c(temp, rep(i, 5))
# }
# 
# dataset$week <- c(temp, max(temp) + 1)
# 
# dataset_week <- dataset %>%
#   mutate(year=format(dataset$date, format='%Y'), month=format(dataset$date, format='%m')) %>%
#   group_by(week) %>%
#   summarise(P3A_03=mean(P3A_03, na.rm=T))
# 
# ggplot(dataset_week, aes(week, P3A_03)) +
#   geom_line() +
#   labs(x='Week', y='Mean of Rate ($/Day)') +
#   theme_bw()
# 
# rm(list='temp')

############################
# Change time unit (Month) #
############################

dataset_month <- dataset %>% 
  mutate(year=format(dataset$date, format='%Y'), month=format(dataset$date, format='%m')) %>% 
  group_by(year, month) %>% 
  summarise(P3A_03=mean(P3A_03, na.rm=T))

dataset_month$index <- 1:nrow(dataset_month)

ggplot(dataset_month, aes(index, P3A_03)) + 
  geom_line() + 
  labs(x='Index', y='Mean of Rate ($/Day)') + 
  theme_bw() # No trend

#########################
# Check Autocorrelation #
#########################

temp <- dataset %>% 
  filter(!is.na(P3A_03))

acf(temp$P3A_03, lag.max=50, main=NA)
pacf(temp$P3A_03, lag.max=50, main=NA)

# temp <- dataset_week %>%
#   filter(!is.na(P3A_03))
# 
# acf(temp$P3A_03, lag.max=50)
# pacf(temp$P3A_03, lag.max=nrow(temp))

temp <- dataset_month %>% 
  filter(!is.na(P3A_03))

acf(temp$P3A_03, lag.max=50, main=NA)
pacf(temp$P3A_03, lag.max=50, main=NA)

rm(list='temp', 'dataset_month')

#######################################
# Extracting train and test for ARIMA #
#######################################

train <- dataset %>% 
  filter(format(date, '%Y') >= 2011 & date <= as.Date('2017-06-30'))

test <- dataset %>% 
  filter(date > as.Date('2017-06-30'))

################################
# Modeling ARIMA (Unit: Month) #
################################

train <- train %>% 
  mutate(year=format(train$date, format='%Y'), month=format(train$date, format='%m')) %>% 
  group_by(year, month) %>% 
  summarise(P3A_03=mean(P3A_03, na.rm=T))
train <- as.numeric(train$P3A_03)

test <- test %>% 
  mutate(year=format(test$date, format='%Y'), month=format(test$date, format='%m')) %>% 
  group_by(year, month) %>% 
  summarise(P3A_03=mean(P3A_03, na.rm=T))
test <- as.numeric(test$P3A_03)

plot.ts(train)
plot.ts(test)

train_log <- log(train)

plot.ts(diff(train_log, differences=2), xlab='Index', ylab=NULL)

train_diff <- diff(train_log, differences=2)

temp <- acf(train_diff, plot=F) # Cutoff value: 3 / Model: MA(2)
plot(temp, main=NA)

temp <- pacf(train_diff, plot=F) # Cutoff value: 15 / Model: AR(14)
plot(temp, main=NA)

train_arima <- arima(train_log, order=c(14, 2, 1))
train_pred <- predict(train_arima, n.ahead = length(test))

predict(train_arima, n.ahead = 1000)

temp <- c(rep(NA, length(train)), exp(train_pred$pred))
temp <- as.data.frame(cbind(c(train, test), temp))

temp$index <- (1:nrow(temp))
names(temp) <- c('target', 'pred', 'index')

ggplot(temp, aes(index, target)) + 
  geom_rect(xmin=79, xmax=86, ymin=-Inf, ymax=Inf, fill="yellow", alpha=0.0025) + 
  geom_line(size=1.5, alpha=0.5) + 
  geom_point(size=2.5) + 
  geom_line(aes(index, pred), colour='red', size=1.5, alpha=0.5) + 
  geom_point(aes(index, pred), size=2.5, colour='red') + 
  scale_x_continuous(limit=c(60, 86)) + 
  labs(x='Index', y='Rate ($/Day)', title='Model: ARIMA (p=14, d=2, q=2)', 
       subtitle=paste('RMSE: ', round(1878.35, 1), '\nMAPE: ', round(16.37, 2))) + 
  theme_bw()

temp <- na.omit(temp)

round(sqrt(sum((temp$target - temp$pred)^2)/nrow(temp)), 2) # RMSE
round(sum((abs(temp$target - temp$pred)/temp$pred*100))/nrow(temp), 2) # MAPE

rm(list=c('temp', 'train_arima', 'train_pred', 'test', 'train', 'train_diff', 'train_log'))

###########################
# Change time unit (Week) #
###########################

head(dataset %>%
       mutate(day=weekdays(date)))

temp <- NULL

for(i in 1:(nrow(dataset)/5)) {
  temp <- c(temp, rep(i, 5))
}

dataset_week <- dataset[1:length(temp),]
dataset_week$index <- temp

dataset_week <- dataset_week %>%
  mutate(year=format(dataset_week$date, format='%Y'), month=format(dataset_week$date, format='%m')) %>%
  group_by(index) %>%
  summarise(P3A_03=mean(P3A_03, na.rm=T), BPI=mean(BPI, na.rm=T), LIBOR=mean(LIBOR, na.rm=T), won.per.dollar=mean(won.per.dollar, na.rm=T))

statsNA(dataset_week$P3A_03)

ggplot(dataset_week, aes(index, P3A_03)) + 
  geom_line() + 
  labs(x='Index', y='Rate ($/Day)') + 
  theme_bw()

rm(list=c('i', 'temp'))

############################################################
# Check Autocorrelation and dataset & Imputation NA values #
############################################################

temp <- acf(dataset_week$P3A_03[!is.na(dataset_week$P3A_03)], lag=50, plot=F)
temp <- as.numeric(unlist(temp)[1:51])
temp <- data.frame(lag=0:(length(temp) - 1), acf=temp)

sum(temp$acf > 0.5)

ggplot(temp, aes(lag, acf)) +
  geom_rect(xmin=-Inf, xmax=8, ymin=0.5, ymax=Inf, fill='yellow', alpha=0.0025) +
  geom_bar(stat='identity') +
  geom_vline(xintercept = 8, linetype='dashed', colour='red', size=1.25) +
  geom_hline(yintercept = 0.5, linetype='dashed', colour='red', size=1.25) +
  labs(x='Lag', y='ACF') +
  scale_x_continuous(breaks=c(8, (0:5)*15)) +
  theme_bw()

sum(is.na(dataset_week$P3A_03))

dataset_week_imputed <- dataset_week 
dataset_week_imputed$P3A_03 <- na.kalman(dataset_week$P3A_03, model='auto.arima')

################################################################
# Extracting & Normalizing train, validation and test for LSTM #
################################################################

dataset_processed <- dataset_week_imputed %>% 
  select(index, P3A_03) %>% 
  mutate(key=as.factor(c(rep('train', 300 + 4), 
                         rep('validation', 25 + 4), 
                         rep('test', nrow(dataset_week) - (300 + 25 + 4 + 4)))))
names(dataset_processed)[2] <- 'value'

dataset_processed$value <- sqrt(dataset_processed$value)

mean_history <- mean(dataset_processed$value, na.rm=T)
sd_history <- sd(dataset_processed$value, na.rm=T)

dataset_processed$value <- (dataset_processed$value - mean_history)/sd_history

rm(list='temp')

#################
# Training LSTM #
#################

#########################
# Setting Hyperarameter #
#########################

lag_setting <- 4
batch_size <- 25
train_length <- 300
tsteps <- 1
epochs <- 50

#################
# Train dataset #
#################

lag_train <- dataset_processed %>%
  mutate(value_lag=lag(value, n=lag_setting)) %>%
  filter(!is.na(value_lag)) %>%
  filter(key == 'train') %>%
  tail(train_length)

x_train_vec <- lag_train$value_lag
x_train_arr <- array(data=x_train_vec, dim=c(length(x_train_vec), 1, 1))

y_train_vec <- lag_train$value
y_train_arr <- array(data=y_train_vec, dim = c(length(y_train_vec), 1))

######################
# Validation dataset #
######################

lag_validation <- dataset_processed %>%
  mutate(value_lag=lag(value, n=lag_setting)) %>%
  filter(!is.na(value_lag)) %>%
  filter(key == 'validation') %>%
  tail(25)

x_validation_vec <- lag_validation$value_lag
x_validation_arr <- array(data=x_validation_vec, dim=c(length(x_validation_vec), 1, 1))

y_validation_vec <- lag_validation$value
y_validation_arr <- array(data=y_validation_vec, dim = c(length(y_validation_vec), 1))

#################
# Train dataset #
#################

lag_test <- dataset_processed %>%
  mutate(value_lag=lag(value, n=lag_setting)) %>%
  filter(!is.na(value_lag)) %>%
  filter(key == 'test') %>%
  tail(25)

x_test_vec <- lag_test$value_lag
x_test_arr <- array(data=x_test_vec, dim=c(length(x_test_vec), 1, 1))

y_test_vec <- lag_test$value
y_test_arr <- array(data=y_test_vec, dim = c(length(y_test_vec), 1))

##################
# Training model #
##################

model <- keras_model_sequential()

model %>%
  layer_lstm(units            = 50,
             input_shape      = c(tsteps, 1),
             batch_size       = batch_size,
             return_sequences = TRUE,
             stateful         = TRUE) %>%
  layer_lstm(units            = 50,
             return_sequences = FALSE,
             stateful         = TRUE) %>%
  layer_dense(units = 1)

model %>%
  compile(loss='mae', optimizer='adam')

for (i in 1:epochs) {
  model %>% fit(x          = x_train_arr,
                y          = y_train_arr,
                batch_size = batch_size,
                epochs     = 1,
                verbose    = 1,
                shuffle    = FALSE)

  model %>% reset_states()
  cat("Epoch: ", i)
}

pred <- model %>% 
  predict(x_test_arr, batch_size=25) %>%
  .[, 1]

recover.value <- function(x) {
  x <- (x*sd_history + mean_history)^2
  return(x)
}

pred <- recover.value(pred)
actual <- recover.value(y_test_vec)

temp <- as.data.frame(cbind(index=1:25, pred, actual))

rmse <- sqrt(sum((pred - actual)^2)/25)
mape <- sum(abs(pred - actual)/actual*100)/25

ggplot(temp, aes(index, actual)) + 
  geom_line(size=2, alpha=0.5) + 
  geom_line(aes(index, pred), colour='red', size=2, alpha=0.5) + 
  labs(x='Index', y='Rate ($/Day)', title='Model: LSTM', 
       subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) + 
  theme_bw()

##########################
# Tunning hyperparameter #
##########################

sse <- list()
epochs <- c(1:20*10)
iteration <- length(epochs)

for (i in 1:length(epochs)) {
  model <- keras_model_sequential()
  
  model %>%
    layer_lstm(units            = 50, 
               input_shape      = c(tsteps, 1), 
               batch_size       = batch_size,
               return_sequences = TRUE, 
               stateful         = TRUE) %>% 
    layer_lstm(units            = 50, 
               return_sequences = FALSE, 
               stateful         = TRUE) %>% 
    layer_dense(units = 1)
  
  model %>% 
    compile(loss='mae', optimizer='adam')
  
  for (j in 1:epochs[i]) {
    model %>% fit(x          = x_train_arr, 
                  y          = y_train_arr, 
                  batch_size = batch_size,
                  epochs     = 1, 
                  verbose    = 1, 
                  shuffle    = FALSE)
    
    model %>% reset_states()
    cat("Epoch: ", j)
  }
  
  pred <- model %>% 
    predict(x_validation_arr, batch_size=25) %>%
    .[, 1]
  
  pred <- recover.value(pred)
  actual <- recover.value(y_validation_vec)
  
  sse[[i]] <- sum((pred - actual)^2)
}

sse <- unlist(sse)
temp <- data.frame(epoch=1:20*10, se=se)

ggplot(temp, aes(epoch, se)) + 
  geom_bar(stat='identity') + 
  labs(x='Num of Epoch', y='SSE') + 
  theme_bw()

########################################################
# Extracting train, validation and test for Tree model #
########################################################

train <- dataset %>%
  filter(format(date, '%Y') >= 2011 & format(date, '%Y') < 2017)

validation <- dataset %>%
  filter(format(date, '%Y') >= 2017 & date <= as.Date('2017-06-30'))

test <- dataset %>%
  filter(date > as.Date('2017-06-30'))

#########################################################
# Check train, validation and test & Imputing NA values #
#########################################################

for (i in 2:5) {
  statsNA(unlist(train[, i]))
}

for(i in 2:5) {
  statsNA(unlist(validation[, i]))
}

for (i in 2:5) {
  statsNA(unlist(test[, i]))
}

temp <- data.frame(na=train$date[is.na(train$P3A_03)])

p1 <- ggplot(train, aes(date, P3A_03)) + 
  geom_line() + 
  geom_vline(data=temp, aes(xintercept=na), colour='yellow') + 
  labs(x='Year', y='Rate ($/Day)') + 
  theme_bw()

impute.na <- function(x) {
  x[2] <- na.kalman(x[2], model='auto.arima')
  x[3] <- na.kalman(x[3], model='auto.arima')
  x[4] <- na.kalman(x[4], model='auto.arima')
  x[5] <- na.kalman(x[5], model='auto.arima')
  return(x)
}

train <- impute.na(train)
temp <-temp %>% 
  mutate(pred=train$P3A_03[train$date %in% temp$na])

p2 <- ggplot(train, aes(date, P3A_03)) + 
  geom_vline(data=temp, aes(xintercept=na), colour='yellow') + 
  geom_line() + 
  geom_point(data=temp, aes(na, pred), colour='red') + 
  labs(x='Year', y='Rate ($/Day)') + 
  theme_bw()

grid.arrange(p1, p2)

rm(list=c('p1', 'p2', 'temp', 'i'))

#######################################################
# Setting target variable & Creating derived variable #
#######################################################

create.dv <- function(x) {
  x <- x %>% 
    mutate(mul_P3A.BPI=P3A_03*BPI, 
           mul_P3A.LIBOR=P3A_03*LIBOR, 
           div_P3A.won=P3A_03/won.per.dollar, 
           mul_BPI.LIBOR=BPI*LIBOR, 
           div_BPI.won=BPI/won.per.dollar, 
           div_won.LIBOR=won.per.dollar/LIBOR)
  
  x$P3A_03 <- round(x$P3A_03, 0)
  x$BPI <- round(x$BPI, 0)
  x$LIBOR <- round(x$LIBOR, 4)
  x$won.per.dollar <- round(x$won.per.dollar , 0)
  x$mul_P3A.BPI <- x$mul_P3A.BPI
  x$mul_P3A.LIBOR <- round(x$mul_P3A.LIBOR, 0)
  x$div_P3A.won <- round(x$div_P3A.won, 2)
  x$mul_BPI.LIBOR <- round(x$mul_BPI.LIBOR, 0)
  x$div_BPI.won <- round(x$div_BPI.won, 3)
  x$div_won.LIBOR <- round(x$div_won.LIBOR, 0)
  
  create.dv_diff <- function(x, l) {
    
    for (i in 1:l) {
      x[, ncol(x) + 1] <- c(rep(NA, i), x$P3A_03[(i + 1):nrow(x)] - x$P3A_03[1:(nrow(x) - i)])
    }
    
    x <- na.omit(x)
    return(x)
  }
  
  x <- create.dv_diff(x, 10)
  
  x$target <- c(rep(NA, 5), x$P3A_03[1:(nrow(x) - 5)])
  x <- na.omit(x)
  
  return(x)
}

train <- create.dv(train)

#######################
# Training Tree model #
#######################

set.seed(777)

model_xgb <- xgboost(data = data.matrix(train[, 2:(ncol(train) - 1)]),
                     label =                             train$target,
                     eta =                                       0.15,
                     min_child_weight =                             5,
                     gamma =                                      2.5,
                     subsample =                                 0.75,
                     colsample_bytree =                          0.75,
                     max_depth =                                    5,
                     nrounds =                                    500,
                     objective =                         "reg:linear",
                     eval_metric =                             "rmse")

model_gbm <- gbm(train$target ~ .,
                 data = train[, 2:(ncol(train) - 1)],
                 distribution =     "gaussian",
                 interaction.depth =         1,
                 shrinkage =              0.05,
                 bag.fraction =           0.75,
                 n.trees =                3000)

model_rf <- randomForest(x = train[2:(ncol(train) - 1)],
                         y =               train$target,
                         mtry =                      10,
                         ntree =                    500,
                         sampsize =                 500)

validation <- impute.na(validation)
validation <- create.dv(validation)

temp <- validation[c('date', 'target')]

temp$date <- temp$date + 7
temp$pred_rf <- predict(model_rf, as.matrix(validation[2:(ncol(validation) - 1)]))
temp$pred_xgb <- predict(model_xgb, as.matrix(validation[2:(ncol(validation) - 1)]))
temp$pred_gbm <- predict(model_gbm, validation[2:(ncol(validation) - 1)], n.trees=gbm.perf(model_gbm, method='OOB', plot.it=FALSE))

rmse <- list()
mape <- list()

for(i in 1:3) {
  rmse[[i]] <- sqrt(sum((temp['target'] - temp[, i + 2])^2)/nrow(temp))
  mape[[i]] <- sum(abs(temp['target'] - temp[, i + 2])/temp['target'])*100/nrow(temp)
}

p1 <- ggplot(temp, aes(date, target)) + 
  geom_line(size=2) + 
  geom_line(aes(date, pred_rf), colour='green', size=2, alpha=0.5) + 
  labs(x='Month', y='Rate ($/Day)', title='Model: Random Forest', 
       subtitle=paste('RMSE: ', round(rmse[[1]], 1), '\nMAPE: ', round(mape[[1]], 2))) + 
  theme_bw()

p2 <- ggplot(temp, aes(date, target)) + 
  geom_line(size=2) + 
  geom_line(aes(date, pred_xgb), colour='red', size=2, alpha=0.5) + 
  labs(x='Month', y='Rate ($/Day)', title='Model: XGBoost', 
       subtitle=paste('RMSE: ', round(rmse[[2]], 1), '\nMAPE: ', round(mape[[2]], 2))) + 
  theme_bw()

p3 <- ggplot(temp, aes(date, target)) + 
  geom_line(size=2) + 
  geom_line(aes(date, pred_gbm), colour='blue', size=2, alpha=0.5) + 
  labs(x='Month', y='Rate ($/Day)', title='Model: GBM', 
       subtitle=paste('RMSE: ', round(rmse[[3]], 1), '\nMAPE: ', round(mape[[3]], 2))) + 
  theme_bw()

grid.arrange(p1, p2, p3)

# weight <- list()
# sum_rmse <- 1/rmse[[1]] + 1/rmse[[2]] + 1/rmse[[3]]
# 
# for (i in 1:3) {
#   weight[[i]] <- (1/rmse[[i]])/(sum_rmse)
# }
# 
# pred_ensemble <- weight[[1]]*temp$pred_rf + weight[[2]]*temp$pred_xgb + weight[[3]]*temp$pred_gbm
# rmse <- sqrt(sum((temp['target'] - pred_ensemble)^2)/nrow(temp))
# mape <- sum(abs(temp['target'] - temp[, i + 2])/temp['target'])*100/nrow(temp)
# 
# ggplot(temp, aes(date, target)) + 
#   geom_line(size=2) + 
#   geom_line(aes(date, weight[[1]]*pred_rf + weight[[2]]*pred_xgb + weight[[3]]*pred_gbm), colour='grey', size=2, alpha=0.5) + 
#   labs(x='Month', y='Rate ($/Day)', title='Model: Ensemble', 
#        subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) + 
#   theme_bw()

rm(list=c('p1', 'p2', 'p3', 'temp', 'weight', 'i', 'mape', 'pred_ensemble', 'rmse', 'sum_rmse'))

############################################
# Tunning hyperparameter using Grid Search #
############################################

# result_grid.search <- list()
# target <- validation$target[6:nrow(validation)]
# mse_min <- numeric(3)

#################
# Random Forest #
#################

# grid.search <- expand.grid(
#   mtry=c(10, 13, 15, 17, 20),
#   ntree=c(500, 750, 1000, 1250, 1500),
#   sampsize=c(500, 750, 1500)
# )
# 
# model <- list()
# mse <- numeric(nrow(grid.search))
# 
# for (i in 1:nrow(grid.search)) {
#   cat('Iteration:', i, 'of', nrow(grid.search), '\n')
# 
#   model[[i]] <- randomForest(x =        train[2:(ncol(train) - 1)],
#                              y =                      train$target,
#                              mtry =         grid.search[i, 'mtry'],
#                              ntree =       grid.search[i, 'ntree'],
#                              sampsize = grid.search[i, 'sampsize'])
# 
#   predict_temp <- predict(model[[i]], as.matrix(validation[2:(ncol(validation) - 1)]))
#   predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
#   mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
# 
# grid.search$mse <- mse
# result_grid.search[[1]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[1] <- min(grid.search$mse)

###########
# XGBoost #
###########

# grid.search <- expand.grid(
#   eta=c(0.05, 0.1, 0.15, 0.2, 0.25),
#   min_child_weight=c(1, 3, 5, 7, 9),
#   gamma=c(2.5, 5, 7.5),
#   subsample=c(0.25, 0.5, 0.75),
#   colsample_bytree=c(0.5, 0.75, 1),
#   max_depth=c(3, 5, 7, 9), 
#   nrounds=c(250, 500, 750)
# )
# 
# model <- list()
# mse <- numeric(nrow(grid.search))
# 
# for (i in 1:nrow(grid.search)) {
#   cat('Iteration:', i, 'of', nrow(grid.search), '\n')
# 
#   model[[i]] <- xgboost(data =      data.matrix(train[, 2:(ncol(train) - 1)]),
#                         label =                                  train$target,
#                         eta =                           grid.search[i, 'eta'],
#                         min_child_weight = grid.search[i, 'min_child_weight'],
#                         gamma =                       grid.search[i, 'gamma'],
#                         subsample =               grid.search[i, 'subsample'],
#                         colsample_bytree = grid.search[i, 'colsample_bytree'],
#                         max_depth =               grid.search[i, 'max_depth'],
#                         nrounds =                   grid.search[i, 'nrounds'],
#                         objective =                              'reg:linear',
#                         eval_metric =                                  'rmse',
#                         verbose =                                           0)
# 
#   predict_temp <- predict(model[[i]], as.matrix(validation[2:(ncol(validation) - 1)]))
#   predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
#   mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
# 
# grid.search$mse <- mse
# result_grid.search[[2]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[2] <- min(grid.search$mse)

#############
# Light GBM #
#############

# grid.search <- expand.grid(
#   n.trees=c(500, 750, 1000, 1250),
#   interaction.depth=c(1, 2, 3),
#   shrinkage=c(0.05, 0.1, 0.15),
#   bag.fraction=c(0.25, 0.5, 0.75)
# )
# 
# model <- list()
# mse <- numeric(nrow(grid.search))
# 
# for (i in 1:nrow(validation)) {
#   cat('Iteration:', i, 'of', nrow(grid.search), '\n')
# 
#   model[[i]] <- gbm(target ~                                              .,
#                     data =                           train[, 2:ncol(train)],
#                     distribution =                               'gaussian',
#                     n.trees =                     grid.search[i, 'n.trees'],
#                     interaction.depth = grid.search[i, 'interaction.depth'],
#                     shrinkage =                 grid.search[i, 'shrinkage'],
#                     bag.fraction =           grid.search[i, 'bag.fraction'],
#                     cv.folds =                                            5)
# 
#   predict_temp <- predict(model[[i]], validation[2:(ncol(validation) - 1)], n.trees=gbm.perf(model[[i]], method='OOB', plot.it=FALSE))
#   predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
#   mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
# 
# grid.search$mse <- mse
# result_grid.search[[3]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[3] <- min(grid.search$mse)

###################
# Test Tree model #
###################

temp <- test
temp$target <- c(rep(NA, 5), temp$P3A_03[1:(nrow(temp) - 5)])
temp <- na.omit(temp[c('date','P3A_03', 'target')])

temp_test <- impute.na(test)
temp_test <- create.dv(test)

temp_test$pred <- predict(model_xgb, as.matrix(temp_test[2:(ncol(temp_test) - 1)]))
temp_test <- merge(temp[c('date', 'P3A_03','target')], temp_test[c('date', 'pred')], by='date')
temp_test$date <- temp_test$date + 7

rmse <- sqrt(sum((temp_test['target'] - temp_test['pred'])^2)/nrow(temp_test))
mape <- sum(abs(temp_test['target'] - temp_test['pred'])/temp_test['target'])*100/nrow(temp_test)

ggplot(temp_test, aes(date, target)) + 
  geom_line(size=2) + 
  geom_line(aes(date, pred), colour='red', size=2, alpha=0.5) + 
  labs(x='Month', y='Rate ($/Day)', title='Model: XGBoost', 
       subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) + 
  scale_x_date(labels=date_format("%m"), breaks='1 month') + 
  theme_bw()

p1 <- ggplot(temp_test, aes(abs(P3A_03 - target))) + 
  geom_histogram(colour=NA, fill='red') + 
  scale_x_continuous(limits=c(0, 2500)) + 
  scale_y_continuous(limits=c(0, 30)) + 
  labs(x='Difference of Rate ($/Day)', y='Count') + 
  theme_bw()

p2 <- ggplot(temp_test, aes(abs(pred - target))) + 
  geom_histogram(colour=NA, fill='green') + 
  scale_x_continuous(limits=c(0, 2500)) + 
  scale_y_continuous(limits=c(0, 30)) + 
  labs(x='Difference of Rate ($/Day)', y='Count') + 
  theme_bw()

grid.arrange(p1, p2)

importance <- as.data.frame(xgb.importance(model=model_xgb))
importance$Feature <- factor(importance$Feature, levels=importance$Feature[order(importance$Gain)])

ggplot(importance, aes(Feature, Gain)) + 
  geom_bar(stat='identity', width=0.7) + 
  coord_flip() + 
  guides(fill=F) + 
  labs(y='Imformation Gain', x='Feature') + 
  theme_bw()

plot(1/train$won.per.dollar, train$P3A_03)
plot(train$P3A_03, train$target)

'Project' 카테고리의 다른 글

2020. 09. KAERI, 진동체 활용 충돌체 예측 대회 (0)	2020.12.02
2020. 09. BIG CONTEST, 혁신 아이디어 분야 대회 참가 (0)	2020.12.02
2020. 07. PNU, 졸업과제 경진대회 (0)	2020.12.02
2018. 10. Term Project, 메가박스와 롯데시네마의 매표 운영 시스템 전략 비교 (0)	2020.12.02
2017. 10. Term Project, 선출직 공무원 데이터 베이스 관리 시스템 개발 (0)	2020.12.02

Data world with 🌻Pep🌻

2018. 11. Term Project, 항만 물류 데이터를 활용한 예측 모델링

'Project' 카테고리의 다른 글

티스토리툴바

2018. 11. Term Project, 항만 물류 데이터를 활용한 예측 모델링

'Project' 카테고리의 다른 글

'Project' Related Articles

티스토리툴바