< Proposal Presentation >
< Final Presentation >
< EDA 1 (R Code) >
library(dplyr)
library(imputeTS)
library(ggplot2)
###################
# Loading dataset #
###################
dataset <- read.csv('C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/Dataset.csv')
dataset$date <- as.Date(dataset$date)
temp <- data.frame(date=seq(as.Date('1964-05-04'), as.Date('2018-02-12'), by=1))
dataset <- merge(dataset, temp, by='date', all=T)
rm(list='temp')
dataset <- dataset %>% filter(format(date, '%Y') >= 2003)
#################
# Check dataset #
#################
# for(i in 2:5) {
# statsNA(dataset[, i])
# }
#
# summary(dataset)
#####################################
# Excepting the weekends in dataset #
#####################################
dataset <- dataset[!(weekdays(dataset$date) %in% c('ํ ์์ผ', '์ผ์์ผ')), ]
for(i in 2:5) {
statsNA(dataset[, i])
}
summary(dataset)
#############################################
# Time series plot about P3A_03 (Unit: Day) #
#############################################
dataset$index <- 1:nrow(dataset)
ggplot(dataset, aes(index, P3A_03)) +
geom_line() +
labs(x='Index', y='P3A_03 ($/Day)') +
theme_bw()
############################################
# Autocorrelation about P3A_03 (Unit: Day) #
############################################
temp_rate <- dataset[, 'P3A_03']
length_row <- length(temp_rate)
for (i in 1:3500) {
temp <- c(rep(NA, i), temp_rate[1:(length_row - i)])
temp_rate <- cbind(temp_rate, temp)
}
temp_rate <- as.data.frame(temp_rate)
names(temp_rate) <- c('rate', paste('lag_', 1:(ncol(temp_rate) - 1), sep=''))
temp_cor <- NULL
for (i in 1:3500) {
temp_cor <- c(temp_cor, cor(temp_rate[, 1], temp_rate[, i + 1], use="complete.obs"))
}
temp_cor <- as.data.frame(cbind(c(1:3500), temp_cor))
ggplot(temp_cor, aes(V1, temp_cor)) +
geom_bar(stat='identity') +
labs(x='Lag', y='Correlation') +
theme_bw()
rm(list=c('temp_cor', 'temp_rate','i', 'length_row', 'temp'))
#############################################
# Autocorrelation about P3A_03 (Unit: Week) #
#############################################
dataset$week <- rep(NA, nrow(dataset))
dataset$week[1:3] <- 1
temp_week <- NULL
for(i in 1:(length(4:(nrow(dataset)) - 1)/5)) {
temp_week <- c(temp_week, rep(i + 1, 5))
}
dataset$week[4:(nrow(dataset) - 1)] <- temp_week
dataset$week[nrow(dataset)] <- 790
temp_dataset <- dataset %>%
group_by(week) %>%
summarise(mean_rate=mean(P3A_03, na.rm=T))
length_row <- nrow(temp_dataset)
for (i in 1:750) {
temp <- c(rep(NA, i), temp_dataset$mean_rate[1:(length_row - i)])
temp_dataset <- cbind(temp_dataset, temp)
}
names(temp_dataset) <- c('index', 'mean_rate', paste('lag_', 1:(ncol(temp_dataset) - 2), sep=''))
temp_cor <- NULL
for (i in 1:750) {
temp_cor <- c(temp_cor, cor(temp_dataset[, 2], temp_dataset[, i + 2], use="complete.obs"))
}
temp_cor <- as.data.frame(cbind(c(1:750), temp_cor))
ggplot(temp_cor, aes(V1, temp_cor)) +
geom_bar(stat='identity') +
labs(x='Lag', y='Correlation') +
theme_bw()
rm(list=c('temp_cor', 'temp_dataset', 'i', 'length_row', 'temp', 'temp_week'))
##############################################
# Autocorrelation about P3A_03 (Unit: Month) #
##############################################
dataset$year <- format(dataset$date, format='%Y')
dataset$month <- format(dataset$date, format='%m')
temp_dataset <- dataset %>%
group_by(year, month) %>%
summarise(mean_rate=mean(P3A_03, na.rm=T))
temp_dataset <- as.data.frame(temp_dataset)
length_row <- nrow(temp_dataset)
for (i in 1:180) {
temp <- c(rep(NA, i), temp_dataset$mean_rate[1:(length_row - i)])
temp_dataset <- cbind(temp_dataset, temp)
}
names(temp_dataset) <- c('year', 'month', 'mean_rate', paste('lag_', 1:(ncol(temp_dataset) - 3), sep=''))
temp_cor <- NULL
for (i in 1:180) {
temp_cor <- c(temp_cor, cor(temp_dataset[, 3], temp_dataset[, i + 3], use="complete.obs"))
}
temp_cor <- as.data.frame(cbind(c(1:length(temp_cor)), temp_cor))
ggplot(temp_cor, aes(V1, temp_cor)) +
geom_bar(stat='identity') +
labs(x='Lag', y='Correlation') +
theme_bw()
rm(list=c('temp_cor', 'temp_dataset', 'i', 'length_row', 'temp'))
< EDA 2 (R Code) >
library(dplyr)
library(reshape)
library(ggplot2)
library(ggcorrplot)
library(scales)
library(gridExtra)
library(PerformanceAnalytics)
library(xlsx)
BCI <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/FreightIndex.xlsx", sheetName="BCI")
BDI <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/FreightIndex.xlsx", sheetName="BDI")
BPI <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/FreightIndex.xlsx", sheetName="BPI")
BSI <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/FreightIndex.xlsx", sheetName="BSI")
BHSI <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/FreightIndex.xlsx", sheetName="BHSI")
LIBOR <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/LIBOR.xlsx", sheetName="LIBOR")
won.dollar <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/Exchange_Won-Dollar.xlsx", sheetName="data")
bunker <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/BunkerPrices.xlsx", sheetName="data")
freight.rate <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/TargetVariableCandidates.xlsx", sheetName="data")
timecharter.rate <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/TimecharterRate.xlsx", sheetName="data")
earning.panamax <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/Earning_Panamax.xlsx", sheetName="data")
price.panamax <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/Price_Panamax.xlsx", sheetName="data")
GDP <- read.xlsx("C:/Users/user/Desktop/TermProject(๋ฐ์ดํฐ๋ง์ด๋)/Data/GDP.xlsx", sheetName="data")
## Preprocessing
BPI$date <- as.Date(substr(as.character(BPI$date), 1, 10))
BHSI$date <- as.Date(BHSI$date, format= "%d/%m/%Y")
index <- BCI %>%
merge(BDI, by="date", all=T) %>%
merge(BPI, by="date", all=T) %>%
merge(BSI, by="date", all=T) %>%
merge(BHSI, by="date", all=T)
index$year <- format(index$date, "%Y")
index$month <- format(index$date, "%m")
BCI_year.month <- index %>%
select(year, month, BCI) %>%
group_by(year, month) %>%
summarise(mean_BCI=mean(BCI, na.rm=T), sd_BCI=sd(BCI, na.rm=T), coef.of.var_BCI=sd_BCI/mean_BCI)
BDI_year.month <- index %>%
select(year, month, BDI) %>%
group_by(year, month) %>%
summarise(mean_BDI=mean(BDI, na.rm=T), sd_BDI=sd(BDI, na.rm=T), coef.of.var_BDI=sd_BDI/mean_BDI)
BPI_year.month <- index %>%
select(year, month, BPI) %>%
group_by(year, month) %>%
summarise(mean_BPI=mean(BPI, na.rm=T), sd_BPI=sd(BPI, na.rm=T), coef.of.var_BPI=sd_BPI/mean_BPI)
BSI_year.month <- index %>%
select(year, month, BSI) %>%
group_by(year, month) %>%
summarise(mean_BSI=mean(BSI, na.rm=T), sd_BSI=sd(BSI, na.rm=T), coef.of.var_BSI=sd_BSI/mean_BSI)
BHSI_year.month <- index %>%
select(year, month, BHSI) %>%
group_by(year, month) %>%
summarise(mean_BHSI=mean(BHSI, na.rm=T), sd_BHSI=sd(BHSI, na.rm=T), coef.of.var_BHSI=sd_BHSI/mean_BHSI)
index_year.month <- BCI_year.month %>%
merge(BPI_year.month, by=c("year", "month"), all=T) %>%
merge(BDI_year.month, by=c("year", "month"), all=T) %>%
merge(BSI_year.month, by=c("year", "month"), all=T) %>%
merge(BHSI_year.month, by=c("year", "month"), all=T)
BCI$diff_1[!is.na(BCI[2])] <- c(NA, na.omit(BCI)[2:nrow(na.omit(BCI)), 2] - na.omit(BCI)[1:(nrow(na.omit(BCI)) - 1), 2])
BDI$diff_1[!is.na(BDI[2])] <- c(NA, na.omit(BDI)[2:nrow(na.omit(BDI)), 2] - na.omit(BDI)[1:(nrow(na.omit(BDI)) - 1), 2])
BPI$diff_1[!is.na(BPI[2])] <- c(NA, na.omit(BPI)[2:nrow(na.omit(BPI)), 2] - na.omit(BPI)[1:(nrow(na.omit(BPI)) - 1), 2])
BSI$diff_1[!is.na(BSI[2])] <- c(NA, na.omit(BSI)[2:nrow(na.omit(BSI)), 2] - na.omit(BSI)[1:(nrow(na.omit(BSI)) - 1), 2])
BHSI$diff_1[!is.na(BHSI[2])] <- c(NA, na.omit(BHSI)[2:nrow(na.omit(BHSI)), 2] - na.omit(BHSI)[1:(nrow(na.omit(BHSI)) - 1), 2])
BCI$diff_2[!is.na(BCI[2])] <- c(NA, NA, na.omit(BCI[2])[3:nrow(na.omit(BCI[2])), ] - na.omit(BCI[2])[1:(nrow(na.omit(BCI[2])) - 2), ])
BDI$diff_2[!is.na(BDI[2])] <- c(NA, NA, na.omit(BDI[2])[3:nrow(na.omit(BDI[2])), ] - na.omit(BDI[2])[1:(nrow(na.omit(BDI[2])) - 2), ])
BPI$diff_2[!is.na(BPI[2])] <- c(NA, NA, na.omit(BPI[2])[3:nrow(na.omit(BPI[2])), ] - na.omit(BPI[2])[1:(nrow(na.omit(BPI[2])) - 2), ])
BSI$diff_2[!is.na(BSI[2])] <- c(NA, NA, na.omit(BSI[2])[3:nrow(na.omit(BSI[2])), ] - na.omit(BSI[2])[1:(nrow(na.omit(BSI[2])) - 2), ])
BHSI$diff_2[!is.na(BHSI[2])] <- c(NA, NA, na.omit(BHSI[2])[3:nrow(na.omit(BHSI[2])), ] - na.omit(BHSI[2])[1:(nrow(na.omit(BHSI)[2] - 2)), ])
BCI$diff_3[!is.na(BCI[2])] <- c(NA, NA, NA, na.omit(BCI[2])[4:nrow(na.omit(BCI[2])), ] - na.omit(BCI[2])[1:(nrow(na.omit(BCI[2])) - 3), ])
BDI$diff_3[!is.na(BDI[2])] <- c(NA, NA, NA, na.omit(BDI[2])[4:nrow(na.omit(BDI[2])), ] - na.omit(BDI[2])[1:(nrow(na.omit(BDI[2])) - 3), ])
BPI$diff_3[!is.na(BPI[2])] <- c(NA, NA, NA, na.omit(BPI[2])[4:nrow(na.omit(BPI[2])), ] - na.omit(BPI[2])[1:(nrow(na.omit(BPI[2])) - 3), ])
BSI$diff_3[!is.na(BSI[2])] <- c(NA, NA, NA, na.omit(BSI[2])[4:nrow(na.omit(BSI[2])), ] - na.omit(BSI[2])[1:(nrow(na.omit(BSI[2])) - 3), ])
BHSI$diff_3[!is.na(BHSI[2])] <- c(NA, NA, NA, na.omit(BHSI[2])[4:nrow(na.omit(BHSI[2])), ] - na.omit(BHSI[2])[1:(nrow(na.omit(BHSI[2])) - 3), ])
BPI$month <- format(BPI$date, format="%m")
BPI$year <- format(BPI$date, format="%Y")
BPI$day <- format(BPI$date, format="%d")
LIBOR$date <- as.Date(LIBOR$date, format= "%d.%m.%Y")
LIBOR$LIBOR <- LIBOR$LIBOR
won.dollar$won.dollar <- as.character(won.dollar$won.dollar)
won.dollar <- won.dollar %>%
filter(won.dollar != "") %>%
select(date, won.dollar)
won.dollar$date <- as.Date(won.dollar$date, format="%Y/%m/%d")
won.dollar$won.dollar <- as.numeric(won.dollar$won.dollar)
bunker <- bunker[!(is.na(bunker$price_MDO) & is.na(bunker$price_380cst)), 1:ncol(bunker)]
bunker$price_MDO[which.max(bunker$price_MDO)] <- 1030
a <- bunker[1:2]
names(a)[2] <- "price"
a$type <- "380cst"
b <- bunker[c(1, 3)]
names(b)[2] <- "price"
b$type <- "MDO"
bunker_total <- rbind(a, b)
rate_BHSI <- freight.rate[1:2]
rate_BPI <- freight.rate[3:4]
rate_BSI <- freight.rate[5:6]
rate_BHSI$date <- as.Date(rate_BHSI$date, format="%d/%m/%Y")
rate_BHSI <- na.omit(rate_BHSI)
names(rate_BPI)[1] <- "date"
rate_BPI$date <- as.Date(rate_BPI$date, format="%Y-%m-%d")
rate_BPI <- na.omit(rate_BPI)
names(rate_BSI)[1] <- "date"
rate_BSI <- na.omit(rate_BSI)
names(timecharter.rate) <- gsub("X", "", names(timecharter.rate))
timecharter.rate_total <- data.frame(date=as.Date(NA), rate=NA, type=NA)
for(i in 2:18) {
temp_df <- timecharter.rate[c(1, i)]
temp_type <- names(timecharter.rate)[i]
names(temp_df)[2] <- "rate"
temp_df$type <- temp_type
timecharter.rate_total <- rbind(timecharter.rate_total, temp_df)
}
timecharter.rate_total <- na.omit(timecharter.rate_total)
a <- earning.panamax[c(1, 2)]
names(a)[2] <- "earning"
a$route <- names(earning.panamax)[2]
b <- earning.panamax[c(1, 3)]
names(b)[2] <- "earning"
b$route <- names(earning.panamax)[3]
c <- earning.panamax[c(1, 4)]
names(c)[2] <- "earning"
c$route <- names(earning.panamax)[4]
earning.panamax_total <- rbind(a, b, c)
earning.panamax_total <- na.omit(earning.panamax_total)
GDP_cor <- GDP
GDP_cor <- na.omit(GDP_Original)
GDP_cor$year <- format(GDP_cor$year, format="%Y")
names(GDP)[2:3] <- c("Japan", "Korea")
GDP[2:3] <- data.frame(apply(GDP[2:3], 2, function(x) round(x, 2)))
GDP[1] <- format(GDP[1], format="%Y")
GDP_melted <- melt(GDP, id="year")
names(GDP_melted)[2:3] <- c("country", "percent")
GDP_melted$year <- as.numeric(GDP$year)
names(price.panamax) <- gsub("X", "", names(price.panamax))
price.panamax_melted <- melt(price.panamax, id="date")
names(price.panamax_melted)[2:3] <- c("type", "price")
price.panamax_melted$type <- as.character(price.panamax_melted$type)
price.panamax_melted$type[price.panamax_melted$type == "93.96K"] <- "93-96K"
price.panamax_melted$type[price.panamax_melted$type == "80.82K"] <- "80-82K"
## Visualization
# About all Feight indexes
ggplot(index) +
geom_line(aes(date, BCI, colour="BCI"), size=2) +
geom_line(aes(date, BDI, colour="BDI"), size=2) +
geom_line(aes(date, BPI, colour="BPI"), size=2) +
geom_line(aes(date, BSI, colour="BSI"), size=2) +
geom_line(aes(date, BHSI, colour="BHSI"), size=2) +
scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) +
labs(x="Year", y="Index", colour="Index") +
theme_bw()
ggcorrplot(cor(index[sapply(index, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")
# # Relationgship between Freight indexes
#
# p1 <- ggplot(index, aes(BCI, BDI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p2 <- ggplot(index, aes(BCI, BPI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p3 <- ggplot(index, aes(BCI, BSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p4 <- ggplot(index, aes(BSI, BHSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p5 <- ggplot(index, aes(BDI, BPI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p6 <- ggplot(index, aes(BDI, BSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p7 <- ggplot(index, aes(BDI, BHSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p8 <- ggplot(index, aes(BPI, BSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p9 <- ggplot(index, aes(BPI, BHSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
# p10 <- ggplot(index, aes(BSI, BHSI)) +
# geom_point(colour="orange", alpha=0.2) +
# geom_smooth(method="lm", colour="red") +
# theme_bw()
#
# grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)
# Cv of Freight indexes
# ggplot(index_year.month, aes(month, coef.of.var_BCI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Cv of BCI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(month, coef.of.var_BDI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Cv of BDI", colour="Year") +
# theme_bw()
temp <- index_year.month %>% filter(year >= 2013 & year < 2018) %>% select(month, coef.of.var_BPI, year)
temp$month <- as.numeric(temp$month)
ggplot(temp, aes(month, coef.of.var_BPI)) +
geom_line(aes(group=year, colour=year), size=2) +
geom_point(aes(month, coef.of.var_BPI, colour=year)) +
geom_rect(xmin=4, xmax=6, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
geom_rect(xmin=8, xmax=9, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
geom_rect(xmin=6, xmax=8, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) +
geom_rect(xmin=1, xmax=2, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) +
stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) +
stat_summary(fun.y="mean", geom="point", colour="black", size=2.5) +
scale_x_continuous(breaks=c(1:12)) +
labs(x="Month", y="Cv of BPI", colour="Year") +
theme_bw() +
theme(panel.grid.minor.x = element_blank())
# ggplot(index_year.month, aes(month, coef.of.var_BSI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Cv of BSI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(month, coef.of.var_BHSI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Cv of BHSI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(year, coef.of.var_BCI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Cv of BCI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, coef.of.var_BDI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Cv of BDI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, coef.of.var_BPI)) +
# geom_boxplot(aes()) +
# geom_point(aes(year, coef.of.var_BPI, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Cv of BPI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, coef.of.var_BSI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Cv of BSI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, coef.of.var_BHSI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Cv of BHSI", colour="Month") +
# theme_bw()
# Mean of Feight indexes
# ggplot(index_year.month, aes(month, mean_BCI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Mean of BCI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(month, mean_BDI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Mean of BDI", colour="Year") +
# theme_bw()
temp <- index_year.month %>% filter(year >= 2013 & year < 2018) %>% select(month, mean_BPI, year)
temp$month <- as.numeric(temp$month)
ggplot(temp, aes(month, mean_BPI)) +
geom_line(aes(group=year, colour=year), size=2) +
geom_point(aes(month, mean_BPI, colour=year)) +
geom_rect(xmin=1, xmax=2, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) +
geom_rect(xmin=2, xmax=3, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
geom_rect(xmin=4, xmax=6, ymin=-Inf, ymax=Inf, fill="blue", alpha=0.005) +
geom_rect(xmin=6, xmax=7, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
geom_rect(xmin=9, xmax=10, ymin=-Inf, ymax=Inf, fill="red", alpha=0.005) +
stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) +
stat_summary(fun.y="mean", geom="point", colour="black", size=2.5) +
scale_x_continuous(breaks=c(1:12)) +
labs(x="Month", y="Mean of BPI ($/Day)", colour="Year") +
theme_bw() +
theme(panel.grid.minor.x = element_blank())
# ggplot(index_year.month, aes(month, mean_BSI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Mean of BSI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(month, mean_BHSI)) +
# geom_point(aes(colour=year)) +
# geom_line(aes(group=year, colour=year)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Mean of BHSI", colour="Year") +
# theme_bw()
# ggplot(index_year.month, aes(year, mean_BCI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Mean of BCI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, mean_BDI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Month", y="Mean of BDI", colour="Month") +
# theme_bw()
ggplot(index_year.month, aes(year, mean_BPI)) +
geom_boxplot(aes()) +
geom_point(aes(year, mean_BPI, colour=month)) +
stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
labs(x="Year", y="Mean of BPI", colour="Month") +
theme_bw()
# ggplot(index_year.month, aes(year, mean_BSI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Mean of BSI", colour="Month") +
# theme_bw()
# ggplot(index_year.month, aes(year, mean_BHSI)) +
# geom_point(aes(colour=month)) +
# geom_line(aes(group=month, colour=month)) +
# stat_summary(fun.y="mean", geom="point", colour="red", size=5) +
# labs(x="Year", y="Mean of BHSI", colour="Month") +
# theme_bw()
# About LIBOR
ggplot(LIBOR, aes(date, LIBOR)) +
geom_line() +
scale_x_date(breaks=date_breaks("5 years"), labels=date_format("%Y")) +
labs(x="Year") +
theme_bw()
# About Won per Dollar
ggplot(won.dollar, aes(date, won.dollar)) +
geom_line() +
scale_x_date(breaks=date_breaks("5 years"), labels=date_format("%Y")) +
labs(x="Year", y="Won per Dollar") +
theme_bw()
# About all Bunker prices
ggplot(bunker_total, aes(date, price, group=type, colour=type)) +
geom_line(size=2) +
theme_bw() +
scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) +
labs(x="Year", y="Price ($/tonne)", colour="Type")
ggcorrplot(cor(bunker[sapply(bunker, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")
# About Freight rates
ggplot(rate_BHSI) +
geom_density(aes(BHSI_HS6, fill="HS6"), alpha=0.25, size=0) +
geom_density(data=rate_BPI, aes(BPI_P3A_03, fill="P3A_03"), alpha=0.25, size=0) +
geom_density(data=rate_BSI, aes(BSI_S1B_58, fill="S1B_58"), alpha=0.25, size=0) +
labs(x="Freight Rate ($/Day)", y="Density", fill="Route") +
theme_bw()
p1 <- ggplot(rate_BHSI, aes(date, BHSI_HS6)) +
geom_line() +
scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) +
labs(x="Year", y="HS6 ($/Day)") +
theme_bw()
p2 <- ggplot(rate_BPI, aes(date, BPI_P3A_03)) +
geom_line() +
scale_x_date(breaks=date_breaks("4 year"), labels=date_format("%Y")) +
labs(x="Year", y="P3A_03 ($/Day)") +
theme_bw()
p3 <- ggplot(rate_BSI, aes(date, BSI_S1B_58)) +
geom_line() +
scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) +
labs(x="Year", y="S1B_58 ($/Day)") +
theme_bw()
grid.arrange(p1, p2, p3, ncol=3)
# About Timecharter Rates
ggplot(timecharter.rate_total, aes(date, rate, group=type, colour=type)) +
geom_line() +
scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) +
labs(x="Year", y="Timcharter Rate ($/Day)", colour="Type") +
theme_bw()
ggcorrplot(cor(timecharter.rate[sapply(timecharter.rate, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")
# About Earning of Panamax
ggplot(earning.panamax_total, aes(date, earning, group=route, colour=route)) +
geom_line(size=2, alpha=0.5) +
scale_x_date(breaks=date_breaks("1 year"), labels=date_format("%Y")) +
labs(x="Year", y="Earning of Panamax ($/Day)", colour="Route") +
theme_bw()
ggcorrplot(cor(earning.panamax[sapply(earning.panamax, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")
# About GDP
ggplot(GDP_melted, aes(year, percent, group=country, colour=country)) +
geom_line(size=2) +
labs(x="Year", y="GDP (%)", colour="Country") +
theme_bw()
temp <- GDP_cor %>%
filter(year >= 2010) %>%
select(year, GDP_japan, GDP_korea)
ggplot(GDP_cor, aes(GDP_japan, GDP_korea)) +
geom_label(aes(label = year)) +
geom_smooth(method="lm", se=F, colour="red", size=2) +
geom_abline(slope=1, intercept=0, linetype="dashed") +
labs(y="GDP of Korea (%)", x="GDP of Japan (%)") +
theme_bw()
# About Price of Panamax
ggplot(price.panamax_melted, aes(date, price, group=type, colour=type)) +
geom_line(size=2) +
labs(x="Date", y="Price ($ millon)", colour="Type") +
scale_x_date(breaks=date_breaks("2 year"), labels=date_format("%Y")) +
theme_bw()
ggcorrplot(cor(price.panamax[sapply(price.panamax, is.numeric)], use="pairwise.complete.obs"), type="lower", lab=T, outline.col="white")
## Summary of Data
summary(rate_BHSI)
round(sd(rate_BHSI$BHSI_HS6)/mean(rate_BHSI$BHSI_HS6), 2)
summary(rate_BPI)
round(sd(rate_BPI$BPI_P3A_03)/mean(rate_BPI$BPI_P3A_03), 2)
summary(rate_BSI)
round(sd(rate_BSI$BSI_S1B_58)/mean(rate_BSI$BSI_S1B_58), 2)
## Binding Dataset
# Target Variable
rate_BPI
# Predictor
BPI <- BPI[1:2]
LIBOR
names(won.dollar)[2] <- "won.per.dollar"
GDP
# Binding Dataset
# dataset <- rate_BPI %>%
# merge(BPI, by="date", all=T) %>%
# merge(LIBOR, by="date", all=T) %>%
# merge(won.dollar, by="date", all=T) %>%
# merge(bunker, by="date", all=T) %>%
# merge(timecharter.rate, by="date", all=T) %>%
# merge(earning.panamax, by="date", all=T) %>%
# merge(price.panamax, by="date", all=T)
dataset <- rate_BPI %>%
merge(BPI, by="date", all=T) %>%
merge(LIBOR, by="date", all=T) %>%
merge(won.dollar, by="date", all=T)
# write(dataset, file="Dataset.csv")
# dataset$year <- as.integer(format(dataset$date, "%Y"))
# dataset$month <- as.integer(format(dataset$date, "%m"))
#
# min(GDP$year)
# max(GDP$year)
# 2018 - 1968
#
# for(i in 0:50) {
# dataset$GDP_japan[dataset$year==(1968+i)] <- GDP[1+i, 2]
# dataset$GDP_korea[dataset$year==(1968+i)] <- GDP[1+i, 3]
# }
# Visualization
# Correlation about Dataset
ggcorrplot(round(cor(dataset[, -c(1, 39, 40)], use="pairwise.complete.obs"), 1), type="upper", lab=T, outline.col="white")
# Checking relationship between some columns
# BPI_P3A_03 vs LIBOR
ggplot(dataset, aes(LIBOR, BPI_P3A_03)) +
geom_point() +
geom_smooth(method="lm") +
scale_x_continuous(breaks=c(1:6), limits=c(0, 6)) +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)") +
theme_bw()
min(na.omit(dataset[c("LIBOR", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("LIBOR", "BPI_P3A_03", "year")])$year)
temp <- dataset %>% filter(year >= 2002 & year <= 2018) %>% select(LIBOR, BPI_P3A_03, year, month)
ggplot(temp, aes(LIBOR, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(method="lm", aes(group=year)) +
scale_x_continuous(breaks=c(1:6), limits=c(0, 6)) +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", colour="Year") +
theme_bw()
temp <- dataset %>% filter(year >= 2013) %>% select(LIBOR, BPI_P3A_03, year, month)
ggplot(temp, aes(LIBOR, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(method="lm", aes(group=year)) +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", colour="Year") +
theme_bw()
# BPI_P3A_03 vs won.per.dollar
ggplot(dataset, aes(won.per.dollar, BPI_P3A_03)) +
geom_point() +
geom_smooth(method="lm") +
scale_x_continuous(limits=c(750, 1600)) +
labs(y="P3A_03 ($/Day)", x="Won per Dollar") +
theme_bw()
min(na.omit(dataset[c("won.per.dollar", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("won.per.dollar", "BPI_P3A_03", "year")])$year)
temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(won.per.dollar, BPI_P3A_03, year, month)
ggplot(temp, aes(won.per.dollar, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_x_continuous(limits=c(750, 1600)) +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Won per Dollar", colour="Year") +
theme_bw()
temp <- dataset %>% filter(year >= 2013) %>% select(won.per.dollar, BPI_P3A_03, year, month)
ggplot(temp, aes(won.per.dollar, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Won per Dollar", colour="Year") +
theme_bw()
# BPI_P3A_03 vs price_380cst
ggplot(dataset, aes(price_380cst, BPI_P3A_03)) +
geom_point() +
geom_smooth(method="lm") +
labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)") +
theme_bw()
min(na.omit(dataset[c("price_380cst", "BPI_P3A_03", "year")])$year)
max(na.omit(dataset[c("price_380cst", "BPI_P3A_03", "year")])$year)
temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(price_380cst, BPI_P3A_03, year, month)
ggplot(temp, aes(price_380cst, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)", colour="Year") +
theme_bw()
temp <- dataset %>% filter(year >= 2013) %>% select(price_380cst, BPI_P3A_03, year, month)
ggplot(temp, aes(price_380cst, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Price of 380cst ($/tonne)", colour="Year") +
theme_bw()
# BPI_P3A_04 vs price_MDO
ggplot(dataset, aes(price_MDO, BPI_P3A_03)) +
geom_point() +
geom_smooth(method="lm") +
labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)") +
theme_bw()
temp <- dataset %>% filter(year >= 2002 & year < 2018) %>% select(price_MDO, BPI_P3A_03, year, month)
ggplot(temp, aes(price_MDO, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)", colour="Year") +
theme_bw()
temp <- dataset %>% filter(year >= 2013) %>% select(price_MDO, BPI_P3A_03, year, month)
ggplot(temp, aes(price_MDO, BPI_P3A_03, colour=year)) +
geom_point() +
geom_smooth(aes(group=year), method="lm") +
scale_color_gradient(low="blue", high="red") +
labs(y="P3A_03 ($/Day)", x="Price of MDO ($/tonne)", colour="Year") +
theme_bw()
# BPI_P3A_04 vs GDP_japan
ggplot(dataset, aes(GDP_japan, BPI_P3A_03)) +
geom_point() +
stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) +
stat_summary(fun.y="mean", geom="point", colour="red", size=2.5) +
labs(x="GDP of Japan (%)", y="BPI_P3A ($/Day)") +
theme_bw()
# BPI_P3A_04 vs GDP_korea
ggplot(dataset, aes(GDP_korea, BPI_P3A_03)) +
geom_point() +
stat_summary(fun.y="mean", geom="point", colour="yellow", size=5) +
stat_summary(fun.y="mean", geom="point", colour="red", size=2.5) +
labs(x="GDP of Korea (%)", y="BPI_P3A ($/Day)") +
theme_bw()
< Modeling (R Code) >
# Preprocessing
library(dplyr)
# Preprocessing for time series data
library(imputeTS)
library(nlme)
library(forecast)
# Visualization
library(ggplot2)
library(gridExtra)
library(scales)
# Tree model
library(randomForest)
library(xgboost)
library(gbm)
# LSTM model
library(keras)
###################
# Loading dataset #
###################
dataset <- read.csv('C:/Users/user/Desktop/Data_for_Campus_Life/3-2/Term_Project(๋ฐ์ดํฐ๋ง์ด๋)/Data/Dataset.csv')
dataset$date <- as.Date(dataset$date)
temp <- data.frame(date=seq(as.Date('2003-01-01'), as.Date('2018-02-12'), by=1))
dataset <- merge(dataset, temp, by='date', all=T) %>% filter(format(date, '%Y') >= 2011) # Year from 2011 to 2018 is focused
rm(list='temp')
#################
# Check dataset #
#################
na <- c(sum(is.na(dataset$P3A_03)), sum(is.na(dataset$BPI)), sum(is.na(dataset$LIBOR)), sum(is.na(dataset$won.per.dollar)))
temp <- data.frame(column=names(dataset)[-1], na)
p1 <- ggplot(temp, aes(column, na)) +
geom_bar(stat='identity') +
scale_y_continuous(limits=c(0, 1000)) +
labs(x='Column', y='Count of NA') +
theme_bw()
for (i in 2:5) {
statsNA(dataset[, i])
}
summary(dataset)
dataset <- dataset[!(weekdays(dataset$date) %in% c('ํ ์์ผ', '์ผ์์ผ')), ]
summary(dataset)
for (i in 2:5) {
statsNA(dataset[, i])
}
na <- c(sum(is.na(dataset$P3A_03)), sum(is.na(dataset$BPI)), sum(is.na(dataset$LIBOR)), sum(is.na(dataset$won.per.dollar)))
temp <- data.frame(column=names(dataset)[-1], na)
p2 <- ggplot(temp, aes(column, na)) +
geom_bar(stat='identity') +
scale_y_continuous(limits=c(0, 1000)) +
labs(x='Column', y='Count of NA') +
theme_bw()
grid.arrange(p1, p2)
ggplot(dataset, aes(date, P3A_03)) +
geom_line() +
geom_smooth() +
labs(x='Date', y='Rate ($/Day)') +
theme_bw() # About 3 years cycle
rm(list=c('i', 'na', 'p1', 'p2'))
###########################
# Change time unit (Week) #
###########################
##############
# About 2012 #
##############
# head(dataset %>%
# mutate(day=weekdays(date)))
#
# temp <- NULL
#
# for(i in 1:(nrow(dataset)/5)) {
# temp <- c(temp, rep(i, 5))
# }
#
# dataset$week <- c(temp, max(temp) + 1)
#
# dataset_week <- dataset %>%
# mutate(year=format(dataset$date, format='%Y'), month=format(dataset$date, format='%m')) %>%
# group_by(week) %>%
# summarise(P3A_03=mean(P3A_03, na.rm=T))
#
# ggplot(dataset_week, aes(week, P3A_03)) +
# geom_line() +
# labs(x='Week', y='Mean of Rate ($/Day)') +
# theme_bw()
#
# rm(list='temp')
############################
# Change time unit (Month) #
############################
dataset_month <- dataset %>%
mutate(year=format(dataset$date, format='%Y'), month=format(dataset$date, format='%m')) %>%
group_by(year, month) %>%
summarise(P3A_03=mean(P3A_03, na.rm=T))
dataset_month$index <- 1:nrow(dataset_month)
ggplot(dataset_month, aes(index, P3A_03)) +
geom_line() +
labs(x='Index', y='Mean of Rate ($/Day)') +
theme_bw() # No trend
#########################
# Check Autocorrelation #
#########################
temp <- dataset %>%
filter(!is.na(P3A_03))
acf(temp$P3A_03, lag.max=50, main=NA)
pacf(temp$P3A_03, lag.max=50, main=NA)
# temp <- dataset_week %>%
# filter(!is.na(P3A_03))
#
# acf(temp$P3A_03, lag.max=50)
# pacf(temp$P3A_03, lag.max=nrow(temp))
temp <- dataset_month %>%
filter(!is.na(P3A_03))
acf(temp$P3A_03, lag.max=50, main=NA)
pacf(temp$P3A_03, lag.max=50, main=NA)
rm(list='temp', 'dataset_month')
#######################################
# Extracting train and test for ARIMA #
#######################################
train <- dataset %>%
filter(format(date, '%Y') >= 2011 & date <= as.Date('2017-06-30'))
test <- dataset %>%
filter(date > as.Date('2017-06-30'))
################################
# Modeling ARIMA (Unit: Month) #
################################
train <- train %>%
mutate(year=format(train$date, format='%Y'), month=format(train$date, format='%m')) %>%
group_by(year, month) %>%
summarise(P3A_03=mean(P3A_03, na.rm=T))
train <- as.numeric(train$P3A_03)
test <- test %>%
mutate(year=format(test$date, format='%Y'), month=format(test$date, format='%m')) %>%
group_by(year, month) %>%
summarise(P3A_03=mean(P3A_03, na.rm=T))
test <- as.numeric(test$P3A_03)
plot.ts(train)
plot.ts(test)
train_log <- log(train)
plot.ts(diff(train_log, differences=2), xlab='Index', ylab=NULL)
train_diff <- diff(train_log, differences=2)
temp <- acf(train_diff, plot=F) # Cutoff value: 3 / Model: MA(2)
plot(temp, main=NA)
temp <- pacf(train_diff, plot=F) # Cutoff value: 15 / Model: AR(14)
plot(temp, main=NA)
train_arima <- arima(train_log, order=c(14, 2, 1))
train_pred <- predict(train_arima, n.ahead = length(test))
predict(train_arima, n.ahead = 1000)
temp <- c(rep(NA, length(train)), exp(train_pred$pred))
temp <- as.data.frame(cbind(c(train, test), temp))
temp$index <- (1:nrow(temp))
names(temp) <- c('target', 'pred', 'index')
ggplot(temp, aes(index, target)) +
geom_rect(xmin=79, xmax=86, ymin=-Inf, ymax=Inf, fill="yellow", alpha=0.0025) +
geom_line(size=1.5, alpha=0.5) +
geom_point(size=2.5) +
geom_line(aes(index, pred), colour='red', size=1.5, alpha=0.5) +
geom_point(aes(index, pred), size=2.5, colour='red') +
scale_x_continuous(limit=c(60, 86)) +
labs(x='Index', y='Rate ($/Day)', title='Model: ARIMA (p=14, d=2, q=2)',
subtitle=paste('RMSE: ', round(1878.35, 1), '\nMAPE: ', round(16.37, 2))) +
theme_bw()
temp <- na.omit(temp)
round(sqrt(sum((temp$target - temp$pred)^2)/nrow(temp)), 2) # RMSE
round(sum((abs(temp$target - temp$pred)/temp$pred*100))/nrow(temp), 2) # MAPE
rm(list=c('temp', 'train_arima', 'train_pred', 'test', 'train', 'train_diff', 'train_log'))
###########################
# Change time unit (Week) #
###########################
head(dataset %>%
mutate(day=weekdays(date)))
temp <- NULL
for(i in 1:(nrow(dataset)/5)) {
temp <- c(temp, rep(i, 5))
}
dataset_week <- dataset[1:length(temp),]
dataset_week$index <- temp
dataset_week <- dataset_week %>%
mutate(year=format(dataset_week$date, format='%Y'), month=format(dataset_week$date, format='%m')) %>%
group_by(index) %>%
summarise(P3A_03=mean(P3A_03, na.rm=T), BPI=mean(BPI, na.rm=T), LIBOR=mean(LIBOR, na.rm=T), won.per.dollar=mean(won.per.dollar, na.rm=T))
statsNA(dataset_week$P3A_03)
ggplot(dataset_week, aes(index, P3A_03)) +
geom_line() +
labs(x='Index', y='Rate ($/Day)') +
theme_bw()
rm(list=c('i', 'temp'))
############################################################
# Check Autocorrelation and dataset & Imputation NA values #
############################################################
temp <- acf(dataset_week$P3A_03[!is.na(dataset_week$P3A_03)], lag=50, plot=F)
temp <- as.numeric(unlist(temp)[1:51])
temp <- data.frame(lag=0:(length(temp) - 1), acf=temp)
sum(temp$acf > 0.5)
ggplot(temp, aes(lag, acf)) +
geom_rect(xmin=-Inf, xmax=8, ymin=0.5, ymax=Inf, fill='yellow', alpha=0.0025) +
geom_bar(stat='identity') +
geom_vline(xintercept = 8, linetype='dashed', colour='red', size=1.25) +
geom_hline(yintercept = 0.5, linetype='dashed', colour='red', size=1.25) +
labs(x='Lag', y='ACF') +
scale_x_continuous(breaks=c(8, (0:5)*15)) +
theme_bw()
sum(is.na(dataset_week$P3A_03))
dataset_week_imputed <- dataset_week
dataset_week_imputed$P3A_03 <- na.kalman(dataset_week$P3A_03, model='auto.arima')
################################################################
# Extracting & Normalizing train, validation and test for LSTM #
################################################################
dataset_processed <- dataset_week_imputed %>%
select(index, P3A_03) %>%
mutate(key=as.factor(c(rep('train', 300 + 4),
rep('validation', 25 + 4),
rep('test', nrow(dataset_week) - (300 + 25 + 4 + 4)))))
names(dataset_processed)[2] <- 'value'
dataset_processed$value <- sqrt(dataset_processed$value)
mean_history <- mean(dataset_processed$value, na.rm=T)
sd_history <- sd(dataset_processed$value, na.rm=T)
dataset_processed$value <- (dataset_processed$value - mean_history)/sd_history
rm(list='temp')
#################
# Training LSTM #
#################
#########################
# Setting Hyperarameter #
#########################
lag_setting <- 4
batch_size <- 25
train_length <- 300
tsteps <- 1
epochs <- 50
#################
# Train dataset #
#################
lag_train <- dataset_processed %>%
mutate(value_lag=lag(value, n=lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == 'train') %>%
tail(train_length)
x_train_vec <- lag_train$value_lag
x_train_arr <- array(data=x_train_vec, dim=c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train$value
y_train_arr <- array(data=y_train_vec, dim = c(length(y_train_vec), 1))
######################
# Validation dataset #
######################
lag_validation <- dataset_processed %>%
mutate(value_lag=lag(value, n=lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == 'validation') %>%
tail(25)
x_validation_vec <- lag_validation$value_lag
x_validation_arr <- array(data=x_validation_vec, dim=c(length(x_validation_vec), 1, 1))
y_validation_vec <- lag_validation$value
y_validation_arr <- array(data=y_validation_vec, dim = c(length(y_validation_vec), 1))
#################
# Train dataset #
#################
lag_test <- dataset_processed %>%
mutate(value_lag=lag(value, n=lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == 'test') %>%
tail(25)
x_test_vec <- lag_test$value_lag
x_test_arr <- array(data=x_test_vec, dim=c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test$value
y_test_arr <- array(data=y_test_vec, dim = c(length(y_test_vec), 1))
##################
# Training model #
##################
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss='mae', optimizer='adam')
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
pred <- model %>%
predict(x_test_arr, batch_size=25) %>%
.[, 1]
recover.value <- function(x) {
x <- (x*sd_history + mean_history)^2
return(x)
}
pred <- recover.value(pred)
actual <- recover.value(y_test_vec)
temp <- as.data.frame(cbind(index=1:25, pred, actual))
rmse <- sqrt(sum((pred - actual)^2)/25)
mape <- sum(abs(pred - actual)/actual*100)/25
ggplot(temp, aes(index, actual)) +
geom_line(size=2, alpha=0.5) +
geom_line(aes(index, pred), colour='red', size=2, alpha=0.5) +
labs(x='Index', y='Rate ($/Day)', title='Model: LSTM',
subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) +
theme_bw()
##########################
# Tunning hyperparameter #
##########################
sse <- list()
epochs <- c(1:20*10)
iteration <- length(epochs)
for (i in 1:length(epochs)) {
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss='mae', optimizer='adam')
for (j in 1:epochs[i]) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", j)
}
pred <- model %>%
predict(x_validation_arr, batch_size=25) %>%
.[, 1]
pred <- recover.value(pred)
actual <- recover.value(y_validation_vec)
sse[[i]] <- sum((pred - actual)^2)
}
sse <- unlist(sse)
temp <- data.frame(epoch=1:20*10, se=se)
ggplot(temp, aes(epoch, se)) +
geom_bar(stat='identity') +
labs(x='Num of Epoch', y='SSE') +
theme_bw()
########################################################
# Extracting train, validation and test for Tree model #
########################################################
train <- dataset %>%
filter(format(date, '%Y') >= 2011 & format(date, '%Y') < 2017)
validation <- dataset %>%
filter(format(date, '%Y') >= 2017 & date <= as.Date('2017-06-30'))
test <- dataset %>%
filter(date > as.Date('2017-06-30'))
#########################################################
# Check train, validation and test & Imputing NA values #
#########################################################
for (i in 2:5) {
statsNA(unlist(train[, i]))
}
for(i in 2:5) {
statsNA(unlist(validation[, i]))
}
for (i in 2:5) {
statsNA(unlist(test[, i]))
}
temp <- data.frame(na=train$date[is.na(train$P3A_03)])
p1 <- ggplot(train, aes(date, P3A_03)) +
geom_line() +
geom_vline(data=temp, aes(xintercept=na), colour='yellow') +
labs(x='Year', y='Rate ($/Day)') +
theme_bw()
impute.na <- function(x) {
x[2] <- na.kalman(x[2], model='auto.arima')
x[3] <- na.kalman(x[3], model='auto.arima')
x[4] <- na.kalman(x[4], model='auto.arima')
x[5] <- na.kalman(x[5], model='auto.arima')
return(x)
}
train <- impute.na(train)
temp <-temp %>%
mutate(pred=train$P3A_03[train$date %in% temp$na])
p2 <- ggplot(train, aes(date, P3A_03)) +
geom_vline(data=temp, aes(xintercept=na), colour='yellow') +
geom_line() +
geom_point(data=temp, aes(na, pred), colour='red') +
labs(x='Year', y='Rate ($/Day)') +
theme_bw()
grid.arrange(p1, p2)
rm(list=c('p1', 'p2', 'temp', 'i'))
#######################################################
# Setting target variable & Creating derived variable #
#######################################################
create.dv <- function(x) {
x <- x %>%
mutate(mul_P3A.BPI=P3A_03*BPI,
mul_P3A.LIBOR=P3A_03*LIBOR,
div_P3A.won=P3A_03/won.per.dollar,
mul_BPI.LIBOR=BPI*LIBOR,
div_BPI.won=BPI/won.per.dollar,
div_won.LIBOR=won.per.dollar/LIBOR)
x$P3A_03 <- round(x$P3A_03, 0)
x$BPI <- round(x$BPI, 0)
x$LIBOR <- round(x$LIBOR, 4)
x$won.per.dollar <- round(x$won.per.dollar , 0)
x$mul_P3A.BPI <- x$mul_P3A.BPI
x$mul_P3A.LIBOR <- round(x$mul_P3A.LIBOR, 0)
x$div_P3A.won <- round(x$div_P3A.won, 2)
x$mul_BPI.LIBOR <- round(x$mul_BPI.LIBOR, 0)
x$div_BPI.won <- round(x$div_BPI.won, 3)
x$div_won.LIBOR <- round(x$div_won.LIBOR, 0)
create.dv_diff <- function(x, l) {
for (i in 1:l) {
x[, ncol(x) + 1] <- c(rep(NA, i), x$P3A_03[(i + 1):nrow(x)] - x$P3A_03[1:(nrow(x) - i)])
}
x <- na.omit(x)
return(x)
}
x <- create.dv_diff(x, 10)
x$target <- c(rep(NA, 5), x$P3A_03[1:(nrow(x) - 5)])
x <- na.omit(x)
return(x)
}
train <- create.dv(train)
#######################
# Training Tree model #
#######################
set.seed(777)
model_xgb <- xgboost(data = data.matrix(train[, 2:(ncol(train) - 1)]),
label = train$target,
eta = 0.15,
min_child_weight = 5,
gamma = 2.5,
subsample = 0.75,
colsample_bytree = 0.75,
max_depth = 5,
nrounds = 500,
objective = "reg:linear",
eval_metric = "rmse")
model_gbm <- gbm(train$target ~ .,
data = train[, 2:(ncol(train) - 1)],
distribution = "gaussian",
interaction.depth = 1,
shrinkage = 0.05,
bag.fraction = 0.75,
n.trees = 3000)
model_rf <- randomForest(x = train[2:(ncol(train) - 1)],
y = train$target,
mtry = 10,
ntree = 500,
sampsize = 500)
validation <- impute.na(validation)
validation <- create.dv(validation)
temp <- validation[c('date', 'target')]
temp$date <- temp$date + 7
temp$pred_rf <- predict(model_rf, as.matrix(validation[2:(ncol(validation) - 1)]))
temp$pred_xgb <- predict(model_xgb, as.matrix(validation[2:(ncol(validation) - 1)]))
temp$pred_gbm <- predict(model_gbm, validation[2:(ncol(validation) - 1)], n.trees=gbm.perf(model_gbm, method='OOB', plot.it=FALSE))
rmse <- list()
mape <- list()
for(i in 1:3) {
rmse[[i]] <- sqrt(sum((temp['target'] - temp[, i + 2])^2)/nrow(temp))
mape[[i]] <- sum(abs(temp['target'] - temp[, i + 2])/temp['target'])*100/nrow(temp)
}
p1 <- ggplot(temp, aes(date, target)) +
geom_line(size=2) +
geom_line(aes(date, pred_rf), colour='green', size=2, alpha=0.5) +
labs(x='Month', y='Rate ($/Day)', title='Model: Random Forest',
subtitle=paste('RMSE: ', round(rmse[[1]], 1), '\nMAPE: ', round(mape[[1]], 2))) +
theme_bw()
p2 <- ggplot(temp, aes(date, target)) +
geom_line(size=2) +
geom_line(aes(date, pred_xgb), colour='red', size=2, alpha=0.5) +
labs(x='Month', y='Rate ($/Day)', title='Model: XGBoost',
subtitle=paste('RMSE: ', round(rmse[[2]], 1), '\nMAPE: ', round(mape[[2]], 2))) +
theme_bw()
p3 <- ggplot(temp, aes(date, target)) +
geom_line(size=2) +
geom_line(aes(date, pred_gbm), colour='blue', size=2, alpha=0.5) +
labs(x='Month', y='Rate ($/Day)', title='Model: GBM',
subtitle=paste('RMSE: ', round(rmse[[3]], 1), '\nMAPE: ', round(mape[[3]], 2))) +
theme_bw()
grid.arrange(p1, p2, p3)
# weight <- list()
# sum_rmse <- 1/rmse[[1]] + 1/rmse[[2]] + 1/rmse[[3]]
#
# for (i in 1:3) {
# weight[[i]] <- (1/rmse[[i]])/(sum_rmse)
# }
#
# pred_ensemble <- weight[[1]]*temp$pred_rf + weight[[2]]*temp$pred_xgb + weight[[3]]*temp$pred_gbm
# rmse <- sqrt(sum((temp['target'] - pred_ensemble)^2)/nrow(temp))
# mape <- sum(abs(temp['target'] - temp[, i + 2])/temp['target'])*100/nrow(temp)
#
# ggplot(temp, aes(date, target)) +
# geom_line(size=2) +
# geom_line(aes(date, weight[[1]]*pred_rf + weight[[2]]*pred_xgb + weight[[3]]*pred_gbm), colour='grey', size=2, alpha=0.5) +
# labs(x='Month', y='Rate ($/Day)', title='Model: Ensemble',
# subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) +
# theme_bw()
rm(list=c('p1', 'p2', 'p3', 'temp', 'weight', 'i', 'mape', 'pred_ensemble', 'rmse', 'sum_rmse'))
############################################
# Tunning hyperparameter using Grid Search #
############################################
# result_grid.search <- list()
# target <- validation$target[6:nrow(validation)]
# mse_min <- numeric(3)
#################
# Random Forest #
#################
# grid.search <- expand.grid(
# mtry=c(10, 13, 15, 17, 20),
# ntree=c(500, 750, 1000, 1250, 1500),
# sampsize=c(500, 750, 1500)
# )
#
# model <- list()
# mse <- numeric(nrow(grid.search))
#
# for (i in 1:nrow(grid.search)) {
# cat('Iteration:', i, 'of', nrow(grid.search), '\n')
#
# model[[i]] <- randomForest(x = train[2:(ncol(train) - 1)],
# y = train$target,
# mtry = grid.search[i, 'mtry'],
# ntree = grid.search[i, 'ntree'],
# sampsize = grid.search[i, 'sampsize'])
#
# predict_temp <- predict(model[[i]], as.matrix(validation[2:(ncol(validation) - 1)]))
# predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
# mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
#
# grid.search$mse <- mse
# result_grid.search[[1]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[1] <- min(grid.search$mse)
###########
# XGBoost #
###########
# grid.search <- expand.grid(
# eta=c(0.05, 0.1, 0.15, 0.2, 0.25),
# min_child_weight=c(1, 3, 5, 7, 9),
# gamma=c(2.5, 5, 7.5),
# subsample=c(0.25, 0.5, 0.75),
# colsample_bytree=c(0.5, 0.75, 1),
# max_depth=c(3, 5, 7, 9),
# nrounds=c(250, 500, 750)
# )
#
# model <- list()
# mse <- numeric(nrow(grid.search))
#
# for (i in 1:nrow(grid.search)) {
# cat('Iteration:', i, 'of', nrow(grid.search), '\n')
#
# model[[i]] <- xgboost(data = data.matrix(train[, 2:(ncol(train) - 1)]),
# label = train$target,
# eta = grid.search[i, 'eta'],
# min_child_weight = grid.search[i, 'min_child_weight'],
# gamma = grid.search[i, 'gamma'],
# subsample = grid.search[i, 'subsample'],
# colsample_bytree = grid.search[i, 'colsample_bytree'],
# max_depth = grid.search[i, 'max_depth'],
# nrounds = grid.search[i, 'nrounds'],
# objective = 'reg:linear',
# eval_metric = 'rmse',
# verbose = 0)
#
# predict_temp <- predict(model[[i]], as.matrix(validation[2:(ncol(validation) - 1)]))
# predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
# mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
#
# grid.search$mse <- mse
# result_grid.search[[2]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[2] <- min(grid.search$mse)
#############
# Light GBM #
#############
# grid.search <- expand.grid(
# n.trees=c(500, 750, 1000, 1250),
# interaction.depth=c(1, 2, 3),
# shrinkage=c(0.05, 0.1, 0.15),
# bag.fraction=c(0.25, 0.5, 0.75)
# )
#
# model <- list()
# mse <- numeric(nrow(grid.search))
#
# for (i in 1:nrow(validation)) {
# cat('Iteration:', i, 'of', nrow(grid.search), '\n')
#
# model[[i]] <- gbm(target ~ .,
# data = train[, 2:ncol(train)],
# distribution = 'gaussian',
# n.trees = grid.search[i, 'n.trees'],
# interaction.depth = grid.search[i, 'interaction.depth'],
# shrinkage = grid.search[i, 'shrinkage'],
# bag.fraction = grid.search[i, 'bag.fraction'],
# cv.folds = 5)
#
# predict_temp <- predict(model[[i]], validation[2:(ncol(validation) - 1)], n.trees=gbm.perf(model[[i]], method='OOB', plot.it=FALSE))
# predict_temp <- predict_temp[1:(length(predict_temp) - 5)]
# mse[i] <- sum((predict_temp - target)^2)/nrow(validation)
# }
#
# grid.search$mse <- mse
# result_grid.search[[3]] <- grid.search[which.min(grid.search$mse), ]
# mse_min[3] <- min(grid.search$mse)
###################
# Test Tree model #
###################
temp <- test
temp$target <- c(rep(NA, 5), temp$P3A_03[1:(nrow(temp) - 5)])
temp <- na.omit(temp[c('date','P3A_03', 'target')])
temp_test <- impute.na(test)
temp_test <- create.dv(test)
temp_test$pred <- predict(model_xgb, as.matrix(temp_test[2:(ncol(temp_test) - 1)]))
temp_test <- merge(temp[c('date', 'P3A_03','target')], temp_test[c('date', 'pred')], by='date')
temp_test$date <- temp_test$date + 7
rmse <- sqrt(sum((temp_test['target'] - temp_test['pred'])^2)/nrow(temp_test))
mape <- sum(abs(temp_test['target'] - temp_test['pred'])/temp_test['target'])*100/nrow(temp_test)
ggplot(temp_test, aes(date, target)) +
geom_line(size=2) +
geom_line(aes(date, pred), colour='red', size=2, alpha=0.5) +
labs(x='Month', y='Rate ($/Day)', title='Model: XGBoost',
subtitle=paste('RMSE: ', round(rmse, 1), '\nMAPE: ', round(mape, 2))) +
scale_x_date(labels=date_format("%m"), breaks='1 month') +
theme_bw()
p1 <- ggplot(temp_test, aes(abs(P3A_03 - target))) +
geom_histogram(colour=NA, fill='red') +
scale_x_continuous(limits=c(0, 2500)) +
scale_y_continuous(limits=c(0, 30)) +
labs(x='Difference of Rate ($/Day)', y='Count') +
theme_bw()
p2 <- ggplot(temp_test, aes(abs(pred - target))) +
geom_histogram(colour=NA, fill='green') +
scale_x_continuous(limits=c(0, 2500)) +
scale_y_continuous(limits=c(0, 30)) +
labs(x='Difference of Rate ($/Day)', y='Count') +
theme_bw()
grid.arrange(p1, p2)
importance <- as.data.frame(xgb.importance(model=model_xgb))
importance$Feature <- factor(importance$Feature, levels=importance$Feature[order(importance$Gain)])
ggplot(importance, aes(Feature, Gain)) +
geom_bar(stat='identity', width=0.7) +
coord_flip() +
guides(fill=F) +
labs(y='Imformation Gain', x='Feature') +
theme_bw()
plot(1/train$won.per.dollar, train$P3A_03)
plot(train$P3A_03, train$target)