본문 바로가기

Project

2020. 09. BIG CONTEST, 혁신 아이디어 분야 대회 참가

< Presentation >

 

 

< Preprocessing & EDA (R Code) >

 

######
# SK #
######

###########
# setting #
###########

library(dplyr)
library(tidyr)
library(forcats)
library(lubridate)
library(ggplot2)
library(scales)
library(tidytext)
library(readr)
library(showtext)

# Fonts: register the Google font 'Nanum Gothic' under the family name
# 'Gothic' and switch on showtext's automatic rendering.
font_add_google(name = 'Nanum Gothic', family = 'Gothic')
showtext_auto()

# Default ggplot2 theme for the plots below: theme_minimal() plus the
# overrides listed here, grouped by plot region.
theme_set(
  theme_minimal() +
    theme(
      # titles
      plot.title = element_text(colour = 'grey10', face = 'bold'),
      plot.subtitle = element_text(colour = 'grey25'),
      # grid lines
      panel.grid.major = element_line(size = 1, colour = 'grey90'),
      panel.grid.minor = element_line(size = 0.5, colour = 'grey80', linetype = 'dashed'),
      # legend
      legend.position = 'top',
      legend.spacing.x = unit(0.125, 'cm'),
      legend.background = element_rect(linetype = 'dotted', fill = NULL),
      # facet strips
      strip.background = element_blank(),
      strip.text = element_text(size = 11.25, colour = 'grey25', face = 'bold')
    )
)

# Load the two SK flow tables, then derive the same helper columns on each:
#   STD_Y / STD_M / STD_D : characters 1-4, 5-6 and 7-8 of STD_YMD
#   STD_YMD               : rebuilt as a Date from those three parts
#   city                  : second label when HDONG_CD > 2e9, first label
#                           otherwise (a missing HDONG_CD keeps the first)
data_sk_age <- read_csv('C:/Users/user/Desktop/Big_Contest/Data/sk_flow_age.csv') %>%
  mutate(
    STD_Y = substr(STD_YMD, 1, 4),
    STD_M = substr(STD_YMD, 5, 6),
    STD_D = substr(STD_YMD, 7, 8),
    # Replaced last so the three slices above still see the raw value.
    STD_YMD = as.Date(paste0(STD_Y, '-', STD_M, '-', STD_D)),
    city = if_else(HDONG_CD > 2e9, 'λŒ€κ΅¬κ΄‘μ—­μ‹œ', 'μ„œμšΈνŠΉλ³„μ‹œ', missing = 'μ„œμšΈνŠΉλ³„μ‹œ')
  )

data_sk_time <- read_csv('C:/Users/user/Desktop/Big_Contest/Data/sk_flow_time.csv') %>%
  mutate(
    STD_Y = substr(STD_YMD, 1, 4),
    STD_M = substr(STD_YMD, 5, 6),
    STD_D = substr(STD_YMD, 7, 8),
    STD_YMD = as.Date(paste0(STD_Y, '-', STD_M, '-', STD_D)),
    city = if_else(HDONG_CD > 2e9, 'λŒ€κ΅¬κ΄‘μ—­μ‹œ', 'μ„œμšΈνŠΉλ³„μ‹œ', missing = 'μ„œμšΈνŠΉλ³„μ‹œ')
  )

# Long-format flow counts by gender and age band.
#
# The wide columns MAN_FLOW_POP_CNT_0004 ... WMAN_FLOW_POP_CNT_70U are stacked
# into key/value pairs, then the key is decoded:
#   gender : keys starting with 'WMAN_' get the second label, all others the
#            first
#   age    : the code after '<W>MAN_FLOW_POP_CNT_'; 'aabb' becomes 'aa-bb' and
#            '70U' becomes '70-100'
#   week   : lubridate week of (date - 3 days), minus 4
#
# The previous version sliced the key at fixed offsets. 'WMAN_' keys are one
# character longer than 'MAN_' keys, so the slices were misaligned and had to
# be patched with a '_7-0U' special case. Stripping the prefix gives the same
# labels without depending on key length.
df_age <- data_sk_age %>%
  select(STD_YMD, STD_Y, MAN_FLOW_POP_CNT_0004:WMAN_FLOW_POP_CNT_70U) %>%
  gather(key = key, value = value, MAN_FLOW_POP_CNT_0004:WMAN_FLOW_POP_CNT_70U) %>%
  mutate(
    gender = if_else(startsWith(key, 'WMAN_'), 'μ—¬μž', 'λ‚¨μž'),
    # Age code only, e.g. '0004' or '70U'.
    age = sub('^W?MAN_FLOW_POP_CNT_', '', key),
    age = if_else(
      age == '70U',
      '70-100',
      paste0(substr(age, 1, 2), '-', substr(age, 3, 4))
    ),
    week = week(STD_YMD - 3) - 4
  )

# Long-format flow counts by hour: the TMST_00 ... TMST_23 columns are stacked
# into key/value pairs.
#   time : characters 6-7 of the key (the two-digit hour suffix)
#   week : lubridate week of (date - 3 days), minus 4
df_time <- data_sk_time %>%
  select(city, HDONG_NM, STD_YMD, STD_Y, TMST_00:TMST_23) %>%
  gather(key = key, value = value, TMST_00:TMST_23) %>%
  mutate(
    time = substr(key, 6, 7),
    week = week(STD_YMD - 3) - 4
  )

#######
# eda #
#######

# Daily total flow (value summed per date, shown in units of 1e4), one facet
# per year with free scales.
# The colour legend is suppressed with 'none': a logical guide value is
# deprecated in current ggplot2, and `F` is an ordinary variable that can be
# reassigned.
df_time %>% 
  group_by(STD_Y, STD_YMD) %>% 
  summarise(total_value = sum(value)) %>% 
  ggplot(aes(STD_YMD, total_value/1e4, colour = as.factor(STD_Y))) + 
  geom_point(size = 1.5) + 
  geom_line(size = 1) + 
  facet_wrap(~ STD_Y, scales = 'free', ncol = 1) + 
  guides(colour = 'none') + 
  labs(x = '달', y = 'μœ λ™μΈκ΅¬ (λ‹¨μœ„: 만)')

# Weekly total flow per year (in units of 1e4), one line per year.
# Week index 18 is excluded before aggregating.
df_age %>%
  filter(week != 18) %>%
  group_by(week, STD_Y = paste0(STD_Y, 'λ…„')) %>%
  summarise(total_value = sum(value) / 1e4) %>%
  ggplot(aes(x = week, y = total_value, group = STD_Y, colour = STD_Y)) +
  geom_point(size = 2.5) +
  geom_line(size = 1) +
  scale_y_continuous(labels = comma) +
  labs(x = 'κΈ°κ°„ (λ‹¨μœ„: 7일)', y = 'μœ λ™μΈκ΅¬ (λ‹¨μœ„: 만)') +
  theme(legend.title = element_blank())

# Year-over-year decrease (%) in total flow for each age band and gender.
#
# df_age_19 / df_age_20 hold the per-group totals (in units of 1e4) for each
# year. The two years are matched on (age, gender) with a join rather than
# subtracted row by row. Positional subtraction silently misaligns (or
# recycles) the totals whenever a group is present in only one year.
df_age_19 <- df_age %>% 
  filter(STD_Y == 2019) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(age, gender, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_age_20 <- df_age %>% 
  filter(STD_Y == 2020) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(age, gender, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

# Only groups present in both years are kept.
df_age_total <- inner_join(
  select(df_age_19, -STD_Y), 
  select(df_age_20, -STD_Y), 
  by = c('age', 'gender'), 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_value = -(total_value_20 - total_value_19)/total_value_19*100) %>% 
  select(age, gender, decre_prop_value)

df_age_total %>% 
  ggplot(aes(age, decre_prop_value, fill = gender)) + 
  geom_bar(stat = 'identity', position = 'dodge', size = 1, alpha = 0.75, colour = 'black') + 
  scale_fill_manual(values = c('steelblue', 'pink')) + 
  labs(x = 'λ‚˜μ΄', y = 'μ „λ…„ λŒ€λΉ„ μœ λ™μΈκ΅¬ κ°μ†Œμœ¨ (%)') + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1))

# Weekend flag for df_age, then the year-over-year decrease (%) in total flow
# for weekdays vs. weekends.
#
# The flag is taken from the numeric day of week, not from weekdays()
# compared with Korean day names: that comparison only matches in a Korean
# locale and silently yields all-FALSE elsewhere. With week_start = 1, Monday
# is 1, so Saturday and Sunday are 6 and 7.
df_age$is_weekends <- wday(df_age$STD_YMD, week_start = 1) >= 6

df_age_19 <- df_age %>% 
  filter(STD_Y == 2019) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(is_weekends, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_age_20 <- df_age %>% 
  filter(STD_Y == 2020) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(is_weekends, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

# Match the two years on the flag itself and derive the display label from the
# flag. Row-position subtraction and a positional c(label, label) assignment
# both break when a group is missing or the row order changes.
df_age_total <- inner_join(
  select(df_age_19, -STD_Y), 
  select(df_age_20, -STD_Y), 
  by = 'is_weekends', 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_value = -(total_value_20 - total_value_19)/total_value_19*100, 
         is_weekends = if_else(is_weekends, '주말', '평일')) %>% 
  select(is_weekends, decre_prop_value)

# Legends are suppressed with 'none' (logical guide values are deprecated).
df_age_total %>%
  ggplot(aes(is_weekends, decre_prop_value, fill = is_weekends)) + 
  geom_bar(stat = 'identity', alpha = 0.5, size = 1, colour = 'black') + 
  labs(x = NULL, y = 'μ „λ…„ λŒ€λΉ„ μœ λ™μΈκ΅¬ κ°μ†Œμœ¨ (%)') + 
  scale_fill_manual(values = c('red', 'black')) + 
  coord_cartesian(ylim = c(22, 26)) + 
  guides(fill = 'none', colour = 'none')

# Year-over-year decrease (%) in total flow for each age band, split into
# weekday and weekend.
#
# The two yearly summaries are matched on (is_weekends, age) with a join
# rather than subtracted by row position, which misaligns when a group is
# missing from either year. Expects df_age$is_weekends to be logical.
df_age_19 <- df_age %>% 
  filter(STD_Y == 2019) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(is_weekends, age, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_age_20 <- df_age %>% 
  filter(STD_Y == 2020) %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(is_weekends, age, STD_Y) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_age_total <- inner_join(
  select(df_age_19, -STD_Y), 
  select(df_age_20, -STD_Y), 
  by = c('is_weekends', 'age'), 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_value = -(total_value_20 - total_value_19)/total_value_19*100, 
         # The logical flag is used directly; no comparison against `T`.
         is_weekends = if_else(is_weekends, '주말', '평일')) %>% 
  select(is_weekends, age, decre_prop_value)

df_age_total %>% 
  ggplot(aes(age, decre_prop_value, fill = is_weekends)) + 
  geom_bar(stat = 'identity', position = 'dodge', size = 1, alpha = 0.5, colour = 'black') + 
  labs(x = 'λ‚˜μ΄', y = 'μ „λ…„ λŒ€λΉ„ μœ λ™μΈκ΅¬ κ°μ†Œμœ¨ (%)') + 
  scale_fill_manual(values = c('red', 'black')) + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1))

# Weekend/weekday label for df_time, then total flow per hour (in units of
# 1e4) for each year, faceted by that label.
#
# The label comes from the numeric day of week, not from weekdays() compared
# with Korean day names, which only matches in a Korean locale. With
# week_start = 1, Monday is 1, so Saturday and Sunday are 6 and 7.
df_time$is_weekends <- if_else(
  wday(df_time$STD_YMD, week_start = 1) >= 6, 
  '주말', 
  '평일'
)

df_time %>% 
  group_by(is_weekends, STD_Y, time) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup() %>% 
  mutate(STD_Y = paste0(STD_Y, 'λ…„')) %>% 
  ggplot(aes(time, total_value, group = STD_Y, colour = STD_Y)) + 
  geom_point(size = 2.5) + 
  geom_line(size = 1) + 
  labs(x = 'μ‹œ', y = 'μœ λ™μΈκ΅¬ (λ‹¨μœ„: 만)') + 
  scale_y_continuous(labels = comma) + 
  facet_wrap(~ is_weekends, scales = 'free_y') + 
  theme(legend.title = element_blank(), 
        axis.text.x = element_text(angle = 45, hjust = 1))

# Year-over-year decrease (%) in total flow for each hour, one line per
# is_weekends value.
#
# The yearly summaries are matched on (is_weekends, time) with a join rather
# than subtracted by row position, which misaligns when an hour/label
# combination is missing from either year.
df_time_19 <- df_time %>% 
  filter(STD_Y == 2019) %>% 
  group_by(is_weekends, STD_Y, time) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_time_20 <- df_time %>% 
  filter(STD_Y == 2020) %>% 
  group_by(is_weekends, STD_Y, time) %>% 
  summarise(total_value = sum(value)/1e4) %>% 
  ungroup()

df_time_total <- inner_join(
  select(df_time_19, -STD_Y), 
  select(df_time_20, -STD_Y), 
  by = c('is_weekends', 'time'), 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_value = -(total_value_20 - total_value_19)/total_value_19*100) %>% 
  select(is_weekends, time, decre_prop_value)

df_time_total %>% 
  ggplot(aes(time, decre_prop_value, group = is_weekends, colour = is_weekends)) + 
  geom_point(size = 2.5) + 
  geom_line(size = 1) + 
  labs(x = 'μ‹œ', y = 'μ „λ…„ λŒ€λΉ„ μœ λ™μΈκ΅¬ κ°μ†Œμœ¨ (%)') + 
  scale_color_manual(values = c('red', 'black')) + 
  theme(legend.title = element_blank())
  
######
# SH #
######

###########
# setting #
###########

library(dplyr)
library(forcats)
library(ggplot2)
library(scales)
library(tidytext)
library(readr)
library(showtext)

# Register the Google font 'Nanum Gothic' as family 'Gothic' and enable
# showtext's automatic rendering.
font_add_google(name = 'Nanum Gothic', family = 'Gothic')
showtext_auto()

# Install the default ggplot2 theme for this section: theme_minimal() plus
# text, grid, legend and facet-strip overrides.
theme_set(
  theme_minimal() +
    theme(
      # text
      plot.title = element_text(colour = 'grey10', face = 'bold'),
      plot.subtitle = element_text(colour = 'grey25'),
      strip.text = element_text(size = 11.25, colour = 'grey25', face = 'bold'),
      strip.background = element_blank(),
      # grid
      panel.grid.major = element_line(size = 1, colour = 'grey90'),
      panel.grid.minor = element_line(size = 0.5, colour = 'grey80', linetype = 'dashed'),
      # legend
      legend.position = 'top',
      legend.spacing.x = unit(0.125, 'cm'),
      legend.background = element_rect(linetype = 'dotted', fill = NULL)
    )
)

# data_sh_resid = read_csv('C:/Users/user/Desktop/Big_Contest/Data/sh_card_resident.csv')
# data_sh_fore = read_csv('C:/Users/user/Desktop/Big_Contest/Data/sh_card_foreigner.csv')
# 
# data_sh_resid$STD_Y = substr(data_sh_resid$STD_DD, 1, 4)
# data_sh_resid$STD_M = substr(data_sh_resid$STD_DD, 5, 6)
# data_sh_resid$STD_D = substr(data_sh_resid$STD_DD, 7, 8)
# data_sh_resid$STD_YMD = as.Date(paste(data_sh_resid$STD_Y, '-', data_sh_resid$STD_M, '-', data_sh_resid$STD_D, sep = ''))
# 
# data_sh_fore$STD_Y = substr(data_sh_fore$STD_DD, 1, 4)
# data_sh_fore$STD_M = substr(data_sh_fore$STD_DD, 5, 6)
# data_sh_fore$STD_D = substr(data_sh_fore$STD_DD, 7, 8)
# data_sh_fore$STD_YMD = as.Date(paste(data_sh_fore$STD_Y, '-', data_sh_fore$STD_M, '-', data_sh_fore$STD_D, sep = ''))
# 
# data_sh_resid$DONG_NM = data_sh_resid %>%
#   select(GU_CD, DONG_CD) %>%
#   mutate(DONG_NM = case_when(
#     GU_CD == '140' & DONG_CD == '520' ~ 'μ†Œκ³΅λ™',
#     GU_CD == '140' & DONG_CD == '540' ~ 'νšŒν˜„λ™',
#     GU_CD == '140' & DONG_CD == '550' ~ 'λͺ…동',
#     GU_CD == '140' & DONG_CD == '570' ~ '필동',
#     GU_CD == '140' & DONG_CD == '580' ~ 'μž₯좩동',
#     GU_CD == '140' & DONG_CD == '590' ~ '광희동',
#     GU_CD == '140' & DONG_CD == '605' ~ 'μ„μ§€λ‘œλ™',
#     GU_CD == '140' & DONG_CD == '615' ~ '신당동',
#     GU_CD == '140' & DONG_CD == '625' ~ '닀산동',
#     GU_CD == '140' & DONG_CD == '635' ~ 'μ•½μˆ˜λ™',
#     GU_CD == '140' & DONG_CD == '645' ~ '청ꡬ동',
#     GU_CD == '140' & DONG_CD == '650' ~ 'μ‹ λ‹Ή5동',
#     GU_CD == '140' & DONG_CD == '665' ~ '동화동',
#     GU_CD == '140' & DONG_CD == '670' ~ '황학동',
#     GU_CD == '140' & DONG_CD == '680' ~ '쀑림동',
#     GU_CD == '350' & DONG_CD == '560' ~ '월계1동',
#     GU_CD == '350' & DONG_CD == '570' ~ '월계2동',
#     GU_CD == '350' & DONG_CD == '580' ~ '월계3동',
#     GU_CD == '350' & DONG_CD == '595' ~ '곡릉1동',
#     GU_CD == '350' & DONG_CD == '600' ~ '곡릉2동',
#     GU_CD == '350' & DONG_CD == '611' ~ 'ν•˜κ³„1동',
#     GU_CD == '350' & DONG_CD == '612' ~ 'ν•˜κ²Œ2동',
#     GU_CD == '350' & DONG_CD == '619' ~ '쀑계본동',
#     GU_CD == '350' & DONG_CD == '621' ~ '쀑계1동',
#     GU_CD == '350' & DONG_CD == '624' ~ '쀑계4동',
#     GU_CD == '350' & DONG_CD == '625' ~ '쀑계2,3동',
#     GU_CD == '350' & DONG_CD == '630' ~ '상계1동',
#     GU_CD == '350' & DONG_CD == '640' ~ '상계2동',
#     GU_CD == '350' & DONG_CD == '665' ~ '상계3,4동',
#     GU_CD == '350' & DONG_CD == '670' ~ '상계5동',
#     GU_CD == '350' & DONG_CD == '695' ~ '상계6,7동',
#     GU_CD == '350' & DONG_CD == '700' ~ '상계8동',
#     GU_CD == '350' & DONG_CD == '710' ~ '상계9동',
#     GU_CD == '350' & DONG_CD == '720' ~ '상계10동',
#     GU_CD == '260' & DONG_CD == '510' ~ 'λ²”μ–΄1동',
#     GU_CD == '260' & DONG_CD == '520' ~ 'λ²”μ–΄2동',
#     GU_CD == '260' & DONG_CD == '530' ~ 'λ²”μ–΄3동',
#     GU_CD == '260' & DONG_CD == '540' ~ 'λ²”μ–΄4동',
#     GU_CD == '260' & DONG_CD == '550' ~ '만촌1동',
#     GU_CD == '260' & DONG_CD == '560' ~ '만촌2동',
#     GU_CD == '260' & DONG_CD == '561' ~ '만촌3동',
#     GU_CD == '260' & DONG_CD == '570' ~ 'μˆ˜μ„±1가동',
#     GU_CD == '260' & DONG_CD == '580' ~ 'μˆ˜μ„±2,3가동',
#     GU_CD == '260' & DONG_CD == '590' ~ 'μˆ˜μ„±4가동',
#     GU_CD == '260' & DONG_CD == '601' ~ 'ν™©κΈˆ1동',
#     GU_CD == '260' & DONG_CD == '602' ~ 'ν™©κΈˆ2동',
#     GU_CD == '260' & DONG_CD == '610' ~ '쀑동',
#     GU_CD == '260' & DONG_CD == '620' ~ '상동',
#     GU_CD == '260' & DONG_CD == '630' ~ 'νŒŒλ™',
#     GU_CD == '260' & DONG_CD == '640' ~ '두산동',
#     GU_CD == '260' & DONG_CD == '651' ~ '지산1동',
#     GU_CD == '260' & DONG_CD == '652' ~ '지산2동',
#     GU_CD == '260' & DONG_CD == '661' ~ 'λ²”λ¬Ό1동',
#     GU_CD == '260' & DONG_CD == '662' ~ 'λ²”λ¬Ό2동',
#     GU_CD == '260' & DONG_CD == '670' ~ 'κ³ μ‚°1동',
#     GU_CD == '260' & DONG_CD == '680' ~ 'κ³ μ‚°2동',
#     GU_CD == '260' & DONG_CD == '690' ~ 'κ³ μ‚°3동',
#     GU_CD == '110' & DONG_CD == '517' ~ '동인동',
#     GU_CD == '110' & DONG_CD == '545' ~ '삼덕동',
#     GU_CD == '110' & DONG_CD == '565' ~ 'μ„±λ‚΄1동',
#     GU_CD == '110' & DONG_CD == '575' ~ 'μ„±λ‚΄2동',
#     GU_CD == '110' & DONG_CD == '585' ~ 'μ„±λ‚΄3동',
#     GU_CD == '110' & DONG_CD == '595' ~ 'λŒ€μ‹ λ™',
#     GU_CD == '110' & DONG_CD == '640' ~ '남산1동',
#     GU_CD == '110' & DONG_CD == '650' ~ '남산2동',
#     GU_CD == '110' & DONG_CD == '660' ~ '남산3동',
#     GU_CD == '110' & DONG_CD == '670' ~ '남산4동',
#     GU_CD == '110' & DONG_CD == '680' ~ 'λŒ€λ΄‰1동',
#     GU_CD == '110' & DONG_CD == '690' ~ 'λŒ€λ΄‰2동'
#   )) %>%
#   select(DONG_NM) %>%
#   unlist()
# 
# data_sh_resid$MCT_CAT_NM = data_sh_resid %>%
#   select(MCT_CAT_CD) %>%
#   mutate(MCT_CAT_NM = case_when(
#     MCT_CAT_CD == '10' ~ 'μˆ™λ°•',
#     MCT_CAT_CD == '20' ~ 'λ ˆμ €μš©ν’ˆ',
#     MCT_CAT_CD == '21' ~ 'λ ˆμ €μ—…μ†Œ',
#     MCT_CAT_CD == '22' ~ 'λ¬Έν™”μ·¨λ―Έ',
#     MCT_CAT_CD == '30' ~ '가ꡬ',
#     MCT_CAT_CD == '31' ~ 'μ „κΈ°',
#     MCT_CAT_CD == '32' ~ '주방용ꡬ',
#     MCT_CAT_CD == '33' ~ 'μ—°λ£ŒνŒλ§€',
#     MCT_CAT_CD == '34' ~ 'κ΄‘ν•™μ œν’ˆ',
#     MCT_CAT_CD == '35' ~ 'κ°€μ „',
#     MCT_CAT_CD == '40' ~ 'μœ ν†΅μ—…',
#     MCT_CAT_CD == '42' ~ '의볡',
#     MCT_CAT_CD == '43' ~ '직물',
#     MCT_CAT_CD == '44' ~ 'μ‹ λ³€μž‘ν™”',
#     MCT_CAT_CD == '50' ~ 'μ„œμ λ¬Έκ΅¬',
#     MCT_CAT_CD == '52' ~ '사무톡신',
#     MCT_CAT_CD == '60' ~ 'μžλ™μ°¨νŒλ§€',
#     MCT_CAT_CD == '62' ~ 'μžλ™μ°¨μ •λΉ„',
#     MCT_CAT_CD == '70' ~ 'μ˜λ£ŒκΈ°κ΄€',
#     MCT_CAT_CD == '71' ~ 'λ³΄κ±΄μœ„μƒ',
#     MCT_CAT_CD == '80' ~ 'μš”μ‹μ—…μ†Œ',
#     MCT_CAT_CD == '81' ~ 'μŒλ£Œμ‹ν’ˆ',
#     MCT_CAT_CD == '92' ~ 'μˆ˜λ¦¬μ„œλΉ„μŠ€'
#   )) %>%
#   select(MCT_CAT_NM) %>%
#   unlist()

# Load the resident card-spend table (EUC-KR encoded) and attach display
# labels.
data_sh_resid <- read_csv('C:/Users/user/Desktop/Big_Contest/Data/sh_card_resident_new.csv', locale = locale('ko', encoding = 'euc-kr'))

# Province label: the first string by default, the second for GU_CD 260/110.
data_sh_resid$PV_NM <- 'μ„œμšΈνŠΉλ³„μ‹œ'
data_sh_resid$PV_NM[data_sh_resid$GU_CD %in% c(260, 110)] <- 'λŒ€κ΅¬κ΄‘μ—­μ‹œ'

# District label: default string, overridden for GU_CD 350 and 260.
data_sh_resid$GU_NM <- '쀑ꡬ'
data_sh_resid$GU_NM[data_sh_resid$GU_CD == '350'] <- '노원ꡬ'
data_sh_resid$GU_NM[data_sh_resid$GU_CD == '260'] <- 'μˆ˜μ„±κ΅¬'

# Age code -> age-band label (code 20 maps to '0-25', code 65 to '65-100').
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '20'] <- '0-25'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '25'] <- '25-30'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '30'] <- '30-35'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '35'] <- '35-40'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '40'] <- '40-45'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '45'] <- '45-50'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '50'] <- '50-55'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '55'] <- '55-60'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '60'] <- '60-65'
data_sh_resid$AGE_CD[data_sh_resid$AGE_CD == '65'] <- '65-100'

# Sex code -> label.
data_sh_resid$SEX_CD[data_sh_resid$SEX_CD == 'M'] <- 'λ‚¨μž'
data_sh_resid$SEX_CD[data_sh_resid$SEX_CD == 'F'] <- 'μ—¬μž'

# Week index: lubridate week of (date - 3 days), minus 4. The call is
# namespace-qualified because this section's library() list does not attach
# lubridate.
data_sh_resid$week <- lubridate::week(data_sh_resid$STD_YMD - 3) - 4

# Tidy export frame with readable column names. Selecting the wanted columns
# already discards any auto-named index columns, so the earlier
# select(-X1, -X1_2, -X1_1) is dropped: it added nothing and raised an error
# whenever those columns were absent.
df_temp <- data_sh_resid %>%
  select(DATE = STD_YMD, 
         PERIOD = week, 
         PROVINCE = PV_NM, 
         DONG = DONG_NM, 
         GU = GU_NM, 
         SEX = SEX_CD, 
         AGE = AGE_CD, 
         CATEGORY = MCT_CAT_NM, 
         USE_AMOUNT = USE_AMT, 
         USE_COUNT = USE_CNT)

#######
# eda #
#######

# Weekly total USE_AMT per year (plotted divided by 1e5), one line per year.
# Week index 18 is excluded before aggregating.
data_sh_resid %>%
  filter(week != 18) %>%
  group_by(week, STD_Y = paste0(STD_Y, 'λ…„')) %>%
  summarise(total_USE_AMT = sum(USE_AMT)) %>%
  ggplot(aes(x = week, y = total_USE_AMT / 1e5, group = STD_Y, colour = STD_Y)) +
  geom_point(size = 2.5) +
  geom_line(size = 1) +
  scale_y_continuous(labels = comma) +
  labs(x = 'κΈ°κ°„ (λ‹¨μœ„: 7일)', y = 'μ΄μš©κΈˆμ•‘ (λ‹¨μœ„: μ–΅)') +
  theme(legend.title = element_blank())

# Daily total USE_AMT (plotted divided by 1e5), one facet per year with free
# scales.
# The colour legend is suppressed with 'none': a logical guide value is
# deprecated in current ggplot2, and `F` can be reassigned.
data_sh_resid %>% 
  group_by(STD_Y, STD_YMD) %>% 
  summarise(total_USE_AMT = sum(USE_AMT)) %>% 
  ggplot(aes(STD_YMD, total_USE_AMT/1e5, colour = as.factor(STD_Y))) + 
  geom_point(size = 1.5) + 
  geom_line(size = 1) + 
  scale_y_continuous(labels = comma) + 
  guides(colour = 'none') + 
  labs(x = '달', y = 'μ΄μš©κΈˆμ•‘ (λ‹¨μœ„: μ–΅)') + 
  facet_wrap(~ STD_Y, scales = 'free', ncol = 1)

# Yearly total USE_AMT (each amount divided by 1e8 before summing): a bar
# chart, followed by the printed decrease (%) from row 1 to row 2 of the
# summary.
#
# The summary is computed once and reused for both the plot and the printed
# figure; previously the same aggregation was run twice.
df_temp <- data_sh_resid %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(STD_Y) %>% 
  summarise(total_USE_AMT = sum(USE_AMT/1e8))

# Legends are suppressed with 'none' (logical guide values are deprecated).
df_temp %>% 
  ggplot(aes(as.factor(STD_Y), total_USE_AMT, fill = as.factor(STD_Y))) + 
  geom_bar(stat = 'identity', alpha = 0.75, size = 1, colour = 'black') + 
  labs(x = '', y = 'μ΄μš©κΈˆμ•‘ (λ‹¨μœ„: 천 μ–΅)') + 
  coord_cartesian(ylim = c(25, 37.5)) + 
  guides(fill = 'none', colour = 'none')

# Rows are ordered by STD_Y, so [1] is the earlier year and [2] the later one.
-(df_temp$total_USE_AMT[2] - df_temp$total_USE_AMT[1])/df_temp$total_USE_AMT[1]*100

# Year-over-year decrease (%) in total USE_AMT for each age band.
#
# The two yearly totals are matched on AGE_CD with a join rather than
# subtracted by row position, which misaligns when an age band is missing
# from either year.
df_temp_19 <- data_sh_resid %>% 
  filter(STD_Y == 2019) %>% 
  group_by(AGE_CD) %>% 
  summarise(total_USE_AMT = sum(USE_AMT))

df_temp_20 <- data_sh_resid %>% 
  filter(STD_Y == 2020) %>% 
  group_by(AGE_CD) %>% 
  summarise(total_USE_AMT = sum(USE_AMT))

df_temp_total <- inner_join(df_temp_19, df_temp_20, by = 'AGE_CD', suffix = c('_19', '_20')) %>% 
  mutate(decre_prop_USE_AMT = -(total_USE_AMT_20 - total_USE_AMT_19)/total_USE_AMT_19*100) %>% 
  select(AGE_CD, decre_prop_USE_AMT)

df_temp_total %>% 
  ggplot(aes(AGE_CD, decre_prop_USE_AMT)) + 
  geom_bar(stat = 'identity', size = 1, alpha = 0.75, position = 'dodge', colour = 'black') + 
  labs(x = 'λ‚˜μ΄', y = 'μ „λ…„ λŒ€λΉ„ μ΄μš©κΈˆμ•‘ κ°μ†Œμœ¨ (%)') + 
  theme(legend.title = element_blank())

# Year-over-year decrease in total USE_AMT for each merchant category
# (MCT_CAT_NM), stored as a fraction and plotted as a percentage.
#
# Fixes:
#  * The category column was built as c(2019 names, 2020 names), twice as
#    long as the rate vector. data.frame() recycled the rates, so every
#    category appeared twice and two bars were drawn on top of each other.
#    There is now exactly one row per category.
#  * The two years are matched on MCT_CAT_NM with a join rather than by row
#    position.
#  * The rate column is named decre_prop_amt: it holds a decrease, and the
#    name now matches the by-sex table.
df_19 <- data_sh_resid %>% 
  filter(STD_Y == '2019') %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(STD_Y, MCT_CAT_NM) %>% 
  summarise(total_USE_AMT = sum(USE_AMT)) %>% 
  ungroup()

df_20 <- data_sh_resid %>% 
  filter(STD_Y == '2020') %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(STD_Y, MCT_CAT_NM) %>% 
  summarise(total_USE_AMT = sum(USE_AMT)) %>% 
  ungroup()

df_decre_prop_amt <- inner_join(
  select(df_19, -STD_Y), 
  select(df_20, -STD_Y), 
  by = 'MCT_CAT_NM', 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_amt = -(total_USE_AMT_20 - total_USE_AMT_19)/total_USE_AMT_19) %>% 
  select(MCT_CAT_NM, decre_prop_amt)

df_decre_prop_amt %>% 
  ggplot(aes(MCT_CAT_NM, decre_prop_amt*100)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 0.5, colour = 'black', fill = 'steelblue', size = 1) + 
  coord_flip() + 
  labs(x = NULL, y = 'μ „λ…„ λŒ€λΉ„ μ΄μš©κΈˆμ•‘ κ°μ†Œμœ¨ (%)') + 
  theme(legend.title = element_blank())

# Year-over-year decrease in total USE_AMT for each merchant category, split
# by SEX_CD (stored as a fraction, plotted as a percentage).
#
# Fixes:
#  * The key columns were built as c(2019 values, 2020 values), twice as long
#    as the rate vector. data.frame() recycled the rates, so every
#    (sex, category) row was duplicated and its bars over-plotted. There is
#    now one row per pair.
#  * The two years are matched on (SEX_CD, MCT_CAT_NM) with a join rather
#    than by row position.
df_19 <- data_sh_resid %>% 
  filter(STD_Y == '2019') %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(SEX_CD, STD_Y, MCT_CAT_NM) %>% 
  summarise(total_USE_AMT = sum(USE_AMT)) %>% 
  ungroup()

df_20 <- data_sh_resid %>% 
  filter(STD_Y == '2020') %>% 
  mutate(STD_Y = paste(STD_Y, 'λ…„', sep = '')) %>% 
  group_by(SEX_CD, STD_Y, MCT_CAT_NM) %>% 
  summarise(total_USE_AMT = sum(USE_AMT)) %>% 
  ungroup()

df_decre_prop_amt <- inner_join(
  select(df_19, -STD_Y), 
  select(df_20, -STD_Y), 
  by = c('SEX_CD', 'MCT_CAT_NM'), 
  suffix = c('_19', '_20')
) %>% 
  mutate(decre_prop_amt = -(total_USE_AMT_20 - total_USE_AMT_19)/total_USE_AMT_19) %>% 
  select(SEX_CD, MCT_CAT_NM, decre_prop_amt)

df_decre_prop_amt %>% 
  ggplot(aes(MCT_CAT_NM, decre_prop_amt*100, fill = SEX_CD)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 0.5, colour = 'black') + 
  coord_flip() + 
  labs(x = NULL, y = 'μ „λ…„ λŒ€λΉ„ μ΄μš©κΈˆμ•‘ κ°μ†Œμœ¨ (%)') + 
  scale_fill_manual(values = c('steelblue', 'pink')) + 
  theme(legend.title = element_blank())
  
######
# CJ #
######

###########
# setting #
###########

library(dplyr)
library(forcats)
library(ggplot2)
library(scales)
library(tidytext)
library(readr)
library(showtext)
library(lubridate)

# Font setup: Google font 'Nanum Gothic' registered as family 'Gothic', with
# showtext's automatic rendering enabled.
font_add_google(name = 'Nanum Gothic', family = 'Gothic')
showtext_auto()

# Default ggplot2 theme for this section: theme_minimal() with the legend on
# top, custom grid lines, and bold titles and facet labels.
theme_set(
  theme_minimal() +
    theme(
      legend.position = 'top',
      legend.spacing.x = unit(0.125, 'cm'),
      legend.background = element_rect(linetype = 'dotted', fill = NULL),
      panel.grid.major = element_line(size = 1, colour = 'grey90'),
      panel.grid.minor = element_line(size = 0.5, colour = 'grey80', linetype = 'dashed'),
      plot.title = element_text(colour = 'grey10', face = 'bold'),
      plot.subtitle = element_text(colour = 'grey25'),
      strip.background = element_blank(),
      strip.text = element_text(size = 11.25, colour = 'grey25', face = 'bold')
    )
)

# Load the CJ logistics table and derive date columns from DL_YMD, which is
# sliced as yymmdd:
#   DL_Y / DL_M / DL_D : '20' + chars 1-2, chars 3-4, chars 5-6
#   DL_YMD             : rebuilt as a Date from those parts
#   week               : lubridate week of (date - 3 days), minus 4
data_cj <- read_csv('C:/Users/user/Desktop/Big_Contest/Data/cj_logistics.csv') %>%
  mutate(
    DL_Y = paste0('20', substr(DL_YMD, 1, 2)),
    DL_M = substr(DL_YMD, 3, 4),
    DL_D = substr(DL_YMD, 5, 6),
    # Replaced after the slices above so they still see the raw value.
    DL_YMD = as.Date(paste0(DL_Y, '-', DL_M, '-', DL_D)),
    week = week(DL_YMD - 3) - 4
  )

#######
# eda #
#######

# Print every distinct (week, DL_YMD) pair as a plain data frame, to inspect
# which dates fall into which week index.
as.data.frame(distinct(data_cj, week, DL_YMD))

# Weekly total INVC_CONT per year (in units of 1e4), one line per year.
# Week index 18 is excluded before aggregating.
data_cj %>%
  filter(week != 18) %>%
  group_by(week, DL_Y = paste0(DL_Y, 'λ…„')) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>%
  ggplot(aes(x = week, y = total_INVC_CONT / 1e4, group = DL_Y, colour = DL_Y)) +
  geom_point(size = 2.5) +
  geom_line(size = 1) +
  scale_y_continuous(labels = comma) +
  labs(x = 'κΈ°κ°„ (λ‹¨μœ„: 7일)', y = '솑μž₯건수 (λ‹¨μœ„: 만)') +
  theme(legend.title = element_blank())

# Daily total INVC_CONT (in units of 1e4), one facet per year with free
# scales.
# The colour legend is suppressed with 'none': a logical guide value is
# deprecated in current ggplot2, and `F` can be reassigned.
data_cj %>% 
  group_by(DL_Y, DL_YMD) %>% 
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>% 
  ggplot(aes(DL_YMD, total_INVC_CONT/1e4, colour = as.factor(DL_Y))) + 
  geom_point(size = 1.5) + 
  geom_line(size = 1) + 
  labs(x = '달', y = '솑μž₯건수 (λ‹¨μœ„: 만)') + 
  guides(colour = 'none') + 
  facet_wrap(~ DL_Y, scales = 'free', ncol = 1)

# Weekly total INVC_CONT per year (in units of 1e4), one line per year.
# The largest week index in the data is dropped before aggregating.
data_cj %>%
  filter(week != max(week)) %>%
  group_by(DL_Y = paste0(DL_Y, 'λ…„'), week) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>%
  ggplot(aes(x = week, y = total_INVC_CONT / 1e4, group = DL_Y, colour = DL_Y)) +
  geom_point(size = 2.5) +
  geom_line(size = 1) +
  labs(x = 'κΈ°κ°„ (λ‹¨μœ„: 7일)', y = '솑μž₯건수 (λ‹¨μœ„: 만)') +
  theme(legend.title = element_blank())

# Yearly total INVC_CONT (in units of 1e4) as a bar chart, with the y axis
# zoomed to 500-1000.
df_temp <- data_cj %>% 
  mutate(DL_Y = paste(DL_Y, 'λ…„', sep = '')) %>% 
  group_by(DL_Y) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>% 
  ungroup()

# The fill legend is suppressed with 'none': a logical guide value is
# deprecated in current ggplot2, and `F` can be reassigned.
df_temp %>% 
  ggplot(aes(DL_Y, total_INVC_CONT/1e4, fill = as.factor(DL_Y))) + 
  geom_bar(stat = 'identity', colour = 'black', size = 1, alpha = 0.75) + 
  scale_y_continuous(labels = comma) + 
  labs(x = NULL, y = '솑μž₯건수 (λ‹¨μœ„: 만)') + 
  coord_cartesian(ylim = c(500, 1000)) + 
  guides(fill = 'none')

# Year-over-year increase (%) in INVC_CONT for each goods category
# (DL_GD_LCLS_NM), sorted from largest to smallest.
#
# The two yearly totals are matched on DL_GD_LCLS_NM with a join rather than
# subtracted by row position, which misaligns when a category is missing from
# either year.
df_19 <- data_cj %>%
  filter(DL_Y == '2019') %>% 
  group_by(DL_Y, DL_GD_LCLS_NM) %>% 
  summarise(sum_INVC_CONT = sum(INVC_CONT)) %>% 
  ungroup()

df_20 <- data_cj %>% 
  filter(DL_Y == '2020') %>% 
  group_by(DL_Y, DL_GD_LCLS_NM) %>% 
  summarise(sum_INVC_CONT = sum(INVC_CONT)) %>% 
  ungroup()

df_incre_prop <- inner_join(
  select(df_19, -DL_Y), 
  select(df_20, -DL_Y), 
  by = 'DL_GD_LCLS_NM', 
  suffix = c('_19', '_20')
) %>% 
  mutate(incre_prop = (sum_INVC_CONT_20 - sum_INVC_CONT_19)/sum_INVC_CONT_19*100) %>% 
  select(DL_GD_LCLS_NM, incre_prop) %>% 
  arrange(desc(incre_prop))

# Legends are suppressed with 'none' (logical guide values are deprecated).
df_incre_prop %>% 
  ggplot(aes(reorder(DL_GD_LCLS_NM, incre_prop), incre_prop)) + 
  geom_bar(stat = 'identity', alpha = 0.75, size = 1, colour = 'black', fill = 'steelblue') + 
  labs(x = NULL, y = 'μ „λ…„ λŒ€λΉ„ 솑μž₯건수 μ¦κ°€μœ¨ (%)') + 
  guides(fill = 'none', colour = 'none') + 
  coord_flip()

# Share (%) of each goods category in that year's total INVC_CONT, for 2019
# and 2020 side by side.
# Each yearly table stays grouped by DL_Y (a single group after the filter),
# so the sum() inside mutate() is that year's total.
df_temp_19 <- data_cj %>%
  filter(DL_Y == '2019') %>%
  group_by(DL_Y = paste0(DL_Y, 'λ…„'), DL_GD_LCLS_NM) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>%
  mutate(total_INVC_CONT = total_INVC_CONT / sum(total_INVC_CONT) * 100)

df_temp_20 <- data_cj %>%
  filter(DL_Y == '2020') %>%
  group_by(DL_Y = paste0(DL_Y, 'λ…„'), DL_GD_LCLS_NM) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>%
  mutate(total_INVC_CONT = total_INVC_CONT / sum(total_INVC_CONT) * 100)

# Stack both years, then plot the shares as dodged bars.
df_temp_total <- rbind(df_temp_19, df_temp_20)

ggplot(df_temp_total, aes(x = DL_GD_LCLS_NM, y = total_INVC_CONT, fill = DL_Y)) +
  geom_col(position = 'dodge', alpha = 0.75, size = 1, colour = 'black') +
  labs(x = NULL, y = '전체 솑μž₯건수 쀑 차지 λΉ„μœ¨ (%)') +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Total INVC_CONT per year and province (CTPV_NM): first as dodged bars (in
# units of 1e4), then the same summary printed as a table.
data_cj %>%
  group_by(DL_Y = paste0(DL_Y, 'λ…„'), CTPV_NM) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT)) %>%
  ggplot(aes(x = CTPV_NM, y = total_INVC_CONT / 1e4, colour = DL_Y, fill = DL_Y)) +
  geom_col(position = 'dodge', alpha = 0.75, size = 1, colour = 'black') +
  theme(legend.title = element_blank()) +
  labs(x = NULL, y = '솑μž₯건수 (λ‹¨μœ„: 만)')

data_cj %>%
  group_by(DL_Y = paste0(DL_Y, 'λ…„'), CTPV_NM) %>%
  summarise(total_INVC_CONT = sum(INVC_CONT))

# Year-over-year increase (%) in INVC_CONT for each goods category within
# each province: one point per province, joined by a line per category.
#
# The two yearly totals are matched on (CTPV_NM, DL_GD_LCLS_NM) with a join
# rather than subtracted by row position, which misaligns when a combination
# is missing from either year. Filtering by year before aggregating gives the
# same totals as aggregating first.
df_19 <- data_cj %>% 
  filter(DL_Y == '2019') %>% 
  group_by(CTPV_NM, DL_Y, DL_GD_LCLS_NM) %>% 
  summarise(sum_INVC_CONT = sum(INVC_CONT)) %>% 
  ungroup()

df_20 <- data_cj %>% 
  filter(DL_Y == '2020') %>% 
  group_by(CTPV_NM, DL_Y, DL_GD_LCLS_NM) %>% 
  summarise(sum_INVC_CONT = sum(INVC_CONT)) %>% 
  ungroup()

df_incre_prop <- inner_join(
  select(df_19, -DL_Y), 
  select(df_20, -DL_Y), 
  by = c('CTPV_NM', 'DL_GD_LCLS_NM'), 
  suffix = c('_19', '_20')
) %>% 
  mutate(incre_prop = (sum_INVC_CONT_20 - sum_INVC_CONT_19)/sum_INVC_CONT_19*100) %>% 
  select(CTPV_NM, DL_GD_LCLS_NM, incre_prop)

df_incre_prop %>% 
  ggplot(aes(reorder(DL_GD_LCLS_NM, incre_prop), incre_prop)) + 
  geom_line(aes(group = DL_GD_LCLS_NM), size = 2.5, alpha = 0.25, colour = 'black') + 
  geom_point(aes(colour = CTPV_NM), size = 5) + 
  scale_colour_manual(values = c('orange', 'brown')) + 
  labs(x = NULL, y = 'μ „λ…„ λŒ€λΉ„ 솑μž₯건수 μ¦κ°€μœ¨ (%)') + 
  coord_flip() + 
  theme(legend.title = element_blank())