
kaggle - Rossmann Store sales Prediction (3)

김현우 2019. 9. 10. 15:31

NOTE: This post is based on SRK's write-up; we bring in the external data described there and then write the modeling code ourselves.

 

External data

  • Weather data for each German state is available in this forum post (santiagomota's csv files).

  • This post has data mapping each store to its state. [code]

  • In the middle of the series, 180 stores are missing about 184 days of data, between July 1, 2014 and December 31, 2014, as can be seen here.

  • The missing values in the "Open" column for store 622 in the test set can be replaced with 0, as discussed in this post (see the sketch just after this list).

  • External data on the World Cup dates

  • Macroeconomic indicator data from a forum post

  • Google Trends data
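
As a minimal sketch of that store 622 fix, assuming the raw test.csv from the competition (the preprocessing code below does the same thing in one stroke with test[is.na(test)] <- 0):

library(data.table)

test <- fread("test.csv")
# the only NAs in test.csv are the Open values for store 622; fill them with 0
# as suggested in the post above
test[Store == 622 & is.na(Open), Open := 0L]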


Other attempts worth referencing.

Preprocessing code

title: "kaggle_rossmanneda"  
author: "김현우,박주연,이주영,이지예,주진영,홍정아"  
date: "2018년 6월 8일"  
output: html_document  
library(data.table)
library(zoo)
library(dplyr)
library(ggplot2)
library(forecast)
library(ggrepel)
test <- fread("test.csv")
train <- fread("train.csv")
store <- fread("store.csv")
state <- fread("store_states.csv")
# the only NAs in test.csv are store 622's missing Open values; set them to 0 (see the external-data notes above)
test[is.na(test)] <- 0
# per-state weather files: keep only the Date and Events columns
BE <- fread("BE.csv",sep=";") %>% select(c("Date","Events"))
BE <- data.table(BE)
head(BE$Date);tail(BE$Date) #2013-01-01 ~ 2015-09-17
BW <- fread("BW.csv") %>% select(c("Date","Events"))
BW <- data.table(BW)
BY <- fread("BY.csv",sep=";") %>% select(c("Date","Events"))
BY <- data.table(BY)
HB <- fread("HB.csv",sep=";") %>% select(c("Date","Events"))
HB <- data.table(HB)
HE <- fread("HE.csv",sep=";") %>% select(c("Date","Events"))
HE <- data.table(HE)
HH <- fread("HH.csv",sep=";") %>% select(c("Date","Events"))
HH <- data.table(HH)
NI <- fread("NI.csv",sep=";") %>% select(c("Date","Events"))
NI <- data.table(NI)
NW <- fread("NW.csv",sep=";") %>% select(c("Date","Events"))
NW <- data.table(NW)
RP <- fread("RP.csv",sep=";") %>% select(c("Date","Events"))
RP <- data.table(RP)
SH <- fread("SH.csv",sep=";") %>% select(c("Date","Events"))
SH <- data.table(SH)
SN <- fread("SN.csv",sep=";") %>% select(c("Date","Events"))
SN <- data.table(SN)
ST <- fread("ST.csv",sep=";") %>% select(c("Date","Events"))
ST <- data.table(ST)
TH <- fread("TH.csv") %>% select(c("Date","Events"))
TH <- data.table(TH)
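
The thirteen blocks above repeat the same read-and-select pattern. As a minimal sketch (assuming every state file really uses ";" as the separator, which the BW.csv and TH.csv reads above suggest may not hold), the loading could be collapsed into a loop:

states <- c("BE","BW","BY","HB","HE","HH","NI","NW","RP","SH","SN","ST","TH")
weather <- lapply(states, function(s) {
  # read each state's weather export and keep only Date and Events
  fread(paste0(s, ".csv"), sep = ";") %>% select(c("Date","Events")) %>% as.data.table()
})
names(weather) <- states
# weather$BE is then equivalent to the BE table built above
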
# stack train and test, then attach each store's state code
full <- bind_rows(train,test)
full <- data.table(full)
state <- data.table(state)
setkey(full, Store)
full <- full[state,]
# the Date column in the weather tables below is stored as character, so for join speed full's Date is kept as character as well
# for each state, join that state's weather Events onto its store rows by Date
full_BE <- full %>% filter(State=="BE") %>% as.data.table()
setkey(full_BE,Date)
full_BE <- full_BE[BE,]
full_BW <- full %>% filter(State=="BW") %>% as.data.table()
setkey(full_BW,Date)
full_BW <-full_BW[BW,]
full_BY <- full %>% filter(State=="BY") %>% as.data.table()
setkey(full_BY,Date)
full_BY <-full_BY[BY,]
full_HB <- full %>% filter(State=="HB,NI") %>% as.data.table()
setkey(full_HB,Date)
full_HB <-full_HB[HB,]
full_HE <- full %>% filter(State=="HE") %>% as.data.table()
setkey(full_HE,Date)
full_HE <-full_HE[HE,]
full_HH <- full %>% filter(State=="HH") %>% as.data.table()
setkey(full_HH,Date)
full_HH <-full_HH[HH,]
full_NI <- full %>% filter(State=="HB,NI") %>% as.data.table()
setkey(full_NI,Date)
full_NI <- full_NI[NI,]
full_NW <- full %>% filter(State=="NW") %>% as.data.table()
setkey(full_NW,Date)
full_NW <-full_NW[NW,]
full_RP <- full %>% filter(State=="RP") %>% as.data.table()
setkey(full_RP,Date)
full_RP <-full_RP[RP,]
full_SH <- full %>% filter(State=="SH") %>% as.data.table()
setkey(full_SH,Date)
full_SH <-full_SH[SH,]
full_SN <- full %>% filter(State=="SN") %>% as.data.table()
setkey(full_SN,Date)
full_SN <-full_SN[SN,]
full_ST <- full %>% filter(State=="ST") %>% as.data.table()
setkey(full_ST,Date)
full_ST <-full_ST[ST,]
full_TH <- full %>% filter(State=="TH") %>% as.data.table()
setkey(full_TH,Date)
full_TH <-full_TH[TH,]
# HB and NI stores share the state code "HB,NI"; where the HB weather has no Event recorded, fall back to NI's
full_HB_NI <- full_HB
full_HB_NI$Events <- ifelse(full_HB$Events == "", full_NI$Events, full_HB$Events)
full_weather <- bind_rows(full_HE,full_SN,full_ST,full_BE,full_BW,full_BY,full_HH,full_NW,full_RP,full_SH,full_TH,full_HB_NI)

Doing it this way, however, full has 1,058,297 observations while full_weather has 1,058,673, an odd increase of 376 rows. The cause is states that appear in train but not in test: for those states some weather dates have no store rows to match against, so the join produces unmatched rows.

ggplot(data = full_weather) + geom_bar(aes(x=State,fill=State)) 
# the rows with NA State are the unmatched ones; removing them fixes the count
full_weather1 <- full_weather %>% filter(is.na(State)==FALSE)
nrow(full)
nrow(full_weather1)

Adding the Google Trends data.

TREND_BE <- fread("Rossmann_DE_BE.csv",sep=",", skip = 5)
TREND_BE <- data.table(TREND_BE)

The trend values are recorded on a weekly cycle running from Sunday through Saturday (e.g. 2013-01-06, a Sunday, through 2013-01-12, a Saturday). To analyze them we need a function that, for each week, records the trend value together with the sum of Sales as of that Saturday so the two series can be compared; a rough sketch of such a function follows.
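
Below is a minimal sketch of that aggregation (the helper name weekly_sales() is ours, not part of the team's code), using the dplyr already loaded above: it sums Sales over each Sunday-to-Saturday window and labels the result with that Saturday, so it can be joined against the weekly Trends value by date.

weekly_sales <- function(dt) {
  dt %>%
    mutate(Date = as.Date(Date),
           # %w is the weekday as 0 (Sunday) .. 6 (Saturday); shift every date
           # forward to the Saturday that closes its week
           WeekEnd = as.character(Date + (6 - as.integer(format(Date, "%w"))))) %>%
    group_by(WeekEnd) %>%
    summarise(WeeklySales = sum(Sales, na.rm = TRUE))
}
# e.g. left_join(weekly_sales(full_BE), TREND_BE, by = c("WeekEnd" = "Date"))
# once TREND_BE below has been read and its Date trimmed to the week-ending day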

TREND_BE <- fread("Rossmann_DE_BE.csv",sep=",", skip = 9, nrows= 142)
TREND_BE <- data.table(TREND_BE)
colnames(TREND_BE) <- c("Date","Trends")
TREND_BE$Date <- substr(TREND_BE$Date,14,23)   # keep only the week-ending (Saturday) date from the "start - end" range
final_BE <- left_join(full_BE,TREND_BE)
TREND_BW <- fread("Rossmann_DE_BW.csv",sep=",", skip = 9, nrows= 142)
TREND_BW <- data.table(TREND_BW)
colnames(TREND_BW) <- c("Date","Trends")
TREND_BW$Date <- substr(TREND_BW$Date,14,23)
final_BW <- left_join(full_BW,TREND_BW)
TREND_BY <- fread("Rossmann_DE_BY.csv",sep=",", skip = 470, nrows= 142)
TREND_BY <- data.table(TREND_BY)
colnames(TREND_BY) <- c("Date","Trends")
TREND_BY$Date <- substr(TREND_BY$Date,14,23)
final_BY <- left_join(full_BY,TREND_BY)
TREND_HE <- fread("Rossmann_DE_HE.csv",sep=",", skip = 9, nrows= 142)
TREND_HE <- data.table(TREND_HE)
colnames(TREND_HE) <- c("Date","Trends")
TREND_HE$Date <- substr(TREND_HE$Date,14,23)
final_HE <- left_join(full_HE,TREND_HE)
TREND_HH <- fread("Rossmann_DE_HH.csv",sep=",", skip = 9, nrows= 142)
TREND_HH <- data.table(TREND_HH)
colnames(TREND_HH) <- c("Date","Trends")
TREND_HH$Date <- substr(TREND_HH$Date,14,23)
final_HH <- left_join(full_HH,TREND_HH)
# HB and NI are grouped together as "HB,NI", so the separate HB trend file is skipped
TREND_NI <- fread("Rossmann_DE_NI.csv",sep=",", skip = 9, nrows= 142)
TREND_NI <- data.table(TREND_NI)
colnames(TREND_NI) <- c("Date","Trends")
TREND_NI$Date <- substr(TREND_NI$Date,14,23)
final_NI <- left_join(full_NI,TREND_NI)
TREND_NW <- fread("Rossmann_DE_NW.csv",sep=",", skip = 9, nrows= 142)
TREND_NW <- data.table(TREND_NW)
colnames(TREND_NW) <- c("Date","Trends")
TREND_NW$Date <- substr(TREND_NW$Date,14,23)
final_NW <- left_join(full_NW,TREND_NW)
TREND_RP <- fread("Rossmann_DE_RP.csv",sep=",", skip = 9, nrows= 142)
TREND_RP <- data.table(TREND_RP)
colnames(TREND_RP) <- c("Date","Trends")
TREND_RP$Date <- substr(TREND_RP$Date,14,23)
final_RP <- left_join(full_RP,TREND_RP)
TREND_SH <- fread("Rossmann_DE_SH.csv",sep=",", skip = 9, nrows= 142)
TREND_SH <- data.table(TREND_SH)
colnames(TREND_SH) <- c("Date","Trends")
TREND_SH$Date <- substr(TREND_SH$Date,14,23)
final_SH <- left_join(full_SH,TREND_SH)
TREND_SN <- fread("Rossmann_DE_SN.csv",sep=",", skip = 9, nrows= 142)
TREND_SN <- data.table(TREND_SN)
colnames(TREND_SN) <- c("Date","Trends")
TREND_SN$Date <- substr(TREND_SN$Date,14,23)
final_SN <- left_join(full_SN,TREND_SN)
TREND_ST <- fread("Rossmann_DE_ST.csv",sep=",", skip = 9, nrows= 142)
TREND_ST <- data.table(TREND_ST)
colnames(TREND_ST) <- c("Date","Trends")
TREND_ST$Date <- substr(TREND_ST$Date,14,23)
final_ST <- left_join(full_ST,TREND_ST)
TREND_TH <- fread("Rossmann_DE_TH.csv",sep=",", skip = 9, nrows= 142)
TREND_TH <- data.table(TREND_TH)
colnames(TREND_TH) <- c("Date","Trends")
TREND_TH$Date <- substr(TREND_TH$Date,14,23)
final_TH <- left_join(full_TH,TREND_TH)
final_full <- bind_rows(final_HE,final_SN,final_ST,final_BE,final_BW,final_BY,final_HH,final_NW,final_RP,final_SH,final_TH,final_NI)
ggplot(data = final_full) + geom_bar(aes(x=State,fill=State)) 
final_full <- final_full %>% filter(is.na(State)==FALSE)
nrow(full)
nrow(final_full)
final_train <- final_full %>% filter(is.na(Id)==TRUE)    # rows without an Id came from train
final_test <- final_full %>% filter(is.na(Id)==FALSE)    # rows with an Id came from test
# (presumably renamed later to the real_train.csv / real_test.csv read by the modeling code below)
write.csv(final_train,"final_train.csv",row.names=FALSE)
write.csv(final_test,"final_test.csv",row.names=FALSE)

Modeling code

#private LB = 0.11908, roughly the top 13% (440th out of 3,303 teams)
library(dplyr)
library(data.table)
library(readr)
library(xgboost)
cat("reading the train and test data\n")
train <- fread("real_train.csv")
test  <- fread("real_test.csv")
store <- fread("store.csv")
# merge in the store meta data; the Date column is split into month/year/day and dropped below
train <- merge(train,store)
test <- merge(test,store)
# keep only days when the store was open and had nonzero sales (zero-sales days are ignored by the metric)
train <- train[ which(train$Open=='1'),]
train <- train[ which(train$Sales!='0'),]

# separating out the elements of the date column for the train set
train$Date <- as.Date(train$Date)
train$month <- as.integer(format(train$Date, "%m"))
train$year <- as.integer(format(train$Date, "%y"))
train$day <- as.integer(format(train$Date, "%d"))
train <- train %>% select(-Date)

test$Date <- as.Date(test$Date)
test$month <- as.integer(format(test$Date, "%m"))
test$year <- as.integer(format(test$Date, "%y"))
test$day <- as.integer(format(test$Date, "%d"))
test <- test %>% select(-Date)
names(train)
feature.names <- names(train)[c(1,2,5:10,12:13,17:23)]
feature.names
# integer-encode every character feature, using levels shared across train and test
for (f in feature.names) {
  if (class(train[[f]])=="character") {
    levels <- unique(c(train[[f]], test[[f]]))
    train[[f]] <- as.integer(factor(train[[f]], levels=levels))
    test[[f]]  <- as.integer(factor(test[[f]],  levels=levels))
  }
}
cat("train data column names after slight feature engineering\n")
names(train)
cat("test data column names after slight feature engineering\n")
names(test)
RMPSE<- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  elab<-exp(as.numeric(labels))-1
  epreds<-exp(as.numeric(preds))-1
  err <- sqrt(mean((epreds/elab-1)^2))
  return(list(metric = "RMPSE", value = err))
}
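
For reference, the competition metric is the RMSPE (the feval above spells it RMPSE):

RMSPE = sqrt( mean( ((Sales - Pred) / Sales)^2 ) )

where days with zero sales are ignored, which is why rows with Sales == 0 were dropped from train earlier. Since the model is trained on log(Sales + 1), the function back-transforms labels and predictions with exp(.) - 1 before computing the error.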
nrow(train)

h<-sample(nrow(train),100000)   # hold out 100,000 random rows for the validation watchlist

# keep only the model features so the design matrix matches test[,feature.names] at prediction time
tra <- as.data.frame(train)[,feature.names]

dval<-xgb.DMatrix(data=data.matrix(tra[h,]),label=log(train$Sales+1)[h])

dtrain<-xgb.DMatrix(data=data.matrix(tra[-h,]),label=log(train$Sales+1)[-h])

watchlist<-list(val=dval,train=dtrain)

param <- list(  objective           = "reg:linear", 
                booster = "gbtree",
                eta                 = 0.02, # 0.06, #0.01,
                max_depth           = 10, #changed from default of 8
                subsample           = 0.9, # 0.7
                colsample_bytree    = 0.7 # 0.7
                #num_parallel_tree   = 2
                # alpha = 0.0001, 
                # lambda = 1
)

clf <- xgb.train(   params              = param, 
                    data                = dtrain, 
                    nrounds             = 3000, #300, #280, #125, #250, # changed from 300
                    verbose             = 0,
                    early.stop.round    = 100,
                    watchlist           = watchlist,
                    maximize            = FALSE,
                    feval=RMPSE
)
pred1 <- exp(predict(clf, data.matrix(test[,feature.names]))) - 1   # back-transform from log(Sales+1)
submission <- data.frame(Id=test$Id, Sales=pred1)
cat("saving the submission file\n")
write_csv(submission, "rf3.csv")

# closed stores (Open == 0) in the test period sell nothing, so overwrite their predicted Sales with 0
TEST <- test %>% filter(Open==0) %>% select(Id,Sales)
TEST$Sales[is.na(TEST$Sales)] <- 0
submission1 <- submission %>% filter(!(Id %in% TEST$Id))
submission2 <- bind_rows(submission1,TEST)
write_csv(submission2, "rf4.csv")