下載資料

– 這個任務的資料下載是需要經過申請的,請你找助教申請帳號。

F01

– 你會得到3個檔案,分別是「train.csv」、「test.csv」以及「sample_submission.csv」。

讀取資料

library(data.table)

# Read one CSV with fread() and hand it back as a base data.frame rather than
# a data.table (that is what `data.table = FALSE` requests), since the rest of
# the script uses data.frame-style `df[rows, 'col']` indexing.
read_as_data_frame <- function(csv_path) {
  fread(csv_path, data.table = FALSE)
}

# Training data, test data, and the submission template.
train_dat <- read_as_data_frame('train.csv')
test_dat <- read_as_data_frame('test.csv')
submit_dat <- read_as_data_frame('sample_submission.csv')

插補資料

# Columns to impute: every column of train_dat except the first two
# (presumably the ID and the LVD outcome -- TODO confirm against train.csv).
interest_var <- colnames(train_dat)[-1:-2]

# Encode GENDER with ONE level set, taken from the training data, for both
# tables. Factorising each table on its own can give them different levels,
# which makes model.matrix() emit different dummy columns for train and test
# and makes predict() reject the test data. Test values never seen in training
# become NA here and are then filled by the imputation loop below.
gender_levels <- levels(as.factor(train_dat[,'GENDER']))
train_dat[,'GENDER'] <- factor(train_dat[,'GENDER'], levels = gender_levels)
test_dat[,'GENDER'] <- factor(test_dat[,'GENDER'], levels = gender_levels)

# Fill missing values column by column. The fill value is always computed from
# the TRAINING data only (mean for numeric columns, most frequent value
# otherwise) and the same value is applied to the test data, so nothing from
# the test set influences the imputation.
# Iterating over the names directly is also safe when interest_var is empty,
# unlike 1:length(interest_var), which would yield c(1, 0).
for (var_name in interest_var) {
  train_col <- train_dat[,var_name]
  if (is.numeric(train_col)) {
    m_val <- mean(train_col, na.rm = TRUE)
  } else {
    # table() ignores NA by default, so this is the most frequent observed value.
    m_val <- names(which.max(table(train_col)))
  }
  train_dat[is.na(train_col), var_name] <- m_val
  test_dat[is.na(test_dat[,var_name]), var_name] <- m_val
}

使用極限梯度提升機進行分析

– 由於我們沒有辦法得到「test.csv」的正確答案,我們必須從原始訓練樣本中切割出一部分作為「驗證組」,這樣才能評估模型的準確度:

library(xgboost)

# Fix the RNG so the random train/validation split below is reproducible.
set.seed(0)

# Sample 80% of the row indices for fitting; the remaining rows are held out
# as the validation set used for early stopping.
n_train_rows <- nrow(train_dat)
train_idx <- sample(seq_len(n_train_rows), n_train_rows * 0.8)

# Design matrices are built from all columns except the first two; the first
# column of each model.matrix() result (the intercept) is dropped before it is
# handed to xgboost. Labels come from the LVD column.
train_X_mat <- model.matrix(~ ., data = train_dat[train_idx, -1:-2])
train_labels <- train_dat[train_idx, 'LVD']
xgb.data_train <- xgb.DMatrix(data = train_X_mat[, -1], label = train_labels)

valid_X_mat <- model.matrix(~ ., data = train_dat[-train_idx, -1:-2])
valid_labels <- train_dat[-train_idx, 'LVD']
xgb.data_valid <- xgb.DMatrix(data = valid_X_mat[, -1], label = valid_labels)

# Booster settings: logistic objective, AUC as the monitored metric, 2 threads.
xgb_params <- list(
  objective = "binary:logistic",
  eval_metric = 'auc',
  nthread = 2
)

# Train for at most 200 rounds, stopping early once the validation AUC has not
# improved for 10 consecutive rounds.
xgb_fit <- xgb.train(
  params = xgb_params,
  data = xgb.data_train,
  nrounds = 200,
  watchlist = list(eval = xgb.data_valid),
  early_stopping_rounds = 10,
  verbose = FALSE
)

# Score the test set (all columns except the first, intercept dropped) and
# write the submission file.
test_X_mat <- model.matrix(~ ., data = test_dat[, -1])
submit_dat[,'p_LVD'] <- predict(xgb_fit, test_X_mat[, -1])
fwrite(submit_dat, file = 'my_submission.csv', na = '', row.names = FALSE, quote = FALSE)

調整參數及換模型

  1. 支持向量機(Support Vector Machine)
library(e1071)

# Fit a C-classification SVM for LVD using every column of train_dat except
# the first one as input.
svm_fit <- svm(LVD ~ ., data = train_dat[,-1], type = 'C-classification')

# decision.values = TRUE attaches the raw decision values to the prediction
# as the "decision.values" attribute (a one-column matrix for two classes).
pred_test <- predict(svm_fit, test_dat, decision.values = TRUE)
decision_vals <- attr(pred_test, "decision.values")

# The column is named "<A>/<B>" and positive values favour class A. Which
# class ends up as A depends on the label order seen during training, so the
# raw values may point towards LVD = 0. Flip the sign when needed so that a
# larger score always means "more likely LVD = 1".
# (Assumes LVD is coded 0/1 -- TODO confirm.)
if (!startsWith(colnames(decision_vals)[1], "1/")) {
  decision_vals <- -decision_vals
}

# Store a plain numeric vector rather than a one-column matrix.
submit_dat[,'p_LVD'] <- as.numeric(decision_vals)
  2. 隨機森林(Random forest)
library(randomForest)

# Turn the outcome into a factor before fitting the forest. Note that this
# changes train_dat itself, not a copy.
train_dat[,'LVD'] <- as.factor(train_dat[,'LVD'])

# Fit LVD on every column of train_dat except the first one.
rf_train <- train_dat[, -1]
rf_fit <- randomForest(LVD ~ ., data = rf_train)

# type = 'prob' requests per-class probabilities; the second column is kept
# as the score (the second factor level -- "1" if LVD is coded 0/1; TODO confirm).
pred_test <- predict(rf_fit, newdata = test_dat, type = 'prob')
submit_dat[,'p_LVD'] <- pred_test[, 2]
  3. 彈性網路(Elastic Net)
library(glmnet)

# Make sure LVD is stored as integers. Going through as.character() first
# matters when LVD is a factor (as after the random-forest section): it
# converts the labels themselves rather than the internal level indices.
train_dat[,'LVD'] <- as.integer(as.character(train_dat[,'LVD']))

# Predictor design matrix from all columns except the first two, with the
# intercept column from model.matrix() removed.
train_X_mat <- model.matrix(~ ., data = train_dat[,-1:-2])
train_X_mat <- train_X_mat[,-1]
train_Y_mat <- matrix(train_dat[,"LVD"], ncol = 1)

# Cross-validated elastic net (alpha = 0.5 mixes the L1 and L2 penalties)
# with a binomial (logistic) model.
fit_glmnet <- cv.glmnet(x = train_X_mat, y = train_Y_mat, family = 'binomial', alpha = 0.5)

# Same design-matrix construction for the test set (first column of test_dat
# excluded, intercept removed).
test_X_mat <- model.matrix(~ ., data = test_dat[,-1])
test_X_mat <- test_X_mat[,-1]

# type = 'response' returns fitted probabilities for a binomial model. The
# default (type = "link") returns log-odds, which are not probabilities and
# do not belong in a column named p_LVD.
pred_test <- predict(fit_glmnet, test_X_mat, s = "lambda.1se", type = 'response')
submit_dat[,'p_LVD'] <- as.numeric(pred_test)