– 這個任務的資料下載是需要經過申請的,請你找助教申請帳號。
– 你會得到3個檔案,分別是「train.csv」、「test.csv」以及「sample_submission.csv」。
library(data.table)

# Read the three competition files from the working directory.
# `data.table = FALSE` makes fread() return plain data.frames, which the
# base-style `[row, col]` indexing used in the rest of the script relies on.
train_dat  <- fread(input = 'train.csv', data.table = FALSE)
test_dat   <- fread(input = 'test.csv', data.table = FALSE)
submit_dat <- fread(input = 'sample_submission.csv', data.table = FALSE)
– 由於「test.csv」並未提供正確答案(LVD),我們無法直接評估模型在測試資料上的表現,因此必須從原始訓練樣本中切割出一部分(這裡取 20%)作為「驗證組」,這樣才能估計模型的準確度:
library(xgboost)

# Dummy-code a data.frame of predictors WITHOUT dropping incomplete rows.
# model.matrix() on its own applies the session's `na.action` (na.omit by
# default) and silently removes every row that contains an NA; the feature
# matrix would then be shorter than the label vector / submission table.
# Building the model frame with `na.pass` keeps one output row per input row.
# The intercept column is kept here and removed at the point of use.
make_design_matrix <- function(feature_df) {
  feature_frame <- model.frame(~ ., data = feature_df, na.action = na.pass)
  model.matrix(~ ., data = feature_frame)
}

# Hold out 20% of the labelled rows as a validation set; the seed makes the
# split reproducible.
set.seed(0)
train_idx <- sample(seq_len(nrow(train_dat)), nrow(train_dat) * 0.8)

# Columns 1 and 2 of train_dat are excluded from the predictors; the label is
# read from the 'LVD' column.
# NOTE(review): assumes column 1 is an identifier and column 2 is LVD - confirm.
train_X_mat <- make_design_matrix(train_dat[train_idx, -1:-2])
xgb.data_train <- xgb.DMatrix(data = train_X_mat[, -1, drop = FALSE],
                              label = train_dat[train_idx, 'LVD'])

valid_X_mat <- make_design_matrix(train_dat[-train_idx, -1:-2])
xgb.data_valid <- xgb.DMatrix(data = valid_X_mat[, -1, drop = FALSE],
                              label = train_dat[-train_idx, 'LVD'])

# Booster settings go through `params`; training stops early once the
# validation AUC has not improved for 10 consecutive rounds.
xgb_fit <- xgb.train(
  params = list(objective = "binary:logistic", eval_metric = 'auc', nthread = 2),
  data = xgb.data_train,
  nrounds = 200,
  watchlist = list(eval = xgb.data_valid),
  early_stopping_rounds = 10,
  verbose = FALSE
)

# Score the test set (only its first column is excluded) and write the
# submission file.
test_X_mat <- make_design_matrix(test_dat[, -1])
submit_dat[,'p_LVD'] <- predict(xgb_fit, test_X_mat[, -1, drop = FALSE])
fwrite(submit_dat, file = 'my_submission.csv', na = '', row.names = FALSE, quote = FALSE)
你可以使用「help()」函數看看「xgb.train()」有哪些參數可以調整。
另外,你也可以換一些模型進行實驗,舉例來說我們可以有以下選擇(每一種都會覆寫「submit_dat」的「p_LVD」欄位,記得重新執行「fwrite()」輸出檔案):
library(e1071)

# Support vector classifier on every column of train_dat except the first.
svm_fit <- svm(LVD ~ ., data = train_dat[,-1], type = 'C-classification')
pred_test <- predict(svm_fit, test_dat, decision.values = TRUE)

# For a two-class model `decision.values` is a one-column matrix whose column
# name is "<a>/<b>"; a positive value votes for class <a>. Which class comes
# first depends on the order the labels were met during training, so the raw
# value can rank cases in reverse. Flip the sign when class 0 is the
# positive side so that a larger score always means "more likely LVD = 1".
# NOTE(review): assumes LVD is coded 0/1 - confirm against the data.
# The result is a ranking score, not a calibrated probability.
svm_decision <- attr(pred_test, "decision.values")
svm_score <- as.numeric(svm_decision[, 1])
if (startsWith(colnames(svm_decision)[1], "0/")) {
  svm_score <- -svm_score
}
submit_dat[,'p_LVD'] <- svm_score
library(randomForest)

# Convert the outcome to a factor before fitting so that class
# probabilities can be requested at prediction time.
train_dat[, 'LVD'] <- as.factor(train_dat[, 'LVD'])

# Fit on every column of train_dat except the first.
rf_fit <- randomForest(
  LVD ~ .,
  data = train_dat[, -1]
)

# `type = 'prob'` returns one probability column per class; the second
# column is stored as the submission score.
pred_test <- predict(
  rf_fit,
  test_dat,
  type = 'prob'
)
submit_dat[, 'p_LVD'] <- pred_test[, 2]
library(glmnet)

# Bring LVD back to integer codes (it is a factor if the randomForest section
# ran first); going through as.character() recovers the original labels
# rather than the internal factor codes.
train_dat[,'LVD'] <- as.integer(as.character(train_dat[,'LVD']))

# Dummy-coded predictors without the intercept column (glmnet adds its own).
# `drop = FALSE` keeps a matrix even if only one predictor column remains.
# NOTE(review): model.matrix() drops rows containing NA under the default
# `na.action`, and glmnet cannot handle NA - impute beforehand if the data
# has missing values.
train_X_mat <- model.matrix(~ ., data = train_dat[,-1:-2])[, -1, drop = FALSE]
train_Y_mat <- matrix(train_dat[,"LVD"], ncol = 1)

# Cross-validated elastic net (alpha = 0.5) for a binary outcome.
fit_glmnet <- cv.glmnet(x = train_X_mat, y = train_Y_mat, family = 'binomial', alpha = 0.5)

test_X_mat <- model.matrix(~ ., data = test_dat[,-1])[, -1, drop = FALSE]

# `type = "response"` returns fitted probabilities; the default ("link")
# would write log-odds into a column that is named as a probability.
pred_test <- predict(fit_glmnet, test_X_mat, s = "lambda.1se", type = "response")
submit_dat[,'p_LVD'] <- as.numeric(pred_test)