# Nested cross-validation for tuning randomForest's `mtry` on the iris data.
#
# Outer loop: `nofrep` replications of `nfold`-fold CV estimate the test error.
# Inner (nested) loop: `nfoldnested`-fold CV on each outer training fold picks
# the best `mtry` among `noffeaturelevels` (fractions of the feature count).

# Split `Nobs` observations into `K` roughly equal random folds.
# Returns a list of K elements, each with `train` and `test` index vectors.
# Consumes one `sample()` draw, so results are reproducible under `set.seed`.
K_fold <- function(Nobs, K = 5) {
  id <- sample(rep(seq_len(K), length.out = Nobs))
  lapply(seq_len(K), function(k) {
    list(train = which(id != k), test = which(id == k))
  })
}

library(randomForest)  # library() errors if the package is missing; require() only warns

data(iris)
nofdata    <- nrow(iris)
noffeature <- ncol(iris) - 1          # last column (5) is the class label

noffeaturelevels <- c(0.2, 0.4, 0.6)  # feature-fraction levels to try for mtry
nfold       <- 10  # folds for the outer cross-validation
nofrep      <- 5   # replications of the outer cross-validation
nfoldnested <- 5   # folds for the nested CV used for parameter selection

# mtry candidate for each feature-fraction level; invariant across folds,
# so compute it once instead of inside the innermost loop
mtrylevels <- floor(noffeaturelevels * noffeature) + 1

# error_CV[i, j]: test-fold error of replication i, outer fold j
error_CV <- matrix(0, nofrep, nfold)

for (replication in seq_len(nofrep)) {
  set.seed(replication)              # reproducible fold assignment per replication
  cvinf <- K_fold(nofdata, nfold)    # outer CV split

  for (fold in seq_len(nfold)) {
    trainind  <- cvinf[[fold]]$train # training instance indices
    testind   <- cvinf[[fold]]$test  # test instance indices
    traindata <- iris[trainind, ]
    testdata  <- iris[testind, ]

    # nestedCVerror[i, j]: error of nested fold i at parameter level j
    nestedCVerror <- matrix(0, nfoldnested, length(noffeaturelevels))
    nestedcvinf   <- K_fold(nrow(traindata), nfoldnested)

    # nested cross-validation over the outer training fold
    for (foldnested in seq_len(nfoldnested)) {
      trainindnested <- nestedcvinf[[foldnested]]$train
      testindnested  <- nestedcvinf[[foldnested]]$test
      foldtraindata  <- traindata[trainindnested, ]
      foldtestdata   <- traindata[testindnested, ]

      for (f in seq_along(noffeaturelevels)) {
        nfeat <- mtrylevels[f]
        print(paste("Replication", replication, "Fold", fold,
                    "Nested Fold", foldnested,
                    "- Number of features to try", nfeat))
        # 50-tree forest with the candidate mtry; evaluated on the nested
        # test split via the xtest/ytest arguments
        iris.rf <- randomForest(foldtraindata[, -5], foldtraindata[, 5],
                                foldtestdata[, -5], foldtestdata[, 5],
                                mtry = nfeat, ntree = 50)
        # err.rate[50, 1]: overall test error rate after all 50 trees
        nestedCVerror[foldnested, f] <- iris.rf$test$err.rate[50, 1]
      }
    }

    # mean error for each parameter level, averaged over the nested folds
    avgerror <- colMeans(nestedCVerror)
    # parameter level with the minimum nested-CV error
    min_ind  <- which.min(avgerror)

    # retrain on the full outer training fold with the selected mtry and
    # score the held-out outer test fold
    nfeat   <- mtrylevels[min_ind]
    finalRF <- randomForest(traindata[, -5], traindata[, 5],
                            testdata[, -5], testdata[, 5],
                            mtry = nfeat, ntree = 50)
    error_CV[replication, fold] <- finalRF$test$err.rate[50, 1]
  }
}

# summarize over all replications x folds
overallError <- mean(error_CV)
overallSTdev <- sd(as.numeric(error_CV))  # sd() needs a vector, not a matrix
print(overallError)
print(overallSTdev)