I have a dataframe that is made up of many datasets combined together (the datasets share the same predictive features but contain different samples). This dataframe, called scoresWithResponse, contains the target feature (a binary feature, Response / NoResponse) along with many predictive features. The features are in the columns, and the samples are in the rows.
The problem is that I get an accuracy of 1 on the training set (AUC = 1).
I am using cross-validation, and I did feature selection by calculating the correlation of each predictive feature with the target feature (this reduced the number of predictive features from 38 to 26). Still, the accuracy and precision are 1, which looks like massive overfitting, and I don't know what is causing it.
The data is imbalanced, but not severely: class NoResponse has about 900 samples and class Response has about 550 samples.
Is there any tool in R that I can use to fix this?
This is a sample of the scoresWithResponse df. The target feature is called response:
structure(list(response = c("NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "NoResponse", "NoResponse", "NoResponse", "NoResponse",
"NoResponse", "Response", "Response", "Response", "Response",
"Response", "Response", "Response", "Response", "Response", "Response",
"NoResponse"), Adipocytes = c(-0.00147259448084057, -0.00147259448084057,
-0.00147259448084057, -0.00147259448084057, 0.00496131218358668,
0.0548062334131809, -0.00147259448084057, -0.00147259448084057,
-0.00147259448084057, 0.179318801379469, -0.00147259448084057,
0.0152373546449648, 0.00471475971580131, -0.00147259448084057,
-0.00147259448084057, -0.00147259448084057, 0.0223344650075101,
-0.00147259448084057, 0.440899655514849, -0.00147259448084057,
-0.00147259448084057, -0.00147259448084057, -0.00147259448084057,
0.00667190889349088, 0.0209909859594865, -0.00147259448084057,
0.141500721589385, 0.3787676219377, -0.00147259448084057, 0.100845895196263,
0.00376639229249542, -0.00147259448084057, -0.00147259448084057,
-0.00147259448084057, -0.00147259448084057, -0.00147259448084057,
-0.00147259448084057, -0.00147259448084057, -0.00147259448084057,
0.0143591851858567, 0.029973315783053, 0.109643119475897, -0.00147259448084057,
-0.00147259448084057, 0.033995214866229, -0.00147259448084057,
-0.00147259448084057, 0.00729426178712381, -0.00147259448084057,
0.0117856592972338), B.cells = c(0.0784674365204107, 0.0361475824469536,
1.23322306963728, 0.168147773376171, 0.0303336417563689, 0.030213842229231,
0.0104772945803579, 0.6165526362465, 0.0558472468040232, 0.199731757583661,
0.0332625376923969, 0.0254342545665831, 0.0814952345178611, 0.0571654345486949,
0.0104772945803579, 0.0384667823875126, 0.0992166491520768, 0.0909511066706036,
0.0417584790221351, 0.0104772945803579, 0.0168983625736141, 0.0104772945803579,
0.0205640352085533, 0.0356276975652135, 0.118678007191133, 0.0254963339043755,
0.0335184044896974, 0.133580305750364, 0.0629651829274283, 0.0104772945803579,
0.0577719570574633, 0.0354883966171345, 0.0584470348183582, 0.0104772945803579,
0.0478863386113402, 0.0217393584085478, 0.583228651740994, 0.0104772945803579,
0.0433224106904174, 0.0344094787634546, 0.046651993965254, 0.0104772945803579,
0.0104772945803579, 0.0104772945803579, 0.0104772945803579, 0.0426020746172987,
0.060934091547895, 0.0550381273743561, 0.0331768751035052, 0.465658884141495
), Basophils = c(0.078906060975291, 0.185474445082498, 0.134401186945301,
0.665820958289465, 0.0658110816910161, 0.549145210521612, 0.127165943704943,
0.0388563858940922, 0.12506366351286, 0.0687444342155732, 0.128218399654121,
0.264632805160075, 0.111198850866759, 0.0740271877643854, 0.0547830059780378,
0.0788819375423888, 0.122504718666759, 0.116646155897171, 0.0388563858940922,
0.0388563858940922, 0.0847087204695266, 0.170267877921665, 0.192990813992193,
0.08636451452274, 0.205951716498705, 0.0712175634510349, 0.0783020221825332,
0.139422879181795, 0.155482339672933, 0.0388563858940922, 0.110246668069126,
0.110815958263559, 0.332485196949693, 0.0658358036825454, 0.0388563858940922,
0.0790990677422829, 0.0388563858940922, 0.0685108479102874, 0.0603506212082886,
0.0458583547445136, 0.124593010554148, 0.157943929160168, 0.0388563858940922,
0.0388563858940922, 0.0769474587712551, 0.184789379564458, 0.141639474022545,
0.112218409146376, 0.0886279736342864, 0.141914971660106), CD4..naive.T.cells = c(0.000352118916979948,
0.000352118916979948, 0.195806941197775, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.00671990447291144, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.038732799099841,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.000352118916979948, 0.0101707970144034, 0.000352118916979948,
0.000352118916979948, 0.000352118916979948, 0.000352118916979948,
0.0259073784136612), CD4..Tcm = c(0.00468132560940506, 0.00468132560940506,
0.0229444590832302, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.00468132560940506, 0.00468132560940506,
0.00468132560940506, 0.013310582713857, -0.0341347438586641),
CD8..T.cells = c(0.0674507123528756, 0.0333806083003552,
0.346632430262511, 0.0651318726541825, 0.05791288966358,
0.0333806083003552, 0.0333806083003552, 0.145972600599313,
0.0455590324140021, 0.0557212756574658, 0.0458694514125555,
0.0333806083003552, 0.070967388083968, 0.0333806083003552,
0.0333806083003552, 0.0333806083003552, 0.0841865232673732,
0.169166782933996, 0.0333806083003552, 0.0508094953191331,
0.0333806083003552, 0.0333806083003552, 0.0333806083003552,
0.0836162747296847, 0.0533760632756103, 0.0333806083003552,
0.0333806083003552, 0.159932930078836, 0.105368013137071,
0.0333806083003552, 0.0790884230508081, 0.0333806083003552,
0.0333806083003552, 0.0333806083003552, 0.0437642832942929,
0.0333806083003552, 0.22743171311185, 0.0333806083003552,
0.0333806083003552, 0.0661391850408331, 0.0333806083003552,
0.0333806083003552, 0.0333806083003552, 0.0333806083003552,
0.0333806083003552, 0.0333806083003552, 0.12082525756811,
0.0490077373424495, 0.0333806083003552, 0.240026829190165
), CD8..Tcm = c(0.193057099564419, 0.022148022736091, 0.505616937986011,
0.238282112180758, 0.0802713022259099, 0.0342057097559616,
0.022148022736091, 0.258249161888649, 0.150220291489214,
0.167072217214248, 0.0300146589259272, 0.022148022736091,
0.0793499984509763, 0.0346033174041023, 0.022148022736091,
0.022148022736091, 0.123946309722711, 0.36610054169658, 0.0370798548885808,
0.022148022736091, 0.022148022736091, 0.022148022736091,
0.0339040869679633, 0.154406584553023, 0.177613519349749,
0.04051161748082, 0.0909580697983234, 0.293962482478071,
0.157866758221804, 0.0296400174109871, 0.151550569935703,
0.022148022736091, 0.0621661868212978, 0.022148022736091,
0.0987008846736535, 0.022148022736091, 0.366709884745067,
0.022148022736091, 0.0402905235116509, 0.0948609973244887,
0.0569280565039912, 0.022148022736091, 0.022148022736091,
0.0309187098761019, 0.022148022736091, 0.112148244674048,
0.220105996648614, 0.0934563517188017, 0.022148022736091,
0.343363303326582), Class.switched.memory.B.cells = c(0.0487914604272673,
0.0411529688937465, 0.335591478817477, 0.0310529711582873,
0.0310529711582873, 0.0310529711582873, 0.0310529711582873,
0.182205893480734, 0.052142399620452, 0.123251255694447,
0.0534870034377551, 0.0555369571975287, 0.0548278202087546,
0.0310529711582873, 0.0310529711582873, 0.0310529711582873,
0.0310529711582873, 0.104615136743085, 0.0435557901247572,
0.0429005295010426, 0.0310529711582873, 0.0310529711582873,
0.0310529711582873, 0.038962823575526, 0.0988525482538249,
0.0539200343035075, 0.0310529711582873, 0.0651637094528887,
0.0310529711582873, 0.0566023036659616, 0.0496492442525651,
0.0310529711582873, 0.0634323182972408, 0.0310529711582873,
0.0310529711582873, 0.0310529711582873, 0.167624227076226,
0.0310529711582873, 0.0310529711582873, 0.0310529711582873,
0.0310529711582873, 0.0371717851391283, 0.0483888468863344,
0.0941646814676068, 0.0310529711582873, 0.0310529711582873,
0.0310529711582873, 0.0364758992300671, 0.0410986750456075,
0.133092168173884), DC = c(0.0664041472178045, 0.001974947933282,
0.0526951586016229, 0.135108924806137, 0.0133056050961418,
0.0775815332310746, 0.001974947933282, 0.0369699850727529,
0.0214395600577855, 0.0339845994266395, 0.0111625280105067,
0.001974947933282, 0.0186918493607988, 0.0333206480825139,
0.0102856708829166, 0.001974947933282, 0.0498326250235207,
0.0364369465986278, 0.0723203591042894, 0.0423329199616102,
0.001974947933282, 0.001974947933282, 0.001974947933282,
0.0556268755782605, 0.0660946770036415, 0.0741807202880026,
0.0308137087616544, 0.0760961528436393, 0.0143083258007051,
0.0127495162722945, 0.0431798081217165, 0.001974947933282,
0.0170564729518741, 0.0121701726086785, 0.001974947933282,
0.001974947933282, 0.0609445697531212, 0.001974947933282,
0.001974947933282, 0.0324793759855767, 0.0228625916874287,
0.001974947933282, 0.001974947933282, 0.019310419413258,
0.0232906676085244, 0.0758116669022375, 0.04027829659775,
0.0204012723800148, 0.001974947933282, 0.0257352602124135
)), row.names = c("Pt1", "Pt10", "Pt101", "Pt103", "Pt106",
"Pt11", "Pt17", "Pt18", "Pt2", "Pt24", "Pt26", "Pt27", "Pt28",
"Pt29", "Pt3", "Pt30", "Pt31", "Pt34", "Pt36", "Pt37", "Pt38",
"Pt39", "Pt4", "Pt44", "Pt46", "Pt47", "Pt48", "Pt49", "Pt5",
"Pt52", "Pt59", "Pt62", "Pt65", "Pt66", "Pt67", "Pt72", "Pt77",
"Pt78", "Pt79", "Pt8", "Pt82", "Pt84", "Pt85", "Pt89", "Pt9",
"Pt90", "Pt92", "Pt94", "Pt98", "EA595454"), class = "data.frame")
This is the code:
library(caret)
library(MLeval)
library(DMwR2)
library(predtools)
# ---- Data preparation, train/test split and feature selection ----
# Builds `scoresWithResponse` (response in column 1, one column per feature),
# splits it 80/20 and keeps only features whose biserial correlation with the
# response passes the thresholds below.
#
# Inputs read from the calling environment: `totaldata` (columns Cancer_Type
# and Response) and `Scores` (transposed below, so presumably features x
# samples -- TODO confirm).
# Objects created: cancer.type, scores.batch, scoresWithResponse, training,
# corVec, vc, scoresWithResponse.trn, scoresWithResponse.tst.

# Fixed seed so the partition (and everything downstream) is reproducible.
set.seed(42)

cancer.type <- totaldata$Cancer_Type
# NOTE(review): the batch correction is fitted on all samples, including the
# ones that end up in the test set. It does not use the response, but confirm
# this is acceptable for the intended evaluation.
scores.batch <- limma::removeBatchEffect(Scores, batch = cancer.type)

# Build the two-level response directly; "NoResponse" is the first level,
# matching the level order the original TRUE/FALSE recoding produced.
response <- factor(
  ifelse(totaldata$Response == "Response", "Response", "NoResponse"),
  levels = c("NoResponse", "Response")
)
scoresWithResponse <- data.frame(response = response, t(scores.batch))

# Split BEFORE selecting features: the selection looks at the response, so
# running it on all rows would let the held-out samples influence which
# features are kept and make the test-set estimate optimistic.
training <- createDataPartition(
  y = scoresWithResponse$response,
  p = 0.8,
  list = FALSE
)
trainRows <- scoresWithResponse[training, , drop = FALSE]

# Biserial correlation of every feature with the response, training rows only.
corVec <- vapply(
  trainRows[-1],
  ltm::biserial.cor,
  numeric(1),
  y = trainRows$response
)

# Thresholds kept from the original script (note they are asymmetric).
vc <- corVec > 0.01 | corVec < -0.02
# which() drops NA/NaN correlations (e.g. constant columns) instead of
# producing NA column names that would break the subsetting below.
selectedFeatures <- names(vc)[which(vc)]

# Keep the response as the first column together with the selected features;
# no need to re-attach it from another object afterwards.
scoresWithResponse <- scoresWithResponse[
  , c("response", selectedFeatures), drop = FALSE
]

scoresWithResponse.trn <- scoresWithResponse[training, , drop = FALSE]
scoresWithResponse.tst <- scoresWithResponse[-training, , drop = FALSE]
# ---- Random forest with 10-fold cross-validation ----
# Tunes `mtry` for a random forest on `scoresWithResponse.trn` and selects the
# best value by cross-validated ROC AUC.

# Fixed seed so the CV folds and the forests are reproducible.
set.seed(42)

# metric = "ROC" is only available when the resampling summary computes it:
# twoClassSummary reports ROC, Sens and Spec and needs classProbs = TRUE.
# Without it caret cannot find "ROC" among the resampling results.
ctrlCV <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

rfFit <- train(
  response ~ .,
  data = scoresWithResponse.trn,
  method = "rf",
  importance = TRUE,
  # ntree = 100,
  metric = "ROC",
  trControl = ctrlCV,
  tuneLength = 20
)
# ---- Evaluation ----
# Reports ROC/AUC and confusion matrices for the fitted model `rfFit`.
#
# The training-set numbers are resubstitution estimates: the forest is scored
# on the very rows it was grown on, so they are expected to be (near) perfect
# and say little about generalisation. Judge the model by the cross-validated
# results in `rfFit$results` and by the held-out test set.

# Training-set ROC. roc()/auc() come from pROC, so they are called with an
# explicit namespace. The probability column is selected by class name rather
# than by position, with "Response" treated as the case level.
rfROC <- pROC::roc(
  response = scoresWithResponse.trn$response,
  predictor = predict(
    rfFit, scoresWithResponse.trn, type = "prob"
  )[, "Response"],
  levels = c("NoResponse", "Response"),
  direction = "<"
)
plot(rfROC)
pROC::auc(rfROC)

# Cross-validated performance for every candidate mtry.
print(rfFit)
plot(rfFit) # best mtry in x axis
rfFit$results

# Held-out ROC: the estimate to compare against the cross-validated one.
rfROC.tst <- pROC::roc(
  response = scoresWithResponse.tst$response,
  predictor = predict(
    rfFit, scoresWithResponse.tst, type = "prob"
  )[, "Response"],
  levels = c("NoResponse", "Response"),
  direction = "<"
)
pROC::auc(rfROC.tst)

# confusionMatrix() expects predictions as `data` and the truth as
# `reference`; passing table(truth, prediction) swaps the two and therefore
# swaps sensitivity/specificity with the predictive values.
pred1 <- predict(rfFit, scoresWithResponse.trn)
confusionMatrix(
  data = pred1,
  reference = scoresWithResponse.trn$response,
  positive = "Response"
)

tst1 <- predict(rfFit, scoresWithResponse.tst)
confusionMatrix(
  data = tst1,
  reference = scoresWithResponse.tst$response,
  positive = "Response"
)
