Improve accuracy randomForest classification model

Question

How do I improve the accuracy of the following data. It is from the following Kaggle competition which I am doing (despite it being closed for a school project).

afinn <- get_sentiments("afinn") %>% 
  mutate(value = case_when(value > 0 ~ 1,
                           value < 0 ~ -1))
nrc <- get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  mutate(value = case_when(sentiment == "positive" ~ 1,
                           sentiment == "negative" ~ -1)) %>% 
  select(!sentiment)
bing <- get_sentiments("bing") %>% 
  mutate(value = case_when(sentiment == "positive" ~ 1,
                           sentiment == "negative" ~ -1)) %>% 
  select(!sentiment)
sentiments <- afinn %>% 
  full_join(nrc) %>% full_join(bing) %>% 
  distinct()
data_original <- read_csv("Data/train.csv") %>% 
  relocate(PetID, .before = 1) %>% 
  select(!(RescuerID | PetID | Name)) %>% 
  mutate(Breed2 = replace_na(Breed2, 0),
         Color2 = replace_na(Color2, 0),
         Color3 = replace_na(Color3, 0)) %>% 
  mutate(AdoptionSpeed = as_factor(AdoptionSpeed),
         MaturitySize = as_factor(MaturitySize),
         FurLength = as_factor(FurLength),
         Vaccinated = as_factor(Vaccinated),
         Dewormed = as_factor(Dewormed),
         Sterilized = as_factor(Sterilized),
         Health = as_factor(Health),
         Breed1 = as.character(Breed1),
         Breed2 = as.character(Breed2),
         Color1 = as_factor(Color1),
         Color2 = as_factor(Color2),
         Color3 = as_factor(Color3),
         Gender = as_factor(Gender),
         Type = as_factor(Type),
         ) %>% 
  mutate(desc_length = str_length(Description)) %>% 
  unnest_tokens(word, Description, "words") %>% 
  left_join(sentiments, by = "word", relationship = "many-to-many") %>% 
  group_by(Type, Age, Breed1, Breed2, Gender, Color1, Color2, Color3, MaturitySize, FurLength, 
           Vaccinated, Dewormed, Sterilized, Health, Quantity, Fee, State, AdoptionSpeed) %>% 
  summarise(sentiment = mean(value, na.rm = TRUE), sentiment = replace_na(sentiment, 0)) %>% 
  mutate(PureBreed = ifelse(Breed2 == 0, TRUE, FALSE),
         ColorCount = as_factor(case_when(Color2 == 0 & Color3 == 0 ~ 1,
                                Color2 == 0 & Color3 != 0 ~ 2,
                                Color2 != 0 & Color3 != 0 ~ 3)),
         Free = ifelse(Fee == 0, TRUE, FALSE)) %>% 
  relocate(AdoptionSpeed, .after = last_col()) %>% 
    drop_na() %>% ungroup()

I've already done the following:

set.seed(222)
#randomforest
rf_tree <- randomForest(AdoptionSpeed~ ., data = data_original,
                 method = "rf")
tune_rf <- tuneRF(data_original[1:20], data_original$AdoptionSpeed, ntreeTry=100, 
                  stepFactor=1.5, improve=0.0001, trace=TRUE, plot=TRUE)
#best mtry = 2
rf_tree2 <- randomForest(AdoptionSpeed~ ., data = data_original, mtry = 2)
#increase number of trees
plot(rf_tree2)
rf_tree3 <- randomForest(AdoptionSpeed~ ., data = data_original, 
                         mtry = 2, ntree = 1000)

I've gotten the following results so far:

> print(rf_tree)
Call:
 randomForest(formula = AdoptionSpeed ~ ., data = data_original,      method = "rf") 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4
    OOB estimate of  error rate: 63.4%

Confusion matrix:
  0   1   2   3    4 class.error
0 1  78  71  26   63   0.9958159
1 5 545 589 190  412   0.6869615
2 5 506 748 328  610   0.6595357
3 2 314 551 249  619   0.8564841
4 3 261 460 220 1524   0.3824959
> print(rf_tree2)
Call:
 randomForest(formula = AdoptionSpeed ~ ., data = data_original,      mtry = 2) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2
    OOB estimate of  error rate: 62.1%

Confusion matrix:
  0   1   2   3    4 class.error
0 0  63  87  13   76   1.0000000
1 1 409 793  69  469   0.7650775
2 0 367 998 116  716   0.5457442
3 0 216 691 125  703   0.9279539
4 0 191 567  66 1644   0.3338736
> print(rf_tree3)
Call:
 randomForest(formula = AdoptionSpeed ~ ., data = data_original,      mtry = 2, ntree = 1000) 
               Type of random forest: classification
                     Number of trees: 1000
No. of variables tried at each split: 2
    OOB estimate of  error rate: 61.81%

Confusion matrix:
  0   1   2   3    4 class.error
0 0  66  83  12   78   1.0000000
1 1 435 767  62  476   0.7501436
2 0 378 974 137  708   0.5566682
3 0 215 694 126  700   0.9273775
4 0 184 543  76 1665   0.3253647

How do I improve the accuracy further, I feel like I've tried quite a lot (with features) but nothing really helped

https://stats.stackexchange.com/q/312780/1352 – Stephan Kolassa Jun 27 '23 at 02:14 — Stephan Kolassa, Jun 27 '23 at 02:14

Improve accuracy randomForest classification model

0 Answers0