1

How do I improve the accuracy of the following data. It is from the following Kaggle competition which I am doing (despite it being closed for a school project).

afinn <- get_sentiments("afinn") %>% 
  mutate(value = case_when(value > 0 ~ 1,
                           value < 0 ~ -1))
nrc <- get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  mutate(value = case_when(sentiment == "positive" ~ 1,
                           sentiment == "negative" ~ -1)) %>% 
  select(!sentiment)
bing <- get_sentiments("bing") %>% 
  mutate(value = case_when(sentiment == "positive" ~ 1,
                           sentiment == "negative" ~ -1)) %>% 
  select(!sentiment)
sentiments <- afinn %>% 
  full_join(nrc) %>% full_join(bing) %>% 
  distinct()
data_original <- read_csv("Data/train.csv") %>% 
  relocate(PetID, .before = 1) %>% 
  select(!(RescuerID | PetID | Name)) %>% 
  mutate(Breed2 = replace_na(Breed2, 0),
         Color2 = replace_na(Color2, 0),
         Color3 = replace_na(Color3, 0)) %>% 
  mutate(AdoptionSpeed = as_factor(AdoptionSpeed),
         MaturitySize = as_factor(MaturitySize),
         FurLength = as_factor(FurLength),
         Vaccinated = as_factor(Vaccinated),
         Dewormed = as_factor(Dewormed),
         Sterilized = as_factor(Sterilized),
         Health = as_factor(Health),
         Breed1 = as.character(Breed1),
         Breed2 = as.character(Breed2),
         Color1 = as_factor(Color1),
         Color2 = as_factor(Color2),
         Color3 = as_factor(Color3),
         Gender = as_factor(Gender),
         Type = as_factor(Type),
         ) %>% 
  mutate(desc_length = str_length(Description)) %>% 
  unnest_tokens(word, Description, "words") %>% 
  left_join(sentiments, by = "word", relationship = "many-to-many") %>% 
  group_by(Type, Age, Breed1, Breed2, Gender, Color1, Color2, Color3, MaturitySize, FurLength, 
           Vaccinated, Dewormed, Sterilized, Health, Quantity, Fee, State, AdoptionSpeed) %>% 
  summarise(sentiment = mean(value, na.rm = TRUE), sentiment = replace_na(sentiment, 0)) %>% 
  mutate(PureBreed = ifelse(Breed2 == 0, TRUE, FALSE),
         ColorCount = as_factor(case_when(Color2 == 0 & Color3 == 0 ~ 1,
                                Color2 == 0 & Color3 != 0 ~ 2,
                                Color2 != 0 & Color3 != 0 ~ 3)),
         Free = ifelse(Fee == 0, TRUE, FALSE)) %>% 
  relocate(AdoptionSpeed, .after = last_col()) %>% 
    drop_na() %>% ungroup()

I've already done the following:

set.seed(222)

#randomforest rf_tree <- randomForest(AdoptionSpeed~ ., data = data_original, method = "rf")

tune_rf <- tuneRF(data_original[1:20], data_original$AdoptionSpeed, ntreeTry=100, stepFactor=1.5, improve=0.0001, trace=TRUE, plot=TRUE)

#best mtry = 2 rf_tree2 <- randomForest(AdoptionSpeed~ ., data = data_original, mtry = 2)

#increase number of trees plot(rf_tree2) rf_tree3 <- randomForest(AdoptionSpeed~ ., data = data_original, mtry = 2, ntree = 1000)

I've gotten the following results so far:

> print(rf_tree)

Call: randomForest(formula = AdoptionSpeed ~ ., data = data_original, method = "rf") Type of random forest: classification Number of trees: 500 No. of variables tried at each split: 4

    OOB estimate of  error rate: 63.4%

Confusion matrix: 0 1 2 3 4 class.error 0 1 78 71 26 63 0.9958159 1 5 545 589 190 412 0.6869615 2 5 506 748 328 610 0.6595357 3 2 314 551 249 619 0.8564841 4 3 261 460 220 1524 0.3824959 > print(rf_tree2)

Call: randomForest(formula = AdoptionSpeed ~ ., data = data_original, mtry = 2) Type of random forest: classification Number of trees: 500 No. of variables tried at each split: 2

    OOB estimate of  error rate: 62.1%

Confusion matrix: 0 1 2 3 4 class.error 0 0 63 87 13 76 1.0000000 1 1 409 793 69 469 0.7650775 2 0 367 998 116 716 0.5457442 3 0 216 691 125 703 0.9279539 4 0 191 567 66 1644 0.3338736 > print(rf_tree3)

Call: randomForest(formula = AdoptionSpeed ~ ., data = data_original, mtry = 2, ntree = 1000) Type of random forest: classification Number of trees: 1000 No. of variables tried at each split: 2

    OOB estimate of  error rate: 61.81%

Confusion matrix: 0 1 2 3 4 class.error 0 0 66 83 12 78 1.0000000 1 1 435 767 62 476 0.7501436 2 0 378 974 137 708 0.5566682 3 0 215 694 126 700 0.9273775 4 0 184 543 76 1665 0.3253647

How do I improve the accuracy further, I feel like I've tried quite a lot (with features) but nothing really helped

Imme
  • 11

0 Answers0