How do I improve the accuracy of the following data. It is from the following Kaggle competition which I am doing (despite it being closed for a school project).
afinn <- get_sentiments("afinn") %>%
mutate(value = case_when(value > 0 ~ 1,
value < 0 ~ -1))
nrc <- get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
mutate(value = case_when(sentiment == "positive" ~ 1,
sentiment == "negative" ~ -1)) %>%
select(!sentiment)
bing <- get_sentiments("bing") %>%
mutate(value = case_when(sentiment == "positive" ~ 1,
sentiment == "negative" ~ -1)) %>%
select(!sentiment)
sentiments <- afinn %>%
full_join(nrc) %>% full_join(bing) %>%
distinct()
data_original <- read_csv("Data/train.csv") %>%
relocate(PetID, .before = 1) %>%
select(!(RescuerID | PetID | Name)) %>%
mutate(Breed2 = replace_na(Breed2, 0),
Color2 = replace_na(Color2, 0),
Color3 = replace_na(Color3, 0)) %>%
mutate(AdoptionSpeed = as_factor(AdoptionSpeed),
MaturitySize = as_factor(MaturitySize),
FurLength = as_factor(FurLength),
Vaccinated = as_factor(Vaccinated),
Dewormed = as_factor(Dewormed),
Sterilized = as_factor(Sterilized),
Health = as_factor(Health),
Breed1 = as.character(Breed1),
Breed2 = as.character(Breed2),
Color1 = as_factor(Color1),
Color2 = as_factor(Color2),
Color3 = as_factor(Color3),
Gender = as_factor(Gender),
Type = as_factor(Type),
) %>%
mutate(desc_length = str_length(Description)) %>%
unnest_tokens(word, Description, "words") %>%
left_join(sentiments, by = "word", relationship = "many-to-many") %>%
group_by(Type, Age, Breed1, Breed2, Gender, Color1, Color2, Color3, MaturitySize, FurLength,
Vaccinated, Dewormed, Sterilized, Health, Quantity, Fee, State, AdoptionSpeed) %>%
summarise(sentiment = mean(value, na.rm = TRUE), sentiment = replace_na(sentiment, 0)) %>%
mutate(PureBreed = ifelse(Breed2 == 0, TRUE, FALSE),
ColorCount = as_factor(case_when(Color2 == 0 & Color3 == 0 ~ 1,
Color2 == 0 & Color3 != 0 ~ 2,
Color2 != 0 & Color3 != 0 ~ 3)),
Free = ifelse(Fee == 0, TRUE, FALSE)) %>%
relocate(AdoptionSpeed, .after = last_col()) %>%
drop_na() %>% ungroup()
I've already done the following:
set.seed(222)
#randomforest
rf_tree <- randomForest(AdoptionSpeed~ ., data = data_original,
method = "rf")
tune_rf <- tuneRF(data_original[1:20], data_original$AdoptionSpeed, ntreeTry=100,
stepFactor=1.5, improve=0.0001, trace=TRUE, plot=TRUE)
#best mtry = 2
rf_tree2 <- randomForest(AdoptionSpeed~ ., data = data_original, mtry = 2)
#increase number of trees
plot(rf_tree2)
rf_tree3 <- randomForest(AdoptionSpeed~ ., data = data_original,
mtry = 2, ntree = 1000)
I've gotten the following results so far:
> print(rf_tree)
Call:
randomForest(formula = AdoptionSpeed ~ ., data = data_original, method = "rf")
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 4
OOB estimate of error rate: 63.4%
Confusion matrix:
0 1 2 3 4 class.error
0 1 78 71 26 63 0.9958159
1 5 545 589 190 412 0.6869615
2 5 506 748 328 610 0.6595357
3 2 314 551 249 619 0.8564841
4 3 261 460 220 1524 0.3824959
> print(rf_tree2)
Call:
randomForest(formula = AdoptionSpeed ~ ., data = data_original, mtry = 2)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 62.1%
Confusion matrix:
0 1 2 3 4 class.error
0 0 63 87 13 76 1.0000000
1 1 409 793 69 469 0.7650775
2 0 367 998 116 716 0.5457442
3 0 216 691 125 703 0.9279539
4 0 191 567 66 1644 0.3338736
> print(rf_tree3)
Call:
randomForest(formula = AdoptionSpeed ~ ., data = data_original, mtry = 2, ntree = 1000)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 61.81%
Confusion matrix:
0 1 2 3 4 class.error
0 0 66 83 12 78 1.0000000
1 1 435 767 62 476 0.7501436
2 0 378 974 137 708 0.5566682
3 0 215 694 126 700 0.9273775
4 0 184 543 76 1665 0.3253647
How do I improve the accuracy further, I feel like I've tried quite a lot (with features) but nothing really helped