1

How to calculate the standard deviation of the residuals for a pooled regression? (The idea, formula and/or R code are all welcomed)

As far as I understand it is the standard deviation of the estimated and actual values. From Statology Residual standard error

How to calculate this for a regression with multiple imputation? That is, to pool the residual standard error and get the equivalent for summary(linear_model)$sigma for regression analysis for multiple imputed data?

Below is my attempt at it. It would be tempting to just take the mean of the values (v1) but I assume that will have to use something more sophisticated, like rubin's rule ( see v2, which is heavily borrowed from here, I believe that the problem is in the betweenVar-variable as I do not know what the coefficients would be in this case. If the coefficient was 0, then the v1 (mean) would suffice. This would make sense as the residuals should be normally distributed around zero. In that case, the between variance would also be zero and could be dropped from the formula. Or what am I missing here ? )

library(tidyverse)
library(mice)
library(miceadds)
library(reprex) # does not work with this
library(styler)
set.seed(42)
options(warn=-1)
options(knitr.duplicate.label = "allow")

#---------------------------------------#

Data preparations

loading an editing data

d <- mtcars d <- d %>% mutate_at(c('cyl'),factor)

create missing data and impute it

mi_d <- d nr_of_NAs <- 30 for (i in 1:nr_of_NAs) { mi_d[sample(nrow(mi_d),1),sample(ncol(mi_d),1)] <- NA }

#multiple imputation mi_d <- mice(mi_d, m=2, maxit=2) #---------------------------------------#

regressions

#not imputed lm_d <- lm(qsec ~ cyl, data=d)

#imputed dataset lm_mi <- with(mi_d,lm(qsec ~cyl)) lm_mi_pool <- pool(lm_mi)

residual standard error

#not imputed: summary(lm_d)$sigma #1.489994

#-------------------------------------------------

residual standard error

imputed

v1 (probably too simple)

summaries <- lapply(lm_mi$analyses,summary) residual_standard_errors <- sapply(summaries,"[[","sigma") pooled_residual_standard_error <- mean(residual_standard_errors) # ???????? pooled_residual_standard_error # 1.490252

#-------------------------------------------------

residual standard error

imputed

v2 rubin's rule

Source: https://stats.stackexchange.com/q/327237/138594

pooled_rse.F <- function(MI_REG_OBJECT) { summaries <- lapply(lm_mi$analyses,summary) rse_vector <- sapply(summaries,"[[","sigma") m <- length(rse_vector)

betweenVar <- sd(rse_vector) # variance of variances. I believe that the problem lies (at least) in this variable withinVar <- mean(rse_vector^2) # mean of variances totVar <- withinVar + betweenVar+ betweenVar/m (pooled_rse <- sqrt(totVar)) # standard error }

pooled_rse <- pooled_rse.F(lm_mi) pooled_rse

1.490436

#------------------------------------------------

imputed

v3 rubin's rule

https://stats.stackexchange.com/q/327237/138594

pooled_rse.F <- function(MI_REG_OBJECT) { summaries <- lapply(lm_mi$analyses,summary) rse_vector <- sapply(summaries,"[[","sigma") m <- length(rse_vector) beta_hat <- mean(apply(sapply(summaries,"[[","residuals"),2,mean))

betweenVar <- sqrt(sum((rse_vector-beta_hat)^2)/(m-1)) #variance of estimates withinVar <- mean(rse_vector^2) # mean of variances totVar <- withinVar + betweenVar+ betweenVar/m (pooled_rse <- sqrt(totVar)) # standard error }

pooled_rse <- pooled_rse.F(lm_mi) pooled_rse

#2.980505

#------------------------------------------

0 Answers0