Possible Duplicate:
Method to estimate the prediction interval for GLM and Negative Binomial distribution
I am trying to compute prediction intervals for GLM using either Poisson or Negative Binomial distributions.
First I developped a Monte-Carlo approach for Gaussian distribution and derived it for the other distributions. Unfortunately, I am not enough confident with mathematics and statistics to judge the reliability of these estimations. Could you please tell me if what I did make sens or not?
# example for the linear model
x <- rnorm(200)
y <- 3*x+2+rnorm(200, 0, 0.75)
tab <- data.frame(x=x, y=y)
# selection of 50% of the data for the model computation
auxi <- sample(rep(c(1, 0), each=100))
auxi <- as.logical(auxi)
# the linear model
lm1 <- lm(y~x, data=tab, subset=auxi)
# predictions of the 50% remaining data (independent data set)
preds <- predict(lm1, newdata=tab[!auxi,], se=T, type="response", interval="prediction")
# The variance is splitted in two parts
# 1. the se associated with the expeceted values
# for each expected value, a 1000 random values from a normal distribution
# with mu=expected value and sigma the standard error associated with the expected value
p2 <- rnorm(1e3*100, preds$fit[,1], preds$se.fit)
# 2. the residual error
# for each expected value, for each value of the random samples
# 1000 random values are sampled from a normal distribution
# with mu = the random value of the expected value (from the 1000 random samples)
# and sigma = the residual standard deviation
p2 <- rnorm(100*1e6, p2, summary(lm1)$sigma)
# for each expected value (N=100) a random sample of 1e6 values was generated
dim(p2) <- c(100, 1e6)
# the standard error computed is quite similar with the theory :
s1 <- apply(p2, 1, sd)
# theoretical sd :
s.th <- numeric(100)
v <- 1
for (i in which(!auxi)){
X <- cbind(1, tab[i,"x"])
u <- X%*%vcov(lm1)
u <- u%*%t(X)
u <- (summary(lm1)$sigma^2)+u
s.th[v] <- sqrt(u)
v <- v+1
}
aux <- order(preds$fit[,1])
matplot(sort(preds$fit[,1]), cbind(s.th, s1)[aux,], type="l", lty=1, col=c("black", "grey50"))
# what I did for the GLM with a Negative Binomial distribution
nb2_syn <- function(nobs = 50000, off = 0,
alpha = 1, xv = c(1, 0.75, -1.5), mu=rep(1,length(xv)-1), sigma=diag(length(xv)-1)) {
# xv sont les paramètres du modèle
#
p <- length(xv) - 1
X <- MASS:::mvrnorm(nobs, mu, sigma)
X <- cbind(1, X)
xb <- X %*% xv
a <- alpha
ia <- 1/a
exb <- exp(xb + off)
xg <- rgamma(nobs, a, a, ia)
xbg <-exb*xg
nby <- rpois(nobs, xbg)
out <- data.frame(cbind(nby, X[,-1]))
names(out) <- c("nby", paste("x", 1:p, sep=""))
return(out)
}
# simulation of the data set such as
# log(y) = -1 + x
# theta = 0.5
# mu(x) = 0.5
set.seed(100)
sim.data <- nb2_syn(nobs = 200, alpha=.5, xv = c(-1, 1),mu=0.5)
# computation of the model
mynb <- MASS:::glm.nb(nby ~ ., data = sim.data, subset=auxi)
# prediction of the values on the link space
preds <- predict(mynb, newdata=sim.data[!auxi,], se=T, type="link", interval="confidence")
#for each predicted value a random sample of 1000 values from a gaussian distribution
p2 <- rnorm(1e3*100, preds$fit, preds$se.fit)
# for each predicted value, for each random value, a random ssample of 1000 values
#from a Negative binomial distribution
p2 <- MASS:::rnegbin(100*1e6, exp(p2), mynb$theta)
# for each expected value (in row) a random sample of 1e6 values
dim(p2) <- c(100, 1e6)