3

I'm wondering about the following

The requirement is that the chi-squared distribution should be a good approximation to the distribution of the chi-squared statistic under the null hypothesis.

As the number of cells increases, it starts to matter very little what their expectations might be. Generally, when testing at a 5% or 1% level, you're fine even with 20% low-expectation cells.

Is there any literature that supports this statement?

When in doubt, run a small simulation with your data. R has that built right in to its chi-squared test function.

How would one go about writing a simulation to demonstrate this?

example data

df <- structure(list(x = structure(c(1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 3L, 1L, 1L, 3L, 6L, 1L, 1L, 1L, 3L, 1L, 1L, 4L, 3L, 3L, 4L, 1L, 1L, 5L, 5L, 4L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 8L, 3L, 3L, 4L, 3L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 4L, 3L, 4L, 3L, 4L, 1L, 3L, 1L, 1L, 4L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 4L, 1L, 3L, 6L, 3L, 3L, 1L, 1L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 4L, 3L, 1L, 1L, 1L, 6L, 4L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 5L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 6L, 3L, 1L, 1L, 4L, 3L, 4L, 4L, 3L, 3L, 1L, 1L, 6L, 1L, 1L, 3L, 8L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 5L, 5L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 6L, 3L, 1L, 1L, 3L, 1L, 4L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 4L, 4L, 1L, 3L, 1L, 3L, 3L, 4L, 1L, 3L, 5L, 3L, 1L, 3L, 3L, 5L, 1L, 3L, 1L, 1L, 3L, 5L, 4L, 1L, 1L, 1L, 1L, 6L, 3L, 3L, 6L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 5L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 7L, 1L, 4L, 5L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 3L, 6L, 3L, 3L, 2L, 1L, 3L, 1L, 2L, 6L, 3L, 4L, 3L, 4L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 6L, 3L, 3L, 1L, 1L, 3L, 4L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 6L, 3L, 1L, 1L, 3L, 1L, 1L, 6L, 3L, 3L, 3L, 1L, 3L, 1L, 4L, 6L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 6L, 1L, 3L, 1L, 3L, 3L, 4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 6L, 4L, 4L, 1L, 3L, 2L, 5L, 3L, 2L, 2L, 2L, 3L, 4L, 3L, 6L, 3L, 3L, 3L, 4L, 1L, 3L, 1L, 6L, 4L, 1L, 3L, 3L, 3L, 1L, 5L, 2L, 6L, 3L, 1L, 1L, 3L, 3L, 5L, 3L, 1L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 6L, 4L, 3L, 3L, 1L, 1L, 1L, 6L, 1L, 3L, 6L, 1L, 1L, 6L, 5L, 1L, 1L, 1L, 1L, 3L, 3L, 
4L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 4L, 1L, 4L, 1L, 3L, 3L, 1L, 6L, 4L, 4L, 1L, 3L, 4L, 3L, 1L, 3L, 5L, 3L, 1L, 3L, 5L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 4L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 1L, 3L, 3L, 3L, 1L, 1L, 4L, 1L, 1L, 6L, 1L, 2L, 6L, NA, 6L, 7L, 1L, 3L, 6L, 1L, 3L, 3L, 7L, 4L, 1L, 4L, 1L, 1L, 2L, 3L, 4L, 3L, 1L, 4L, 1L, 3L, 1L, 4L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 5L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 5L, 3L, 1L, 1L, 5L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 5L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 4L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 6L, 1L, 1L, 3L, 3L, 4L, 4L, 1L, 1L, 1L, 5L, 3L, 3L, 3L, 4L, 1L, 2L, 3L, 2L, 6L, 2L, 3L, 5L, 1L, 3L, 3L, 3L, 3L, 1L, 4L, 3L, 1L, 4L, 1L, 3L, 1L, 5L), .Label = c("A", "B", "C", "D", "E", "F", "G", "H"), class = "factor"), y = c(1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L)), row.names = c(NA, -707L), class = "data.frame") 
baxx
  • 936

2 Answers

5

We want to know whether, under the null hypothesis, the chi-squared distribution is a good approximation to the distribution of the chi-squared statistic. In fact, we are especially interested in knowing whether, under the null hypothesis, the probability of rejecting the null equals the significance level (5% or 1%).

The null hypothesis depends on whether we are talking about an independence (or homogeneity) test or a goodness of fit test. However, we can generate samples using the null and check how many of them would be rejected.

For example, let's suppose we want to run a goodness of fit test with sample size 150 and cell probabilities (.4, .4, .1, .05, .03, .02). We can run the following simulation (inefficient code, kept simple for clarity):

> probs <- c(.4, .4, .1, .05, .03, .02)
> n <- 150
> probs*n #expected values
[1] 60.0 60.0 15.0  7.5  4.5  3.0
> 
> s <- 1e5
> res <- rep(NA, s)
> for (i in 1:s) {
+   mostra <- sample(1:6, n, replace = TRUE, prob=probs)
+   mostra <- factor(mostra, levels=1:6)
+   res[i] <- chisq.test(table(mostra),
+                        p=probs)$p.value
+ }
There were 50 or more warnings (use warnings() to see the first 50)
> hist(res)
> table(res<.05)/s

  FALSE    TRUE 
0.94909 0.05091 
> table(res<.01)/s

  FALSE    TRUE 
0.98761 0.01239 

Although we have some rather small bins — borderline cases, at least — the histogram (not included) shows that the distribution of p-values is approximately uniform, and we can see that we would reject about 5% and about 1% of the time at those significance levels. Therefore, chisq.test seems to be working quite well for this test.

Pere
  • 6,583
  • how would one go about writing a simulation for this though? I've added some example data to the OP – baxx Mar 02 '19 at 18:20
  • You have here one example code. I tried to copy your data but it doesn't seem to work. I think there is a problem with parentheses count and - maybe - with a too long line. – Pere Mar 02 '19 at 18:41
  • thanks, I'm a little unsure how to interpret the part about the uniform distribution. Because the p-values are roughly uniform (I've plotted them from your code), we would reject ~5% if we were to reject values <0.05 (as in table(res<0.05)/s). If the test wasn't working so well would we expect to see this value increase / decrease? (I've tried using different probabilities, and it seems pretty robust) – baxx Mar 02 '19 at 20:40
  • example with different probabilities : http://vpaste.net/torSn , which still seems to work out alright even though ~40% of the cells have what could be considered a low count – baxx Mar 02 '19 at 20:42
  • About uniform distribution of p-values: P-values are computed by evaluating the inverse distribution function of a chi squared distribution on the the statistics. If the statistics is actually distributed according to that chi squared distribution, it will be uniformly distributed between 0 and 1. For example, 5% of all values of the test statistic will be under the 5% p-value threshold - and the same is true for any other percentage. – Pere Mar 02 '19 at 21:21
3

The solution by @Pere sets up the simulation but then extracts p-values from the chi-squared test. However, those p-values are based on the chi-squared approximation, and the goal of the simulation is to avoid making that approximation in the first place.

So extract the statistics instead of the p-values. Then, after a large number of simulations, check how often the simulated statistic is at least as large (i.e., at least as extreme) as the observed statistic. This proportion is the simulation p-value.

You can actually do this quite easily with chisq.test, since it has an option to simulate rather than approximate the p-value. I assume that is what the comment on your previous question refers to.

# Monte Carlo version of the test built into base R: with
# simulate.p.value = TRUE the p-value comes from B resampled contingency
# tables instead of the chi-squared approximation, which is why the output
# reports df = NA (no asymptotic degrees of freedom are used).
chisq.test(df$x, df$y, simulate.p.value = TRUE, B = 10000)
#   Pearson's Chi-squared test with simulated p-value (based on
#   10000 replicates)
#
# data:  df$x and df$y
# X-squared = 21.191, df = NA, p-value = 0.0032

If you want to write a simulation yourself, here is one way to do it. It is not the same method as what you get with simulate.p.value = TRUE: this uses permutations, not bootstrapping.

library("tidyverse")

# Extract the chi-square test statistic for a data frame with columns x and y.
# The "chi-squared approximation may be incorrect" warning is suppressed on
# purpose: only the statistic is used downstream, never the approximate
# p-value that the warning is about.
chisq.stat <- function(df) {
  test_result <- suppressWarnings(chisq.test(df$x, df$y))
  test_result[["statistic"]]
}

# Convert the question's example data to a tibble and keep complete cases
# only (the x column contains one NA). The original line,
# `tibble(x = x, y = y)`, referenced free variables `x` and `y` that are
# never defined -- the columns live inside `df` itself.
df <- as_tibble(df) %>% drop_na()

# Observed chi-square statistic for the actual data; this is the reference
# value the permutation distribution will be compared against.
Xsq.obsr <- chisq.stat(df)
Xsq.obsr
# X-squared 
#   21.1907 

# Build the permutation null distribution: 10000 shuffles of y against a
# fixed x. Under the null of independence, shuffling y destroys any x-y
# association, so the statistics computed on the shuffled copies sample the
# null distribution of the test statistic.
Xsq.perm <- df %>%
  # Replicate the data 10000 times, each copy tagged with its own id `rep`
  crossing(rep = seq(10000)) %>%
  group_by(rep) %>%
  # Create permutations by shuffling y within each replicate while keeping
  # x in place
  mutate(y = sample(y, replace = FALSE)) %>%
  ungroup() %>%
  # One nested data frame per permutation. Note: `nest(- rep)` is the
  # deprecated tidyr interface; the supported spelling is `nest(data = -rep)`.
  nest(data = -rep) %>%
  # Compute the chi-square test statistic for each permutation
  # (also fixes the `statisic` -> `statistic` typo from the original)
  mutate(statistic = map_dbl(data, chisq.stat))

# Permutation p-value: the proportion of permuted statistics at least as
# large (i.e., at least as extreme) as the observed statistic.
pval.perm <- mean(Xsq.perm$statistic >= Xsq.obsr)
pval.perm
# [1] 0.0029
dipetkov
  • 9,805