I'm performing independence tests using the gTest function of R library Rfast but I noticed that segmentation faults occur randomly.
I followed the setting of this question so the data type and the constraints to make the function run (documentation) (minimum value = 0 and consecutive data) are satisfied, but still the function does not always work and I don't know how to fix this issue.
The idea behind the code is to have a function that performs the conditional test of c1 on c2 conditioned on c3 (if provided) as in the setting that I followed.
library(readr)
library(bnlearn)
library(Rcpp)
library(RcppZiggurat)
library(Rfast)
debug <- FALSE
# columns specification
c1 <- "ECO2"
c2 <- "SAO2"
s <- c("VLNG")
test_type <- "g2"
for(i in 0:100){
print(i)
# dataset reading
dataset <- structure(list(SAO2 = c("LOW", "LOW", "LOW", "LOW", "HIGH", "LOW",
"NORMAL", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "HIGH", "LOW",
"LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW",
"LOW", "LOW", "HIGH", "LOW", "LOW", "LOW", "HIGH", "LOW", "NORMAL",
"LOW", "HIGH", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW",
"LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW",
"LOW", "LOW", "LOW", "LOW", "HIGH", "LOW", "LOW", "LOW", "LOW",
"LOW", "LOW", "LOW", "LOW", "LOW", "HIGH", "LOW", "HIGH", "LOW",
"LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "LOW",
"LOW", "LOW", "LOW", "LOW", "LOW", "LOW", "HIGH", "LOW", "LOW",
"LOW", "HIGH", "LOW", "HIGH", "LOW", "LOW", "LOW", "LOW", "LOW",
"HIGH", "LOW", "LOW", "LOW"),
ECO2 = c("ZERO", "ZERO", "ZERO",
"LOW", "LOW", "ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO",
"ZERO", "ZERO", "LOW", "ZERO", "LOW", "HIGH", "ZERO", "ZERO",
"HIGH", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO",
"ZERO", "LOW", "ZERO", "LOW", "ZERO", "ZERO", "ZERO", "LOW",
"LOW", "LOW", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "LOW",
"ZERO", "ZERO", "ZERO", "ZERO", "LOW", "LOW", "ZERO", "LOW",
"ZERO", "ZERO", "LOW", "LOW", "LOW", "ZERO", "ZERO", "ZERO",
"ZERO", "HIGH", "ZERO", "ZERO", "ZERO", "ZERO", "LOW", "LOW",
"LOW", "LOW", "ZERO", "ZERO", "ZERO", "HIGH", "ZERO", "HIGH",
"ZERO", "ZERO", "HIGH", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "LOW", "LOW",
"ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO", "ZERO",
"ZERO"),
VLNG = c("ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "LOW",
"ZERO", "ZERO", "HIGH", "ZERO", "ZERO", "HIGH", "ZERO", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO",
"LOW", "ZERO", "ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO",
"ZERO", "LOW", "ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO",
"LOW", "LOW", "ZERO", "ZERO", "ZERO", "ZERO", "HIGH", "ZERO",
"ZERO", "ZERO", "ZERO", "LOW", "ZERO", "LOW", "ZERO", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "HIGH", "ZERO", "ZERO", "HIGH",
"ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO",
"ZERO", "ZERO", "ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO",
"ZERO", "ZERO", "LOW", "ZERO", "ZERO", "ZERO")), class = "data.frame", row.names = c(NA, -100L))
# converting string columns to factor to perform the test
if (test_type=="x2" || test_type=="g2"){
dataset[] <- lapply( dataset, factor) # the "[]" keeps the dataframe structure
}
# finding the correct indexes for the columns
n <- colnames(dataset)
col_c1 <- match(c1, n)
col_c2 <- match(c2, n)
cols_c3 <- c()
uni <- c(length(unique(dataset[c1])[[1]])[[1]],length(unique(dataset[c2])[[1]])[[1]])
if (!s[1]=="()"){
for(v in s){
idx <- match(v, n)
cols_c3 <- append(cols_c3,idx)
uni <- append(uni,length(unique(dataset[v])[[1]])[[1]])
}
if(debug){print("A")}
for (nn in n){
dataset[nn] <- unclass(as.factor(dataset[nn][[1]]))
}
if(debug){print("B")}
ds <- as.matrix(dataset)
if(debug){print("C")}
# make minimum value = 0
ds <- ds - 1
if(debug){
# look at the number of unique values before changing, as a means of validation
print(sapply(1:ncol(ds), function(x) length(unique(ds[, x]))))
# look at the minimum, as a means of validation
print(sapply(1:ncol(ds), function(x) min(ds[,x])))
}
tellMe <- c(1:ncol(ds))
tellMe <- tellMe[-c(col_c1, col_c2, sort(cols_c3))]
# rearrange using the indices
ds <- ds[, c(col_c1, col_c2, sort(cols_c3), tellMe)]
if(debug){
print(sapply(1:ncol(ds), function(x) min(ds[,x])))
print(sapply(1:ncol(ds), function(x) length(unique(ds[, x]))))
print("D")
print(dimnames(ds))
print("E")
}
colnames(ds) <- NULL
ds <- ds[,1:2+length(cols_c3)]
if (length(cols_c3) == 0){
print("UNIVARIATE")
res <- g2Test_univariate(ds[,1:2], uni)
}else{
if(debug){print("MULTIVARIATE")}
counter_c <- c(1:length(cols_c3)) + 2
if(debug){
print(counter_c)
print(uni)
}
# always data in order: c_1 on column 1, c_2 on column 2 and the other from 3 to go
res <- g2Test(ds, 1, 2, counter_c, uni)
}
p_val <- pchisq(q= res$statistic, df=res$df, lower.tail=FALSE)
t <- list("p.value" = p_val, "statistic" = res$statistic)
if(debug){print(t)}
}
}
This code should return the list of numbers from 0 to 100 but it outputs a segmentation fault before it ends after the call of gTest. Please notice that it occurs at random, meaning that it can occur at the first iteration of the for cycle as well as at the last one. I also included some debugging print, set debug <- TRUE to turn them on