I know this question is rather old, but I recently also needed to understand the compression of CUDA fatbinaries. It indeed seems to be a compression somewhat similar to LZ77. I wrote the following code that seems to decompress the actual text sections of compressed fatbinaries.
/**
 * Decompress a byte stream made of LZ77/LZ4-style sequences.
 *
 * Each sequence starts with a token byte:
 *   - high nibble: number of literal (non-compressed) bytes that follow;
 *   - low nibble:  match (compressed) length minus 4.
 * A nibble value of 0xf means the length continues in the following bytes:
 * every extension byte is added to the length, and a byte of 0xff means yet
 * another extension byte follows. After the literals comes a 2-byte
 * little-endian back offset, then the optional match-length extension bytes.
 * The stream may end right after the literals of the last sequence.
 *
 * @param input        compressed data
 * @param input_size   number of bytes available in input
 * @param output       destination buffer for the decompressed data
 * @param output_size  capacity of output in bytes
 * @return number of bytes written to output. Returns 0 (after printing a
 *         diagnostic to stderr) if the input is truncated or malformed, or
 *         if the decompressed data would not fit into output. Decoding stops
 *         early, returning the bytes written so far, when output becomes
 *         exactly full after a literal run.
 */
size_t decompress(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size)
{
    size_t ipos = 0, opos = 0;

    while (ipos < input_size) {
        const uint8_t token = input[ipos++];
        size_t next_nclen = (size_t)(token >> 4);        // length of next non-compressed segment
        size_t next_clen = 4 + (size_t)(token & 0xf);    // length of next compressed segment
        size_t back_offset;  // negative offset where redundant data is located, relative to opos

        // Literal length extension: keep adding bytes while they are 0xff.
        if (next_nclen == 0xf) {
            uint8_t ext;
            do {
                if (ipos >= input_size) {
                    fprintf(stderr, "Error decompressing: truncated literal length\n");
                    return 0;
                }
                ext = input[ipos++];
                next_nclen += ext;
            } while (ext == 0xff);
        }

        // ipos <= input_size and opos <= output_size always hold here, so the
        // subtractions below cannot wrap.
        if (next_nclen > input_size - ipos) {
            fprintf(stderr, "Error decompressing: literal run exceeds input\n");
            return 0;
        }
        if (next_nclen > output_size - opos) {
            fprintf(stderr, "Error decompressing: output buffer too small\n");
            return 0;
        }
        if (next_nclen > 0) {
            memcpy(output + opos, input + ipos, next_nclen);
        }
        ipos += next_nclen;
        opos += next_nclen;

        // The last sequence consists of literals only; also stop once the
        // output buffer is full.
        if (ipos >= input_size || opos >= output_size) {
            break;
        }

        if (input_size - ipos < 2) {
            fprintf(stderr, "Error decompressing: truncated back offset\n");
            return 0;
        }
        back_offset = (size_t)input[ipos] | ((size_t)input[ipos + 1] << 8);
        ipos += 2;

        // Match length extension, same scheme as for the literal length.
        if (next_clen == 0xf + 4) {
            uint8_t ext;
            do {
                if (ipos >= input_size) {
                    fprintf(stderr, "Error decompressing: truncated match length\n");
                    return 0;
                }
                ext = input[ipos++];
                next_clen += ext;
            } while (ext == 0xff);
        }

        // The match source must lie inside the data produced so far.
        if (back_offset == 0 || back_offset > opos) {
            fprintf(stderr, "Error decompressing: invalid back offset\n");
            return 0;
        }
        if (next_clen > output_size - opos) {
            fprintf(stderr, "Error decompressing: output buffer too small\n");
            return 0;
        }

        if (next_clen <= back_offset) {
            // Source and destination do not overlap.
            memcpy(output + opos, output + opos - back_offset, next_clen);
        } else {
            // Overlapping match: the source runs into bytes written by this
            // very copy, so it has to be done front to back, byte by byte.
            for (size_t i = 0; i < next_clen; i++) {
                output[opos + i] = output[opos + i - back_offset];
            }
        }
        opos += next_clen;
    }
    return opos;
}
I am no compression expert but I think this is a variant of LZ4 compression. There is some more code related to decoding the fatbinary headers here: https://github.com/n-eiling/cuda-fatbin-decompression.
The output is bit identical to using nvcc with the --no-compress flag.
vec_add.fatbin is clearly not compressed. – julian Oct 16 '18 at 14:14