In short, GCC appears to be allocating registers incorrectly for this code:
/* Pointer-sized signed integer; used here for the array elements and the counts. */
typedef intptr_t Int;
/*
 * Abridged excerpt of the addition routine: the three lone dots inside the
 * asm template stand for elided instructions.
 *
 * The visible part of the template zeroes %0 and %1, then loads y[%0] into
 * %2, turns %1 back into the carry flag with `neg`, adds %2 with carry into
 * x[%0], and saves the new carry in %1 with `sbb`.
 *
 * The template writes %0 and %1 before it reads x and y, which matters for
 * the output constraints noted below.
 */
Int add(Int *x, Int n, Int *y, Int m) {
Int r0, r1, r2;
asm (
R"(xor %k0, %k0
xor %k1, %k1
L0%=:
mov %2, [%[y] + %0 * 8]
neg %1
adc [%[x] + %0 * 8], %2
sbb %1, %1
.
.
.)"
/* Outputs use plain `=r` with no early-clobber modifier, so the compiler may
   give an output the same register as one of the inputs listed next. */
: "=r"(r0), "=r"(r1), "=r"(r2)
/* Inputs: the two array pointers and the two counts, each in a register. */
: [x]"r"(x), [n]"r"(n), [y]"r"(y), [m]"r"(m)
/* The template reads and writes memory through x and y. */
: "memory"
);
/* Debug output of n and the value about to be returned. */
printf("%ld %ld\n", n, r0);
return r0;
}
/*
add:
push rbx
xor ebx, ebx
xor edx, edx ; killing `y`
L017:
mov rcx, [rdx + rbx * 8] ; segfault
neg rdx
adc [rdi + rbx * 8], rcx
sbb rdx, rdx
*/
Even stranger, removing the printf line makes the code compile correctly.
The code is a basic implementation of bignum addition. The code itself may also contain bugs, but that is separate from this problem.
I read the answers to this question, and now I understand that the & is necessary to tell GCC that an input is reused after it has been consumed. I also read the manual, which states:
GCC may allocate the output operand in the same register as an unrelated input operand, on the assumption that the assembler code consumes its inputs before producing outputs.
But I still don't get why GCC thinks it's okay to overwrite y before it's "consumed", when there is no &. You can see in the code above that GCC is zeroing y before any value is ever read from it.
Full code:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
/* Shorthand: every `asm (...)` statement below expands to a volatile __asm__. */
#define asm __asm__ volatile
/* Pointer-sized signed integer; used for the array elements and the counts. */
typedef intptr_t Int;
/*
 * Adds the array y into the array x in place, element by element with carry
 * propagation, using x86-64 inline assembly.
 *
 * Parameters:
 *   x  array that is read and updated in place.
 *   n  count compared against the final index; presumably the current
 *      element count of x (confirm against the caller).
 *   y  array that is only read by the template.
 *   m  upper bound of the first loop; presumably the element count of y.
 *
 * Returns the final value of the index register r0:
 *   - if no carry leaves the first loop, the loop counter itself (the jump
 *     to the end label skips the cmovl, so n is not taken into account);
 *   - otherwise the larger of n and one past the last element touched by
 *     the carry loop.
 *
 * How the template works:
 *   - r0 is the element index, r1 holds the saved carry as 0 or -1, and r2
 *     is a scratch register for the element loaded from y.
 *   - `neg r1` turns the saved carry back into CF before the adc, and
 *     `sbb r1, r1` saves it again, because the following cmp overwrites CF.
 *   - the second loop keeps adding 1 to successive elements of x while the
 *     addition carries out.
 *
 * NOTE(review): the first loop tests its bound after the body, so one
 * element is processed even when m <= 0.
 * NOTE(review): the carry loop has no bound check against n.
 * NOTE(review): the `* 8` scaling and `qword ptr` assume 8-byte elements,
 * and the template is Intel syntax (presumably built with -masm=intel).
 */
Int add(Int *x, Int n, Int *y, Int m) {
    Int r0, r1, r2;
    asm (
R"(xor %k0, %k0
xor %k1, %k1
L0%=:
mov %2, [%[y] + %0 * 8]
neg %1
adc [%[x] + %0 * 8], %2
sbb %1, %1
inc %0
cmp %0, %[m]
jl L0%=
neg %1
jz end%=
carry%=:
inc %0
add qword ptr [%[x] + %0 * 8 - 8], 1
jc carry%=
cmp %0, %[n]
cmovl %0, %[n]
end%=:)"
        /* Early-clobber (`&`) outputs: the template writes r0 and r1 before it
           has finished reading the inputs, so they must not share a register
           with x, n, y or m. */
        : "=&r"(r0), "=&r"(r1), "=&r"(r2)
        : [x]"r"(x), [n]"r"(n), [y]"r"(y), [m]"r"(m)
        /* x is written and y is read through pointers that are not declared
           as memory operands. */
        : "memory"
    );
    /* Debug output; PRIdPTR is the conversion that matches intptr_t on every ABI. */
    printf("%" PRIdPTR " %" PRIdPTR "\n", n, r0);
    return r0;
}