From 577e93fe6d729b63447faad471fd0f5f2296f667 Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Mon, 3 Oct 2022 10:40:39 +0200 Subject: thread-local storage for arm64_apple It is documented nowhere how this is supposed to work. It is also quite easy to have assertion failures pop up in the linker when generating asm slightly different from clang's! The best source of information is found in LLVM's source code (AArch64ISelLowering.cpp). I paste it here for future reference: /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. The call 'blr x1' above is actually special in that it trashes fewer registers than the ABI would normally permit. In qbe, I don't take advantage of this and lower the call like a regular call. We can revise this later on. 
Again, the source for this information is LLVM's source code: // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). --- arm64/emit.c | 65 ++++++++++++++++++++++++++++++++++++++---------------------- arm64/isel.c | 31 ++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 27 deletions(-) (limited to 'arm64') diff --git a/arm64/emit.c b/arm64/emit.c index 7316f78..292dc79 100644 --- a/arm64/emit.c +++ b/arm64/emit.c @@ -245,33 +245,50 @@ emitf(char *s, Ins *i, E *e) static void loadaddr(Con *c, char *rn, E *e) { - static char *ldsym[][2] = { - /* arm64 */ - [0][0] = "\tadrp\t%s, %s%s%s\n", - [0][1] = "\tadd\t%s, %s, #:lo12:%s%s%s\n", - /* apple */ - [1][0] = "\tadrp\t%s, %s%s@page%s\n", - [1][1] = "\tadd\t%s, %s, %s%s@pageoff%s\n", - }; - char *p, *l, off[32]; + char *p, *l, *s; + + switch (c->reloc) { + default: + die("unreachable"); + case RelDef: + if (T.apple) + s = "\tadrp\tR, S@pageO\n" + "\tadd\tR, R, S@pageoffO\n"; + else + s = "\tadrp\tR, SO\n" + "\tadd\tR, R, #:lo12:SO\n"; + break; + case RelThr: + if (T.apple) + s = "\tadrp\tR, S@tlvppage\n" + "\tldr\tR, [R, S@tlvppageoff]\n"; + else + s = "\tmrs\tR, tpidr_el0\n" + "\tadd\tR, R, #:tprel_hi12:SO, lsl #12\n" + "\tadd\tR, R, #:tprel_lo12_nc:SO\n"; + break; + } - if (c->bits.i) - /* todo, handle large offsets */ - sprintf(off, "+%"PRIi64, c->bits.i); - else - off[0] = 0; l = str(c->label); p = l[0] == '"' ? 
"" : T.assym; - if (c->reloc == RelThr) { - fprintf(e->f, "\tmrs\t%s, tpidr_el0\n", rn); - fprintf(e->f, "\tadd\t%s, %s, #:tprel_hi12:%s%s%s, lsl #12\n", - rn, rn, p, l, off); - fprintf(e->f, "\tadd\t%s, %s, #:tprel_lo12_nc:%s%s%s\n", - rn, rn, p, l, off); - } else { - fprintf(e->f, ldsym[T.apple != 0][0], rn, p, l, off); - fprintf(e->f, ldsym[T.apple != 0][1], rn, rn, p, l, off); - } + for (; *s; s++) + switch (*s) { + default: + fputc(*s, e->f); + break; + case 'R': + fputs(rn, e->f); + break; + case 'S': + fputs(p, e->f); + fputs(l, e->f); + break; + case 'O': + if (c->bits.i) + /* todo, handle large offsets */ + fprintf(e->f, "+%"PRIi64, c->bits.i); + break; + } } static void diff --git a/arm64/isel.c b/arm64/isel.c index 320cf33..a8d36fa 100644 --- a/arm64/isel.c +++ b/arm64/isel.c @@ -70,20 +70,45 @@ static void fixarg(Ref *pr, int k, int phi, Fn *fn) { char buf[32]; - Ref r0, r1, r2; + Ref r0, r1, r2, r3; int s, n; - Con *c; + Con *c, cc; r0 = *pr; switch (rtype(r0)) { case RCon: + c = &fn->con[r0.val]; + if (T.apple + && c->type == CAddr + && c->reloc == RelThr) { + r1 = newtmp("isel", Kl, fn); + *pr = r1; + if (c->bits.i) { + r2 = newtmp("isel", Kl, fn); + cc = (Con){.type = CBits}; + cc.bits.i = c->bits.i; + r3 = newcon(&cc, fn); + emit(Oadd, Kl, r1, r2, r3); + r1 = r2; + } + emit(Ocopy, Kl, r1, TMP(R0), R); + r1 = newtmp("isel", Kl, fn); + r2 = newtmp("isel", Kl, fn); + emit(Ocall, 0, R, r1, CALL(33)); + emit(Ocopy, Kl, TMP(R0), r2, R); + emit(Oload, Kl, r1, r2, R); + cc = *c; + cc.bits.i = 0; + r3 = newcon(&cc, fn); + emit(Ocopy, Kl, r2, r3, R); + break; + } if (KBASE(k) == 0 && phi) return; r1 = newtmp("isel", k, fn); if (KBASE(k) == 0) { emit(Ocopy, k, r1, r0, R); } else { - c = &fn->con[r0.val]; n = stashbits(&c->bits, KWIDE(k) ? 8 : 4); vgrow(&fn->con, ++fn->ncon); c = &fn->con[fn->ncon-1]; -- cgit 1.4.1