summary refs log tree commit diff
path: root/arm64
diff options
context:
space:
mode:
authorQuentin Carbonneaux <quentin@c9x.me>2022-10-03 10:40:39 +0200
committerQuentin Carbonneaux <quentin@c9x.me>2022-10-12 21:11:41 +0200
commit577e93fe6d729b63447faad471fd0f5f2296f667 (patch)
tree411f67778f6ced6d6d2b2f45cc9e739d8a004d31 /arm64
parentb03a8970d7b73959397f0ca5c8f2a532c1905e5d (diff)
downloadroux-577e93fe6d729b63447faad471fd0f5f2296f667.tar.gz
thread-local storage for arm64_apple
It is documented nowhere how this is
supposed to work. It is also quite easy
to have assertion failures pop up in the
linker when generating asm slightly
different from clang's!

The best source of information is found
in LLVM's source code (AArch64ISelLowering.cpp).
I paste it here for future reference:

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///     ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.

The call 'blr x1' above is actually
special in that it trashes fewer registers
than the ABI would normally permit.
In qbe, I don't take advantage of this
and lower the call like a regular call.
We can revise this later on. Again, the
source for this information is LLVM's
source code:

// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
Diffstat (limited to 'arm64')
-rw-r--r--arm64/emit.c65
-rw-r--r--arm64/isel.c31
2 files changed, 69 insertions, 27 deletions
diff --git a/arm64/emit.c b/arm64/emit.c
index 7316f78..292dc79 100644
--- a/arm64/emit.c
+++ b/arm64/emit.c
@@ -245,33 +245,50 @@ emitf(char *s, Ins *i, E *e)
 static void
 loadaddr(Con *c, char *rn, E *e)
 {
-	static char *ldsym[][2] = {
-		/* arm64 */
-		[0][0] = "\tadrp\t%s, %s%s%s\n",
-		[0][1] = "\tadd\t%s, %s, #:lo12:%s%s%s\n",
-		/* apple */
-		[1][0] = "\tadrp\t%s, %s%s@page%s\n",
-		[1][1] = "\tadd\t%s, %s, %s%s@pageoff%s\n",
-	};
-	char *p, *l, off[32];
+	char *p, *l, *s;
+
+	switch (c->reloc) {
+	default:
+		die("unreachable");
+	case RelDef:
+		if (T.apple)
+			s = "\tadrp\tR, S@pageO\n"
+			    "\tadd\tR, R, S@pageoffO\n";
+		else
+			s = "\tadrp\tR, SO\n"
+			    "\tadd\tR, R, #:lo12:SO\n";
+		break;
+	case RelThr:
+		if (T.apple)
+			s = "\tadrp\tR, S@tlvppage\n"
+			    "\tldr\tR, [R, S@tlvppageoff]\n";
+		else
+			s = "\tmrs\tR, tpidr_el0\n"
+			    "\tadd\tR, R, #:tprel_hi12:SO, lsl #12\n"
+			    "\tadd\tR, R, #:tprel_lo12_nc:SO\n";
+		break;
+	}
 
-	if (c->bits.i)
-		/* todo, handle large offsets */
-		sprintf(off, "+%"PRIi64, c->bits.i);
-	else
-		off[0] = 0;
 	l = str(c->label);
 	p = l[0] == '"' ? "" : T.assym;
-	if (c->reloc == RelThr) {
-		fprintf(e->f, "\tmrs\t%s, tpidr_el0\n", rn);
-		fprintf(e->f, "\tadd\t%s, %s, #:tprel_hi12:%s%s%s, lsl #12\n",
-			rn, rn, p, l, off);
-		fprintf(e->f, "\tadd\t%s, %s, #:tprel_lo12_nc:%s%s%s\n",
-			rn, rn, p, l, off);
-	} else {
-		fprintf(e->f, ldsym[T.apple != 0][0], rn, p, l, off);
-		fprintf(e->f, ldsym[T.apple != 0][1], rn, rn, p, l, off);
-	}
+	for (; *s; s++)
+		switch (*s) {
+		default:
+			fputc(*s, e->f);
+			break;
+		case 'R':
+			fputs(rn, e->f);
+			break;
+		case 'S':
+			fputs(p, e->f);
+			fputs(l, e->f);
+			break;
+		case 'O':
+			if (c->bits.i)
+				/* todo, handle large offsets */
+				fprintf(e->f, "+%"PRIi64, c->bits.i);
+			break;
+		}
 }
 
 static void
diff --git a/arm64/isel.c b/arm64/isel.c
index 320cf33..a8d36fa 100644
--- a/arm64/isel.c
+++ b/arm64/isel.c
@@ -70,20 +70,45 @@ static void
 fixarg(Ref *pr, int k, int phi, Fn *fn)
 {
 	char buf[32];
-	Ref r0, r1, r2;
+	Ref r0, r1, r2, r3;
 	int s, n;
-	Con *c;
+	Con *c, cc;
 
 	r0 = *pr;
 	switch (rtype(r0)) {
 	case RCon:
+		c = &fn->con[r0.val];
+		if (T.apple
+		&& c->type == CAddr
+		&& c->reloc == RelThr) {
+			r1 = newtmp("isel", Kl, fn);
+			*pr = r1;
+			if (c->bits.i) {
+				r2 = newtmp("isel", Kl, fn);
+				cc = (Con){.type = CBits};
+				cc.bits.i = c->bits.i;
+				r3 = newcon(&cc, fn);
+				emit(Oadd, Kl, r1, r2, r3);
+				r1 = r2;
+			}
+			emit(Ocopy, Kl, r1, TMP(R0), R);
+			r1 = newtmp("isel", Kl, fn);
+			r2 = newtmp("isel", Kl, fn);
+			emit(Ocall, 0, R, r1, CALL(33));
+			emit(Ocopy, Kl, TMP(R0), r2, R);
+			emit(Oload, Kl, r1, r2, R);
+			cc = *c;
+			cc.bits.i = 0;
+			r3 = newcon(&cc, fn);
+			emit(Ocopy, Kl, r2, r3, R);
+			break;
+		}
 		if (KBASE(k) == 0 && phi)
 			return;
 		r1 = newtmp("isel", k, fn);
 		if (KBASE(k) == 0) {
 			emit(Ocopy, k, r1, r0, R);
 		} else {
-			c = &fn->con[r0.val];
 			n = stashbits(&c->bits, KWIDE(k) ? 8 : 4);
 			vgrow(&fn->con, ++fn->ncon);
 			c = &fn->con[fn->ncon-1];