summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--all.h5
-rw-r--r--rega.c420
2 files changed, 261 insertions, 164 deletions
diff --git a/all.h b/all.h
index f24d099..1e9476d 100644
--- a/all.h
+++ b/all.h
@@ -288,8 +288,9 @@ struct Tmp {
 	short slot; /* -1 for unset */
 	short cls;
 	struct {
-		int r;
-		bits m;
+		int r;  /* register or -1 */
+		int w;  /* weight */
+		bits m; /* avoid these registers */
 	} hint;
 	int phi;
 	Alias alias;
diff --git a/rega.c b/rega.c
index 02429a6..2f01c07 100644
--- a/rega.c
+++ b/rega.c
@@ -10,6 +10,7 @@ typedef struct RMap RMap;
 struct RMap {
 	int t[Tmp0];
 	int r[Tmp0];
+	int w[Tmp0];   /* wait list, for unmatched hints */
 	BSet b[1];
 	int n;
 };
@@ -20,8 +21,12 @@ static Mem *mem;       /* function mem references */
 static struct {
 	Ref src, dst;
 	int cls;
-} *pm;                 /* parallel move constructed */
-static int cpm, npm;   /* capacity and size of pm */
+} pm[Tmp0];            /* parallel move constructed */
+static int npm;        /* size of pm */
+static int loop;       /* current loop level */
+
+static uint stmov;     /* stats: added moves */
+static uint stblk;     /* stats: added blocks */
 
 static int *
 hint(int t)
@@ -32,21 +37,21 @@ hint(int t)
 static void
 sethint(int t, int r)
 {
-	bits m;
+	Tmp *p;
 
-	m = tmp[phicls(t, tmp)].hint.m;
-	if (*hint(t) == -1)
-	if (!(BIT(r) & m))
-		*hint(t) = r;
+	p = &tmp[phicls(t, tmp)];
+	if (p->hint.r == -1 || p->hint.w > loop) {
+		p->hint.r = r;
+		p->hint.w = loop;
+		tmp[t].visit = -1;
+	}
 }
 
 static void
 rcopy(RMap *ma, RMap *mb)
 {
-	memcpy(ma->t, mb->t, sizeof ma->t);
-	memcpy(ma->r, mb->r, sizeof ma->r);
+	memcpy(ma, mb, sizeof *ma);
 	bscopy(ma->b, mb->b);
-	ma->n = mb->n;
 }
 
 static int
@@ -93,10 +98,10 @@ radd(RMap *m, int t, int r)
 }
 
 static Ref
-ralloc(RMap *m, int t)
+ralloctry(RMap *m, int t, int try)
 {
 	bits regs;
-	int r, r0, r1;
+	int h, r, r0, r1;
 
 	if (t < Tmp0) {
 		assert(bshas(m->b, t));
@@ -107,8 +112,12 @@ ralloc(RMap *m, int t)
 		assert(r != -1);
 		return TMP(r);
 	}
-	r = *hint(t);
+	r = tmp[t].visit;
+	if (r == -1 || bshas(m->b, r))
+		r = *hint(t);
 	if (r == -1 || bshas(m->b, r)) {
+		if (try)
+			return R;
 		regs = tmp[phicls(t, tmp)].hint.m;
 		regs |= m->b->t[0];
 		if (KBASE(tmp[t].cls) == 0) {
@@ -129,9 +138,19 @@ ralloc(RMap *m, int t)
 Found:
 	radd(m, t, r);
 	sethint(t, r);
+	tmp[t].visit = r;
+	h = *hint(t);
+	if (h != -1 && h != r)
+		m->w[h] = t;
 	return TMP(r);
 }
 
+static inline Ref
+ralloc(RMap *m, int t)
+{
+	return ralloctry(m, t, 0);
+}
+
 static int
 rfree(RMap *m, int t)
 {
@@ -168,12 +187,8 @@ mdump(RMap *m)
 static void
 pmadd(Ref src, Ref dst, int k)
 {
-	if (npm == cpm) {
-		cpm = cpm * 2 + 16;
-		pm = realloc(pm, cpm * sizeof pm[0]);
-		if (!pm)
-			die("pmadd, out of memory");
-	}
+	if (npm == Tmp0)
+		die("cannot have more moves than registers");
 	pm[npm].src = src;
 	pm[npm].dst = dst;
 	pm[npm].cls = k;
@@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k)
 
 enum PMStat { ToMove, Moving, Moved };
 
-static Ref
+static int
 pmrec(enum PMStat *status, int i, int *k)
 {
-	Ref swp, swp1;
-	int j, k1;
+	int j, c;
 
 	/* note, this routine might emit
-	 * too many large instructions:
-	 *
-	 *                  , x -- x
-	 *      x -- x -- x        |
-	 *                  ` x -- x
-	 *
-	 * if only the first move is wide
-	 * the whole cycle will be wide,
-	 * this is safe but not necessary
+	 * too many large instructions
 	 */
-
-	if (req(pm[i].src, pm[i].dst))
-		return R;
-	status[i] = Moving;
-	assert(KBASE(*k) == KBASE(pm[i].cls));
-	assert((Kw|1) == Kl && (Ks|1) == Kd);
-	*k |= KWIDE(pm[i].cls); /* see above */
-	swp = R;
-	for (j=0; j<npm; j++) {
-		if (req(pm[j].src, pm[i].dst))
-			switch (status[j]) {
-			case ToMove:
-				k1 = *k;
-				swp1 = pmrec(status, j, &k1);
-				if (!req(swp1, R)) {
-					assert(req(swp, R));
-					swp = swp1;
-					*k = k1;
-				}
-				break;
-			case Moving:
-				assert(req(swp, R));
-				swp = pm[i].dst;
-				break;
-			case Moved:
-				break;
-			}
+	if (req(pm[i].src, pm[i].dst)) {
+		status[i] = Moved;
+		return -1;
+	}
+	assert(KBASE(pm[i].cls) == KBASE(*k));
+	assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
+	*k |= pm[i].cls;
+	for (j=0; j<npm; j++)
+		if (req(pm[j].dst, pm[i].src))
+			break;
+	switch (j == npm ? Moved : status[j]) {
+	case Moving:
+		c = j; /* start of cycle */
+		emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+		break;
+	case ToMove:
+		status[i] = Moving;
+		c = pmrec(status, j, k);
+		if (c == i) {
+			c = -1; /* end of cycle */
+			break;
+		}
+		if (c != -1) {
+			emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+			break;
+		}
+		/* fall through */
+	case Moved:
+		c = -1;
+		emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
+		break;
 	}
 	status[i] = Moved;
-	if (req(swp, R)) {
-		*curi++ = (Ins){Ocopy, pm[i].dst, {pm[i].src}, pm[i].cls};
-		return R;
-	} else if (!req(swp, pm[i].src)) {
-		*curi++ = (Ins){Oswap, R, {pm[i].src, pm[i].dst}, *k};
-		return swp;
-	} else
-		return R;
-
+	return c;
 }
 
 static void
 pmgen()
 {
-	int i, k;
+	int i;
 	enum PMStat *status;
 
 	status = alloc(npm * sizeof status[0]);
 	assert(!npm || status[npm-1] == ToMove);
-	curi = insb;
 	for (i=0; i<npm; i++)
-		if (status[i] == ToMove) {
-			k = pm[i].cls;
-			pmrec(status, i, &k);
-		}
+		if (status[i] == ToMove)
+			pmrec(status, i, (int[]){pm[i].cls});
 }
 
 static void
@@ -261,8 +260,9 @@ move(int r, Ref to, RMap *m)
 	int n, t, r1;
 
 	r1 = req(to, R) ? -1 : rfree(m, to.val);
-	if (bshas(m->b, r) && r1 != r) {
+	if (bshas(m->b, r)) {
 		/* r is used and not by to */
+		assert(r1 != r);
 		for (n=0; m->r[n] != r; n++)
 			assert(n+1 < m->n);
 		t = m->t[n];
@@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m)
 {
 	RMap m0;
 	int n, r, r1, t, s;
-	Ins *i0, *i1, *ip, *ir;
+	Ins *i1, *ip;
 	bits def;
 
-	m0 = *m;
+	m0 = *m; /* okay since we don't use m0.b */
+	m0.b->t = 0;
 	i1 = ++i;
 	do {
 		i--;
@@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m)
 			radd(m, r, r);
 	}
 	pmgen();
-#ifdef TEST_PMOV
-	return 0;
-#endif
-	n = b->nins - (i1 - i) + (curi - insb);
-	i0 = alloc(n * sizeof(Ins));
-	ip = icpy(ip = i0, b->ins, i - b->ins);
-	ip = icpy(ir = ip, insb, curi - insb);
-	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
-	b->nins = n;
-	b->ins = i0;
-	return ir;
+	return i;
 }
 
 static int
-prio(Ref r1, Ref r2)
+prio1(Ref r1, Ref r2)
 {
 	/* trivial heuristic to begin with,
 	 * later we can use the distance to
@@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p)
 	int i;
 
 	rs[i = p] = r;
-	while (i-- > 0 && prio(*r, *rs[i])) {
+	while (i-- > 0 && prio1(*r, *rs[i])) {
 		rs[i+1] = rs[i];
 		rs[i] = r;
 	}
@@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p)
 static void
 doblk(Blk *b, RMap *cur)
 {
-	int x, r, nr;
+	int t, x, r, rf, rt, nr;
 	bits rs;
-	Ins *i;
+	Ins *i, *i1;
 	Mem *m;
 	Ref *ra[4];
 
@@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur)
 		radd(cur, r, r);
 	if (rtype(b->jmp.arg) == RTmp)
 		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
-	for (i=&b->ins[b->nins]; i!=b->ins;) {
-		switch ((--i)->op) {
+	curi = &insb[NIns];
+	for (i1=&b->ins[b->nins]; i1!=b->ins;) {
+		emiti(*--i1);
+		i = curi;
+		rf = -1;
+		switch (i->op) {
 		case Ocall:
 			rs = T.argregs(i->arg[1], 0) | T.rglob;
 			for (r=0; T.rsave[r]>=0; r++)
@@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur)
 					rfree(cur, T.rsave[r]);
 			break;
 		case Ocopy:
-			if (isreg(i->arg[0])) {
-				i = dopm(b, i, cur);
+			if (regcpy(i)) {
+				curi++;
+				i1 = dopm(b, i1, cur);
+				stmov += i+1 - curi;
 				continue;
 			}
 			if (isreg(i->to))
@@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur)
 			if (!req(i->to, R)) {
 				assert(rtype(i->to) == RTmp);
 				r = i->to.val;
-				if (r >= Tmp0 || !(BIT(r) & T.rglob))
-					r = rfree(cur, r);
-				if (r == -1) {
+				if (r < Tmp0 && (BIT(r) & T.rglob))
+					break;
+				rf = rfree(cur, r);
+				if (rf == -1) {
 					assert(!isreg(i->to));
-					*i = (Ins){.op = Onop};
+					curi++;
 					continue;
 				}
-				i->to = TMP(r);
+				i->to = TMP(rf);
 			}
 			break;
 		}
@@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur)
 			}
 		for (r=0; r<nr; r++)
 			*ra[r] = ralloc(cur, ra[r]->val);
+
+		/* try to change the register of a hinted
+		 * temporary if rf is available */
+		x = 1;
+		if (rf != -1 && (t = cur->w[rf]) != 0)
+		if (!bshas(cur->b, rf) && *hint(t) == rf
+		&& (rt = rfree(cur, t)) != -1) {
+			tmp[t].visit = -1;
+			ralloc(cur, t);
+			assert(bshas(cur->b, rf));
+			emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
+			stmov += 1;
+			cur->w[rf] = 0;
+			for (r=0; r<nr; r++)
+				if (req(*ra[r], TMP(rt)))
+					*ra[r] = TMP(rf);
+			/* one could iterate this logic with
+			 * the newly freed rt, but in this case
+			 * the above loop must be changed */
+		}
 	}
+	b->nins = &insb[NIns] - curi;
+	idup(&b->ins, curi, b->nins);
+}
+
+/* qsort() comparison function to peel
+ * loop nests from inside out */
+static int
+carve(const void *a, const void *b)
+{
+	Blk *ba, *bb;
+
+	/* todo, evaluate if this order is really
+	 * better than the simple postorder */
+	ba = *(Blk**)a;
+	bb = *(Blk**)b;
+	if (ba->loop == bb->loop)
+		return ba->id > bb->id ? -1 : ba->id < bb->id;
+	return ba->loop > bb->loop ? -1 : +1;
+}
+
+/* comparison function to order temporaries
+ * for allocation at the end of blocks */
+static int
+prio2(int t1, int t2)
+{
+	if ((tmp[t1].visit ^ tmp[t2].visit) < 0)  /* != signs */
+		return tmp[t1].visit != -1 ? +1 : -1;
+	if ((*hint(t1) ^ *hint(t2)) < 0)
+		return *hint(t1) != -1 ? +1 : -1;
+	return tmp[t1].cost - tmp[t2].cost;
 }
 
 /* register allocation
@@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur)
 void
 rega(Fn *fn)
 {
-	int j, t, r, r1, x, rl[Tmp0];
-	Blk *b, *b1, *s, ***ps, *blist;
-	RMap *end, *beg, cur, old;
+	int j, t, r, x, rl[Tmp0];
+	Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
+	RMap *end, *beg, cur, old, *m;
 	Ins *i;
 	Phi *p;
 	uint u, n;
 	Ref src, dst;
 
 	/* 1. setup */
+	stmov = 0;
+	stblk = 0;
 	regu = 0;
 	tmp = fn->tmp;
 	mem = fn->mem;
+	blk = alloc(fn->nblk * sizeof blk[0]);
 	end = alloc(fn->nblk * sizeof end[0]);
 	beg = alloc(fn->nblk * sizeof beg[0]);
 	for (n=0; n<fn->nblk; n++) {
@@ -446,8 +497,15 @@ rega(Fn *fn)
 	bsinit(cur.b, fn->ntmp);
 	bsinit(old.b, fn->ntmp);
 
-	for (t=0; t<fn->ntmp; t++)
-		*hint(t) = t < Tmp0 ? t : -1;
+	loop = INT_MAX;
+	for (t=0; t<fn->ntmp; t++) {
+		tmp[t].hint.r = t < Tmp0 ? t : -1;
+		tmp[t].hint.w = loop;
+		tmp[t].visit = -1;
+	}
+	for (bp=blk, b=fn->start; b; b=b->link)
+		*bp++ = b;
+	qsort(blk, fn->nblk, sizeof blk[0], carve);
 	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
 		if (i->op != Ocopy || !isreg(i->arg[0]))
 			break;
@@ -456,71 +514,103 @@ rega(Fn *fn)
 			sethint(i->to.val, i->arg[0].val);
 		}
 
-	/* 2. assign registers following post-order */
-	for (n=fn->nblk-1; n!=-1u; n--) {
-		b = fn->rpo[n];
+	/* 2. assign registers */
+	for (bp=blk; bp<&blk[fn->nblk]; bp++) {
+		b = *bp;
+		n = b->id;
+		loop = b->loop;
 		cur.n = 0;
 		bszero(cur.b);
-		for (x=0; x<2; x++)
-			for (t=Tmp0; bsiter(b->out, &t); t++)
-				if (x || (r=*hint(t)) != -1)
-				if (x || !bshas(cur.b, r))
-					ralloc(&cur, t);
+		memset(cur.w, 0, sizeof cur.w);
+		for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
+			j = x++;
+			rl[j] = t;
+			while (j-- > 0 && prio2(t, rl[j]) > 0) {
+				rl[j+1] = rl[j];
+				rl[j] = t;
+			}
+		}
+		for (j=0; j<x; j++)
+			ralloctry(&cur, rl[j], 1);
+		for (j=0; j<x; j++)
+			ralloc(&cur, rl[j]);
 		rcopy(&end[n], &cur);
 		doblk(b, &cur);
 		bscopy(b->in, cur.b);
 		for (p=b->phi; p; p=p->link)
-			if (rtype(p->to) == RTmp) {
+			if (rtype(p->to) == RTmp)
 				bsclr(b->in, p->to.val);
-				/* heuristic 0:
-				 * if the phi destination has an
-				 * argument from a frequent block
-				 * that was already allocated to
-				 * 'r', use 'r' as the new hint
-				 */
-				memset(rl, 0, sizeof rl);
-				for (u=0; u<p->narg; u++) {
-					t = p->arg[u].val;
-					b1 = p->blk[u];
-					if (rtype(p->arg[u]) == RTmp)
-					if ((r=rfind(&end[b1->id], t)) != -1)
-						rl[r] += b1->loop;
-				}
-				for (x=0, j=0; j<Tmp0; j++)
-					if (rl[j] > rl[x])
-						x = j;
-				if (rl[x] >= b->loop)
-					*hint(p->to.val) = x;
+		rcopy(&beg[n], &cur);
+	}
+
+	/* 3. emit copies shared by multiple edges
+	 * to the same block */
+	for (s=fn->start; s; s=s->link) {
+		if (s->npred <= 1)
+			continue;
+		m = &beg[s->id];
+
+		/* rl maps a register that is live at the
+		 * beginning of s to the one used in all
+		 * predecessors (if any, -1 otherwise) */
+		memset(rl, 0, sizeof rl);
+
+		/* to find the register of a phi in a
+		 * predecessor, we have to find the
+		 * corresponding argument */
+		for (p=s->phi; p; p=p->link) {
+			r = rfind(m, p->to.val);
+			if (r == -1)
+				continue;
+			for (u=0; u<p->narg; u++) {
+				b = p->blk[u];
+				src = p->arg[u];
+				if (rtype(src) != RTmp)
+					continue;
+				x = rfind(&end[b->id], src.val);
+				assert(x != -1);
+				rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
 			}
-		if (b->npred > 1) {
-			/* heuristic 1:
-			 * attempt to satisfy hints
-			 * when it's simple and we have
-			 * multiple predecessors
-			 */
-			rcopy(&old, &cur);
-			curi = &insb[NIns];
-			for (j=0; j<old.n; j++) {
-				t = old.t[j];
-				r = *hint(t);
-				r1 = rfind(&cur, t);
-				if (r != -1 && r != r1)
-				if (!bshas(cur.b, r)) {
-					rfree(&cur, t);
-					radd(&cur, t, r);
-					x = tmp[t].cls;
-					emit(Ocopy, x, TMP(r1), TMP(r), R);
-				}
+			if (rl[r] == 0)
+				rl[r] = -1;
+		}
+
+		/* process non-phis temporaries */
+		for (j=0; j<m->n; j++) {
+			t = m->t[j];
+			r = m->r[j];
+			if (rl[r] || t < Tmp0 /* todo, remove this */)
+				continue;
+			for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
+				x = rfind(&end[(*bp)->id], t);
+				assert(x != -1);
+				rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
 			}
-			if ((j = &insb[NIns] - curi)) {
-				b->nins += j;
-				i = alloc(b->nins * sizeof(Ins));
-				icpy(icpy(i, curi, j), b->ins, b->nins-j);
-				b->ins = i;
+		}
+
+		npm = 0;
+		for (j=0; j<m->n; j++) {
+			t = m->t[j];
+			r = m->r[j];
+			x = rl[r];
+			assert(x != 0 || t < Tmp0 /* todo, ditto */);
+			if (x > 0) {
+				pmadd(TMP(x), TMP(r), tmp[t].cls);
+				m->r[j] = x;
 			}
 		}
-		rcopy(&beg[n], &cur);
+		curi = &insb[NIns];
+		pmgen();
+		j = &insb[NIns] - curi;
+		if (j == 0)
+			continue;
+		stmov += j;
+		s->nins += j;
+		i = alloc(s->nins * sizeof(Ins));
+		icpy(icpy(i, curi, j), s->ins, s->nins-j);
+		s->ins = i;
 	}
+
 	if (debug['R'])  {
 		fprintf(stderr, "\n> Register mappings:\n");
 		for (n=0; n<fn->nblk; n++) {
@@ -533,7 +623,7 @@ rega(Fn *fn)
 		fprintf(stderr, "\n");
 	}
 
-	/* 3. compose glue code */
+	/* 4. emit remaining copies in new blocks */
 	blist = 0;
 	for (b=fn->start;; b=b->link) {
 		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
@@ -560,8 +650,9 @@ rega(Fn *fn)
 				dst = rref(&beg[s->id], t);
 				pmadd(src, dst, tmp[t].cls);
 			}
+			curi = &insb[NIns];
 			pmgen();
-			if (curi == insb)
+			if (curi == &insb[NIns])
 				continue;
 			b1 = blknew();
 			b1->loop = (b->loop+s->loop) / 2;
@@ -569,8 +660,10 @@ rega(Fn *fn)
 			blist = b1;
 			fn->nblk++;
 			sprintf(b1->name, "%s_%s", b->name, s->name);
-			b1->nins = curi - insb;
-			idup(&b1->ins, insb, b1->nins);
+			b1->nins = &insb[NIns] - curi;
+			stmov += b1->nins;
+			stblk += 1;
+			idup(&b1->ins, curi, b1->nins);
 			b1->jmp.type = Jjmp;
 			b1->s1 = s;
 			**ps = b1;
@@ -585,6 +678,9 @@ rega(Fn *fn)
 	fn->reg = regu;
 
 	if (debug['R']) {
+		fprintf(stderr, "\n> Register allocation statistics:\n");
+		fprintf(stderr, "\tnew moves:  %d\n", stmov);
+		fprintf(stderr, "\tnew blocks: %d\n", stblk);
 		fprintf(stderr, "\n> After register allocation:\n");
 		printfn(fn, stderr);
 	}