summary refs log tree commit diff
path: root/src/rega.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/rega.c')
-rw-r--r--src/rega.c598
1 files changed, 598 insertions, 0 deletions
diff --git a/src/rega.c b/src/rega.c
new file mode 100644
index 0000000..7f8edcf
--- /dev/null
+++ b/src/rega.c
@@ -0,0 +1,598 @@
+#include "all.h"
+
+#ifdef TEST_PMOV
+	#undef assert
+	#define assert(x) assert_test(#x, x)
+#endif
+
+typedef struct RMap RMap;
+
+struct RMap {
+	int t[NIReg+NFReg];
+	int r[NIReg+NFReg];
+	BSet b[1];
+	int n;
+};
+
+static bits regu;      /* registers used */
+static Tmp *tmp;       /* function temporaries */
+static Mem *mem;       /* function mem references */
+static struct {
+	Ref src, dst;
+	int cls;
+} *pm;                 /* parallel move constructed */
+static int cpm, npm;   /* capacity and size of pm */
+
+static int *
+hint(int t)
+{
+	return &tmp[phicls(t, tmp)].hint.r;
+}
+
+static void
+sethint(int t, int r)
+{
+	bits m;
+
+	m = tmp[phicls(t, tmp)].hint.m;
+	if (*hint(t) == -1)
+	if (!(BIT(r) & m))
+		*hint(t) = r;
+}
+
+static void
+rcopy(RMap *ma, RMap *mb)
+{
+	memcpy(ma->t, mb->t, sizeof ma->t);
+	memcpy(ma->r, mb->r, sizeof ma->r);
+	bscopy(ma->b, mb->b);
+	ma->n = mb->n;
+}
+
+static int
+rfind(RMap *m, int t)
+{
+	int i;
+
+	for (i=0; i<m->n; i++)
+		if (m->t[i] == t)
+			return m->r[i];
+	return -1;
+}
+
+static Ref
+rref(RMap *m, int t)
+{
+	int r, s;
+
+	r = rfind(m, t);
+	if (r == -1) {
+		s = tmp[t].slot;
+		assert(s != -1 && "should have spilled");
+		return SLOT(s);
+	} else
+		return TMP(r);
+}
+
+static void
+radd(RMap *m, int t, int r)
+{
+	assert((t >= Tmp0 || t == r) && "invalid temporary");
+	assert(((RAX <= r && r < RAX + NIReg) || (XMM0 <= r && r < XMM0 + NFReg)) && "invalid register");
+	assert(!bshas(m->b, t) && "temporary has mapping");
+	assert(!bshas(m->b, r) && "register already allocated");
+	assert(m->n <= NIReg+NFReg && "too many mappings");
+	bsset(m->b, t);
+	bsset(m->b, r);
+	m->t[m->n] = t;
+	m->r[m->n] = r;
+	m->n++;
+	regu |= BIT(r);
+}
+
+static Ref
+ralloc(RMap *m, int t)
+{
+	bits regs;
+	int r, r0, r1;
+
+	if (t < Tmp0) {
+		assert(bshas(m->b, t));
+		return TMP(t);
+	}
+	if (bshas(m->b, t)) {
+		r = rfind(m, t);
+		assert(r != -1);
+		return TMP(r);
+	}
+	r = *hint(t);
+	if (r == -1 || bshas(m->b, r)) {
+		regs = tmp[phicls(t, tmp)].hint.m;
+		regs |= m->b->t[0];
+		switch (KBASE(tmp[t].cls)) {
+		case 0:
+			r0 = RAX;
+			r1 = RAX + NIReg;
+			break;
+		case 1:
+			r0 = XMM0;
+			r1 = XMM0 + NFReg;
+			break;
+		}
+		for (r=r0; r<r1; r++)
+			if (!(regs & BIT(r)))
+				goto Found;
+		for (r=r0; r<r1; r++)
+			if (!bshas(m->b, r))
+				goto Found;
+		diag("rega: no more regs");
+	}
+Found:
+	radd(m, t, r);
+	sethint(t, r);
+	return TMP(r);
+}
+
+static int
+rfree(RMap *m, int t)
+{
+	int i, r;
+
+	if (!bshas(m->b, t))
+		return -1;
+	for (i=0; m->t[i] != t; i++)
+		assert(i+1 < m->n);
+	r = m->r[i];
+	bsclr(m->b, t);
+	bsclr(m->b, r);
+	m->n--;
+	memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
+	memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
+	return r;
+}
+
+static void
+mdump(RMap *m)
+{
+	int i;
+
+	for (i=0; i<m->n; i++)
+		fprintf(stderr, " (%s, R%d)",
+			tmp[m->t[i]].name,
+			m->r[i]);
+	fprintf(stderr, "\n");
+}
+
+static void
+pmadd(Ref src, Ref dst, int k)
+{
+	if (npm == cpm) {
+		cpm = cpm * 2 + 16;
+		pm = realloc(pm, cpm * sizeof pm[0]);
+		if (!pm)
+			diag("pmadd: out of memory");
+	}
+	pm[npm].src = src;
+	pm[npm].dst = dst;
+	pm[npm].cls = k;
+	npm++;
+}
+
+enum PMStat { ToMove, Moving, Moved };
+
+static Ref
+pmrec(enum PMStat *status, int i, int *k)
+{
+	Ref swp, swp1;
+	int j, k1;
+
+	/* note, this routine might emit
+	 * too many large instructions:
+	 *
+	 *                  , x -- x
+	 *      x -- x -- x        |
+	 *                  ` x -- x
+	 *
+	 * if only the first move is wide
+	 * the whole cycle will be wide,
+	 * this is safe but not necessary
+	 */
+
+	if (req(pm[i].src, pm[i].dst))
+		return R;
+	status[i] = Moving;
+	assert(KBASE(*k) == KBASE(pm[i].cls));
+	assert((Kw|1) == Kl && (Ks|1) == Kd);
+	*k |= KWIDE(pm[i].cls); /* see above */
+	swp = R;
+	for (j=0; j<npm; j++) {
+		if (req(pm[j].src, pm[i].dst))
+			switch (status[j]) {
+			case ToMove:
+				k1 = *k;
+				swp1 = pmrec(status, j, &k1);
+				if (!req(swp1, R)) {
+					assert(req(swp, R));
+					swp = swp1;
+					*k = k1;
+				}
+				break;
+			case Moving:
+				assert(req(swp, R));
+				swp = pm[i].dst;
+				break;
+			case Moved:
+				break;
+			}
+	}
+	status[i] = Moved;
+	if (req(swp, R)) {
+		*curi++ = (Ins){OCopy, pm[i].dst, {pm[i].src}, pm[i].cls};
+		return R;
+	} else if (!req(swp, pm[i].src)) {
+		*curi++ = (Ins){OSwap, R, {pm[i].src, pm[i].dst}, *k};
+		return swp;
+	} else
+		return R;
+
+}
+
+static void
+pmgen()
+{
+	int i, k;
+	enum PMStat *status;
+
+	status = alloc(npm * sizeof status[0]);
+	assert(!npm || status[npm-1] == ToMove);
+	curi = insb;
+	for (i=0; i<npm; i++)
+		if (status[i] == ToMove) {
+			k = pm[i].cls;
+			pmrec(status, i, &k);
+		}
+}
+
+static void
+move(int r, Ref to, RMap *m)
+{
+	int n, t, r1;
+
+	r1 = req(to, R) ? -1 : rfree(m, to.val);
+	if (bshas(m->b, r) && r1 != r) {
+		/* r is used and not by to */
+		for (n=0; m->r[n] != r; n++)
+			assert(n+1 < m->n);
+		t = m->t[n];
+		rfree(m, t);
+		bsset(m->b, r);
+		ralloc(m, t);
+		bsclr(m->b, r);
+	}
+	t = req(to, R) ? r : to.val;
+	radd(m, t, r);
+}
+
+static int
+regcpy(Ins *i)
+{
+	return i->op == OCopy && isreg(i->arg[0]);
+}
+
+static Ins *
+dopm(Blk *b, Ins *i, RMap *m)
+{
+	RMap m0;
+	int n, r, r1, t, s;
+	Ins *i0, *i1, *ip, *ir;
+	bits def;
+
+	m0 = *m;
+	i1 = ++i;
+	do {
+		i--;
+		move(i->arg[0].val, i->to, m);
+	} while (i != b->ins && regcpy(i-1));
+	assert(m0.n <= m->n);
+	if (i != b->ins && (i-1)->op == OCall) {
+		def = retregs((i-1)->arg[1], 0);
+		for (r=0; r<NRSave; r++)
+			if (!(BIT(rsave[r]) & def))
+				move(rsave[r], R, m);
+	}
+	for (npm=0, n=0; n<m->n; n++) {
+		t = m->t[n];
+		s = tmp[t].slot;
+		r1 = m->r[n];
+		r = rfind(&m0, t);
+		if (r != -1)
+			pmadd(TMP(r1), TMP(r), tmp[t].cls);
+		else if (s != -1)
+			pmadd(TMP(r1), SLOT(s), tmp[t].cls);
+	}
+	for (ip=i; ip<i1; ip++) {
+		if (!req(ip->to, R))
+			rfree(m, ip->to.val);
+		r = ip->arg[0].val;
+		if (rfind(m, r) == -1)
+			radd(m, r, r);
+	}
+	pmgen();
+#ifdef TEST_PMOV
+	return 0;
+#endif
+	n = b->nins - (i1 - i) + (curi - insb);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, b->ins, i - b->ins);
+	ip = icpy(ir = ip, insb, curi - insb);
+	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
+	b->nins = n;
+	b->ins = i0;
+	return ir;
+}
+
+static int
+prio(Ref r1, Ref r2)
+{
+	/* trivial heuristic to begin with,
+	 * later we can use the distance to
+	 * the definition instruction
+	 */
+	(void) r2;
+	return *hint(r1.val) != -1;
+}
+
+static void
+insert(Ref *r, Ref **rs, int p)
+{
+	int i;
+
+	rs[i = p] = r;
+	while (i-- > 0 && prio(*r, *rs[i])) {
+		rs[i+1] = rs[i];
+		rs[i] = r;
+	}
+}
+
+static void
+doblk(Blk *b, RMap *cur)
+{
+	int x, r, nr;
+	bits rs;
+	Ins *i;
+	Mem *m;
+	Ref *ra[4];
+
+	if (rtype(b->jmp.arg) == RTmp)
+		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
+	else if (rtype(b->jmp.arg) == RACall) {
+		/* add return registers */
+		rs = retregs(b->jmp.arg, 0);
+		for (r=0; rs; rs/=2, r++)
+			if (rs & 1)
+				radd(cur, r, r);
+	}
+	for (i=&b->ins[b->nins]; i!=b->ins;) {
+		switch ((--i)->op) {
+		case OCall:
+			rs = argregs(i->arg[1], 0);
+			for (r=0; r<NRSave; r++)
+				if (!(BIT(rsave[r]) & rs))
+					rfree(cur, rsave[r]);
+			break;
+		case OCopy:
+			if (isreg(i->arg[0])) {
+				i = dopm(b, i, cur);
+				continue;
+			}
+			if (isreg(i->to))
+			if (rtype(i->arg[0]) == RTmp)
+				sethint(i->arg[0].val, i->to.val);
+			/* fall through */
+		default:
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				r = rfree(cur, i->to.val);
+				if (r == -1 && !isreg(i->to)) {
+					*i = (Ins){.op = ONop};
+					continue;
+				}
+				if (i->to.val >= Tmp0)
+					i->to = TMP(r);
+			}
+			break;
+		}
+		for (x=0, nr=0; x<2; x++)
+			switch (rtype(i->arg[x])) {
+			case RAMem:
+				m = &mem[i->arg[x].val & AMask];
+				if (rtype(m->base) == RTmp)
+					insert(&m->base, ra, nr++);
+				if (rtype(m->index) == RTmp)
+					insert(&m->index, ra, nr++);
+				break;
+			case RTmp:
+				insert(&i->arg[x], ra, nr++);
+				break;
+			}
+		for (r=0; r<nr; r++)
+			*ra[r] = ralloc(cur, ra[r]->val);
+	}
+}
+
+/* register allocation
+ * depends on rpo, phi, cost, (and obviously spill)
+ */
+void
+rega(Fn *fn)
+{
+	int j, n, t, r, r1, x, rl[Tmp0];
+	Blk *b, *b1, *s, ***ps, *blist;
+	RMap *end, *beg, cur, old;
+	Ins *i;
+	Phi *p;
+	uint u;
+	Ref src, dst;
+
+	/* 1. setup */
+	regu = 0;
+	tmp = fn->tmp;
+	mem = fn->mem;
+	end = alloc(fn->nblk * sizeof end[0]);
+	beg = alloc(fn->nblk * sizeof beg[0]);
+	for (n=0; n<fn->nblk; n++) {
+		bsinit(end[n].b, fn->ntmp);
+		bsinit(beg[n].b, fn->ntmp);
+	}
+	bsinit(cur.b, fn->ntmp);
+	bsinit(old.b, fn->ntmp);
+
+	for (t=Tmp0; t<fn->ntmp; t++)
+		*hint(t) = -1;
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OCopy || !isreg(i->arg[0]))
+			break;
+		else {
+			assert(rtype(i->to) == RTmp);
+			sethint(i->to.val, i->arg[0].val);
+		}
+
+	/* 2. assign registers following post-order */
+	for (n=fn->nblk-1; n>=0; n--) {
+		b = fn->rpo[n];
+		cur.n = 0;
+		bszero(cur.b);
+		for (x=0; x<2; x++)
+			for (t=Tmp0; t<fn->ntmp; t++) {
+				assert(bshas(b->out, t) ||
+					!bshas(cur.b, t));
+				if (bshas(b->out, t))
+				if (!bshas(cur.b, t))
+				if (x || (r=*hint(t)) != -1)
+				if (x || !bshas(cur.b, r))
+					ralloc(&cur, t);
+			}
+		rcopy(&end[n], &cur);
+		doblk(b, &cur);
+		bscopy(b->in, cur.b);
+		for (p=b->phi; p; p=p->link)
+			if (rtype(p->to) == RTmp) {
+				bsclr(b->in, p->to.val);
+				/* heuristic 0:
+				 * if the phi destination has an
+				 * argument from a frequent block
+				 * that was already allocated to
+				 * 'r', use 'r' as the new hint
+				 */
+				memset(rl, 0, sizeof rl);
+				for (u=0; u<p->narg; u++) {
+					t = p->arg[u].val;
+					b1 = p->blk[u];
+					if (rtype(p->arg[u]) == RTmp)
+					if ((r=rfind(&end[b1->id], t)) != -1)
+						rl[r] += b1->loop;
+				}
+				for (x=0, j=0; j<Tmp0; j++)
+					if (rl[j] > rl[x])
+						x = j;
+				if (rl[x] >= b->loop)
+					*hint(p->to.val) = x;
+			}
+		if (b->npred > 1) {
+			/* heuristic 1:
+			 * attempt to satisfy hints
+			 * when it's simple and we have
+			 * multiple predecessors
+			 */
+			rcopy(&old, &cur);
+			curi = &insb[NIns];
+			for (j=0; j<old.n; j++) {
+				t = old.t[j];
+				r = *hint(t);
+				r1 = rfind(&cur, t);
+				if (r != -1 && r != r1)
+				if (!bshas(cur.b, r)) {
+					rfree(&cur, t);
+					radd(&cur, t, r);
+					x = tmp[t].cls;
+					emit(OCopy, x, TMP(r1), TMP(r), R);
+				}
+			}
+			if ((j = &insb[NIns] - curi)) {
+				b->nins += j;
+				i = alloc(b->nins * sizeof(Ins));
+				icpy(icpy(i, curi, j), b->ins, b->nins-j);
+				b->ins = i;
+			}
+		}
+		rcopy(&beg[n], &cur);
+	}
+	if (debug['R'])  {
+		fprintf(stderr, "\n> Register mappings:\n");
+		for (n=0; n<fn->nblk; n++) {
+			b = fn->rpo[n];
+			fprintf(stderr, "\t%-10s beg", b->name);
+			mdump(&beg[n]);
+			fprintf(stderr, "\t           end");
+			mdump(&end[n]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	/* 3. compose glue code */
+	blist = 0;
+	for (b=fn->start;; b=b->link) {
+		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
+		for (; (s=**ps); ps++) {
+			npm = 0;
+			for (p=s->phi; p; p=p->link) {
+				dst = p->to;
+				assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
+				if (rtype(dst) == RTmp) {
+					r = rfind(&beg[s->id], dst.val);
+					if (r == -1)
+						continue;
+					dst = TMP(r);
+				}
+				for (u=0; p->blk[u]!=b; u++)
+					assert(u+1 < p->narg);
+				src = p->arg[u];
+				if (rtype(src) == RTmp)
+					src = rref(&end[b->id], src.val);
+				pmadd(src, dst, p->cls);
+			}
+			for (t=Tmp0; t<fn->ntmp; t++)
+				if (bshas(s->in, t)) {
+					src = rref(&end[b->id], t);
+					dst = rref(&beg[s->id], t);
+					pmadd(src, dst, tmp[t].cls);
+				}
+			pmgen();
+			if (curi == insb)
+				continue;
+			b1 = blknew();
+			b1->loop = (b->loop+s->loop) / 2;
+			b1->link = blist;
+			blist = b1;
+			fn->nblk++;
+			sprintf(b1->name, "%s_%s", b->name, s->name);
+			b1->nins = curi - insb;
+			idup(&b1->ins, insb, b1->nins);
+			b1->jmp.type = JJmp;
+			b1->s1 = s;
+			**ps = b1;
+		}
+		if (!b->link) {
+			b->link = blist;
+			break;
+		}
+	}
+	for (b=fn->start; b; b=b->link)
+		b->phi = 0;
+	fn->reg = regu;
+
+	if (debug['R']) {
+		fprintf(stderr, "\n> After register allocation:\n");
+		printfn(fn, stderr);
+	}
+}