summary refs log tree commit diff
path: root/spill.c
diff options
context:
space:
mode:
authorQuentin Carbonneaux <quentin.carbonneaux@yale.edu>2016-03-28 12:53:53 -0400
committerQuentin Carbonneaux <quentin.carbonneaux@yale.edu>2016-03-29 10:10:22 -0400
commitb75cb8388fb9b5f2393443d008bb46c522c5ec9b (patch)
tree25268fe5f71d826ee1f8f0e3a2a82aa68f9bf750 /spill.c
parent1b4943eb1f2a10837f56070bfe604179d0dc10e0 (diff)
downloadroux-b75cb8388fb9b5f2393443d008bb46c522c5ec9b.tar.gz
new layout, put LICENSE in root
Diffstat (limited to 'spill.c')
-rw-r--r--spill.c507
1 files changed, 507 insertions, 0 deletions
diff --git a/spill.c b/spill.c
new file mode 100644
index 0000000..72f8106
--- /dev/null
+++ b/spill.c
@@ -0,0 +1,507 @@
+#include "all.h"
+
+static void
+loopmark(Blk *hd, Blk *b, Phi *p)
+{
+	int k, head;
+	uint n, a;
+
+	head = hd->id;
+	if (b->id < head)
+		return;
+	for (; p; p=p->link)
+		for (a=0; a<p->narg; a++)
+			if (p->blk[a] == b)
+			if (rtype(p->arg[a]) == RTmp)
+				bsset(hd->gen, p->arg[a].val);
+	if (b->visit == head)
+		return;
+	b->visit = head;
+	b->loop *= 10;
+	/* aggregate looping information at
+	 * loop headers */
+	bsunion(hd->gen, b->gen);
+	for (k=0; k<2; k++)
+		if (b->nlive[k] > hd->nlive[k])
+			hd->nlive[k] = b->nlive[k];
+	for (n=0; n<b->npred; n++)
+		loopmark(hd, b->pred[n], b->phi);
+}
+
+static void
+tmpuse(Ref r, int use, int loop, Fn *fn)
+{
+	Mem *m;
+	Tmp *t;
+
+	if (rtype(r) == RAMem) {
+		m = &fn->mem[r.val & AMask];
+		tmpuse(m->base, 1, loop, fn);
+		tmpuse(m->index, 1, loop, fn);
+	}
+	else if (rtype(r) == RTmp && r.val >= Tmp0) {
+		t = &fn->tmp[r.val];
+		t->nuse += use;
+		t->ndef += !use;
+		t->cost += loop;
+	}
+}
+
+/* evaluate spill costs of temporaries,
+ * this also fills usage information
+ * requires rpo, preds
+ */
+void
+fillcost(Fn *fn)
+{
+	int n, hd;
+	uint a;
+	Blk *b;
+	Ins *i;
+	Tmp *t;
+	Phi *p;
+
+	for (b=fn->start; b; b=b->link) {
+		b->loop = 1;
+		b->visit = -1;
+	}
+	if (debug['S'])
+		fprintf(stderr, "\n> Loop information:\n");
+	for (n=0; n<fn->nblk; n++) {
+		b = fn->rpo[n];
+		hd = 0;
+		for (a=0; a<b->npred; a++)
+			if (b->pred[a]->id >= n) {
+				loopmark(b, b->pred[a], b->phi);
+				hd = 1;
+			}
+		if (hd && debug['S']) {
+			fprintf(stderr, "\t%-10s", b->name);
+			fprintf(stderr, " (% 3d ", b->nlive[0]);
+			fprintf(stderr, "% 3d) ", b->nlive[1]);
+			dumpts(b->gen, fn->tmp, stderr);
+		}
+	}
+	for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
+		t->cost = t-fn->tmp < Tmp0 ? 1e6 : 0;
+		t->nuse = 0;
+		t->ndef = 0;
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link) {
+			/* todo, the cost computation
+			 * for p->to is not great... */
+			tmpuse(p->to, 0, 0, fn);
+			for (a=0; a<p->narg; a++) {
+				n = p->blk[a]->loop;
+				assert(b->npred==p->narg &&
+					"wrong cfg");
+				n /= b->npred;
+				tmpuse(p->arg[a], 1, n, fn);
+			}
+		}
+		n = b->loop;
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			tmpuse(i->to, 0, n, fn);
+			tmpuse(i->arg[0], 1, n, fn);
+			tmpuse(i->arg[1], 1, n, fn);
+		}
+		tmpuse(b->jmp.arg, 1, n, fn);
+	}
+	if (debug['S']) {
+		fprintf(stderr, "\n> Spill costs:\n");
+		for (n=Tmp0; n<fn->ntmp; n++)
+			fprintf(stderr, "\t%-10s %d\n",
+				fn->tmp[n].name,
+				fn->tmp[n].cost);
+		fprintf(stderr, "\n");
+	}
+}
+
+static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
+static Tmp *tmp;  /* current temporaries (for tcmpX) */
+static int ntmp;  /* current # of temps (for limit) */
+static int locs;  /* stack size used by locals */
+static int slot4; /* next slot of 4 bytes */
+static int slot8; /* ditto, 8 bytes */
+static BSet mask[2][1]; /* class masks */
+
+static int
+tcmp0(const void *pa, const void *pb)
+{
+	return tmp[*(int *)pb].cost - tmp[*(int *)pa].cost;
+}
+
+static int
+tcmp1(const void *pa, const void *pb)
+{
+	int c;
+
+	c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
+	return c ? c : tcmp0(pa, pb);
+}
+
+static Ref
+slot(int t)
+{
+	int s;
+
+	if (t < Tmp0)
+		diag("spill: cannot spill register");
+	s = tmp[t].slot;
+	if (s == -1) {
+		assert(NAlign == 3);
+		/* nice logic to pack stack slots
+		 * on demand, there can be only
+		 * one hole and slot4 points to it
+		 *
+		 * invariant: slot4 <= slot8
+		 */
+		if (KWIDE(tmp[t].cls)) {
+			s = slot8;
+			if (slot4 == slot8)
+				slot4 += 2;
+			slot8 += 2;
+		} else {
+			s = slot4;
+			if (slot4 == slot8) {
+				slot8 += 2;
+				slot4 += 1;
+			} else
+				slot4 = slot8;
+		}
+		s += locs;
+		tmp[t].slot = s;
+	}
+	return SLOT(s);
+}
+
+static void
+limit(BSet *b, int k, BSet *f)
+{
+	static int *tarr, maxt;
+	int i, nt;
+	uint t;
+
+	nt = bscount(b);
+	if (nt <= k)
+		return;
+	if (nt > maxt) {
+		free(tarr);
+		tarr = emalloc(nt * sizeof tarr[0]);
+		maxt = nt;
+	}
+	for (i=0, t=0; bsiter(b, &t); t++) {
+		bsclr(b, t);
+		tarr[i++] = t;
+	}
+	if (!f)
+		qsort(tarr, nt, sizeof tarr[0], tcmp0);
+	else {
+		fst = f;
+		qsort(tarr, nt, sizeof tarr[0], tcmp1);
+	}
+	for (i=0; i<k && i<nt; i++)
+		bsset(b, tarr[i]);
+	for (; i<nt; i++)
+		slot(tarr[i]);
+}
+
+static void
+limit2(BSet *b1, int k1, int k2, BSet *fst)
+{
+	BSet b2[1];
+
+	bsinit(b2, ntmp); /* todo, free those */
+	bscopy(b2, b1);
+	bsinter(b1, mask[0]);
+	bsinter(b2, mask[1]);
+	limit(b1, NIReg - k1, fst);
+	limit(b2, NFReg - k2, fst);
+	bsunion(b1, b2);
+}
+
+static void
+sethint(BSet *u, bits r)
+{
+	uint t;
+
+	for (t=Tmp0; bsiter(u, &t); t++)
+		tmp[phicls(t, tmp)].hint.m |= r;
+}
+
+static void
+reloads(BSet *u, BSet *v)
+{
+	uint t;
+
+	for (t=Tmp0; bsiter(u, &t); t++)
+		if (!bshas(v, t))
+			emit(OLoad, tmp[t].cls, TMP(t), slot(t), R);
+}
+
+static void
+store(Ref r, int s)
+{
+	static int kstore[] = {
+		[Kw] = OStorew, [Kl] = OStorel,
+		[Ks] = OStores, [Kd] = OStored,
+	};
+
+	if (s != -1)
+		emit(kstore[tmp[r.val].cls], 0, R, r, SLOT(s));
+}
+
+static int
+regcpy(Ins *i)
+{
+	return i->op == OCopy && isreg(i->arg[0]);
+}
+
+static Ins *
+dopm(Blk *b, Ins *i, BSet *v)
+{
+	int n, t;
+	BSet u[1];
+	Ins *i1;
+	bits r;
+
+	bsinit(u, ntmp); /* todo, free those */
+	/* consecutive copies from
+	 * registers need to be handled
+	 * as one large instruction
+	 *
+	 * fixme: there is an assumption
+	 * that calls are always followed
+	 * by copy instructions here, this
+	 * might not be true if previous
+	 * passes change
+	 */
+	i1 = ++i;
+	do {
+		i--;
+		t = i->to.val;
+		if (!req(i->to, R))
+		if (bshas(v, t)) {
+			bsclr(v, t);
+			store(i->to, tmp[t].slot);
+		}
+		bsset(v, i->arg[0].val);
+	} while (i != b->ins && regcpy(i-1));
+	bscopy(u, v);
+	if (i != b->ins && (i-1)->op == OCall) {
+		v->t[0] &= ~retregs((i-1)->arg[1], 0);
+		limit2(v, NISave, NFSave, 0);
+		for (r=0, n=0; n<NRSave; n++)
+			r |= BIT(rsave[n]);
+		v->t[0] |= argregs((i-1)->arg[1], 0);
+	} else {
+		limit2(v, 0, 0, 0);
+		r = v->t[0];
+	}
+	sethint(v, r);
+	reloads(u, v);
+	do
+		emiti(*--i1);
+	while (i1 != i);
+	return i;
+}
+
+/* spill code insertion
+ * requires spill costs, rpo, liveness
+ *
+ * Note: this will replace liveness
+ * information (in, out) with temporaries
+ * that must be in registers at block
+ * borders
+ *
+ * Be careful with:
+ * - OCopy instructions to ensure register
+ *   constraints
+ */
+void
+spill(Fn *fn)
+{
+	Blk *b, *s1, *s2, *hd, **bp;
+	int j, n, l, t, k, lvarg[2];
+	BSet u[1], v[1], w[1];
+	Ins *i;
+	Phi *p;
+	Mem *m;
+	bits r;
+
+	tmp = fn->tmp;
+	ntmp = fn->ntmp;
+	bsinit(u, ntmp);
+	bsinit(v, ntmp);
+	bsinit(w, ntmp);
+	bsinit(mask[0], ntmp);
+	bsinit(mask[1], ntmp);
+	locs = fn->slot;
+	slot4 = 0;
+	slot8 = 0;
+	for (t=0; t<ntmp; t++) {
+		k = 0;
+		if (t >= XMM0 && t < XMM0 + NFReg)
+			k = 1;
+		else if (t >= Tmp0)
+			k = KBASE(tmp[t].cls);
+		bsset(mask[k], t);
+	}
+
+	for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
+		b = *--bp;
+		/* invariant: all bocks with bigger rpo got
+		 * their in,out updated. */
+
+		/* 1. find temporaries in registers at
+		 * the end of the block (put them in v) */
+		curi = 0;
+		s1 = b->s1;
+		s2 = b->s2;
+		hd = 0;
+		if (s1 && s1->id <= n)
+			hd = s1;
+		if (s2 && s2->id <= n)
+		if (!hd || s2->id >= hd->id)
+			hd = s2;
+		r = 0;
+		bszero(v);
+		if (hd) {
+			/* back-edge */
+			for (k=0; k<2; k++) {
+				n = k == 0 ? NIReg : NFReg;
+				bscopy(u, b->out);
+				bsinter(u, mask[k]);
+				bscopy(w, u);
+				bsinter(u, hd->gen);
+				bsdiff(w, hd->gen);
+				if ((int)bscount(u) < n) { /* fixme */
+					j = bscount(w);   /* live through */
+					l = hd->nlive[k];
+					limit(w, n - (l - j), 0);
+					bsunion(u, w);
+				} else
+					limit(u, n, 0);
+				bsunion(v, u);
+			}
+		} else if (s1) {
+			liveon(v, b, s1);
+			if (s2) {
+				liveon(u, b, s2);
+				bscopy(w, u);
+				bsinter(w, v);
+				bsunion(v, u);
+			}
+			limit2(v, 0, 0, w);
+		} else if (rtype(b->jmp.arg) == RACall) {
+			/* return */
+			r = retregs(b->jmp.arg, 0);
+			v->t[0] |= r;
+		}
+		bscopy(b->out, v);
+
+		/* 2. process the block instructions */
+		curi = &insb[NIns];
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			i--;
+			if (regcpy(i)) {
+				i = dopm(b, i, v);
+				continue;
+			}
+			bszero(w);
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				if (bshas(v, t))
+					bsclr(v, t);
+				else {
+					/* make sure we have a reg
+					 * for the result */
+					bsset(v, t);
+					bsset(w, t);
+				}
+			}
+			j = opdesc[i->op].nmem;
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RAMem)
+					j--;
+			for (n=0; n<2; n++)
+				switch (rtype(i->arg[n])) {
+				case RAMem:
+					t = i->arg[n].val;
+					m = &fn->mem[t & AMask];
+					if (rtype(m->base) == RTmp) {
+						bsset(v, m->base.val);
+						bsset(w, m->base.val);
+					}
+					if (rtype(m->index) == RTmp) {
+						bsset(v, m->index.val);
+						bsset(w, m->index.val);
+					}
+					break;
+				case RTmp:
+					t = i->arg[n].val;
+					lvarg[n] = bshas(v, t);
+					bsset(v, t);
+					if (j-- <= 0)
+						bsset(w, t);
+					break;
+				}
+			bscopy(u, v);
+			limit2(v, 0, 0, w);
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RTmp) {
+					t = i->arg[n].val;
+					if (!bshas(v, t)) {
+						/* do not reload if the
+						 * the temporary was dead
+						 */
+						if (!lvarg[n])
+							bsclr(u, t);
+						i->arg[n] = slot(t);
+					}
+				}
+			reloads(u, v);
+			if (!req(i->to, R)) {
+				t = i->to.val;
+				store(i->to, tmp[t].slot);
+				bsclr(v, t);
+			}
+			emiti(*i);
+			r = v->t[0] & (BIT(Tmp0)-1);
+			if (r)
+				sethint(v, r);
+		}
+		assert(!r || b==fn->start);
+
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			t = p->to.val;
+			if (bshas(v, t)) {
+				bsclr(v, t);
+				store(p->to, tmp[t].slot);
+			} else if (bshas(b->in, t))
+				/* only if the phi is live */
+				p->to = slot(p->to.val);
+		}
+		bscopy(b->in, v);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+
+	/* align the locals to a 16 byte boundary */
+	assert(NAlign == 3);
+	slot8 += slot8 & 3;
+	fn->slot += slot8;
+
+	if (debug['S']) {
+		fprintf(stderr, "\n> Block information:\n");
+		for (b=fn->start; b; b=b->link) {
+			printf("\t%-10s (% 5d) ", b->name, b->loop);
+			dumpts(b->out, fn->tmp, stdout);
+		}
+		fprintf(stderr, "\n> After spilling:\n");
+		printfn(fn, stderr);
+	}
+}