summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--all.h5
-rw-r--r--rega.c420
2 files changed, 261 insertions, 164 deletions
diff --git a/all.h b/all.h
index f24d099..1e9476d 100644
--- a/all.h
+++ b/all.h
@@ -288,8 +288,9 @@ struct Tmp {
short slot; /* -1 for unset */
short cls;
struct {
- int r;
- bits m;
+ int r; /* register or -1 */
+ int w; /* weight */
+ bits m; /* avoid these registers */
} hint;
int phi;
Alias alias;
diff --git a/rega.c b/rega.c
index 02429a6..2f01c07 100644
--- a/rega.c
+++ b/rega.c
@@ -10,6 +10,7 @@ typedef struct RMap RMap;
struct RMap {
int t[Tmp0];
int r[Tmp0];
+ int w[Tmp0]; /* wait list, for unmatched hints */
BSet b[1];
int n;
};
@@ -20,8 +21,12 @@ static Mem *mem; /* function mem references */
static struct {
Ref src, dst;
int cls;
-} *pm; /* parallel move constructed */
-static int cpm, npm; /* capacity and size of pm */
+} pm[Tmp0]; /* parallel move constructed */
+static int npm; /* size of pm */
+static int loop; /* current loop level */
+
+static uint stmov; /* stats: added moves */
+static uint stblk; /* stats: added blocks */
static int *
hint(int t)
@@ -32,21 +37,21 @@ hint(int t)
static void
sethint(int t, int r)
{
- bits m;
+ Tmp *p;
- m = tmp[phicls(t, tmp)].hint.m;
- if (*hint(t) == -1)
- if (!(BIT(r) & m))
- *hint(t) = r;
+ p = &tmp[phicls(t, tmp)];
+ if (p->hint.r == -1 || p->hint.w > loop) {
+ p->hint.r = r;
+ p->hint.w = loop;
+ tmp[t].visit = -1;
+ }
}
static void
rcopy(RMap *ma, RMap *mb)
{
- memcpy(ma->t, mb->t, sizeof ma->t);
- memcpy(ma->r, mb->r, sizeof ma->r);
+ memcpy(ma, mb, sizeof *ma);
bscopy(ma->b, mb->b);
- ma->n = mb->n;
}
static int
@@ -93,10 +98,10 @@ radd(RMap *m, int t, int r)
}
static Ref
-ralloc(RMap *m, int t)
+ralloctry(RMap *m, int t, int try)
{
bits regs;
- int r, r0, r1;
+ int h, r, r0, r1;
if (t < Tmp0) {
assert(bshas(m->b, t));
@@ -107,8 +112,12 @@ ralloc(RMap *m, int t)
assert(r != -1);
return TMP(r);
}
- r = *hint(t);
+ r = tmp[t].visit;
+ if (r == -1 || bshas(m->b, r))
+ r = *hint(t);
if (r == -1 || bshas(m->b, r)) {
+ if (try)
+ return R;
regs = tmp[phicls(t, tmp)].hint.m;
regs |= m->b->t[0];
if (KBASE(tmp[t].cls) == 0) {
@@ -129,9 +138,19 @@ ralloc(RMap *m, int t)
Found:
radd(m, t, r);
sethint(t, r);
+ tmp[t].visit = r;
+ h = *hint(t);
+ if (h != -1 && h != r)
+ m->w[h] = t;
return TMP(r);
}
+static inline Ref
+ralloc(RMap *m, int t)
+{
+ return ralloctry(m, t, 0);
+}
+
static int
rfree(RMap *m, int t)
{
@@ -168,12 +187,8 @@ mdump(RMap *m)
static void
pmadd(Ref src, Ref dst, int k)
{
- if (npm == cpm) {
- cpm = cpm * 2 + 16;
- pm = realloc(pm, cpm * sizeof pm[0]);
- if (!pm)
- die("pmadd, out of memory");
- }
+ if (npm == Tmp0)
+ die("cannot have more moves than registers");
pm[npm].src = src;
pm[npm].dst = dst;
pm[npm].cls = k;
@@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k)
enum PMStat { ToMove, Moving, Moved };
-static Ref
+static int
pmrec(enum PMStat *status, int i, int *k)
{
- Ref swp, swp1;
- int j, k1;
+ int j, c;
/* note, this routine might emit
- * too many large instructions:
- *
- * , x -- x
- * x -- x -- x |
- * ` x -- x
- *
- * if only the first move is wide
- * the whole cycle will be wide,
- * this is safe but not necessary
+ * too many large instructions
*/
-
- if (req(pm[i].src, pm[i].dst))
- return R;
- status[i] = Moving;
- assert(KBASE(*k) == KBASE(pm[i].cls));
- assert((Kw|1) == Kl && (Ks|1) == Kd);
- *k |= KWIDE(pm[i].cls); /* see above */
- swp = R;
- for (j=0; j<npm; j++) {
- if (req(pm[j].src, pm[i].dst))
- switch (status[j]) {
- case ToMove:
- k1 = *k;
- swp1 = pmrec(status, j, &k1);
- if (!req(swp1, R)) {
- assert(req(swp, R));
- swp = swp1;
- *k = k1;
- }
- break;
- case Moving:
- assert(req(swp, R));
- swp = pm[i].dst;
- break;
- case Moved:
- break;
- }
+ if (req(pm[i].src, pm[i].dst)) {
+ status[i] = Moved;
+ return -1;
+ }
+ assert(KBASE(pm[i].cls) == KBASE(*k));
+ assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
+ *k |= pm[i].cls;
+ for (j=0; j<npm; j++)
+ if (req(pm[j].dst, pm[i].src))
+ break;
+ switch (j == npm ? Moved : status[j]) {
+ case Moving:
+ c = j; /* start of cycle */
+ emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+ break;
+ case ToMove:
+ status[i] = Moving;
+ c = pmrec(status, j, k);
+ if (c == i) {
+ c = -1; /* end of cycle */
+ break;
+ }
+ if (c != -1) {
+ emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+ break;
+ }
+ /* fall through */
+ case Moved:
+ c = -1;
+ emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
+ break;
}
status[i] = Moved;
- if (req(swp, R)) {
- *curi++ = (Ins){Ocopy, pm[i].dst, {pm[i].src}, pm[i].cls};
- return R;
- } else if (!req(swp, pm[i].src)) {
- *curi++ = (Ins){Oswap, R, {pm[i].src, pm[i].dst}, *k};
- return swp;
- } else
- return R;
-
+ return c;
}
static void
pmgen()
{
- int i, k;
+ int i;
enum PMStat *status;
status = alloc(npm * sizeof status[0]);
assert(!npm || status[npm-1] == ToMove);
- curi = insb;
for (i=0; i<npm; i++)
- if (status[i] == ToMove) {
- k = pm[i].cls;
- pmrec(status, i, &k);
- }
+ if (status[i] == ToMove)
+ pmrec(status, i, (int[]){pm[i].cls});
}
static void
@@ -261,8 +260,9 @@ move(int r, Ref to, RMap *m)
int n, t, r1;
r1 = req(to, R) ? -1 : rfree(m, to.val);
- if (bshas(m->b, r) && r1 != r) {
+ if (bshas(m->b, r)) {
/* r is used and not by to */
+ assert(r1 != r);
for (n=0; m->r[n] != r; n++)
assert(n+1 < m->n);
t = m->t[n];
@@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m)
{
RMap m0;
int n, r, r1, t, s;
- Ins *i0, *i1, *ip, *ir;
+ Ins *i1, *ip;
bits def;
- m0 = *m;
+ m0 = *m; /* okay since we don't use m0.b */
+ m0.b->t = 0;
i1 = ++i;
do {
i--;
@@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m)
radd(m, r, r);
}
pmgen();
-#ifdef TEST_PMOV
- return 0;
-#endif
- n = b->nins - (i1 - i) + (curi - insb);
- i0 = alloc(n * sizeof(Ins));
- ip = icpy(ip = i0, b->ins, i - b->ins);
- ip = icpy(ir = ip, insb, curi - insb);
- ip = icpy(ip, i1, &b->ins[b->nins] - i1);
- b->nins = n;
- b->ins = i0;
- return ir;
+ return i;
}
static int
-prio(Ref r1, Ref r2)
+prio1(Ref r1, Ref r2)
{
/* trivial heuristic to begin with,
* later we can use the distance to
@@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p)
int i;
rs[i = p] = r;
- while (i-- > 0 && prio(*r, *rs[i])) {
+ while (i-- > 0 && prio1(*r, *rs[i])) {
rs[i+1] = rs[i];
rs[i] = r;
}
@@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p)
static void
doblk(Blk *b, RMap *cur)
{
- int x, r, nr;
+ int t, x, r, rf, rt, nr;
bits rs;
- Ins *i;
+ Ins *i, *i1;
Mem *m;
Ref *ra[4];
@@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur)
radd(cur, r, r);
if (rtype(b->jmp.arg) == RTmp)
b->jmp.arg = ralloc(cur, b->jmp.arg.val);
- for (i=&b->ins[b->nins]; i!=b->ins;) {
- switch ((--i)->op) {
+ curi = &insb[NIns];
+ for (i1=&b->ins[b->nins]; i1!=b->ins;) {
+ emiti(*--i1);
+ i = curi;
+ rf = -1;
+ switch (i->op) {
case Ocall:
rs = T.argregs(i->arg[1], 0) | T.rglob;
for (r=0; T.rsave[r]>=0; r++)
@@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur)
rfree(cur, T.rsave[r]);
break;
case Ocopy:
- if (isreg(i->arg[0])) {
- i = dopm(b, i, cur);
+ if (regcpy(i)) {
+ curi++;
+ i1 = dopm(b, i1, cur);
+ stmov += i+1 - curi;
continue;
}
if (isreg(i->to))
@@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur)
if (!req(i->to, R)) {
assert(rtype(i->to) == RTmp);
r = i->to.val;
- if (r >= Tmp0 || !(BIT(r) & T.rglob))
- r = rfree(cur, r);
- if (r == -1) {
+ if (r < Tmp0 && (BIT(r) & T.rglob))
+ break;
+ rf = rfree(cur, r);
+ if (rf == -1) {
assert(!isreg(i->to));
- *i = (Ins){.op = Onop};
+ curi++;
continue;
}
- i->to = TMP(r);
+ i->to = TMP(rf);
}
break;
}
@@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur)
}
for (r=0; r<nr; r++)
*ra[r] = ralloc(cur, ra[r]->val);
+
+ /* try to change the register of a hinted
+ * temporary if rf is available */
+ x = 1;
+ if (rf != -1 && (t = cur->w[rf]) != 0)
+ if (!bshas(cur->b, rf) && *hint(t) == rf
+ && (rt = rfree(cur, t)) != -1) {
+ tmp[t].visit = -1;
+ ralloc(cur, t);
+ assert(bshas(cur->b, rf));
+ emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
+ stmov += 1;
+ cur->w[rf] = 0;
+ for (r=0; r<nr; r++)
+ if (req(*ra[r], TMP(rt)))
+ *ra[r] = TMP(rf);
+ /* one could iterate this logic with
+ * the newly freed rt, but in this case
+ * the above loop must be changed */
+ }
}
+ b->nins = &insb[NIns] - curi;
+ idup(&b->ins, curi, b->nins);
+}
+
+/* qsort() comparison function to peel
+ * loop nests from inside out */
+static int
+carve(const void *a, const void *b)
+{
+ Blk *ba, *bb;
+
+ /* todo, evaluate if this order is really
+ * better than the simple postorder */
+ ba = *(Blk**)a;
+ bb = *(Blk**)b;
+ if (ba->loop == bb->loop)
+ return ba->id > bb->id ? -1 : ba->id < bb->id;
+ return ba->loop > bb->loop ? -1 : +1;
+}
+
+/* comparison function to order temporaries
+ * for allocation at the end of blocks */
+static int
+prio2(int t1, int t2)
+{
+ if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */
+ return tmp[t1].visit != -1 ? +1 : -1;
+ if ((*hint(t1) ^ *hint(t2)) < 0)
+ return *hint(t1) != -1 ? +1 : -1;
+ return tmp[t1].cost - tmp[t2].cost;
}
/* register allocation
@@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur)
void
rega(Fn *fn)
{
- int j, t, r, r1, x, rl[Tmp0];
- Blk *b, *b1, *s, ***ps, *blist;
- RMap *end, *beg, cur, old;
+ int j, t, r, x, rl[Tmp0];
+ Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
+ RMap *end, *beg, cur, old, *m;
Ins *i;
Phi *p;
uint u, n;
Ref src, dst;
/* 1. setup */
+ stmov = 0;
+ stblk = 0;
regu = 0;
tmp = fn->tmp;
mem = fn->mem;
+ blk = alloc(fn->nblk * sizeof blk[0]);
end = alloc(fn->nblk * sizeof end[0]);
beg = alloc(fn->nblk * sizeof beg[0]);
for (n=0; n<fn->nblk; n++) {
@@ -446,8 +497,15 @@ rega(Fn *fn)
bsinit(cur.b, fn->ntmp);
bsinit(old.b, fn->ntmp);
- for (t=0; t<fn->ntmp; t++)
- *hint(t) = t < Tmp0 ? t : -1;
+ loop = INT_MAX;
+ for (t=0; t<fn->ntmp; t++) {
+ tmp[t].hint.r = t < Tmp0 ? t : -1;
+ tmp[t].hint.w = loop;
+ tmp[t].visit = -1;
+ }
+ for (bp=blk, b=fn->start; b; b=b->link)
+ *bp++ = b;
+ qsort(blk, fn->nblk, sizeof blk[0], carve);
for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
if (i->op != Ocopy || !isreg(i->arg[0]))
break;
@@ -456,71 +514,103 @@ rega(Fn *fn)
sethint(i->to.val, i->arg[0].val);
}
- /* 2. assign registers following post-order */
- for (n=fn->nblk-1; n!=-1u; n--) {
- b = fn->rpo[n];
+ /* 2. assign registers */
+ for (bp=blk; bp<&blk[fn->nblk]; bp++) {
+ b = *bp;
+ n = b->id;
+ loop = b->loop;
cur.n = 0;
bszero(cur.b);
- for (x=0; x<2; x++)
- for (t=Tmp0; bsiter(b->out, &t); t++)
- if (x || (r=*hint(t)) != -1)
- if (x || !bshas(cur.b, r))
- ralloc(&cur, t);
+ memset(cur.w, 0, sizeof cur.w);
+ for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
+ j = x++;
+ rl[j] = t;
+ while (j-- > 0 && prio2(t, rl[j]) > 0) {
+ rl[j+1] = rl[j];
+ rl[j] = t;
+ }
+ }
+ for (j=0; j<x; j++)
+ ralloctry(&cur, rl[j], 1);
+ for (j=0; j<x; j++)
+ ralloc(&cur, rl[j]);
rcopy(&end[n], &cur);
doblk(b, &cur);
bscopy(b->in, cur.b);
for (p=b->phi; p; p=p->link)
- if (rtype(p->to) == RTmp) {
+ if (rtype(p->to) == RTmp)
bsclr(b->in, p->to.val);
- /* heuristic 0:
- * if the phi destination has an
- * argument from a frequent block
- * that was already allocated to
- * 'r', use 'r' as the new hint
- */
- memset(rl, 0, sizeof rl);
- for (u=0; u<p->narg; u++) {
- t = p->arg[u].val;
- b1 = p->blk[u];
- if (rtype(p->arg[u]) == RTmp)
- if ((r=rfind(&end[b1->id], t)) != -1)
- rl[r] += b1->loop;
- }
- for (x=0, j=0; j<Tmp0; j++)
- if (rl[j] > rl[x])
- x = j;
- if (rl[x] >= b->loop)
- *hint(p->to.val) = x;
+ rcopy(&beg[n], &cur);
+ }
+
+ /* 3. emit copies shared by multiple edges
+ * to the same block */
+ for (s=fn->start; s; s=s->link) {
+ if (s->npred <= 1)
+ continue;
+ m = &beg[s->id];
+
+ /* rl maps a register that is live at the
+ * beginning of s to the one used in all
+ * predecessors (if any, -1 otherwise) */
+ memset(rl, 0, sizeof rl);
+
+ /* to find the register of a phi in a
+ * predecessor, we have to find the
+ * corresponding argument */
+ for (p=s->phi; p; p=p->link) {
+ r = rfind(m, p->to.val);
+ if (r == -1)
+ continue;
+ for (u=0; u<p->narg; u++) {
+ b = p->blk[u];
+ src = p->arg[u];
+ if (rtype(src) != RTmp)
+ continue;
+ x = rfind(&end[b->id], src.val);
+ assert(x != -1);
+ rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
}
- if (b->npred > 1) {
- /* heuristic 1:
- * attempt to satisfy hints
- * when it's simple and we have
- * multiple predecessors
- */
- rcopy(&old, &cur);
- curi = &insb[NIns];
- for (j=0; j<old.n; j++) {
- t = old.t[j];
- r = *hint(t);
- r1 = rfind(&cur, t);
- if (r != -1 && r != r1)
- if (!bshas(cur.b, r)) {
- rfree(&cur, t);
- radd(&cur, t, r);
- x = tmp[t].cls;
- emit(Ocopy, x, TMP(r1), TMP(r), R);
- }
+ if (rl[r] == 0)
+ rl[r] = -1;
+ }
+
+ /* process non-phis temporaries */
+ for (j=0; j<m->n; j++) {
+ t = m->t[j];
+ r = m->r[j];
+ if (rl[r] || t < Tmp0 /* todo, remove this */)
+ continue;
+ for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
+ x = rfind(&end[(*bp)->id], t);
+ assert(x != -1);
+ rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
}
- if ((j = &insb[NIns] - curi)) {
- b->nins += j;
- i = alloc(b->nins * sizeof(Ins));
- icpy(icpy(i, curi, j), b->ins, b->nins-j);
- b->ins = i;
+ }
+
+ npm = 0;
+ for (j=0; j<m->n; j++) {
+ t = m->t[j];
+ r = m->r[j];
+ x = rl[r];
+ assert(x != 0 || t < Tmp0 /* todo, ditto */);
+ if (x > 0) {
+ pmadd(TMP(x), TMP(r), tmp[t].cls);
+ m->r[j] = x;
}
}
- rcopy(&beg[n], &cur);
+ curi = &insb[NIns];
+ pmgen();
+ j = &insb[NIns] - curi;
+ if (j == 0)
+ continue;
+ stmov += j;
+ s->nins += j;
+ i = alloc(s->nins * sizeof(Ins));
+ icpy(icpy(i, curi, j), s->ins, s->nins-j);
+ s->ins = i;
}
+
if (debug['R']) {
fprintf(stderr, "\n> Register mappings:\n");
for (n=0; n<fn->nblk; n++) {
@@ -533,7 +623,7 @@ rega(Fn *fn)
fprintf(stderr, "\n");
}
- /* 3. compose glue code */
+ /* 4. emit remaining copies in new blocks */
blist = 0;
for (b=fn->start;; b=b->link) {
ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
@@ -560,8 +650,9 @@ rega(Fn *fn)
dst = rref(&beg[s->id], t);
pmadd(src, dst, tmp[t].cls);
}
+ curi = &insb[NIns];
pmgen();
- if (curi == insb)
+ if (curi == &insb[NIns])
continue;
b1 = blknew();
b1->loop = (b->loop+s->loop) / 2;
@@ -569,8 +660,10 @@ rega(Fn *fn)
blist = b1;
fn->nblk++;
sprintf(b1->name, "%s_%s", b->name, s->name);
- b1->nins = curi - insb;
- idup(&b1->ins, insb, b1->nins);
+ b1->nins = &insb[NIns] - curi;
+ stmov += b1->nins;
+ stblk += 1;
+ idup(&b1->ins, curi, b1->nins);
b1->jmp.type = Jjmp;
b1->s1 = s;
**ps = b1;
@@ -585,6 +678,9 @@ rega(Fn *fn)
fn->reg = regu;
if (debug['R']) {
+ fprintf(stderr, "\n> Register allocation statistics:\n");
+ fprintf(stderr, "\tnew moves: %d\n", stmov);
+ fprintf(stderr, "\tnew blocks: %d\n", stblk);
fprintf(stderr, "\n> After register allocation:\n");
printfn(fn, stderr);
}