From 2d02070af019e9896ecf2a63bedc098092fd395d Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Tue, 16 May 2017 11:47:46 -0400 Subject: new hinting in the register allocator The previous heuristics were ad hoc and it was hard to understand why they worked at all. This patch can be summarized in three points: 1. When a register is freed (an instruction assigns it), we try to find if a temporary would like to be in it, and if we find one, we move it in the newly freed register. I call this an "eager move". 2. Temporaries now remember in what register they were last allocated; this information is stored in the field Tmp.visit, and prevails on the field Tmp.hint when it is set. (This makes having the same hint for interfering temporaries not so disastrous.) 3. Blocks are now allocated in "onion" order, from the innermost loop to the outermost. This is the change I am the least sure about; it should be evaluated thorougly. --- all.h | 5 +- rega.c | 420 ++++++++++++++++++++++++++++++++++++++++------------------------- 2 files changed, 261 insertions(+), 164 deletions(-) diff --git a/all.h b/all.h index f24d099..1e9476d 100644 --- a/all.h +++ b/all.h @@ -288,8 +288,9 @@ struct Tmp { short slot; /* -1 for unset */ short cls; struct { - int r; - bits m; + int r; /* register or -1 */ + int w; /* weight */ + bits m; /* avoid these registers */ } hint; int phi; Alias alias; diff --git a/rega.c b/rega.c index 02429a6..2f01c07 100644 --- a/rega.c +++ b/rega.c @@ -10,6 +10,7 @@ typedef struct RMap RMap; struct RMap { int t[Tmp0]; int r[Tmp0]; + int w[Tmp0]; /* wait list, for unmatched hints */ BSet b[1]; int n; }; @@ -20,8 +21,12 @@ static Mem *mem; /* function mem references */ static struct { Ref src, dst; int cls; -} *pm; /* parallel move constructed */ -static int cpm, npm; /* capacity and size of pm */ +} pm[Tmp0]; /* parallel move constructed */ +static int npm; /* size of pm */ +static int loop; /* current loop level */ + +static uint stmov; /* stats: added moves */ +static uint stblk; /* stats: added blocks */ static int * hint(int t) @@ -32,21 +37,21 @@ hint(int t) static void sethint(int t, int r) { - bits m; + Tmp *p; - m = tmp[phicls(t, tmp)].hint.m; - if (*hint(t) == -1) - if (!(BIT(r) & m)) - *hint(t) = r; + p = &tmp[phicls(t, tmp)]; + if (p->hint.r == -1 || p->hint.w > loop) { + p->hint.r = r; + p->hint.w = loop; + tmp[t].visit = -1; + } } static void rcopy(RMap *ma, RMap *mb) { - memcpy(ma->t, mb->t, sizeof ma->t); - memcpy(ma->r, mb->r, sizeof ma->r); + memcpy(ma, mb, sizeof *ma); bscopy(ma->b, mb->b); - ma->n = mb->n; } static int @@ -93,10 +98,10 @@ radd(RMap *m, int t, int r) } static Ref -ralloc(RMap *m, int t) +ralloctry(RMap *m, int t, int try) { bits regs; - int r, r0, r1; + int h, r, r0, r1; if (t < Tmp0) { assert(bshas(m->b, t)); @@ -107,8 +112,12 @@ ralloc(RMap *m, int t) assert(r != -1); return TMP(r); } - r = *hint(t); + r = tmp[t].visit; + if (r == -1 || bshas(m->b, r)) + r = *hint(t); if (r == -1 || bshas(m->b, r)) { + if (try) + return R; regs = tmp[phicls(t, tmp)].hint.m; regs |= m->b->t[0]; if (KBASE(tmp[t].cls) == 0) { @@ -129,9 +138,19 @@ ralloc(RMap *m, int t) Found: radd(m, t, r); sethint(t, r); + tmp[t].visit = r; + h = *hint(t); + if (h != -1 && h != r) + m->w[h] = t; return TMP(r); } +static inline Ref +ralloc(RMap *m, int t) +{ + return ralloctry(m, t, 0); +} + static int rfree(RMap *m, int t) { @@ -168,12 +187,8 @@ mdump(RMap *m) static void pmadd(Ref src, Ref dst, int k) { - if (npm == cpm) { - cpm = cpm * 2 + 16; - pm = realloc(pm, cpm * sizeof pm[0]); - if (!pm) - die("pmadd, out of memory"); - } + if (npm == Tmp0) + die("cannot have more moves than registers"); pm[npm].src = src; pm[npm].dst = dst; pm[npm].cls = k; @@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k) enum PMStat { ToMove, Moving, Moved }; -static Ref +static int pmrec(enum PMStat *status, int i, int *k) { - Ref swp, swp1; - int j, k1; + int j, c; /* note, this routine might emit - * too many large instructions: - * - * , x -- x - * x -- x -- x | - * ` x -- x - * - * if only the first move is wide - * the whole cycle will be wide, - * this is safe but not necessary + * too many large instructions */ - - if (req(pm[i].src, pm[i].dst)) - return R; - status[i] = Moving; - assert(KBASE(*k) == KBASE(pm[i].cls)); - assert((Kw|1) == Kl && (Ks|1) == Kd); - *k |= KWIDE(pm[i].cls); /* see above */ - swp = R; - for (j=0; jb, r) && r1 != r) { + if (bshas(m->b, r)) { /* r is used and not by to */ + assert(r1 != r); for (n=0; m->r[n] != r; n++) assert(n+1 < m->n); t = m->t[n]; @@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m) { RMap m0; int n, r, r1, t, s; - Ins *i0, *i1, *ip, *ir; + Ins *i1, *ip; bits def; - m0 = *m; + m0 = *m; /* okay since we don't use m0.b */ + m0.b->t = 0; i1 = ++i; do { i--; @@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m) radd(m, r, r); } pmgen(); -#ifdef TEST_PMOV - return 0; -#endif - n = b->nins - (i1 - i) + (curi - insb); - i0 = alloc(n * sizeof(Ins)); - ip = icpy(ip = i0, b->ins, i - b->ins); - ip = icpy(ir = ip, insb, curi - insb); - ip = icpy(ip, i1, &b->ins[b->nins] - i1); - b->nins = n; - b->ins = i0; - return ir; + return i; } static int -prio(Ref r1, Ref r2) +prio1(Ref r1, Ref r2) { /* trivial heuristic to begin with, * later we can use the distance to @@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p) int i; rs[i = p] = r; - while (i-- > 0 && prio(*r, *rs[i])) { + while (i-- > 0 && prio1(*r, *rs[i])) { rs[i+1] = rs[i]; rs[i] = r; } @@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p) static void doblk(Blk *b, RMap *cur) { - int x, r, nr; + int t, x, r, rf, rt, nr; bits rs; - Ins *i; + Ins *i, *i1; Mem *m; Ref *ra[4]; @@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur) radd(cur, r, r); if (rtype(b->jmp.arg) == RTmp) b->jmp.arg = ralloc(cur, b->jmp.arg.val); - for (i=&b->ins[b->nins]; i!=b->ins;) { - switch ((--i)->op) { + curi = &insb[NIns]; + for (i1=&b->ins[b->nins]; i1!=b->ins;) { + emiti(*--i1); + i = curi; + rf = -1; + switch (i->op) { case Ocall: rs = T.argregs(i->arg[1], 0) | T.rglob; for (r=0; T.rsave[r]>=0; r++) @@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur) rfree(cur, T.rsave[r]); break; case Ocopy: - if (isreg(i->arg[0])) { - i = dopm(b, i, cur); + if (regcpy(i)) { + curi++; + i1 = dopm(b, i1, cur); + stmov += i+1 - curi; continue; } if (isreg(i->to)) @@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur) if (!req(i->to, R)) { assert(rtype(i->to) == RTmp); r = i->to.val; - if (r >= Tmp0 || !(BIT(r) & T.rglob)) - r = rfree(cur, r); - if (r == -1) { + if (r < Tmp0 && (BIT(r) & T.rglob)) + break; + rf = rfree(cur, r); + if (rf == -1) { assert(!isreg(i->to)); - *i = (Ins){.op = Onop}; + curi++; continue; } - i->to = TMP(r); + i->to = TMP(rf); } break; } @@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur) } for (r=0; rval); + + /* try to change the register of a hinted + * temporary if rf is available */ + x = 1; + if (rf != -1 && (t = cur->w[rf]) != 0) + if (!bshas(cur->b, rf) && *hint(t) == rf + && (rt = rfree(cur, t)) != -1) { + tmp[t].visit = -1; + ralloc(cur, t); + assert(bshas(cur->b, rf)); + emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R); + stmov += 1; + cur->w[rf] = 0; + for (r=0; rnins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); +} + +/* qsort() comparison function to peel + * loop nests from inside out */ +static int +carve(const void *a, const void *b) +{ + Blk *ba, *bb; + + /* todo, evaluate if this order is really + * better than the simple postorder */ + ba = *(Blk**)a; + bb = *(Blk**)b; + if (ba->loop == bb->loop) + return ba->id > bb->id ? -1 : ba->id < bb->id; + return ba->loop > bb->loop ? -1 : +1; +} + +/* comparison function to order temporaries + * for allocation at the end of blocks */ +static int +prio2(int t1, int t2) +{ + if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */ + return tmp[t1].visit != -1 ? +1 : -1; + if ((*hint(t1) ^ *hint(t2)) < 0) + return *hint(t1) != -1 ? +1 : -1; + return tmp[t1].cost - tmp[t2].cost; } /* register allocation @@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur) void rega(Fn *fn) { - int j, t, r, r1, x, rl[Tmp0]; - Blk *b, *b1, *s, ***ps, *blist; - RMap *end, *beg, cur, old; + int j, t, r, x, rl[Tmp0]; + Blk *b, *b1, *s, ***ps, *blist, **blk, **bp; + RMap *end, *beg, cur, old, *m; Ins *i; Phi *p; uint u, n; Ref src, dst; /* 1. setup */ + stmov = 0; + stblk = 0; regu = 0; tmp = fn->tmp; mem = fn->mem; + blk = alloc(fn->nblk * sizeof blk[0]); end = alloc(fn->nblk * sizeof end[0]); beg = alloc(fn->nblk * sizeof beg[0]); for (n=0; nnblk; n++) { @@ -446,8 +497,15 @@ rega(Fn *fn) bsinit(cur.b, fn->ntmp); bsinit(old.b, fn->ntmp); - for (t=0; tntmp; t++) - *hint(t) = t < Tmp0 ? t : -1; + loop = INT_MAX; + for (t=0; tntmp; t++) { + tmp[t].hint.r = t < Tmp0 ? t : -1; + tmp[t].hint.w = loop; + tmp[t].visit = -1; + } + for (bp=blk, b=fn->start; b; b=b->link) + *bp++ = b; + qsort(blk, fn->nblk, sizeof blk[0], carve); for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++) if (i->op != Ocopy || !isreg(i->arg[0])) break; @@ -456,71 +514,103 @@ rega(Fn *fn) sethint(i->to.val, i->arg[0].val); } - /* 2. assign registers following post-order */ - for (n=fn->nblk-1; n!=-1u; n--) { - b = fn->rpo[n]; + /* 2. assign registers */ + for (bp=blk; bp<&blk[fn->nblk]; bp++) { + b = *bp; + n = b->id; + loop = b->loop; cur.n = 0; bszero(cur.b); - for (x=0; x<2; x++) - for (t=Tmp0; bsiter(b->out, &t); t++) - if (x || (r=*hint(t)) != -1) - if (x || !bshas(cur.b, r)) - ralloc(&cur, t); + memset(cur.w, 0, sizeof cur.w); + for (x=0, t=Tmp0; bsiter(b->out, &t); t++) { + j = x++; + rl[j] = t; + while (j-- > 0 && prio2(t, rl[j]) > 0) { + rl[j+1] = rl[j]; + rl[j] = t; + } + } + for (j=0; jin, cur.b); for (p=b->phi; p; p=p->link) - if (rtype(p->to) == RTmp) { + if (rtype(p->to) == RTmp) bsclr(b->in, p->to.val); - /* heuristic 0: - * if the phi destination has an - * argument from a frequent block - * that was already allocated to - * 'r', use 'r' as the new hint - */ - memset(rl, 0, sizeof rl); - for (u=0; unarg; u++) { - t = p->arg[u].val; - b1 = p->blk[u]; - if (rtype(p->arg[u]) == RTmp) - if ((r=rfind(&end[b1->id], t)) != -1) - rl[r] += b1->loop; - } - for (x=0, j=0; j rl[x]) - x = j; - if (rl[x] >= b->loop) - *hint(p->to.val) = x; + rcopy(&beg[n], &cur); + } + + /* 3. emit copies shared by multiple edges + * to the same block */ + for (s=fn->start; s; s=s->link) { + if (s->npred <= 1) + continue; + m = &beg[s->id]; + + /* rl maps a register that is live at the + * beginning of s to the one used in all + * predecessors (if any, -1 otherwise) */ + memset(rl, 0, sizeof rl); + + /* to find the register of a phi in a + * predecessor, we have to find the + * corresponding argument */ + for (p=s->phi; p; p=p->link) { + r = rfind(m, p->to.val); + if (r == -1) + continue; + for (u=0; unarg; u++) { + b = p->blk[u]; + src = p->arg[u]; + if (rtype(src) != RTmp) + continue; + x = rfind(&end[b->id], src.val); + assert(x != -1); + rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } - if (b->npred > 1) { - /* heuristic 1: - * attempt to satisfy hints - * when it's simple and we have - * multiple predecessors - */ - rcopy(&old, &cur); - curi = &insb[NIns]; - for (j=0; jn; j++) { + t = m->t[j]; + r = m->r[j]; + if (rl[r] || t < Tmp0 /* todo, remove this */) + continue; + for (bp=s->pred; bp<&s->pred[s->npred]; bp++) { + x = rfind(&end[(*bp)->id], t); + assert(x != -1); + rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } - if ((j = &insb[NIns] - curi)) { - b->nins += j; - i = alloc(b->nins * sizeof(Ins)); - icpy(icpy(i, curi, j), b->ins, b->nins-j); - b->ins = i; + } + + npm = 0; + for (j=0; jn; j++) { + t = m->t[j]; + r = m->r[j]; + x = rl[r]; + assert(x != 0 || t < Tmp0 /* todo, ditto */); + if (x > 0) { + pmadd(TMP(x), TMP(r), tmp[t].cls); + m->r[j] = x; } } - rcopy(&beg[n], &cur); + curi = &insb[NIns]; + pmgen(); + j = &insb[NIns] - curi; + if (j == 0) + continue; + stmov += j; + s->nins += j; + i = alloc(s->nins * sizeof(Ins)); + icpy(icpy(i, curi, j), s->ins, s->nins-j); + s->ins = i; } + if (debug['R']) { fprintf(stderr, "\n> Register mappings:\n"); for (n=0; nnblk; n++) { @@ -533,7 +623,7 @@ rega(Fn *fn) fprintf(stderr, "\n"); } - /* 3. compose glue code */ + /* 4. emit remaining copies in new blocks */ blist = 0; for (b=fn->start;; b=b->link) { ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}}; @@ -560,8 +650,9 @@ rega(Fn *fn) dst = rref(&beg[s->id], t); pmadd(src, dst, tmp[t].cls); } + curi = &insb[NIns]; pmgen(); - if (curi == insb) + if (curi == &insb[NIns]) continue; b1 = blknew(); b1->loop = (b->loop+s->loop) / 2; @@ -569,8 +660,10 @@ rega(Fn *fn) blist = b1; fn->nblk++; sprintf(b1->name, "%s_%s", b->name, s->name); - b1->nins = curi - insb; - idup(&b1->ins, insb, b1->nins); + b1->nins = &insb[NIns] - curi; + stmov += b1->nins; + stblk += 1; + idup(&b1->ins, curi, b1->nins); b1->jmp.type = Jjmp; b1->s1 = s; **ps = b1; @@ -585,6 +678,9 @@ rega(Fn *fn) fn->reg = regu; if (debug['R']) { + fprintf(stderr, "\n> Register allocation statistics:\n"); + fprintf(stderr, "\tnew moves: %d\n", stmov); + fprintf(stderr, "\tnew blocks: %d\n", stblk); fprintf(stderr, "\n> After register allocation:\n"); printfn(fn, stderr); } -- cgit 1.4.1