diff options
-rw-r--r-- | all.h | 5 | ||||
-rw-r--r-- | rega.c | 420 |
2 files changed, 261 insertions, 164 deletions
diff --git a/all.h b/all.h index f24d099..1e9476d 100644 --- a/all.h +++ b/all.h @@ -288,8 +288,9 @@ struct Tmp { short slot; /* -1 for unset */ short cls; struct { - int r; - bits m; + int r; /* register or -1 */ + int w; /* weight */ + bits m; /* avoid these registers */ } hint; int phi; Alias alias; diff --git a/rega.c b/rega.c index 02429a6..2f01c07 100644 --- a/rega.c +++ b/rega.c @@ -10,6 +10,7 @@ typedef struct RMap RMap; struct RMap { int t[Tmp0]; int r[Tmp0]; + int w[Tmp0]; /* wait list, for unmatched hints */ BSet b[1]; int n; }; @@ -20,8 +21,12 @@ static Mem *mem; /* function mem references */ static struct { Ref src, dst; int cls; -} *pm; /* parallel move constructed */ -static int cpm, npm; /* capacity and size of pm */ +} pm[Tmp0]; /* parallel move constructed */ +static int npm; /* size of pm */ +static int loop; /* current loop level */ + +static uint stmov; /* stats: added moves */ +static uint stblk; /* stats: added blocks */ static int * hint(int t) @@ -32,21 +37,21 @@ hint(int t) static void sethint(int t, int r) { - bits m; + Tmp *p; - m = tmp[phicls(t, tmp)].hint.m; - if (*hint(t) == -1) - if (!(BIT(r) & m)) - *hint(t) = r; + p = &tmp[phicls(t, tmp)]; + if (p->hint.r == -1 || p->hint.w > loop) { + p->hint.r = r; + p->hint.w = loop; + tmp[t].visit = -1; + } } static void rcopy(RMap *ma, RMap *mb) { - memcpy(ma->t, mb->t, sizeof ma->t); - memcpy(ma->r, mb->r, sizeof ma->r); + memcpy(ma, mb, sizeof *ma); bscopy(ma->b, mb->b); - ma->n = mb->n; } static int @@ -93,10 +98,10 @@ radd(RMap *m, int t, int r) } static Ref -ralloc(RMap *m, int t) +ralloctry(RMap *m, int t, int try) { bits regs; - int r, r0, r1; + int h, r, r0, r1; if (t < Tmp0) { assert(bshas(m->b, t)); @@ -107,8 +112,12 @@ ralloc(RMap *m, int t) assert(r != -1); return TMP(r); } - r = *hint(t); + r = tmp[t].visit; + if (r == -1 || bshas(m->b, r)) + r = *hint(t); if (r == -1 || bshas(m->b, r)) { + if (try) + return R; regs = tmp[phicls(t, tmp)].hint.m; regs |= m->b->t[0]; if (KBASE(tmp[t].cls) == 0) { @@ -129,9 +138,19 @@ ralloc(RMap *m, int t) Found: radd(m, t, r); sethint(t, r); + tmp[t].visit = r; + h = *hint(t); + if (h != -1 && h != r) + m->w[h] = t; return TMP(r); } +static inline Ref +ralloc(RMap *m, int t) +{ + return ralloctry(m, t, 0); +} + static int rfree(RMap *m, int t) { @@ -168,12 +187,8 @@ mdump(RMap *m) static void pmadd(Ref src, Ref dst, int k) { - if (npm == cpm) { - cpm = cpm * 2 + 16; - pm = realloc(pm, cpm * sizeof pm[0]); - if (!pm) - die("pmadd, out of memory"); - } + if (npm == Tmp0) + die("cannot have more moves than registers"); pm[npm].src = src; pm[npm].dst = dst; pm[npm].cls = k; @@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k) enum PMStat { ToMove, Moving, Moved }; -static Ref +static int pmrec(enum PMStat *status, int i, int *k) { - Ref swp, swp1; - int j, k1; + int j, c; /* note, this routine might emit - * too many large instructions: - * - * , x -- x - * x -- x -- x | - * ` x -- x - * - * if only the first move is wide - * the whole cycle will be wide, - * this is safe but not necessary + * too many large instructions */ - - if (req(pm[i].src, pm[i].dst)) - return R; - status[i] = Moving; - assert(KBASE(*k) == KBASE(pm[i].cls)); - assert((Kw|1) == Kl && (Ks|1) == Kd); - *k |= KWIDE(pm[i].cls); /* see above */ - swp = R; - for (j=0; j<npm; j++) { - if (req(pm[j].src, pm[i].dst)) - switch (status[j]) { - case ToMove: - k1 = *k; - swp1 = pmrec(status, j, &k1); - if (!req(swp1, R)) { - assert(req(swp, R)); - swp = swp1; - *k = k1; - } - break; - case Moving: - assert(req(swp, R)); - swp = pm[i].dst; - break; - case Moved: - break; - } + if (req(pm[i].src, pm[i].dst)) { + status[i] = Moved; + return -1; + } + assert(KBASE(pm[i].cls) == KBASE(*k)); + assert((Kw|Kl) == Kl && (Ks|Kd) == Kd); + *k |= pm[i].cls; + for (j=0; j<npm; j++) + if (req(pm[j].dst, pm[i].src)) + break; + switch (j == npm ? Moved : status[j]) { + case Moving: + c = j; /* start of cycle */ + emit(Oswap, *k, R, pm[i].src, pm[i].dst); + break; + case ToMove: + status[i] = Moving; + c = pmrec(status, j, k); + if (c == i) { + c = -1; /* end of cycle */ + break; + } + if (c != -1) { + emit(Oswap, *k, R, pm[i].src, pm[i].dst); + break; + } + /* fall through */ + case Moved: + c = -1; + emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R); + break; } status[i] = Moved; - if (req(swp, R)) { - *curi++ = (Ins){Ocopy, pm[i].dst, {pm[i].src}, pm[i].cls}; - return R; - } else if (!req(swp, pm[i].src)) { - *curi++ = (Ins){Oswap, R, {pm[i].src, pm[i].dst}, *k}; - return swp; - } else - return R; - + return c; } static void pmgen() { - int i, k; + int i; enum PMStat *status; status = alloc(npm * sizeof status[0]); assert(!npm || status[npm-1] == ToMove); - curi = insb; for (i=0; i<npm; i++) - if (status[i] == ToMove) { - k = pm[i].cls; - pmrec(status, i, &k); - } + if (status[i] == ToMove) + pmrec(status, i, (int[]){pm[i].cls}); } static void @@ -261,8 +260,9 @@ move(int r, Ref to, RMap *m) int n, t, r1; r1 = req(to, R) ? -1 : rfree(m, to.val); - if (bshas(m->b, r) && r1 != r) { + if (bshas(m->b, r)) { /* r is used and not by to */ + assert(r1 != r); for (n=0; m->r[n] != r; n++) assert(n+1 < m->n); t = m->t[n]; @@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m) { RMap m0; int n, r, r1, t, s; - Ins *i0, *i1, *ip, *ir; + Ins *i1, *ip; bits def; - m0 = *m; + m0 = *m; /* okay since we don't use m0.b */ + m0.b->t = 0; i1 = ++i; do { i--; @@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m) radd(m, r, r); } pmgen(); -#ifdef TEST_PMOV - return 0; -#endif - n = b->nins - (i1 - i) + (curi - insb); - i0 = alloc(n * sizeof(Ins)); - ip = icpy(ip = i0, b->ins, i - b->ins); - ip = icpy(ir = ip, insb, curi - insb); - ip = icpy(ip, i1, &b->ins[b->nins] - i1); - b->nins = n; - b->ins = i0; - return ir; + return i; } static int -prio(Ref r1, Ref r2) +prio1(Ref r1, Ref r2) { /* trivial heuristic to begin with, * later we can use the distance to @@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p) int i; rs[i = p] = r; - while (i-- > 0 && prio(*r, *rs[i])) { + while (i-- > 0 && prio1(*r, *rs[i])) { rs[i+1] = rs[i]; rs[i] = r; } @@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p) static void doblk(Blk *b, RMap *cur) { - int x, r, nr; + int t, x, r, rf, rt, nr; bits rs; - Ins *i; + Ins *i, *i1; Mem *m; Ref *ra[4]; @@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur) radd(cur, r, r); if (rtype(b->jmp.arg) == RTmp) b->jmp.arg = ralloc(cur, b->jmp.arg.val); - for (i=&b->ins[b->nins]; i!=b->ins;) { - switch ((--i)->op) { + curi = &insb[NIns]; + for (i1=&b->ins[b->nins]; i1!=b->ins;) { + emiti(*--i1); + i = curi; + rf = -1; + switch (i->op) { case Ocall: rs = T.argregs(i->arg[1], 0) | T.rglob; for (r=0; T.rsave[r]>=0; r++) @@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur) rfree(cur, T.rsave[r]); break; case Ocopy: - if (isreg(i->arg[0])) { - i = dopm(b, i, cur); + if (regcpy(i)) { + curi++; + i1 = dopm(b, i1, cur); + stmov += i+1 - curi; continue; } if (isreg(i->to)) @@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur) if (!req(i->to, R)) { assert(rtype(i->to) == RTmp); r = i->to.val; - if (r >= Tmp0 || !(BIT(r) & T.rglob)) - r = rfree(cur, r); - if (r == -1) { + if (r < Tmp0 && (BIT(r) & T.rglob)) + break; + rf = rfree(cur, r); + if (rf == -1) { assert(!isreg(i->to)); - *i = (Ins){.op = Onop}; + curi++; continue; } - i->to = TMP(r); + i->to = TMP(rf); } break; } @@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur) } for (r=0; r<nr; r++) *ra[r] = ralloc(cur, ra[r]->val); + + /* try to change the register of a hinted + * temporary if rf is available */ + x = 1; + if (rf != -1 && (t = cur->w[rf]) != 0) + if (!bshas(cur->b, rf) && *hint(t) == rf + && (rt = rfree(cur, t)) != -1) { + tmp[t].visit = -1; + ralloc(cur, t); + assert(bshas(cur->b, rf)); + emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R); + stmov += 1; + cur->w[rf] = 0; + for (r=0; r<nr; r++) + if (req(*ra[r], TMP(rt))) + *ra[r] = TMP(rf); + /* one could iterate this logic with + * the newly freed rt, but in this case + * the above loop must be changed */ + } } + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); +} + +/* qsort() comparison function to peel + * loop nests from inside out */ +static int +carve(const void *a, const void *b) +{ + Blk *ba, *bb; + + /* todo, evaluate if this order is really + * better than the simple postorder */ + ba = *(Blk**)a; + bb = *(Blk**)b; + if (ba->loop == bb->loop) + return ba->id > bb->id ? -1 : ba->id < bb->id; + return ba->loop > bb->loop ? -1 : +1; +} + +/* comparison function to order temporaries + * for allocation at the end of blocks */ +static int +prio2(int t1, int t2) +{ + if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */ + return tmp[t1].visit != -1 ? +1 : -1; + if ((*hint(t1) ^ *hint(t2)) < 0) + return *hint(t1) != -1 ? +1 : -1; + return tmp[t1].cost - tmp[t2].cost; } /* register allocation @@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur) void rega(Fn *fn) { - int j, t, r, r1, x, rl[Tmp0]; - Blk *b, *b1, *s, ***ps, *blist; - RMap *end, *beg, cur, old; + int j, t, r, x, rl[Tmp0]; + Blk *b, *b1, *s, ***ps, *blist, **blk, **bp; + RMap *end, *beg, cur, old, *m; Ins *i; Phi *p; uint u, n; Ref src, dst; /* 1. setup */ + stmov = 0; + stblk = 0; regu = 0; tmp = fn->tmp; mem = fn->mem; + blk = alloc(fn->nblk * sizeof blk[0]); end = alloc(fn->nblk * sizeof end[0]); beg = alloc(fn->nblk * sizeof beg[0]); for (n=0; n<fn->nblk; n++) { @@ -446,8 +497,15 @@ rega(Fn *fn) bsinit(cur.b, fn->ntmp); bsinit(old.b, fn->ntmp); - for (t=0; t<fn->ntmp; t++) - *hint(t) = t < Tmp0 ? t : -1; + loop = INT_MAX; + for (t=0; t<fn->ntmp; t++) { + tmp[t].hint.r = t < Tmp0 ? t : -1; + tmp[t].hint.w = loop; + tmp[t].visit = -1; + } + for (bp=blk, b=fn->start; b; b=b->link) + *bp++ = b; + qsort(blk, fn->nblk, sizeof blk[0], carve); for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++) if (i->op != Ocopy || !isreg(i->arg[0])) break; @@ -456,71 +514,103 @@ rega(Fn *fn) sethint(i->to.val, i->arg[0].val); } - /* 2. assign registers following post-order */ - for (n=fn->nblk-1; n!=-1u; n--) { - b = fn->rpo[n]; + /* 2. assign registers */ + for (bp=blk; bp<&blk[fn->nblk]; bp++) { + b = *bp; + n = b->id; + loop = b->loop; cur.n = 0; bszero(cur.b); - for (x=0; x<2; x++) - for (t=Tmp0; bsiter(b->out, &t); t++) - if (x || (r=*hint(t)) != -1) - if (x || !bshas(cur.b, r)) - ralloc(&cur, t); + memset(cur.w, 0, sizeof cur.w); + for (x=0, t=Tmp0; bsiter(b->out, &t); t++) { + j = x++; + rl[j] = t; + while (j-- > 0 && prio2(t, rl[j]) > 0) { + rl[j+1] = rl[j]; + rl[j] = t; + } + } + for (j=0; j<x; j++) + ralloctry(&cur, rl[j], 1); + for (j=0; j<x; j++) + ralloc(&cur, rl[j]); rcopy(&end[n], &cur); doblk(b, &cur); bscopy(b->in, cur.b); for (p=b->phi; p; p=p->link) - if (rtype(p->to) == RTmp) { + if (rtype(p->to) == RTmp) bsclr(b->in, p->to.val); - /* heuristic 0: - * if the phi destination has an - * argument from a frequent block - * that was already allocated to - * 'r', use 'r' as the new hint - */ - memset(rl, 0, sizeof rl); - for (u=0; u<p->narg; u++) { - t = p->arg[u].val; - b1 = p->blk[u]; - if (rtype(p->arg[u]) == RTmp) - if ((r=rfind(&end[b1->id], t)) != -1) - rl[r] += b1->loop; - } - for (x=0, j=0; j<Tmp0; j++) - if (rl[j] > rl[x]) - x = j; - if (rl[x] >= b->loop) - *hint(p->to.val) = x; + rcopy(&beg[n], &cur); + } + + /* 3. emit copies shared by multiple edges + * to the same block */ + for (s=fn->start; s; s=s->link) { + if (s->npred <= 1) + continue; + m = &beg[s->id]; + + /* rl maps a register that is live at the + * beginning of s to the one used in all + * predecessors (if any, -1 otherwise) */ + memset(rl, 0, sizeof rl); + + /* to find the register of a phi in a + * predecessor, we have to find the + * corresponding argument */ + for (p=s->phi; p; p=p->link) { + r = rfind(m, p->to.val); + if (r == -1) + continue; + for (u=0; u<p->narg; u++) { + b = p->blk[u]; + src = p->arg[u]; + if (rtype(src) != RTmp) + continue; + x = rfind(&end[b->id], src.val); + assert(x != -1); + rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } - if (b->npred > 1) { - /* heuristic 1: - * attempt to satisfy hints - * when it's simple and we have - * multiple predecessors - */ - rcopy(&old, &cur); - curi = &insb[NIns]; - for (j=0; j<old.n; j++) { - t = old.t[j]; - r = *hint(t); - r1 = rfind(&cur, t); - if (r != -1 && r != r1) - if (!bshas(cur.b, r)) { - rfree(&cur, t); - radd(&cur, t, r); - x = tmp[t].cls; - emit(Ocopy, x, TMP(r1), TMP(r), R); - } + if (rl[r] == 0) + rl[r] = -1; + } + + /* process non-phis temporaries */ + for (j=0; j<m->n; j++) { + t = m->t[j]; + r = m->r[j]; + if (rl[r] || t < Tmp0 /* todo, remove this */) + continue; + for (bp=s->pred; bp<&s->pred[s->npred]; bp++) { + x = rfind(&end[(*bp)->id], t); + assert(x != -1); + rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } - if ((j = &insb[NIns] - curi)) { - b->nins += j; - i = alloc(b->nins * sizeof(Ins)); - icpy(icpy(i, curi, j), b->ins, b->nins-j); - b->ins = i; + } + + npm = 0; + for (j=0; j<m->n; j++) { + t = m->t[j]; + r = m->r[j]; + x = rl[r]; + assert(x != 0 || t < Tmp0 /* todo, ditto */); + if (x > 0) { + pmadd(TMP(x), TMP(r), tmp[t].cls); + m->r[j] = x; } } - rcopy(&beg[n], &cur); + curi = &insb[NIns]; + pmgen(); + j = &insb[NIns] - curi; + if (j == 0) + continue; + stmov += j; + s->nins += j; + i = alloc(s->nins * sizeof(Ins)); + icpy(icpy(i, curi, j), s->ins, s->nins-j); + s->ins = i; } + if (debug['R']) { fprintf(stderr, "\n> Register mappings:\n"); for (n=0; n<fn->nblk; n++) { @@ -533,7 +623,7 @@ rega(Fn *fn) fprintf(stderr, "\n"); } - /* 3. compose glue code */ + /* 4. emit remaining copies in new blocks */ blist = 0; for (b=fn->start;; b=b->link) { ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}}; @@ -560,8 +650,9 @@ rega(Fn *fn) dst = rref(&beg[s->id], t); pmadd(src, dst, tmp[t].cls); } + curi = &insb[NIns]; pmgen(); - if (curi == insb) + if (curi == &insb[NIns]) continue; b1 = blknew(); b1->loop = (b->loop+s->loop) / 2; @@ -569,8 +660,10 @@ rega(Fn *fn) blist = b1; fn->nblk++; sprintf(b1->name, "%s_%s", b->name, s->name); - b1->nins = curi - insb; - idup(&b1->ins, insb, b1->nins); + b1->nins = &insb[NIns] - curi; + stmov += b1->nins; + stblk += 1; + idup(&b1->ins, curi, b1->nins); b1->jmp.type = Jjmp; b1->s1 = s; **ps = b1; @@ -585,6 +678,9 @@ rega(Fn *fn) fn->reg = regu; if (debug['R']) { + fprintf(stderr, "\n> Register allocation statistics:\n"); + fprintf(stderr, "\tnew moves: %d\n", stmov); + fprintf(stderr, "\tnew blocks: %d\n", stblk); fprintf(stderr, "\n> After register allocation:\n"); printfn(fn, stderr); } |