diff options
author | Quentin Carbonneaux <quentin.carbonneaux@yale.edu> | 2016-03-28 12:53:53 -0400 |
---|---|---|
committer | Quentin Carbonneaux <quentin.carbonneaux@yale.edu> | 2016-03-29 10:10:22 -0400 |
commit | b75cb8388fb9b5f2393443d008bb46c522c5ec9b (patch) | |
tree | 25268fe5f71d826ee1f8f0e3a2a82aa68f9bf750 /spill.c | |
parent | 1b4943eb1f2a10837f56070bfe604179d0dc10e0 (diff) | |
download | roux-b75cb8388fb9b5f2393443d008bb46c522c5ec9b.tar.gz |
new layout, put LICENSE in root
Diffstat (limited to 'spill.c')
-rw-r--r-- | spill.c | 507 |
1 files changed, 507 insertions, 0 deletions
diff --git a/spill.c b/spill.c new file mode 100644 index 0000000..72f8106 --- /dev/null +++ b/spill.c @@ -0,0 +1,507 @@ +#include "all.h" + +static void +loopmark(Blk *hd, Blk *b, Phi *p) +{ + int k, head; + uint n, a; + + head = hd->id; + if (b->id < head) + return; + for (; p; p=p->link) + for (a=0; a<p->narg; a++) + if (p->blk[a] == b) + if (rtype(p->arg[a]) == RTmp) + bsset(hd->gen, p->arg[a].val); + if (b->visit == head) + return; + b->visit = head; + b->loop *= 10; + /* aggregate looping information at + * loop headers */ + bsunion(hd->gen, b->gen); + for (k=0; k<2; k++) + if (b->nlive[k] > hd->nlive[k]) + hd->nlive[k] = b->nlive[k]; + for (n=0; n<b->npred; n++) + loopmark(hd, b->pred[n], b->phi); +} + +static void +tmpuse(Ref r, int use, int loop, Fn *fn) +{ + Mem *m; + Tmp *t; + + if (rtype(r) == RAMem) { + m = &fn->mem[r.val & AMask]; + tmpuse(m->base, 1, loop, fn); + tmpuse(m->index, 1, loop, fn); + } + else if (rtype(r) == RTmp && r.val >= Tmp0) { + t = &fn->tmp[r.val]; + t->nuse += use; + t->ndef += !use; + t->cost += loop; + } +} + +/* evaluate spill costs of temporaries, + * this also fills usage information + * requires rpo, preds + */ +void +fillcost(Fn *fn) +{ + int n, hd; + uint a; + Blk *b; + Ins *i; + Tmp *t; + Phi *p; + + for (b=fn->start; b; b=b->link) { + b->loop = 1; + b->visit = -1; + } + if (debug['S']) + fprintf(stderr, "\n> Loop information:\n"); + for (n=0; n<fn->nblk; n++) { + b = fn->rpo[n]; + hd = 0; + for (a=0; a<b->npred; a++) + if (b->pred[a]->id >= n) { + loopmark(b, b->pred[a], b->phi); + hd = 1; + } + if (hd && debug['S']) { + fprintf(stderr, "\t%-10s", b->name); + fprintf(stderr, " (% 3d ", b->nlive[0]); + fprintf(stderr, "% 3d) ", b->nlive[1]); + dumpts(b->gen, fn->tmp, stderr); + } + } + for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) { + t->cost = t-fn->tmp < Tmp0 ? 1e6 : 0; + t->nuse = 0; + t->ndef = 0; + } + for (b=fn->start; b; b=b->link) { + for (p=b->phi; p; p=p->link) { + /* todo, the cost computation + * for p->to is not great... */ + tmpuse(p->to, 0, 0, fn); + for (a=0; a<p->narg; a++) { + n = p->blk[a]->loop; + assert(b->npred==p->narg && + "wrong cfg"); + n /= b->npred; + tmpuse(p->arg[a], 1, n, fn); + } + } + n = b->loop; + for (i=b->ins; i-b->ins < b->nins; i++) { + tmpuse(i->to, 0, n, fn); + tmpuse(i->arg[0], 1, n, fn); + tmpuse(i->arg[1], 1, n, fn); + } + tmpuse(b->jmp.arg, 1, n, fn); + } + if (debug['S']) { + fprintf(stderr, "\n> Spill costs:\n"); + for (n=Tmp0; n<fn->ntmp; n++) + fprintf(stderr, "\t%-10s %d\n", + fn->tmp[n].name, + fn->tmp[n].cost); + fprintf(stderr, "\n"); + } +} + +static BSet *fst; /* temps to prioritize in registers (for tcmp1) */ +static Tmp *tmp; /* current temporaries (for tcmpX) */ +static int ntmp; /* current # of temps (for limit) */ +static int locs; /* stack size used by locals */ +static int slot4; /* next slot of 4 bytes */ +static int slot8; /* ditto, 8 bytes */ +static BSet mask[2][1]; /* class masks */ + +static int +tcmp0(const void *pa, const void *pb) +{ + return tmp[*(int *)pb].cost - tmp[*(int *)pa].cost; +} + +static int +tcmp1(const void *pa, const void *pb) +{ + int c; + + c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa); + return c ? c : tcmp0(pa, pb); +} + +static Ref +slot(int t) +{ + int s; + + if (t < Tmp0) + diag("spill: cannot spill register"); + s = tmp[t].slot; + if (s == -1) { + assert(NAlign == 3); + /* nice logic to pack stack slots + * on demand, there can be only + * one hole and slot4 points to it + * + * invariant: slot4 <= slot8 + */ + if (KWIDE(tmp[t].cls)) { + s = slot8; + if (slot4 == slot8) + slot4 += 2; + slot8 += 2; + } else { + s = slot4; + if (slot4 == slot8) { + slot8 += 2; + slot4 += 1; + } else + slot4 = slot8; + } + s += locs; + tmp[t].slot = s; + } + return SLOT(s); +} + +static void +limit(BSet *b, int k, BSet *f) +{ + static int *tarr, maxt; + int i, nt; + uint t; + + nt = bscount(b); + if (nt <= k) + return; + if (nt > maxt) { + free(tarr); + tarr = emalloc(nt * sizeof tarr[0]); + maxt = nt; + } + for (i=0, t=0; bsiter(b, &t); t++) { + bsclr(b, t); + tarr[i++] = t; + } + if (!f) + qsort(tarr, nt, sizeof tarr[0], tcmp0); + else { + fst = f; + qsort(tarr, nt, sizeof tarr[0], tcmp1); + } + for (i=0; i<k && i<nt; i++) + bsset(b, tarr[i]); + for (; i<nt; i++) + slot(tarr[i]); +} + +static void +limit2(BSet *b1, int k1, int k2, BSet *fst) +{ + BSet b2[1]; + + bsinit(b2, ntmp); /* todo, free those */ + bscopy(b2, b1); + bsinter(b1, mask[0]); + bsinter(b2, mask[1]); + limit(b1, NIReg - k1, fst); + limit(b2, NFReg - k2, fst); + bsunion(b1, b2); +} + +static void +sethint(BSet *u, bits r) +{ + uint t; + + for (t=Tmp0; bsiter(u, &t); t++) + tmp[phicls(t, tmp)].hint.m |= r; +} + +static void +reloads(BSet *u, BSet *v) +{ + uint t; + + for (t=Tmp0; bsiter(u, &t); t++) + if (!bshas(v, t)) + emit(OLoad, tmp[t].cls, TMP(t), slot(t), R); +} + +static void +store(Ref r, int s) +{ + static int kstore[] = { + [Kw] = OStorew, [Kl] = OStorel, + [Ks] = OStores, [Kd] = OStored, + }; + + if (s != -1) + emit(kstore[tmp[r.val].cls], 0, R, r, SLOT(s)); +} + +static int +regcpy(Ins *i) +{ + return i->op == OCopy && isreg(i->arg[0]); +} + +static Ins * +dopm(Blk *b, Ins *i, BSet *v) +{ + int n, t; + BSet u[1]; + Ins *i1; + bits r; + + bsinit(u, ntmp); /* todo, free those */ + /* consecutive copies from + * registers need to be handled + * as one large instruction + * + * fixme: there is an assumption + * that calls are always followed + * by copy instructions here, this + * might not be true if previous + * passes change + */ + i1 = ++i; + do { + i--; + t = i->to.val; + if (!req(i->to, R)) + if (bshas(v, t)) { + bsclr(v, t); + store(i->to, tmp[t].slot); + } + bsset(v, i->arg[0].val); + } while (i != b->ins && regcpy(i-1)); + bscopy(u, v); + if (i != b->ins && (i-1)->op == OCall) { + v->t[0] &= ~retregs((i-1)->arg[1], 0); + limit2(v, NISave, NFSave, 0); + for (r=0, n=0; n<NRSave; n++) + r |= BIT(rsave[n]); + v->t[0] |= argregs((i-1)->arg[1], 0); + } else { + limit2(v, 0, 0, 0); + r = v->t[0]; + } + sethint(v, r); + reloads(u, v); + do + emiti(*--i1); + while (i1 != i); + return i; +} + +/* spill code insertion + * requires spill costs, rpo, liveness + * + * Note: this will replace liveness + * information (in, out) with temporaries + * that must be in registers at block + * borders + * + * Be careful with: + * - OCopy instructions to ensure register + * constraints + */ +void +spill(Fn *fn) +{ + Blk *b, *s1, *s2, *hd, **bp; + int j, n, l, t, k, lvarg[2]; + BSet u[1], v[1], w[1]; + Ins *i; + Phi *p; + Mem *m; + bits r; + + tmp = fn->tmp; + ntmp = fn->ntmp; + bsinit(u, ntmp); + bsinit(v, ntmp); + bsinit(w, ntmp); + bsinit(mask[0], ntmp); + bsinit(mask[1], ntmp); + locs = fn->slot; + slot4 = 0; + slot8 = 0; + for (t=0; t<ntmp; t++) { + k = 0; + if (t >= XMM0 && t < XMM0 + NFReg) + k = 1; + else if (t >= Tmp0) + k = KBASE(tmp[t].cls); + bsset(mask[k], t); + } + + for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) { + b = *--bp; + /* invariant: all bocks with bigger rpo got + * their in,out updated. */ + + /* 1. find temporaries in registers at + * the end of the block (put them in v) */ + curi = 0; + s1 = b->s1; + s2 = b->s2; + hd = 0; + if (s1 && s1->id <= n) + hd = s1; + if (s2 && s2->id <= n) + if (!hd || s2->id >= hd->id) + hd = s2; + r = 0; + bszero(v); + if (hd) { + /* back-edge */ + for (k=0; k<2; k++) { + n = k == 0 ? NIReg : NFReg; + bscopy(u, b->out); + bsinter(u, mask[k]); + bscopy(w, u); + bsinter(u, hd->gen); + bsdiff(w, hd->gen); + if ((int)bscount(u) < n) { /* fixme */ + j = bscount(w); /* live through */ + l = hd->nlive[k]; + limit(w, n - (l - j), 0); + bsunion(u, w); + } else + limit(u, n, 0); + bsunion(v, u); + } + } else if (s1) { + liveon(v, b, s1); + if (s2) { + liveon(u, b, s2); + bscopy(w, u); + bsinter(w, v); + bsunion(v, u); + } + limit2(v, 0, 0, w); + } else if (rtype(b->jmp.arg) == RACall) { + /* return */ + r = retregs(b->jmp.arg, 0); + v->t[0] |= r; + } + bscopy(b->out, v); + + /* 2. process the block instructions */ + curi = &insb[NIns]; + for (i=&b->ins[b->nins]; i!=b->ins;) { + i--; + if (regcpy(i)) { + i = dopm(b, i, v); + continue; + } + bszero(w); + if (!req(i->to, R)) { + assert(rtype(i->to) == RTmp); + t = i->to.val; + if (bshas(v, t)) + bsclr(v, t); + else { + /* make sure we have a reg + * for the result */ + bsset(v, t); + bsset(w, t); + } + } + j = opdesc[i->op].nmem; + for (n=0; n<2; n++) + if (rtype(i->arg[n]) == RAMem) + j--; + for (n=0; n<2; n++) + switch (rtype(i->arg[n])) { + case RAMem: + t = i->arg[n].val; + m = &fn->mem[t & AMask]; + if (rtype(m->base) == RTmp) { + bsset(v, m->base.val); + bsset(w, m->base.val); + } + if (rtype(m->index) == RTmp) { + bsset(v, m->index.val); + bsset(w, m->index.val); + } + break; + case RTmp: + t = i->arg[n].val; + lvarg[n] = bshas(v, t); + bsset(v, t); + if (j-- <= 0) + bsset(w, t); + break; + } + bscopy(u, v); + limit2(v, 0, 0, w); + for (n=0; n<2; n++) + if (rtype(i->arg[n]) == RTmp) { + t = i->arg[n].val; + if (!bshas(v, t)) { + /* do not reload if the + * the temporary was dead + */ + if (!lvarg[n]) + bsclr(u, t); + i->arg[n] = slot(t); + } + } + reloads(u, v); + if (!req(i->to, R)) { + t = i->to.val; + store(i->to, tmp[t].slot); + bsclr(v, t); + } + emiti(*i); + r = v->t[0] & (BIT(Tmp0)-1); + if (r) + sethint(v, r); + } + assert(!r || b==fn->start); + + for (p=b->phi; p; p=p->link) { + assert(rtype(p->to) == RTmp); + t = p->to.val; + if (bshas(v, t)) { + bsclr(v, t); + store(p->to, tmp[t].slot); + } else if (bshas(b->in, t)) + /* only if the phi is live */ + p->to = slot(p->to.val); + } + bscopy(b->in, v); + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } + + /* align the locals to a 16 byte boundary */ + assert(NAlign == 3); + slot8 += slot8 & 3; + fn->slot += slot8; + + if (debug['S']) { + fprintf(stderr, "\n> Block information:\n"); + for (b=fn->start; b; b=b->link) { + printf("\t%-10s (% 5d) ", b->name, b->loop); + dumpts(b->out, fn->tmp, stdout); + } + fprintf(stderr, "\n> After spilling:\n"); + printfn(fn, stderr); + } +} |