diff options
Diffstat (limited to 'isel.c')
-rw-r--r-- | isel.c | 1136 |
1 files changed, 1136 insertions, 0 deletions
diff --git a/isel.c b/isel.c new file mode 100644 index 0000000..2a55733 --- /dev/null +++ b/isel.c @@ -0,0 +1,1136 @@ +#include "all.h" +#include <limits.h> + +/* For x86_64, do the following: + * + * - lower calls + * - check that constants are used only in + * places allowed + * - ensure immediates always fit in 32b + * - explicit machine register contraints + * on instructions like division. + * - implement fast locals (the streak of + * constant allocX in the first basic block) + * - recognize complex addressing modes + * + * Invariant: the use counts that are used + * in sel() must be sound. This + * is not so trivial, maybe the + * dce should be moved out... + */ + +typedef struct ANum ANum; +typedef struct AClass AClass; +typedef struct RAlloc RAlloc; + +struct ANum { + char n, l, r; + Ins *i; + Ref mem; +}; + +static void amatch(Addr *, Ref, ANum *, Fn *, int); + +static int +fcmptoi(int fc) +{ + switch (fc) { + default: diag("isel: fcmptoi defaulted"); + case FCle: return ICule; + case FClt: return ICult; + case FCgt: return ICugt; + case FCge: return ICuge; + case FCne: return ICne; + case FCeq: return ICeq; + case FCo: return ICXnp; + case FCuo: return ICXp; + } +} + +static int +iscmp(int op, int *pk, int *pc) +{ + int k, c; + + if (OCmpw <= op && op <= OCmpw1) { + c = op - OCmpw; + k = Kw; + } + else if (OCmpl <= op && op <= OCmpl1) { + c = op - OCmpl; + k = Kl; + } + else if (OCmps <= op && op <= OCmps1) { + c = fcmptoi(op - OCmps); + k = Ks; + } + else if (OCmpd <= op && op <= OCmpd1) { + c = fcmptoi(op - OCmpd); + k = Kd; + } + else + return 0; + if (pk) + *pk = k; + if (pc) + *pc = c; + return 1; +} + +static int +noimm(Ref r, Fn *fn) +{ + int64_t val; + + if (rtype(r) != RCon) + return 0; + switch (fn->con[r.val].type) { + default: + diag("isel: invalid constant"); + case CAddr: + /* we only support the 'small' + * code model of the ABI, this + * means that we can always + * address data with 32bits + */ + return 0; + case CBits: + val = fn->con[r.val].bits.i; + return (val < INT32_MIN || val > INT32_MAX); + } +} + +static int +rslot(Ref r, Fn *fn) +{ + if (rtype(r) != RTmp) + return -1; + return fn->tmp[r.val].slot; +} + +static int +argcls(Ins *i, int n) +{ + return opdesc[i->op].argcls[n][i->cls]; +} + +static void +fixarg(Ref *r, int k, int phi, Fn *fn) +{ + Addr a; + Ref r0, r1; + int s, n; + + r1 = r0 = *r; + s = rslot(r0, fn); + if (KBASE(k) == 1 && rtype(r0) == RCon) { + /* load floating points from memory + * slots, they can't be used as + * immediates + */ + r1 = MEM(fn->nmem); + vgrow(&fn->mem, ++fn->nmem); + memset(&a, 0, sizeof a); + a.offset.type = CAddr; + a.offset.local = 1; + n = stashfp(fn->con[r0.val].bits.i, KWIDE(k)); + sprintf(a.offset.label, "fp%d", n); + fn->mem[fn->nmem-1] = a; + } + else if (!phi && k == Kl && noimm(r0, fn)) { + /* load constants that do not fit in + * a 32bit signed integer into a + * long temporary + */ + r1 = newtmp("isel", Kl, fn); + emit(OCopy, Kl, r1, r0, R); + } + else if (s != -1) { + /* load fast locals' addresses into + * temporaries right before the + * instruction + */ + r1 = newtmp("isel", Kl, fn); + emit(OAddr, Kl, r1, SLOT(s), R); + } + *r = r1; +} + +static void +chuse(Ref r, int du, Fn *fn) +{ + if (rtype(r) == RTmp) + fn->tmp[r.val].nuse += du; +} + +static void +seladdr(Ref *r, ANum *an, Fn *fn) +{ + Addr a; + Ref r0, r1; + + r0 = *r; + if (rtype(r0) == RTmp) { + chuse(r0, -1, fn); + r1 = an[r0.val].mem; + if (req(r1, R)) { + amatch(&a, r0, an, fn, 1); + vgrow(&fn->mem, ++fn->nmem); + fn->mem[fn->nmem-1] = a; + r1 = MEM(fn->nmem-1); + chuse(a.base, +1, fn); + chuse(a.index, +1, fn); + if (rtype(a.base) != RTmp) + if (rtype(a.index) != RTmp) + an[r0.val].mem = r1; + } + *r = r1; + } +} + +static void +selcmp(Ref arg[2], int k, Fn *fn) +{ + Ref r; + + if (rtype(arg[0]) == RCon) { + r = arg[1]; + arg[1] = arg[0]; + arg[0] = r; + } + assert(rtype(arg[0]) != RCon); + emit(OXCmp, k, R, arg[1], arg[0]); + fixarg(&curi->arg[0], k, 0, fn); +} + +static void +sel(Ins i, ANum *an, Fn *fn) +{ + Ref r0, r1; + int x, k, kc; + int64_t val; + Ins *i0; + + if (rtype(i.to) == RTmp) + if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1])) + if (fn->tmp[i.to.val].nuse == 0) { + chuse(i.arg[0], -1, fn); + chuse(i.arg[1], -1, fn); + return; + } + i0 = curi; + k = i.cls; + switch (i.op) { + case ODiv: + case ORem: + case OUDiv: + case OURem: + if (i.op == ODiv || i.op == OUDiv) + r0 = TMP(RAX), r1 = TMP(RDX); + else + r0 = TMP(RDX), r1 = TMP(RAX); + emit(OCopy, k, i.to, r0, R); + emit(OCopy, k, R, r1, R); + if (rtype(i.arg[1]) == RCon) { + /* immediates not allowed for + * divisions in x86 + */ + r0 = newtmp("isel", k, fn); + } else + r0 = i.arg[1]; + if (i.op == ODiv || i.op == ORem) { + emit(OXIDiv, k, R, r0, R); + emit(OSign, k, TMP(RDX), TMP(RAX), R); + } else { + emit(OXDiv, k, R, r0, R); + emit(OCopy, k, TMP(RDX), CON_Z, R); + } + emit(OCopy, k, TMP(RAX), i.arg[0], R); + if (rtype(i.arg[1]) == RCon) + emit(OCopy, k, r0, i.arg[1], R); + break; + case OSar: + case OShr: + case OShl: + if (rtype(i.arg[1]) == RCon) + goto Emit; + r0 = i.arg[1]; + i.arg[1] = TMP(RCX); + emit(OCopy, Kw, R, TMP(RCX), R); + emiti(i); + emit(OCopy, Kw, TMP(RCX), r0, R); + break; + case ONop: + break; + case OStored: + case OStores: + case OStorel: + case OStorew: + case OStoreh: + case OStoreb: + if (rtype(i.arg[0]) == RCon) { + if (i.op == OStored) + i.op = OStorel; + if (i.op == OStores) + i.op = OStorew; + } + seladdr(&i.arg[1], an, fn); + goto Emit; + case_OLoad: + seladdr(&i.arg[0], an, fn); + goto Emit; + case OCall: + case OSAlloc: + case OCopy: + case OAdd: + case OSub: + case OMul: + case OAnd: + case OOr: + case OXor: + case OXTest: + case OFtosi: + case OSitof: + case OExts: + case OTruncd: + case OCast: + case_OExt: +Emit: + emiti(i); + fixarg(&curi->arg[0], argcls(curi, 0), 0, fn); + fixarg(&curi->arg[1], argcls(curi, 1), 0, fn); + break; + case OAlloc: + case OAlloc+1: + case OAlloc+2: /* == OAlloc1 */ + /* we need to make sure + * the stack remains aligned + * (rsp = 0) mod 16 + */ + if (rtype(i.arg[0]) == RCon) { + assert(fn->con[i.arg[0].val].type == CBits); + val = fn->con[i.arg[0].val].bits.i; + val = (val + 15) & ~INT64_C(15); + if (val < 0 || val > INT32_MAX) + diag("isel: alloc too large"); + emit(OSAlloc, Kl, i.to, getcon(val, fn), R); + } else { + /* r0 = (i.arg[0] + 15) & -16 */ + r0 = newtmp("isel", Kl, fn); + r1 = newtmp("isel", Kl, fn); + emit(OSAlloc, Kl, i.to, r0, R); + emit(OAnd, Kl, r0, r1, getcon(-16, fn)); + emit(OAdd, Kl, r1, i.arg[0], getcon(15, fn)); + } + break; + default: + if (isext(i.op)) + goto case_OExt; + if (isload(i.op)) + goto case_OLoad; + if (iscmp(i.op, &kc, &x)) { + if (rtype(i.arg[0]) == RCon) + x = icmpop(x); + emit(OXSet+x, k, i.to, R, R); + selcmp(i.arg, kc, fn); + break; + } + diag("isel: non-exhaustive implementation"); + } + + while (i0 > curi && --i0) + if (rslot(i0->arg[0], fn) != -1 + || rslot(i0->arg[1], fn) != -1) + diag("isel: usupported address argument"); +} + +static Ins * +flagi(Ins *i0, Ins *i) +{ + while (i>i0) { + i--; + if (opdesc[i->op].sflag) + return i; + if (opdesc[i->op].lflag) + continue; + return 0; + } + return 0; +} + +struct AClass { + int inmem; + int align; + uint size; + int cls[2]; +}; + +static void +aclass(AClass *a, Typ *t) +{ + int e, s, n, cls; + uint sz, al; + + sz = t->size; + al = 1u << t->align; + + /* the ABI requires sizes to be rounded + * up to the nearest multiple of 8, moreover + * it makes it easy load and store structures + * in registers + */ + if (al < 8) + al = 8; + sz = (sz + al-1) & -al; + + a->size = sz; + a->align = t->align; + + if (t->dark || sz > 16) { + /* large or unaligned structures are + * required to be passed in memory + */ + a->inmem = 1; + return; + } + + a->inmem = 0; + for (e=0, s=0; e<2; e++) { + cls = -1; + for (n=0; n<8 && t->seg[s].len; s++) { + if (t->seg[s].ispad) { + /* don't change anything */ + } + else if (t->seg[s].isflt) { + if (cls == -1) + cls = Kd; + } + else + cls = Kl; + n += t->seg[s].len; + } + assert(n <= 8); + a->cls[e] = cls; + } +} + +static void +blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn) +{ + Ref r, r1; + uint boff; + + /* it's an impolite blit, we might go across the end + * of the source object a little bit... */ + for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) { + r = newtmp("abi", Kl, fn); + r1 = newtmp("abi", Kl, fn); + emit(OStorel, 0, R, r, r1); + emit(OAdd, Kl, r1, rstk, getcon(soff, fn)); + r1 = newtmp("abi", Kl, fn); + emit(OLoad, Kl, r, r1, R); + emit(OAdd, Kl, r1, rsrc, getcon(boff, fn)); + chuse(rsrc, +1, fn); + chuse(rstk, +1, fn); + } +} + +static int +retr(Ref reg[2], AClass *aret) +{ + static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}}; + int n, k, ca, nr[2]; + + nr[0] = nr[1] = 0; + ca = 0; + for (n=0; aret->cls[n]>=0 && n<2; n++) { + k = KBASE(aret->cls[n]); + reg[n] = TMP(retreg[k][nr[k]++]); + ca += 1 << (2 * k); + } + return ca; +} + +static void +selret(Blk *b, Fn *fn) +{ + int j, k, ca; + Ref r, r0, reg[2]; + AClass aret; + + j = b->jmp.type; + + if (!isret(j) || j == JRet0) + return; + + r0 = b->jmp.arg; + b->jmp.type = JRet0; + + if (j == JRetc) { + aclass(&aret, &typ[fn->retty]); + if (aret.inmem) { + assert(rtype(fn->retr) == RTmp); + emit(OCopy, Kl, TMP(RAX), fn->retr, R); + chuse(fn->retr, +1, fn); + blit(fn->retr, 0, r0, aret.size, fn); + ca = 1; + } else { + ca = retr(reg, &aret); + if (aret.size > 8) { + r = newtmp("abi", Kl, fn); + emit(OLoad, Kl, reg[1], r, R); + emit(OAdd, Kl, r, r0, getcon(8, fn)); + chuse(r0, +1, fn); + } + emit(OLoad, Kl, reg[0], r0, R); + } + } else { + k = j - JRetw; + if (KBASE(k) == 0) { + emit(OCopy, k, TMP(RAX), r0, R); + ca = 1; + } else { + emit(OCopy, k, TMP(XMM0), r0, R); + ca = 1 << 2; + } + } + + b->jmp.arg = CALL(ca); +} + +static void +seljmp(Blk *b, Fn *fn) +{ + Ref r; + int c, k; + Ins *fi; + + if (b->jmp.type == JRet0 || b->jmp.type == JJmp) + return; + assert(b->jmp.type == JJnz); + r = b->jmp.arg; + b->jmp.arg = R; + assert(!req(r, R)); + if (rtype(r) == RCon) { + b->jmp.type = JJmp; + if (req(r, CON_Z)) + b->s1 = b->s2; + b->s2 = 0; + return; + } + fi = flagi(b->ins, &b->ins[b->nins]); + if (fi && req(fi->to, r)) { + if (iscmp(fi->op, &k, &c)) { + if (rtype(fi->arg[0]) == RCon) + c = icmpop(c); + b->jmp.type = JXJc + c; + if (fn->tmp[r.val].nuse == 1) { + assert(fn->tmp[r.val].ndef == 1); + selcmp(fi->arg, k, fn); + *fi = (Ins){.op = ONop}; + } + return; + } + if (fi->op == OAnd && fn->tmp[r.val].nuse == 1 + && (rtype(fi->arg[0]) == RTmp || + rtype(fi->arg[1]) == RTmp)) { + fi->op = OXTest; + fi->to = R; + b->jmp.type = JXJc + ICne; + if (rtype(fi->arg[1]) == RCon) { + r = fi->arg[1]; + fi->arg[1] = fi->arg[0]; + fi->arg[0] = r; + } + return; + } + /* since flags are not tracked in liveness, + * the result of the flag-setting instruction + * has to be marked as live + */ + if (fn->tmp[r.val].nuse == 1) + emit(OCopy, Kw, R, r, R); + b->jmp.type = JXJc + ICne; + return; + } + selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, add long branch if non-zero */ + b->jmp.type = JXJc + ICne; +} + +static int +classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret) +{ + int nint, ni, nsse, ns, n, *pn; + AClass *a; + Ins *i; + + if (aret && aret->inmem) + nint = 5; /* hidden argument */ + else + nint = 6; + nsse = 8; + for (i=i0, a=ac; i<i1; i++, a++) { + if (i->op == op) { + if (KBASE(i->cls) == 0) + pn = &nint; + else + pn = &nsse; + if (*pn > 0) { + --*pn; + a->inmem = 0; + } else + a->inmem = 2; + a->align = 3; + a->size = 8; + a->cls[0] = i->cls; + } else { + n = i->arg[0].val & AMask; + aclass(a, &typ[n]); + if (a->inmem) + continue; + ni = ns = 0; + for (n=0; n<2; n++) + if (KBASE(a->cls[n]) == 0) + ni++; + else + ns++; + if (nint >= ni && nsse >= ns) { + nint -= ni; + nsse -= ns; + } else + a->inmem = 1; + } + } + + return ((6-nint) << 4) | ((8-nsse) << 8); +} + +int rsave[] = { + RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14 +}; +int rclob[] = {RBX, R12, R13, R14, R15}; + +MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int)); +MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int)); + +bits +retregs(Ref r, int p[2]) +{ + bits b; + int ni, nf; + + assert(rtype(r) == RACall); + b = 0; + ni = r.val & 3; + nf = (r.val >> 2) & 3; + if (ni >= 1) + b |= BIT(RAX); + if (ni >= 2) + b |= BIT(RDX); + if (nf >= 1) + b |= BIT(XMM0); + if (nf >= 2) + b |= BIT(XMM1); + if (p) { + p[0] = ni; + p[1] = nf; + } + return b; +} + +bits +argregs(Ref r, int p[2]) +{ + bits b; + int j, ni, nf; + + assert(rtype(r) == RACall); + b = 0; + ni = (r.val >> 4) & 15; + nf = (r.val >> 8) & 15; + for (j=0; j<ni; j++) + b |= BIT(rsave[j]); + for (j=0; j<nf; j++) + b |= BIT(XMM0+j); + if (p) { + p[0] = ni + 1; + p[1] = nf; + } + return b | BIT(RAX); +} + +static Ref +rarg(int ty, int *ni, int *ns) +{ + if (KBASE(ty) == 0) + return TMP(rsave[(*ni)++]); + else + return TMP(XMM0 + (*ns)++); +} + +struct RAlloc { + Ins i; + RAlloc *link; +}; + +static void +selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap) +{ + Ins *i; + AClass *ac, *a, aret; + int ca, ni, ns; + uint stk, off; + Ref r, r1, r2, reg[2], regcp[2]; + RAlloc *ra; + + ac = alloc((i1-i0) * sizeof ac[0]); + if (!req(i1->arg[1], R)) { + assert(rtype(i1->arg[1]) == RAType); + aclass(&aret, &typ[i1->arg[1].val & AMask]); + ca = classify(i0, i1, ac, OArg, &aret); + } else + ca = classify(i0, i1, ac, OArg, 0); + + for (stk=0, a=&ac[i1-i0]; a>ac;) + if ((--a)->inmem) { + assert(a->align <= 4); + stk += a->size; + if (a->align == 4) + stk += stk & 15; + } + stk += stk & 15; + if (stk) { + r = getcon(-(int64_t)stk, fn); + emit(OSAlloc, Kl, R, r, R); + } + + if (!req(i1->arg[1], R)) { + if (aret.inmem) { + /* get the return location from eax + * it saves one callee-save reg */ + r1 = newtmp("abi", Kl, fn); + emit(OCopy, Kl, i1->to, TMP(RAX), R); + ca += 1; + } else { + if (aret.size > 8) { + r = newtmp("abi", Kl, fn); + regcp[1] = newtmp("abi", aret.cls[1], fn); + emit(OStorel, 0, R, regcp[1], r); + emit(OAdd, Kl, r, i1->to, getcon(8, fn)); + chuse(i1->to, +1, fn); + ca += 1 << (2 * KBASE(aret.cls[1])); + } + regcp[0] = newtmp("abi", aret.cls[0], fn); + emit(OStorel, 0, R, regcp[0], i1->to); + ca += 1 << (2 * KBASE(aret.cls[0])); + retr(reg, &aret); + if (aret.size > 8) + emit(OCopy, aret.cls[1], regcp[1], reg[1], R); + emit(OCopy, aret.cls[0], regcp[0], reg[0], R); + r1 = i1->to; + } + /* allocate return pad */ + ra = alloc(sizeof *ra); + assert(NAlign == 3); + aret.align -= 2; + if (aret.align < 0) + aret.align = 0; + ra->i.op = OAlloc + aret.align; + ra->i.cls = Kl; + ra->i.to = r1; + ra->i.arg[0] = getcon(aret.size, fn); + ra->link = (*rap); + *rap = ra; + } else { + ra = 0; + if (KBASE(i1->cls) == 0) { + emit(OCopy, i1->cls, i1->to, TMP(RAX), R); + ca += 1; + } else { + emit(OCopy, i1->cls, i1->to, TMP(XMM0), R); + ca += 1 << 2; + } + } + emit(OCall, i1->cls, R, i1->arg[0], CALL(ca)); + emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R); + + ni = ns = 0; + if (ra && aret.inmem) + emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */ + for (i=i0, a=ac; i<i1; i++, a++) { + if (a->inmem) + continue; + r1 = rarg(a->cls[0], &ni, &ns); + if (i->op == OArgc) { + if (a->size > 8) { + r2 = rarg(a->cls[1], &ni, &ns); + r = newtmp("abi", Kl, fn); + emit(OLoad, a->cls[1], r2, r, R); + emit(OAdd, Kl, r, i->arg[1], getcon(8, fn)); + chuse(i->arg[1], +1, fn); + } + emit(OLoad, a->cls[0], r1, i->arg[1], R); + } else + emit(OCopy, i->cls, r1, i->arg[0], R); + } + + if (!stk) + return; + + r = newtmp("abi", Kl, fn); + chuse(r, -1, fn); + for (i=i0, a=ac, off=0; i<i1; i++, a++) { + if (!a->inmem) + continue; + if (i->op == OArgc) { + if (a->align == 4) + off += off & 15; + blit(r, off, i->arg[1], a->size, fn); + } else { + r1 = newtmp("abi", Kl, fn); + emit(OStorel, 0, R, i->arg[0], r1); + emit(OAdd, Kl, r1, r, getcon(off, fn)); + chuse(r, +1, fn); + } + off += a->size; + } + emit(OSAlloc, Kl, r, getcon(stk, fn), R); +} + +static void +selpar(Fn *fn, Ins *i0, Ins *i1) +{ + AClass *ac, *a, aret; + Ins *i; + int ni, ns, s, al; + Ref r, r1; + + ac = alloc((i1-i0) * sizeof ac[0]); + curi = insb; + ni = ns = 0; + + if (fn->retty >= 0) { + aclass(&aret, &typ[fn->retty]); + if (aret.inmem) { + r = newtmp("abi", Kl, fn); + *curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl}; + fn->retr = r; + } + classify(i0, i1, ac, OPar, &aret); + } else + classify(i0, i1, ac, OPar, 0); + + assert(NAlign == 3); + + s = 4; + for (i=i0, a=ac; i<i1; i++, a++) { + switch (a->inmem) { + case 1: + assert(a->align <= 4); + if (a->align == 4) + s = (s+3) & -4; + fn->tmp[i->to.val].slot = -s; /* HACK! */ + s += a->size / 4; + continue; + case 2: + *curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls}; + s += 2; + continue; + } + r1 = rarg(a->cls[0], &ni, &ns); + if (i->op == OParc) { + r = newtmp("abi", Kl, fn); + *curi++ = (Ins){OCopy, r, {r1}, Kl}; + a->cls[0] = r.val; + if (a->size > 8) { + r1 = rarg(a->cls[1], &ni, &ns); + r = newtmp("abi", Kl, fn); + *curi++ = (Ins){OCopy, r, {r1}, Kl}; + a->cls[1] = r.val; + } + } else + *curi++ = (Ins){OCopy, i->to, {r1}, i->cls}; + } + for (i=i0, a=ac; i<i1; i++, a++) { + if (i->op != OParc || a->inmem) + continue; + assert(NAlign == 3); + for (al=0; a->align >> (al+2); al++) + ; + r = TMP(a->cls[0]); + r1 = i->to; + *curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl}; + *curi++ = (Ins){OStorel, R, {r, r1}, 0}; + if (a->size > 8) { + r = newtmp("abi", Kl, fn); + *curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl}; + r1 = TMP(a->cls[1]); + *curi++ = (Ins){OStorel, R, {r1, r}, 0}; + } + } +} + +static int +aref(Ref r, ANum *ai) +{ + switch (rtype(r)) { + default: + diag("isel: aref defaulted"); + case RCon: + return 2; + case RTmp: + return ai[r.val].n; + } +} + +static int +ascale(Ref r, Con *con) +{ + int64_t n; + + if (rtype(r) != RCon) + return 0; + if (con[r.val].type != CBits) + return 0; + n = con[r.val].bits.i; + return n == 1 || n == 2 || n == 4 || n == 8; +} + +static void +anumber(ANum *ai, Blk *b, Con *con) +{ + /* This should be made obsolete by a proper + * reassoc pass. + * + * Rules: + * + * RTmp(_) -> 0 tmp + * ( RTmp(_) -> 1 slot ) + * RCon(_) -> 2 con + * 0 * 2 -> 3 s * i (when constant is 1,2,4,8) + */ + static char add[10][10] = { + [2] [2] = 2, /* folding */ + [2] [5] = 5, [5] [2] = 5, + [2] [6] = 6, [6] [2] = 6, + [2] [7] = 7, [7] [2] = 7, + [0] [0] = 4, /* 4: b + s * i */ + [0] [3] = 4, [3] [0] = 4, + [2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */ + [0] [2] = 6, [2] [0] = 6, /* 6: o + b */ + [2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */ + [0] [5] = 7, [5] [0] = 7, + [6] [3] = 7, [3] [6] = 7, + + }; + int a, a1, a2, n1, n2, t1, t2; + Ins *i; + + for (i=b->ins; i-b->ins < b->nins; i++) { + if (rtype(i->to) == RTmp) + ai[i->to.val].i = i; + if (i->op != OAdd && i->op != OMul) + continue; + a1 = aref(i->arg[0], ai); + a2 = aref(i->arg[1], ai); + t1 = a1 != 1 && a1 != 2; + t2 = a2 != 1 && a2 != 2; + if (i->op == OAdd) { + a = add[n1 = a1][n2 = a2]; + if (t1 && a < add[0][a2]) + a = add[n1 = 0][n2 = a2]; + if (t2 && a < add[a1][0]) + a = add[n1 = a1][n2 = 0]; + if (t1 && t2 && a < add[0][0]) + a = add[n1 = 0][n2 = 0]; + } else { + n1 = n2 = a = 0; + if (ascale(i->arg[0], con) && t2) + a = 3, n1 = 2, n2 = 0; + if (t1 && ascale(i->arg[1], con)) + a = 3, n1 = 0, n2 = 2; + } + ai[i->to.val].n = a; + ai[i->to.val].l = n1; + ai[i->to.val].r = n2; + } +} + +static void +amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top) +{ + Ins *i; + int nl, nr, t, s; + Ref al, ar; + + if (top) + memset(a, 0, sizeof *a); + if (rtype(r) == RCon) { + addcon(&a->offset, &fn->con[r.val]); + return; + } + assert(rtype(r) == RTmp); + i = ai[r.val].i; + nl = ai[r.val].l; + nr = ai[r.val].r; + if (i) { + if (nl > nr) { + al = i->arg[1]; + ar = i->arg[0]; + t = nl, nl = nr, nr = t; + } else { + al = i->arg[0]; + ar = i->arg[1]; + } + } + switch (ai[r.val].n) { + default: + diag("isel: amatch defaulted"); + case 3: /* s * i */ + if (!top) { + a->index = al; + a->scale = fn->con[ar.val].bits.i; + } else + a->base = r; + break; + case 4: /* b + s * i */ + switch (nr) { + case 0: + if (fn->tmp[ar.val].slot != -1) { + al = i->arg[1]; + ar = i->arg[0]; + } + a->index = ar; + a->scale = 1; + break; + case 3: + amatch(a, ar, ai, fn, 0); + break; + } + r = al; + case 0: + s = fn->tmp[r.val].slot; + if (s != -1) + r = SLOT(s); + a->base = r; + break; + case 2: /* constants */ + case 5: /* o + s * i */ + case 6: /* o + b */ + case 7: /* o + b + s * i */ + amatch(a, ar, ai, fn, 0); + amatch(a, al, ai, fn, 0); + break; + } +} + +/* instruction selection + * requires use counts (as given by parsing) + */ +void +isel(Fn *fn) +{ + Blk *b, **sb; + Ins *i, *i0, *ip; + Phi *p; + uint a; + int n, al; + int64_t sz; + ANum *ainfo; + RAlloc *ral; + + for (n=0; n<fn->ntmp; n++) + fn->tmp[n].slot = -1; + fn->slot = 0; + + /* lower arguments */ + for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++) + if (i->op != OPar && i->op != OParc) + break; + selpar(fn, b->ins, i); + n = b->nins - (i - b->ins) + (curi - insb); + i0 = alloc(n * sizeof(Ins)); + ip = icpy(ip = i0, insb, curi - insb); + ip = icpy(ip, i, &b->ins[b->nins] - i); + b->nins = n; + b->ins = i0; + + /* lower function calls and returns */ + ral = 0; + b = fn->start; + do { + if (!(b = b->link)) + b = fn->start; /* do it last */ + curi = &insb[NIns]; + selret(b, fn); + for (i=&b->ins[b->nins]; i!=b->ins;) { + if ((--i)->op == OCall) { + for (i0=i; i0>b->ins; i0--) + if ((i0-1)->op != OArg) + if ((i0-1)->op != OArgc) + break; + selcall(fn, i0, i, &ral); + i = i0; + continue; + } + assert(i->op != OArg && i->op != OArgc); + emiti(*i); + } + if (b == fn->start) + for (; ral; ral=ral->link) + emiti(ral->i); + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } while (b != fn->start); + + if (debug['A']) { + fprintf(stderr, "\n> After call lowering:\n"); + printfn(fn, stderr); + } + + /* assign slots to fast allocs */ + b = fn->start; + assert(NAlign == 3 && "change n=4 and sz /= 4 below"); + for (al=OAlloc, n=4; al<=OAlloc1; al++, n*=2) + for (i=b->ins; i-b->ins < b->nins; i++) + if (i->op == al) { + if (rtype(i->arg[0]) != RCon) + break; + sz = fn->con[i->arg[0].val].bits.i; + if (sz < 0 || sz >= INT_MAX-3) + diag("isel: invalid alloc size"); + sz = (sz + n-1) & -n; + sz /= 4; + fn->tmp[i->to.val].slot = fn->slot; + fn->slot += sz; + *i = (Ins){.op = ONop}; + } + + /* process basic blocks */ + n = fn->ntmp; + ainfo = emalloc(n * sizeof ainfo[0]); + for (b=fn->start; b; b=b->link) { + curi = &insb[NIns]; + for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++) + for (p=(*sb)->phi; p; p=p->link) { + for (a=0; p->blk[a] != b; a++) + assert(a+1 < p->narg); + fixarg(&p->arg[a], p->cls, 1, fn); + } + memset(ainfo, 0, n * sizeof ainfo[0]); + anumber(ainfo, b, fn->con); + seljmp(b, fn); + for (i=&b->ins[b->nins]; i!=b->ins;) + sel(*--i, ainfo, fn); + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } + free(ainfo); + + if (debug['I']) { + fprintf(stderr, "\n> After instruction selection:\n"); + printfn(fn, stderr); + } +} |