diff options
Diffstat (limited to 'amd64/isel.c')
-rw-r--r-- | amd64/isel.c | 603 |
1 files changed, 603 insertions, 0 deletions
diff --git a/amd64/isel.c b/amd64/isel.c new file mode 100644 index 0000000..1623b9b --- /dev/null +++ b/amd64/isel.c @@ -0,0 +1,603 @@ +#include "all.h" +#include <limits.h> + +/* For x86_64, do the following: + * + * - check that constants are used only in + * places allowed + * - ensure immediates always fit in 32b + * - expose machine register contraints + * on instructions like division. + * - implement fast locals (the streak of + * constant allocX in the first basic block) + * - recognize complex addressing modes + * + * Invariant: the use counts that are used + * in sel() must be sound. This + * is not so trivial, maybe the + * dce should be moved out... + */ + +typedef struct ANum ANum; + +struct ANum { + char n, l, r; + Ins *i; +}; + +static void amatch(Addr *, Ref, ANum *, Fn *, int); + +static int +noimm(Ref r, Fn *fn) +{ + int64_t val; + + if (rtype(r) != RCon) + return 0; + switch (fn->con[r.val].type) { + case CAddr: + /* we only support the 'small' + * code model of the ABI, this + * means that we can always + * address data with 32bits + */ + return 0; + case CBits: + val = fn->con[r.val].bits.i; + return (val < INT32_MIN || val > INT32_MAX); + default: + die("invalid constant"); + } +} + +static int +rslot(Ref r, Fn *fn) +{ + if (rtype(r) != RTmp) + return -1; + return fn->tmp[r.val].slot; +} + +static void +fixarg(Ref *r, int k, int cpy, Fn *fn) +{ + Addr a, *m; + Ref r0, r1; + int s, n; + + r1 = r0 = *r; + s = rslot(r0, fn); + if (KBASE(k) == 1 && rtype(r0) == RCon) { + /* load floating points from memory + * slots, they can't be used as + * immediates + */ + r1 = MEM(fn->nmem); + vgrow(&fn->mem, ++fn->nmem); + memset(&a, 0, sizeof a); + a.offset.type = CAddr; + a.offset.local = 1; + n = gasstashfp(fn->con[r0.val].bits.i, KWIDE(k)); + sprintf(a.offset.label, "fp%d", n); + fn->mem[fn->nmem-1] = a; + } + else if (!cpy && k == Kl && noimm(r0, fn)) { + /* load constants that do not fit in + * a 32bit signed integer into a + * long temporary + */ + r1 = newtmp("isel", Kl, fn); + emit(Ocopy, Kl, r1, r0, R); + } + else if (s != -1) { + /* load fast locals' addresses into + * temporaries right before the + * instruction + */ + r1 = newtmp("isel", Kl, fn); + emit(Oaddr, Kl, r1, SLOT(s), R); + } + else if (rtype(r0) == RMem) { + /* apple asm fix */ + m = &fn->mem[r0.val]; + if (req(m->base, R)) { + n = fn->ncon; + vgrow(&fn->con, ++fn->ncon); + fn->con[n] = m->offset; + m->offset.type = CUndef; + r0 = newtmp("isel", Kl, fn); + emit(Oaddr, Kl, r0, CON(n), R); + m->base = r0; + } + } + *r = r1; +} + +static void +seladdr(Ref *r, ANum *an, Fn *fn) +{ + Addr a; + Ref r0; + + r0 = *r; + if (rtype(r0) == RTmp) { + amatch(&a, r0, an, fn, 1); + if (req(a.base, r0)) + return; + if (a.offset.type == CAddr) + if (!req(a.base, R)) { + /* apple asm fix */ + if (!req(a.index, R)) + return; + else { + a.index = a.base; + a.scale = 1; + a.base = R; + } + } + chuse(r0, -1, fn); + vgrow(&fn->mem, ++fn->nmem); + fn->mem[fn->nmem-1] = a; + chuse(a.base, +1, fn); + chuse(a.index, +1, fn); + *r = MEM(fn->nmem-1); + } +} + +static int +selcmp(Ref arg[2], int k, Fn *fn) +{ + int swap; + Ref r, *iarg; + + swap = rtype(arg[0]) == RCon; + if (swap) { + r = arg[1]; + arg[1] = arg[0]; + arg[0] = r; + } + emit(Oxcmp, k, R, arg[1], arg[0]); + iarg = curi->arg; + if (rtype(arg[0]) == RCon) { + assert(k == Kl); + iarg[1] = newtmp("isel", k, fn); + emit(Ocopy, k, iarg[1], arg[0], R); + } + fixarg(&iarg[0], k, 0, fn); + fixarg(&iarg[1], k, 0, fn); + return swap; +} + +static void +sel(Ins i, ANum *an, Fn *fn) +{ + Ref r0, r1, *iarg; + int x, k, kc; + int64_t sz; + Ins *i0, *i1; + + if (rtype(i.to) == RTmp) + if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1])) + if (fn->tmp[i.to.val].nuse == 0) { + chuse(i.arg[0], -1, fn); + chuse(i.arg[1], -1, fn); + return; + } + i0 = curi; + k = i.cls; + switch (i.op) { + case Odiv: + case Orem: + case Oudiv: + case Ourem: + if (i.op == Odiv || i.op == Oudiv) + r0 = TMP(RAX), r1 = TMP(RDX); + else + r0 = TMP(RDX), r1 = TMP(RAX); + emit(Ocopy, k, i.to, r0, R); + emit(Ocopy, k, R, r1, R); + if (rtype(i.arg[1]) == RCon) { + /* immediates not allowed for + * divisions in x86 + */ + r0 = newtmp("isel", k, fn); + } else + r0 = i.arg[1]; + if (fn->tmp[r0.val].slot != -1) + err("unlikely argument %%%s in %s", + fn->tmp[r0.val].name, optab[i.op].name); + if (i.op == Odiv || i.op == Orem) { + emit(Oxidiv, k, R, r0, R); + emit(Osign, k, TMP(RDX), TMP(RAX), R); + } else { + emit(Oxdiv, k, R, r0, R); + emit(Ocopy, k, TMP(RDX), CON_Z, R); + } + emit(Ocopy, k, TMP(RAX), i.arg[0], R); + fixarg(&curi->arg[0], k, 0, fn); + if (rtype(i.arg[1]) == RCon) + emit(Ocopy, k, r0, i.arg[1], R); + break; + case Osar: + case Oshr: + case Oshl: + if (rtype(i.arg[1]) == RCon) + goto Emit; + r0 = i.arg[1]; + i.arg[1] = TMP(RCX); + emit(Ocopy, Kw, R, TMP(RCX), R); + emiti(i); + emit(Ocopy, Kw, TMP(RCX), r0, R); + break; + case Onop: + break; + case Ostored: + case Ostores: + case Ostorel: + case Ostorew: + case Ostoreh: + case Ostoreb: + if (rtype(i.arg[0]) == RCon) { + if (i.op == Ostored) + i.op = Ostorel; + if (i.op == Ostores) + i.op = Ostorew; + } + seladdr(&i.arg[1], an, fn); + goto Emit; + case_Oload: + seladdr(&i.arg[0], an, fn); + goto Emit; + case Ocall: + case Osalloc: + case Ocopy: + case Oadd: + case Osub: + case Omul: + case Oand: + case Oor: + case Oxor: + case Oxtest: + case Ostosi: + case Odtosi: + case Oswtof: + case Osltof: + case Oexts: + case Otruncd: + case Ocast: + case_OExt: +Emit: + emiti(i); + iarg = curi->arg; /* fixarg() can change curi */ + fixarg(&iarg[0], argcls(&i, 0), 0, fn); + fixarg(&iarg[1], argcls(&i, 1), 0, fn); + break; + case Oalloc: + case Oalloc+1: + case Oalloc+2: /* == Oalloc1 */ + /* we need to make sure + * the stack remains aligned + * (rsp = 0) mod 16 + */ + if (rtype(i.arg[0]) == RCon) { + sz = fn->con[i.arg[0].val].bits.i; + if (sz < 0 || sz >= INT_MAX-15) + err("invalid alloc size %"PRId64, sz); + sz = (sz + 15) & -16; + emit(Osalloc, Kl, i.to, getcon(sz, fn), R); + } else { + /* r0 = (i.arg[0] + 15) & -16 */ + r0 = newtmp("isel", Kl, fn); + r1 = newtmp("isel", Kl, fn); + emit(Osalloc, Kl, i.to, r0, R); + emit(Oand, Kl, r0, r1, getcon(-16, fn)); + emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn)); + if (fn->tmp[i.arg[0].val].slot != -1) + err("unlikely argument %%%s in %s", + fn->tmp[i.arg[0].val].name, optab[i.op].name); + } + break; + default: + if (isext(i.op)) + goto case_OExt; + if (isload(i.op)) + goto case_Oload; + if (iscmp(i.op, &kc, &x)) { + emit(Oflag+x, k, i.to, R, R); + i1 = curi; + if (selcmp(i.arg, kc, fn)) + i1->op = Oflag + cmpop(x); + break; + } + die("unknown instruction %s", optab[i.op].name); + } + + while (i0 > curi && --i0) { + assert(rslot(i0->arg[0], fn) == -1); + assert(rslot(i0->arg[1], fn) == -1); + } +} + +static Ins * +flagi(Ins *i0, Ins *i) +{ + while (i>i0) { + i--; + if (amd64_op[i->op].zflag) + return i; + if (amd64_op[i->op].lflag) + continue; + return 0; + } + return 0; +} + +static void +seljmp(Blk *b, Fn *fn) +{ + Ref r; + int c, k; + Ins *fi; + Tmp *t; + + if (b->jmp.type == Jret0 || b->jmp.type == Jjmp) + return; + assert(b->jmp.type == Jjnz); + r = b->jmp.arg; + t = &fn->tmp[r.val]; + b->jmp.arg = R; + assert(!req(r, R) && rtype(r) != RCon); + if (b->s1 == b->s2) { + chuse(r, -1, fn); + b->jmp.type = Jjmp; + b->s2 = 0; + return; + } + fi = flagi(b->ins, &b->ins[b->nins]); + if (!fi || !req(fi->to, r)) { + selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, long jnz */ + b->jmp.type = Jjf + Cine; + } + else if (iscmp(fi->op, &k, &c)) { + if (t->nuse == 1) { + if (selcmp(fi->arg, k, fn)) + c = cmpop(c); + *fi = (Ins){.op = Onop}; + } + b->jmp.type = Jjf + c; + } + else if (fi->op == Oand && t->nuse == 1 + && (rtype(fi->arg[0]) == RTmp || + rtype(fi->arg[1]) == RTmp)) { + fi->op = Oxtest; + fi->to = R; + b->jmp.type = Jjf + Cine; + if (rtype(fi->arg[1]) == RCon) { + r = fi->arg[1]; + fi->arg[1] = fi->arg[0]; + fi->arg[0] = r; + } + } + else { + /* since flags are not tracked in liveness, + * the result of the flag-setting instruction + * has to be marked as live + */ + if (t->nuse == 1) + emit(Ocopy, Kw, R, r, R); + b->jmp.type = Jjf + Cine; + } +} + +static int +aref(Ref r, ANum *ai) +{ + switch (rtype(r)) { + case RCon: + return 2; + case RTmp: + return ai[r.val].n; + default: + die("constant or temporary expected"); + } +} + +static int +ascale(Ref r, Con *con) +{ + int64_t n; + + if (rtype(r) != RCon) + return 0; + if (con[r.val].type != CBits) + return 0; + n = con[r.val].bits.i; + return n == 1 || n == 2 || n == 4 || n == 8; +} + +static void +anumber(ANum *ai, Blk *b, Con *con) +{ + /* This should be made obsolete by a proper + * reassoc pass. + * + * Rules: + * + * RTmp(_) -> 0 tmp + * ( RTmp(_) -> 1 slot ) + * RCon(_) -> 2 con + * 0 * 2 -> 3 s * i (when constant is 1,2,4,8) + */ + static char add[10][10] = { + [2] [2] = 2, /* folding */ + [2] [5] = 5, [5] [2] = 5, + [2] [6] = 6, [6] [2] = 6, + [2] [7] = 7, [7] [2] = 7, + [0] [0] = 4, /* 4: b + s * i */ + [0] [3] = 4, [3] [0] = 4, + [2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */ + [0] [2] = 6, [2] [0] = 6, /* 6: o + b */ + [2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */ + [0] [5] = 7, [5] [0] = 7, + [6] [3] = 7, [3] [6] = 7, + + }; + int a, a1, a2, n1, n2, t1, t2; + Ins *i; + + for (i=b->ins; i-b->ins < b->nins; i++) { + if (rtype(i->to) == RTmp) + ai[i->to.val].i = i; + if (i->op != Oadd && i->op != Omul) + continue; + a1 = aref(i->arg[0], ai); + a2 = aref(i->arg[1], ai); + t1 = a1 != 1 && a1 != 2; + t2 = a2 != 1 && a2 != 2; + if (i->op == Oadd) { + a = add[n1 = a1][n2 = a2]; + if (t1 && a < add[0][a2]) + a = add[n1 = 0][n2 = a2]; + if (t2 && a < add[a1][0]) + a = add[n1 = a1][n2 = 0]; + if (t1 && t2 && a < add[0][0]) + a = add[n1 = 0][n2 = 0]; + } else { + n1 = n2 = a = 0; + if (ascale(i->arg[0], con) && t2) + a = 3, n1 = 2, n2 = 0; + if (t1 && ascale(i->arg[1], con)) + a = 3, n1 = 0, n2 = 2; + } + ai[i->to.val].n = a; + ai[i->to.val].l = n1; + ai[i->to.val].r = n2; + } +} + +static void +amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top) +{ + Ins *i; + int nl, nr, t, s; + Ref al, ar; + + if (top) + memset(a, 0, sizeof *a); + if (rtype(r) == RCon) { + addcon(&a->offset, &fn->con[r.val]); + return; + } + assert(rtype(r) == RTmp); + i = ai[r.val].i; + nl = ai[r.val].l; + nr = ai[r.val].r; + if (i) { + if (nl > nr) { + al = i->arg[1]; + ar = i->arg[0]; + t = nl, nl = nr, nr = t; + } else { + al = i->arg[0]; + ar = i->arg[1]; + } + } + switch (ai[r.val].n) { + case 3: /* s * i */ + if (!top) { + a->index = al; + a->scale = fn->con[ar.val].bits.i; + } else + a->base = r; + break; + case 4: /* b + s * i */ + switch (nr) { + case 0: + if (fn->tmp[ar.val].slot != -1) { + al = i->arg[1]; + ar = i->arg[0]; + } + a->index = ar; + a->scale = 1; + break; + case 3: + amatch(a, ar, ai, fn, 0); + break; + } + r = al; + case 0: + s = fn->tmp[r.val].slot; + if (s != -1) + r = SLOT(s); + a->base = r; + break; + case 2: /* constants */ + case 5: /* o + s * i */ + case 6: /* o + b */ + case 7: /* o + b + s * i */ + amatch(a, ar, ai, fn, 0); + amatch(a, al, ai, fn, 0); + break; + default: + die("unreachable"); + } +} + +/* instruction selection + * requires use counts (as given by parsing) + */ +void +amd64_isel(Fn *fn) +{ + Blk *b, **sb; + Ins *i; + Phi *p; + uint a; + int n, al; + int64_t sz; + ANum *ainfo; + + /* assign slots to fast allocs */ + b = fn->start; + /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */ + for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2) + for (i=b->ins; i-b->ins < b->nins; i++) + if (i->op == al) { + if (rtype(i->arg[0]) != RCon) + break; + sz = fn->con[i->arg[0].val].bits.i; + if (sz < 0 || sz >= INT_MAX-15) + err("invalid alloc size %"PRId64, sz); + sz = (sz + n-1) & -n; + sz /= 4; + fn->tmp[i->to.val].slot = fn->slot; + fn->slot += sz; + *i = (Ins){.op = Onop}; + } + + /* process basic blocks */ + n = fn->ntmp; + ainfo = emalloc(n * sizeof ainfo[0]); + for (b=fn->start; b; b=b->link) { + curi = &insb[NIns]; + for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++) + for (p=(*sb)->phi; p; p=p->link) { + for (a=0; p->blk[a] != b; a++) + assert(a+1 < p->narg); + fixarg(&p->arg[a], p->cls, 1, fn); + } + memset(ainfo, 0, n * sizeof ainfo[0]); + anumber(ainfo, b, fn->con); + seljmp(b, fn); + for (i=&b->ins[b->nins]; i!=b->ins;) + sel(*--i, ainfo, fn); + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } + free(ainfo); + + if (debug['I']) { + fprintf(stderr, "\n> After instruction selection:\n"); + printfn(fn, stderr); + } +} |