diff options
Diffstat (limited to 'amd64')
-rw-r--r-- | amd64/all.h  |  70
-rw-r--r-- | amd64/emit.c | 561
-rw-r--r-- | amd64/isel.c | 603
-rw-r--r-- | amd64/sysv.c | 701
-rw-r--r-- | amd64/targ.c |  30
5 files changed, 1965 insertions, 0 deletions
diff --git a/amd64/all.h b/amd64/all.h new file mode 100644 index 0000000..3a2db0e --- /dev/null +++ b/amd64/all.h @@ -0,0 +1,70 @@ +#include "../all.h" + +typedef struct Amd64Op Amd64Op; + +enum Amd64Reg { + RAX = RXX+1, /* caller-save */ + RCX, + RDX, + RSI, + RDI, + R8, + R9, + R10, + R11, + + RBX, /* callee-save */ + R12, + R13, + R14, + R15, + + RBP, /* globally live */ + RSP, + + XMM0, /* sse */ + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + + NFPR = XMM14 - XMM0 + 1, /* reserve XMM15 */ + NGPR = RSP - RAX + 1, + NGPS = R11 - RAX + 1, + NFPS = NFPR, + NCLR = R15 - RBX + 1, +}; +MAKESURE(reg_not_tmp, XMM15 < (int)Tmp0); + +struct Amd64Op { + char nmem; + char zflag; + char lflag; +}; + +/* targ.c */ +extern Amd64Op amd64_op[]; + +/* sysv.c (abi) */ +extern int amd64_sysv_rsave[]; +extern int amd64_sysv_rclob[]; +bits amd64_sysv_retregs(Ref, int[2]); +bits amd64_sysv_argregs(Ref, int[2]); +void amd64_sysv_abi(Fn *); + +/* isel.c */ +void amd64_isel(Fn *); + +/* emit.c */ +void amd64_emitfn(Fn *, FILE *); diff --git a/amd64/emit.c b/amd64/emit.c new file mode 100644 index 0000000..eccbd02 --- /dev/null +++ b/amd64/emit.c @@ -0,0 +1,561 @@ +#include "all.h" + + +#define CMP(X) \ + X(Ciule, "be") \ + X(Ciult, "b") \ + X(Cisle, "le") \ + X(Cislt, "l") \ + X(Cisgt, "g") \ + X(Cisge, "ge") \ + X(Ciugt, "a") \ + X(Ciuge, "ae") \ + X(Cieq, "z") \ + X(Cine, "nz") \ + X(NCmpI+Cfle, "be") \ + X(NCmpI+Cflt, "b") \ + X(NCmpI+Cfgt, "a") \ + X(NCmpI+Cfge, "ae") \ + X(NCmpI+Cfeq, "z") \ + X(NCmpI+Cfne, "nz") \ + X(NCmpI+Cfo, "np") \ + X(NCmpI+Cfuo, "p") + +enum { + SLong = 0, + SWord = 1, + SShort = 2, + SByte = 3, + + Ki = -1, /* matches Kw and Kl */ + Ka = -2, /* matches all classes */ +}; + +/* Instruction format strings: + * + * if the format string starts with -, the instruction + * is assumed to be 3-address and is put in 2-address + * mode using an extra mov if necessary + * + * if the 
format string starts with +, the same as the + * above applies, but commutativity is also assumed + * + * %k is used to set the class of the instruction, + * it'll expand to "l", "q", "ss", "sd", depending + * on the instruction class + * %0 designates the first argument + * %1 designates the second argument + * %= designates the result + * + * if %k is not used, a prefix to 0, 1, or = must be + * added, it can be: + * M - memory reference + * L - long (64 bits) + * W - word (32 bits) + * H - short (16 bits) + * B - byte (8 bits) + * S - single precision float + * D - double precision float + */ +static struct { + short op; + short cls; + char *asm; +} omap[] = { + { Oadd, Ka, "+add%k %1, %=" }, + { Osub, Ka, "-sub%k %1, %=" }, + { Oand, Ki, "+and%k %1, %=" }, + { Oor, Ki, "+or%k %1, %=" }, + { Oxor, Ki, "+xor%k %1, %=" }, + { Osar, Ki, "-sar%k %B1, %=" }, + { Oshr, Ki, "-shr%k %B1, %=" }, + { Oshl, Ki, "-shl%k %B1, %=" }, + { Omul, Ki, "+imul%k %1, %=" }, + { Omul, Ks, "+mulss %1, %=" }, + { Omul, Kd, "+mulsd %1, %=" }, + { Odiv, Ka, "-div%k %1, %=" }, + { Ostorel, Ka, "movq %L0, %M1" }, + { Ostorew, Ka, "movl %W0, %M1" }, + { Ostoreh, Ka, "movw %H0, %M1" }, + { Ostoreb, Ka, "movb %B0, %M1" }, + { Ostores, Ka, "movss %S0, %M1" }, + { Ostored, Ka, "movsd %D0, %M1" }, + { Oload, Ka, "mov%k %M0, %=" }, + { Oloadsw, Kl, "movslq %M0, %L=" }, + { Oloadsw, Kw, "movl %M0, %W=" }, + { Oloaduw, Ki, "movl %M0, %W=" }, + { Oloadsh, Ki, "movsw%k %M0, %=" }, + { Oloaduh, Ki, "movzw%k %M0, %=" }, + { Oloadsb, Ki, "movsb%k %M0, %=" }, + { Oloadub, Ki, "movzb%k %M0, %=" }, + { Oextsw, Kl, "movslq %W0, %L=" }, + { Oextuw, Kl, "movl %W0, %W=" }, + { Oextsh, Ki, "movsw%k %H0, %=" }, + { Oextuh, Ki, "movzw%k %H0, %=" }, + { Oextsb, Ki, "movsb%k %B0, %=" }, + { Oextub, Ki, "movzb%k %B0, %=" }, + + { Oexts, Kd, "cvtss2sd %0, %=" }, + { Otruncd, Ks, "cvttsd2ss %0, %=" }, + { Ostosi, Ki, "cvttss2si%k %0, %=" }, + { Odtosi, Ki, "cvttsd2si%k %0, %=" }, + { Oswtof, Ka, "cvtsi2%k %W0, %=" }, 
+ { Osltof, Ka, "cvtsi2%k %L0, %=" }, + { Ocast, Ki, "movq %D0, %L=" }, + { Ocast, Ka, "movq %L0, %D=" }, + + { Oaddr, Ki, "lea%k %M0, %=" }, + { Oswap, Ki, "xchg%k %0, %1" }, + { Osign, Kl, "cqto" }, + { Osign, Kw, "cltd" }, + { Oxdiv, Ki, "div%k %0" }, + { Oxidiv, Ki, "idiv%k %0" }, + { Oxcmp, Ks, "comiss %S0, %S1" }, + { Oxcmp, Kd, "comisd %D0, %D1" }, + { Oxcmp, Ki, "cmp%k %0, %1" }, + { Oxtest, Ki, "test%k %0, %1" }, +#define X(c, s) \ + { Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" }, + CMP(X) +#undef X + { NOp, 0, 0 } +}; + +static char *rname[][4] = { + [RAX] = {"rax", "eax", "ax", "al"}, + [RBX] = {"rbx", "ebx", "bx", "bl"}, + [RCX] = {"rcx", "ecx", "cx", "cl"}, + [RDX] = {"rdx", "edx", "dx", "dl"}, + [RSI] = {"rsi", "esi", "si", "sil"}, + [RDI] = {"rdi", "edi", "di", "dil"}, + [RBP] = {"rbp", "ebp", "bp", "bpl"}, + [RSP] = {"rsp", "esp", "sp", "spl"}, + [R8 ] = {"r8" , "r8d", "r8w", "r8b"}, + [R9 ] = {"r9" , "r9d", "r9w", "r9b"}, + [R10] = {"r10", "r10d", "r10w", "r10b"}, + [R11] = {"r11", "r11d", "r11w", "r11b"}, + [R12] = {"r12", "r12d", "r12w", "r12b"}, + [R13] = {"r13", "r13d", "r13w", "r13b"}, + [R14] = {"r14", "r14d", "r14w", "r14b"}, + [R15] = {"r15", "r15d", "r15w", "r15b"}, +}; + + +static int +slot(int s, Fn *fn) +{ + struct { int i:29; } x; + + /* sign extend s using a bitfield */ + x.i = s; + assert(x.i <= fn->slot); + /* specific to NAlign == 3 */ + if (x.i < 0) + return -4 * x.i; + else if (fn->vararg) + return -176 + -4 * (fn->slot - x.i); + else + return -4 * (fn->slot - x.i); +} + +static void +emitcon(Con *con, FILE *f) +{ + switch (con->type) { + case CAddr: + if (con->local) + fprintf(f, "%s%s", gasloc, con->label); + else + fprintf(f, "%s%s", gassym, con->label); + if (con->bits.i) + fprintf(f, "%+"PRId64, con->bits.i); + break; + case CBits: + fprintf(f, "%"PRId64, con->bits.i); + break; + default: + die("unreachable"); + } +} + +static char * +regtoa(int reg, int sz) +{ + static char buf[6]; + + if (reg >= XMM0) { + sprintf(buf, 
"xmm%d", reg-XMM0); + return buf; + } else + return rname[reg][sz]; +} + +static Ref +getarg(char c, Ins *i) +{ + switch (c) { + case '0': + return i->arg[0]; + case '1': + return i->arg[1]; + case '=': + return i->to; + default: + die("invalid arg letter %c", c); + } +} + +static void emitins(Ins, Fn *, FILE *); + +static void +emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) +{ + Ins icp; + + icp.op = Ocopy; + icp.arg[0] = r2; + icp.to = r1; + icp.cls = k; + emitins(icp, fn, f); +} + +static void +emitf(char *s, Ins *i, Fn *fn, FILE *f) +{ + static char clstoa[][3] = {"l", "q", "ss", "sd"}; + char c; + int sz; + Ref ref; + Mem *m; + Con off; + + switch (*s) { + case '+': + if (req(i->arg[1], i->to)) { + ref = i->arg[0]; + i->arg[0] = i->arg[1]; + i->arg[1] = ref; + } + /* fall through */ + case '-': + assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) && + "cannot convert to 2-address"); + emitcopy(i->to, i->arg[0], i->cls, fn, f); + s++; + break; + } + + fputc('\t', f); +Next: + while ((c = *s++) != '%') + if (!c) { + fputc('\n', f); + return; + } else + fputc(c, f); + switch ((c = *s++)) { + case '%': + fputc('%', f); + break; + case 'k': + fputs(clstoa[i->cls], f); + break; + case '0': + case '1': + case '=': + sz = KWIDE(i->cls) ? 
SLong : SWord; + s--; + goto Ref; + case 'D': + case 'S': + sz = SLong; /* does not matter for floats */ + Ref: + c = *s++; + ref = getarg(c, i); + switch (rtype(ref)) { + case RTmp: + assert(isreg(ref)); + fprintf(f, "%%%s", regtoa(ref.val, sz)); + break; + case RSlot: + fprintf(f, "%d(%%rbp)", slot(ref.val, fn)); + break; + case RMem: + Mem: + m = &fn->mem[ref.val]; + if (rtype(m->base) == RSlot) { + off.type = CBits; + off.bits.i = slot(m->base.val, fn); + addcon(&m->offset, &off); + m->base = TMP(RBP); + } + if (m->offset.type != CUndef) + emitcon(&m->offset, f); + fputc('(', f); + if (req(m->base, R)) + fprintf(f, "%%rip"); + else + fprintf(f, "%%%s", regtoa(m->base.val, SLong)); + if (!req(m->index, R)) + fprintf(f, ", %%%s, %d", + regtoa(m->index.val, SLong), + m->scale + ); + fputc(')', f); + break; + case RCon: + fputc('$', f); + emitcon(&fn->con[ref.val], f); + break; + default: + die("unreachable"); + } + break; + case 'L': + sz = SLong; + goto Ref; + case 'W': + sz = SWord; + goto Ref; + case 'H': + sz = SShort; + goto Ref; + case 'B': + sz = SByte; + goto Ref; + case 'M': + c = *s++; + ref = getarg(c, i); + switch (rtype(ref)) { + case RMem: + goto Mem; + case RSlot: + fprintf(f, "%d(%%rbp)", slot(ref.val, fn)); + break; + case RCon: + emitcon(&fn->con[ref.val], f); + fprintf(f, "(%%rip)"); + break; + case RTmp: + assert(isreg(ref)); + fprintf(f, "(%%%s)", regtoa(ref.val, SLong)); + break; + default: + die("unreachable"); + } + break; + default: + die("invalid format specifier %%%c", c); + } + goto Next; +} + +static void +emitins(Ins i, Fn *fn, FILE *f) +{ + Ref r; + int64_t val; + int o; + + switch (i.op) { + default: + Table: + /* most instructions are just pulled out of + * the table omap[], some special cases are + * detailed below */ + for (o=0;; o++) { + /* this linear search should really be a binary + * search */ + if (omap[o].op == NOp) + die("no match for %s(%d)", + optab[i.op].name, "wlsd"[i.cls]); + if (omap[o].op == i.op) + if 
(omap[o].cls == i.cls + || (omap[o].cls == Ki && KBASE(i.cls) == 0) + || (omap[o].cls == Ka)) + break; + } + emitf(omap[o].asm, &i, fn, f); + break; + case Onop: + /* just do nothing for nops, they are inserted + * by some passes */ + break; + case Omul: + /* here, we try to use the 3-addresss form + * of multiplication when possible */ + if (rtype(i.arg[1]) == RCon) { + r = i.arg[0]; + i.arg[0] = i.arg[1]; + i.arg[1] = r; + } + if (KBASE(i.cls) == 0 /* only available for ints */ + && rtype(i.arg[0]) == RCon + && rtype(i.arg[1]) == RTmp) { + emitf("imul%k %0, %1, %=", &i, fn, f); + break; + } + goto Table; + case Osub: + /* we have to use the negation trick to handle + * some 3-address substractions */ + if (req(i.to, i.arg[1])) { + emitf("neg%k %=", &i, fn, f); + emitf("add%k %0, %=", &i, fn, f); + break; + } + goto Table; + case Ocopy: + /* make sure we don't emit useless copies, + * also, we can use a trick to load 64-bits + * registers, it's detailed in my note below + * http://c9x.me/art/notes.html?09/19/2015 */ + if (req(i.to, R) || req(i.arg[0], R)) + break; + if (isreg(i.to) + && rtype(i.arg[0]) == RCon + && i.cls == Kl + && fn->con[i.arg[0].val].type == CBits + && (val = fn->con[i.arg[0].val].bits.i) >= 0 + && val <= UINT32_MAX) { + emitf("movl %W0, %W=", &i, fn, f); + } else if (isreg(i.to) + && rtype(i.arg[0]) == RCon + && fn->con[i.arg[0].val].type == CAddr) { + emitf("lea%k %M0, %=", &i, fn, f); + } else if (!req(i.arg[0], i.to)) + emitf("mov%k %0, %=", &i, fn, f); + break; + case Ocall: + /* calls simply have a weird syntax in AT&T + * assembly... 
*/ + switch (rtype(i.arg[0])) { + case RCon: + fprintf(f, "\tcallq "); + emitcon(&fn->con[i.arg[0].val], f); + fprintf(f, "\n"); + break; + case RTmp: + emitf("callq *%L0", &i, fn, f); + break; + default: + die("invalid call argument"); + } + break; + case Osalloc: + /* there is no good reason why this is here + * maybe we should split Osalloc in 2 different + * instructions depending on the result + */ + emitf("subq %L0, %%rsp", &i, fn, f); + if (!req(i.to, R)) + emitcopy(i.to, TMP(RSP), Kl, fn, f); + break; + case Oswap: + if (KBASE(i.cls) == 0) + goto Table; + /* for floats, there is no swap instruction + * so we use xmm15 as a temporary + */ + emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f); + emitcopy(i.arg[0], i.arg[1], i.cls, fn, f); + emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f); + break; + } +} + +static int +framesz(Fn *fn) +{ + int i, o, f; + + /* specific to NAlign == 3 */ + for (i=0, o=0; i<NCLR; i++) + o ^= 1 & (fn->reg >> amd64_sysv_rclob[i]); + f = fn->slot; + f = (f + 3) & -4; + return 4*f + 8*o + 176*fn->vararg; +} + +void +amd64_emitfn(Fn *fn, FILE *f) +{ + static char *ctoa[] = { + #define X(c, s) [c] = s, + CMP(X) + #undef X + }; + static int id0; + Blk *b, *s; + Ins *i, itmp; + int *r, c, fs, o, n, lbl; + + fprintf(f, ".text\n"); + if (fn->export) + fprintf(f, ".globl %s%s\n", gassym, fn->name); + fprintf(f, + "%s%s:\n" + "\tpushq %%rbp\n" + "\tmovq %%rsp, %%rbp\n", + gassym, fn->name + ); + fs = framesz(fn); + if (fs) + fprintf(f, "\tsub $%d, %%rsp\n", fs); + if (fn->vararg) { + o = -176; + for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8) + fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o); + for (n=0; n<8; ++n, o+=16) + fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o); + } + for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++) + if (fn->reg & BIT(*r)) { + itmp.arg[0] = TMP(*r); + emitf("pushq %L0", &itmp, fn, f); + } + + for (lbl=0, b=fn->start; b; b=b->link) { + if (lbl || b->npred > 1) + fprintf(f, "%sbb%d:\n", 
gasloc, id0+b->id); + for (i=b->ins; i!=&b->ins[b->nins]; i++) + emitins(*i, fn, f); + lbl = 1; + switch (b->jmp.type) { + case Jret0: + for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;) + if (fn->reg & BIT(*--r)) { + itmp.arg[0] = TMP(*r); + emitf("popq %L0", &itmp, fn, f); + } + fprintf(f, + "\tleave\n" + "\tret\n" + ); + break; + case Jjmp: + Jmp: + if (b->s1 != b->link) + fprintf(f, "\tjmp %sbb%d\n", + gasloc, id0+b->s1->id); + else + lbl = 0; + break; + default: + c = b->jmp.type - Jjf; + if (0 <= c && c <= NCmp) { + if (b->link == b->s2) { + s = b->s1; + b->s1 = b->s2; + b->s2 = s; + } else + c = cmpneg(c); + fprintf(f, "\tj%s %sbb%d\n", ctoa[c], + gasloc, id0+b->s2->id); + goto Jmp; + } + die("unhandled jump %d", b->jmp.type); + } + } + id0 += fn->nblk; +} diff --git a/amd64/isel.c b/amd64/isel.c new file mode 100644 index 0000000..1623b9b --- /dev/null +++ b/amd64/isel.c @@ -0,0 +1,603 @@ +#include "all.h" +#include <limits.h> + +/* For x86_64, do the following: + * + * - check that constants are used only in + * places allowed + * - ensure immediates always fit in 32b + * - expose machine register contraints + * on instructions like division. + * - implement fast locals (the streak of + * constant allocX in the first basic block) + * - recognize complex addressing modes + * + * Invariant: the use counts that are used + * in sel() must be sound. This + * is not so trivial, maybe the + * dce should be moved out... 
+ */ + +typedef struct ANum ANum; + +struct ANum { + char n, l, r; + Ins *i; +}; + +static void amatch(Addr *, Ref, ANum *, Fn *, int); + +static int +noimm(Ref r, Fn *fn) +{ + int64_t val; + + if (rtype(r) != RCon) + return 0; + switch (fn->con[r.val].type) { + case CAddr: + /* we only support the 'small' + * code model of the ABI, this + * means that we can always + * address data with 32bits + */ + return 0; + case CBits: + val = fn->con[r.val].bits.i; + return (val < INT32_MIN || val > INT32_MAX); + default: + die("invalid constant"); + } +} + +static int +rslot(Ref r, Fn *fn) +{ + if (rtype(r) != RTmp) + return -1; + return fn->tmp[r.val].slot; +} + +static void +fixarg(Ref *r, int k, int cpy, Fn *fn) +{ + Addr a, *m; + Ref r0, r1; + int s, n; + + r1 = r0 = *r; + s = rslot(r0, fn); + if (KBASE(k) == 1 && rtype(r0) == RCon) { + /* load floating points from memory + * slots, they can't be used as + * immediates + */ + r1 = MEM(fn->nmem); + vgrow(&fn->mem, ++fn->nmem); + memset(&a, 0, sizeof a); + a.offset.type = CAddr; + a.offset.local = 1; + n = gasstashfp(fn->con[r0.val].bits.i, KWIDE(k)); + sprintf(a.offset.label, "fp%d", n); + fn->mem[fn->nmem-1] = a; + } + else if (!cpy && k == Kl && noimm(r0, fn)) { + /* load constants that do not fit in + * a 32bit signed integer into a + * long temporary + */ + r1 = newtmp("isel", Kl, fn); + emit(Ocopy, Kl, r1, r0, R); + } + else if (s != -1) { + /* load fast locals' addresses into + * temporaries right before the + * instruction + */ + r1 = newtmp("isel", Kl, fn); + emit(Oaddr, Kl, r1, SLOT(s), R); + } + else if (rtype(r0) == RMem) { + /* apple asm fix */ + m = &fn->mem[r0.val]; + if (req(m->base, R)) { + n = fn->ncon; + vgrow(&fn->con, ++fn->ncon); + fn->con[n] = m->offset; + m->offset.type = CUndef; + r0 = newtmp("isel", Kl, fn); + emit(Oaddr, Kl, r0, CON(n), R); + m->base = r0; + } + } + *r = r1; +} + +static void +seladdr(Ref *r, ANum *an, Fn *fn) +{ + Addr a; + Ref r0; + + r0 = *r; + if (rtype(r0) == RTmp) { + 
amatch(&a, r0, an, fn, 1); + if (req(a.base, r0)) + return; + if (a.offset.type == CAddr) + if (!req(a.base, R)) { + /* apple asm fix */ + if (!req(a.index, R)) + return; + else { + a.index = a.base; + a.scale = 1; + a.base = R; + } + } + chuse(r0, -1, fn); + vgrow(&fn->mem, ++fn->nmem); + fn->mem[fn->nmem-1] = a; + chuse(a.base, +1, fn); + chuse(a.index, +1, fn); + *r = MEM(fn->nmem-1); + } +} + +static int +selcmp(Ref arg[2], int k, Fn *fn) +{ + int swap; + Ref r, *iarg; + + swap = rtype(arg[0]) == RCon; + if (swap) { + r = arg[1]; + arg[1] = arg[0]; + arg[0] = r; + } + emit(Oxcmp, k, R, arg[1], arg[0]); + iarg = curi->arg; + if (rtype(arg[0]) == RCon) { + assert(k == Kl); + iarg[1] = newtmp("isel", k, fn); + emit(Ocopy, k, iarg[1], arg[0], R); + } + fixarg(&iarg[0], k, 0, fn); + fixarg(&iarg[1], k, 0, fn); + return swap; +} + +static void +sel(Ins i, ANum *an, Fn *fn) +{ + Ref r0, r1, *iarg; + int x, k, kc; + int64_t sz; + Ins *i0, *i1; + + if (rtype(i.to) == RTmp) + if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1])) + if (fn->tmp[i.to.val].nuse == 0) { + chuse(i.arg[0], -1, fn); + chuse(i.arg[1], -1, fn); + return; + } + i0 = curi; + k = i.cls; + switch (i.op) { + case Odiv: + case Orem: + case Oudiv: + case Ourem: + if (i.op == Odiv || i.op == Oudiv) + r0 = TMP(RAX), r1 = TMP(RDX); + else + r0 = TMP(RDX), r1 = TMP(RAX); + emit(Ocopy, k, i.to, r0, R); + emit(Ocopy, k, R, r1, R); + if (rtype(i.arg[1]) == RCon) { + /* immediates not allowed for + * divisions in x86 + */ + r0 = newtmp("isel", k, fn); + } else + r0 = i.arg[1]; + if (fn->tmp[r0.val].slot != -1) + err("unlikely argument %%%s in %s", + fn->tmp[r0.val].name, optab[i.op].name); + if (i.op == Odiv || i.op == Orem) { + emit(Oxidiv, k, R, r0, R); + emit(Osign, k, TMP(RDX), TMP(RAX), R); + } else { + emit(Oxdiv, k, R, r0, R); + emit(Ocopy, k, TMP(RDX), CON_Z, R); + } + emit(Ocopy, k, TMP(RAX), i.arg[0], R); + fixarg(&curi->arg[0], k, 0, fn); + if (rtype(i.arg[1]) == RCon) + emit(Ocopy, k, r0, 
i.arg[1], R); + break; + case Osar: + case Oshr: + case Oshl: + if (rtype(i.arg[1]) == RCon) + goto Emit; + r0 = i.arg[1]; + i.arg[1] = TMP(RCX); + emit(Ocopy, Kw, R, TMP(RCX), R); + emiti(i); + emit(Ocopy, Kw, TMP(RCX), r0, R); + break; + case Onop: + break; + case Ostored: + case Ostores: + case Ostorel: + case Ostorew: + case Ostoreh: + case Ostoreb: + if (rtype(i.arg[0]) == RCon) { + if (i.op == Ostored) + i.op = Ostorel; + if (i.op == Ostores) + i.op = Ostorew; + } + seladdr(&i.arg[1], an, fn); + goto Emit; + case_Oload: + seladdr(&i.arg[0], an, fn); + goto Emit; + case Ocall: + case Osalloc: + case Ocopy: + case Oadd: + case Osub: + case Omul: + case Oand: + case Oor: + case Oxor: + case Oxtest: + case Ostosi: + case Odtosi: + case Oswtof: + case Osltof: + case Oexts: + case Otruncd: + case Ocast: + case_OExt: +Emit: + emiti(i); + iarg = curi->arg; /* fixarg() can change curi */ + fixarg(&iarg[0], argcls(&i, 0), 0, fn); + fixarg(&iarg[1], argcls(&i, 1), 0, fn); + break; + case Oalloc: + case Oalloc+1: + case Oalloc+2: /* == Oalloc1 */ + /* we need to make sure + * the stack remains aligned + * (rsp = 0) mod 16 + */ + if (rtype(i.arg[0]) == RCon) { + sz = fn->con[i.arg[0].val].bits.i; + if (sz < 0 || sz >= INT_MAX-15) + err("invalid alloc size %"PRId64, sz); + sz = (sz + 15) & -16; + emit(Osalloc, Kl, i.to, getcon(sz, fn), R); + } else { + /* r0 = (i.arg[0] + 15) & -16 */ + r0 = newtmp("isel", Kl, fn); + r1 = newtmp("isel", Kl, fn); + emit(Osalloc, Kl, i.to, r0, R); + emit(Oand, Kl, r0, r1, getcon(-16, fn)); + emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn)); + if (fn->tmp[i.arg[0].val].slot != -1) + err("unlikely argument %%%s in %s", + fn->tmp[i.arg[0].val].name, optab[i.op].name); + } + break; + default: + if (isext(i.op)) + goto case_OExt; + if (isload(i.op)) + goto case_Oload; + if (iscmp(i.op, &kc, &x)) { + emit(Oflag+x, k, i.to, R, R); + i1 = curi; + if (selcmp(i.arg, kc, fn)) + i1->op = Oflag + cmpop(x); + break; + } + die("unknown instruction %s", 
optab[i.op].name); + } + + while (i0 > curi && --i0) { + assert(rslot(i0->arg[0], fn) == -1); + assert(rslot(i0->arg[1], fn) == -1); + } +} + +static Ins * +flagi(Ins *i0, Ins *i) +{ + while (i>i0) { + i--; + if (amd64_op[i->op].zflag) + return i; + if (amd64_op[i->op].lflag) + continue; + return 0; + } + return 0; +} + +static void +seljmp(Blk *b, Fn *fn) +{ + Ref r; + int c, k; + Ins *fi; + Tmp *t; + + if (b->jmp.type == Jret0 || b->jmp.type == Jjmp) + return; + assert(b->jmp.type == Jjnz); + r = b->jmp.arg; + t = &fn->tmp[r.val]; + b->jmp.arg = R; + assert(!req(r, R) && rtype(r) != RCon); + if (b->s1 == b->s2) { + chuse(r, -1, fn); + b->jmp.type = Jjmp; + b->s2 = 0; + return; + } + fi = flagi(b->ins, &b->ins[b->nins]); + if (!fi || !req(fi->to, r)) { + selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, long jnz */ + b->jmp.type = Jjf + Cine; + } + else if (iscmp(fi->op, &k, &c)) { + if (t->nuse == 1) { + if (selcmp(fi->arg, k, fn)) + c = cmpop(c); + *fi = (Ins){.op = Onop}; + } + b->jmp.type = Jjf + c; + } + else if (fi->op == Oand && t->nuse == 1 + && (rtype(fi->arg[0]) == RTmp || + rtype(fi->arg[1]) == RTmp)) { + fi->op = Oxtest; + fi->to = R; + b->jmp.type = Jjf + Cine; + if (rtype(fi->arg[1]) == RCon) { + r = fi->arg[1]; + fi->arg[1] = fi->arg[0]; + fi->arg[0] = r; + } + } + else { + /* since flags are not tracked in liveness, + * the result of the flag-setting instruction + * has to be marked as live + */ + if (t->nuse == 1) + emit(Ocopy, Kw, R, r, R); + b->jmp.type = Jjf + Cine; + } +} + +static int +aref(Ref r, ANum *ai) +{ + switch (rtype(r)) { + case RCon: + return 2; + case RTmp: + return ai[r.val].n; + default: + die("constant or temporary expected"); + } +} + +static int +ascale(Ref r, Con *con) +{ + int64_t n; + + if (rtype(r) != RCon) + return 0; + if (con[r.val].type != CBits) + return 0; + n = con[r.val].bits.i; + return n == 1 || n == 2 || n == 4 || n == 8; +} + +static void +anumber(ANum *ai, Blk *b, Con *con) +{ + /* This should be made obsolete by 
a proper + * reassoc pass. + * + * Rules: + * + * RTmp(_) -> 0 tmp + * ( RTmp(_) -> 1 slot ) + * RCon(_) -> 2 con + * 0 * 2 -> 3 s * i (when constant is 1,2,4,8) + */ + static char add[10][10] = { + [2] [2] = 2, /* folding */ + [2] [5] = 5, [5] [2] = 5, + [2] [6] = 6, [6] [2] = 6, + [2] [7] = 7, [7] [2] = 7, + [0] [0] = 4, /* 4: b + s * i */ + [0] [3] = 4, [3] [0] = 4, + [2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */ + [0] [2] = 6, [2] [0] = 6, /* 6: o + b */ + [2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */ + [0] [5] = 7, [5] [0] = 7, + [6] [3] = 7, [3] [6] = 7, + + }; + int a, a1, a2, n1, n2, t1, t2; + Ins *i; + + for (i=b->ins; i-b->ins < b->nins; i++) { + if (rtype(i->to) == RTmp) + ai[i->to.val].i = i; + if (i->op != Oadd && i->op != Omul) + continue; + a1 = aref(i->arg[0], ai); + a2 = aref(i->arg[1], ai); + t1 = a1 != 1 && a1 != 2; + t2 = a2 != 1 && a2 != 2; + if (i->op == Oadd) { + a = add[n1 = a1][n2 = a2]; + if (t1 && a < add[0][a2]) + a = add[n1 = 0][n2 = a2]; + if (t2 && a < add[a1][0]) + a = add[n1 = a1][n2 = 0]; + if (t1 && t2 && a < add[0][0]) + a = add[n1 = 0][n2 = 0]; + } else { + n1 = n2 = a = 0; + if (ascale(i->arg[0], con) && t2) + a = 3, n1 = 2, n2 = 0; + if (t1 && ascale(i->arg[1], con)) + a = 3, n1 = 0, n2 = 2; + } + ai[i->to.val].n = a; + ai[i->to.val].l = n1; + ai[i->to.val].r = n2; + } +} + +static void +amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top) +{ + Ins *i; + int nl, nr, t, s; + Ref al, ar; + + if (top) + memset(a, 0, sizeof *a); + if (rtype(r) == RCon) { + addcon(&a->offset, &fn->con[r.val]); + return; + } + assert(rtype(r) == RTmp); + i = ai[r.val].i; + nl = ai[r.val].l; + nr = ai[r.val].r; + if (i) { + if (nl > nr) { + al = i->arg[1]; + ar = i->arg[0]; + t = nl, nl = nr, nr = t; + } else { + al = i->arg[0]; + ar = i->arg[1]; + } + } + switch (ai[r.val].n) { + case 3: /* s * i */ + if (!top) { + a->index = al; + a->scale = fn->con[ar.val].bits.i; + } else + a->base = r; + break; + case 4: /* b + s * i */ + switch (nr) { + case 0: 
+ if (fn->tmp[ar.val].slot != -1) { + al = i->arg[1]; + ar = i->arg[0]; + } + a->index = ar; + a->scale = 1; + break; + case 3: + amatch(a, ar, ai, fn, 0); + break; + } + r = al; + case 0: + s = fn->tmp[r.val].slot; + if (s != -1) + r = SLOT(s); + a->base = r; + break; + case 2: /* constants */ + case 5: /* o + s * i */ + case 6: /* o + b */ + case 7: /* o + b + s * i */ + amatch(a, ar, ai, fn, 0); + amatch(a, al, ai, fn, 0); + break; + default: + die("unreachable"); + } +} + +/* instruction selection + * requires use counts (as given by parsing) + */ +void +amd64_isel(Fn *fn) +{ + Blk *b, **sb; + Ins *i; + Phi *p; + uint a; + int n, al; + int64_t sz; + ANum *ainfo; + + /* assign slots to fast allocs */ + b = fn->start; + /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */ + for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2) + for (i=b->ins; i-b->ins < b->nins; i++) + if (i->op == al) { + if (rtype(i->arg[0]) != RCon) + break; + sz = fn->con[i->arg[0].val].bits.i; + if (sz < 0 || sz >= INT_MAX-15) + err("invalid alloc size %"PRId64, sz); + sz = (sz + n-1) & -n; + sz /= 4; + fn->tmp[i->to.val].slot = fn->slot; + fn->slot += sz; + *i = (Ins){.op = Onop}; + } + + /* process basic blocks */ + n = fn->ntmp; + ainfo = emalloc(n * sizeof ainfo[0]); + for (b=fn->start; b; b=b->link) { + curi = &insb[NIns]; + for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++) + for (p=(*sb)->phi; p; p=p->link) { + for (a=0; p->blk[a] != b; a++) + assert(a+1 < p->narg); + fixarg(&p->arg[a], p->cls, 1, fn); + } + memset(ainfo, 0, n * sizeof ainfo[0]); + anumber(ainfo, b, fn->con); + seljmp(b, fn); + for (i=&b->ins[b->nins]; i!=b->ins;) + sel(*--i, ainfo, fn); + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } + free(ainfo); + + if (debug['I']) { + fprintf(stderr, "\n> After instruction selection:\n"); + printfn(fn, stderr); + } +} diff --git a/amd64/sysv.c b/amd64/sysv.c new file mode 100644 index 0000000..dcaa812 --- /dev/null +++ b/amd64/sysv.c @@ -0,0 +1,701 @@ 
+#include "all.h" + +typedef struct AClass AClass; +typedef struct RAlloc RAlloc; + +struct AClass { + int inmem; + int align; + uint size; + int cls[2]; + Ref ref[2]; +}; + +struct RAlloc { + Ins i; + RAlloc *link; +}; + +static void +classify(AClass *a, Typ *t, int *pn, int *pe) +{ + Seg *seg; + int n, s, *cls; + + for (n=0; n<t->nunion; n++) { + seg = t->seg[n]; + for (s=0; *pe<2; (*pe)++) { + cls = &a->cls[*pe]; + for (; *pn<8; s++) { + switch (seg[s].type) { + case SEnd: + goto Done; + case SPad: + /* don't change anything */ + break; + case SFlt: + if (*cls == Kx) + *cls = Kd; + break; + case SInt: + *cls = Kl; + break; + case STyp: + classify(a, &typ[seg[s].len], pn, pe); + continue; + } + *pn += seg[s].len; + } + Done: + assert(*pn <= 8); + *pn = 0; + } + } +} + +static void +typclass(AClass *a, Typ *t) +{ + int e, n; + uint sz, al; + + sz = t->size; + al = 1u << t->align; + + /* the ABI requires sizes to be rounded + * up to the nearest multiple of 8, moreover + * it makes it easy load and store structures + * in registers + */ + if (al < 8) + al = 8; + sz = (sz + al-1) & -al; + + a->size = sz; + a->align = t->align; + + if (t->dark || sz > 16 || sz == 0) { + /* large or unaligned structures are + * required to be passed in memory + */ + a->inmem = 1; + return; + } + + a->cls[0] = Kx; + a->cls[1] = Kx; + a->inmem = 0; + n = 0; + e = 0; + classify(a, t, &n, &e); +} + +static int +retr(Ref reg[2], AClass *aret) +{ + static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}}; + int n, k, ca, nr[2]; + + nr[0] = nr[1] = 0; + ca = 0; + for (n=0; (uint)n*8<aret->size; n++) { + k = KBASE(aret->cls[n]); + reg[n] = TMP(retreg[k][nr[k]++]); + ca += 1 << (2 * k); + } + return ca; +} + +static void +selret(Blk *b, Fn *fn) +{ + int j, k, ca; + Ref r, r0, reg[2]; + AClass aret; + + j = b->jmp.type; + + if (!isret(j) || j == Jret0) + return; + + r0 = b->jmp.arg; + b->jmp.type = Jret0; + + if (j == Jretc) { + typclass(&aret, &typ[fn->retty]); + if (aret.inmem) { + 
assert(rtype(fn->retr) == RTmp); + emit(Ocopy, Kl, TMP(RAX), fn->retr, R); + blit(fn->retr, 0, r0, aret.size, fn); + ca = 1; + } else { + ca = retr(reg, &aret); + if (aret.size > 8) { + r = newtmp("abi", Kl, fn); + emit(Oload, Kl, reg[1], r, R); + emit(Oadd, Kl, r, r0, getcon(8, fn)); + } + emit(Oload, Kl, reg[0], r0, R); + } + } else { + k = j - Jretw; + if (KBASE(k) == 0) { + emit(Ocopy, k, TMP(RAX), r0, R); + ca = 1; + } else { + emit(Ocopy, k, TMP(XMM0), r0, R); + ca = 1 << 2; + } + } + + b->jmp.arg = CALL(ca); +} + +static int +argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env) +{ + int nint, ni, nsse, ns, n, *pn; + AClass *a; + Ins *i; + + if (aret && aret->inmem) + nint = 5; /* hidden argument */ + else + nint = 6; + nsse = 8; + for (i=i0, a=ac; i<i1; i++, a++) + switch (i->op - op + Oarg) { + case Oarg: + if (KBASE(i->cls) == 0) + pn = &nint; + else + pn = &nsse; + if (*pn > 0) { + --*pn; + a->inmem = 0; + } else + a->inmem = 2; + a->align = 3; + a->size = 8; + a->cls[0] = i->cls; + break; + case Oargc: + n = i->arg[0].val; + typclass(a, &typ[n]); + if (a->inmem) + continue; + ni = ns = 0; + for (n=0; (uint)n*8<a->size; n++) + if (KBASE(a->cls[n]) == 0) + ni++; + else + ns++; + if (nint >= ni && nsse >= ns) { + nint -= ni; + nsse -= ns; + } else + a->inmem = 1; + break; + case Oarge: + if (op == Opar) + *env = i->to; + else + *env = i->arg[0]; + break; + } + + return ((6-nint) << 4) | ((8-nsse) << 8); +} + +int amd64_sysv_rsave[] = { + RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1 +}; +int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1}; + +MAKESURE(sysv_arrays_ok, + sizeof amd64_sysv_rsave == (NGPS+NFPS+1) * sizeof(int) && + sizeof amd64_sysv_rclob == (NCLR+1) * sizeof(int) +); + +/* layout of call's second argument (RCall) + * + * 29 12 8 4 3 0 + * |0...00|x|xxxx|xxxx|xx|xx| range + * | | | | ` gp regs returned (0..2) + * | | | ` sse 
regs returned (0..2)
 * | | ` gp regs passed (0..6)
 * | ` sse regs passed (0..8)
 * ` 1 if rax is used to pass data (0..1)
 */

/* decode the RCall word r into the bit set of registers used to
 * return values; if p is non-null, also report the gp count in
 * p[0] and the sse count in p[1] */
bits
amd64_sysv_retregs(Ref r, int p[2])
{
	bits b;
	int ni, nf;

	assert(rtype(r) == RCall);
	b = 0;
	ni = r.val & 3;        /* gp results returned in rax, rdx */
	nf = (r.val >> 2) & 3; /* sse results returned in xmm0, xmm1 */
	if (ni >= 1)
		b |= BIT(RAX);
	if (ni >= 2)
		b |= BIT(RDX);
	if (nf >= 1)
		b |= BIT(XMM0);
	if (nf >= 2)
		b |= BIT(XMM1);
	if (p) {
		p[0] = ni;
		p[1] = nf;
	}
	return b;
}

/* decode the RCall word r into the bit set of registers used to
 * pass arguments (see the bit layout in the comment above); if p
 * is non-null, report the gp count (rax included when it carries
 * the variadic sse count or the env) in p[0], sse count in p[1] */
bits
amd64_sysv_argregs(Ref r, int p[2])
{
	bits b;
	int j, ni, nf, ra;

	assert(rtype(r) == RCall);
	b = 0;
	ni = (r.val >> 4) & 15;
	nf = (r.val >> 8) & 15;
	ra = (r.val >> 12) & 1; /* rax carries extra data (vararg/env) */
	for (j=0; j<ni; j++)
		b |= BIT(amd64_sysv_rsave[j]);
	for (j=0; j<nf; j++)
		b |= BIT(XMM0+j);
	if (p) {
		p[0] = ni + ra;
		p[1] = nf;
	}
	return b | (ra ? BIT(RAX) : 0);
}

/* return the next argument register for a value of class ty:
 * integer classes draw from amd64_sysv_rsave (rdi, rsi, ...),
 * sse classes from xmm0 up; *ni/*ns are the running counters */
static Ref
rarg(int ty, int *ni, int *ns)
{
	if (KBASE(ty) == 0)
		return TMP(amd64_sysv_rsave[(*ni)++]);
	else
		return TMP(XMM0 + (*ns)++);
}

/* lower one call: i0..i1 are the Oarg/Oargc/Oarge instructions
 * followed by the Ocall/Ovacall at i1; aggregate return pads are
 * queued on *rap so the caller can emit their allocs in the start
 * block; note that emit() prepends, so everything below is written
 * in reverse program order */
static void
selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
{
	Ins *i;
	AClass *ac, *a, aret;
	int ca, ni, ns, al, varc, envc;
	uint stk, off;
	Ref r, r1, r2, reg[2], env;
	RAlloc *ra;

	env = R;
	ac = alloc((i1-i0) * sizeof ac[0]);

	if (!req(i1->arg[1], R)) {
		/* the call returns an aggregate; classify it first */
		assert(rtype(i1->arg[1]) == RType);
		typclass(&aret, &typ[i1->arg[1].val]);
		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
	} else
		ca = argsclass(i0, i1, ac, Oarg, 0, &env);

	/* measure the stack space used by memory arguments;
	 * align-16 arguments pad to a 16-byte boundary (sizes are
	 * presumably 8-byte multiples here -- see typclass) */
	for (stk=0, a=&ac[i1-i0]; a>ac;)
		if ((--a)->inmem) {
			if (a->align > 4)
				err("sysv abi requires alignments of 16 or less");
			stk += a->size;
			if (a->align == 4)
				stk += stk & 15;
		}
	stk += stk & 15;
	if (stk) {
		/* negative salloc: in final (reversed) order this pops
		 * the argument area after the call */
		r = getcon(-(int64_t)stk, fn);
		emit(Osalloc, Kl, R, r, R);
	}

	if (!req(i1->arg[1], R)) {
		if (aret.inmem) {
			/* get the return location from eax
			 * it saves one callee-save reg */
			r1 = newtmp("abi", Kl, fn);
			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
			ca += 1;
		} else {
			/* aggregate comes back in up to two registers;
			 * store them into the return pad */
			if (aret.size > 8) {
				r = newtmp("abi", Kl, fn);
				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
				emit(Ostorel, 0, R, aret.ref[1], r);
				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
			}
			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
			emit(Ostorel, 0, R, aret.ref[0], i1->to);
			ca += retr(reg, &aret);
			if (aret.size > 8)
				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
			r1 = i1->to;
		}
		/* allocate return pad */
		ra = alloc(sizeof *ra);
		/* specific to NAlign == 3 */
		al = aret.align >= 2 ? aret.align - 2 : 0;
		ra->i = (Ins){Oalloc+al, r1, {getcon(aret.size, fn)}, Kl};
		ra->link = (*rap);
		*rap = ra;
	} else {
		ra = 0;
		if (KBASE(i1->cls) == 0) {
			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
			ca += 1;
		} else {
			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
			ca += 1 << 2;
		}
	}
	envc = !req(R, env);
	varc = i1->op == Ovacall;
	if (varc && envc)
		err("sysv abi does not support variadic env calls");
	ca |= (varc | envc) << 12;
	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
	if (envc)
		emit(Ocopy, Kl, TMP(RAX), env, R);
	if (varc)
		/* variadic calls pass the number of sse registers
		 * used in al (here, the whole of rax) */
		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);

	/* load/copy the register arguments */
	ni = ns = 0;
	if (ra && aret.inmem)
		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
	for (i=i0, a=ac; i<i1; i++, a++) {
		if (a->inmem)
			continue;
		r1 = rarg(a->cls[0], &ni, &ns);
		if (i->op == Oargc) {
			if (a->size > 8) {
				r2 = rarg(a->cls[1], &ni, &ns);
				r = newtmp("abi", Kl, fn);
				emit(Oload, a->cls[1], r2, r, R);
				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
			}
			emit(Oload, a->cls[0], r1, i->arg[1], R);
		} else
			emit(Ocopy, i->cls, r1, i->arg[0], R);
	}

	if (!stk)
		return;

	/* store the memory arguments into the freshly allocated
	 * argument area (r is its base, from the salloc below) */
	r = newtmp("abi", Kl, fn);
	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
		if (!a->inmem)
			continue;
		if (i->op == Oargc) {
			if (a->align == 4)
				off += off & 15;
			blit(r, off, i->arg[1], a->size, fn);
		} else {
			r1 = newtmp("abi", Kl, fn);
			emit(Ostorel, 0, R, i->arg[0], r1);
			emit(Oadd, Kl, r1, r, getcon(off, fn));
		}
		off += a->size;
	}
	emit(Osalloc, Kl, r, getcon(stk, fn), R);
}

/* lower the parameter instructions i0..i1 of the start block;
 * returns the same packed word as argsclass, with the byte offset
 * of the first overflow (stack) parameter or'ed in at bit 12 for
 * later use by selvastart */
static int
selpar(Fn *fn, Ins *i0, Ins *i1)
{
	AClass *ac, *a, aret;
	Ins *i;
	int ni, ns, s, al, fa;
	Ref r, env;

	env = R;
	ac = alloc((i1-i0) * sizeof ac[0]);
	curi = &insb[NIns];
	ni = ns = 0;

	if (fn->retty >= 0) {
		typclass(&aret, &typ[fn->retty]);
		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
	} else
		fa = argsclass(i0, i1, ac, Opar, 0, &env);

	/* for aggregates passed in registers, allocate a local pad
	 * and spill the registers into it */
	for (i=i0, a=ac; i<i1; i++, a++) {
		if (i->op != Oparc || a->inmem)
			continue;
		if (a->size > 8) {
			r = newtmp("abi", Kl, fn);
			a->ref[1] = newtmp("abi", Kl, fn);
			emit(Ostorel, 0, R, a->ref[1], r);
			emit(Oadd, Kl, r, i->to, getcon(8, fn));
		}
		a->ref[0] = newtmp("abi", Kl, fn);
		emit(Ostorel, 0, R, a->ref[0], i->to);
		/* specific to NAlign == 3 */
		al = a->align >= 2 ? a->align - 2 : 0;
		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
	}

	if (fn->retty >= 0 && aret.inmem) {
		/* aggregate returned in memory: save the hidden
		 * pointer argument for selret */
		r = newtmp("abi", Kl, fn);
		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
		fn->retr = r;
	}

	/* s counts stack slots in 4-byte units; it starts at 4,
	 * presumably skipping the return address and saved rbp --
	 * TODO confirm against the frame layout in emit.c */
	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
		switch (a->inmem) {
		case 1: /* aggregate on the stack: reference it in place */
			if (a->align > 4)
				err("sysv abi requires alignments of 16 or less");
			if (a->align == 4)
				s = (s+3) & -4;
			fn->tmp[i->to.val].slot = -s;
			s += a->size / 4;
			continue;
		case 2: /* scalar on the stack: load it */
			emit(Oload, i->cls, i->to, SLOT(-s), R);
			s += 2;
			continue;
		}
		r = rarg(a->cls[0], &ni, &ns);
		if (i->op == Oparc) {
			emit(Ocopy, Kl, a->ref[0], r, R);
			if (a->size > 8) {
				r = rarg(a->cls[1], &ni, &ns);
				emit(Ocopy, Kl, a->ref[1], r, R);
			}
		} else
			emit(Ocopy, i->cls, i->to, r, R);
	}

	if (!req(R, env))
		emit(Ocopy, Kl, env, TMP(RAX), R);

	return fa | (s*4)<<12;
}

/* split block b: move the instructions buffered in curi into a
 * fresh block chained after b (used by selvaarg to build the
 * register/stack branch diamond) */
static Blk *
split(Fn *fn, Blk *b)
{
	Blk *bn;

	++fn->nblk;
	bn = blknew();
	bn->nins = &insb[NIns] - curi;
	idup(&bn->ins, curi, bn->nins);
	curi = &insb[NIns];
	bn->visit = ++b->visit;
	snprintf(bn->name, NString, "%s.%d", b->name, b->visit);
	bn->loop = b->loop;
	bn->link = b->link;
	b->link = bn;
	return bn;
}

/* in the phis of block b, replace predecessor bp by bp1 */
static void
chpred(Blk *b, Blk *bp, Blk *bp1)
{
	Phi *p;
	uint a;

	for (p=b->phi; p; p=p->link) {
		for (a=0; p->blk[a]!=bp; a++)
			assert(a+1<p->narg);
		p->blk[a] = bp1;
	}
}

/* lower an Ovaarg instruction i in block b into a branch on the
 * va_list's gp/fp offset: take the value from the register save
 * area when registers remain, from the overflow area otherwise;
 * the va_list fields at offsets 0/4/8/16 follow the System V
 * amd64 layout (gp_offset, fp_offset, overflow_arg_area,
 * reg_save_area) */
static void
selvaarg(Fn *fn, Blk *b, Ins *i)
{
	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
	Blk *b0, *bstk, *breg;
	int isint;

	c4 = getcon(4, fn);
	c8 = getcon(8, fn);
	c16 = getcon(16, fn);
	ap = i->arg[0];
	isint = KBASE(i->cls) == 0;

	/* @b [...]
	       r0 =l add ap, (0 or 4)
	       nr =l loadsw r0
	       r1 =w cultw nr, (48 or 176)
	       jnz r1, @breg, @bstk
	   @breg
	       r0 =l add ap, 16
	       r1 =l loadl r0
	       lreg =l add r1, nr
	       r0 =w add nr, (8 or 16)
	       r1 =l add ap, (0 or 4)
	       storew r0, r1
	   @bstk
	       r0 =l add ap, 8
	       lstk =l loadl r0
	       r1 =l add lstk, 8
	       storel r1, r0
	   @b0
	       %loc =l phi @breg %lreg, @bstk %lstk
	       i->to =(i->cls) load %loc
	*/

	/* @b0: join block, loads the value from the chosen slot */
	loc = newtmp("abi", Kl, fn);
	emit(Oload, i->cls, i->to, loc, R);
	b0 = split(fn, b);
	b0->jmp = b->jmp;
	b0->s1 = b->s1;
	b0->s2 = b->s2;
	if (b->s1)
		chpred(b->s1, b, b0);
	if (b->s2 && b->s2 != b->s1)
		chpred(b->s2, b, b0);

	/* @breg: slot in the register save area; bump the offset
	 * by 8 (gp) or 16 (sse) */
	lreg = newtmp("abi", Kl, fn);
	nr = newtmp("abi", Kl, fn);
	r0 = newtmp("abi", Kw, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorew, Kw, R, r0, r1);
	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Oadd, Kl, lreg, r1, nr);
	emit(Oload, Kl, r1, r0, R);
	emit(Oadd, Kl, r0, ap, c16);
	breg = split(fn, b);
	breg->jmp.type = Jjmp;
	breg->s1 = b0;

	/* @bstk: slot in the overflow area; advance the pointer */
	lstk = newtmp("abi", Kl, fn);
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, lstk, c8);
	emit(Oload, Kl, lstk, r0, R);
	emit(Oadd, Kl, r0, ap, c8);
	bstk = split(fn, b);
	bstk->jmp.type = Jjmp;
	bstk->s1 = b0;

	b0->phi = alloc(sizeof *b0->phi);
	*b0->phi = (Phi){
		.cls = Kl, .to = loc,
		.narg = 2,
		.blk = {bstk, breg},
		.arg = {lstk, lreg},
	};
	/* @b: compare the saved offset against the end of the gp
	 * (48 = 6*8) or sse (176 = 48 + 8*16) part of the save area */
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kw, fn);
	b->jmp.type = Jjnz;
	b->jmp.arg = r1;
	b->s1 = breg;
	b->s2 = bstk;
	c = getcon(isint ? 48 : 176, fn);
	emit(Ocmpw+Ciult, Kw, r1, nr, c);
	emit(Oloadsw, Kl, nr, r0, R);
	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
}

/* lower Ovastart: initialize the va_list at ap from the packed
 * word fa returned by selpar (gp/sse register counts and the byte
 * offset of the first stack parameter); rbp-176 is the register
 * save area -- presumably spilled by the prologue, TODO confirm
 * against emit.c */
static void
selvastart(Fn *fn, int fa, Ref ap)
{
	Ref r0, r1;
	int gp, fp, sp;

	gp = ((fa >> 4) & 15) * 8;        /* gp_offset: 8 bytes per used gp reg */
	fp = 48 + ((fa >> 8) & 15) * 16;  /* fp_offset: past the gp area */
	sp = fa >> 12;                    /* offset of the stack parameters */
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
	emit(Oadd, Kl, r0, ap, getcon(16, fn));
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
	emit(Oadd, Kl, r0, ap, getcon(8, fn));
	r0 = newtmp("abi", Kl, fn);
	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
	emit(Oadd, Kl, r0, ap, getcon(4, fn));
	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
}

/* ABI lowering driver: rewrite parameters, calls, returns, and
 * vararg instructions of fn into SysV-conformant register/stack
 * operations; blocks are rebuilt in reverse instruction order
 * since emit() prepends into insb */
void
amd64_sysv_abi(Fn *fn)
{
	Blk *b;
	Ins *i, *i0, *ip;
	RAlloc *ral;
	int n, fa;

	for (b=fn->start; b; b=b->link)
		b->visit = 0;

	/* lower parameters */
	for (b=fn->start, i=b->ins; i-b->ins<b->nins; i++)
		if (!ispar(i->op))
			break;
	fa = selpar(fn, b->ins, i);
	/* splice the generated parameter code in front of the
	 * remaining instructions of the start block */
	n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
	i0 = alloc(n * sizeof(Ins));
	ip = icpy(ip = i0, curi, &insb[NIns] - curi);
	ip = icpy(ip, i, &b->ins[b->nins] - i);
	b->nins = n;
	b->ins = i0;

	/* lower calls, returns, and vararg instructions */
	ral = 0;
	b = fn->start;
	do {
		if (!(b = b->link))
			b = fn->start; /* do it last */
		if (b->visit)
			continue; /* block was created by selvaarg; skip */
		curi = &insb[NIns];
		selret(b, fn);
		for (i=&b->ins[b->nins]; i!=b->ins;)
			switch ((--i)->op) {
			default:
				emiti(*i);
				break;
			case Ocall:
			case Ovacall:
				/* step back to the first arg of the call */
				for (i0=i; i0>b->ins; i0--)
					if (!isarg((i0-1)->op))
						break;
				selcall(fn, i0, i, &ral);
				i = i0;
				break;
			case Ovastart:
				selvastart(fn, fa, i->arg[0]);
				break;
			case Ovaarg:
				selvaarg(fn, b, i);
				break;
			case Oarg:
			case Oargc:
				die("unreachable");
			}
		if (b == fn->start)
			/* return pads queued by selcall go in the start block */
			for (; ral; ral=ral->link)
				emiti(ral->i);
		b->nins = &insb[NIns] - curi;
		idup(&b->ins, curi, b->nins);
	} while (b != fn->start);

	if (debug['A']) {
		fprintf(stderr, "\n> After ABI lowering:\n");
		printfn(fn, stderr);
	}
}
diff --git a/amd64/targ.c b/amd64/targ.c
new file mode 100644
index 0000000..e227574
--- /dev/null
+++ b/amd64/targ.c
@@ -0,0 +1,30 @@
#include "all.h"

/* per-opcode table generated from ops.h: the X columns fill the
 * nmem/zflag/lflag fields of Amd64Op */
Amd64Op amd64_op[NOp] = {
#define O(op, t, x) [O##op] =
#define X(nm, zf, lf) { nm, zf, lf, },
	#include "../ops.h"
};

/* number of arguments of op that may be memory operands */
static int
amd64_memargs(int op)
{
	return amd64_op[op].nmem;
}

/* amd64 System V target descriptor */
Target T_amd64_sysv = {
	.gpr0 = RAX,
	.ngpr = NGPR,
	.fpr0 = XMM0,
	.nfpr = NFPR,
	.rglob = BIT(RBP) | BIT(RSP), /* rbp/rsp are globally live */
	.nrglob = 2,
	.rsave = amd64_sysv_rsave,
	.nrsave = {NGPS, NFPS},
	.retregs = amd64_sysv_retregs,
	.argregs = amd64_sysv_argregs,
	.memargs = amd64_memargs,
	.abi = amd64_sysv_abi,
	.isel = amd64_isel,
	.emitfn = amd64_emitfn,
};