summary refs log tree commit diff
path: root/amd64
diff options
context:
space:
mode:
Diffstat (limited to 'amd64')
-rw-r--r--amd64/all.h70
-rw-r--r--amd64/emit.c561
-rw-r--r--amd64/isel.c603
-rw-r--r--amd64/sysv.c701
-rw-r--r--amd64/targ.c30
5 files changed, 1965 insertions, 0 deletions
diff --git a/amd64/all.h b/amd64/all.h
new file mode 100644
index 0000000..3a2db0e
--- /dev/null
+++ b/amd64/all.h
@@ -0,0 +1,70 @@
+#include "../all.h"
+
+typedef struct Amd64Op Amd64Op;
+
+enum Amd64Reg {
+	RAX = RXX+1, /* caller-save */
+	RCX,
+	RDX,
+	RSI,
+	RDI,
+	R8,
+	R9,
+	R10,
+	R11,
+
+	RBX, /* callee-save */
+	R12,
+	R13,
+	R14,
+	R15,
+
+	RBP, /* globally live */
+	RSP,
+
+	XMM0, /* sse */
+	XMM1,
+	XMM2,
+	XMM3,
+	XMM4,
+	XMM5,
+	XMM6,
+	XMM7,
+	XMM8,
+	XMM9,
+	XMM10,
+	XMM11,
+	XMM12,
+	XMM13,
+	XMM14,
+	XMM15,
+
+	NFPR = XMM14 - XMM0 + 1, /* reserve XMM15 */
+	NGPR = RSP - RAX + 1, /* general-purpose registers */
+	NGPS = R11 - RAX + 1, /* caller-save (scratch) gp registers */
+	NFPS = NFPR,
+	NCLR = R15 - RBX + 1, /* callee-save registers */
+};
+MAKESURE(reg_not_tmp, XMM15 < (int)Tmp0);
+
+struct Amd64Op {
+	char nmem;  /* NOTE(review): presumably max memory operands allowed -- confirm in targ.c */
+	char zflag; /* sets the status flags (see flagi() in isel.c) */
+	char lflag; /* leaves the status flags untouched */
+};
+
+/* targ.c */
+extern Amd64Op amd64_op[];
+
+/* sysv.c (abi) */
+extern int amd64_sysv_rsave[];
+extern int amd64_sysv_rclob[];
+bits amd64_sysv_retregs(Ref, int[2]);
+bits amd64_sysv_argregs(Ref, int[2]);
+void amd64_sysv_abi(Fn *);
+
+/* isel.c */
+void amd64_isel(Fn *);
+
+/* emit.c */
+void amd64_emitfn(Fn *, FILE *);
diff --git a/amd64/emit.c b/amd64/emit.c
new file mode 100644
index 0000000..eccbd02
--- /dev/null
+++ b/amd64/emit.c
@@ -0,0 +1,561 @@
+#include "all.h"
+
+
+/* map comparison ops to the x86 condition-code
+ * suffix used to build set<cc> and j<cc>; the
+ * entries past NCmpI are the floating-point
+ * comparisons (emitted after comiss/comisd)
+ */
+#define CMP(X) \
+	X(Ciule,      "be") \
+	X(Ciult,      "b")  \
+	X(Cisle,      "le") \
+	X(Cislt,      "l")  \
+	X(Cisgt,      "g")  \
+	X(Cisge,      "ge") \
+	X(Ciugt,      "a")  \
+	X(Ciuge,      "ae") \
+	X(Cieq,       "z")  \
+	X(Cine,       "nz") \
+	X(NCmpI+Cfle, "be") \
+	X(NCmpI+Cflt, "b")  \
+	X(NCmpI+Cfgt, "a")  \
+	X(NCmpI+Cfge, "ae") \
+	X(NCmpI+Cfeq, "z")  \
+	X(NCmpI+Cfne, "nz") \
+	X(NCmpI+Cfo,  "np") \
+	X(NCmpI+Cfuo, "p")
+
+enum {
+	/* operand sizes; they index the columns of rname[][] */
+	SLong = 0,
+	SWord = 1,
+	SShort = 2,
+	SByte = 3,
+
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+/* Instruction format strings:
+ *
+ * if the format string starts with -, the instruction
+ * is assumed to be 3-address and is put in 2-address
+ * mode using an extra mov if necessary
+ *
+ * if the format string starts with +, the same as the
+ * above applies, but commutativity is also assumed
+ *
+ * %k  is used to set the class of the instruction,
+ *     it'll expand to "l", "q", "ss", "sd", depending
+ *     on the instruction class
+ * %0  designates the first argument
+ * %1  designates the second argument
+ * %=  designates the result
+ *
+ * if %k is not used, a prefix to 0, 1, or = must be
+ * added, it can be:
+ *   M - memory reference
+ *   L - long  (64 bits)
+ *   W - word  (32 bits)
+ *   H - short (16 bits)
+ *   B - byte  (8 bits)
+ *   S - single precision float
+ *   D - double precision float
+ *
+ * the table is scanned linearly and the first entry
+ * matching both op and class is used (see emitins),
+ * so the order of the entries matters
+ */
+static struct {
+	short op;
+	short cls;
+	char *asm;
+} omap[] = {
+	{ Oadd,    Ka, "+add%k %1, %=" },
+	{ Osub,    Ka, "-sub%k %1, %=" },
+	{ Oand,    Ki, "+and%k %1, %=" },
+	{ Oor,     Ki, "+or%k %1, %=" },
+	{ Oxor,    Ki, "+xor%k %1, %=" },
+	{ Osar,    Ki, "-sar%k %B1, %=" },
+	{ Oshr,    Ki, "-shr%k %B1, %=" },
+	{ Oshl,    Ki, "-shl%k %B1, %=" },
+	{ Omul,    Ki, "+imul%k %1, %=" },
+	{ Omul,    Ks, "+mulss %1, %=" },
+	{ Omul,    Kd, "+mulsd %1, %=" },
+	{ Odiv,    Ka, "-div%k %1, %=" },
+	{ Ostorel, Ka, "movq %L0, %M1" },
+	{ Ostorew, Ka, "movl %W0, %M1" },
+	{ Ostoreh, Ka, "movw %H0, %M1" },
+	{ Ostoreb, Ka, "movb %B0, %M1" },
+	{ Ostores, Ka, "movss %S0, %M1" },
+	{ Ostored, Ka, "movsd %D0, %M1" },
+	{ Oload,   Ka, "mov%k %M0, %=" },
+	{ Oloadsw, Kl, "movslq %M0, %L=" },
+	{ Oloadsw, Kw, "movl %M0, %W=" },
+	{ Oloaduw, Ki, "movl %M0, %W=" },
+	{ Oloadsh, Ki, "movsw%k %M0, %=" },
+	{ Oloaduh, Ki, "movzw%k %M0, %=" },
+	{ Oloadsb, Ki, "movsb%k %M0, %=" },
+	{ Oloadub, Ki, "movzb%k %M0, %=" },
+	{ Oextsw,  Kl, "movslq %W0, %L=" },
+	{ Oextuw,  Kl, "movl %W0, %W=" },
+	{ Oextsh,  Ki, "movsw%k %H0, %=" },
+	{ Oextuh,  Ki, "movzw%k %H0, %=" },
+	{ Oextsb,  Ki, "movsb%k %B0, %=" },
+	{ Oextub,  Ki, "movzb%k %B0, %=" },
+
+	{ Oexts,   Kd, "cvtss2sd %0, %=" },
+	{ Otruncd, Ks, "cvttsd2ss %0, %=" },
+	{ Ostosi,  Ki, "cvttss2si%k %0, %=" },
+	{ Odtosi,  Ki, "cvttsd2si%k %0, %=" },
+	{ Oswtof,  Ka, "cvtsi2%k %W0, %=" },
+	{ Osltof,  Ka, "cvtsi2%k %L0, %=" },
+	{ Ocast,   Ki, "movq %D0, %L=" },
+	{ Ocast,   Ka, "movq %L0, %D=" },
+
+	{ Oaddr,   Ki, "lea%k %M0, %=" },
+	{ Oswap,   Ki, "xchg%k %0, %1" },
+	{ Osign,   Kl, "cqto" },
+	{ Osign,   Kw, "cltd" },
+	{ Oxdiv,   Ki, "div%k %0" },
+	{ Oxidiv,  Ki, "idiv%k %0" },
+	{ Oxcmp,   Ks, "comiss %S0, %S1" },
+	{ Oxcmp,   Kd, "comisd %D0, %D1" },
+	{ Oxcmp,   Ki, "cmp%k %0, %1" },
+	{ Oxtest,  Ki, "test%k %0, %1" },
+#define X(c, s) \
+	{ Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" },
+	CMP(X)
+#undef X
+	{ NOp, 0, 0 }
+};
+
+/* gp register names, rows indexed by register,
+ * columns by operand size (SLong..SByte) */
+static char *rname[][4] = {
+	[RAX] = {"rax", "eax", "ax", "al"},
+	[RBX] = {"rbx", "ebx", "bx", "bl"},
+	[RCX] = {"rcx", "ecx", "cx", "cl"},
+	[RDX] = {"rdx", "edx", "dx", "dl"},
+	[RSI] = {"rsi", "esi", "si", "sil"},
+	[RDI] = {"rdi", "edi", "di", "dil"},
+	[RBP] = {"rbp", "ebp", "bp", "bpl"},
+	[RSP] = {"rsp", "esp", "sp", "spl"},
+	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
+	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
+	[R10] = {"r10", "r10d", "r10w", "r10b"},
+	[R11] = {"r11", "r11d", "r11w", "r11b"},
+	[R12] = {"r12", "r12d", "r12w", "r12b"},
+	[R13] = {"r13", "r13d", "r13w", "r13b"},
+	[R14] = {"r14", "r14d", "r14w", "r14b"},
+	[R15] = {"r15", "r15d", "r15w", "r15b"},
+};
+
+
+/* map the abstract 4-byte stack slot s to an
+ * rbp-relative byte offset; negative slots
+ * resolve above rbp (NOTE(review): presumably
+ * incoming stack arguments -- confirm), other
+ * slots below rbp, past the 176-byte register
+ * save area when the function is variadic
+ */
+static int
+slot(int s, Fn *fn)
+{
+	struct { int i:29; } x;
+
+	/* sign extend s using a bitfield */
+	x.i = s;
+	assert(x.i <= fn->slot);
+	/* specific to NAlign == 3 */
+	if (x.i < 0)
+		return -4 * x.i;
+	else if (fn->vararg)
+		return -176 + -4 * (fn->slot - x.i);
+	else
+		return -4 * (fn->slot - x.i);
+}
+
+/* print the constant con: either a (local or
+ * global) symbol name with an optional addend,
+ * or a plain integer literal
+ */
+static void
+emitcon(Con *con, FILE *f)
+{
+	switch (con->type) {
+	case CAddr:
+		if (con->local)
+			fprintf(f, "%s%s", gasloc, con->label);
+		else
+			fprintf(f, "%s%s", gassym, con->label);
+		if (con->bits.i)
+			fprintf(f, "%+"PRId64, con->bits.i);
+		break;
+	case CBits:
+		fprintf(f, "%"PRId64, con->bits.i);
+		break;
+	default:
+		die("unreachable");
+	}
+}
+
+/* assembly name of register reg at operand size
+ * sz; sse registers have a single name whatever
+ * the size
+ */
+static char *
+regtoa(int reg, int sz)
+{
+	static char buf[6];
+
+	if (reg >= XMM0) {
+		sprintf(buf, "xmm%d", reg-XMM0);
+		return buf;
+	} else
+		return rname[reg][sz];
+}
+
+/* operand of i designated by the format
+ * character c (see the syntax above omap[])
+ */
+static Ref
+getarg(char c, Ins *i)
+{
+	switch (c) {
+	case '0':
+		return i->arg[0];
+	case '1':
+		return i->arg[1];
+	case '=':
+		return i->to;
+	default:
+		die("invalid arg letter %c", c);
+	}
+}
+
+static void emitins(Ins, Fn *, FILE *);
+
+/* emit a move of r2 into r1 by synthesizing an
+ * Ocopy instruction of class k
+ */
+static void
+emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
+{
+	Ins icp;
+
+	icp.op = Ocopy;
+	icp.arg[0] = r2;
+	icp.to = r1;
+	icp.cls = k;
+	emitins(icp, fn, f);
+}
+
+/* emit the instruction i following the format
+ * string s (syntax described in the comment
+ * above omap[]); the leading '-'/'+' markers
+ * convert 3-address instructions to 2-address
+ * form
+ */
+static void
+emitf(char *s, Ins *i, Fn *fn, FILE *f)
+{
+	static char clstoa[][3] = {"l", "q", "ss", "sd"};
+	char c;
+	int sz;
+	Ref ref;
+	Mem *m;
+	Con off;
+
+	switch (*s) {
+	case '+':
+		if (req(i->arg[1], i->to)) {
+			/* commutative operation: swap the
+			 * arguments so arg[0] can serve as
+			 * the destination */
+			ref = i->arg[0];
+			i->arg[0] = i->arg[1];
+			i->arg[1] = ref;
+		}
+		/* fall through */
+	case '-':
+		assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
+			"cannot convert to 2-address");
+		emitcopy(i->to, i->arg[0], i->cls, fn, f);
+		s++;
+		break;
+	}
+
+	fputc('\t', f);
+Next:
+	while ((c = *s++) != '%')
+		if (!c) {
+			fputc('\n', f);
+			return;
+		} else
+			fputc(c, f);
+	switch ((c = *s++)) {
+	case '%':
+		fputc('%', f);
+		break;
+	case 'k':
+		fputs(clstoa[i->cls], f);
+		break;
+	case '0':
+	case '1':
+	case '=':
+		sz = KWIDE(i->cls) ? SLong : SWord;
+		s--;
+		goto Ref;
+	case 'D':
+	case 'S':
+		sz = SLong; /* does not matter for floats */
+	Ref:
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "%%%s", regtoa(ref.val, sz));
+			break;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RMem:
+		Mem:
+			m = &fn->mem[ref.val];
+			if (rtype(m->base) == RSlot) {
+				/* fold the slot's rbp offset into the
+				 * displacement and rebase on rbp */
+				off.type = CBits;
+				off.bits.i = slot(m->base.val, fn);
+				addcon(&m->offset, &off);
+				m->base = TMP(RBP);
+			}
+			if (m->offset.type != CUndef)
+				emitcon(&m->offset, f);
+			fputc('(', f);
+			if (req(m->base, R))
+				fprintf(f, "%%rip");
+			else
+				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
+			if (!req(m->index, R))
+				fprintf(f, ", %%%s, %d",
+					regtoa(m->index.val, SLong),
+					m->scale
+				);
+			fputc(')', f);
+			break;
+		case RCon:
+			fputc('$', f);
+			emitcon(&fn->con[ref.val], f);
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	case 'L':
+		sz = SLong;
+		goto Ref;
+	case 'W':
+		sz = SWord;
+		goto Ref;
+	case 'H':
+		sz = SShort;
+		goto Ref;
+	case 'B':
+		sz = SByte;
+		goto Ref;
+	case 'M':
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RMem:
+			goto Mem;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RCon:
+			emitcon(&fn->con[ref.val], f);
+			fprintf(f, "(%%rip)");
+			break;
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	default:
+		die("invalid format specifier %%%c", c);
+	}
+	goto Next;
+}
+
+/* emit one instruction; most are emitted
+ * straight from the omap[] table, the special
+ * cases are detailed below
+ */
+static void
+emitins(Ins i, Fn *fn, FILE *f)
+{
+	Ref r;
+	int64_t val;
+	int o;
+
+	switch (i.op) {
+	default:
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary
+			 * search */
+			if (omap[o].op == NOp)
+				die("no match for %s(%c)",
+					optab[i.op].name, "wlsd"[i.cls]);
+			if (omap[o].op == i.op)
+			if (omap[o].cls == i.cls
+			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
+			|| (omap[o].cls == Ka))
+				break;
+		}
+		emitf(omap[o].asm, &i, fn, f);
+		break;
+	case Onop:
+		/* just do nothing for nops, they are inserted
+		 * by some passes */
+		break;
+	case Omul:
+		/* here, we try to use the 3-address form
+		 * of multiplication when possible */
+		if (rtype(i.arg[1]) == RCon) {
+			r = i.arg[0];
+			i.arg[0] = i.arg[1];
+			i.arg[1] = r;
+		}
+		if (KBASE(i.cls) == 0 /* only available for ints */
+		&& rtype(i.arg[0]) == RCon
+		&& rtype(i.arg[1]) == RTmp) {
+			emitf("imul%k %0, %1, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case Osub:
+		/* we have to use the negation trick to handle
+		 * some 3-address subtractions */
+		if (req(i.to, i.arg[1])) {
+			emitf("neg%k %=", &i, fn, f);
+			emitf("add%k %0, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case Ocopy:
+		/* make sure we don't emit useless copies,
+		 * also, we can use a trick to load 64-bits
+		 * registers, it's detailed in my note below
+		 * http://c9x.me/art/notes.html?09/19/2015 */
+		if (req(i.to, R) || req(i.arg[0], R))
+			break;
+		if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& i.cls == Kl
+		&& fn->con[i.arg[0].val].type == CBits
+		&& (val = fn->con[i.arg[0].val].bits.i) >= 0
+		&& val <= UINT32_MAX) {
+			/* zero-extending movl is shorter than movabsq */
+			emitf("movl %W0, %W=", &i, fn, f);
+		} else if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& fn->con[i.arg[0].val].type == CAddr) {
+			emitf("lea%k %M0, %=", &i, fn, f);
+		} else if (!req(i.arg[0], i.to))
+			emitf("mov%k %0, %=", &i, fn, f);
+		break;
+	case Ocall:
+		/* calls simply have a weird syntax in AT&T
+		 * assembly... */
+		switch (rtype(i.arg[0])) {
+		case RCon:
+			fprintf(f, "\tcallq ");
+			emitcon(&fn->con[i.arg[0].val], f);
+			fprintf(f, "\n");
+			break;
+		case RTmp:
+			emitf("callq *%L0", &i, fn, f);
+			break;
+		default:
+			die("invalid call argument");
+		}
+		break;
+	case Osalloc:
+		/* there is no good reason why this is here
+		 * maybe we should split Osalloc in 2 different
+		 * instructions depending on the result
+		 */
+		emitf("subq %L0, %%rsp", &i, fn, f);
+		if (!req(i.to, R))
+			emitcopy(i.to, TMP(RSP), Kl, fn, f);
+		break;
+	case Oswap:
+		if (KBASE(i.cls) == 0)
+			goto Table;
+		/* for floats, there is no swap instruction
+		 * so we use xmm15 as a temporary
+		 */
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
+		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
+		break;
+	}
+}
+
+/* frame size subtracted from rsp in the prologue:
+ * 4 bytes per slot rounded up to preserve 16-byte
+ * stack alignment, plus 8 bytes when an odd number
+ * of callee-save registers gets pushed, plus the
+ * 176-byte register save area of variadic functions
+ */
+static int
+framesz(Fn *fn)
+{
+	int i, o, f;
+
+	/* specific to NAlign == 3 */
+	for (i=0, o=0; i<NCLR; i++)
+		o ^= 1 & (fn->reg >> amd64_sysv_rclob[i]);
+	f = fn->slot;
+	f = (f + 3) & -4;
+	return 4*f + 8*o + 176*fn->vararg;
+}
+
+/* emit gas (AT&T) assembly for the whole function */
+void
+amd64_emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[] = {
+	#define X(c, s) [c] = s,
+		CMP(X)
+	#undef X
+	};
+	static int id0;
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, fs, o, n, lbl;
+
+	fprintf(f, ".text\n");
+	if (fn->export)
+		fprintf(f, ".globl %s%s\n", gassym, fn->name);
+	fprintf(f,
+		"%s%s:\n"
+		"\tpushq %%rbp\n"
+		"\tmovq %%rsp, %%rbp\n",
+		gassym, fn->name
+	);
+	fs = framesz(fn);
+	if (fs)
+		fprintf(f, "\tsub $%d, %%rsp\n", fs);
+	if (fn->vararg) {
+		/* spill the argument registers into the
+		 * 176-byte register save area: 6 gp
+		 * registers, then 8 sse registers */
+		o = -176;
+		for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
+			fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o);
+		for (n=0; n<8; ++n, o+=16)
+			fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o);
+	}
+	/* save the callee-save registers used by the function */
+	for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r);
+			emitf("pushq %L0", &itmp, fn, f);
+		}
+
+	for (lbl=0, b=fn->start; b; b=b->link) {
+		if (lbl || b->npred > 1)
+			fprintf(f, "%sbb%d:\n", gasloc, id0+b->id);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, fn, f);
+		lbl = 1;
+		switch (b->jmp.type) {
+		case Jret0:
+			/* restore the callee-saves in reverse
+			 * push order, then tear down the frame */
+			for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;)
+				if (fn->reg & BIT(*--r)) {
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, fn, f);
+				}
+			fprintf(f,
+				"\tleave\n"
+				"\tret\n"
+			);
+			break;
+		case Jjmp:
+		Jmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp %sbb%d\n",
+					gasloc, id0+b->s1->id);
+			else
+				lbl = 0;
+			break;
+		default:
+			c = b->jmp.type - Jjf;
+			if (0 <= c && c <= NCmp) {
+				/* when s2 is the fall-through block,
+				 * swap the successors; otherwise,
+				 * negate the condition */
+				if (b->link == b->s2) {
+					s = b->s1;
+					b->s1 = b->s2;
+					b->s2 = s;
+				} else
+					c = cmpneg(c);
+				fprintf(f, "\tj%s %sbb%d\n", ctoa[c],
+					gasloc, id0+b->s2->id);
+				goto Jmp;
+			}
+			die("unhandled jump %d", b->jmp.type);
+		}
+	}
+	id0 += fn->nblk;
+}
diff --git a/amd64/isel.c b/amd64/isel.c
new file mode 100644
index 0000000..1623b9b
--- /dev/null
+++ b/amd64/isel.c
@@ -0,0 +1,603 @@
+#include "all.h"
+#include <limits.h>
+
+/* For x86_64, do the following:
+ *
+ * - check that constants are used only in
+ *   places allowed
+ * - ensure immediates always fit in 32b
+ * - expose machine register constraints
+ *   on instructions like division.
+ * - implement fast locals (the streak of
+ *   constant allocX in the first basic block)
+ * - recognize complex addressing modes
+ *
+ * Invariant: the use counts that are used
+ *            in sel() must be sound.  This
+ *            is not so trivial, maybe the
+ *            dce should be moved out...
+ */
+
+typedef struct ANum ANum;
+
+/* per-temporary addressing information computed
+ * by anumber(): n is the addressing class (see
+ * the rules in anumber), l/r the classes picked
+ * for the defining instruction's operands, i the
+ * defining instruction itself
+ */
+struct ANum {
+	char n, l, r;
+	Ins *i;
+};
+
+static void amatch(Addr *, Ref, ANum *, Fn *, int);
+
+/* return 1 when the constant r cannot be used
+ * as a signed 32-bit immediate
+ */
+static int
+noimm(Ref r, Fn *fn)
+{
+	int64_t val;
+
+	if (rtype(r) != RCon)
+		return 0;
+	switch (fn->con[r.val].type) {
+	case CAddr:
+		/* we only support the 'small'
+		 * code model of the ABI, this
+		 * means that we can always
+		 * address data with 32 bits
+		 */
+		return 0;
+	case CBits:
+		val = fn->con[r.val].bits.i;
+		return (val < INT32_MIN || val > INT32_MAX);
+	default:
+		die("invalid constant");
+	}
+}
+
+/* stack slot associated to the temporary r,
+ * or -1 when it has none
+ */
+static int
+rslot(Ref r, Fn *fn)
+{
+	if (rtype(r) != RTmp)
+		return -1;
+	return fn->tmp[r.val].slot;
+}
+
+/* rewrite the operand *r of class k so it is
+ * legal on amd64; cpy is set when the operand
+ * feeds a copy (phi argument, see amd64_isel),
+ * in which case wide constants are left alone
+ */
+static void
+fixarg(Ref *r, int k, int cpy, Fn *fn)
+{
+	Addr a, *m;
+	Ref r0, r1;
+	int s, n;
+
+	r1 = r0 = *r;
+	s = rslot(r0, fn);
+	if (KBASE(k) == 1 && rtype(r0) == RCon) {
+		/* load floating points from memory
+		 * slots, they can't be used as
+		 * immediates
+		 */
+		r1 = MEM(fn->nmem);
+		vgrow(&fn->mem, ++fn->nmem);
+		memset(&a, 0, sizeof a);
+		a.offset.type = CAddr;
+		a.offset.local = 1;
+		n = gasstashfp(fn->con[r0.val].bits.i, KWIDE(k));
+		sprintf(a.offset.label, "fp%d", n);
+		fn->mem[fn->nmem-1] = a;
+	}
+	else if (!cpy && k == Kl && noimm(r0, fn)) {
+		/* load constants that do not fit in
+		 * a 32bit signed integer into a
+		 * long temporary
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Ocopy, Kl, r1, r0, R);
+	}
+	else if (s != -1) {
+		/* load fast locals' addresses into
+		 * temporaries right before the
+		 * instruction
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Oaddr, Kl, r1, SLOT(s), R);
+	}
+	else if (rtype(r0) == RMem) {
+		/* apple asm fix */
+		m = &fn->mem[r0.val];
+		if (req(m->base, R)) {
+			n = fn->ncon;
+			vgrow(&fn->con, ++fn->ncon);
+			fn->con[n] = m->offset;
+			m->offset.type = CUndef;
+			r0 = newtmp("isel", Kl, fn);
+			emit(Oaddr, Kl, r0, CON(n), R);
+			m->base = r0;
+		}
+	}
+	*r = r1;
+}
+
+/* try to fuse the addressing computation of *r
+ * into an RMem operand, using the information
+ * gathered by anumber()
+ */
+static void
+seladdr(Ref *r, ANum *an, Fn *fn)
+{
+	Addr a;
+	Ref r0;
+
+	r0 = *r;
+	if (rtype(r0) == RTmp) {
+		amatch(&a, r0, an, fn, 1);
+		if (req(a.base, r0))
+			return;
+		if (a.offset.type == CAddr)
+		if (!req(a.base, R)) {
+			/* apple asm fix */
+			if (!req(a.index, R))
+				return;
+			else {
+				a.index = a.base;
+				a.scale = 1;
+				a.base = R;
+			}
+		}
+		chuse(r0, -1, fn);
+		vgrow(&fn->mem, ++fn->nmem);
+		fn->mem[fn->nmem-1] = a;
+		chuse(a.base, +1, fn);
+		chuse(a.index, +1, fn);
+		*r = MEM(fn->nmem-1);
+	}
+}
+
+/* emit a comparison of the two arguments; the
+ * return value is 1 when they were swapped to
+ * avoid a constant first operand, the caller
+ * must then mirror the condition (see cmpop)
+ */
+static int
+selcmp(Ref arg[2], int k, Fn *fn)
+{
+	int swap;
+	Ref r, *iarg;
+
+	swap = rtype(arg[0]) == RCon;
+	if (swap) {
+		r = arg[1];
+		arg[1] = arg[0];
+		arg[0] = r;
+	}
+	emit(Oxcmp, k, R, arg[1], arg[0]);
+	iarg = curi->arg;
+	if (rtype(arg[0]) == RCon) {
+		/* both operands are constants; load
+		 * one into a fresh temporary */
+		assert(k == Kl);
+		iarg[1] = newtmp("isel", k, fn);
+		emit(Ocopy, k, iarg[1], arg[0], R);
+	}
+	fixarg(&iarg[0], k, 0, fn);
+	fixarg(&iarg[1], k, 0, fn);
+	return swap;
+}
+
+/* select instructions for i, emitting bottom-up
+ * through emit()/curi; exposes the register
+ * constraints of divisions and shifts, and fuses
+ * address computations into loads and stores
+ */
+static void
+sel(Ins i, ANum *an, Fn *fn)
+{
+	Ref r0, r1, *iarg;
+	int x, k, kc;
+	int64_t sz;
+	Ins *i0, *i1;
+
+	/* dead-code elimination; relies on the
+	 * soundness of the use counts */
+	if (rtype(i.to) == RTmp)
+	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
+	if (fn->tmp[i.to.val].nuse == 0) {
+		chuse(i.arg[0], -1, fn);
+		chuse(i.arg[1], -1, fn);
+		return;
+	}
+	i0 = curi;
+	k = i.cls;
+	switch (i.op) {
+	case Odiv:
+	case Orem:
+	case Oudiv:
+	case Ourem:
+		/* division uses rax (quotient) and
+		 * rdx (remainder) */
+		if (i.op == Odiv || i.op == Oudiv)
+			r0 = TMP(RAX), r1 = TMP(RDX);
+		else
+			r0 = TMP(RDX), r1 = TMP(RAX);
+		emit(Ocopy, k, i.to, r0, R);
+		emit(Ocopy, k, R, r1, R);
+		if (rtype(i.arg[1]) == RCon) {
+			/* immediates not allowed for
+			 * divisions in x86
+			 */
+			r0 = newtmp("isel", k, fn);
+		} else
+			r0 = i.arg[1];
+		if (fn->tmp[r0.val].slot != -1)
+			err("unlikely argument %%%s in %s",
+				fn->tmp[r0.val].name, optab[i.op].name);
+		if (i.op == Odiv || i.op == Orem) {
+			emit(Oxidiv, k, R, r0, R);
+			emit(Osign, k, TMP(RDX), TMP(RAX), R);
+		} else {
+			emit(Oxdiv, k, R, r0, R);
+			emit(Ocopy, k, TMP(RDX), CON_Z, R);
+		}
+		emit(Ocopy, k, TMP(RAX), i.arg[0], R);
+		fixarg(&curi->arg[0], k, 0, fn);
+		if (rtype(i.arg[1]) == RCon)
+			emit(Ocopy, k, r0, i.arg[1], R);
+		break;
+	case Osar:
+	case Oshr:
+	case Oshl:
+		/* variable shift counts must be in cl */
+		if (rtype(i.arg[1]) == RCon)
+			goto Emit;
+		r0 = i.arg[1];
+		i.arg[1] = TMP(RCX);
+		emit(Ocopy, Kw, R, TMP(RCX), R);
+		emiti(i);
+		emit(Ocopy, Kw, TMP(RCX), r0, R);
+		break;
+	case Onop:
+		break;
+	case Ostored:
+	case Ostores:
+	case Ostorel:
+	case Ostorew:
+	case Ostoreh:
+	case Ostoreb:
+		if (rtype(i.arg[0]) == RCon) {
+			/* store float constants through the
+			 * same-width integer store */
+			if (i.op == Ostored)
+				i.op = Ostorel;
+			if (i.op == Ostores)
+				i.op = Ostorew;
+		}
+		seladdr(&i.arg[1], an, fn);
+		goto Emit;
+	case_Oload:
+		seladdr(&i.arg[0], an, fn);
+		goto Emit;
+	case Ocall:
+	case Osalloc:
+	case Ocopy:
+	case Oadd:
+	case Osub:
+	case Omul:
+	case Oand:
+	case Oor:
+	case Oxor:
+	case Oxtest:
+	case Ostosi:
+	case Odtosi:
+	case Oswtof:
+	case Osltof:
+	case Oexts:
+	case Otruncd:
+	case Ocast:
+	case_OExt:
+Emit:
+		emiti(i);
+		iarg = curi->arg; /* fixarg() can change curi */
+		fixarg(&iarg[0], argcls(&i, 0), 0, fn);
+		fixarg(&iarg[1], argcls(&i, 1), 0, fn);
+		break;
+	case Oalloc:
+	case Oalloc+1:
+	case Oalloc+2: /* == Oalloc1 */
+		/* we need to make sure
+		 * the stack remains aligned
+		 * (rsp = 0) mod 16
+		 */
+		if (rtype(i.arg[0]) == RCon) {
+			sz = fn->con[i.arg[0].val].bits.i;
+			if (sz < 0 || sz >= INT_MAX-15)
+				err("invalid alloc size %"PRId64, sz);
+			sz = (sz + 15)  & -16;
+			emit(Osalloc, Kl, i.to, getcon(sz, fn), R);
+		} else {
+			/* r0 = (i.arg[0] + 15) & -16 */
+			r0 = newtmp("isel", Kl, fn);
+			r1 = newtmp("isel", Kl, fn);
+			emit(Osalloc, Kl, i.to, r0, R);
+			emit(Oand, Kl, r0, r1, getcon(-16, fn));
+			emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn));
+			if (fn->tmp[i.arg[0].val].slot != -1)
+				err("unlikely argument %%%s in %s",
+					fn->tmp[i.arg[0].val].name, optab[i.op].name);
+		}
+		break;
+	default:
+		if (isext(i.op))
+			goto case_OExt;
+		if (isload(i.op))
+			goto case_Oload;
+		if (iscmp(i.op, &kc, &x)) {
+			emit(Oflag+x, k, i.to, R, R);
+			i1 = curi;
+			if (selcmp(i.arg, kc, fn))
+				i1->op = Oflag + cmpop(x);
+			break;
+		}
+		die("unknown instruction %s", optab[i.op].name);
+	}
+
+	/* check that the emitted instructions do not
+	 * use stack slots directly */
+	while (i0 > curi && --i0) {
+		assert(rslot(i0->arg[0], fn) == -1);
+		assert(rslot(i0->arg[1], fn) == -1);
+	}
+}
+
+/* scan [i0, i) backwards for the instruction that
+ * last set the status flags; return null when an
+ * instruction clobbering them is found first
+ */
+static Ins *
+flagi(Ins *i0, Ins *i)
+{
+	while (i>i0) {
+		i--;
+		if (amd64_op[i->op].zflag)
+			return i;
+		if (amd64_op[i->op].lflag)
+			continue;
+		return 0;
+	}
+	return 0;
+}
+
+/* turn the Jjnz terminator of b into a flag-based
+ * conditional jump, fusing a preceding comparison
+ * or and instruction when possible
+ */
+static void
+seljmp(Blk *b, Fn *fn)
+{
+	Ref r;
+	int c, k;
+	Ins *fi;
+	Tmp *t;
+
+	if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
+		return;
+	assert(b->jmp.type == Jjnz);
+	r = b->jmp.arg;
+	t = &fn->tmp[r.val];
+	b->jmp.arg = R;
+	assert(!req(r, R) && rtype(r) != RCon);
+	if (b->s1 == b->s2) {
+		/* both successors identical: plain jump */
+		chuse(r, -1, fn);
+		b->jmp.type = Jjmp;
+		b->s2 = 0;
+		return;
+	}
+	fi = flagi(b->ins, &b->ins[b->nins]);
+	if (!fi || !req(fi->to, r)) {
+		/* flags do not hold r: compare against 0 */
+		selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, long jnz */
+		b->jmp.type = Jjf + Cine;
+	}
+	else if (iscmp(fi->op, &k, &c)) {
+		/* fuse the comparison into the jump when
+		 * r has no other use */
+		if (t->nuse == 1) {
+			if (selcmp(fi->arg, k, fn))
+				c = cmpop(c);
+			*fi = (Ins){.op = Onop};
+		}
+		b->jmp.type = Jjf + c;
+	}
+	else if (fi->op == Oand && t->nuse == 1
+	     && (rtype(fi->arg[0]) == RTmp ||
+	         rtype(fi->arg[1]) == RTmp)) {
+		/* turn the and into a flag-only test */
+		fi->op = Oxtest;
+		fi->to = R;
+		b->jmp.type = Jjf + Cine;
+		if (rtype(fi->arg[1]) == RCon) {
+			r = fi->arg[1];
+			fi->arg[1] = fi->arg[0];
+			fi->arg[0] = r;
+		}
+	}
+	else {
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (t->nuse == 1)
+			emit(Ocopy, Kw, R, r, R);
+		b->jmp.type = Jjf + Cine;
+	}
+}
+
+/* addressing class of the operand r
+ * (see the rules in anumber() below)
+ */
+static int
+aref(Ref r, ANum *ai)
+{
+	switch (rtype(r)) {
+	case RCon:
+		return 2;
+	case RTmp:
+		return ai[r.val].n;
+	default:
+		die("constant or temporary expected");
+	}
+}
+
+/* return 1 when the constant r is a valid
+ * sib scale (1, 2, 4, or 8)
+ */
+static int
+ascale(Ref r, Con *con)
+{
+	int64_t n;
+
+	if (rtype(r) != RCon)
+		return 0;
+	if (con[r.val].type != CBits)
+		return 0;
+	n = con[r.val].bits.i;
+	return n == 1 || n == 2 || n == 4 || n == 8;
+}
+
+/* assign an addressing class to every temporary
+ * defined in b, bottom-up through the uses
+ */
+static void
+anumber(ANum *ai, Blk *b, Con *con)
+{
+	/* This should be made obsolete by a proper
+	 * reassoc pass.
+	 *
+	 * Rules:
+	 *
+	 *   RTmp(_) -> 0    tmp
+	 *   ( RTmp(_) -> 1    slot )
+	 *   RCon(_) -> 2    con
+	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
+	 */
+	static char add[10][10] = {
+		[2] [2] = 2,              /* folding */
+		[2] [5] = 5, [5] [2] = 5,
+		[2] [6] = 6, [6] [2] = 6,
+		[2] [7] = 7, [7] [2] = 7,
+		[0] [0] = 4,              /* 4: b + s * i */
+		[0] [3] = 4, [3] [0] = 4,
+		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
+		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
+		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
+		[0] [5] = 7, [5] [0] = 7,
+		[6] [3] = 7, [3] [6] = 7,
+
+	};
+	int a, a1, a2, n1, n2, t1, t2;
+	Ins *i;
+
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (rtype(i->to) == RTmp)
+			ai[i->to.val].i = i;
+		if (i->op != Oadd && i->op != Omul)
+			continue;
+		a1 = aref(i->arg[0], ai);
+		a2 = aref(i->arg[1], ai);
+		/* t1/t2: the operand is neither a slot (1)
+		 * nor a constant (2), so it can be demoted
+		 * to a plain tmp (0) */
+		t1 = a1 != 1 && a1 != 2;
+		t2 = a2 != 1 && a2 != 2;
+		if (i->op == Oadd) {
+			a = add[n1 = a1][n2 = a2];
+			if (t1 && a < add[0][a2])
+				a = add[n1 = 0][n2 = a2];
+			if (t2 && a < add[a1][0])
+				a = add[n1 = a1][n2 = 0];
+			if (t1 && t2 && a < add[0][0])
+				a = add[n1 = 0][n2 = 0];
+		} else {
+			n1 = n2 = a = 0;
+			if (ascale(i->arg[0], con) && t2)
+				a = 3, n1 = 2, n2 = 0;
+			if (t1 && ascale(i->arg[1], con))
+				a = 3, n1 = 0, n2 = 2;
+		}
+		ai[i->to.val].n = a;
+		ai[i->to.val].l = n1;
+		ai[i->to.val].r = n2;
+	}
+}
+
+/* fill the Addr a by pattern-matching the address
+ * computation rooted at r, using the classes
+ * assigned by anumber()
+ */
+static void
+amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
+{
+	Ins *i;
+	int nl, nr, t, s;
+	Ref al, ar;
+
+	if (top)
+		memset(a, 0, sizeof *a);
+	if (rtype(r) == RCon) {
+		addcon(&a->offset, &fn->con[r.val]);
+		return;
+	}
+	assert(rtype(r) == RTmp);
+	i = ai[r.val].i;
+	nl = ai[r.val].l;
+	nr = ai[r.val].r;
+	if (i) {
+		/* normalize so that nl <= nr */
+		if (nl > nr) {
+			al = i->arg[1];
+			ar = i->arg[0];
+			t = nl, nl = nr, nr = t;
+		} else {
+			al = i->arg[0];
+			ar = i->arg[1];
+		}
+	}
+	switch (ai[r.val].n) {
+	case 3: /* s * i */
+		if (!top) {
+			a->index = al;
+			a->scale = fn->con[ar.val].bits.i;
+		} else
+			a->base = r;
+		break;
+	case 4: /* b + s * i */
+		switch (nr) {
+		case 0:
+			if (fn->tmp[ar.val].slot != -1) {
+				/* slots cannot be used as an
+				 * index, use the other operand */
+				al = i->arg[1];
+				ar = i->arg[0];
+			}
+			a->index = ar;
+			a->scale = 1;
+			break;
+		case 3:
+			amatch(a, ar, ai, fn, 0);
+			break;
+		}
+		r = al;
+		/* fall through */
+	case 0:
+		s = fn->tmp[r.val].slot;
+		if (s != -1)
+			r = SLOT(s);
+		a->base = r;
+		break;
+	case 2: /* constants */
+	case 5: /* o + s * i */
+	case 6: /* o + b */
+	case 7: /* o + b + s * i */
+		amatch(a, ar, ai, fn, 0);
+		amatch(a, al, ai, fn, 0);
+		break;
+	default:
+		die("unreachable");
+	}
+}
+
+/* instruction selection
+ * requires use counts (as given by parsing)
+ */
+void
+amd64_isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i;
+	Phi *p;
+	uint a;
+	int n, al;
+	int64_t sz;
+	ANum *ainfo;
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
+	for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			if (i->op == al) {
+				if (rtype(i->arg[0]) != RCon)
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-15)
+					err("invalid alloc size %"PRId64, sz);
+				sz = (sz + n-1) & -n;
+				sz /= 4;
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				*i = (Ins){.op = Onop};
+			}
+
+	/* process basic blocks */
+	n = fn->ntmp;
+	ainfo = emalloc(n * sizeof ainfo[0]);
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns];
+		/* fix the phi arguments flowing out of b */
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (a=0; p->blk[a] != b; a++)
+					assert(a+1 < p->narg);
+				fixarg(&p->arg[a], p->cls, 1, fn);
+			}
+		memset(ainfo, 0, n * sizeof ainfo[0]);
+		anumber(ainfo, b, fn->con);
+		seljmp(b, fn);
+		/* select bottom-up */
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			sel(*--i, ainfo, fn);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+	free(ainfo);
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/amd64/sysv.c b/amd64/sysv.c
new file mode 100644
index 0000000..dcaa812
--- /dev/null
+++ b/amd64/sysv.c
@@ -0,0 +1,701 @@
+#include "all.h"
+
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+/* passing class of one argument or return value, as computed
+ * by typclass() and argsclass() */
+struct AClass {
+	int inmem;  /* 0: in registers; 1: aggregate in memory; 2: scalar on the stack */
+	int align;  /* log2 of the alignment */
+	uint size;  /* size in bytes, rounded up per the abi */
+	int cls[2]; /* register class (Kl/Kd) of each eightbyte */
+	Ref ref[2]; /* temporaries staging the eightbytes (see selcall/selpar) */
+};
+
+/* return-pad allocation delayed until the entry block is emitted */
+struct RAlloc {
+	Ins i;        /* the Oalloc instruction to emit */
+	RAlloc *link; /* next queued allocation */
+};
+
+/* assign a register class to each eightbyte of type t:
+ * *pn tracks the byte offset inside the current eightbyte and
+ * *pe indexes the eightbyte (at most two).  An integer member
+ * forces its eightbyte to Kl; it stays Kd only if every member
+ * seen is a float.  All union variants are scanned. */
+static void
+classify(AClass *a, Typ *t, int *pn, int *pe)
+{
+	Seg *seg;
+	int n, s, *cls;
+
+	for (n=0; n<t->nunion; n++) {
+		seg = t->seg[n];
+		for (s=0; *pe<2; (*pe)++) {
+			cls = &a->cls[*pe];
+			for (; *pn<8; s++) {
+				switch (seg[s].type) {
+				case SEnd:
+					goto Done;
+				case SPad:
+					/* don't change anything */
+					break;
+				case SFlt:
+					if (*cls == Kx)
+						*cls = Kd;
+					break;
+				case SInt:
+					*cls = Kl;
+					break;
+				case STyp:
+					/* recurse into the nested type's segments */
+					classify(a, &typ[seg[s].len], pn, pe);
+					continue;
+				}
+				*pn += seg[s].len;
+			}
+		Done:
+			assert(*pn <= 8);
+			*pn = 0;
+		}
+	}
+}
+
+/* compute the abi passing class of type t into a: rounded size
+ * and alignment, whether it must go through memory, and otherwise
+ * the register class of each eightbyte (via classify) */
+static void
+typclass(AClass *a, Typ *t)
+{
+	int e, n;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16 || sz == 0) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	/* Kx marks an eightbyte whose class is not decided yet */
+	a->cls[0] = Kx;
+	a->cls[1] = Kx;
+	a->inmem = 0;
+	n = 0;
+	e = 0;
+	classify(a, t, &n, &e);
+}
+
+/* pick the registers carrying a register-class aggregate return;
+ * fills reg[] with one register per eightbyte and returns the
+ * RCall bits counting the gp/sse registers used */
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int e, base, ca, used[2];
+
+	used[0] = 0;
+	used[1] = 0;
+	ca = 0;
+	for (e=0; (uint)e*8 < aret->size; e++) {
+		base = KBASE(aret->cls[e]);
+		reg[e] = TMP(retreg[base][used[base]]);
+		used[base]++;
+		ca += 1 << (2 * base);
+	}
+	return ca;
+}
+
+/* lower a return: rewrite b's jump into a plain Jret0 and emit the
+ * copies/loads placing the return value in its abi registers; the
+ * register usage is encoded as an RCall in b->jmp.arg */
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == Jret0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = Jret0;
+
+	if (j == Jretc) {
+		/* aggregate return */
+		typclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			/* memory-class aggregate: copy it into the buffer the
+			 * caller provided (saved in fn->retr by selpar) and
+			 * return that address in rax */
+			assert(rtype(fn->retr) == RTmp);
+			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			/* register-class aggregate: load up to two eightbytes
+			 * into the registers chosen by retr() */
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, Kl, reg[1], r, R);
+				emit(Oadd, Kl, r, r0, getcon(8, fn));
+			}
+			emit(Oload, Kl, reg[0], r0, R);
+		}
+	} else {
+		/* scalar return: rax for integers, xmm0 for floats */
+		k = j - Jretw;
+		if (KBASE(k) == 0) {
+			emit(Ocopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(Ocopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+/* compute the passing class of every arg/par instruction in
+ * [i0, i1); op is Oarg or Opar so the same code classifies call
+ * sites and prologues.  Returns the used register counts placed
+ * as in the RCall word: (gp used)<<4 | (sse used)<<8. */
+static int
+argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++)
+		switch (i->op - op + Oarg) {
+		case Oarg:
+			/* scalar: consumes one register of its class when
+			 * any remain, else goes on the stack (inmem == 2) */
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+			break;
+		case Oargc:
+			/* aggregate: passed in registers only if all of its
+			 * eightbytes fit in the remaining ones */
+			n = i->arg[0].val;
+			typclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			ni = ns = 0;
+			for (n=0; (uint)n*8<a->size; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+			break;
+		case Oarge:
+			/* environment value, passed in rax (see selcall/selpar) */
+			if (op == Opar)
+				*env = i->to;
+			else
+				*env = i->arg[0];
+			break;
+		}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+/* caller-saved registers; the first six gp entries are also the
+ * sysv argument-passing order used by rarg(); -1 terminated */
+int amd64_sysv_rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
+};
+/* callee-saved registers; -1 terminated */
+int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};
+
+MAKESURE(sysv_arrays_ok,
+	sizeof amd64_sysv_rsave == (NGPS+NFPS+1) * sizeof(int) &&
+	sizeof amd64_sysv_rclob == (NCLR+1) * sizeof(int)
+);
+
+/* layout of call's second argument (RCall)
+ *
+ *  29     12    8    4  3  0
+ *  |0...00|x|xxxx|xxxx|xx|xx|                  range
+ *          |    |    |  |  ` gp regs returned (0..2)
+ *          |    |    |  ` sse regs returned   (0..2)
+ *          |    |    ` gp regs passed         (0..6)
+ *          |    ` sse regs passed             (0..8)
+ *          ` 1 if rax is used to pass data    (0..1)
+ */
+
+/* decode from an RCall ref the set of registers used to return a
+ * value; when p is non-null, also report the gp/sse counts */
+bits
+amd64_sysv_retregs(Ref r, int p[2])
+{
+	int ngp, nsse;
+	bits b;
+
+	assert(rtype(r) == RCall);
+	ngp = r.val & 3;
+	nsse = (r.val >> 2) & 3;
+	b = 0;
+	if (ngp > 0)
+		b |= BIT(RAX);
+	if (ngp > 1)
+		b |= BIT(RDX);
+	if (nsse > 0)
+		b |= BIT(XMM0);
+	if (nsse > 1)
+		b |= BIT(XMM1);
+	if (p) {
+		p[0] = ngp;
+		p[1] = nsse;
+	}
+	return b;
+}
+
+/* decode from an RCall ref the set of registers used to pass
+ * arguments (including rax when it carries data); when p is
+ * non-null, also report the gp/sse counts */
+bits
+amd64_sysv_argregs(Ref r, int p[2])
+{
+	int k, ngp, nsse, rax;
+	bits b;
+
+	assert(rtype(r) == RCall);
+	ngp = (r.val >> 4) & 15;
+	nsse = (r.val >> 8) & 15;
+	rax = (r.val >> 12) & 1;
+	b = rax ? BIT(RAX) : 0;
+	for (k=0; k<ngp; k++)
+		b |= BIT(amd64_sysv_rsave[k]);
+	for (k=0; k<nsse; k++)
+		b |= BIT(XMM0 + k);
+	if (p) {
+		p[0] = ngp + rax;
+		p[1] = nsse;
+	}
+	return b;
+}
+
+/* hand out the next argument register of the class of ty,
+ * advancing the gp (*ni) or sse (*ns) counter accordingly */
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	int r;
+
+	if (KBASE(ty) != 0) {
+		r = XMM0 + *ns;
+		(*ns)++;
+	} else {
+		r = amd64_sysv_rsave[*ni];
+		(*ni)++;
+	}
+	return TMP(r);
+}
+
+/* lower one call: [i0, i1) are its arg instructions, i1 the call
+ * itself.  The emit buffer is filled bottom-up, so the groups
+ * below appear in reverse execution order: results are copied out,
+ * the call happens, register arguments are moved in, stack
+ * arguments are stored, and the stack space is allocated last in
+ * source order (first at run time).  Aggregate returns queue a pad
+ * allocation on *rap, later emitted in the entry block. */
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns, al, varc, envc;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], env;
+	RAlloc *ra;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+
+	if (!req(i1->arg[1], R)) {
+		/* the call's second argument names the aggregate
+		 * type being returned */
+		assert(rtype(i1->arg[1]) == RType);
+		typclass(&aret, &typ[i1->arg[1].val]);
+		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
+	} else
+		ca = argsclass(i0, i1, ac, Oarg, 0, &env);
+
+	/* stack space used by in-memory arguments; sizes are
+	 * multiples of 8, so stk & 15 is 0 or 8 and adding it
+	 * pads to 16-byte alignment */
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		/* emitted here = runs after the call: releases the
+		 * argument area (negative size) */
+		r = getcon(-(int64_t)stk, fn);
+		emit(Osalloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			/* store the returned eightbytes into the pad */
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
+				emit(Ostorel, 0, R, aret.ref[1], r);
+				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
+			}
+			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
+			emit(Ostorel, 0, R, aret.ref[0], i1->to);
+			ca += retr(reg, &aret);
+			if (aret.size > 8)
+				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
+			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		/* specific to NAlign == 3 */
+		al = aret.align >= 2 ? aret.align - 2 : 0;
+		ra->i = (Ins){Oalloc+al, r1, {getcon(aret.size, fn)}, Kl};
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		ra = 0;
+		/* scalar result comes back in rax or xmm0 */
+		if (KBASE(i1->cls) == 0) {
+			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	envc = !req(R, env);
+	varc = i1->op == Ovacall;
+	if (varc && envc)
+		err("sysv abi does not support variadic env calls");
+	ca |= (varc | envc) << 12;
+	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
+	if (envc)
+		emit(Ocopy, Kl, TMP(RAX), env, R);
+	if (varc)
+		/* variadic calls receive in rax the number of sse
+		 * registers used for arguments */
+		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	/* move register arguments into place */
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oargc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, a->cls[1], r2, r, R);
+				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
+			}
+			emit(Oload, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(Ocopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	/* store in-memory arguments into the argument area; r is
+	 * defined by the salloc emitted below (which runs first) */
+	r = newtmp("abi", Kl, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == Oargc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, i->arg[0], r1);
+			emit(Oadd, Kl, r1, r, getcon(off, fn));
+		}
+		off += a->size;
+	}
+	emit(Osalloc, Kl, r, getcon(stk, fn), R);
+}
+
+/* lower the parameter instructions [i0, i1) of the prologue.
+ * Returns argsclass' register counts combined with the size of the
+ * incoming stack-parameter area ((s*4) << 12), which selvastart
+ * decodes later. */
+static int
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al, fa;
+	Ref r, env;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = &insb[NIns];
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		typclass(&aret, &typ[fn->retty]);
+		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
+	} else
+		fa = argsclass(i0, i1, ac, Opar, 0, &env);
+
+	/* spill register-class aggregates into fresh stack pads so
+	 * the rest of the compiler can address them in memory */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != Oparc || a->inmem)
+			continue;
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			a->ref[1] = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, a->ref[1], r);
+			emit(Oadd, Kl, r, i->to, getcon(8, fn));
+		}
+		a->ref[0] = newtmp("abi", Kl, fn);
+		emit(Ostorel, 0, R, a->ref[0], i->to);
+		/* specific to NAlign == 3 */
+		al = a->align >= 2 ? a->align - 2 : 0;
+		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
+	}
+
+	if (fn->retty >= 0 && aret.inmem) {
+		/* save the hidden return-location argument for selret */
+		r = newtmp("abi", Kl, fn);
+		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
+		fn->retr = r;
+	}
+
+	/* s counts incoming stack parameters in 4-byte words; it
+	 * starts at 4 (16 bytes), presumably skipping the return
+	 * address and saved rbp -- confirm against the frame layout */
+	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			/* in-memory aggregate: address the caller's copy
+			 * directly through a negative slot */
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s;
+			s += a->size / 4;
+			continue;
+		case 2:
+			/* scalar passed on the stack */
+			emit(Oload, i->cls, i->to, SLOT(-s), R);
+			s += 2;
+			continue;
+		}
+		r = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oparc) {
+			emit(Ocopy, Kl, a->ref[0], r, R);
+			if (a->size > 8) {
+				r = rarg(a->cls[1], &ni, &ns);
+				emit(Ocopy, Kl, a->ref[1], r, R);
+			}
+		} else
+			emit(Ocopy, i->cls, i->to, r, R);
+	}
+
+	/* the environment parameter arrives in rax */
+	if (!req(R, env))
+		emit(Ocopy, Kl, env, TMP(RAX), R);
+
+	return fa | (s*4)<<12;
+}
+
+/* move the instructions accumulated in the emit buffer into a
+ * fresh block inserted right after b in the layout chain; the new
+ * block gets a derived name and a nonzero visit mark */
+static Blk *
+split(Fn *fn, Blk *b)
+{
+	Blk *nb;
+
+	nb = blknew();
+	fn->nblk++;
+	nb->nins = &insb[NIns] - curi;
+	idup(&nb->ins, curi, nb->nins);
+	curi = &insb[NIns];
+	b->visit++;
+	nb->visit = b->visit;
+	snprintf(nb->name, NString, "%s.%d", b->name, b->visit);
+	nb->loop = b->loop;
+	nb->link = b->link;
+	b->link = nb;
+	return nb;
+}
+
+/* retarget every phi of block b: the argument coming from
+ * predecessor bp now comes from bp1 */
+static void
+chpred(Blk *b, Blk *bp, Blk *bp1)
+{
+	Phi *phi;
+	uint n;
+
+	for (phi=b->phi; phi; phi=phi->link) {
+		n = 0;
+		while (phi->blk[n] != bp) {
+			n++;
+			assert(n < phi->narg);
+		}
+		phi->blk[n] = bp1;
+	}
+}
+
+/* lower a vaarg instruction: split b and emit the branchy va_arg
+ * sequence of the sysv abi, choosing at run time between the
+ * register save area and the overflow (stack) area; the resulting
+ * control flow is sketched in the comment below */
+static void
+selvaarg(Fn *fn, Blk *b, Ins *i)
+{
+	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
+	Blk *b0, *bstk, *breg;
+	int isint;
+
+	c4 = getcon(4, fn);
+	c8 = getcon(8, fn);
+	c16 = getcon(16, fn);
+	ap = i->arg[0];
+	isint = KBASE(i->cls) == 0;
+
+	/* @b [...]
+	       r0 =l add ap, (0 or 4)
+	       nr =l loadsw r0
+	       r1 =w cultw nr, (48 or 176)
+	       jnz r1, @breg, @bstk
+	   @breg
+	       r0 =l add ap, 16
+	       r1 =l loadl r0
+	       lreg =l add r1, nr
+	       r0 =w add nr, (8 or 16)
+	       r1 =l add ap, (0 or 4)
+	       storew r0, r1
+	   @bstk
+	       r0 =l add ap, 8
+	       lstk =l loadl r0
+	       r1 =l add lstk, 8
+	       storel r1, r0
+	   @b0
+	       %loc =l phi @breg %lreg, @bstk %lstk
+	       i->to =(i->cls) load %loc
+	*/
+
+	/* tail of b (everything after the vaarg) becomes b0 */
+	loc = newtmp("abi", Kl, fn);
+	emit(Oload, i->cls, i->to, loc, R);
+	b0 = split(fn, b);
+	b0->jmp = b->jmp;
+	b0->s1 = b->s1;
+	b0->s2 = b->s2;
+	if (b->s1)
+		chpred(b->s1, b, b0);
+	if (b->s2 && b->s2 != b->s1)
+		chpred(b->s2, b, b0);
+
+	/* register path: read from the reg save area and bump the
+	 * gp/fp offset stored in the va_list */
+	lreg = newtmp("abi", Kl, fn);
+	nr = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kw, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, r0, r1);
+	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
+	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Oadd, Kl, lreg, r1, nr);
+	emit(Oload, Kl, r1, r0, R);
+	emit(Oadd, Kl, r0, ap, c16);
+	breg = split(fn, b);
+	breg->jmp.type = Jjmp;
+	breg->s1 = b0;
+
+	/* stack path: read from the overflow area and advance it */
+	lstk = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, lstk, c8);
+	emit(Oload, Kl, lstk, r0, R);
+	emit(Oadd, Kl, r0, ap, c8);
+	bstk = split(fn, b);
+	bstk->jmp.type = Jjmp;
+	bstk->s1 = b0;
+
+	/* join: the chosen source address flows into b0's phi */
+	b0->phi = alloc(sizeof *b0->phi);
+	*b0->phi = (Phi){
+		.cls = Kl, .to = loc,
+		.narg = 2,
+		.blk = {bstk, breg},
+		.arg = {lstk, lreg},
+	};
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kw, fn);
+	b->jmp.type = Jjnz;
+	b->jmp.arg = r1;
+	b->s1 = breg;
+	b->s2 = bstk;
+	c = getcon(isint ? 48 : 176, fn);
+	emit(Ocmpw+Ciult, Kw, r1, nr, c);
+	emit(Oloadsw, Kl, nr, r0, R);
+	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
+}
+
+/* lower vastart: initialize the va_list at ap.  fa is selpar's
+ * return value (register counts plus the stack-parameter offset).
+ * Reading bottom-up (emit order is reversed):
+ *   ap+0  gp_offset = bytes of gp regs already consumed
+ *   ap+4  fp_offset = 48 + 16 per consumed sse reg
+ *   ap+8  overflow_arg_area = rbp + sp
+ *   ap+16 reg_save_area = rbp - 176 */
+static void
+selvastart(Fn *fn, int fa, Ref ap)
+{
+	Ref r0, r1;
+	int gp, fp, sp;
+
+	gp = ((fa >> 4) & 15) * 8;
+	fp = 48 + ((fa >> 8) & 15) * 16;
+	sp = fa >> 12;
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
+	emit(Oadd, Kl, r0, ap, getcon(16, fn));
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
+	emit(Oadd, Kl, r0, ap, getcon(8, fn));
+	r0 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
+	emit(Oadd, Kl, r0, ap, getcon(4, fn));
+	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
+}
+
+/* entry point of the sysv abi lowering pass: rewrite parameters,
+ * calls, returns, and vararg instructions into plain instructions
+ * plus register copies */
+void
+amd64_sysv_abi(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *i0, *ip;
+	RAlloc *ral;
+	int n, fa;
+
+	for (b=fn->start; b; b=b->link)
+		b->visit = 0;
+
+	/* lower parameters */
+	for (b=fn->start, i=b->ins; i-b->ins<b->nins; i++)
+		if (!ispar(i->op))
+			break;
+	fa = selpar(fn, b->ins, i);
+	/* splice the emitted prologue in front of the remaining
+	 * instructions of the entry block */
+	n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, curi, &insb[NIns] - curi);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower calls, returns, and vararg instructions */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		if (b->visit)
+			continue; /* skip blocks created by selvaarg's splits */
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			switch ((--i)->op) {
+			default:
+				emiti(*i);
+				break;
+			case Ocall:
+			case Ovacall:
+				/* scan back to the first arg of this call */
+				for (i0=i; i0>b->ins; i0--)
+					if (!isarg((i0-1)->op))
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				break;
+			case Ovastart:
+				selvastart(fn, fa, i->arg[0]);
+				break;
+			case Ovaarg:
+				selvaarg(fn, b, i);
+				break;
+			case Oarg:
+			case Oargc:
+				die("unreachable");
+			}
+		/* the entry block is processed last so the queued
+		 * return pads can be allocated there */
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/amd64/targ.c b/amd64/targ.c
new file mode 100644
index 0000000..e227574
--- /dev/null
+++ b/amd64/targ.c
@@ -0,0 +1,30 @@
+#include "all.h"
+
+/* per-opcode emitter information (nmem/zflag/lflag, see Amd64Op
+ * in all.h), generated from the X(...) entries in ops.h */
+Amd64Op amd64_op[NOp] = {
+#define O(op, t, x) [O##op] =
+#define X(nm, zf, lf) { nm, zf, lf, },
+	#include "../ops.h"
+};
+
+/* target hook: number of memory operands instructions of opcode
+ * op accept on amd64 (from the amd64_op table) */
+static int
+amd64_memargs(int op)
+{
+	Amd64Op *o;
+
+	o = &amd64_op[op];
+	return o->nmem;
+}
+
+/* target description for amd64 with the System V ABI */
+Target T_amd64_sysv = {
+	.gpr0 = RAX,
+	.ngpr = NGPR,
+	.fpr0 = XMM0,
+	.nfpr = NFPR,
+	.rglob = BIT(RBP) | BIT(RSP), /* globally live registers */
+	.nrglob = 2,
+	.rsave = amd64_sysv_rsave,
+	.nrsave = {NGPS, NFPS},
+	.retregs = amd64_sysv_retregs,
+	.argregs = amd64_sysv_argregs,
+	.memargs = amd64_memargs,
+	.abi = amd64_sysv_abi,
+	.isel = amd64_isel,
+	.emitfn = amd64_emitfn,
+};