author     Quentin Carbonneaux <quentin@c9x.me>  2017-04-08 21:06:33 -0400
committer  Quentin Carbonneaux <quentin@c9x.me>  2017-04-08 21:56:20 -0400
commit     49a4593c335126ba279f47328824abfef379725e (patch)
tree       2f4cb5e9884ec958ea32a494da302a9aae8ca420 /amd64
parent     9d1c38d69547d835f7228651e71e8a7d744c456d (diff)
prepare for multi-target
This big diff makes several changes to allow
the addition of new targets to qbe.  The
changes are listed below in decreasing order
of impact.

1. Add a new Target structure.

To add support for a given target, one has to
implement all the members of the Target
structure.  All the source files were changed
to use this interface where needed.
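
As a rough sketch, the Target interface looks
like the following (reconstructed from the
designated initializer of T_amd64_sysv in
amd64/targ.c at the end of this diff; field
order and comments are approximate):

    /* sketch only; see amd64/targ.c below for
     * the fields the amd64 target fills in */
    struct Target {
        int gpr0;            /* first general-purpose reg */
        int ngpr;
        int fpr0;            /* first floating-point reg */
        int nfpr;
        bits rglob;          /* globally live regs (rbp, rsp) */
        int nrglob;
        int *rsave;          /* caller-save regs */
        int nrsave[2];
        bits (*retregs)(Ref, int[2]);
        bits (*argregs)(Ref, int[2]);
        int (*memargs)(int);
        void (*abi)(Fn *);
        void (*isel)(Fn *);
        void (*emitfn)(Fn *, FILE *);
    };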

2. Single out amd64-specific code.

In this commit, the amd64 target T_amd64_sysv
is the only target available; it is implemented
in the amd64/ directory.  All the non-static
items in this directory are prefixed with either
amd64_ or amd64_sysv (for items that are
specific to the System V ABI).

3. Centralize Ops information.

There is now a file 'ops.h' that must be used to
store all the available operations together with
their metadata.  The various targets will only
select what they need, but it is beneficial that
there is only *one* place to change to add a new
instruction.

One good side effect of this change is that any
operation 'xyz' in the IL now has a corresponding
'Oxyz' in the code.
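
The file is processed with X-macros: each entry
names the operation and carries its metadata,
and every consumer extracts what it needs by
defining the macros before including the file,
as amd64/targ.c does at the bottom of this diff.
A hypothetical entry and a minimal consumer
(the argument-class column T(...) is left
abstract here):

    /* ops.h entry (hypothetical): IL name, argument
     * classes, amd64 metadata (nmem, zflag, lflag) */
    O(add, T(w,l,s,d, w,l,s,d), X(2, 1, 0))

    /* building the opcode enum needs only the name,
     * so 'xyz' automatically gets an Oxyz */
    enum Op {
    #define O(op, t, x) O##op,
        #include "ops.h"
    #undef O
        NOp,
    };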

4. Misc fixes.

One notable change is that instruction selection
now generates generic comparison operations and
the lowering to the target's comparisons is done
in the emitter.
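
For example, isel now emits a generic
Oflag+Ciult for an unsigned 'less than' result;
only the emitter maps it to x86's setb, by
expanding X(Ciult, "b") in amd64/emit.c's
omap[] table below into

    { Oflag+Ciult, Ki, "setb %B=\n\tmovzb%k %B=, %=" },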

GAS directives for data are the same for many
targets, so data emission was extracted into
a file 'gas.c'.

5. Modularize the Makefile.

The Makefile now has a list of C files that
are target-independent (SRC), and one list
of C files per target.  Each target can also
use its own 'all.h' header (for example to
define registers).
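
A sketch of the resulting layout (the SRC file
names here are illustrative, not the literal
Makefile):

    SRC      = main.c parse.c ssa.c  # target-independent, abridged
    AMD64SRC = amd64/targ.c amd64/sysv.c amd64/isel.c amd64/emit.c
    OBJ      = $(SRC:.c=.o) $(AMD64SRC:.c=.o)

    qbe: $(OBJ)
    	$(CC) $(LDFLAGS) -o $@ $(OBJ)

    $(AMD64SRC:.c=.o): amd64/all.h
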
Diffstat (limited to 'amd64')
-rw-r--r--  amd64/all.h   70
-rw-r--r--  amd64/emit.c  561
-rw-r--r--  amd64/isel.c  603
-rw-r--r--  amd64/sysv.c  701
-rw-r--r--  amd64/targ.c   30
5 files changed, 1965 insertions, 0 deletions
diff --git a/amd64/all.h b/amd64/all.h
new file mode 100644
index 0000000..3a2db0e
--- /dev/null
+++ b/amd64/all.h
@@ -0,0 +1,70 @@
+#include "../all.h"
+
+typedef struct Amd64Op Amd64Op;
+
+enum Amd64Reg {
+	RAX = RXX+1, /* caller-save */
+	RCX,
+	RDX,
+	RSI,
+	RDI,
+	R8,
+	R9,
+	R10,
+	R11,
+
+	RBX, /* callee-save */
+	R12,
+	R13,
+	R14,
+	R15,
+
+	RBP, /* globally live */
+	RSP,
+
+	XMM0, /* sse */
+	XMM1,
+	XMM2,
+	XMM3,
+	XMM4,
+	XMM5,
+	XMM6,
+	XMM7,
+	XMM8,
+	XMM9,
+	XMM10,
+	XMM11,
+	XMM12,
+	XMM13,
+	XMM14,
+	XMM15,
+
+	NFPR = XMM14 - XMM0 + 1, /* reserve XMM15 */
+	NGPR = RSP - RAX + 1,
+	NGPS = R11 - RAX + 1,
+	NFPS = NFPR,
+	NCLR = R15 - RBX + 1,
+};
+MAKESURE(reg_not_tmp, XMM15 < (int)Tmp0);
+
+struct Amd64Op {
+	char nmem;
+	char zflag;
+	char lflag;
+};
+
+/* targ.c */
+extern Amd64Op amd64_op[];
+
+/* sysv.c (abi) */
+extern int amd64_sysv_rsave[];
+extern int amd64_sysv_rclob[];
+bits amd64_sysv_retregs(Ref, int[2]);
+bits amd64_sysv_argregs(Ref, int[2]);
+void amd64_sysv_abi(Fn *);
+
+/* isel.c */
+void amd64_isel(Fn *);
+
+/* emit.c */
+void amd64_emitfn(Fn *, FILE *);
diff --git a/amd64/emit.c b/amd64/emit.c
new file mode 100644
index 0000000..eccbd02
--- /dev/null
+++ b/amd64/emit.c
@@ -0,0 +1,561 @@
+#include "all.h"
+
+
+#define CMP(X) \
+	X(Ciule,      "be") \
+	X(Ciult,      "b")  \
+	X(Cisle,      "le") \
+	X(Cislt,      "l")  \
+	X(Cisgt,      "g")  \
+	X(Cisge,      "ge") \
+	X(Ciugt,      "a")  \
+	X(Ciuge,      "ae") \
+	X(Cieq,       "z")  \
+	X(Cine,       "nz") \
+	X(NCmpI+Cfle, "be") \
+	X(NCmpI+Cflt, "b")  \
+	X(NCmpI+Cfgt, "a")  \
+	X(NCmpI+Cfge, "ae") \
+	X(NCmpI+Cfeq, "z")  \
+	X(NCmpI+Cfne, "nz") \
+	X(NCmpI+Cfo,  "np") \
+	X(NCmpI+Cfuo, "p")
+
+enum {
+	SLong = 0,
+	SWord = 1,
+	SShort = 2,
+	SByte = 3,
+
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+/* Instruction format strings:
+ *
+ * if the format string starts with -, the instruction
+ * is assumed to be 3-address and is put in 2-address
+ * mode using an extra mov if necessary
+ *
+ * if the format string starts with +, the same as the
+ * above applies, but commutativity is also assumed
+ *
+ * %k  is used to set the class of the instruction,
+ *     it'll expand to "l", "q", "ss", "sd", depending
+ *     on the instruction class
+ * %0  designates the first argument
+ * %1  designates the second argument
+ * %=  designates the result
+ *
+ * if %k is not used, a prefix to 0, 1, or = must be
+ * added, it can be:
+ *   M - memory reference
+ *   L - long  (64 bits)
+ *   W - word  (32 bits)
+ *   H - short (16 bits)
+ *   B - byte  (8 bits)
+ *   S - single precision float
+ *   D - double precision float
+ */
+static struct {
+	short op;
+	short cls;
+	char *asm;
+} omap[] = {
+	{ Oadd,    Ka, "+add%k %1, %=" },
+	{ Osub,    Ka, "-sub%k %1, %=" },
+	{ Oand,    Ki, "+and%k %1, %=" },
+	{ Oor,     Ki, "+or%k %1, %=" },
+	{ Oxor,    Ki, "+xor%k %1, %=" },
+	{ Osar,    Ki, "-sar%k %B1, %=" },
+	{ Oshr,    Ki, "-shr%k %B1, %=" },
+	{ Oshl,    Ki, "-shl%k %B1, %=" },
+	{ Omul,    Ki, "+imul%k %1, %=" },
+	{ Omul,    Ks, "+mulss %1, %=" },
+	{ Omul,    Kd, "+mulsd %1, %=" },
+	{ Odiv,    Ka, "-div%k %1, %=" },
+	{ Ostorel, Ka, "movq %L0, %M1" },
+	{ Ostorew, Ka, "movl %W0, %M1" },
+	{ Ostoreh, Ka, "movw %H0, %M1" },
+	{ Ostoreb, Ka, "movb %B0, %M1" },
+	{ Ostores, Ka, "movss %S0, %M1" },
+	{ Ostored, Ka, "movsd %D0, %M1" },
+	{ Oload,   Ka, "mov%k %M0, %=" },
+	{ Oloadsw, Kl, "movslq %M0, %L=" },
+	{ Oloadsw, Kw, "movl %M0, %W=" },
+	{ Oloaduw, Ki, "movl %M0, %W=" },
+	{ Oloadsh, Ki, "movsw%k %M0, %=" },
+	{ Oloaduh, Ki, "movzw%k %M0, %=" },
+	{ Oloadsb, Ki, "movsb%k %M0, %=" },
+	{ Oloadub, Ki, "movzb%k %M0, %=" },
+	{ Oextsw,  Kl, "movslq %W0, %L=" },
+	{ Oextuw,  Kl, "movl %W0, %W=" },
+	{ Oextsh,  Ki, "movsw%k %H0, %=" },
+	{ Oextuh,  Ki, "movzw%k %H0, %=" },
+	{ Oextsb,  Ki, "movsb%k %B0, %=" },
+	{ Oextub,  Ki, "movzb%k %B0, %=" },
+
+	{ Oexts,   Kd, "cvtss2sd %0, %=" },
+	{ Otruncd, Ks, "cvttsd2ss %0, %=" },
+	{ Ostosi,  Ki, "cvttss2si%k %0, %=" },
+	{ Odtosi,  Ki, "cvttsd2si%k %0, %=" },
+	{ Oswtof,  Ka, "cvtsi2%k %W0, %=" },
+	{ Osltof,  Ka, "cvtsi2%k %L0, %=" },
+	{ Ocast,   Ki, "movq %D0, %L=" },
+	{ Ocast,   Ka, "movq %L0, %D=" },
+
+	{ Oaddr,   Ki, "lea%k %M0, %=" },
+	{ Oswap,   Ki, "xchg%k %0, %1" },
+	{ Osign,   Kl, "cqto" },
+	{ Osign,   Kw, "cltd" },
+	{ Oxdiv,   Ki, "div%k %0" },
+	{ Oxidiv,  Ki, "idiv%k %0" },
+	{ Oxcmp,   Ks, "comiss %S0, %S1" },
+	{ Oxcmp,   Kd, "comisd %D0, %D1" },
+	{ Oxcmp,   Ki, "cmp%k %0, %1" },
+	{ Oxtest,  Ki, "test%k %0, %1" },
+#define X(c, s) \
+	{ Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" },
+	CMP(X)
+#undef X
+	{ NOp, 0, 0 }
+};
+
+static char *rname[][4] = {
+	[RAX] = {"rax", "eax", "ax", "al"},
+	[RBX] = {"rbx", "ebx", "bx", "bl"},
+	[RCX] = {"rcx", "ecx", "cx", "cl"},
+	[RDX] = {"rdx", "edx", "dx", "dl"},
+	[RSI] = {"rsi", "esi", "si", "sil"},
+	[RDI] = {"rdi", "edi", "di", "dil"},
+	[RBP] = {"rbp", "ebp", "bp", "bpl"},
+	[RSP] = {"rsp", "esp", "sp", "spl"},
+	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
+	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
+	[R10] = {"r10", "r10d", "r10w", "r10b"},
+	[R11] = {"r11", "r11d", "r11w", "r11b"},
+	[R12] = {"r12", "r12d", "r12w", "r12b"},
+	[R13] = {"r13", "r13d", "r13w", "r13b"},
+	[R14] = {"r14", "r14d", "r14w", "r14b"},
+	[R15] = {"r15", "r15d", "r15w", "r15b"},
+};
+
+
+static int
+slot(int s, Fn *fn)
+{
+	struct { int i:29; } x;
+
+	/* sign extend s using a bitfield */
+	x.i = s;
+	assert(x.i <= fn->slot);
+	/* specific to NAlign == 3 */
+	if (x.i < 0)
+		return -4 * x.i;
+	else if (fn->vararg)
+		return -176 + -4 * (fn->slot - x.i);
+	else
+		return -4 * (fn->slot - x.i);
+}
+
+static void
+emitcon(Con *con, FILE *f)
+{
+	switch (con->type) {
+	case CAddr:
+		if (con->local)
+			fprintf(f, "%s%s", gasloc, con->label);
+		else
+			fprintf(f, "%s%s", gassym, con->label);
+		if (con->bits.i)
+			fprintf(f, "%+"PRId64, con->bits.i);
+		break;
+	case CBits:
+		fprintf(f, "%"PRId64, con->bits.i);
+		break;
+	default:
+		die("unreachable");
+	}
+}
+
+static char *
+regtoa(int reg, int sz)
+{
+	static char buf[6];
+
+	if (reg >= XMM0) {
+		sprintf(buf, "xmm%d", reg-XMM0);
+		return buf;
+	} else
+		return rname[reg][sz];
+}
+
+static Ref
+getarg(char c, Ins *i)
+{
+	switch (c) {
+	case '0':
+		return i->arg[0];
+	case '1':
+		return i->arg[1];
+	case '=':
+		return i->to;
+	default:
+		die("invalid arg letter %c", c);
+	}
+}
+
+static void emitins(Ins, Fn *, FILE *);
+
+static void
+emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
+{
+	Ins icp;
+
+	icp.op = Ocopy;
+	icp.arg[0] = r2;
+	icp.to = r1;
+	icp.cls = k;
+	emitins(icp, fn, f);
+}
+
+static void
+emitf(char *s, Ins *i, Fn *fn, FILE *f)
+{
+	static char clstoa[][3] = {"l", "q", "ss", "sd"};
+	char c;
+	int sz;
+	Ref ref;
+	Mem *m;
+	Con off;
+
+	switch (*s) {
+	case '+':
+		if (req(i->arg[1], i->to)) {
+			ref = i->arg[0];
+			i->arg[0] = i->arg[1];
+			i->arg[1] = ref;
+		}
+		/* fall through */
+	case '-':
+		assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
+			"cannot convert to 2-address");
+		emitcopy(i->to, i->arg[0], i->cls, fn, f);
+		s++;
+		break;
+	}
+
+	fputc('\t', f);
+Next:
+	while ((c = *s++) != '%')
+		if (!c) {
+			fputc('\n', f);
+			return;
+		} else
+			fputc(c, f);
+	switch ((c = *s++)) {
+	case '%':
+		fputc('%', f);
+		break;
+	case 'k':
+		fputs(clstoa[i->cls], f);
+		break;
+	case '0':
+	case '1':
+	case '=':
+		sz = KWIDE(i->cls) ? SLong : SWord;
+		s--;
+		goto Ref;
+	case 'D':
+	case 'S':
+		sz = SLong; /* does not matter for floats */
+	Ref:
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "%%%s", regtoa(ref.val, sz));
+			break;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RMem:
+		Mem:
+			m = &fn->mem[ref.val];
+			if (rtype(m->base) == RSlot) {
+				off.type = CBits;
+				off.bits.i = slot(m->base.val, fn);
+				addcon(&m->offset, &off);
+				m->base = TMP(RBP);
+			}
+			if (m->offset.type != CUndef)
+				emitcon(&m->offset, f);
+			fputc('(', f);
+			if (req(m->base, R))
+				fprintf(f, "%%rip");
+			else
+				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
+			if (!req(m->index, R))
+				fprintf(f, ", %%%s, %d",
+					regtoa(m->index.val, SLong),
+					m->scale
+				);
+			fputc(')', f);
+			break;
+		case RCon:
+			fputc('$', f);
+			emitcon(&fn->con[ref.val], f);
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	case 'L':
+		sz = SLong;
+		goto Ref;
+	case 'W':
+		sz = SWord;
+		goto Ref;
+	case 'H':
+		sz = SShort;
+		goto Ref;
+	case 'B':
+		sz = SByte;
+		goto Ref;
+	case 'M':
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RMem:
+			goto Mem;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RCon:
+			emitcon(&fn->con[ref.val], f);
+			fprintf(f, "(%%rip)");
+			break;
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	default:
+		die("invalid format specifier %%%c", c);
+	}
+	goto Next;
+}
+
+static void
+emitins(Ins i, Fn *fn, FILE *f)
+{
+	Ref r;
+	int64_t val;
+	int o;
+
+	switch (i.op) {
+	default:
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary
+			 * search */
+			if (omap[o].op == NOp)
+				die("no match for %s(%d)",
+					optab[i.op].name, "wlsd"[i.cls]);
+			if (omap[o].op == i.op)
+			if (omap[o].cls == i.cls
+			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
+			|| (omap[o].cls == Ka))
+				break;
+		}
+		emitf(omap[o].asm, &i, fn, f);
+		break;
+	case Onop:
+		/* just do nothing for nops, they are inserted
+		 * by some passes */
+		break;
+	case Omul:
+	/* here, we try to use the 3-address form
+		 * of multiplication when possible */
+		if (rtype(i.arg[1]) == RCon) {
+			r = i.arg[0];
+			i.arg[0] = i.arg[1];
+			i.arg[1] = r;
+		}
+		if (KBASE(i.cls) == 0 /* only available for ints */
+		&& rtype(i.arg[0]) == RCon
+		&& rtype(i.arg[1]) == RTmp) {
+			emitf("imul%k %0, %1, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case Osub:
+		/* we have to use the negation trick to handle
+	 * some 3-address subtractions */
+		if (req(i.to, i.arg[1])) {
+			emitf("neg%k %=", &i, fn, f);
+			emitf("add%k %0, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case Ocopy:
+		/* make sure we don't emit useless copies,
+		 * also, we can use a trick to load 64-bits
+		 * registers, it's detailed in my note below
+		 * http://c9x.me/art/notes.html?09/19/2015 */
+		if (req(i.to, R) || req(i.arg[0], R))
+			break;
+		if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& i.cls == Kl
+		&& fn->con[i.arg[0].val].type == CBits
+		&& (val = fn->con[i.arg[0].val].bits.i) >= 0
+		&& val <= UINT32_MAX) {
+			emitf("movl %W0, %W=", &i, fn, f);
+		} else if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& fn->con[i.arg[0].val].type == CAddr) {
+			emitf("lea%k %M0, %=", &i, fn, f);
+		} else if (!req(i.arg[0], i.to))
+			emitf("mov%k %0, %=", &i, fn, f);
+		break;
+	case Ocall:
+		/* calls simply have a weird syntax in AT&T
+		 * assembly... */
+		switch (rtype(i.arg[0])) {
+		case RCon:
+			fprintf(f, "\tcallq ");
+			emitcon(&fn->con[i.arg[0].val], f);
+			fprintf(f, "\n");
+			break;
+		case RTmp:
+			emitf("callq *%L0", &i, fn, f);
+			break;
+		default:
+			die("invalid call argument");
+		}
+		break;
+	case Osalloc:
+		/* there is no good reason why this is here;
+		 * maybe we should split Osalloc into 2 different
+		 * instructions depending on the result
+		 */
+		emitf("subq %L0, %%rsp", &i, fn, f);
+		if (!req(i.to, R))
+			emitcopy(i.to, TMP(RSP), Kl, fn, f);
+		break;
+	case Oswap:
+		if (KBASE(i.cls) == 0)
+			goto Table;
+		/* for floats, there is no swap instruction
+		 * so we use xmm15 as a temporary
+		 */
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
+		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
+		break;
+	}
+}
+
+static int
+framesz(Fn *fn)
+{
+	int i, o, f;
+
+	/* specific to NAlign == 3 */
+	for (i=0, o=0; i<NCLR; i++)
+		o ^= 1 & (fn->reg >> amd64_sysv_rclob[i]);
+	f = fn->slot;
+	f = (f + 3) & -4;
+	return 4*f + 8*o + 176*fn->vararg;
+}
+
+void
+amd64_emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[] = {
+	#define X(c, s) [c] = s,
+		CMP(X)
+	#undef X
+	};
+	static int id0;
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, fs, o, n, lbl;
+
+	fprintf(f, ".text\n");
+	if (fn->export)
+		fprintf(f, ".globl %s%s\n", gassym, fn->name);
+	fprintf(f,
+		"%s%s:\n"
+		"\tpushq %%rbp\n"
+		"\tmovq %%rsp, %%rbp\n",
+		gassym, fn->name
+	);
+	fs = framesz(fn);
+	if (fs)
+		fprintf(f, "\tsub $%d, %%rsp\n", fs);
+	if (fn->vararg) {
+		o = -176;
+		for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
+			fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o);
+		for (n=0; n<8; ++n, o+=16)
+			fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o);
+	}
+	for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r);
+			emitf("pushq %L0", &itmp, fn, f);
+		}
+
+	for (lbl=0, b=fn->start; b; b=b->link) {
+		if (lbl || b->npred > 1)
+			fprintf(f, "%sbb%d:\n", gasloc, id0+b->id);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, fn, f);
+		lbl = 1;
+		switch (b->jmp.type) {
+		case Jret0:
+			for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;)
+				if (fn->reg & BIT(*--r)) {
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, fn, f);
+				}
+			fprintf(f,
+				"\tleave\n"
+				"\tret\n"
+			);
+			break;
+		case Jjmp:
+		Jmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp %sbb%d\n",
+					gasloc, id0+b->s1->id);
+			else
+				lbl = 0;
+			break;
+		default:
+			c = b->jmp.type - Jjf;
+			if (0 <= c && c <= NCmp) {
+				if (b->link == b->s2) {
+					s = b->s1;
+					b->s1 = b->s2;
+					b->s2 = s;
+				} else
+					c = cmpneg(c);
+				fprintf(f, "\tj%s %sbb%d\n", ctoa[c],
+					gasloc, id0+b->s2->id);
+				goto Jmp;
+			}
+			die("unhandled jump %d", b->jmp.type);
+		}
+	}
+	id0 += fn->nblk;
+}
diff --git a/amd64/isel.c b/amd64/isel.c
new file mode 100644
index 0000000..1623b9b
--- /dev/null
+++ b/amd64/isel.c
@@ -0,0 +1,603 @@
+#include "all.h"
+#include <limits.h>
+
+/* For x86_64, do the following:
+ *
+ * - check that constants are used only in
+ *   places allowed
+ * - ensure immediates always fit in 32b
+ * - expose machine register constraints
+ *   on instructions like division.
+ * - implement fast locals (the streak of
+ *   constant allocX in the first basic block)
+ * - recognize complex addressing modes
+ *
+ * Invariant: the use counts that are used
+ *            in sel() must be sound.  This
+ *            is not so trivial, maybe the
+ *            dce should be moved out...
+ */
+
+typedef struct ANum ANum;
+
+struct ANum {
+	char n, l, r;
+	Ins *i;
+};
+
+static void amatch(Addr *, Ref, ANum *, Fn *, int);
+
+static int
+noimm(Ref r, Fn *fn)
+{
+	int64_t val;
+
+	if (rtype(r) != RCon)
+		return 0;
+	switch (fn->con[r.val].type) {
+	case CAddr:
+		/* we only support the 'small'
+		 * code model of the ABI, this
+		 * means that we can always
+		 * address data with 32bits
+		 */
+		return 0;
+	case CBits:
+		val = fn->con[r.val].bits.i;
+		return (val < INT32_MIN || val > INT32_MAX);
+	default:
+		die("invalid constant");
+	}
+}
+
+static int
+rslot(Ref r, Fn *fn)
+{
+	if (rtype(r) != RTmp)
+		return -1;
+	return fn->tmp[r.val].slot;
+}
+
+static void
+fixarg(Ref *r, int k, int cpy, Fn *fn)
+{
+	Addr a, *m;
+	Ref r0, r1;
+	int s, n;
+
+	r1 = r0 = *r;
+	s = rslot(r0, fn);
+	if (KBASE(k) == 1 && rtype(r0) == RCon) {
+		/* load floating points from memory
+		 * slots, they can't be used as
+		 * immediates
+		 */
+		r1 = MEM(fn->nmem);
+		vgrow(&fn->mem, ++fn->nmem);
+		memset(&a, 0, sizeof a);
+		a.offset.type = CAddr;
+		a.offset.local = 1;
+		n = gasstashfp(fn->con[r0.val].bits.i, KWIDE(k));
+		sprintf(a.offset.label, "fp%d", n);
+		fn->mem[fn->nmem-1] = a;
+	}
+	else if (!cpy && k == Kl && noimm(r0, fn)) {
+		/* load constants that do not fit in
+		 * a 32bit signed integer into a
+		 * long temporary
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Ocopy, Kl, r1, r0, R);
+	}
+	else if (s != -1) {
+		/* load fast locals' addresses into
+		 * temporaries right before the
+		 * instruction
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Oaddr, Kl, r1, SLOT(s), R);
+	}
+	else if (rtype(r0) == RMem) {
+		/* apple asm fix */
+		m = &fn->mem[r0.val];
+		if (req(m->base, R)) {
+			n = fn->ncon;
+			vgrow(&fn->con, ++fn->ncon);
+			fn->con[n] = m->offset;
+			m->offset.type = CUndef;
+			r0 = newtmp("isel", Kl, fn);
+			emit(Oaddr, Kl, r0, CON(n), R);
+			m->base = r0;
+		}
+	}
+	*r = r1;
+}
+
+static void
+seladdr(Ref *r, ANum *an, Fn *fn)
+{
+	Addr a;
+	Ref r0;
+
+	r0 = *r;
+	if (rtype(r0) == RTmp) {
+		amatch(&a, r0, an, fn, 1);
+		if (req(a.base, r0))
+			return;
+		if (a.offset.type == CAddr)
+		if (!req(a.base, R)) {
+			/* apple asm fix */
+			if (!req(a.index, R))
+				return;
+			else {
+				a.index = a.base;
+				a.scale = 1;
+				a.base = R;
+			}
+		}
+		chuse(r0, -1, fn);
+		vgrow(&fn->mem, ++fn->nmem);
+		fn->mem[fn->nmem-1] = a;
+		chuse(a.base, +1, fn);
+		chuse(a.index, +1, fn);
+		*r = MEM(fn->nmem-1);
+	}
+}
+
+static int
+selcmp(Ref arg[2], int k, Fn *fn)
+{
+	int swap;
+	Ref r, *iarg;
+
+	swap = rtype(arg[0]) == RCon;
+	if (swap) {
+		r = arg[1];
+		arg[1] = arg[0];
+		arg[0] = r;
+	}
+	emit(Oxcmp, k, R, arg[1], arg[0]);
+	iarg = curi->arg;
+	if (rtype(arg[0]) == RCon) {
+		assert(k == Kl);
+		iarg[1] = newtmp("isel", k, fn);
+		emit(Ocopy, k, iarg[1], arg[0], R);
+	}
+	fixarg(&iarg[0], k, 0, fn);
+	fixarg(&iarg[1], k, 0, fn);
+	return swap;
+}
+
+static void
+sel(Ins i, ANum *an, Fn *fn)
+{
+	Ref r0, r1, *iarg;
+	int x, k, kc;
+	int64_t sz;
+	Ins *i0, *i1;
+
+	if (rtype(i.to) == RTmp)
+	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
+	if (fn->tmp[i.to.val].nuse == 0) {
+		chuse(i.arg[0], -1, fn);
+		chuse(i.arg[1], -1, fn);
+		return;
+	}
+	i0 = curi;
+	k = i.cls;
+	switch (i.op) {
+	case Odiv:
+	case Orem:
+	case Oudiv:
+	case Ourem:
+		if (i.op == Odiv || i.op == Oudiv)
+			r0 = TMP(RAX), r1 = TMP(RDX);
+		else
+			r0 = TMP(RDX), r1 = TMP(RAX);
+		emit(Ocopy, k, i.to, r0, R);
+		emit(Ocopy, k, R, r1, R);
+		if (rtype(i.arg[1]) == RCon) {
+			/* immediates not allowed for
+			 * divisions in x86
+			 */
+			r0 = newtmp("isel", k, fn);
+		} else
+			r0 = i.arg[1];
+		if (fn->tmp[r0.val].slot != -1)
+			err("unlikely argument %%%s in %s",
+				fn->tmp[r0.val].name, optab[i.op].name);
+		if (i.op == Odiv || i.op == Orem) {
+			emit(Oxidiv, k, R, r0, R);
+			emit(Osign, k, TMP(RDX), TMP(RAX), R);
+		} else {
+			emit(Oxdiv, k, R, r0, R);
+			emit(Ocopy, k, TMP(RDX), CON_Z, R);
+		}
+		emit(Ocopy, k, TMP(RAX), i.arg[0], R);
+		fixarg(&curi->arg[0], k, 0, fn);
+		if (rtype(i.arg[1]) == RCon)
+			emit(Ocopy, k, r0, i.arg[1], R);
+		break;
+	case Osar:
+	case Oshr:
+	case Oshl:
+		if (rtype(i.arg[1]) == RCon)
+			goto Emit;
+		r0 = i.arg[1];
+		i.arg[1] = TMP(RCX);
+		emit(Ocopy, Kw, R, TMP(RCX), R);
+		emiti(i);
+		emit(Ocopy, Kw, TMP(RCX), r0, R);
+		break;
+	case Onop:
+		break;
+	case Ostored:
+	case Ostores:
+	case Ostorel:
+	case Ostorew:
+	case Ostoreh:
+	case Ostoreb:
+		if (rtype(i.arg[0]) == RCon) {
+			if (i.op == Ostored)
+				i.op = Ostorel;
+			if (i.op == Ostores)
+				i.op = Ostorew;
+		}
+		seladdr(&i.arg[1], an, fn);
+		goto Emit;
+	case_Oload:
+		seladdr(&i.arg[0], an, fn);
+		goto Emit;
+	case Ocall:
+	case Osalloc:
+	case Ocopy:
+	case Oadd:
+	case Osub:
+	case Omul:
+	case Oand:
+	case Oor:
+	case Oxor:
+	case Oxtest:
+	case Ostosi:
+	case Odtosi:
+	case Oswtof:
+	case Osltof:
+	case Oexts:
+	case Otruncd:
+	case Ocast:
+	case_OExt:
+Emit:
+		emiti(i);
+		iarg = curi->arg; /* fixarg() can change curi */
+		fixarg(&iarg[0], argcls(&i, 0), 0, fn);
+		fixarg(&iarg[1], argcls(&i, 1), 0, fn);
+		break;
+	case Oalloc:
+	case Oalloc+1:
+	case Oalloc+2: /* == Oalloc1 */
+		/* we need to make sure
+		 * the stack remains aligned
+		 * (rsp = 0) mod 16
+		 */
+		if (rtype(i.arg[0]) == RCon) {
+			sz = fn->con[i.arg[0].val].bits.i;
+			if (sz < 0 || sz >= INT_MAX-15)
+				err("invalid alloc size %"PRId64, sz);
+			sz = (sz + 15)  & -16;
+			emit(Osalloc, Kl, i.to, getcon(sz, fn), R);
+		} else {
+			/* r0 = (i.arg[0] + 15) & -16 */
+			r0 = newtmp("isel", Kl, fn);
+			r1 = newtmp("isel", Kl, fn);
+			emit(Osalloc, Kl, i.to, r0, R);
+			emit(Oand, Kl, r0, r1, getcon(-16, fn));
+			emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn));
+			if (fn->tmp[i.arg[0].val].slot != -1)
+				err("unlikely argument %%%s in %s",
+					fn->tmp[i.arg[0].val].name, optab[i.op].name);
+		}
+		break;
+	default:
+		if (isext(i.op))
+			goto case_OExt;
+		if (isload(i.op))
+			goto case_Oload;
+		if (iscmp(i.op, &kc, &x)) {
+			emit(Oflag+x, k, i.to, R, R);
+			i1 = curi;
+			if (selcmp(i.arg, kc, fn))
+				i1->op = Oflag + cmpop(x);
+			break;
+		}
+		die("unknown instruction %s", optab[i.op].name);
+	}
+
+	while (i0 > curi && --i0) {
+		assert(rslot(i0->arg[0], fn) == -1);
+		assert(rslot(i0->arg[1], fn) == -1);
+	}
+}
+
+static Ins *
+flagi(Ins *i0, Ins *i)
+{
+	while (i>i0) {
+		i--;
+		if (amd64_op[i->op].zflag)
+			return i;
+		if (amd64_op[i->op].lflag)
+			continue;
+		return 0;
+	}
+	return 0;
+}
+
+static void
+seljmp(Blk *b, Fn *fn)
+{
+	Ref r;
+	int c, k;
+	Ins *fi;
+	Tmp *t;
+
+	if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
+		return;
+	assert(b->jmp.type == Jjnz);
+	r = b->jmp.arg;
+	t = &fn->tmp[r.val];
+	b->jmp.arg = R;
+	assert(!req(r, R) && rtype(r) != RCon);
+	if (b->s1 == b->s2) {
+		chuse(r, -1, fn);
+		b->jmp.type = Jjmp;
+		b->s2 = 0;
+		return;
+	}
+	fi = flagi(b->ins, &b->ins[b->nins]);
+	if (!fi || !req(fi->to, r)) {
+		selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, long jnz */
+		b->jmp.type = Jjf + Cine;
+	}
+	else if (iscmp(fi->op, &k, &c)) {
+		if (t->nuse == 1) {
+			if (selcmp(fi->arg, k, fn))
+				c = cmpop(c);
+			*fi = (Ins){.op = Onop};
+		}
+		b->jmp.type = Jjf + c;
+	}
+	else if (fi->op == Oand && t->nuse == 1
+	     && (rtype(fi->arg[0]) == RTmp ||
+	         rtype(fi->arg[1]) == RTmp)) {
+		fi->op = Oxtest;
+		fi->to = R;
+		b->jmp.type = Jjf + Cine;
+		if (rtype(fi->arg[1]) == RCon) {
+			r = fi->arg[1];
+			fi->arg[1] = fi->arg[0];
+			fi->arg[0] = r;
+		}
+	}
+	else {
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (t->nuse == 1)
+			emit(Ocopy, Kw, R, r, R);
+		b->jmp.type = Jjf + Cine;
+	}
+}
+
+static int
+aref(Ref r, ANum *ai)
+{
+	switch (rtype(r)) {
+	case RCon:
+		return 2;
+	case RTmp:
+		return ai[r.val].n;
+	default:
+		die("constant or temporary expected");
+	}
+}
+
+static int
+ascale(Ref r, Con *con)
+{
+	int64_t n;
+
+	if (rtype(r) != RCon)
+		return 0;
+	if (con[r.val].type != CBits)
+		return 0;
+	n = con[r.val].bits.i;
+	return n == 1 || n == 2 || n == 4 || n == 8;
+}
+
+static void
+anumber(ANum *ai, Blk *b, Con *con)
+{
+	/* This should be made obsolete by a proper
+	 * reassoc pass.
+	 *
+	 * Rules:
+	 *
+	 *   RTmp(_) -> 0    tmp
+	 *   ( RTmp(_) -> 1    slot )
+	 *   RCon(_) -> 2    con
+	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
+	 */
+	static char add[10][10] = {
+		[2] [2] = 2,              /* folding */
+		[2] [5] = 5, [5] [2] = 5,
+		[2] [6] = 6, [6] [2] = 6,
+		[2] [7] = 7, [7] [2] = 7,
+		[0] [0] = 4,              /* 4: b + s * i */
+		[0] [3] = 4, [3] [0] = 4,
+		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
+		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
+		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
+		[0] [5] = 7, [5] [0] = 7,
+		[6] [3] = 7, [3] [6] = 7,
+
+	};
+	int a, a1, a2, n1, n2, t1, t2;
+	Ins *i;
+
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (rtype(i->to) == RTmp)
+			ai[i->to.val].i = i;
+		if (i->op != Oadd && i->op != Omul)
+			continue;
+		a1 = aref(i->arg[0], ai);
+		a2 = aref(i->arg[1], ai);
+		t1 = a1 != 1 && a1 != 2;
+		t2 = a2 != 1 && a2 != 2;
+		if (i->op == Oadd) {
+			a = add[n1 = a1][n2 = a2];
+			if (t1 && a < add[0][a2])
+				a = add[n1 = 0][n2 = a2];
+			if (t2 && a < add[a1][0])
+				a = add[n1 = a1][n2 = 0];
+			if (t1 && t2 && a < add[0][0])
+				a = add[n1 = 0][n2 = 0];
+		} else {
+			n1 = n2 = a = 0;
+			if (ascale(i->arg[0], con) && t2)
+				a = 3, n1 = 2, n2 = 0;
+			if (t1 && ascale(i->arg[1], con))
+				a = 3, n1 = 0, n2 = 2;
+		}
+		ai[i->to.val].n = a;
+		ai[i->to.val].l = n1;
+		ai[i->to.val].r = n2;
+	}
+}
+
+static void
+amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
+{
+	Ins *i;
+	int nl, nr, t, s;
+	Ref al, ar;
+
+	if (top)
+		memset(a, 0, sizeof *a);
+	if (rtype(r) == RCon) {
+		addcon(&a->offset, &fn->con[r.val]);
+		return;
+	}
+	assert(rtype(r) == RTmp);
+	i = ai[r.val].i;
+	nl = ai[r.val].l;
+	nr = ai[r.val].r;
+	if (i) {
+		if (nl > nr) {
+			al = i->arg[1];
+			ar = i->arg[0];
+			t = nl, nl = nr, nr = t;
+		} else {
+			al = i->arg[0];
+			ar = i->arg[1];
+		}
+	}
+	switch (ai[r.val].n) {
+	case 3: /* s * i */
+		if (!top) {
+			a->index = al;
+			a->scale = fn->con[ar.val].bits.i;
+		} else
+			a->base = r;
+		break;
+	case 4: /* b + s * i */
+		switch (nr) {
+		case 0:
+			if (fn->tmp[ar.val].slot != -1) {
+				al = i->arg[1];
+				ar = i->arg[0];
+			}
+			a->index = ar;
+			a->scale = 1;
+			break;
+		case 3:
+			amatch(a, ar, ai, fn, 0);
+			break;
+		}
+		r = al;
+	case 0:
+		s = fn->tmp[r.val].slot;
+		if (s != -1)
+			r = SLOT(s);
+		a->base = r;
+		break;
+	case 2: /* constants */
+	case 5: /* o + s * i */
+	case 6: /* o + b */
+	case 7: /* o + b + s * i */
+		amatch(a, ar, ai, fn, 0);
+		amatch(a, al, ai, fn, 0);
+		break;
+	default:
+		die("unreachable");
+	}
+}
+
+/* instruction selection
+ * requires use counts (as given by parsing)
+ */
+void
+amd64_isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i;
+	Phi *p;
+	uint a;
+	int n, al;
+	int64_t sz;
+	ANum *ainfo;
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
+	for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			if (i->op == al) {
+				if (rtype(i->arg[0]) != RCon)
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-15)
+					err("invalid alloc size %"PRId64, sz);
+				sz = (sz + n-1) & -n;
+				sz /= 4;
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				*i = (Ins){.op = Onop};
+			}
+
+	/* process basic blocks */
+	n = fn->ntmp;
+	ainfo = emalloc(n * sizeof ainfo[0]);
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns];
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (a=0; p->blk[a] != b; a++)
+					assert(a+1 < p->narg);
+				fixarg(&p->arg[a], p->cls, 1, fn);
+			}
+		memset(ainfo, 0, n * sizeof ainfo[0]);
+		anumber(ainfo, b, fn->con);
+		seljmp(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			sel(*--i, ainfo, fn);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+	free(ainfo);
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/amd64/sysv.c b/amd64/sysv.c
new file mode 100644
index 0000000..dcaa812
--- /dev/null
+++ b/amd64/sysv.c
@@ -0,0 +1,701 @@
+#include "all.h"
+
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+struct AClass {
+	int inmem;
+	int align;
+	uint size;
+	int cls[2];
+	Ref ref[2];
+};
+
+struct RAlloc {
+	Ins i;
+	RAlloc *link;
+};
+
+static void
+classify(AClass *a, Typ *t, int *pn, int *pe)
+{
+	Seg *seg;
+	int n, s, *cls;
+
+	for (n=0; n<t->nunion; n++) {
+		seg = t->seg[n];
+		for (s=0; *pe<2; (*pe)++) {
+			cls = &a->cls[*pe];
+			for (; *pn<8; s++) {
+				switch (seg[s].type) {
+				case SEnd:
+					goto Done;
+				case SPad:
+					/* don't change anything */
+					break;
+				case SFlt:
+					if (*cls == Kx)
+						*cls = Kd;
+					break;
+				case SInt:
+					*cls = Kl;
+					break;
+				case STyp:
+					classify(a, &typ[seg[s].len], pn, pe);
+					continue;
+				}
+				*pn += seg[s].len;
+			}
+		Done:
+			assert(*pn <= 8);
+			*pn = 0;
+		}
+	}
+}
+
+static void
+typclass(AClass *a, Typ *t)
+{
+	int e, n;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8; moreover,
+	 * it makes it easy to load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16 || sz == 0) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	a->cls[0] = Kx;
+	a->cls[1] = Kx;
+	a->inmem = 0;
+	n = 0;
+	e = 0;
+	classify(a, t, &n, &e);
+}
+
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int n, k, ca, nr[2];
+
+	nr[0] = nr[1] = 0;
+	ca = 0;
+	for (n=0; (uint)n*8<aret->size; n++) {
+		k = KBASE(aret->cls[n]);
+		reg[n] = TMP(retreg[k][nr[k]++]);
+		ca += 1 << (2 * k);
+	}
+	return ca;
+}
+
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == Jret0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = Jret0;
+
+	if (j == Jretc) {
+		typclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			assert(rtype(fn->retr) == RTmp);
+			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, Kl, reg[1], r, R);
+				emit(Oadd, Kl, r, r0, getcon(8, fn));
+			}
+			emit(Oload, Kl, reg[0], r0, R);
+		}
+	} else {
+		k = j - Jretw;
+		if (KBASE(k) == 0) {
+			emit(Ocopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(Ocopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+static int
+argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++)
+		switch (i->op - op + Oarg) {
+		case Oarg:
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+			break;
+		case Oargc:
+			n = i->arg[0].val;
+			typclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			ni = ns = 0;
+			for (n=0; (uint)n*8<a->size; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+			break;
+		case Oarge:
+			if (op == Opar)
+				*env = i->to;
+			else
+				*env = i->arg[0];
+			break;
+		}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+int amd64_sysv_rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
+};
+int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};
+
+MAKESURE(sysv_arrays_ok,
+	sizeof amd64_sysv_rsave == (NGPS+NFPS+1) * sizeof(int) &&
+	sizeof amd64_sysv_rclob == (NCLR+1) * sizeof(int)
+);
+
+/* layout of call's second argument (RCall)
+ *
+ *  29     12    8    4  3  0
+ *  |0...00|x|xxxx|xxxx|xx|xx|                  range
+ *          |    |    |  |  ` gp regs returned (0..2)
+ *          |    |    |  ` sse regs returned   (0..2)
+ *          |    |    ` gp regs passed         (0..6)
+ *          |    ` sse regs passed             (0..8)
+ *          ` 1 if rax is used to pass data    (0..1)
+ */
+
+bits
+amd64_sysv_retregs(Ref r, int p[2])
+{
+	bits b;
+	int ni, nf;
+
+	assert(rtype(r) == RCall);
+	b = 0;
+	ni = r.val & 3;
+	nf = (r.val >> 2) & 3;
+	if (ni >= 1)
+		b |= BIT(RAX);
+	if (ni >= 2)
+		b |= BIT(RDX);
+	if (nf >= 1)
+		b |= BIT(XMM0);
+	if (nf >= 2)
+		b |= BIT(XMM1);
+	if (p) {
+		p[0] = ni;
+		p[1] = nf;
+	}
+	return b;
+}
+
+bits
+amd64_sysv_argregs(Ref r, int p[2])
+{
+	bits b;
+	int j, ni, nf, ra;
+
+	assert(rtype(r) == RCall);
+	b = 0;
+	ni = (r.val >> 4) & 15;
+	nf = (r.val >> 8) & 15;
+	ra = (r.val >> 12) & 1;
+	for (j=0; j<ni; j++)
+		b |= BIT(amd64_sysv_rsave[j]);
+	for (j=0; j<nf; j++)
+		b |= BIT(XMM0+j);
+	if (p) {
+		p[0] = ni + ra;
+		p[1] = nf;
+	}
+	return b | (ra ? BIT(RAX) : 0);
+}
+
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	if (KBASE(ty) == 0)
+		return TMP(amd64_sysv_rsave[(*ni)++]);
+	else
+		return TMP(XMM0 + (*ns)++);
+}
+
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns, al, varc, envc;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], env;
+	RAlloc *ra;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RType);
+		typclass(&aret, &typ[i1->arg[1].val]);
+		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
+	} else
+		ca = argsclass(i0, i1, ac, Oarg, 0, &env);
+
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		r = getcon(-(int64_t)stk, fn);
+		emit(Osalloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax;
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
+				emit(Ostorel, 0, R, aret.ref[1], r);
+				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
+			}
+			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
+			emit(Ostorel, 0, R, aret.ref[0], i1->to);
+			ca += retr(reg, &aret);
+			if (aret.size > 8)
+				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
+			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		/* specific to NAlign == 3 */
+		al = aret.align >= 2 ? aret.align - 2 : 0;
+		ra->i = (Ins){Oalloc+al, r1, {getcon(aret.size, fn)}, Kl};
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	envc = !req(R, env);
+	varc = i1->op == Ovacall;
+	if (varc && envc)
+		err("sysv abi does not support variadic env calls");
+	ca |= (varc | envc) << 12;
+	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
+	if (envc)
+		emit(Ocopy, Kl, TMP(RAX), env, R);
+	if (varc)
+		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oargc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, a->cls[1], r2, r, R);
+				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
+			}
+			emit(Oload, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(Ocopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	r = newtmp("abi", Kl, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == Oargc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, i->arg[0], r1);
+			emit(Oadd, Kl, r1, r, getcon(off, fn));
+		}
+		off += a->size;
+	}
+	emit(Osalloc, Kl, r, getcon(stk, fn), R);
+}
+
+static int
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al, fa;
+	Ref r, env;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = &insb[NIns];
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		typclass(&aret, &typ[fn->retty]);
+		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
+	} else
+		fa = argsclass(i0, i1, ac, Opar, 0, &env);
+
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != Oparc || a->inmem)
+			continue;
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			a->ref[1] = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, a->ref[1], r);
+			emit(Oadd, Kl, r, i->to, getcon(8, fn));
+		}
+		a->ref[0] = newtmp("abi", Kl, fn);
+		emit(Ostorel, 0, R, a->ref[0], i->to);
+		/* specific to NAlign == 3 */
+		al = a->align >= 2 ? a->align - 2 : 0;
+		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
+	}
+
+	if (fn->retty >= 0 && aret.inmem) {
+		r = newtmp("abi", Kl, fn);
+		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
+		fn->retr = r;
+	}
+
+	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s;
+			s += a->size / 4;
+			continue;
+		case 2:
+			emit(Oload, i->cls, i->to, SLOT(-s), R);
+			s += 2;
+			continue;
+		}
+		r = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oparc) {
+			emit(Ocopy, Kl, a->ref[0], r, R);
+			if (a->size > 8) {
+				r = rarg(a->cls[1], &ni, &ns);
+				emit(Ocopy, Kl, a->ref[1], r, R);
+			}
+		} else
+			emit(Ocopy, i->cls, i->to, r, R);
+	}
+
+	if (!req(R, env))
+		emit(Ocopy, Kl, env, TMP(RAX), R);
+
+	return fa | (s*4)<<12;
+}
+
+static Blk *
+split(Fn *fn, Blk *b)
+{
+	Blk *bn;
+
+	++fn->nblk;
+	bn = blknew();
+	bn->nins = &insb[NIns] - curi;
+	idup(&bn->ins, curi, bn->nins);
+	curi = &insb[NIns];
+	bn->visit = ++b->visit;
+	snprintf(bn->name, NString, "%s.%d", b->name, b->visit);
+	bn->loop = b->loop;
+	bn->link = b->link;
+	b->link = bn;
+	return bn;
+}
+
+static void
+chpred(Blk *b, Blk *bp, Blk *bp1)
+{
+	Phi *p;
+	uint a;
+
+	for (p=b->phi; p; p=p->link) {
+		for (a=0; p->blk[a]!=bp; a++)
+			assert(a+1<p->narg);
+		p->blk[a] = bp1;
+	}
+}
+
+static void
+selvaarg(Fn *fn, Blk *b, Ins *i)
+{
+	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
+	Blk *b0, *bstk, *breg;
+	int isint;
+
+	c4 = getcon(4, fn);
+	c8 = getcon(8, fn);
+	c16 = getcon(16, fn);
+	ap = i->arg[0];
+	isint = KBASE(i->cls) == 0;
+
+	/* @b [...]
+	       r0 =l add ap, (0 or 4)
+	       nr =l loadsw r0
+	       r1 =w cultw nr, (48 or 176)
+	       jnz r1, @breg, @bstk
+	   @breg
+	       r0 =l add ap, 16
+	       r1 =l loadl r0
+	       lreg =l add r1, nr
+	       r0 =w add nr, (8 or 16)
+	       r1 =l add ap, (0 or 4)
+	       storew r0, r1
+	   @bstk
+	       r0 =l add ap, 8
+	       lstk =l loadl r0
+	       r1 =l add lstk, 8
+	       storel r1, r0
+	   @b0
+	       %loc =l phi @breg %lreg, @bstk %lstk
+	       i->to =(i->cls) load %loc
+	*/
+
+	loc = newtmp("abi", Kl, fn);
+	emit(Oload, i->cls, i->to, loc, R);
+	b0 = split(fn, b);
+	b0->jmp = b->jmp;
+	b0->s1 = b->s1;
+	b0->s2 = b->s2;
+	if (b->s1)
+		chpred(b->s1, b, b0);
+	if (b->s2 && b->s2 != b->s1)
+		chpred(b->s2, b, b0);
+
+	lreg = newtmp("abi", Kl, fn);
+	nr = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kw, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, r0, r1);
+	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
+	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Oadd, Kl, lreg, r1, nr);
+	emit(Oload, Kl, r1, r0, R);
+	emit(Oadd, Kl, r0, ap, c16);
+	breg = split(fn, b);
+	breg->jmp.type = Jjmp;
+	breg->s1 = b0;
+
+	lstk = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, lstk, c8);
+	emit(Oload, Kl, lstk, r0, R);
+	emit(Oadd, Kl, r0, ap, c8);
+	bstk = split(fn, b);
+	bstk->jmp.type = Jjmp;
+	bstk->s1 = b0;
+
+	b0->phi = alloc(sizeof *b0->phi);
+	*b0->phi = (Phi){
+		.cls = Kl, .to = loc,
+		.narg = 2,
+		.blk = {bstk, breg},
+		.arg = {lstk, lreg},
+	};
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kw, fn);
+	b->jmp.type = Jjnz;
+	b->jmp.arg = r1;
+	b->s1 = breg;
+	b->s2 = bstk;
+	c = getcon(isint ? 48 : 176, fn);
+	emit(Ocmpw+Ciult, Kw, r1, nr, c);
+	emit(Oloadsw, Kl, nr, r0, R);
+	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
+}
+
+static void
+selvastart(Fn *fn, int fa, Ref ap)
+{
+	Ref r0, r1;
+	int gp, fp, sp;
+
+	gp = ((fa >> 4) & 15) * 8;
+	fp = 48 + ((fa >> 8) & 15) * 16;
+	sp = fa >> 12;
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
+	emit(Oadd, Kl, r0, ap, getcon(16, fn));
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
+	emit(Oadd, Kl, r0, ap, getcon(8, fn));
+	r0 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
+	emit(Oadd, Kl, r0, ap, getcon(4, fn));
+	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
+}
+
+void
+amd64_sysv_abi(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *i0, *ip;
+	RAlloc *ral;
+	int n, fa;
+
+	for (b=fn->start; b; b=b->link)
+		b->visit = 0;
+
+	/* lower parameters */
+	for (b=fn->start, i=b->ins; i-b->ins<b->nins; i++)
+		if (!ispar(i->op))
+			break;
+	fa = selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, curi, &insb[NIns] - curi);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower calls, returns, and vararg instructions */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		if (b->visit)
+			continue;
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			switch ((--i)->op) {
+			default:
+				emiti(*i);
+				break;
+			case Ocall:
+			case Ovacall:
+				for (i0=i; i0>b->ins; i0--)
+					if (!isarg((i0-1)->op))
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				break;
+			case Ovastart:
+				selvastart(fn, fa, i->arg[0]);
+				break;
+			case Ovaarg:
+				selvaarg(fn, b, i);
+				break;
+			case Oarg:
+			case Oargc:
+				die("unreachable");
+			}
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/amd64/targ.c b/amd64/targ.c
new file mode 100644
index 0000000..e227574
--- /dev/null
+++ b/amd64/targ.c
@@ -0,0 +1,30 @@
+#include "all.h"
+
+Amd64Op amd64_op[NOp] = {
+#define O(op, t, x) [O##op] =
+#define X(nm, zf, lf) { nm, zf, lf, },
+	#include "../ops.h"
+};
+
+static int
+amd64_memargs(int op)
+{
+	return amd64_op[op].nmem;
+}
+
+Target T_amd64_sysv = {
+	.gpr0 = RAX,
+	.ngpr = NGPR,
+	.fpr0 = XMM0,
+	.nfpr = NFPR,
+	.rglob = BIT(RBP) | BIT(RSP),
+	.nrglob = 2,
+	.rsave = amd64_sysv_rsave,
+	.nrsave = {NGPS, NFPS},
+	.retregs = amd64_sysv_retregs,
+	.argregs = amd64_sysv_argregs,
+	.memargs = amd64_memargs,
+	.abi = amd64_sysv_abi,
+	.isel = amd64_isel,
+	.emitfn = amd64_emitfn,
+};