53 files changed, 8122 insertions, 0 deletions
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..0416fa9
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,5 @@
+qbe
+doc
+.comfile
+*.o
+*.out
diff --git a/src/.tag b/src/.tag
new file mode 100644
index 0000000..5b8c210
--- /dev/null
+++ b/src/.tag
@@ -0,0 +1,11 @@
+Look slot(
+
+Get lisc.h
+Get parse.c
+Get isel.c
+Get spill.c
+Get rega.c
+Get emit.c
+
+New
+|fmt
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..b9c87df
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,17 @@
+BIN = qbe
+OBJ = main.o util.o parse.o mem.o ssa.o copy.o live.o isel.o spill.o rega.o emit.o
+
+CFLAGS = -Wall -Wextra -std=c99 -g -pedantic
+
+$(BIN): $(OBJ)
+	$(CC) $(LDFLAGS) $(OBJ) -o $@
+
+$(OBJ): all.h
+
+.PHONY: clean check syndoc
+clean:
+	rm -f $(BIN) $(OBJ)
+check: $(BIN)
+	test/go.sh all
+syndoc:
+	unison -auto doc ssh://qcar@h/data/d/ssa-doc
diff --git a/src/all.h b/src/all.h
new file mode 100644
index 0000000..e0542da
--- /dev/null
+++ b/src/all.h
@@ -0,0 +1,554 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAKESURE(what, x) typedef char make_sure_##what[(x)?1:-1]
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+typedef unsigned long ulong;
+typedef unsigned long bits;
+
+typedef struct BSet BSet;
+typedef struct Ref Ref;
+typedef struct OpDesc OpDesc;
+typedef struct Ins Ins;
+typedef struct Phi Phi;
+typedef struct Blk Blk;
+typedef struct Use Use;
+typedef struct Tmp Tmp;
+typedef struct Con Con;
+typedef struct Addr Mem;
+typedef struct Fn Fn;
+typedef struct Typ Typ;
+typedef struct Dat Dat;
+
+enum Reg {
+	RXX,
+
+	RAX, /* caller-save */
+	RCX,
+	RDX,
+	RSI,
+	RDI,
+	R8,
+	R9,
+	R10,
+	R11,
+
+	RBX, /* callee-save */
+	R12,
+	R13,
+	R14,
+	R15,
+
+	RBP, /* reserved */
+	RSP,
+
+	XMM0, /* sse */
+	XMM1,
+	XMM2,
+	XMM3,
+	XMM4,
+	XMM5,
+	XMM6,
+	XMM7,
+	XMM8,
+	XMM9,
+	XMM10,
+	XMM11,
+	XMM12,
+	XMM13,
+	XMM14,
+	XMM15,
+
+	Tmp0, /* first non-reg temporary */
+
+	NIReg = R12 - RAX + 1,
+	NFReg = XMM14 - XMM0 + 1,
+	NISave = 9,
+	NFSave = NFReg,
+	NRSave = NISave + NFSave,
+	NRClob = 5,
+};
+
+enum {
+	NString = 32,
+	NPred   = 63,
+	NIns    = 8192,
+	NAlign  = 3,
+	NSeg    = 32,
+	NTyp    = 128,
+	NBit    = CHAR_BIT * sizeof(bits),
+};
+
+MAKESURE(NBit_is_enough, NBit >= (int)Tmp0);
+
+#define BIT(n) ((bits)1 << (n))
+
+struct BSet {
+	uint nt;
+	bits *t;
+};
+
+struct Ref {
+	uint16_t type:2;
+	uint16_t val:14;
+};
+
+enum Alt {
+	AType,
+	ACall,
+	AMem,
+
+	AShift = 12,
+	AMask = (1<<AShift) - 1
+};
+
+enum {
+	RTmp,
+	RCon,
+	RSlot,
+	RAlt,
+
+	RAType = RAlt + AType,
+	RACall = RAlt + ACall,
+	RAMem  = RAlt + AMem,
+
+	NRef = (1<<14) - 1
+};
+
+#define R        (Ref){0, 0}
+#define TMP(x)   (Ref){RTmp, x}
+#define CON(x)   (Ref){RCon, x}
+#define CON_Z    CON(0)          /* reserved zero constant */
+#define SLOT(x)  (Ref){RSlot, x}
+#define TYPE(x)  (Ref){RAlt, (x)|(AType<<AShift)}
+#define CALL(x)  (Ref){RAlt, (x)|(ACall<<AShift)}
+#define MEM(x)   (assert(x<(1<<AShift) && "too many mems"), \
+                 (Ref){RAlt, (x)|(AMem<<AShift)})
+
+static inline int req(Ref a, Ref b)
+{
+	return a.type == b.type && a.val == b.val;
+}
+
+static inline int rtype(Ref r)
+{
+	if (req(r, R))
+		return -1;
+	if (r.type == RAlt)
+		return RAlt + (r.val >> AShift);
+	return r.type;
+}
+
+static inline int isreg(Ref r)
+{
+	return rtype(r) == RTmp && r.val < Tmp0;
+}
+
+enum ICmp {
+#define ICMPS(X) \
+	X(ule)   \
+	X(ult)   \
+	X(sle)   \
+	X(slt)   \
+	X(sgt)   \
+	X(sge)   \
+	X(ugt)   \
+	X(uge)   \
+	X(eq)    \
+	X(ne) /* make sure icmpop() below works! */
+
+#define X(c) IC##c,
+	ICMPS(X)
+#undef X
+	NICmp,
+
+	ICXnp = NICmp, /* x64 specific */
+	ICXp,
+	NXICmp
+};
+
+static inline int icmpop(int c)
+{
+	return c >= ICeq ? c : ICuge - c;
+}
+
+enum FCmp {
+#define FCMPS(X) \
+	X(le)    \
+	X(lt)    \
+	X(gt)    \
+	X(ge)    \
+	X(ne)    \
+	X(eq)    \
+	X(o)     \
+	X(uo)
+
+#define X(c) FC##c,
+	FCMPS(X)
+#undef X
+	NFCmp
+};
+
+enum Class {
+	Kw,
+	Kl,
+	Ks,
+	Kd
+};
+
+#define KWIDE(k) ((k)&1)
+#define KBASE(k) ((k)>>1)
+
+enum Op {
+	OXXX,
+
+	/* public instructions */
+	OAdd,
+	OSub,
+	ODiv,
+	ORem,
+	OUDiv,
+	OURem,
+	OMul,
+	OAnd,
+	OOr,
+	OXor,
+	OSar,
+	OShr,
+	OShl,
+	OCmpw,
+	OCmpw1 = OCmpw + NICmp-1,
+	OCmpl,
+	OCmpl1 = OCmpl + NICmp-1,
+	OCmps,
+	OCmps1 = OCmps + NFCmp-1,
+	OCmpd,
+	OCmpd1 = OCmpd + NFCmp-1,
+
+	OStored,
+	OStores,
+	OStorel,
+	OStorew,
+	OStoreh,
+	OStoreb,
+#define isstore(o) (OStored <= o && o <= OStoreb)
+	OLoadsw,  /* needs to match OExt (mem.c) */
+	OLoaduw,
+	OLoadsh,
+	OLoaduh,
+	OLoadsb,
+	OLoadub,
+	OLoad,
+#define isload(o) (OLoadsw <= o && o <= OLoad)
+	OExtsw,
+	OExtuw,
+	OExtsh,
+	OExtuh,
+	OExtsb,
+	OExtub,
+#define isext(o) (OExtsw <= o && o <= OExtub)
+
+	OExts,
+	OTruncd,
+	OFtosi,
+	OSitof,
+	OCast,
+
+	OAlloc,
+	OAlloc1 = OAlloc + NAlign-1,
+
+	OCopy,
+	NPubOp,
+
+	/* function instructions */
+	OPar = NPubOp,
+	OParc,
+	OArg,
+	OArgc,
+	OCall,
+
+	/* reserved instructions */
+	ONop,
+	OAddr,
+	OSwap,
+	OSign,
+	OSAlloc,
+	OXIDiv,
+	OXDiv,
+	OXCmp,
+	OXSet,
+	OXSetnp = OXSet + ICXnp,
+	OXSetp  = OXSet + ICXp,
+	OXTest,
+	NOp
+};
+
+enum Jmp {
+	JXXX,
+	JRet0,
+	JRetw,
+	JRetl,
+	JRets,
+	JRetd,
+	JRetc,
+#define isret(j) (JRet0 <= j && j <= JRetc)
+	JJmp,
+	JJnz,
+	JXJc,
+	JXJnp = JXJc + ICXnp,
+	JXJp  = JXJc + ICXp,
+	NJmp
+};
+
+struct OpDesc {
+	char *name;
+	int nmem;
+	char argcls[2][4];
+	uint sflag:1; /* sets the zero flag */
+	uint lflag:1; /* leaves flags */
+};
+
+struct Ins {
+	ushort op:14;
+	Ref to;
+	Ref arg[2];
+	ushort cls:2;
+};
+
+struct Phi {
+	Ref to;
+	Ref arg[NPred];
+	Blk *blk[NPred];
+	uint narg;
+	int cls;
+	Phi *link;
+};
+
+struct Blk {
+	Phi *phi;
+	Ins *ins;
+	uint nins;
+	struct {
+		short type;
+		Ref arg;
+	} jmp;
+	Blk *s1;
+	Blk *s2;
+	Blk *link;
+
+	int id;
+	int visit;
+
+	Blk *idom;
+	Blk *dom, *dlink;
+	Blk **fron;
+	int nfron;
+
+	Blk **pred;
+	uint npred;
+	BSet in[1], out[1], gen[1];
+	int nlive[2];
+	int loop;
+	char name[NString];
+};
+
+struct Use {
+	enum {
+		UXXX,
+		UPhi,
+		UIns,
+		UJmp,
+	} type;
+	int bid;
+	union {
+		Ins *ins;
+		Phi *phi;
+	} u;
+};
+
+struct Tmp {
+	char name[NString];
+	Use *use;
+	uint ndef, nuse;
+	uint cost;
+	short slot;
+	short cls;
+	struct {
+		int r;
+		bits m;
+	} hint;
+	int phi;
+	int visit;
+};
+
+struct Con {
+	enum {
+		CUndef,
+		CBits,
+		CAddr,
+	} type;
+	char label[NString];
+	union {
+		int64_t i;
+		double d;
+		float s;
+	} bits;
+	char flt; /* for printing, see parse.c */
+};
+
+typedef struct Addr Addr;
+
+struct Addr { /* x64 addressing */
+	Con offset;
+	Ref base;
+	Ref index;
+	int scale;
+};
+
+struct Fn {
+	Blk *start;
+	Tmp *tmp;
+	Con *con;
+	Mem *mem;
+	int ntmp;
+	int ncon;
+	int nmem;
+	int nblk;
+	int retty; /* index in typ[], -1 if no aggregate return */
+	Ref retr;
+	Blk **rpo;
+	bits reg;
+	int slot;
+	char name[NString];
+};
+
+struct Typ {
+	char name[NString];
+	int dark;
+	uint size;
+	int align;
+
+	struct {
+		uint isflt:1;
+		uint ispad:1;
+		uint len:30;
+	} seg[NSeg+1];
+};
+
+struct Dat {
+	enum {
+		DStart,
+		DEnd,
+		DName,
+		DAlign,
+		DB,
+		DH,
+		DW,
+		DL,
+		DZ
+	} type;
+	union {
+		int64_t num;
+		double fltd;
+		float flts;
+		char *str;
+		struct {
+			char *nam;
+			int64_t off;
+		} ref;
+	} u;
+	char isref;
+	char isstr;
+};
+
+
+/* main.c */
+extern char debug['Z'+1];
+
+/* util.c */
+extern Typ typ[NTyp];
+extern Ins insb[NIns], *curi;
+void diag(char *) __attribute__((noreturn));
+void *emalloc(size_t);
+void *alloc(size_t);
+void freeall(void);
+Blk *blknew(void);
+void emit(int, int, Ref, Ref, Ref);
+void emiti(Ins);
+void idup(Ins **, Ins *, ulong);
+Ins *icpy(Ins *, Ins *, ulong);
+void *vnew(ulong, size_t);
+void vgrow(void *, ulong);
+int phicls(int, Tmp *);
+Ref newtmp(char *, int, Fn *);
+Ref getcon(int64_t, Fn *);
+void addcon(Con *, Con *);
+void dumpts(BSet *, Tmp *, FILE *);
+
+void bsinit(BSet *, uint);
+void bszero(BSet *);
+uint bscount(BSet *);
+void bsset(BSet *, uint);
+void bsclr(BSet *, uint);
+void bscopy(BSet *, BSet *);
+void bsunion(BSet *, BSet *);
+void bsinter(BSet *, BSet *);
+void bsdiff(BSet *, BSet *);
+int bsequal(BSet *, BSet *);
+int bsiter(BSet *, uint *);
+
+static inline int
+bshas(BSet *bs, uint elt)
+{
+	assert(elt < bs->nt * NBit);
+	return (bs->t[elt/NBit] & BIT(elt%NBit)) != 0;
+}
+
+/* parse.c */
+extern OpDesc opdesc[NOp];
+void parse(FILE *, char *, void (Dat *), void (Fn *));
+void printfn(Fn *, FILE *);
+void printref(Ref, Fn *, FILE *);
+void err(char *, ...);
+
+/* mem.c */
+void memopt(Fn *);
+
+/* ssa.c */
+void filluse(Fn *);
+void fillpreds(Fn *);
+void fillrpo(Fn *);
+void ssa(Fn *);
+
+/* copy.c */
+void copy(Fn *);
+
+/* live.c */
+void liveon(BSet *, Blk *, Blk *);
+void filllive(Fn *);
+
+/* isel.c */
+extern int rsave[/* NRSave */];
+extern int rclob[/* NRClob */];
+bits retregs(Ref, int[2]);
+bits argregs(Ref, int[2]);
+void isel(Fn *);
+
+/* spill.c */
+void fillcost(Fn *);
+void spill(Fn *);
+
+/* rega.c */
+void rega(Fn *);
+
+/* emit.c */
+void emitfn(Fn *, FILE *);
+void emitdat(Dat *, FILE *);
+int stashfp(int64_t, int);
+void emitfin(FILE *);
diff --git a/src/copy.c b/src/copy.c
new file mode 100644
index 0000000..ef2d01d
--- /dev/null
+++ b/src/copy.c
@@ -0,0 +1,159 @@
+#include "all.h"
+
+typedef struct RList RList;
+struct RList {
+	int t;
+	RList *l;
+};
+
+static Ref
+copyof(Ref r, Ref *cp)
+{
+	if (rtype(r) == RTmp)
+		return cp[r.val];
+	else
+		return r;
+}
+
+static void
+update(Ref r, Ref rcp, Ref *cp, RList **w)
+{
+	RList *l;
+
+	if (!req(cp[r.val], rcp)) {
+		cp[r.val] = rcp;
+		l = emalloc(sizeof *l);
+		l->t = r.val;
+		l->l = *w;
+		*w = l;
+	}
+}
+
+static void
+visitphi(Phi *p, Ref *cp, RList **w)
+{
+	uint a;
+	Ref r, r1;
+
+	r = R;
+	for (a=0; a<p->narg; a++) {
+		r1 = copyof(p->arg[a], cp);
+		if (req(r1, R))
+			continue;
+		if (req(r, R) || req(r, r1))
+			r = r1;
+		else {
+			r = p->to;
+			break;
+		}
+	}
+	assert(!req(r, R));
+	update(p->to, r, cp, w);
+}
+
+static void
+visitins(Ins *i, Ref *cp, RList **w)
+{
+	Ref r;
+
+	if (i->op == OCopy) {
+		r = copyof(i->arg[0], cp);
+		update(i->to, r, cp, w);
+	} else if (!req(i->to, R)) {
+		assert(rtype(i->to) == RTmp);
+		update(i->to, i->to, cp, w);
+	}
+}
+
+void
+copy(Fn *fn)
+{
+	Blk *b;
+	Ref *cp, r;
+	RList *w, *w1;
+	Use *u, *u1;
+	Ins *i;
+	Phi *p, **pp;
+	uint a;
+	int t;
+
+	w = 0;
+	cp = emalloc(fn->ntmp * sizeof cp[0]);
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link)
+			visitphi(p, cp, &w);
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			visitins(i, cp, &w);
+	}
+	while ((w1=w)) {
+		t = w->t;
+		w = w->l;
+		free(w1);
+		u = fn->tmp[t].use;
+		u1 = u + fn->tmp[t].nuse;
+		for (; u<u1; u++)
+			switch (u->type) {
+			default:
+				diag("copy: invalid use");
+			case UPhi:
+				visitphi(u->u.phi, cp, &w);
+				break;
+			case UIns:
+				visitins(u->u.ins, cp, &w);
+				break;
+			case UJmp:
+				break;
+			}
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (pp=&b->phi; (p=*pp);) {
+			r = cp[p->to.val];
+			if (!req(r, p->to)) {
+				*pp = p->link;
+				continue;
+			}
+			for (a=0; a<p->narg; a++)
+				if (rtype(p->arg[a]) == RTmp) {
+					r = cp[p->arg[a].val];
+					assert(!req(r, R));
+					p->arg[a] = r;
+				}
+			pp=&p->link;
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			r = cp[i->to.val];
+			if (!req(r, i->to)) {
+				*i = (Ins){.op = ONop};
+				continue;
+			}
+			for (a=0; a<2; a++)
+				if (rtype(i->arg[a]) == RTmp) {
+					r = cp[i->arg[a].val];
+					assert(!req(r, R));
+					i->arg[a] = r;
+				}
+		}
+		if (rtype(b->jmp.arg) == RTmp) {
+			r = cp[b->jmp.arg.val];
+			assert(!req(r, R));
+			b->jmp.arg = r;
+		}
+	}
+	if (debug['C']) {
+		fprintf(stderr, "\n> Copy information:");
+		for (t=Tmp0; t<fn->ntmp; t++) {
+			if (req(cp[t], R)) {
+				fprintf(stderr, "\n%10s not seen!",
+					fn->tmp[t].name);
+			}
+			else if (!req(cp[t], TMP(t))) {
+				fprintf(stderr, "\n%10s copy of ",
+					fn->tmp[t].name);
+				printref(cp[t], fn, stderr);
+			}
+		}
+		fprintf(stderr, "\n\n> After copy elimination:\n");
+		printfn(fn, stderr);
+	}
+	free(cp);
+}
diff --git a/src/emit.c b/src/emit.c
new file mode 100644
index 0000000..b9dc782
--- /dev/null
+++ b/src/emit.c
@@ -0,0 +1,666 @@
+#include "all.h"
+
+enum {
+	SLong = 0,
+	SWord = 1,
+	SShort = 2,
+	SByte = 3,
+
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+/* Instruction format strings:
+ *
+ * if the format string starts with -, the instruction
+ * is assumed to be 3-address and is put in 2-address
+ * mode using an extra mov if necessary
+ *
+ * if the format string starts with +, the same as the
+ * above applies, but commutativity is also assumed
+ *
+ * %k  is used to set the class of the instruction,
+ *     it'll expand to "l", "q", "ss", "sd", depending
+ *     on the instruction class
+ * %0  designates the first argument
+ * %1  designates the second argument
+ * %=  designates the result
+ *
+ * if %k is not used, a prefix to 0, 1, or = must be
+ * added, it can be:
+ *   M - memory reference
+ *   L - long  (64 bits)
+ *   W - word  (32 bits)
+ *   H - short (16 bits)
+ *   B - byte  (8 bits)
+ *   S - single precision float
+ *   D - double precision float
+ */
+static struct {
+	short op;
+	short cls;
+	char *asm;
+} omap[] = {
+	{ OAdd,    Ka, "+add%k %1, %=" },
+	{ OSub,    Ka, "-sub%k %1, %=" },
+	{ OAnd,    Ki, "+and%k %1, %=" },
+	{ OOr,     Ki, "+or%k %1, %=" },
+	{ OXor,    Ki, "+xor%k %1, %=" },
+	{ OSar,    Ki, "-sar%k %B1, %=" },
+	{ OShr,    Ki, "-shr%k %B1, %=" },
+	{ OShl,    Ki, "-shl%k %B1, %=" },
+	{ OMul,    Ki, "+imul%k %1, %=" },
+	{ OMul,    Ks, "+mulss %1, %=" }, /* fixme */
+	{ OMul,    Kd, "+mulsd %1, %=" },
+	{ ODiv,    Ka, "-div%k %1, %=" },
+	{ OStorel, Ka, "movq %L0, %M1" },
+	{ OStorew, Ka, "movl %W0, %M1" },
+	{ OStoreh, Ka, "movw %H0, %M1" },
+	{ OStoreb, Ka, "movb %B0, %M1" },
+	{ OStores, Ka, "movss %S0, %M1" },
+	{ OStored, Ka, "movsd %D0, %M1" },
+	{ OLoad,   Ka, "mov%k %M0, %=" },
+	{ OLoadsw, Kl, "movslq %M0, %L=" },
+	{ OLoadsw, Kw, "movl %M0, %W=" },
+	{ OLoaduw, Ki, "movl %M0, %W=" },
+	{ OLoadsh, Ki, "movsw%k %M0, %=" },
+	{ OLoaduh, Ki, "movzw%k %M0, %=" },
+	{ OLoadsb, Ki, "movsb%k %M0, %=" },
+	{ OLoadub, Ki, "movzb%k %M0, %=" },
+	{ OExtsw,  Kl, "movslq %W0, %L=" },
+	{ OExtuw,  Kl, "movl %W0, %W=" },
+	{ OExtsh,  Ki, "movsw%k %H0, %=" },
+	{ OExtuh,  Ki, "movzw%k %H0, %=" },
+	{ OExtsb,  Ki, "movsb%k %B0, %=" },
+	{ OExtub,  Ki, "movzb%k %B0, %=" },
+
+	{ OExts,   Kd, "cvtss2sd %0, %=" },  /* see if factorization is possible */
+	{ OTruncd, Ks, "cvttsd2ss %0, %=" },
+	{ OFtosi,  Kw, "cvttss2si %0, %=" },
+	{ OFtosi,  Kl, "cvttsd2si %0, %=" },
+	{ OSitof,  Ks, "cvtsi2ss %W0, %=" },
+	{ OSitof,  Kd, "cvtsi2sd %L0, %=" },
+	{ OCast,   Ki, "movq %D0, %L=" },
+	{ OCast,   Ka, "movq %L0, %D=" },
+
+	{ OAddr,   Ki, "lea%k %M0, %=" },
+	{ OSwap,   Ki, "xchg%k %0, %1" },
+	{ OSign,   Kl, "cqto" },
+	{ OSign,   Kw, "cltd" },
+	{ OXDiv,   Ki, "div%k %0" },
+	{ OXIDiv,  Ki, "idiv%k %0" },
+	{ OXCmp,   Ks, "comiss %S0, %S1" },  /* fixme, Kf */
+	{ OXCmp,   Kd, "comisd %D0, %D1" },
+	{ OXCmp,   Ki, "cmp%k %0, %1" },
+	{ OXTest,  Ki, "test%k %0, %1" },
+	{ OXSet+ICeq,  Ki, "setz %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsle, Ki, "setle %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICslt, Ki, "setl %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsgt, Ki, "setg %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsge, Ki, "setge %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICne,  Ki, "setnz %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICXnp, Ki, "setnp %B=\n\tmovsb%k %B=, %=" },
+	{ OXSet+ICXp,  Ki, "setp %B=\n\tmovsb%k %B=, %=" },
+	{ NOp, 0, 0 }
+};
+
+static char *rname[][4] = {
+	[RAX] = {"rax", "eax", "ax", "al"},
+	[RBX] = {"rbx", "ebx", "bx", "bl"},
+	[RCX] = {"rcx", "ecx", "cx", "cl"},
+	[RDX] = {"rdx", "edx", "dx", "dl"},
+	[RSI] = {"rsi", "esi", "si", "sil"},
+	[RDI] = {"rdi", "edi", "di", "dil"},
+	[RBP] = {"rbp", "ebp", "bp", "bpl"},
+	[RSP] = {"rsp", "esp", "sp", "spl"},
+	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
+	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
+	[R10] = {"r10", "r10d", "r10w", "r10b"},
+	[R11] = {"r11", "r11d", "r11w", "r11b"},
+	[R12] = {"r12", "r12d", "r12w", "r12b"},
+	[R13] = {"r13", "r13d", "r13w", "r13b"},
+	[R14] = {"r14", "r14d", "r14w", "r14b"},
+	[R15] = {"r15", "r15d", "r15w", "r15b"},
+};
+
+
+static int
+slot(int s, Fn *fn)
+{
+	struct { int i:14; } x;
+
+	/* sign extend s using a bitfield */
+	x.i = s;
+	assert(NAlign == 3);
+	if (x.i < 0)
+		return -4 * x.i;
+	else {
+		assert(fn->slot >= x.i);
+		return -4 * (fn->slot - x.i);
+	}
+}
+
+static void
+emitcon(Con *con, FILE *f)
+{
+	switch (con->type) {
+	default:
+		diag("emit: invalid constant");
+	case CAddr:
+		fputs(con->label, f);
+		if (con->bits.i)
+			fprintf(f, "%+"PRId64, con->bits.i);
+		break;
+	case CBits:
+		fprintf(f, "%"PRId64, con->bits.i);
+		break;
+	}
+}
+
+static char *
+regtoa(int reg, int sz)
+{
+	static char buf[6];
+
+	if (reg >= XMM0) {
+		sprintf(buf, "xmm%d", reg-XMM0);
+		return buf;
+	} else
+		return rname[reg][sz];
+}
+
+static Ref
+getarg(char c, Ins *i)
+{
+	switch (c) {
+	default:
+		diag("emit: 0, 1, = expected in format");
+	case '0':
+		return i->arg[0];
+	case '1':
+		return i->arg[1];
+	case '=':
+		return i->to;
+	}
+}
+
+static void emitins(Ins, Fn *, FILE *);
+
+static void
+emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
+{
+	Ins icp;
+
+	icp.op = OCopy;
+	icp.arg[0] = r2;
+	icp.to = r1;
+	icp.cls = k;
+	emitins(icp, fn, f);
+}
+
+static void
+emitf(char *s, Ins *i, Fn *fn, FILE *f)
+{
+	static char clstoa[][3] = {"l", "q", "ss", "sd"};
+	char c;
+	int sz;
+	Ref ref;
+	Mem *m;
+	Con off;
+
+	switch (*s) {
+	case '+':
+		if (req(i->arg[1], i->to)) {
+			ref = i->arg[0];
+			i->arg[0] = i->arg[1];
+			i->arg[1] = ref;
+		}
+		/* fall through */
+	case '-':
+		if (req(i->arg[1], i->to) && !req(i->arg[0], i->to))
+			diag("emit: cannot convert to 2-address");
+		emitcopy(i->to, i->arg[0], i->cls, fn, f);
+		s++;
+		break;
+	}
+
+	fputc('\t', f);
+Next:
+	while ((c = *s++) != '%')
+		if (!c) {
+			fputc('\n', f);
+			return;
+		} else
+			fputc(c, f);
+	switch ((c = *s++)) {
+	default:
+		diag("emit: invalid escape");
+	case '%':
+		fputc('%', f);
+		break;
+	case 'k':
+		fputs(clstoa[i->cls], f);
+		break;
+	case '0':
+	case '1':
+	case '=':
+		sz = KWIDE(i->cls) ? SLong : SWord;
+		s--;
+		/* fall through */
+	case 'D':
+	case 'S':
+	Ref:
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		default:
+			diag("emit: invalid reference");
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "%%%s", regtoa(ref.val, sz));
+			break;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RAMem:
+		Mem:
+			m = &fn->mem[ref.val & AMask];
+			if (rtype(m->base) == RSlot) {
+				off.type = CBits;
+				off.bits.i = slot(m->base.val, fn);
+				addcon(&m->offset, &off);
+				m->base = TMP(RBP);
+			}
+			if (m->offset.type != CUndef)
+				emitcon(&m->offset, f);
+			if (req(m->base, R) && req(m->index, R))
+				break;
+			fputc('(', f);
+			if (!req(m->base, R))
+				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
+			if (!req(m->index, R))
+				fprintf(f, ", %%%s, %d",
+					regtoa(m->index.val, SLong),
+					m->scale
+				);
+			fputc(')', f);
+			break;
+		case RCon:
+			fputc('$', f);
+			emitcon(&fn->con[ref.val], f);
+			break;
+		}
+		break;
+	case 'L':
+		sz = SLong;
+		goto Ref;
+	case 'W':
+		sz = SWord;
+		goto Ref;
+	case 'H':
+		sz = SShort;
+		goto Ref;
+	case 'B':
+		sz = SByte;
+		goto Ref;
+	case 'M':
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		default:
+			diag("emit: invalid memory reference");
+		case RAMem:
+			goto Mem;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RCon:
+			emitcon(&fn->con[ref.val], f);
+			fprintf(f, "(%%rip)");
+			break;
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
+			break;
+		}
+		break;
+	}
+	goto Next;
+}
+
+static void
+emitins(Ins i, Fn *fn, FILE *f)
+{
+	Ref r;
+	int64_t val;
+	int o;
+
+	switch (i.op) {
+	default:
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary
+			 * search */
+			if (omap[o].op == NOp)
+				diag("emit: no entry found for instruction");
+			if (omap[o].op == i.op)
+			if (omap[o].cls == i.cls
+			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
+			|| (omap[o].cls == Ka))
+				break;
+		}
+		emitf(omap[o].asm, &i, fn, f);
+		break;
+	case ONop:
+		/* just do nothing for nops, they are inserted
+		 * by some passes */
+		break;
+	case OMul:
+		/* here, we try to use the 3-addresss form
+		 * of multiplication when possible */
+		if (rtype(i.arg[1]) == RCon) {
+			r = i.arg[0];
+			i.arg[0] = i.arg[1];
+			i.arg[1] = r;
+		}
+		if (KBASE(i.cls) == 0 /* only available for ints */
+		&& rtype(i.arg[0]) == RCon
+		&& rtype(i.arg[1]) == RTmp) {
+			emitf("imul%k %0, %1, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case OSub:
+		/* we have to use the negation trick to handle
+		 * some 3-address substractions */
+		if (req(i.to, i.arg[1])) {
+			emitf("neg%k %=", &i, fn, f);
+			emitf("add%k %0, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case OCopy:
+		/* make sure we don't emit useless copies,
+		 * also, we can use a trick to load 64-bits
+		 * registers, it's detailed in my note below
+		 * http://c9x.me/art/notes.html?09/19/2015 */
+		if (req(i.to, R) || req(i.arg[0], R))
+			break;
+		if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& i.cls == Kl
+		&& fn->con[i.arg[0].val].type == CBits
+		&& (val = fn->con[i.arg[0].val].bits.i) >= 0
+		&& val <= UINT32_MAX) {
+			emitf("movl %W0, %W=", &i, fn, f);
+		} else if (!req(i.arg[0], i.to))
+			emitf("mov%k %0, %=", &i, fn, f);
+		break;
+	case OCall:
+		/* calls simply have a weird syntax in AT&T
+		 * assembly... */
+		switch (rtype(i.arg[0])) {
+		default:
+			diag("emit: invalid call instruction");
+		case RCon:
+			fprintf(f, "\tcallq ");
+			emitcon(&fn->con[i.arg[0].val], f);
+			fprintf(f, "\n");
+			break;
+		case RTmp:
+			emitf("callq *%L0", &i, fn, f);
+			break;
+		}
+		break;
+	case OSAlloc:
+		/* there is no good reason why this is here
+		 * maybe we should split OSAlloc in 2 different
+		 * instructions depending on the result
+		 */
+		emitf("subq %L0, %%rsp", &i, fn, f);
+		if (!req(i.to, R))
+			emitcopy(i.to, TMP(RSP), Kl, fn, f);
+		break;
+	case OSwap:
+		if (KBASE(i.cls) == 0)
+			goto Table;
+		/* for floats, there is no swap instruction
+		 * so we use xmm15 as a temporary
+		 */
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
+		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
+		break;
+	}
+}
+
+static int
+cneg(int cmp)
+{
+	switch (cmp) {
+	default:   diag("emit: cneg() unhandled comparison");
+	case ICule: return ICugt;
+	case ICult: return ICuge;
+	case ICsle: return ICsgt;
+	case ICslt: return ICsge;
+	case ICsgt: return ICsle;
+	case ICsge: return ICslt;
+	case ICugt: return ICule;
+	case ICuge: return ICult;
+	case ICeq:  return ICne;
+	case ICne:  return ICeq;
+	case ICXnp: return ICXp;
+	case ICXp:  return ICXnp;
+	}
+}
+
+static int
+framesz(Fn *fn)
+{
+	int i, o, f;
+
+	assert(NAlign == 3);
+	for (i=0, o=0; i<NRClob; i++)
+		o ^= 1 & (fn->reg >> rclob[i]);
+	f = fn->slot;
+	f = (f + 3) & -4;
+	return 4*f + 8*o;
+}
+
+void
+emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[] = {
+		[ICeq]  = "z",
+		[ICule] = "be",
+		[ICult] = "b",
+		[ICsle] = "le",
+		[ICslt] = "l",
+		[ICsgt] = "g",
+		[ICsge] = "ge",
+		[ICugt] = "a",
+		[ICuge] = "ae",
+		[ICne]  = "nz",
+		[ICXnp] = "np",
+		[ICXp]  = "p"
+	};
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, fs;
+
+	fprintf(f,
+		".text\n"
+		".globl %s\n"
+		".type %s, @function\n"
+		"%s:\n"
+		"\tpush %%rbp\n"
+		"\tmov %%rsp, %%rbp\n",
+		fn->name, fn->name, fn->name
+	);
+	fs = framesz(fn);
+	if (fs)
+		fprintf(f, "\tsub $%d, %%rsp\n", fs);
+	for (r=rclob; r-rclob < NRClob; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r);
+			emitf("pushq %L0", &itmp, fn, f);
+		}
+
+	for (b=fn->start; b; b=b->link) {
+		fprintf(f, ".L%s:\n", b->name);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, fn, f);
+		switch (b->jmp.type) {
+		case JRet0:
+			for (r=&rclob[NRClob]; r>rclob;)
+				if (fn->reg & BIT(*--r)) {
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, fn, f);
+				}
+			fprintf(f,
+				"\tleave\n"
+				"\tret\n"
+			);
+			break;
+		case JJmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp .L%s\n", b->s1->name);
+			break;
+		default:
+			c = b->jmp.type - JXJc;
+			if (0 <= c && c <= NXICmp) {
+				if (b->link == b->s2) {
+					s = b->s1;
+				} else if (b->link == b->s1) {
+					c = cneg(c);
+					s = b->s2;
+				} else
+					diag("emit: unhandled jump (1)");
+				fprintf(f, "\tj%s .L%s\n", ctoa[c], s->name);
+				break;
+			}
+			diag("emit: unhandled jump (2)");
+		}
+	}
+
+}
+
+void
+emitdat(Dat *d, FILE *f)
+{
+	static int align;
+	static char *dtoa[] = {
+		[DAlign] = ".align",
+		[DB] = "\t.byte",
+		[DH] = "\t.value",
+		[DW] = "\t.long",
+		[DL] = "\t.quad"
+	};
+
+	switch (d->type) {
+	case DStart:
+		align = 0;
+		fprintf(f, ".data\n");
+		break;
+	case DEnd:
+		break;
+	case DName:
+		if (!align)
+			fprintf(f, ".align 8\n");
+		fprintf(f,
+			".globl %s\n"
+			".type %s, @object\n"
+			"%s:\n",
+			d->u.str, d->u.str, d->u.str
+		);
+		break;
+	case DZ:
+		fprintf(f, "\t.fill %"PRId64",1,0\n", d->u.num);
+		break;
+	default:
+		if (d->type == DAlign)
+			align = 1;
+
+		if (d->isstr) {
+			if (d->type != DB)
+				err("strings only supported for 'b' currently");
+			fprintf(f, "\t.ascii \"%s\"\n", d->u.str);
+		}
+		else if (d->isref) {
+			fprintf(f, "%s %s%+"PRId64"\n",
+				dtoa[d->type], d->u.ref.nam,
+				d->u.ref.off);
+		}
+		else {
+			fprintf(f, "%s %"PRId64"\n",
+				dtoa[d->type], d->u.num);
+		}
+		break;
+	}
+}
+
+typedef struct FBits FBits;
+
+struct FBits {
+	int64_t bits;
+	int wide;
+	FBits *link;
+};
+
+static FBits *stash;
+
+int
+stashfp(int64_t n, int w)
+{
+	FBits **pb, *b;
+	int i;
+
+	/* does a dumb de-dup of fp constants
+	 * this should be the linker's job */
+	for (pb=&stash, i=0; (b=*pb); pb=&b->link, i++)
+		if (n == b->bits && w == b->wide)
+			return i;
+	b = emalloc(sizeof *b);
+	b->bits = n;
+	b->wide = w;
+	b->link = 0;
+	*pb = b;
+	return i;
+}
+
+void
+emitfin(FILE *f)
+{
+	FBits *b;
+	int i;
+
+	if (!stash)
+		return;
+	fprintf(f, "/* floating point constants */\n");
+	fprintf(f, ".data\n.align 8\n");
+	for (b=stash, i=0; b; b=b->link, i++)
+		if (b->wide)
+			fprintf(f,
+				".Lfp%d:\n"
+				"\t.quad %"PRId64
+				" /* %f */\n",
+				i, b->bits,
+				*(double *)&b->bits
+			);
+	for (b=stash, i=0; b; b=b->link, i++)
+		if (!b->wide)
+			fprintf(f,
+				".Lfp%d:\n"
+				"\t.long %"PRId64
+				" /* %lf */\n",
+				i, b->bits & 0xffffffff,
+				*(float *)&b->bits
+			);
+	while ((b=stash)) {
+		stash = b->link;
+		free(b);
+	}
+}
diff --git a/src/isel.c b/src/isel.c
new file mode 100644
index 0000000..48e29ef
--- /dev/null
+++ b/src/isel.c
@@ -0,0 +1,1135 @@
+#include "all.h"
+#include <limits.h>
+
+/* For x86_64, do the following:
+ *
+ * - lower calls
+ * - check that constants are used only in
+ *   places allowed
+ * - ensure immediates always fit in 32b
+ * - explicit machine register contraints
+ *   on instructions like division.
+ * - implement fast locals (the streak of
+ *   constant allocX in the first basic block)
+ * - recognize complex addressing modes
+ *
+ * Invariant: the use counts that are used
+ *            in sel() must be sound.  This
+ *            is not so trivial, maybe the
+ *            dce should be moved out...
+ */
+
+typedef struct ANum ANum;
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+struct ANum {
+	char n, l, r;
+	Ins *i;
+	Ref mem;
+};
+
+static void amatch(Addr *, Ref, ANum *, Fn *, int);
+
+static int
+fcmptoi(int fc)
+{
+	switch (fc) {
+	default:   diag("isel: fcmptoi defaulted");
+	case FCle: return ICule;
+	case FClt: return ICult;
+	case FCgt: return ICugt;
+	case FCge: return ICuge;
+	case FCne: return ICne;
+	case FCeq: return ICeq;
+	case FCo:  return ICXnp;
+	case FCuo: return ICXp;
+	}
+}
+
+static int
+iscmp(int op, int *pk, int *pc)
+{
+	int k, c;
+
+	if (OCmpw <= op && op <= OCmpw1) {
+		c = op - OCmpw;
+		k = Kw;
+	}
+	else if (OCmpl <= op && op <= OCmpl1) {
+		c = op - OCmpl;
+		k = Kl;
+	}
+	else if (OCmps <= op && op <= OCmps1) {
+		c = fcmptoi(op - OCmps);
+		k = Ks;
+	}
+	else if (OCmpd <= op && op <= OCmpd1) {
+		c = fcmptoi(op - OCmpd);
+		k = Kd;
+	}
+	else
+		return 0;
+	if (pk)
+		*pk = k;
+	if (pc)
+		*pc = c;
+	return 1;
+}
+
+static int
+noimm(Ref r, Fn *fn)
+{
+	int64_t val;
+
+	if (rtype(r) != RCon)
+		return 0;
+	switch (fn->con[r.val].type) {
+	default:
+		diag("isel: invalid constant");
+	case CAddr:
+		/* we only support the 'small'
+		 * code model of the ABI, this
+		 * means that we can always
+		 * address data with 32bits
+		 */
+		return 0;
+	case CBits:
+		val = fn->con[r.val].bits.i;
+		return (val < INT32_MIN || val > INT32_MAX);
+	}
+}
+
+static int
+rslot(Ref r, Fn *fn)
+{
+	if (rtype(r) != RTmp)
+		return -1;
+	return fn->tmp[r.val].slot;
+}
+
+static int
+argcls(Ins *i, int n)
+{
+	return opdesc[i->op].argcls[n][i->cls];
+}
+
+static void
+fixarg(Ref *r, int k, int phi, Fn *fn)
+{
+	Addr a;
+	Ref r0, r1;
+	int s, n;
+
+	r1 = r0 = *r;
+	s = rslot(r0, fn);
+	if (KBASE(k) == 1 && rtype(r0) == RCon) {
+		/* load floating points from memory
+		 * slots, they can't be used as
+		 * immediates
+		 */
+		r1 = MEM(fn->nmem);
+		vgrow(&fn->mem, ++fn->nmem);
+		memset(&a, 0, sizeof a);
+		a.offset.type = CAddr;
+		n = stashfp(fn->con[r0.val].bits.i, KWIDE(k));
+		sprintf(a.offset.label, ".Lfp%d", n);
+		fn->mem[fn->nmem-1] = a;
+	}
+	else if (!phi && k == Kl && noimm(r0, fn)) {
+		/* load constants that do not fit in
+		 * a 32bit signed integer into a
+		 * long temporary
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(OCopy, Kl, r1, r0, R);
+	}
+	else if (s != -1) {
+		/* load fast locals' addresses into
+		 * temporaries right before the
+		 * instruction
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(OAddr, Kl, r1, SLOT(s), R);
+	}
+	*r = r1;
+}
+
+static void
+chuse(Ref r, int du, Fn *fn)
+{
+	if (rtype(r) == RTmp)
+		fn->tmp[r.val].nuse += du;
+}
+
+static void
+seladdr(Ref *r, ANum *an, Fn *fn)
+{
+	Addr a;
+	Ref r0, r1;
+
+	r0 = *r;
+	if (rtype(r0) == RTmp) {
+		chuse(r0, -1, fn);
+		r1 = an[r0.val].mem;
+		if (req(r1, R)) {
+			amatch(&a, r0, an, fn, 1);
+			vgrow(&fn->mem, ++fn->nmem);
+			fn->mem[fn->nmem-1] = a;
+			r1 = MEM(fn->nmem-1);
+			chuse(a.base, +1, fn);
+			chuse(a.index, +1, fn);
+			if (rtype(a.base) != RTmp)
+			if (rtype(a.index) != RTmp)
+				an[r0.val].mem = r1;
+		}
+		*r = r1;
+	}
+}
+
+static void
+selcmp(Ref arg[2], int k, Fn *fn)
+{
+	Ref r;
+
+	if (rtype(arg[0]) == RCon) {
+		r = arg[1];
+		arg[1] = arg[0];
+		arg[0] = r;
+	}
+	assert(rtype(arg[0]) != RCon);
+	emit(OXCmp, k, R, arg[1], arg[0]);
+	fixarg(&curi->arg[0], k, 0, fn);
+}
+
+static void
+sel(Ins i, ANum *an, Fn *fn)
+{
+	Ref r0, r1;
+	int x, k, kc;
+	int64_t val;
+	Ins *i0;
+
+	if (rtype(i.to) == RTmp)
+	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
+	if (fn->tmp[i.to.val].nuse == 0) {
+		chuse(i.arg[0], -1, fn);
+		chuse(i.arg[1], -1, fn);
+		return;
+	}
+	i0 = curi;
+	k = i.cls;
+	switch (i.op) {
+	case ODiv:
+	case ORem:
+	case OUDiv:
+	case OURem:
+		if (i.op == ODiv || i.op == OUDiv)
+			r0 = TMP(RAX), r1 = TMP(RDX);
+		else
+			r0 = TMP(RDX), r1 = TMP(RAX);
+		emit(OCopy, k, i.to, r0, R);
+		emit(OCopy, k, R, r1, R);
+		if (rtype(i.arg[1]) == RCon) {
+			/* immediates not allowed for
+			 * divisions in x86
+			 */
+			r0 = newtmp("isel", k, fn);
+		} else
+			r0 = i.arg[1];
+		if (i.op == ODiv || i.op == ORem) {
+			emit(OXIDiv, k, R, r0, R);
+			emit(OSign, k, TMP(RDX), TMP(RAX), R);
+		} else {
+			emit(OXDiv, k, R, r0, R);
+			emit(OCopy, k, TMP(RDX), CON_Z, R);
+		}
+		emit(OCopy, k, TMP(RAX), i.arg[0], R);
+		if (rtype(i.arg[1]) == RCon)
+			emit(OCopy, k, r0, i.arg[1], R);
+		break;
+	case OSar:
+	case OShr:
+	case OShl:
+		if (rtype(i.arg[1]) == RCon)
+			goto Emit;
+		r0 = i.arg[1];
+		i.arg[1] = TMP(RCX);
+		emit(OCopy, Kw, R, TMP(RCX), R);
+		emiti(i);
+		emit(OCopy, Kw, TMP(RCX), r0, R);
+		break;
+	case ONop:
+		break;
+	case OStored:
+	case OStores:
+	case OStorel:
+	case OStorew:
+	case OStoreh:
+	case OStoreb:
+		if (rtype(i.arg[0]) == RCon) {
+			if (i.op == OStored)
+				i.op = OStorel;
+			if (i.op == OStores)
+				i.op = OStorew;
+		}
+		seladdr(&i.arg[1], an, fn);
+		goto Emit;
+	case_OLoad:
+		seladdr(&i.arg[0], an, fn);
+		goto Emit;
+	case OCall:
+	case OSAlloc:
+	case OCopy:
+	case OAdd:
+	case OSub:
+	case OMul:
+	case OAnd:
+	case OOr:
+	case OXor:
+	case OXTest:
+	case OFtosi:
+	case OSitof:
+	case OExts:
+	case OTruncd:
+	case OCast:
+	case_OExt:
+Emit:
+		emiti(i);
+		fixarg(&curi->arg[0], argcls(curi, 0), 0, fn);
+		fixarg(&curi->arg[1], argcls(curi, 1), 0, fn);
+		break;
+	case OAlloc:
+	case OAlloc+1:
+	case OAlloc+2: /* == OAlloc1 */
+		/* we need to make sure
+		 * the stack remains aligned
+		 * (rsp = 0) mod 16
+		 */
+		if (rtype(i.arg[0]) == RCon) {
+			assert(fn->con[i.arg[0].val].type == CBits);
+			val = fn->con[i.arg[0].val].bits.i;
+			val = (val + 15)  & ~INT64_C(15);
+			if (val < 0 || val > INT32_MAX)
+				diag("isel: alloc too large");
+			emit(OSAlloc, Kl, i.to, getcon(val, fn), R);
+		} else {
+			/* r0 = (i.arg[0] + 15) & -16 */
+			r0 = newtmp("isel", Kl, fn);
+			r1 = newtmp("isel", Kl, fn);
+			emit(OSAlloc, Kl, i.to, r0, R);
+			emit(OAnd, Kl, r0, r1, getcon(-16, fn));
+			emit(OAdd, Kl, r1, i.arg[0], getcon(15, fn));
+		}
+		break;
+	default:
+		if (isext(i.op))
+			goto case_OExt;
+		if (isload(i.op))
+			goto case_OLoad;
+		if (iscmp(i.op, &kc, &x)) {
+			if (rtype(i.arg[0]) == RCon)
+				x = icmpop(x);
+			emit(OXSet+x, k, i.to, R, R);
+			selcmp(i.arg, kc, fn);
+			break;
+		}
+		diag("isel: non-exhaustive implementation");
+	}
+
+	while (i0 > curi && --i0)
+		if (rslot(i0->arg[0], fn) != -1
+		||  rslot(i0->arg[1], fn) != -1)
+			diag("isel: usupported address argument");
+}
+
+static Ins *
+flagi(Ins *i0, Ins *i)
+{
+	while (i>i0) {
+		i--;
+		if (opdesc[i->op].sflag)
+			return i;
+		if (opdesc[i->op].lflag)
+			continue;
+		return 0;
+	}
+	return 0;
+}
+
+struct AClass {
+	int inmem;
+	int align;
+	uint size;
+	int cls[2];
+};
+
+static void
+aclass(AClass *a, Typ *t)
+{
+	int e, s, n, cls;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	a->inmem = 0;
+	for (e=0, s=0; e<2; e++) {
+		cls = -1;
+		for (n=0; n<8 && t->seg[s].len; s++) {
+			if (t->seg[s].ispad) {
+				/* don't change anything */
+			}
+			else if (t->seg[s].isflt) {
+				if (cls == -1)
+					cls = Kd;
+			}
+			else
+				cls = Kl;
+			n += t->seg[s].len;
+		}
+		assert(n <= 8);
+		a->cls[e] = cls;
+	}
+}
+
+static void
+blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn)
+{
+	Ref r, r1;
+	uint boff;
+
+	/* it's an impolite blit, we might go across the end
+	 * of the source object a little bit... */
+	for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) {
+		r = newtmp("abi", Kl, fn);
+		r1 = newtmp("abi", Kl, fn);
+		emit(OStorel, 0, R, r, r1);
+		emit(OAdd, Kl, r1, rstk, getcon(soff, fn));
+		r1 = newtmp("abi", Kl, fn);
+		emit(OLoad, Kl, r, r1, R);
+		emit(OAdd, Kl, r1, rsrc, getcon(boff, fn));
+		chuse(rsrc, +1, fn);
+		chuse(rstk, +1, fn);
+	}
+}
+
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int n, k, ca, nr[2];
+
+	nr[0] = nr[1] = 0;
+	ca = 0;
+	for (n=0; aret->cls[n]>=0 && n<2; n++) {
+		k = KBASE(aret->cls[n]);
+		reg[n] = TMP(retreg[k][nr[k]++]);
+		ca += 1 << (2 * k);
+	}
+	return ca;
+}
+
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == JRet0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = JRet0;
+
+	if (j == JRetc) {
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			assert(rtype(fn->retr) == RTmp);
+			emit(OCopy, Kl, TMP(RAX), fn->retr, R);
+			chuse(fn->retr, +1, fn);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, Kl, reg[1], r, R);
+				emit(OAdd, Kl, r, r0, getcon(8, fn));
+				chuse(r0, +1, fn);
+			}
+			emit(OLoad, Kl, reg[0], r0, R);
+		}
+	} else {
+		k = j - JRetw;
+		if (KBASE(k) == 0) {
+			emit(OCopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(OCopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+static void
+seljmp(Blk *b, Fn *fn)
+{
+	Ref r;
+	int c, k;
+	Ins *fi;
+
+	if (b->jmp.type == JRet0 || b->jmp.type == JJmp)
+		return;
+	assert(b->jmp.type == JJnz);
+	r = b->jmp.arg;
+	b->jmp.arg = R;
+	assert(!req(r, R));
+	if (rtype(r) == RCon) {
+		b->jmp.type = JJmp;
+		if (req(r, CON_Z))
+			b->s1 = b->s2;
+		b->s2 = 0;
+		return;
+	}
+	fi = flagi(b->ins, &b->ins[b->nins]);
+	if (fi && req(fi->to, r)) {
+		if (iscmp(fi->op, &k, &c)) {
+			if (rtype(fi->arg[0]) == RCon)
+				c = icmpop(c);
+			b->jmp.type = JXJc + c;
+			if (fn->tmp[r.val].nuse == 1) {
+				assert(fn->tmp[r.val].ndef == 1);
+				selcmp(fi->arg, k, fn);
+				*fi = (Ins){.op = ONop};
+			}
+			return;
+		}
+		if (fi->op == OAnd && fn->tmp[r.val].nuse == 1
+		&& (rtype(fi->arg[0]) == RTmp ||
+		    rtype(fi->arg[1]) == RTmp)) {
+			fi->op = OXTest;
+			fi->to = R;
+			b->jmp.type = JXJc + ICne;
+			if (rtype(fi->arg[1]) == RCon) {
+				r = fi->arg[1];
+				fi->arg[1] = fi->arg[0];
+				fi->arg[0] = r;
+			}
+			return;
+		}
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (fn->tmp[r.val].nuse == 1)
+			emit(OCopy, Kw, R, r, R);
+		b->jmp.type = JXJc + ICne;
+		return;
+	}
+	selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, add long branch if non-zero */
+	b->jmp.type = JXJc + ICne;
+}
+
+static int
+classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op == op) {
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+		} else {
+			n = i->arg[0].val & AMask;
+			aclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			ni = ns = 0;
+			for (n=0; n<2; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+		}
+	}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+int rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14
+};
+int rclob[] = {RBX, R12, R13, R14, R15};
+
+MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int));
+MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int));
+
+bits
+retregs(Ref r, int p[2])
+{
+	bits b;
+	int ni, nf;
+
+	assert(rtype(r) == RACall);
+	b = 0;
+	ni = r.val & 3;
+	nf = (r.val >> 2) & 3;
+	if (ni >= 1)
+		b |= BIT(RAX);
+	if (ni >= 2)
+		b |= BIT(RDX);
+	if (nf >= 1)
+		b |= BIT(XMM0);
+	if (nf >= 2)
+		b |= BIT(XMM1);
+	if (p) {
+		p[0] = ni;
+		p[1] = nf;
+	}
+	return b;
+}
+
+bits
+argregs(Ref r, int p[2])
+{
+	bits b;
+	int j, ni, nf;
+
+	assert(rtype(r) == RACall);
+	b = 0;
+	ni = (r.val >> 4) & 15;
+	nf = (r.val >> 8) & 15;
+	for (j=0; j<ni; j++)
+		b |= BIT(rsave[j]);
+	for (j=0; j<nf; j++)
+		b |= BIT(XMM0+j);
+	if (p) {
+		p[0] = ni + 1;
+		p[1] = nf;
+	}
+	return b | BIT(RAX);
+}
+
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	if (KBASE(ty) == 0)
+		return TMP(rsave[(*ni)++]);
+	else
+		return TMP(XMM0 + (*ns)++);
+}
+
+struct RAlloc {
+	Ins i;
+	RAlloc *link;
+};
+
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], regcp[2];
+	RAlloc *ra;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RAType);
+		aclass(&aret, &typ[i1->arg[1].val & AMask]);
+		ca = classify(i0, i1, ac, OArg, &aret);
+	} else
+		ca = classify(i0, i1, ac, OArg, 0);
+
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			assert(a->align <= 4);
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		r = getcon(-(int64_t)stk, fn);
+		emit(OSAlloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(OCopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				regcp[1] = newtmp("abi", aret.cls[1], fn);
+				emit(OStorel, 0, R, regcp[1], r);
+				emit(OAdd, Kl, r, i1->to, getcon(8, fn));
+				chuse(i1->to, +1, fn);
+				ca += 1 << (2 * KBASE(aret.cls[1]));
+			}
+			regcp[0] = newtmp("abi", aret.cls[0], fn);
+			emit(OStorel, 0, R, regcp[0], i1->to);
+			ca += 1 << (2 * KBASE(aret.cls[0]));
+			retr(reg, &aret);
+			if (aret.size > 8)
+				emit(OCopy, aret.cls[1], regcp[1], reg[1], R);
+			emit(OCopy, aret.cls[0], regcp[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		assert(NAlign == 3);
+		aret.align -= 2;
+		if (aret.align < 0)
+			aret.align = 0;
+		ra->i.op = OAlloc + aret.align;
+		ra->i.cls = Kl;
+		ra->i.to = r1;
+		ra->i.arg[0] = getcon(aret.size, fn);
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(OCopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(OCopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	emit(OCall, i1->cls, R, i1->arg[0], CALL(ca));
+	emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OArgc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, a->cls[1], r2, r, R);
+				emit(OAdd, Kl, r, i->arg[1], getcon(8, fn));
+				chuse(i->arg[1], +1, fn);
+			}
+			emit(OLoad, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(OCopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	r = newtmp("abi", Kl, fn);
+	chuse(r, -1, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == OArgc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(OStorel, 0, R, i->arg[0], r1);
+			emit(OAdd, Kl, r1, r, getcon(off, fn));
+			chuse(r, +1, fn);
+		}
+		off += a->size;
+	}
+	emit(OSAlloc, Kl, r, getcon(stk, fn), R);
+}
+
+static void
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al;
+	Ref r, r1;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = insb;
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl};
+			fn->retr = r;
+		}
+		classify(i0, i1, ac, OPar, &aret);
+	} else
+		classify(i0, i1, ac, OPar, 0);
+
+	assert(NAlign == 3);
+
+	s = 4;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			assert(a->align <= 4);
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s; /* HACK! */
+			s += a->size / 4;
+			continue;
+		case 2:
+			*curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls};
+			s += 2;
+			continue;
+		}
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OParc) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {r1}, Kl};
+			a->cls[0] = r.val;
+			if (a->size > 8) {
+				r1 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				*curi++ = (Ins){OCopy, r, {r1}, Kl};
+				a->cls[1] = r.val;
+			}
+		} else
+			*curi++ = (Ins){OCopy, i->to, {r1}, i->cls};
+	}
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != OParc || a->inmem)
+			continue;
+		assert(NAlign == 3);
+		for (al=0; a->align >> (al+2); al++)
+			;
+		r = TMP(a->cls[0]);
+		r1 = i->to;
+		*curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl};
+		*curi++ = (Ins){OStorel, R, {r, r1}, 0};
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl};
+			r1 = TMP(a->cls[1]);
+			*curi++ = (Ins){OStorel, R, {r1, r}, 0};
+		}
+	}
+}
+
+static int
+aref(Ref r, ANum *ai)
+{
+	switch (rtype(r)) {
+	default:
+		diag("isel: aref defaulted");
+	case RCon:
+		return 2;
+	case RTmp:
+		return ai[r.val].n;
+	}
+}
+
+static int
+ascale(Ref r, Con *con)
+{
+	int64_t n;
+
+	if (rtype(r) != RCon)
+		return 0;
+	if (con[r.val].type != CBits)
+		return 0;
+	n = con[r.val].bits.i;
+	return n == 1 || n == 2 || n == 4 || n == 8;
+}
+
+static void
+anumber(ANum *ai, Blk *b, Con *con)
+{
+	/* This should be made obsolete by a proper
+	 * reassoc pass.
+	 *
+	 * Rules:
+	 *
+	 *   RTmp(_) -> 0    tmp
+	 *   ( RTmp(_) -> 1    slot )
+	 *   RCon(_) -> 2    con
+	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
+	 */
+	static char add[10][10] = {
+		[2] [2] = 2,              /* folding */
+		[2] [5] = 5, [5] [2] = 5,
+		[2] [6] = 6, [6] [2] = 6,
+		[2] [7] = 7, [7] [2] = 7,
+		[0] [0] = 4,              /* 4: b + s * i */
+		[0] [3] = 4, [3] [0] = 4,
+		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
+		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
+		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
+		[0] [5] = 7, [5] [0] = 7,
+		[6] [3] = 7, [3] [6] = 7,
+
+	};
+	int a, a1, a2, n1, n2, t1, t2;
+	Ins *i;
+
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (rtype(i->to) == RTmp)
+			ai[i->to.val].i = i;
+		if (i->op != OAdd && i->op != OMul)
+			continue;
+		a1 = aref(i->arg[0], ai);
+		a2 = aref(i->arg[1], ai);
+		t1 = a1 != 1 && a1 != 2;
+		t2 = a2 != 1 && a2 != 2;
+		if (i->op == OAdd) {
+			a = add[n1 = a1][n2 = a2];
+			if (t1 && a < add[0][a2])
+				a = add[n1 = 0][n2 = a2];
+			if (t2 && a < add[a1][0])
+				a = add[n1 = a1][n2 = 0];
+			if (t1 && t2 && a < add[0][0])
+				a = add[n1 = 0][n2 = 0];
+		} else {
+			n1 = n2 = a = 0;
+			if (ascale(i->arg[0], con) && t2)
+				a = 3, n1 = 2, n2 = 0;
+			if (t1 && ascale(i->arg[1], con))
+				a = 3, n1 = 0, n2 = 2;
+		}
+		ai[i->to.val].n = a;
+		ai[i->to.val].l = n1;
+		ai[i->to.val].r = n2;
+	}
+}
+
+static void
+amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
+{
+	Ins *i;
+	int nl, nr, t, s;
+	Ref al, ar;
+
+	if (top)
+		memset(a, 0, sizeof *a);
+	if (rtype(r) == RCon) {
+		addcon(&a->offset, &fn->con[r.val]);
+		return;
+	}
+	assert(rtype(r) == RTmp);
+	i = ai[r.val].i;
+	nl = ai[r.val].l;
+	nr = ai[r.val].r;
+	if (i) {
+		if (nl > nr) {
+			al = i->arg[1];
+			ar = i->arg[0];
+			t = nl, nl = nr, nr = t;
+		} else {
+			al = i->arg[0];
+			ar = i->arg[1];
+		}
+	}
+	switch (ai[r.val].n) {
+	default:
+		diag("isel: amatch defaulted");
+	case 3: /* s * i */
+		if (!top) {
+			a->index = al;
+			a->scale = fn->con[ar.val].bits.i;
+		} else
+			a->base = r;
+		break;
+	case 4: /* b + s * i */
+		switch (nr) {
+		case 0:
+			if (fn->tmp[ar.val].slot != -1) {
+				al = i->arg[1];
+				ar = i->arg[0];
+			}
+			a->index = ar;
+			a->scale = 1;
+			break;
+		case 3:
+			amatch(a, ar, ai, fn, 0);
+			break;
+		}
+		r = al;
+	case 0:
+		s = fn->tmp[r.val].slot;
+		if (s != -1)
+			r = SLOT(s);
+		a->base = r;
+		break;
+	case 2: /* constants */
+	case 5: /* o + s * i */
+	case 6: /* o + b */
+	case 7: /* o + b + s * i */
+		amatch(a, ar, ai, fn, 0);
+		amatch(a, al, ai, fn, 0);
+		break;
+	}
+}
+
+/* instruction selection
+ * requires use counts (as given by parsing)
+ */
+void
+isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i, *i0, *ip;
+	Phi *p;
+	uint a;
+	int n, al;
+	int64_t sz;
+	ANum *ainfo;
+	RAlloc *ral;
+
+	for (n=0; n<fn->ntmp; n++)
+		fn->tmp[n].slot = -1;
+	fn->slot = 0;
+
+	/* lower arguments */
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OPar && i->op != OParc)
+			break;
+	selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (curi - insb);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, insb, curi - insb);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower function calls and returns */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			if ((--i)->op == OCall) {
+				for (i0=i; i0>b->ins; i0--)
+					if ((i0-1)->op != OArg)
+					if ((i0-1)->op != OArgc)
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				continue;
+			}
+			assert(i->op != OArg && i->op != OArgc);
+			emiti(*i);
+		}
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After call lowering:\n");
+		printfn(fn, stderr);
+	}
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	assert(NAlign == 3 && "change n=4 and sz /= 4 below");
+	for (al=OAlloc, n=4; al<=OAlloc1; al++, n*=2)
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			if (i->op == al) {
+				if (rtype(i->arg[0]) != RCon)
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-3)
+					diag("isel: invalid alloc size");
+				sz = (sz + n-1) & -n;
+				sz /= 4;
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				*i = (Ins){.op = ONop};
+			}
+
+	/* process basic blocks */
+	n = fn->ntmp;
+	ainfo = emalloc(n * sizeof ainfo[0]);
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns];
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (a=0; p->blk[a] != b; a++)
+					assert(a+1 < p->narg);
+				fixarg(&p->arg[a], p->cls, 1, fn);
+			}
+		memset(ainfo, 0, n * sizeof ainfo[0]);
+		anumber(ainfo, b, fn->con);
+		seljmp(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			sel(*--i, ainfo, fn);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+	free(ainfo);
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/live.c b/src/live.c
new file mode 100644
index 0000000..44806e1
--- /dev/null
+++ b/src/live.c
@@ -0,0 +1,174 @@
+#include "all.h"
+
+void
+liveon(BSet *v, Blk *b, Blk *s)
+{
+	Phi *p;
+	uint a;
+
+	bscopy(v, s->in);
+	for (p=s->phi; p; p=p->link) {
+		bsclr(v, p->to.val);
+		for (a=0; a<p->narg; a++)
+			if (p->blk[a] == b)
+			if (rtype(p->arg[a]) == RTmp)
+				bsset(v, p->arg[a].val);
+	}
+}
+
+static int
+phitmp(int t, Tmp *tmp)
+{
+	int tp;
+
+	tp = tmp[t].phi;
+	return tp ? tp : t;
+}
+
+static void
+phifix(int t1, short *phi, Tmp *tmp)
+{
+	int t, t2;
+
+	/* detect temporaries arguments
+	 * of the same phi node that
+	 * interfere and separate them
+	 */
+	t = phitmp(t1, tmp);
+	t2 = phi[t];
+	if (t2 && t2 != t1) {
+		if (t != t1) {
+			tmp[t1].phi = t1;
+			t = t1;
+		} else {
+			tmp[t2].phi = t2;
+			phi[t2] = t2;
+		}
+	}
+	phi[t] = t1;
+}
+
+static void
+bset(Ref r, Blk *b, int *nlv, short *phi, Tmp *tmp)
+{
+
+	if (rtype(r) != RTmp)
+		return;
+	bsset(b->gen, r.val);
+	phifix(r.val, phi, tmp);
+	if (!bshas(b->in, r.val)) {
+		nlv[KBASE(tmp[r.val].cls)]++;
+		bsset(b->in, r.val);
+	}
+}
+
+/* liveness analysis
+ * requires rpo computation
+ */
+void
+filllive(Fn *f)
+{
+	Blk *b;
+	Ins *i;
+	int k, t, m[2], n, chg, nlv[2];
+	short *phi;
+	BSet u[1], v[1];
+	Mem *ma;
+
+	bsinit(u, f->ntmp);
+	bsinit(v, f->ntmp);
+	phi = emalloc(f->ntmp * sizeof phi[0]);
+	for (b=f->start; b; b=b->link) {
+		bsinit(b->in, f->ntmp);
+		bsinit(b->out, f->ntmp);
+		bsinit(b->gen, f->ntmp);
+	}
+	chg = 1;
+Again:
+	for (n=f->nblk-1; n>=0; n--) {
+		b = f->rpo[n];
+
+		bscopy(u, b->out);
+		if (b->s1) {
+			liveon(v, b, b->s1);
+			bsunion(b->out, v);
+		}
+		if (b->s2) {
+			liveon(v, b, b->s2);
+			bsunion(b->out, v);
+		}
+		chg |= !bsequal(b->out, u);
+
+		memset(phi, 0, f->ntmp * sizeof phi[0]);
+		memset(nlv, 0, sizeof nlv);
+		bscopy(b->in, b->out);
+		for (t=0; t<f->ntmp; t++)
+			if (bshas(b->in, t)) {
+				phifix(t, phi, f->tmp);
+				nlv[KBASE(f->tmp[t].cls)]++;
+			}
+		if (rtype(b->jmp.arg) == RACall) {
+			assert(bscount(b->in) == 0 && nlv[0] == 0 && nlv[1] == 0);
+			b->in->t[0] |= retregs(b->jmp.arg, nlv);
+		} else
+			bset(b->jmp.arg, b, nlv, phi, f->tmp);
+		for (k=0; k<2; k++)
+			b->nlive[k] = nlv[k];
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			if ((--i)->op == OCall && rtype(i->arg[1]) == RACall) {
+				b->in->t[0] &= ~retregs(i->arg[1], m);
+				for (k=0; k<2; k++)
+					nlv[k] -= m[k];
+				if (nlv[0] + NISave > b->nlive[0])
+					b->nlive[0] = nlv[0] + NISave;
+				if (nlv[1] + NFSave > b->nlive[1])
+					b->nlive[1] = nlv[1] + NFSave;
+				b->in->t[0] |= argregs(i->arg[1], m);
+				for (k=0; k<2; k++)
+					nlv[k] += m[k];
+			}
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				if (bshas(b->in, i->to.val))
+					nlv[KBASE(f->tmp[t].cls)]--;
+				bsset(b->gen, t);
+				bsclr(b->in, t);
+				phi[phitmp(t, f->tmp)] = 0;
+			}
+			for (k=0; k<2; k++)
+				switch (rtype(i->arg[k])) {
+				case RAMem:
+					ma = &f->mem[i->arg[k].val & AMask];
+					bset(ma->base, b, nlv, phi, f->tmp);
+					bset(ma->index, b, nlv, phi, f->tmp);
+					break;
+				default:
+					bset(i->arg[k], b, nlv, phi, f->tmp);
+					break;
+				}
+			for (k=0; k<2; k++)
+				if (nlv[k] > b->nlive[k])
+					b->nlive[k] = nlv[k];
+		}
+	}
+	if (chg) {
+		chg = 0;
+		goto Again;
+	}
+	free(phi);
+
+	if (debug['L']) {
+		fprintf(stderr, "\n> Liveness analysis:\n");
+		for (b=f->start; b; b=b->link) {
+			fprintf(stderr, "\t%-10sin:   ", b->name);
+			dumpts(b->in, f->tmp, stderr);
+			fprintf(stderr, "\t          out:  ");
+			dumpts(b->out, f->tmp, stderr);
+			fprintf(stderr, "\t          gen:  ");
+			dumpts(b->gen, f->tmp, stderr);
+			fprintf(stderr, "\t          live: ");
+			fprintf(stderr, "%d %d\n", b->nlive[0], b->nlive[1]);
+		}
+	}
+}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..b8cd7d6
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,117 @@
+#include "all.h"
+#include <ctype.h>
+#include <getopt.h>
+
+char debug['Z'+1] = {
+	['P'] = 0, /* parsing */
+	['A'] = 0, /* abi lowering */
+	['I'] = 0, /* instruction selection */
+	['L'] = 0, /* liveness */
+	['M'] = 0, /* memory optimization */
+	['N'] = 0, /* ssa construction */
+	['C'] = 0, /* copy elimination */
+	['S'] = 0, /* spilling */
+	['R'] = 0, /* reg. allocation */
+};
+
+static FILE *outf;
+static int dbg;
+
+static void
+data(Dat *d)
+{
+	if (dbg)
+		return;
+	if (d->type == DEnd) {
+		fputs("/* end data */\n\n", outf);
+		freeall();
+	}
+	emitdat(d, outf);
+}
+
+static void
+func(Fn *fn)
+{
+	int n;
+
+	if (dbg)
+		fprintf(stderr, "**** Function %s ****", fn->name);
+	if (debug['P']) {
+		fprintf(stderr, "\n> After parsing:\n");
+		printfn(fn, stderr);
+	}
+	fillrpo(fn);
+	fillpreds(fn);
+	filluse(fn);
+	memopt(fn);
+	ssa(fn);
+	filluse(fn);
+	copy(fn);
+	filluse(fn);
+	isel(fn);
+	filllive(fn);
+	fillcost(fn);
+	spill(fn);
+	rega(fn);
+	fillrpo(fn);
+	assert(fn->rpo[0] == fn->start);
+	for (n=0;; n++)
+		if (n == fn->nblk-1) {
+			fn->rpo[n]->link = 0;
+			break;
+		} else
+			fn->rpo[n]->link = fn->rpo[n+1];
+	if (!dbg) {
+		emitfn(fn, outf);
+		fprintf(outf, "/* end function %s */\n\n", fn->name);
+	} else
+		fprintf(stderr, "\n");
+	freeall();
+}
+
+int
+main(int ac, char *av[])
+{
+	FILE *inf;
+	char *f;
+	int c;
+
+	outf = stdout;
+	while ((c = getopt(ac, av, "d:o:")) != -1)
+		switch (c) {
+		case 'd':
+			for (; *optarg; optarg++)
+				if (isalpha(*optarg)) {
+					debug[toupper(*optarg)] = 1;
+					dbg = 1;
+				}
+			break;
+		case 'o':
+			if (strcmp(optarg, "-") != 0)
+				outf = fopen(optarg, "w");
+			break;
+		default:
+			fprintf(stderr, "usage: %s [-d <flags>] [-o out] {file.ssa, -}\n", av[0]);
+			exit(1);
+		}
+
+	do {
+		f = av[optind];
+		if (!f || strcmp(f, "-") == 0) {
+			inf = stdin;
+			f = "-";
+		} else {
+			inf = fopen(f, "r");
+			if (!inf) {
+				fprintf(stderr, "cannot open '%s'\n", f);
+				exit(1);
+			}
+		}
+		parse(inf, f, data, func);
+	} while (++optind < ac);
+
+	if (!dbg)
+		emitfin(outf);
+
+	exit(0);
+}
diff --git a/src/mem.c b/src/mem.c
new file mode 100644
index 0000000..bda43d7
--- /dev/null
+++ b/src/mem.c
@@ -0,0 +1,81 @@
+#include "all.h"
+
+/* Memory optimization:
+ *
+ * - replace alloced slots used only in
+ *   load/store operations
+ *   Assumption: all the accesses have the
+ *   same size (this could be wrong...)
+ */
+
+/* require use, maintains use counts */
+void
+memopt(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *l;
+	Tmp *t;
+	Use *u, *ue;
+	int a;
+
+	b = fn->start;
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (OAlloc > i->op || i->op > OAlloc1)
+			continue;
+		assert(NAlign == 3);
+		assert(rtype(i->to) == RTmp);
+		t = &fn->tmp[i->to.val];
+		for (u=t->use; u != &t->use[t->nuse]; u++) {
+			if (u->type != UIns)
+				goto NextIns;
+			l = u->u.ins;
+			if (!isload(l->op)
+			&& (!isstore(l->op) || req(i->to, l->arg[0])))
+				goto NextIns;
+		}
+		/* get rid of the alloc and replace uses */
+		*i = (Ins){.op = ONop};
+		t->ndef--;
+		ue = &t->use[t->nuse];
+		for (u=t->use; u!=ue; u++) {
+			l = u->u.ins;
+			if (isstore(l->op)) {
+				if (l->op == OStores)
+					l->cls = Kd;
+				else if (l->op == OStored)
+					l->cls = Kd;
+				else if (l->op == OStorel)
+					l->cls = Kl;
+				else
+					l->cls = Kw;
+				l->op = OCopy;
+				l->to = l->arg[1];
+				l->arg[1] = R;
+				t->nuse--;
+				t->ndef++;
+			} else
+				/* try to turn loads into copies so we
+				 * can eliminate them later */
+				switch(l->op) {
+				case OLoad:
+					l->op = OCopy;
+					break;
+				case OLoadsw:
+				case OLoaduw:
+					l->cls = Kw;
+					l->op = OCopy;
+					break;
+				default:
+					/* keep l->cls */
+					a = l->op - OLoadsw;
+					l->op = OExtsw + a;
+					break;
+				}
+		}
+	NextIns:;
+	}
+	if (debug['M']) {
+		fprintf(stderr, "\n> After memory optimization:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/parse.c b/src/parse.c
new file mode 100644
index 0000000..903e909
--- /dev/null
+++ b/src/parse.c
@@ -0,0 +1,1081 @@
+#include "all.h"
+#include <ctype.h>
+#include <stdarg.h>
+
+enum {
+	Kx = -1, /* Invalid operand */
+	Km = Kl, /* Memory pointer (for x64) */
+};
+
+OpDesc opdesc[NOp] = {
+#define A(a,b,c,d) {[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}
+
+	/*            NAME       NM      ARGCLS0     ARGCLS1  SF LF */
+	[OAdd]    = { "add",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OSub]    = { "sub",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[ODiv]    = { "div",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[ORem]    = { "rem",      2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
+	[OUDiv]   = { "udiv",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OURem]   = { "urem",     2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
+	[OMul]    = { "mul",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OAnd]    = { "and",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OOr]     = { "or",       2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OXor]    = { "xor",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OSar]    = { "sar",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OShr]    = { "shr",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OShl]    = { "shl",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OStored] = { "stored",   0, {A(d,d,d,d), A(m,m,m,m)}, 0, 1 },
+	[OStores] = { "stores",   0, {A(s,s,s,s), A(m,m,m,m)}, 0, 1 },
+	[OStorel] = { "storel",   0, {A(l,l,l,l), A(m,m,m,m)}, 0, 1 },
+	[OStorew] = { "storew",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OStoreh] = { "storeh",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OStoreb] = { "storeb",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OLoad]   = { "load",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 1 },
+	[OLoadsw] = { "loadsw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoaduw] = { "loaduw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadsh] = { "loadsh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoaduh] = { "loaduh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadsb] = { "loadsb",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadub] = { "loadub",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OExtsw]  = { "extsw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtuw]  = { "extuw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtsh]  = { "extsh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtuh]  = { "extuh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtsb]  = { "extsb",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtub]  = { "extub",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExts]   = { "exts",     0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OTruncd] = { "truncd",   0, {A(d,d,d,d), A(x,x,x,x)}, 0, 1 },
+	[OFtosi]  = { "ftosi",    0, {A(s,d,x,x), A(x,x,x,x)}, 0, 1 },
+	[OSitof]  = { "sitof",    0, {A(x,x,w,l), A(x,x,x,x)}, 0, 1 },
+	[OCast]   = { "cast",     0, {A(s,d,w,l), A(x,x,x,x)}, 0, 1 },
+	[OCopy]   = { "copy",     1, {A(w,l,s,d), A(x,x,x,x)}, 0, 1 },
+	[ONop]    = { "nop",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
+	[OSwap]   = { "swap",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OSign]   = { "sign",     0, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OSAlloc] = { "salloc",   0, {A(x,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXDiv]   = { "xdiv",     1, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXCmp]   = { "xcmp",     1, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OXTest]  = { "xtest",    1, {A(w,l,x,x), A(w,l,x,x)}, 1, 0 },
+	[OAddr]   = { "addr",     0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OPar]    = { "parn",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OParc]   = { "parc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OArg]    = { "arg",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OArgc]   = { "argc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OCall]   = { "call",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 0 },
+	[OXSetnp] = { "xsetnp",   0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXSetp]  = { "xsetp",    0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OAlloc]   = { "alloc4",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+	[OAlloc+1] = { "alloc8",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+	[OAlloc+2] = { "alloc16", 1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+#define X(c) \
+	[OCmpw+IC##c] = { "c"    #c "w", 0, {A(w,w,x,x), A(w,w,x,x)}, 1, 0 }, \
+	[OCmpl+IC##c] = { "c"    #c "l", 0, {A(l,l,x,x), A(l,l,x,x)}, 1, 0 }, \
+	[OXSet+IC##c] = { "xset" #c,     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
+	ICMPS(X)
+#undef X
+#define X(c) \
+	[OCmps+FC##c] = { "c"    #c "s", 0, {A(s,s,x,x), A(s,s,x,x)}, 1, 0 }, \
+	[OCmpd+FC##c] = { "c"    #c "d", 0, {A(d,d,x,x), A(d,d,x,x)}, 1, 0 },
+	FCMPS(X)
+#undef X
+
+};
+#undef A
+
+typedef enum {
+	PXXX,
+	PLbl,
+	PPhi,
+	PIns,
+	PEnd,
+} PState;
+
+enum {
+	TXXX = NPubOp,
+	TCall,
+	TPhi,
+	TJmp,
+	TJnz,
+	TRet,
+	TFunc,
+	TType,
+	TData,
+	TAlign,
+	TL,
+	TW,
+	TH,
+	TB,
+	TD,
+	TS,
+	TZ,
+
+	TInt,
+	TFlts,
+	TFltd,
+	TTmp,
+	TLbl,
+	TGlo,
+	TTyp,
+	TStr,
+
+	TPlus,
+	TEq,
+	TComma,
+	TLParen,
+	TRParen,
+	TLBrace,
+	TRBrace,
+	TNL,
+	TEOF,
+};
+
+
+static FILE *inf;
+static char *inpath;
+static int thead;
+static struct {
+	char chr;
+	double fltd;
+	float flts;
+	int64_t num;
+	char *str;
+} tokval;
+static int lnum;
+
+static Tmp *tmp;
+static Con *con;
+static int ntmp;
+static int ncon;
+static Phi **plink;
+static Blk **bmap;
+static Blk *curb;
+static Blk **blink;
+static int nblk;
+static int rcls;
+static int ntyp;
+
+
+void
+err(char *s, ...)
+{
+	char buf[100], *p, *end;
+	va_list ap;
+
+	p = buf;
+	end = buf + sizeof(buf);
+
+	va_start(ap, s);
+	p += snprintf(p, end - p, "%s:%d: ", inpath, lnum);
+	p += vsnprintf(p, end - p, s, ap);
+	va_end(ap);
+
+	diag(buf);
+}
+
+static int
+lex()
+{
+	static struct {
+		char *str;
+		int tok;
+	} tmap[] = {
+		{ "call", TCall },
+		{ "phi", TPhi },
+		{ "jmp", TJmp },
+		{ "jnz", TJnz },
+		{ "ret", TRet },
+		{ "function", TFunc },
+		{ "type", TType },
+		{ "data", TData },
+		{ "align", TAlign },
+		{ "l", TL },
+		{ "w", TW },
+		{ "h", TH },
+		{ "b", TB },
+		{ "d", TD },
+		{ "s", TS },
+		{ "z", TZ },
+		{ "loadw", OLoad }, /* for convenience */
+		{ "loadl", OLoad },
+		{ "loads", OLoad },
+		{ "loadd", OLoad },
+		{ "alloc1", OAlloc },
+		{ "alloc2", OAlloc },
+		{ 0, TXXX }
+	};
+	static char tok[NString];
+	int c, i;
+	int t;
+
+	do
+		c = fgetc(inf);
+	while (isblank(c));
+	t = TXXX;
+	tokval.chr = c;
+	switch (c) {
+	case EOF:
+		return TEOF;
+	case ',':
+		return TComma;
+	case '(':
+		return TLParen;
+	case ')':
+		return TRParen;
+	case '{':
+		return TLBrace;
+	case '}':
+		return TRBrace;
+	case '=':
+		return TEq;
+	case '+':
+		return TPlus;
+	case 's':
+		if (fscanf(inf, "_%f", &tokval.flts) != 1)
+			break;
+		return TFlts;
+	case 'd':
+		if (fscanf(inf, "_%lf", &tokval.fltd) != 1)
+			break;
+		return TFltd;
+	case '%':
+		t = TTmp;
+		goto Alpha;
+	case '@':
+		t = TLbl;
+		goto Alpha;
+	case '$':
+		t = TGlo;
+		goto Alpha;
+	case ':':
+		t = TTyp;
+		goto Alpha;
+	case '#':
+		while (fgetc(inf) != '\n')
+			;
+	case '\n':
+		lnum++;
+		return TNL;
+	}
+	if (isdigit(c) || c == '-' || c == '+') {
+		ungetc(c, inf);
+		if (fscanf(inf, "%"SCNd64, &tokval.num) != 1)
+			err("invalid integer literal");
+		return TInt;
+	}
+	if (c == '"') {
+		tokval.str = vnew(0, 1);
+		for (i=0;; i++) {
+			c = fgetc(inf);
+			vgrow(&tokval.str, i+1);
+			if (c == '"')
+			if (!i || tokval.str[i-1] != '\\') {
+				tokval.str[i] = 0;
+				return TStr;
+			}
+			tokval.str[i] = c;
+		}
+	}
+	if (0)
+Alpha:		c = fgetc(inf);
+	if (!isalpha(c) && c != '.' && c != '_')
+		err("lexing failure: invalid character %c (%d)", c, c);
+	i = 0;
+	do {
+		if (i >= NString-1)
+			err("identifier too long");
+		tok[i++] = c;
+		c = fgetc(inf);
+	} while (isalpha(c) || c == '$' || c == '.' || c == '_' || isdigit(c));
+	tok[i] = 0;
+	ungetc(c, inf);
+	tokval.str = tok;
+	if (t != TXXX) {
+		return t;
+	}
+	for (i=0; i<NPubOp; i++)
+		if (opdesc[i].name)
+		if (strcmp(tok, opdesc[i].name) == 0)
+			return i;
+	for (i=0; tmap[i].str; i++)
+		if (strcmp(tok, tmap[i].str) == 0)
+			return tmap[i].tok;
+	err("unknown keyword %s", tokval.str);
+	return TXXX;
+}
+
+static int
+peek()
+{
+	if (thead == TXXX)
+		thead = lex();
+	return thead;
+}
+
+static int
+next()
+{
+	int t;
+
+	t = peek();
+	thead = TXXX;
+	return t;
+}
+
+static int
+nextnl()
+{
+	int t;
+
+	while ((t = next()) == TNL)
+		;
+	return t;
+}
+
+static void
+expect(int t)
+{
+	static char *ttoa[] = {
+		[TLbl] = "label",
+		[TComma] = ",",
+		[TEq] = "=",
+		[TNL] = "newline",
+		[TLParen] = "(",
+		[TRParen] = ")",
+		[TLBrace] = "{",
+		[TRBrace] = "}",
+		[TEOF] = 0,
+	};
+	char buf[128], *s1, *s2;
+	int t1;
+
+	t1 = next();
+	if (t == t1)
+		return;
+	s1 = ttoa[t] ? ttoa[t] : "??";
+	s2 = ttoa[t1] ? ttoa[t1] : "??";
+	sprintf(buf, "%s expected, got %s instead", s1, s2);
+	err(buf);
+}
+
+static Ref
+tmpref(char *v)
+{
+	int t;
+
+	for (t=Tmp0; t<ntmp; t++)
+		if (strcmp(v, tmp[t].name) == 0)
+			return TMP(t);
+	vgrow(&tmp, ++ntmp);
+	strcpy(tmp[t].name, v);
+	return TMP(t);
+}
+
+static Ref
+parseref()
+{
+	Con c;
+	int i;
+
+	memset(&c, 0, sizeof c);
+	switch (next()) {
+	case TTmp:
+		return tmpref(tokval.str);
+	case TInt:
+		c.type = CBits;
+		c.bits.i = tokval.num;
+		goto Look;
+	case TFlts:
+		c.type = CBits;
+		c.bits.s = tokval.flts;
+		c.flt = 1;
+		goto Look;
+	case TFltd:
+		c.type = CBits;
+		c.bits.d = tokval.fltd;
+		c.flt = 2;
+		goto Look;
+	case TGlo:
+		c.type = CAddr;
+		strcpy(c.label, tokval.str);
+	Look:
+		for (i=0; i<ncon; i++)
+			if (con[i].type == c.type
+			&& con[i].bits.i == c.bits.i
+			&& strcmp(con[i].label, c.label) == 0)
+				return CON(i);
+		vgrow(&con, ++ncon);
+		con[i] = c;
+		return CON(i);
+	default:
+		return R;
+	}
+}
+
+static int
+parsecls(int *tyn)
+{
+	int i;
+
+	switch (next()) {
+	default:
+		err("invalid class specifier");
+	case TTyp:
+		for (i=0; i<ntyp; i++)
+			if (strcmp(tokval.str, typ[i].name) == 0) {
+				*tyn = i;
+				return 4;
+			}
+		err("undefined type");
+	case TW:
+		return Kw;
+	case TL:
+		return Kl;
+	case TS:
+		return Ks;
+	case TD:
+		return Kd;
+	}
+}
+
+static void
+parserefl(int arg)
+{
+	int k, t, ty;
+	Ref r;
+
+	expect(TLParen);
+	if (peek() == TRParen) {
+		next();
+		return;
+	}
+	for (;;) {
+		if (curi - insb >= NIns)
+			err("too many instructions (1)");
+		k = parsecls(&ty);
+		r = parseref();
+		if (req(r, R))
+			err("invalid reference argument");
+		if (!arg && rtype(r) != RTmp)
+			err("invalid function parameter");
+		if (k == 4)
+			if (arg)
+				*curi = (Ins){OArgc, R, {TYPE(ty), r}, Kl};
+			else
+				*curi = (Ins){OParc, r, {TYPE(ty)}, Kl};
+		else
+			if (arg)
+				*curi = (Ins){OArg, R, {r}, k};
+			else
+				*curi = (Ins){OPar, r, {R}, k};
+		curi++;
+		t = next();
+		if (t == TRParen)
+			break;
+		if (t != TComma)
+			err(", or ) expected");
+	}
+}
+
+static Blk *
+findblk(char *name)
+{
+	int i;
+
+	for (i=0; i<nblk; i++)
+		if (strcmp(bmap[i]->name, name) == 0)
+			return bmap[i];
+	vgrow(&bmap, ++nblk);
+	bmap[i] = blknew();
+	strcpy(bmap[i]->name, name);
+	return bmap[i];
+}
+
+static void
+closeblk()
+{
+	curb->nins = curi - insb;
+	idup(&curb->ins, insb, curb->nins);
+	blink = &curb->link;
+	curi = insb;
+}
+
+static PState
+parseline(PState ps)
+{
+	Ref arg[NPred] = {R};
+	Blk *blk[NPred];
+	Phi *phi;
+	Ref r;
+	Blk *b;
+	int t, op, i, k, ty;
+
+	t = nextnl();
+	if (ps == PLbl && t != TLbl && t != TRBrace)
+		err("label or } expected");
+	switch (t) {
+	default:
+		if (isstore(t)) {
+			/* operations without result */
+			r = R;
+			k = 0;
+			op = t;
+			goto DoOp;
+		}
+		err("label, instruction or jump expected");
+	case TRBrace:
+		return PEnd;
+	case TTmp:
+		break;
+	case TLbl:
+		b = findblk(tokval.str);
+		if (b->jmp.type != JXXX)
+			err("multiple definitions of block");
+		if (curb && curb->jmp.type == JXXX) {
+			closeblk();
+			curb->jmp.type = JJmp;
+			curb->s1 = b;
+		}
+		*blink = b;
+		curb = b;
+		plink = &curb->phi;
+		expect(TNL);
+		return PPhi;
+	case TRet:
+		curb->jmp.type = (int[]){
+			JRetw, JRetl,
+			JRets, JRetd,
+			JRetc, JRet0
+		}[rcls];
+		if (rcls < 5) {
+			r = parseref();
+			if (req(r, R))
+				err("return value expected");
+			curb->jmp.arg = r;
+		}
+		goto Close;
+	case TJmp:
+		curb->jmp.type = JJmp;
+		goto Jump;
+	case TJnz:
+		curb->jmp.type = JJnz;
+		r = parseref();
+		if (req(r, R))
+			err("invalid argument for jnz jump");
+		curb->jmp.arg = r;
+		expect(TComma);
+	Jump:
+		expect(TLbl);
+		curb->s1 = findblk(tokval.str);
+		if (curb->jmp.type != JJmp) {
+			expect(TComma);
+			expect(TLbl);
+			curb->s2 = findblk(tokval.str);
+		}
+	Close:
+		expect(TNL);
+		closeblk();
+		return PLbl;
+	}
+	r = tmpref(tokval.str);
+	expect(TEq);
+	k = parsecls(&ty);
+	op = next();
+DoOp:
+	if (op == TPhi) {
+		if (ps != PPhi)
+			err("unexpected phi instruction");
+		op = -1;
+	}
+	if (op == TCall) {
+		arg[0] = parseref();
+		parserefl(1);
+		expect(TNL);
+		op = OCall;
+		if (k == 4) {
+			k = Kl;
+			arg[1] = TYPE(ty);
+		} else
+			arg[1] = R;
+		goto Ins;
+	}
+	if (k == 4)
+		err("size class must be w, l, s, or d");
+	if (op >= NPubOp)
+		err("invalid instruction");
+	i = 0;
+	if (peek() != TNL)
+		for (;;) {
+			if (i == NPred)
+				err("too many arguments");
+			if (op == -1) {
+				expect(TLbl);
+				blk[i] = findblk(tokval.str);
+			}
+			arg[i] = parseref();
+			if (req(arg[i], R))
+				err("invalid instruction argument");
+			i++;
+			t = peek();
+			if (t == TNL)
+				break;
+			if (t != TComma)
+				err(", or end of line expected");
+			next();
+		}
+	next();
+	if (op != -1) {
+	Ins:
+		if (curi - insb >= NIns)
+			err("too many instructions (2)");
+		curi->op = op;
+		curi->cls = k;
+		curi->to = r;
+		curi->arg[0] = arg[0];
+		curi->arg[1] = arg[1];
+		curi++;
+		return PIns;
+	} else {
+		phi = alloc(sizeof *phi);
+		phi->to = r;
+		phi->cls = k;
+		memcpy(phi->arg, arg, i * sizeof arg[0]);
+		memcpy(phi->blk, blk, i * sizeof blk[0]);
+		phi->narg = i;
+		*plink = phi;
+		plink = &phi->link;
+		return PPhi;
+	}
+}
+
+static Fn *
+parsefn()
+{
+	PState ps;
+	Fn *fn;
+
+	ntmp = Tmp0;
+	ncon = 1; /* first constant must be 0 */
+	curb = 0;
+	nblk = 0;
+	curi = insb;
+	tmp = vnew(ntmp, sizeof tmp[0]);
+	con = vnew(ncon, sizeof con[0]);
+	bmap = vnew(nblk, sizeof bmap[0]);
+	con[0].type = CBits;
+	fn = alloc(sizeof *fn);
+	blink = &fn->start;
+	fn->retty = -1;
+	if (peek() != TGlo)
+		rcls = parsecls(&fn->retty);
+	else
+		rcls = 5;
+	if (next() != TGlo)
+		err("function name expected");
+	strcpy(fn->name, tokval.str);
+	parserefl(0);
+	if (nextnl() != TLBrace)
+		err("function body must start with {");
+	ps = PLbl;
+	do
+		ps = parseline(ps);
+	while (ps != PEnd);
+	if (!curb)
+		err("empty file");
+	if (curb->jmp.type == JXXX)
+		err("last block misses jump");
+	fn->tmp = tmp;
+	fn->con = con;
+	fn->mem = vnew(0, sizeof fn->mem[0]);
+	fn->ntmp = ntmp;
+	fn->ncon = ncon;
+	fn->nmem = 0;
+	fn->nblk = nblk;
+	fn->rpo = 0;
+	return fn;
+}
+
+static void
+parsetyp()
+{
+	Typ *ty;
+	int t, n, sz, al, s, a, c, flt;
+
+	if (ntyp >= NTyp)
+		err("too many type definitions");
+	ty = &typ[ntyp++];
+	ty->align = -1;
+	if (nextnl() != TTyp ||  nextnl() != TEq)
+		err("type name, then = expected");
+	strcpy(ty->name, tokval.str);
+	t = nextnl();
+	if (t == TAlign) {
+		if (nextnl() != TInt)
+			err("alignment expected");
+		for (al=0; tokval.num /= 2; al++)
+			;
+		ty->align = al;
+		t = nextnl();
+	}
+	if (t != TLBrace)
+		err("type body must start with {");
+	t = nextnl();
+	if (t == TInt) {
+		ty->dark = 1;
+		ty->size = tokval.num;
+		if (ty->align == -1)
+			err("dark types need alignment");
+		t = nextnl();
+	} else {
+		ty->dark = 0;
+		n = -1;
+		sz = 0;
+		al = 0;
+		for (;;) {
+			flt = 0;
+			switch (t) {
+			default: err("invalid size specifier %c", tokval.chr);
+			case TD: flt = 1;
+			case TL: s = 8; a = 3; break;
+			case TS: flt = 1;
+			case TW: s = 4; a = 2; break;
+			case TH: s = 2; a = 1; break;
+			case TB: s = 1; a = 0; break;
+			}
+			if (a > al)
+				al = a;
+			if ((a = sz & (s-1))) {
+				a = s - a;
+				if (++n < NSeg) {
+					/* padding segment */
+					ty->seg[n].ispad = 1;
+					ty->seg[n].len = a;
+				}
+			}
+			t = nextnl();
+			if (t == TInt) {
+				c = tokval.num;
+				t = nextnl();
+			} else
+				c = 1;
+			while (c-- > 0) {
+				if (++n < NSeg) {
+					ty->seg[n].isflt = flt;
+					ty->seg[n].ispad = 0;
+					ty->seg[n].len = s;
+				}
+				sz += a + s;
+			}
+			if (t != TComma)
+				break;
+			t = nextnl();
+		}
+		if (++n >= NSeg)
+			ty->dark = 1;
+		else
+			ty->seg[n].len = 0;
+		if (ty->align == -1)
+			ty->align = al;
+		else
+			al = ty->align;
+		a = (1 << al) - 1;
+		ty->size = (sz + a) & ~a;
+	}
+	if (t != TRBrace)
+		err("expected closing }");
+}
+
+static void
+parsedatref(Dat *d)
+{
+	int t;
+
+	d->isref = 1;
+	d->u.ref.nam = tokval.str;
+	d->u.ref.off = 0;
+	t = peek();
+	if (t == TPlus) {
+		next();
+		if (next() != TInt)
+			err("invalid token after offset in ref");
+		d->u.ref.off = tokval.num;
+	}
+}
+
+static void
+parsedatstr(Dat *d)
+{
+	d->isstr = 1;
+	d->u.str = tokval.str;
+}
+
+static void
+parsedat(void cb(Dat *))
+{
+	char s[NString];
+	int t;
+	Dat d;
+
+	d.type = DStart;
+	d.isstr = 0;
+	d.isref = 0;
+	cb(&d);
+	if (nextnl() != TGlo || nextnl() != TEq)
+		err("data name, then = expected");
+	strcpy(s, tokval.str);
+	t = nextnl();
+	if (t == TAlign) {
+		if (nextnl() != TInt)
+			err("alignment expected");
+		d.type = DAlign;
+		d.u.num = tokval.num;
+		cb(&d);
+		t = nextnl();
+	}
+	d.type = DName;
+	d.u.str = s;
+	cb(&d);
+
+	if (t != TLBrace)
+		err("expected data contents in { .. }");
+	for (;;) {
+		switch (nextnl()) {
+		default: err("invalid size specifier %c in data", tokval.chr);
+		case TRBrace: goto Done;
+		case TL: d.type = DL; break;
+		case TW: d.type = DW; break;
+		case TH: d.type = DH; break;
+		case TB: d.type = DB; break;
+		case TS: d.type = DW; break;
+		case TD: d.type = DL; break;
+		case TZ: d.type = DZ; break;
+		}
+		t = nextnl();
+		do {
+			d.isref = 0;
+			d.isstr = 0;
+			memset(&d.u, 0, sizeof d.u);
+			if (t == TFlts)
+				d.u.flts = tokval.flts;
+			else if (t == TFltd)
+				d.u.fltd = tokval.fltd;
+			else if (t == TInt)
+				d.u.num = tokval.num;
+			else if (t == TGlo)
+				parsedatref(&d);
+			else if (t == TStr)
+				parsedatstr(&d);
+			else
+				err("constant literal expected");
+			cb(&d);
+			t = nextnl();
+		} while (t == TInt || t == TFlts || t == TFltd);
+		if (t == TRBrace)
+			break;
+		if (t != TComma)
+			err(", or } expected");
+	}
+Done:
+	d.type = DEnd;
+	cb(&d);
+}
+
+void
+parse(FILE *f, char *path, void data(Dat *), void func(Fn *))
+{
+	inf = f;
+	inpath = path;
+	lnum = 1;
+	thead = TXXX;
+	ntyp = 0;
+	for (;;)
+		switch (nextnl()) {
+		case TFunc:
+			func(parsefn());
+			break;
+		case TType:
+			parsetyp();
+			break;
+		case TData:
+			parsedat(data);
+			break;
+		case TEOF:
+			return;
+		default:
+			err("top-level definition expected");
+			break;
+		}
+}
+
+static void
+printcon(Con *c, FILE *f)
+{
+	switch (c->type) {
+	case CUndef:
+		break;
+	case CAddr:
+		fprintf(f, "$%s", c->label);
+		if (c->bits.i)
+			fprintf(f, "%+"PRIi64, c->bits.i);
+		break;
+	case CBits:
+		if (c->flt == 1)
+			fprintf(f, "s_%f", c->bits.s);
+		else if (c->flt == 2)
+			fprintf(f, "d_%lf", c->bits.d);
+		else
+			fprintf(f, "%"PRIi64, c->bits.i);
+		break;
+	}
+}
+
+void
+printref(Ref r, Fn *fn, FILE *f)
+{
+	int i;
+	Mem *m;
+
+	switch (rtype(r)) {
+	case RTmp:
+		if (r.val < Tmp0)
+			fprintf(f, "R%d", r.val);
+		else
+			fprintf(f, "%%%s", fn->tmp[r.val].name);
+		break;
+	case RCon:
+		printcon(&fn->con[r.val], f);
+		break;
+	case RSlot:
+		fprintf(f, "S%d", r.val);
+		break;
+	case RACall:
+		fprintf(f, "%03x", r.val & AMask);
+		break;
+	case RAType:
+		fprintf(f, ":%s", typ[r.val & AMask].name);
+		break;
+	case RAMem:
+		i = 0;
+		m = &fn->mem[r.val & AMask];
+		fputc('[', f);
+		if (m->offset.type != CUndef) {
+			printcon(&m->offset, f);
+			i = 1;
+		}
+		if (!req(m->base, R)) {
+			if (i)
+				fprintf(f, " + ");
+			printref(m->base, fn, f);
+			i = 1;
+		}
+		if (!req(m->index, R)) {
+			if (i)
+				fprintf(f, " + ");
+			fprintf(f, "%d * ", m->scale);
+			printref(m->index, fn, f);
+		}
+		fputc(']', f);
+		break;
+	}
+}
+
+void
+printfn(Fn *fn, FILE *f)
+{
+	static char *jtoa[NJmp] = {
+		[JRet0]     = "ret",
+		[JRetw]     = "retw",
+		[JRetl]     = "retl",
+		[JRetc]     = "retc",
+		[JRets]     = "rets",
+		[JRetd]     = "retd",
+		[JJnz]      = "jnz",
+		[JXJnp]     = "xjnp",
+		[JXJp]      = "xjp",
+	#define X(c) [JXJc+IC##c] = "xj" #c,
+		ICMPS(X)
+	#undef X
+	};
+	static char prcls[NOp] = {
+		[OArg] = 1,
+		[OSwap] = 1,
+		[OXCmp] = 1,
+		[OXTest] = 1,
+		[OXDiv] = 1,
+		[OXIDiv] = 1,
+	};
+	static char ktoc[] = "wlsd";
+	Blk *b;
+	Phi *p;
+	Ins *i;
+	uint n;
+
+	fprintf(f, "function $%s() {\n", fn->name);
+	for (b=fn->start; b; b=b->link) {
+		fprintf(f, "@%s\n", b->name);
+		for (p=b->phi; p; p=p->link) {
+			fprintf(f, "\t");
+			printref(p->to, fn, f);
+			fprintf(f, " =%c phi ", ktoc[p->cls]);
+			assert(p->narg);
+			for (n=0;; n++) {
+				fprintf(f, "@%s ", p->blk[n]->name);
+				printref(p->arg[n], fn, f);
+				if (n == p->narg-1) {
+					fprintf(f, "\n");
+					break;
+				} else
+					fprintf(f, ", ");
+			}
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			fprintf(f, "\t");
+			if (!req(i->to, R)) {
+				printref(i->to, fn, f);
+				fprintf(f, " =%c ", ktoc[i->cls]);
+			}
+			assert(opdesc[i->op].name);
+			fprintf(f, "%s", opdesc[i->op].name);
+			if (req(i->to, R) && prcls[i->op])
+				fputc(ktoc[i->cls], f);
+			if (!req(i->arg[0], R)) {
+				fprintf(f, " ");
+				printref(i->arg[0], fn, f);
+			}
+			if (!req(i->arg[1], R)) {
+				fprintf(f, ", ");
+				printref(i->arg[1], fn, f);
+			}
+			fprintf(f, "\n");
+		}
+		switch (b->jmp.type) {
+		case JRet0:
+		case JRetw:
+		case JRetl:
+		case JRets:
+		case JRetd:
+		case JRetc:
+			fprintf(f, "\t%s", jtoa[b->jmp.type]);
+			if (b->jmp.type != JRet0 || !req(b->jmp.arg, R)) {
+				fprintf(f, " ");
+				printref(b->jmp.arg, fn, f);
+			}
+			if (b->jmp.type == JRetc)
+				fprintf(f, ", :%s", typ[fn->retty].name);
+			fprintf(f, "\n");
+			break;
+		case JJmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp @%s\n", b->s1->name);
+			break;
+		default:
+			fprintf(f, "\t%s ", jtoa[b->jmp.type]);
+			if (b->jmp.type == JJnz) {
+				printref(b->jmp.arg, fn, f);
+				fprintf(f, ", ");
+			}
+			fprintf(f, "@%s, @%s\n", b->s1->name, b->s2->name);
+			break;
+		}
+	}
+	fprintf(f, "}\n");
+}
diff --git a/src/rega.c b/src/rega.c
new file mode 100644
index 0000000..7f8edcf
--- /dev/null
+++ b/src/rega.c
@@ -0,0 +1,598 @@
+#include "all.h"
+
+#ifdef TEST_PMOV
+	#undef assert
+	#define assert(x) assert_test(#x, x)
+#endif
+
+typedef struct RMap RMap;
+
+struct RMap {
+	int t[NIReg+NFReg];
+	int r[NIReg+NFReg];
+	BSet b[1];
+	int n;
+};
+
+static bits regu;      /* registers used */
+static Tmp *tmp;       /* function temporaries */
+static Mem *mem;       /* function mem references */
+static struct {
+	Ref src, dst;
+	int cls;
+} *pm;                 /* parallel move constructed */
+static int cpm, npm;   /* capacity and size of pm */
+
+static int *
+hint(int t)
+{
+	return &tmp[phicls(t, tmp)].hint.r;
+}
+
+static void
+sethint(int t, int r)
+{
+	bits m;
+
+	m = tmp[phicls(t, tmp)].hint.m;
+	if (*hint(t) == -1)
+	if (!(BIT(r) & m))
+		*hint(t) = r;
+}
+
+static void
+rcopy(RMap *ma, RMap *mb)
+{
+	memcpy(ma->t, mb->t, sizeof ma->t);
+	memcpy(ma->r, mb->r, sizeof ma->r);
+	bscopy(ma->b, mb->b);
+	ma->n = mb->n;
+}
+
+static int
+rfind(RMap *m, int t)
+{
+	int i;
+
+	for (i=0; i<m->n; i++)
+		if (m->t[i] == t)
+			return m->r[i];
+	return -1;
+}
+
+static Ref
+rref(RMap *m, int t)
+{
+	int r, s;
+
+	r = rfind(m, t);
+	if (r == -1) {
+		s = tmp[t].slot;
+		assert(s != -1 && "should have spilled");
+		return SLOT(s);
+	} else
+		return TMP(r);
+}
+
+static void
+radd(RMap *m, int t, int r)
+{
+	assert((t >= Tmp0 || t == r) && "invalid temporary");
+	assert(((RAX <= r && r < RAX + NIReg) || (XMM0 <= r && r < XMM0 + NFReg)) && "invalid register");
+	assert(!bshas(m->b, t) && "temporary has mapping");
+	assert(!bshas(m->b, r) && "register already allocated");
+	assert(m->n <= NIReg+NFReg && "too many mappings");
+	bsset(m->b, t);
+	bsset(m->b, r);
+	m->t[m->n] = t;
+	m->r[m->n] = r;
+	m->n++;
+	regu |= BIT(r);
+}
+
+static Ref
+ralloc(RMap *m, int t)
+{
+	bits regs;
+	int r, r0, r1;
+
+	if (t < Tmp0) {
+		assert(bshas(m->b, t));
+		return TMP(t);
+	}
+	if (bshas(m->b, t)) {
+		r = rfind(m, t);
+		assert(r != -1);
+		return TMP(r);
+	}
+	r = *hint(t);
+	if (r == -1 || bshas(m->b, r)) {
+		regs = tmp[phicls(t, tmp)].hint.m;
+		regs |= m->b->t[0];
+		switch (KBASE(tmp[t].cls)) {
+		case 0:
+			r0 = RAX;
+			r1 = RAX + NIReg;
+			break;
+		case 1:
+			r0 = XMM0;
+			r1 = XMM0 + NFReg;
+			break;
+		}
+		for (r=r0; r<r1; r++)
+			if (!(regs & BIT(r)))
+				goto Found;
+		for (r=r0; r<r1; r++)
+			if (!bshas(m->b, r))
+				goto Found;
+		diag("rega: no more regs");
+	}
+Found:
+	radd(m, t, r);
+	sethint(t, r);
+	return TMP(r);
+}
+
+static int
+rfree(RMap *m, int t)
+{
+	int i, r;
+
+	if (!bshas(m->b, t))
+		return -1;
+	for (i=0; m->t[i] != t; i++)
+		assert(i+1 < m->n);
+	r = m->r[i];
+	bsclr(m->b, t);
+	bsclr(m->b, r);
+	m->n--;
+	memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
+	memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
+	return r;
+}
+
+static void
+mdump(RMap *m)
+{
+	int i;
+
+	for (i=0; i<m->n; i++)
+		fprintf(stderr, " (%s, R%d)",
+			tmp[m->t[i]].name,
+			m->r[i]);
+	fprintf(stderr, "\n");
+}
+
+static void
+pmadd(Ref src, Ref dst, int k)
+{
+	if (npm == cpm) {
+		cpm = cpm * 2 + 16;
+		pm = realloc(pm, cpm * sizeof pm[0]);
+		if (!pm)
+			diag("pmadd: out of memory");
+	}
+	pm[npm].src = src;
+	pm[npm].dst = dst;
+	pm[npm].cls = k;
+	npm++;
+}
+
+enum PMStat { ToMove, Moving, Moved };
+
+static Ref
+pmrec(enum PMStat *status, int i, int *k)
+{
+	Ref swp, swp1;
+	int j, k1;
+
+	/* note, this routine might emit
+	 * too many large instructions:
+	 *
+	 *                  , x -- x
+	 *      x -- x -- x        |
+	 *                  ` x -- x
+	 *
+	 * if only the first move is wide
+	 * the whole cycle will be wide,
+	 * this is safe but not necessary
+	 */
+
+	if (req(pm[i].src, pm[i].dst))
+		return R;
+	status[i] = Moving;
+	assert(KBASE(*k) == KBASE(pm[i].cls));
+	assert((Kw|1) == Kl && (Ks|1) == Kd);
+	*k |= KWIDE(pm[i].cls); /* see above */
+	swp = R;
+	for (j=0; j<npm; j++) {
+		if (req(pm[j].src, pm[i].dst))
+			switch (status[j]) {
+			case ToMove:
+				k1 = *k;
+				swp1 = pmrec(status, j, &k1);
+				if (!req(swp1, R)) {
+					assert(req(swp, R));
+					swp = swp1;
+					*k = k1;
+				}
+				break;
+			case Moving:
+				assert(req(swp, R));
+				swp = pm[i].dst;
+				break;
+			case Moved:
+				break;
+			}
+	}
+	status[i] = Moved;
+	if (req(swp, R)) {
+		*curi++ = (Ins){OCopy, pm[i].dst, {pm[i].src}, pm[i].cls};
+		return R;
+	} else if (!req(swp, pm[i].src)) {
+		*curi++ = (Ins){OSwap, R, {pm[i].src, pm[i].dst}, *k};
+		return swp;
+	} else
+		return R;
+
+}
+
+static void
+pmgen()
+{
+	int i, k;
+	enum PMStat *status;
+
+	status = alloc(npm * sizeof status[0]);
+	assert(!npm || status[npm-1] == ToMove);
+	curi = insb;
+	for (i=0; i<npm; i++)
+		if (status[i] == ToMove) {
+			k = pm[i].cls;
+			pmrec(status, i, &k);
+		}
+}
+
+static void
+move(int r, Ref to, RMap *m)
+{
+	int n, t, r1;
+
+	r1 = req(to, R) ? -1 : rfree(m, to.val);
+	if (bshas(m->b, r) && r1 != r) {
+		/* r is used and not by to */
+		for (n=0; m->r[n] != r; n++)
+			assert(n+1 < m->n);
+		t = m->t[n];
+		rfree(m, t);
+		bsset(m->b, r);
+		ralloc(m, t);
+		bsclr(m->b, r);
+	}
+	t = req(to, R) ? r : to.val;
+	radd(m, t, r);
+}
+
+static int
+regcpy(Ins *i)
+{
+	return i->op == OCopy && isreg(i->arg[0]);
+}
+
+static Ins *
+dopm(Blk *b, Ins *i, RMap *m)
+{
+	RMap m0;
+	int n, r, r1, t, s;
+	Ins *i0, *i1, *ip, *ir;
+	bits def;
+
+	m0 = *m;
+	i1 = ++i;
+	do {
+		i--;
+		move(i->arg[0].val, i->to, m);
+	} while (i != b->ins && regcpy(i-1));
+	assert(m0.n <= m->n);
+	if (i != b->ins && (i-1)->op == OCall) {
+		def = retregs((i-1)->arg[1], 0);
+		for (r=0; r<NRSave; r++)
+			if (!(BIT(rsave[r]) & def))
+				move(rsave[r], R, m);
+	}
+	for (npm=0, n=0; n<m->n; n++) {
+		t = m->t[n];
+		s = tmp[t].slot;
+		r1 = m->r[n];
+		r = rfind(&m0, t);
+		if (r != -1)
+			pmadd(TMP(r1), TMP(r), tmp[t].cls);
+		else if (s != -1)
+			pmadd(TMP(r1), SLOT(s), tmp[t].cls);
+	}
+	for (ip=i; ip<i1; ip++) {
+		if (!req(ip->to, R))
+			rfree(m, ip->to.val);
+		r = ip->arg[0].val;
+		if (rfind(m, r) == -1)
+			radd(m, r, r);
+	}
+	pmgen();
+#ifdef TEST_PMOV
+	return 0;
+#endif
+	n = b->nins - (i1 - i) + (curi - insb);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, b->ins, i - b->ins);
+	ip = icpy(ir = ip, insb, curi - insb);
+	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
+	b->nins = n;
+	b->ins = i0;
+	return ir;
+}
+
+static int
+prio(Ref r1, Ref r2)
+{
+	/* trivial heuristic to begin with,
+	 * later we can use the distance to
+	 * the definition instruction
+	 */
+	(void) r2;
+	return *hint(r1.val) != -1;
+}
+
+static void
+insert(Ref *r, Ref **rs, int p)
+{
+	int i;
+
+	rs[i = p] = r;
+	while (i-- > 0 && prio(*r, *rs[i])) {
+		rs[i+1] = rs[i];
+		rs[i] = r;
+	}
+}
+
+static void
+doblk(Blk *b, RMap *cur)
+{
+	int x, r, nr;
+	bits rs;
+	Ins *i;
+	Mem *m;
+	Ref *ra[4];
+
+	if (rtype(b->jmp.arg) == RTmp)
+		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
+	else if (rtype(b->jmp.arg) == RACall) {
+		/* add return registers */
+		rs = retregs(b->jmp.arg, 0);
+		for (r=0; rs; rs/=2, r++)
+			if (rs & 1)
+				radd(cur, r, r);
+	}
+	for (i=&b->ins[b->nins]; i!=b->ins;) {
+		switch ((--i)->op) {
+		case OCall:
+			rs = argregs(i->arg[1], 0);
+			for (r=0; r<NRSave; r++)
+				if (!(BIT(rsave[r]) & rs))
+					rfree(cur, rsave[r]);
+			break;
+		case OCopy:
+			if (isreg(i->arg[0])) {
+				i = dopm(b, i, cur);
+				continue;
+			}
+			if (isreg(i->to))
+			if (rtype(i->arg[0]) == RTmp)
+				sethint(i->arg[0].val, i->to.val);
+			/* fall through */
+		default:
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				r = rfree(cur, i->to.val);
+				if (r == -1 && !isreg(i->to)) {
+					*i = (Ins){.op = ONop};
+					continue;
+				}
+				if (i->to.val >= Tmp0)
+					i->to = TMP(r);
+			}
+			break;
+		}
+		for (x=0, nr=0; x<2; x++)
+			switch (rtype(i->arg[x])) {
+			case RAMem:
+				m = &mem[i->arg[x].val & AMask];
+				if (rtype(m->base) == RTmp)
+					insert(&m->base, ra, nr++);
+				if (rtype(m->index) == RTmp)
+					insert(&m->index, ra, nr++);
+				break;
+			case RTmp:
+				insert(&i->arg[x], ra, nr++);
+				break;
+			}
+		for (r=0; r<nr; r++)
+			*ra[r] = ralloc(cur, ra[r]->val);
+	}
+}
+
+/* register allocation
+ * depends on rpo, phi, cost, (and obviously spill)
+ */
+void
+rega(Fn *fn)
+{
+	int j, n, t, r, r1, x, rl[Tmp0];
+	Blk *b, *b1, *s, ***ps, *blist;
+	RMap *end, *beg, cur, old;
+	Ins *i;
+	Phi *p;
+	uint u;
+	Ref src, dst;
+
+	/* 1. setup */
+	regu = 0;
+	tmp = fn->tmp;
+	mem = fn->mem;
+	end = alloc(fn->nblk * sizeof end[0]);
+	beg = alloc(fn->nblk * sizeof beg[0]);
+	for (n=0; n<fn->nblk; n++) {
+		bsinit(end[n].b, fn->ntmp);
+		bsinit(beg[n].b, fn->ntmp);
+	}
+	bsinit(cur.b, fn->ntmp);
+	bsinit(old.b, fn->ntmp);
+
+	for (t=Tmp0; t<fn->ntmp; t++)
+		*hint(t) = -1;
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OCopy || !isreg(i->arg[0]))
+			break;
+		else {
+			assert(rtype(i->to) == RTmp);
+			sethint(i->to.val, i->arg[0].val);
+		}
+
+	/* 2. assign registers following post-order */
+	for (n=fn->nblk-1; n>=0; n--) {
+		b = fn->rpo[n];
+		cur.n = 0;
+		bszero(cur.b);
+		for (x=0; x<2; x++)
+			for (t=Tmp0; t<fn->ntmp; t++) {
+				assert(bshas(b->out, t) ||
+					!bshas(cur.b, t));
+				if (bshas(b->out, t))
+				if (!bshas(cur.b, t))
+				if (x || (r=*hint(t)) != -1)
+				if (x || !bshas(cur.b, r))
+					ralloc(&cur, t);
+			}
+		rcopy(&end[n], &cur);
+		doblk(b, &cur);
+		bscopy(b->in, cur.b);
+		for (p=b->phi; p; p=p->link)
+			if (rtype(p->to) == RTmp) {
+				bsclr(b->in, p->to.val);
+				/* heuristic 0:
+				 * if the phi destination has an
+				 * argument from a frequent block
+				 * that was already allocated to
+				 * 'r', use 'r' as the new hint
+				 */
+				memset(rl, 0, sizeof rl);
+				for (u=0; u<p->narg; u++) {
+					t = p->arg[u].val;
+					b1 = p->blk[u];
+					if (rtype(p->arg[u]) == RTmp)
+					if ((r=rfind(&end[b1->id], t)) != -1)
+						rl[r] += b1->loop;
+				}
+				for (x=0, j=0; j<Tmp0; j++)
+					if (rl[j] > rl[x])
+						x = j;
+				if (rl[x] >= b->loop)
+					*hint(p->to.val) = x;
+			}
+		if (b->npred > 1) {
+			/* heuristic 1:
+			 * attempt to satisfy hints
+			 * when it's simple and we have
+			 * multiple predecessors
+			 */
+			rcopy(&old, &cur);
+			curi = &insb[NIns];
+			for (j=0; j<old.n; j++) {
+				t = old.t[j];
+				r = *hint(t);
+				r1 = rfind(&cur, t);
+				if (r != -1 && r != r1)
+				if (!bshas(cur.b, r)) {
+					rfree(&cur, t);
+					radd(&cur, t, r);
+					x = tmp[t].cls;
+					emit(OCopy, x, TMP(r1), TMP(r), R);
+				}
+			}
+			if ((j = &insb[NIns] - curi)) {
+				b->nins += j;
+				i = alloc(b->nins * sizeof(Ins));
+				icpy(icpy(i, curi, j), b->ins, b->nins-j);
+				b->ins = i;
+			}
+		}
+		rcopy(&beg[n], &cur);
+	}
+	if (debug['R'])  {
+		fprintf(stderr, "\n> Register mappings:\n");
+		for (n=0; n<fn->nblk; n++) {
+			b = fn->rpo[n];
+			fprintf(stderr, "\t%-10s beg", b->name);
+			mdump(&beg[n]);
+			fprintf(stderr, "\t           end");
+			mdump(&end[n]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	/* 3. compose glue code */
+	blist = 0;
+	for (b=fn->start;; b=b->link) {
+		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
+		for (; (s=**ps); ps++) {
+			npm = 0;
+			for (p=s->phi; p; p=p->link) {
+				dst = p->to;
+				assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
+				if (rtype(dst) == RTmp) {
+					r = rfind(&beg[s->id], dst.val);
+					if (r == -1)
+						continue;
+					dst = TMP(r);
+				}
+				for (u=0; p->blk[u]!=b; u++)
+					assert(u+1 < p->narg);
+				src = p->arg[u];
+				if (rtype(src) == RTmp)
+					src = rref(&end[b->id], src.val);
+				pmadd(src, dst, p->cls);
+			}
+			for (t=Tmp0; t<fn->ntmp; t++)
+				if (bshas(s->in, t)) {
+					src = rref(&end[b->id], t);
+					dst = rref(&beg[s->id], t);
+					pmadd(src, dst, tmp[t].cls);
+				}
+			pmgen();
+			if (curi == insb)
+				continue;
+			b1 = blknew();
+			b1->loop = (b->loop+s->loop) / 2;
+			b1->link = blist;
+			blist = b1;
+			fn->nblk++;
+			sprintf(b1->name, "%s_%s", b->name, s->name);
+			b1->nins = curi - insb;
+			idup(&b1->ins, insb, b1->nins);
+			b1->jmp.type = JJmp;
+			b1->s1 = s;
+			**ps = b1;
+		}
+		if (!b->link) {
+			b->link = blist;
+			break;
+		}
+	}
+	for (b=fn->start; b; b=b->link)
+		b->phi = 0;
+	fn->reg = regu;
+
+	if (debug['R']) {
+		fprintf(stderr, "\n> After register allocation:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/spill.c b/src/spill.c
new file mode 100644
index 0000000..72f8106
--- /dev/null
+++ b/src/spill.c
@@ -0,0 +1,507 @@
+#include "all.h"
+
+static void
+loopmark(Blk *hd, Blk *b, Phi *p)
+{
+	int k, head;
+	uint n, a;
+
+	head = hd->id;
+	if (b->id < head)
+		return;
+	for (; p; p=p->link)
+		for (a=0; a<p->narg; a++)
+			if (p->blk[a] == b)
+			if (rtype(p->arg[a]) == RTmp)
+				bsset(hd->gen, p->arg[a].val);
+	if (b->visit == head)
+		return;
+	b->visit = head;
+	b->loop *= 10;
+	/* aggregate looping information at
+	 * loop headers */
+	bsunion(hd->gen, b->gen);
+	for (k=0; k<2; k++)
+		if (b->nlive[k] > hd->nlive[k])
+			hd->nlive[k] = b->nlive[k];
+	for (n=0; n<b->npred; n++)
+		loopmark(hd, b->pred[n], b->phi);
+}
+
+static void
+tmpuse(Ref r, int use, int loop, Fn *fn)
+{
+	Mem *m;
+	Tmp *t;
+
+	if (rtype(r) == RAMem) {
+		m = &fn->mem[r.val & AMask];
+		tmpuse(m->base, 1, loop, fn);
+		tmpuse(m->index, 1, loop, fn);
+	}
+	else if (rtype(r) == RTmp && r.val >= Tmp0) {
+		t = &fn->tmp[r.val];
+		t->nuse += use;
+		t->ndef += !use;
+		t->cost += loop;
+	}
+}
+
+/* evaluate spill costs of temporaries,
+ * this also fills usage information
+ * requires rpo, preds
+ */
+void
+fillcost(Fn *fn)
+{
+	int n, hd;
+	uint a;
+	Blk *b;
+	Ins *i;
+	Tmp *t;
+	Phi *p;
+
+	for (b=fn->start; b; b=b->link) {
+		b->loop = 1;
+		b->visit = -1;
+	}
+	if (debug['S'])
+		fprintf(stderr, "\n> Loop information:\n");
+	for (n=0; n<fn->nblk; n++) {
+		b = fn->rpo[n];
+		hd = 0;
+		for (a=0; a<b->npred; a++)
+			if (b->pred[a]->id >= n) {
+				loopmark(b, b->pred[a], b->phi);
+				hd = 1;
+			}
+		if (hd && debug['S']) {
+			fprintf(stderr, "\t%-10s", b->name);
+			fprintf(stderr, " (% 3d ", b->nlive[0]);
+			fprintf(stderr, "% 3d) ", b->nlive[1]);
+			dumpts(b->gen, fn->tmp, stderr);
+		}
+	}
+	for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
+		t->cost = t-fn->tmp < Tmp0 ? 1e6 : 0;
+		t->nuse = 0;
+		t->ndef = 0;
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link) {
+			/* todo, the cost computation
+			 * for p->to is not great... */
+			tmpuse(p->to, 0, 0, fn);
+			for (a=0; a<p->narg; a++) {
+				n = p->blk[a]->loop;
+				assert(b->npred==p->narg &&
+					"wrong cfg");
+				n /= b->npred;
+				tmpuse(p->arg[a], 1, n, fn);
+			}
+		}
+		n = b->loop;
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			tmpuse(i->to, 0, n, fn);
+			tmpuse(i->arg[0], 1, n, fn);
+			tmpuse(i->arg[1], 1, n, fn);
+		}
+		tmpuse(b->jmp.arg, 1, n, fn);
+	}
+	if (debug['S']) {
+		fprintf(stderr, "\n> Spill costs:\n");
+		for (n=Tmp0; n<fn->ntmp; n++)
+			fprintf(stderr, "\t%-10s %d\n",
+				fn->tmp[n].name,
+				fn->tmp[n].cost);
+		fprintf(stderr, "\n");
+	}
+}
+
+static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
+static Tmp *tmp;  /* current temporaries (for tcmpX) */
+static int ntmp;  /* current # of temps (for limit) */
+static int locs;  /* stack size used by locals */
+static int slot4; /* next slot of 4 bytes */
+static int slot8; /* ditto, 8 bytes */
+static BSet mask[2][1]; /* class masks */
+
+static int
+tcmp0(const void *pa, const void *pb)
+{
+	return tmp[*(int *)pb].cost - tmp[*(int *)pa].cost;
+}
+
+static int
+tcmp1(const void *pa, const void *pb)
+{
+	int c;
+
+	c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
+	return c ? c : tcmp0(pa, pb);
+}
+
+static Ref
+slot(int t)
+{
+	int s;
+
+	if (t < Tmp0)
+		diag("spill: cannot spill register");
+	s = tmp[t].slot;
+	if (s == -1) {
+		assert(NAlign == 3);
+		/* nice logic to pack stack slots
+		 * on demand, there can be only
+		 * one hole and slot4 points to it
+		 *
+		 * invariant: slot4 <= slot8
+		 */
+		if (KWIDE(tmp[t].cls)) {
+			s = slot8;
+			if (slot4 == slot8)
+				slot4 += 2;
+			slot8 += 2;
+		} else {
+			s = slot4;
+			if (slot4 == slot8) {
+				slot8 += 2;
+				slot4 += 1;
+			} else
+				slot4 = slot8;
+		}
+		s += locs;
+		tmp[t].slot = s;
+	}
+	return SLOT(s);
+}
+
+static void
+limit(BSet *b, int k, BSet *f)
+{
+	static int *tarr, maxt;
+	int i, nt;
+	uint t;
+
+	nt = bscount(b);
+	if (nt <= k)
+		return;
+	if (nt > maxt) {
+		free(tarr);
+		tarr = emalloc(nt * sizeof tarr[0]);
+		maxt = nt;
+	}
+	for (i=0, t=0; bsiter(b, &t); t++) {
+		bsclr(b, t);
+		tarr[i++] = t;
+	}
+	if (!f)
+		qsort(tarr, nt, sizeof tarr[0], tcmp0);
+	else {
+		fst = f;
+		qsort(tarr, nt, sizeof tarr[0], tcmp1);
+	}
+	for (i=0; i<k && i<nt; i++)
+		bsset(b, tarr[i]);
+	for (; i<nt; i++)
+		slot(tarr[i]);
+}
+
+static void
+limit2(BSet *b1, int k1, int k2, BSet *fst)
+{
+	BSet b2[1];
+
+	bsinit(b2, ntmp); /* todo, free those */
+	bscopy(b2, b1);
+	bsinter(b1, mask[0]);
+	bsinter(b2, mask[1]);
+	limit(b1, NIReg - k1, fst);
+	limit(b2, NFReg - k2, fst);
+	bsunion(b1, b2);
+}
+
+static void
+sethint(BSet *u, bits r)
+{
+	uint t;
+
+	for (t=Tmp0; bsiter(u, &t); t++)
+		tmp[phicls(t, tmp)].hint.m |= r;
+}
+
+static void
+reloads(BSet *u, BSet *v)
+{
+	uint t;
+
+	for (t=Tmp0; bsiter(u, &t); t++)
+		if (!bshas(v, t))
+			emit(OLoad, tmp[t].cls, TMP(t), slot(t), R);
+}
+
+static void
+store(Ref r, int s)
+{
+	static int kstore[] = {
+		[Kw] = OStorew, [Kl] = OStorel,
+		[Ks] = OStores, [Kd] = OStored,
+	};
+
+	if (s != -1)
+		emit(kstore[tmp[r.val].cls], 0, R, r, SLOT(s));
+}
+
+static int
+regcpy(Ins *i)
+{
+	return i->op == OCopy && isreg(i->arg[0]);
+}
+
+static Ins *
+dopm(Blk *b, Ins *i, BSet *v)
+{
+	int n, t;
+	BSet u[1];
+	Ins *i1;
+	bits r;
+
+	bsinit(u, ntmp); /* todo, free those */
+	/* consecutive copies from
+	 * registers need to be handled
+	 * as one large instruction
+	 *
+	 * fixme: there is an assumption
+	 * that calls are always followed
+	 * by copy instructions here, this
+	 * might not be true if previous
+	 * passes change
+	 */
+	i1 = ++i;
+	do {
+		i--;
+		t = i->to.val;
+		if (!req(i->to, R))
+		if (bshas(v, t)) {
+			bsclr(v, t);
+			store(i->to, tmp[t].slot);
+		}
+		bsset(v, i->arg[0].val);
+	} while (i != b->ins && regcpy(i-1));
+	bscopy(u, v);
+	if (i != b->ins && (i-1)->op == OCall) {
+		v->t[0] &= ~retregs((i-1)->arg[1], 0);
+		limit2(v, NISave, NFSave, 0);
+		for (r=0, n=0; n<NRSave; n++)
+			r |= BIT(rsave[n]);
+		v->t[0] |= argregs((i-1)->arg[1], 0);
+	} else {
+		limit2(v, 0, 0, 0);
+		r = v->t[0];
+	}
+	sethint(v, r);
+	reloads(u, v);
+	do
+		emiti(*--i1);
+	while (i1 != i);
+	return i;
+}
+
+/* spill code insertion
+ * requires spill costs, rpo, liveness
+ *
+ * Note: this will replace liveness
+ * information (in, out) with temporaries
+ * that must be in registers at block
+ * borders
+ *
+ * Be careful with:
+ * - OCopy instructions to ensure register
+ *   constraints
+ */
+void
+spill(Fn *fn)
+{
+	Blk *b, *s1, *s2, *hd, **bp;
+	int j, n, l, t, k, lvarg[2];
+	BSet u[1], v[1], w[1];
+	Ins *i;
+	Phi *p;
+	Mem *m;
+	bits r;
+
+	tmp = fn->tmp;
+	ntmp = fn->ntmp;
+	bsinit(u, ntmp);
+	bsinit(v, ntmp);
+	bsinit(w, ntmp);
+	bsinit(mask[0], ntmp);
+	bsinit(mask[1], ntmp);
+	locs = fn->slot;
+	slot4 = 0;
+	slot8 = 0;
+	for (t=0; t<ntmp; t++) {
+		k = 0;
+		if (t >= XMM0 && t < XMM0 + NFReg)
+			k = 1;
+		else if (t >= Tmp0)
+			k = KBASE(tmp[t].cls);
+		bsset(mask[k], t);
+	}
+
+	for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
+		b = *--bp;
+		/* invariant: all bocks with bigger rpo got
+		 * their in,out updated. */
+
+		/* 1. find temporaries in registers at
+		 * the end of the block (put them in v) */
+		curi = 0;
+		s1 = b->s1;
+		s2 = b->s2;
+		hd = 0;
+		if (s1 && s1->id <= n)
+			hd = s1;
+		if (s2 && s2->id <= n)
+		if (!hd || s2->id >= hd->id)
+			hd = s2;
+		r = 0;
+		bszero(v);
+		if (hd) {
+			/* back-edge */
+			for (k=0; k<2; k++) {
+				n = k == 0 ? NIReg : NFReg;
+				bscopy(u, b->out);
+				bsinter(u, mask[k]);
+				bscopy(w, u);
+				bsinter(u, hd->gen);
+				bsdiff(w, hd->gen);
+				if ((int)bscount(u) < n) { /* fixme */
+					j = bscount(w);   /* live through */
+					l = hd->nlive[k];
+					limit(w, n - (l - j), 0);
+					bsunion(u, w);
+				} else
+					limit(u, n, 0);
+				bsunion(v, u);
+			}
+		} else if (s1) {
+			liveon(v, b, s1);
+			if (s2) {
+				liveon(u, b, s2);
+				bscopy(w, u);
+				bsinter(w, v);
+				bsunion(v, u);
+			}
+			limit2(v, 0, 0, w);
+		} else if (rtype(b->jmp.arg) == RACall) {
+			/* return */
+			r = retregs(b->jmp.arg, 0);
+			v->t[0] |= r;
+		}
+		bscopy(b->out, v);
+
+		/* 2. process the block instructions */
+		curi = &insb[NIns];
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			i--;
+			if (regcpy(i)) {
+				i = dopm(b, i, v);
+				continue;
+			}
+			bszero(w);
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				if (bshas(v, t))
+					bsclr(v, t);
+				else {
+					/* make sure we have a reg
+					 * for the result */
+					bsset(v, t);
+					bsset(w, t);
+				}
+			}
+			j = opdesc[i->op].nmem;
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RAMem)
+					j--;
+			for (n=0; n<2; n++)
+				switch (rtype(i->arg[n])) {
+				case RAMem:
+					t = i->arg[n].val;
+					m = &fn->mem[t & AMask];
+					if (rtype(m->base) == RTmp) {
+						bsset(v, m->base.val);
+						bsset(w, m->base.val);
+					}
+					if (rtype(m->index) == RTmp) {
+						bsset(v, m->index.val);
+						bsset(w, m->index.val);
+					}
+					break;
+				case RTmp:
+					t = i->arg[n].val;
+					lvarg[n] = bshas(v, t);
+					bsset(v, t);
+					if (j-- <= 0)
+						bsset(w, t);
+					break;
+				}
+			bscopy(u, v);
+			limit2(v, 0, 0, w);
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RTmp) {
+					t = i->arg[n].val;
+					if (!bshas(v, t)) {
+						/* do not reload if the
+						 * the temporary was dead
+						 */
+						if (!lvarg[n])
+							bsclr(u, t);
+						i->arg[n] = slot(t);
+					}
+				}
+			reloads(u, v);
+			if (!req(i->to, R)) {
+				t = i->to.val;
+				store(i->to, tmp[t].slot);
+				bsclr(v, t);
+			}
+			emiti(*i);
+			r = v->t[0] & (BIT(Tmp0)-1);
+			if (r)
+				sethint(v, r);
+		}
+		assert(!r || b==fn->start);
+
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			t = p->to.val;
+			if (bshas(v, t)) {
+				bsclr(v, t);
+				store(p->to, tmp[t].slot);
+			} else if (bshas(b->in, t))
+				/* only if the phi is live */
+				p->to = slot(p->to.val);
+		}
+		bscopy(b->in, v);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+
+	/* align the locals to a 16 byte boundary */
+	assert(NAlign == 3);
+	slot8 += slot8 & 3;
+	fn->slot += slot8;
+
+	if (debug['S']) {
+		fprintf(stderr, "\n> Block information:\n");
+		for (b=fn->start; b; b=b->link) {
+			printf("\t%-10s (% 5d) ", b->name, b->loop);
+			dumpts(b->out, fn->tmp, stdout);
+		}
+		fprintf(stderr, "\n> After spilling:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/ssa.c b/src/ssa.c
new file mode 100644
index 0000000..0c163aa
--- /dev/null
+++ b/src/ssa.c
@@ -0,0 +1,516 @@
+#include "all.h"
+#include <stdarg.h>
+
+static void
+adduse(Tmp *tmp, int ty, Blk *b, ...)
+{
+	Use *u;
+	int n;
+	va_list ap;
+
+	va_start(ap, b);
+	n = tmp->nuse;
+	vgrow(&tmp->use, ++tmp->nuse);
+	u = &tmp->use[n];
+	u->type = ty;
+	u->bid = b->id;
+	switch (ty) {
+	default:
+		diag("ssa: adduse defaulted");
+	case UPhi:
+		u->u.phi = va_arg(ap, Phi *);
+		break;
+	case UIns:
+		u->u.ins = va_arg(ap, Ins *);
+		break;
+	case UJmp:
+		break;
+	}
+	va_end(ap);
+}
+
+/* fill usage, phi, and class information
+ */
+void
+filluse(Fn *fn)
+{
+	Blk *b;
+	Phi *p;
+	Ins *i;
+	int m, t;
+	uint a;
+	Tmp *tmp;
+
+	/* todo, is this the correct file? */
+	tmp = fn->tmp;
+	for (t=0; t<fn->ntmp; t++) {
+		tmp[t].ndef = 0;
+		tmp[t].nuse = 0;
+		tmp[t].phi = 0;
+		tmp[t].cls = 0;
+		if (tmp[t].use == 0)
+			tmp[t].use = vnew(0, sizeof(Use));
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			t = p->to.val;
+			tmp[t].ndef++;
+			tmp[t].cls = p->cls;
+			tmp[t].phi = p->to.val;
+			for (a=0; a<p->narg; a++)
+				if (rtype(p->arg[a]) == RTmp) {
+					t = p->arg[a].val;
+					adduse(&tmp[t], UPhi, b, p);
+					if (!tmp[t].phi)
+						tmp[t].phi = p->to.val;
+				}
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				tmp[t].ndef++;
+				tmp[t].cls = i->cls;
+			}
+			for (m=0; m<2; m++)
+				if (rtype(i->arg[m]) == RTmp) {
+					t = i->arg[m].val;
+					adduse(&tmp[t], UIns, b, i);
+				}
+		}
+		if (rtype(b->jmp.arg) == RTmp)
+			adduse(&tmp[b->jmp.arg.val], UJmp, b);
+	}
+}
+
+static void
+addpred(Blk *bp, Blk *bc)
+{
+	uint i;
+
+	if (!bc->pred) {
+		bc->pred = alloc(bc->npred * sizeof bc->pred[0]);
+		for (i=0; i<bc->npred; i++)
+			bc->pred[i] = 0;
+	}
+	for (i=0; bc->pred[i]; i++)
+		;
+	bc->pred[i] = bp;
+}
+
+/* fill predecessors information in blocks
+ */
+void
+fillpreds(Fn *f)
+{
+	Blk *b;
+
+	for (b=f->start; b; b=b->link) {
+		b->npred = 0;
+		b->pred = 0;
+	}
+	for (b=f->start; b; b=b->link) {
+		if (b->s1)
+			b->s1->npred++;
+		if (b->s2)
+			b->s2->npred++;
+	}
+	for (b=f->start; b; b=b->link) {
+		if (b->s1)
+			addpred(b, b->s1);
+		if (b->s2)
+			addpred(b, b->s2);
+	}
+}
+
+static int
+rporec(Blk *b, int x)
+{
+	Blk *s1, *s2;
+
+	if (!b || b->id >= 0)
+		return x;
+	b->id = 1;
+	s1 = b->s1;
+	s2 = b->s2;
+	if (s1 && s2 && s1->loop > s2->loop) {
+		s1 = b->s2;
+		s2 = b->s1;
+	}
+	x = rporec(s1, x);
+	x = rporec(s2, x);
+	b->id = x;
+	assert(x >= 0);
+	return x - 1;
+}
+
+/* fill the rpo information in blocks
+ */
+void
+fillrpo(Fn *f)
+{
+	int n;
+	Blk *b, **p;
+
+	for (b=f->start; b; b=b->link)
+		b->id = -1;
+	n = 1 + rporec(f->start, f->nblk-1);
+	f->nblk -= n;
+	f->rpo = alloc(f->nblk * sizeof f->rpo[0]);
+	for (p=&f->start; *p;) {
+		b = *p;
+		if (b->id == -1) {
+			*p = b->link;
+			/* todo, free block */
+		} else {
+			b->id -= n;
+			f->rpo[b->id] = b;
+			p=&(*p)->link;
+		}
+	}
+}
+
+/* for dominators computation, read
+ * "A Simple, Fast Dominance Algorithm"
+ * by K. Cooper, T. Harvey, and K. Kennedy.
+ */
+
+static Blk *
+inter(Blk *b1, Blk *b2)
+{
+	Blk *bt;
+
+	if (b1 == 0)
+		return b2;
+	while (b1 != b2) {
+		if (b1->id < b2->id) {
+			bt = b1;
+			b1 = b2;
+			b2 = bt;
+		}
+		while (b1->id > b2->id) {
+			b1 = b1->idom;
+			assert(b1);
+		}
+	}
+	return b1;
+}
+
+static void
+filldom(Fn *fn)
+{
+	Blk *b, *d;
+	int ch, n;
+	uint p;
+
+	for (b=fn->start; b; b=b->link) {
+		b->idom = 0;
+		b->dom = 0;
+		b->dlink = 0;
+	}
+	do {
+		ch = 0;
+		for (n=1; n<fn->nblk; n++) {
+			b = fn->rpo[n];
+			d = 0;
+			for (p=0; p<b->npred; p++)
+				if (b->pred[p]->idom
+				||  b->pred[p] == fn->start)
+					d = inter(d, b->pred[p]);
+			if (d != b->idom) {
+				ch++;
+				b->idom = d;
+			}
+		}
+	} while (ch);
+	for (b=fn->start; b; b=b->link)
+		if ((d=b->idom)) {
+			assert(d != b);
+			b->dlink = d->dom;
+			d->dom = b;
+		}
+}
+
+static int
+sdom(Blk *b1, Blk *b2)
+{
+	assert(b1 && b2);
+	if (b1 == b2)
+		return 0;
+	while (b2->id > b1->id)
+		b2 = b2->idom;
+	return b1 == b2;
+}
+
+static int
+dom(Blk *b1, Blk *b2)
+{
+	return b1 == b2 || sdom(b1, b2);
+}
+
+static void
+addfron(Blk *a, Blk *b)
+{
+	int n;
+
+	for (n=0; n<a->nfron; n++)
+		if (a->fron[n] == b)
+			return;
+	if (!a->nfron)
+		a->fron = vnew(++a->nfron, sizeof a->fron[0]);
+	else
+		vgrow(&a->fron, ++a->nfron);
+	a->fron[a->nfron-1] = b;
+}
+
+static void
+fillfron(Fn *fn)
+{
+	Blk *a, *b;
+
+	for (b=fn->start; b; b=b->link) {
+		if (b->s1)
+			for (a=b; !sdom(a, b->s1); a=a->idom)
+				addfron(a, b->s1);
+		if (b->s2)
+			for (a=b; !sdom(a, b->s2); a=a->idom)
+				addfron(a, b->s2);
+	}
+}
+
+static Ref
+refindex(int t, Fn *fn)
+{
+	return newtmp(fn->tmp[t].name, fn->tmp[t].cls, fn);
+}
+
+static void
+phiins(Fn *fn)
+{
+	BSet u[1], defs[1];
+	Blk *a, *b, **blist, **be, **bp;
+	Ins *i;
+	Phi *p;
+	Ref r;
+	int t, n, k, nt;
+
+	bsinit(u, fn->nblk);
+	bsinit(defs, fn->nblk);
+	blist = emalloc(fn->nblk * sizeof blist[0]);
+	be = &blist[fn->nblk];
+	nt = fn->ntmp;
+	for (t=Tmp0; t<nt; t++) {
+		fn->tmp[t].visit = 0;
+		if (fn->tmp[t].phi != 0)
+			continue;
+		bszero(u);
+		k = -1;
+		bp = be;
+		for (b=fn->start; b; b=b->link) {
+			b->visit = 0;
+			r = R;
+			for (i=b->ins; i-b->ins < b->nins; i++) {
+				if (!req(r, R)) {
+					if (req(i->arg[0], TMP(t)))
+						i->arg[0] = r;
+					if (req(i->arg[1], TMP(t)))
+						i->arg[1] = r;
+				}
+				if (req(i->to, TMP(t))) {
+					if (!bshas(b->out, t)) {
+						if (fn->tmp[t].ndef == 1)
+							r = TMP(t);
+						else
+							r = refindex(t, fn);
+						i->to = r;
+					} else {
+						if (!bshas(u, b->id)) {
+							bsset(u, b->id);
+							*--bp = b;
+						}
+						if (k == -1)
+							k = i->cls;
+						assert(k == i->cls);
+					}
+				}
+			}
+			if (!req(r, R) && req(b->jmp.arg, TMP(t)))
+				b->jmp.arg = r;
+		}
+		bscopy(defs, u);
+		while (bp != be) {
+			fn->tmp[t].visit = t;
+			b = *bp++;
+			bsclr(u, b->id);
+			for (n=0; n<b->nfron; n++) {
+				a = b->fron[n];
+				if (a->visit++ == 0)
+				if (bshas(a->in, t)) {
+					p = alloc(sizeof *p);
+					p->cls = k;
+					p->to = TMP(t);
+					p->link = a->phi;
+					a->phi = p;
+					if (!bshas(defs, a->id))
+					if (!bshas(u, a->id)) {
+						bsset(u, a->id);
+						*--bp = a;
+					}
+				}
+			}
+		}
+	}
+	free(blist);
+}
+
+typedef struct Name Name;
+struct Name {
+	Ref r;
+	Blk *b;
+	Name *up;
+};
+
+static Name *namel;
+
+static Name *
+nnew(Ref r, Blk *b, Name *up)
+{
+	Name *n;
+
+	if (namel) {
+		n = namel;
+		namel = n->up;
+	} else
+		/* could use alloc, here
+		 * but namel should be reset
+		 */
+		n = emalloc(sizeof *n);
+	n->r = r;
+	n->b = b;
+	n->up = up;
+	return n;
+}
+
+static void
+nfree(Name *n)
+{
+	n->up = namel;
+	namel = n;
+}
+
+static void
+rendef(Ref *r, Blk *b, Name **stk, Fn *fn)
+{
+	Ref r1;
+	int t;
+
+	t = r->val;
+	if (req(*r, R) || !fn->tmp[t].visit)
+		return;
+	r1 = refindex(t, fn);
+	fn->tmp[r1.val].visit = t;
+	stk[t] = nnew(r1, b, stk[t]);
+	*r = r1;
+}
+
+static Ref
+getstk(int t, Blk *b, Name **stk)
+{
+	Name *n, *n1;
+
+	n = stk[t];
+	while (n && !dom(n->b, b)) {
+		n1 = n;
+		n = n->up;
+		nfree(n1);
+	}
+	stk[t] = n;
+	if (!n) {
+		/* uh, oh, warn */
+		return CON_Z;
+	} else
+		return n->r;
+}
+
+static void
+renblk(Blk *b, Name **stk, Fn *fn)
+{
+	Phi *p;
+	Ins *i;
+	Blk *s, **ps, *succ[3];
+	int t, m;
+
+	for (p=b->phi; p; p=p->link)
+		rendef(&p->to, b, stk, fn);
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		for (m=0; m<2; m++) {
+			t = i->arg[m].val;
+			if (rtype(i->arg[m]) == RTmp)
+			if (fn->tmp[t].visit)
+				i->arg[m] = getstk(t, b, stk);
+		}
+		rendef(&i->to, b, stk, fn);
+	}
+	t = b->jmp.arg.val;
+	if (rtype(b->jmp.arg) == RTmp)
+	if (fn->tmp[t].visit)
+		b->jmp.arg = getstk(t, b, stk);
+	succ[0] = b->s1;
+	succ[1] = b->s2;
+	succ[2] = 0;
+	for (ps=succ; (s=*ps); ps++)
+		for (p=s->phi; p; p=p->link) {
+			t = p->to.val;
+			if ((t=fn->tmp[t].visit)) {
+				m = p->narg++;
+				if (m == NPred)
+					diag("ssa: too many phi arguments");
+				p->arg[m] = getstk(t, b, stk);
+				p->blk[m] = b;
+			}
+		}
+	for (s=b->dom; s; s=s->dlink)
+		renblk(s, stk, fn);
+}
+
+/* require ndef */
+void
+ssa(Fn *fn)
+{
+	Name **stk, *n;
+	int d, nt;
+	Blk *b, *b1;
+
+	nt = fn->ntmp;
+	stk = emalloc(nt * sizeof stk[0]);
+	d = debug['L'];
+	debug['L'] = 0;
+	filldom(fn);
+	if (debug['N']) {
+		fprintf(stderr, "\n> Dominators:\n");
+		for (b1=fn->start; b1; b1=b1->link) {
+			if (!b1->dom)
+				continue;
+			fprintf(stderr, "%10s:", b1->name);
+			for (b=b1->dom; b; b=b->dlink)
+				fprintf(stderr, " %s", b->name);
+			fprintf(stderr, "\n");
+		}
+	}
+	fillfron(fn);
+	filllive(fn);
+	phiins(fn);
+	renblk(fn->start, stk, fn);
+	while (nt--)
+		while ((n=stk[nt])) {
+			stk[nt] = n->up;
+			nfree(n);
+		}
+	debug['L'] = d;
+	free(stk);
+	if (debug['N']) {
+		fprintf(stderr, "\n> After SSA construction:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/test/_alt.ssa b/src/test/_alt.ssa
new file mode 100644
index 0000000..3f89e5e
--- /dev/null
+++ b/src/test/_alt.ssa
@@ -0,0 +1,25 @@
+# an example with reducible control
+# flow graph that exposes poor
+# handling of looping constructs
+
+function $test() {
+@start
+	%ten =w copy 10
+	%dum =w copy 0  # dummy live-through temporary
+@loop
+	%alt =w phi @start 0, @left %alt1, @right %alt1
+	%cnt =w phi @start 100, @left %cnt, @right %cnt1
+	%alt1 =w sub 1, %alt
+	jnz %alt1, @right, @left
+@left
+	%x =w phi @loop 10, @left %x1
+	%x1 =w sub %x, 1
+	%z =w copy %x
+	jnz %z, @left, @loop
+@right
+	%cnt1 =w sub %cnt, %ten
+	jnz %cnt1, @loop, @end
+@end
+	%ret =w add %cnt, %dum
+	ret
+}
diff --git a/src/test/_dragon.ssa b/src/test/_dragon.ssa
new file mode 100644
index 0000000..b169e1b
--- /dev/null
+++ b/src/test/_dragon.ssa
@@ -0,0 +1,33 @@
+# a moderately complex test for
+# dominators computation from
+# the dragon book
+# because branching is limited to
+# two, I had to split some blocks
+
+function $dragon() {
+@start
+@b1
+	jnz 0, @b2, @b3
+@b2
+	jmp @b3
+@b3
+	jmp @b4.1
+@b4.1
+	jnz 0, @b3, @b4.2
+@b4.2
+	jnz 0, @b5, @b6
+@b5
+	jmp @b7
+@b6
+	jmp @b7
+@b7
+	jnz 0, @b8.1, @b4.1
+@b8.1
+	jnz 0, @b3, @b8.2
+@b8.2
+	jnz 0, @b9, @b10
+@b9
+	jmp @b1
+@b10
+	jmp @b7
+}
diff --git a/src/test/_fix1.ssa b/src/test/_fix1.ssa
new file mode 100644
index 0000000..e89307f
--- /dev/null
+++ b/src/test/_fix1.ssa
@@ -0,0 +1,15 @@
+function $test() {
+@start
+	%x =w copy 1
+@loop
+	jnz %x, @noz, @isz
+@noz
+	%x =w copy 0
+	jmp @end
+@isz
+	%x =w copy 1
+	jmp @loop
+@end
+	%z =w add 10, %x
+	ret
+}
diff --git a/src/test/_fix2.ssa b/src/test/_fix2.ssa
new file mode 100644
index 0000000..89f236d
--- /dev/null
+++ b/src/test/_fix2.ssa
@@ -0,0 +1,15 @@
+function $test() {
+@start
+	%x =w copy 1
+@loop
+	jnz %x, @noz, @isz
+@noz
+	%x =w copy 0
+	jnz %x, @loop, @end
+@isz
+	%x =w copy 1
+	jmp @loop
+@end
+	%z =w add 10, %x
+	ret
+}
diff --git a/src/test/_fix3.ssa b/src/test/_fix3.ssa
new file mode 100644
index 0000000..283e5a1
--- /dev/null
+++ b/src/test/_fix3.ssa
@@ -0,0 +1,20 @@
+function w $test() {
+@start
+	%x =w copy 100
+	%s =w copy 0
+@l
+	%c =w cslew %x, 10
+	jnz %c, @a, @b
+@a
+	%s =w add %s, %x
+	%x =w sub %x, 1
+	jmp @c
+@b
+	%s =w sub %s, %x
+	jmp @c
+@c
+	%x =w sub %x, 1
+	jnz %x, @l, @end
+@end
+	ret %s
+}
diff --git a/src/test/_fix4.ssa b/src/test/_fix4.ssa
new file mode 100644
index 0000000..181768d
--- /dev/null
+++ b/src/test/_fix4.ssa
@@ -0,0 +1,27 @@
+function $test() {
+@start
+	%x =w copy 3
+	%n =w copy 2
+@loop
+	%c =w ceqw %n, 10000
+	jnz %c, @end, @next
+@next
+	%t =w copy 3
+	%x =w add %x, 2
+@tloop
+	%s =w mul %t, %t
+	%c =w csgtw %s, %x
+	jnz %c, @prime, @test
+@test
+	%r =w rem %x, %t
+	jnz %r, @tnext, @loop
+@tnext
+	%t =w add %t, 2
+	jmp @tloop
+@prime
+	%n =w add %n, 1
+	jmp @loop
+@end
+	storew %x, $a
+	ret
+}
diff --git a/src/test/_live.ssa b/src/test/_live.ssa
new file mode 100644
index 0000000..fce4cb9
--- /dev/null
+++ b/src/test/_live.ssa
@@ -0,0 +1,21 @@
+# this control flow graph is irreducible
+# yet, we expecet the liveness analysis
+# to work properly and make %x live in
+# the block @left
+#
+# nothing should ever be live at the entry
+
+function $test() {
+@start
+	%b =w copy 0
+	%x =w copy 10
+	jnz 0, @loop, @left
+@left
+	jmp @inloop
+@loop
+	%x1 =w add %x, 1
+@inloop
+	%b1 =w add %b, 1
+@endloop
+	jmp @loop
+}
diff --git a/src/test/_rpo.ssa b/src/test/_rpo.ssa
new file mode 100644
index 0000000..a10c6b1
--- /dev/null
+++ b/src/test/_rpo.ssa
@@ -0,0 +1,12 @@
+function $test() {
+@start
+	jmp @foo
+@baz
+	jnz 1, @end, @foo
+@bar
+	jmp @end
+@foo
+	jnz 0, @bar, @baz
+@end
+	ret
+}
diff --git a/src/test/_spill1.ssa b/src/test/_spill1.ssa
new file mode 100644
index 0000000..df5e4c2
--- /dev/null
+++ b/src/test/_spill1.ssa
@@ -0,0 +1,22 @@
+# test with NReg == 3
+# there must be a spill
+# happening on %c
+#
+# if you replace the sub
+# by an add or comment
+# the two marked lines
+# there should be no
+# spill
+#
+
+function $test() {
+@start
+	%f =w copy 0      # here
+	%b =w copy 1
+	%c =w copy 2
+	%a =w sub %b, %c
+	%d =w copy %b
+	%e =w copy %f     # and there
+	%g =w copy %a
+	ret
+}
diff --git a/src/test/_spill2.ssa b/src/test/_spill2.ssa
new file mode 100644
index 0000000..d462d0b
--- /dev/null
+++ b/src/test/_spill2.ssa
@@ -0,0 +1,22 @@
+# stupid spilling test
+
+function $test() {
+@start
+	%x1  =w copy 10
+	%x2  =w add %x1, %x1
+	%x3  =w sub %x2, %x1
+	%x4  =w add %x3, %x1
+	%x5  =w sub %x4, %x1
+	%x6  =w add %x5, %x1
+	%x7  =w sub %x6, %x1
+	%x8  =w add %x7, %x1
+	%x9  =w sub %x8, %x8
+	%x10 =w add %x9, %x7
+	%x11 =w sub %x10, %x6
+	%x12 =w add %x11, %x5
+	%x13 =w sub %x12, %x4
+	%x14 =w add %x13, %x3
+	%x15 =w sub %x14, %x2
+	%x16 =w add %x15, %x1
+	ret
+}
diff --git a/src/test/_spill3.ssa b/src/test/_spill3.ssa
new file mode 100644
index 0000000..cdfda2d
--- /dev/null
+++ b/src/test/_spill3.ssa
@@ -0,0 +1,24 @@
+# make sure comparisons
+# never get their two
+# operands in memory
+# run with NReg == 3, or
+# adapt it!
+
+function $test() {
+@start
+	%a =w loadw $a
+	%b =w loadw $a
+
+@loop
+	%c =w phi @start 0, @loop %f
+	%d =w phi @start 0, @loop %g
+	%e =w phi @start 0, @loop %h
+	%f =w add %c, %d
+	%g =w add %c, %e
+	%h =w add %e, %d
+	%x =w cslew %a, %b
+	jnz %x, @loop, @end
+
+@end
+	ret
+}
diff --git a/src/test/abi1.ssa b/src/test/abi1.ssa
new file mode 100644
index 0000000..69cce44
--- /dev/null
+++ b/src/test/abi1.ssa
@@ -0,0 +1,59 @@
+# test calling into C with two
+# large struct arguments (passed
+# on the stack)
+
+type :mem = { b 17 }
+
+function $alpha(l %p, w %l, l %n) {
+@ini
+	%pe =l add %p, %n
+@lop
+	%p1 =l phi @ini %p, @lop %p2
+	%l1 =w phi @ini %l, @lop %l2
+	storeb %l1, %p1
+	%p2 =l add %p1, 1
+	%l2 =w add %l1, 1
+	%c1 =w ceql %p1, %pe
+	jnz %c1, @end, @lop
+@end
+	storeb 0, %pe
+	ret
+}
+
+function $test() {
+@start
+	%p =l alloc4 17
+	%q =l alloc4 17
+	%r0 =w call $alpha(l %p, w 65, l 16)
+	%r1 =w call $alpha(l %q, w 97, l 16)
+	%r2 =w call $fcb(:mem %p, w 1, w 2, w 3, w 4, w 5, w 6, w 7, w 8, w 9, :mem %q)
+	ret
+}
+
+
+# >>> driver
+# #include <stdio.h>
+# typedef struct { char t[17]; } mem;
+# extern void test();
+# void fcb(mem m, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, mem n) {
+# 	printf("fcb: m = (mem){ t = \"%s\" }\n", m.t);
+# 	printf("     n = (mem){ t = \"%s\" }\n", n.t);
+# 	#define T(n) printf("     i%d = %d\n", n, i##n);
+# 	T(1) T(2) T(3) T(4) T(5) T(6) T(7) T(8) T(9)
+# }
+# int main() { test(); return 0; }
+# <<<
+
+# >>> output
+# fcb: m = (mem){ t = "ABCDEFGHIJKLMNOP" }
+#      n = (mem){ t = "abcdefghijklmnop" }
+#      i1 = 1
+#      i2 = 2
+#      i3 = 3
+#      i4 = 4
+#      i5 = 5
+#      i6 = 6
+#      i7 = 7
+#      i8 = 8
+#      i9 = 9
+# <<<
diff --git a/src/test/abi2.ssa b/src/test/abi2.ssa
new file mode 100644
index 0000000..b82c80c
--- /dev/null
+++ b/src/test/abi2.ssa
@@ -0,0 +1,18 @@
+type :fps = { s, b, s }
+
+function s $sum(:fps %p) {
+@start
+	%f1 =s load %p
+	%p8 =l add 8, %p
+	%f2 =s load %p8
+	%s =s add %f1, %f2
+	ret %s
+}
+
+# >>> driver
+# typedef struct { float f1; char b; float f2; } fps;
+# extern float sum(fps);
+# int main() { fps x = { 1.23, -1, 2.34 }; return !(sum(x) == 1.23f+2.34f); }
+# /* Note the f suffixes above are important
+#  * otherwise C does double operations. */
+# <<<
diff --git a/src/test/abi3.ssa b/src/test/abi3.ssa
new file mode 100644
index 0000000..608d1db
--- /dev/null
+++ b/src/test/abi3.ssa
@@ -0,0 +1,43 @@
+type :four = {l, b, w}
+
+data $z = { w 0 }
+
+function $test() {
+ @start
+	%a  =w loadw $z
+	%y  =w add %a, %a
+
+	%s  =l alloc8 16   # allocate a :four struct
+	%s1 =l add %s, 12  # get address of the w
+	storel 4, %s       # set the l
+	storew 5, %s1      # set the w
+
+	# only the last argument should be on the stack
+	%f  =l add $F, %y
+	%x  =w call %f(w %y, w 1, w 2, w 3, :four %s, w 6)
+
+	# store the result in the
+	# global variable a
+
+	%x1 =w add %y, %x
+	storew %x1, $a
+	ret
+}
+
+# >>> driver
+# #include <stdio.h>
+# struct four { long l; char c; int i; };
+# extern void test(void);
+# int F(int a0, int a1, int a2, int a3, struct four s, int a6) {
+# 	printf("%d %d %d %d %d %d %d\n",
+# 		a0, a1, a2, a3, (int)s.l, s.i, a6);
+# 	return 42;
+# }
+# int a;
+# int main() { test(); printf("%d\n", a); return 0; }
+# <<<
+
+# >>> output
+# 0 1 2 3 4 5 6
+# 42
+# <<<
diff --git a/src/test/abi4.ssa b/src/test/abi4.ssa
new file mode 100644
index 0000000..4c3d89b
--- /dev/null
+++ b/src/test/abi4.ssa
@@ -0,0 +1,38 @@
+# return a large struct to C
+
+type :mem = { b 17 }
+
+function $alpha(l %p, w %l, l %n) {
+@ini
+	%pe =l add %p, %n
+@lop
+	%p1 =l phi @ini %p, @lop %p2
+	%l1 =w phi @ini %l, @lop %l2
+	storeb %l1, %p1
+	%p2 =l add %p1, 1
+	%l2 =w add %l1, 1
+	%c1 =w ceql %p1, %pe
+	jnz %c1, @end, @lop
+@end
+	storeb 0, %pe
+	ret
+}
+
+function :mem $test() {
+@start
+	%p =l alloc4 17
+	%r0 =w call $alpha(l %p, w 65, l 16)
+	ret %p
+}
+
+
+# >>> driver
+# #include <stdio.h>
+# typedef struct { char t[17]; } mem;
+# extern mem test(void);
+# int main() { mem m = test(); printf("%s\n", m.t); return 0; }
+# <<<
+
+# >>> output
+# ABCDEFGHIJKLMNOP
+# <<<
diff --git a/src/test/abi5.ssa b/src/test/abi5.ssa
new file mode 100644
index 0000000..4c5eaea
--- /dev/null
+++ b/src/test/abi5.ssa
@@ -0,0 +1,105 @@
+# returning structs from C
+
+type :st1 = { b 17 }
+type :st2 = { w }
+type :st3 = { s, w }
+type :st4 = { w, d }
+type :st5 = { s, l }
+type :st6 = { b 16 }
+type :st7 = { s, d }
+type :st8 = { w 4 }
+
+data $fmt1 = { b "t1: %s\n", b 0 }
+data $fmt2 = { b "t2: %d\n", b 0 }
+data $fmt3 = { b "t3: %f %d\n", b 0 }
+data $fmt4 = { b "t4: %d %f\n", b 0 }
+data $fmt5 = { b "t5: %f %lld\n", b 0 }
+data $fmt6 = { b "t6: %s\n", b 0 }
+data $fmt7 = { b "t7: %f %f\n", b 0 }
+data $fmt8 = { b "t8: %d %d %d %d\n", b 0 }
+
+function $test() {
+@start
+	%r1 =:st1 call $t1()
+	%i1 =w call $printf(l $fmt1, l %r1)
+
+	%r2 =:st2 call $t2()
+	%w2 =w loadw %r2
+	%i2 =w call $printf(l $fmt2, w %w2)
+
+	%r3 =:st3 call $t3()
+	%s3 =s loads %r3
+	%r34 =l add %r3, 4
+	%w3 =w loadw %r34
+	%p3 =d exts %s3
+	%i3 =w call $printf(l $fmt3, d %p3, w %w3)
+
+	%r4 =:st4 call $t4()
+	%w4 =w loadw %r4
+	%r48 =l add 8, %r4
+	%d4 =d loadd %r48
+	%i4 =w call $printf(l $fmt4, w %w4, d %d4)
+
+	%r5 =:st5 call $t5()
+	%s5 =s loads %r5
+	%d5 =d exts %s5
+	%r58 =l add %r5, 8
+	%l5 =l loadl %r58
+	%i5 =w call $printf(l $fmt5, d %d5, l %l5)
+
+	%r6 =:st6 call $t6()
+	%i6 =w call $printf(l $fmt6, l %r6)
+
+	%r7 =:st7 call $t7()
+	%s7 =s loads %r7
+	%d71 =d exts %s7
+	%r78 =l add %r7, 8
+	%d72 =d loadd %r78
+	%i7 =w call $printf(l $fmt7, d %d71, d %d72)
+
+	%r8 =:st8 call $t8()
+	%r84 =l add 4, %r8
+	%r88 =l add 4, %r84
+	%r812 =l add 4, %r88
+	%w81 =w loadw %r8
+	%w82 =w loadw %r84
+	%w83 =w loadw %r88
+	%w84 =w loadw %r812
+	%i8 =w call $printf(l $fmt8, w %w81, w %w82, w %w83, w %w84)
+
+	ret
+}
+
+
+# >>> driver
+# #include <stdio.h>
+# typedef struct { char t[17]; } st1;
+# typedef struct { int i; } st2;
+# typedef struct { float f; int i; } st3;
+# typedef struct { int i; double d; } st4;
+# typedef struct { float f; long l; } st5;
+# typedef struct { char t[16]; } st6;
+# typedef struct { float f; double d; } st7;
+# typedef struct { int i[4]; } st8;
+# extern void test(void);
+# st1 t1() { return (st1){"abcdefghijklmnop"}; }
+# st2 t2() { return (st2){2}; }
+# st3 t3() { return (st3){3.0,30}; }
+# st4 t4() { return (st4){4,-40}; }
+# st5 t5() { return (st5){5.5,-55}; }
+# st6 t6() { return (st6){"abcdefghijklmno"}; }
+# st7 t7() { return (st7){7.77,77.7}; }
+# st8 t8() { return (st8){-8,88,-888,8888}; }
+# int main() { test(); return 0; }
+# <<<
+
+# >>> output
+# t1: abcdefghijklmnop
+# t2: 2
+# t3: 3.000000 30
+# t4: 4 -40.000000
+# t5: 5.500000 -55
+# t6: abcdefghijklmno
+# t7: 7.770000 77.700000
+# t8: -8 88 -888 8888
+# <<<
diff --git a/src/test/align.ssa b/src/test/align.ssa
new file mode 100644
index 0000000..84d1fb9
--- /dev/null
+++ b/src/test/align.ssa
@@ -0,0 +1,16 @@
+function $test() {
+@start
+	%x =l alloc16 16
+	%y =l add %x, 8
+	%m =w rem %y, 16
+	storew %m, %y
+	%n =w loadw %y
+	storew %n, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 8 || a == -8); }
+# <<<
diff --git a/src/test/collatz.ssa b/src/test/collatz.ssa
new file mode 100644
index 0000000..373ecac
--- /dev/null
+++ b/src/test/collatz.ssa
@@ -0,0 +1,61 @@
+# a solution for N=1000 to
+# https://projecteuler.net/problem=14
+# we use a fast local array to
+# memoize small collatz numbers
+
+function $test() {
+@start
+	%mem =l alloc4 4000
+@loop
+	%n =w phi @start 1, @newm %n9, @oldm %n9
+	%cmax =w phi @start 0, @newm %c, @oldm %cmax
+	%fin =w csltw %n, 1000
+	jnz %fin, @cloop, @end
+@cloop
+	%n0 =w phi @loop %n, @odd %n2, @even %n3
+	%c0 =w phi @loop 0, @odd %c1, @even %c1
+	%no1 =w cnew %n0, 1
+	jnz %no1, @iter0, @endcl
+@iter0
+	%ism =w csltw %n0, %n
+	jnz %ism, @getmemo, @iter1
+@iter1
+	%c1 =w add %c0, 1
+	%p =w and %n0, 1
+	jnz %p, @odd, @even
+@odd
+	%n1 =w mul 3, %n0
+	%n2 =w add %n1, 1
+	jmp @cloop
+@even
+	%n3 =w div %n0, 2
+	jmp @cloop
+@getmemo                     # get the count for n0 in mem
+	%n0l =l extsw %n0
+	%idx0 =l mul %n0l, 4
+	%loc0 =l add %idx0, %mem
+	%cn0 =w loadw %loc0
+	%c2 =w add %c0, %cn0
+@endcl                       # store the count for n in mem
+	%c =w phi @getmemo %c2, @cloop %c0
+	%nl =l extsw %n
+	%idx1 =l mul %nl, 4
+	%loc1 =l add %idx1, %mem
+	storew %c, %loc1
+	%n9 =w add 1, %n
+	%big =w cslew %cmax, %c
+	jnz %big, @newm, @oldm
+@newm
+	jmp @loop
+@oldm
+	jmp @loop
+@end
+	storew %cmax, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 178); }
+# <<<
diff --git a/src/test/cprime.ssa b/src/test/cprime.ssa
new file mode 100644
index 0000000..1ca60e1
--- /dev/null
+++ b/src/test/cprime.ssa
@@ -0,0 +1,103 @@
+# generated by Andrew Chambers'
+# compiler from the C program
+# following in comments
+
+function w $main() {
+@start
+	%v0 =l alloc8 4
+	%v1 =l alloc8 4
+	%v2 =l alloc8 4
+	%v3 =l alloc8 4
+	%v4 =l alloc8 4
+	storew 5, %v1
+	storew 11, %v2
+	storew 12, %v3
+@L0
+	%v5 =w loadw %v1
+	%v6 =w cnew %v5, 201
+	jnz %v6, @L8, @L1
+@L8
+	storew 1, %v4
+	%v7 =w loadw %v3
+	%v8 =w rem %v7, 2
+	%v9 =w ceqw %v8, 0
+	jnz %v9, @L9, @L5
+@L9
+	storew 0, %v4
+@L5
+	storew 3, %v0
+@L2
+	%v10 =w loadw %v0
+	%v11 =w loadw %v3
+	%v12 =w csltw %v10, %v11
+	jnz %v12, @L10, @L3
+@L10
+	%v13 =w loadw %v3
+	%v14 =w loadw %v0
+	%v15 =w rem %v13, %v14
+	%v16 =w ceqw %v15, 0
+	jnz %v16, @L11, @L4
+@L11
+	storew 0, %v4
+	jmp @L3
+@L4
+	%v17 =w loadw %v0
+	%v18 =w add %v17, 2
+	storew %v18, %v0
+	jmp @L2
+@L3
+	%v19 =w loadw %v4
+	jnz %v19, @L12, @L6
+@L12
+	%v20 =w loadw %v3
+	storew %v20, %v2
+	%v21 =w loadw %v1
+	%v22 =w add %v21, 1
+	storew %v22, %v1
+@L6
+	%v23 =w loadw %v3
+	%v24 =w add %v23, 1
+	storew %v24, %v3
+	jmp @L0
+@L1
+	%v25 =w loadw %v2
+	%v26 =w cnew %v25, 1229
+	jnz %v26, @L13, @L7
+@L13
+	ret 1
+@L7
+	ret 0
+@end
+	ret 0
+}
+
+# int
+# main()
+# {
+#         int i, n, p, next, isprime;
+#
+#         n = 5;
+#         p = 11;
+#         next = 12;
+#         while(n != 201) {
+#                 isprime = 1;
+#                 if(next % 2 == 0) {
+#                         isprime = 0;
+#                 } else {
+#                         for(i = 3; i < next; i = i + 2) {
+#                                 if(next % i == 0) {
+#                                         isprime = 0;
+#                                         break;
+#                                 }
+#                         }
+#                 }
+#                 if(isprime) {
+#                         p = next;
+#                         n = n + 1;
+#                 }
+#                 next = next + 1;
+#         }
+#         if(p != 1229)
+#                 return 1;
+#         return 0;
+# }
diff --git a/src/test/cup.ssa b/src/test/cup.ssa
new file mode 100644
index 0000000..013394f
--- /dev/null
+++ b/src/test/cup.ssa
@@ -0,0 +1,17 @@
+# counts up from -1988 to 1991
+
+function $test() {
+@start
+@loop
+	%n0  =l phi @start -1988, @loop %n1
+	%n1  =l add 1, %n0
+	%cmp =w cslel 1991, %n1
+	jnz %cmp, @end, @loop
+@end
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int main() { test(); return 0; }
+# <<<
diff --git a/src/test/dark.ssa b/src/test/dark.ssa
new file mode 100644
index 0000000..5046af3
--- /dev/null
+++ b/src/test/dark.ssa
@@ -0,0 +1,30 @@
+# a hack example,
+# we use a dark type to get
+# a pointer to the stack.
+
+type :magic = align 1 { 0 }
+
+data $ret = { l 0 }
+
+function $test(:magic %p) {
+@start
+	%av =w loadw $a
+	%a1 =w add 1, %av
+	storew %a1, $a       # increment $a
+	%r1 =l loadl $ret    # fetch from $ret
+	%p1 =l add %p, -8
+	%r2 =l loadl %p1     # get the return address
+	storel %r2, $ret     # store it in $ret
+	%c =w ceql %r1, %r2
+	jnz %c, @fin, @cal
+@cal
+	%i =w call $test()   # no argument given, intentionally!
+@fin
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a = 2;
+# int main() { test(); return !(a == 5); }
+# <<<
diff --git a/src/test/double.ssa b/src/test/double.ssa
new file mode 100644
index 0000000..d885d28
--- /dev/null
+++ b/src/test/double.ssa
@@ -0,0 +1,24 @@
+function $test() {
+@start
+	%x1 =d copy d_0.1
+	%x2 =d add d_0.2, %x1
+	%x3 =d sub %x2, d_0.3
+
+@loop
+	%x4 =d phi @start %x3, @loop %x5
+	%i1 =w phi @start 0, @loop %i2
+	%x5 =d add %x4, %x4
+	%i2 =w add %i1, 1
+	%c0 =w cled %x5, 4607182418800017408 # d_1.0
+	jnz %c0, @loop, @end
+
+@end
+	storew %i2, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 55); }
+# <<<
diff --git a/src/test/echo.ssa b/src/test/echo.ssa
new file mode 100644
index 0000000..d3c8a25
--- /dev/null
+++ b/src/test/echo.ssa
@@ -0,0 +1,32 @@
+function w $main(w %argc, l %argv) {
+@start
+	%fmt =l alloc8 8
+	storel 1663398693, %fmt             # "%s%c"
+	%av0 =l add %argv, 8
+	%ac0 =w sub %argc, 1
+@loop
+	%av =l phi @start %av0, @loop2 %av1
+	%ac =w phi @start %ac0, @loop2 %ac1
+	%c0 =w ceqw %ac, 0
+	jnz %c0, @end, @loop1
+@loop1
+	%c1 =w ceqw %ac, 1
+	jnz %c1, @last, @nolast
+@last
+	jmp @loop2
+@nolast
+	jmp @loop2
+@loop2
+	%sep =w phi @last 10, @nolast 32
+	%arg =l loadl %av
+	%r =w call $printf(l %fmt, l %arg, w %sep)
+	%av1 =l add %av, 8
+	%ac1 =w sub %ac, 1
+	jmp @loop
+@end
+	ret 0
+}
+
+# >>> output
+# a b c
+# <<<
diff --git a/src/test/eucl.ssa b/src/test/eucl.ssa
new file mode 100644
index 0000000..f50fd2c
--- /dev/null
+++ b/src/test/eucl.ssa
@@ -0,0 +1,24 @@
+# euclide's algorithm in ssa
+# it is a fairly interesting
+# ssa program because of the
+# swap of b and a
+
+function $test() {
+@start
+
+@loop
+	%a =w phi @start 380, @loop %r
+	%b =w phi @start 747, @loop %a
+	%r =w rem %b, %a
+	jnz %r, @loop, @end
+
+@end
+	storew %a, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 1); }
+# <<<
diff --git a/src/test/euclc.ssa b/src/test/euclc.ssa
new file mode 100644
index 0000000..c76db2f
--- /dev/null
+++ b/src/test/euclc.ssa
@@ -0,0 +1,29 @@
+function w $test() {
+@l0
+	%a =l alloc4 4
+	%b =l alloc4 4
+	%r =l alloc4 4
+	storew 747, %a
+	storew 380, %b
+@l1
+	%t4 =w loadw %b
+	jnz %t4, @l2, @l3
+@l2
+	%t7 =w loadw %a
+	%t8 =w loadw %b
+	%t6 =w rem %t7, %t8
+	storew %t6, %r
+	%t10 =w loadw %b
+	storew %t10, %a
+	%t12 =w loadw %r
+	storew %t12, %b
+	jmp @l1
+@l3
+	%t13 =w loadw %a
+	ret %t13
+}
+
+# >>> driver
+# extern int test(void);
+# int main() { return !(test() == 1); }
+# <<<
diff --git a/src/test/fpcnv.ssa b/src/test/fpcnv.ssa
new file mode 100644
index 0000000..5fd3be9
--- /dev/null
+++ b/src/test/fpcnv.ssa
@@ -0,0 +1,27 @@
+# floating point casts and conversions
+
+function s $fneg(s %f) {
+@fneg
+	%b0 =w cast %f
+	%b1 =w xor 2147483648, %b0
+	%rs =s cast %b1
+	ret %rs
+}
+
+function d $ftrunc(d %f) {
+@ftrunc
+	%l0 =l ftosi %f
+	%rt =d sitof %l0
+	ret %rt
+}
+
+# >>> driver
+# extern float fneg(float);
+# extern double ftrunc(double);
+# int main() {
+# 	if (fneg(1.23f) != -1.23f)  return 1;
+# 	if (ftrunc(3.1415) != 3.0)  return 2;
+# 	if (ftrunc(-1.234) != -1.0) return 3;
+# 	return 0;
+# }
+# <<<
diff --git a/src/test/go.sh b/src/test/go.sh
new file mode 100755
index 0000000..35bf525
--- /dev/null
+++ b/src/test/go.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+TMP=/tmp/qbe.zzzz
+
+DRV=$TMP.c
+ASM=$TMP.s
+BIN=$TMP.bin
+OUT=$TMP.out
+
+cleanup() {
+	rm -f $DRV $ASM $BIN $OUT
+}
+
+extract() {
+	WHAT="$1"
+	FILE="$2"
+
+	awk "
+		/^# >>> $WHAT/ {
+			p = 1
+			next
+		}
+		/^# <<</ {
+			if (p)
+				p = 0
+		}
+		p
+	" $FILE \
+	| sed -e 's/# //' \
+	| sed -e 's/#$//'
+}
+
+once() {
+	T="$1"
+
+	if ! test -f $T
+	then
+		echo "invalid test file $T" >&2
+		exit 1
+	fi
+
+	echo "$T... "
+
+	if ! ./qbe $T -o $ASM
+	then
+		echo "[qbe fail]"
+		return 1
+	fi
+
+	extract driver $T > $DRV
+	extract output $T > $OUT
+
+	if test -s $DRV
+	then
+		LNK="$DRV $ASM"
+	else
+		LNK="$ASM"
+	fi
+
+	if ! cc -g -o $BIN $LNK
+	then
+		echo "[cc fail]"
+		return 1
+	fi
+
+	if test -s $OUT
+	then
+		$BIN a b c | diff - $OUT
+		RET=$?
+		REASON="output"
+	else
+		$BIN a b c
+		RET=$?
+		REASON="returned $RET"
+	fi
+
+	if test $RET -ne 0
+	then
+		echo "[$REASON fail]"
+		return 1
+	fi
+
+	printf "\033[1A\033[45C[ok]\n"
+}
+
+
+#trap cleanup TERM QUIT
+
+if test -z "$1"
+then
+	echo "usage: test/go.sh {all, SSAFILE}" 2>&1
+	exit 1
+fi
+
+case $1 in
+	"all")
+		F=0
+		for T in test/[!_]*.ssa
+		do
+			once $T
+			F=`expr $F + $?`
+		done
+		if test $F -ge 1
+		then
+			echo
+			echo "$F test(s) failed!"
+		else
+			echo
+			echo "All is fine!"
+		fi
+		;;
+	*)
+		once $1
+		exit $?
+		;;
+esac
diff --git a/src/test/loop.ssa b/src/test/loop.ssa
new file mode 100644
index 0000000..c8c4ee0
--- /dev/null
+++ b/src/test/loop.ssa
@@ -0,0 +1,23 @@
+# simple looping program
+# sums all integers from 100 to 0
+
+function $test() {
+@start
+
+@loop
+	%s  =w phi @start   0, @loop %s1
+	%n  =w phi @start 100, @loop %n1
+	%s1 =w add %s, %n
+	%n1 =w sub %n, 1
+	jnz %n1, @loop, @end
+
+@end
+	storew %s1, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 5050); }
+# <<<
diff --git a/src/test/mandel.ssa b/src/test/mandel.ssa
new file mode 100644
index 0000000..efefeb3
--- /dev/null
+++ b/src/test/mandel.ssa
@@ -0,0 +1,123 @@
+# Print the Mandelbrot set on the
+# terminal line output.
+
+function w $mandel(d %x, d %y) {
+@mandel
+	%cr =d sub %y, d_0.5
+	%ci =d copy %x
+@loop
+	%i =w phi @mandel 0, @loop1 %i1
+	%zr =d phi @mandel d_0, @loop1 %zr1
+	%zi =d phi @mandel d_0, @loop1 %zi1
+	%i1 =w add 1, %i
+	%tmp =d mul %zr, %zi
+	%zr2 =d mul %zr, %zr
+	%zi2 =d mul %zi, %zi
+	%zrx =d sub %zr2, %zi2
+	%zr1 =d add %zrx, %cr
+	%zix =d add %tmp, %tmp
+	%zi1 =d add %zix, %ci
+	%sum =d add %zi2, %zr2
+	%cmp1 =w cgtd %sum, d_16
+	jnz %cmp1, @reti, @loop1
+@loop1
+	%cmp2 =w csgtw %i1, 1000
+	jnz %cmp2, @ret0, @loop
+@reti
+	ret %i1
+@ret0
+	ret 0
+}
+
+function w $main() {
+@main
+@loopy
+	%y =d phi @main d_-1, @loopy1 %y1
+@loopx
+	%x =d phi @loopy d_-1, @loopx1 %x1
+	%i =w call $mandel(d %x, d %y)
+	jnz %i, @out, @in
+@in
+	%r0 =w call $putchar(w 42)   # '*'
+	jmp @loopx1
+@out
+	%r1 =w call $putchar(w 32)   # ' '
+	jmp @loopx1
+@loopx1
+	%x1 =d add %x, d_0.032
+	%cmp1 =w cgtd %x1, d_1
+	jnz %cmp1, @loopy1, @loopx
+@loopy1
+	%r2 =w call $putchar(w 10)   # '\n'
+	%y1 =d add %y, d_0.032
+	%cmp2 =w cgtd %y1, d_1
+	jnz %cmp2, @ret, @loopy
+@ret
+	ret 0
+}
+
+# >>> output
+#                                                                #
+#                                                                #
+#                                                                #
+#                                                                #
+#                                *                               #
+#                               ****                             #
+#                               ****                             #
+#                               ***                              #
+#                              *****                             #
+#                             *********                          #
+#                           ************                         #
+#                        *****************                       #
+#                         ****************                       #
+#                         ***************                        #
+#                         ****************                       #
+#                         ****************                       #
+#                        *****************                       #
+#                         ****************                       #
+#                         ****************                       #
+#                          **************                        #
+#                          *************                         #
+#                           ************                         #
+#                            *********                           #
+#                              *****                             #
+#                            ***********                         #
+#                         *****************                      #
+#                      **********************                    #
+#                   * *********************** **                 #
+#                   ***************************                  #
+#                  *****************************                 #
+#               * *******************************  **            #
+#              ** ***********************************            #
+#              *********************************** *             #
+#               ***********************************              #
+#              *************************************             #
+#              *************************************             #
+#             ***************************************            #
+#             ***************************************            #
+#             ***************************************            #
+#             ****************************************           #
+#       *     ****************************************           #
+#       ********************************************** ****      #
+#       ****************************************************     #
+#     * *****************************************************    #
+#     * *****************************************************    #
+#       ***** **************************************** ****      #
+#         *   ****************************************    *      #
+#             ****************************************           #
+#             ***************************************            #
+#            ****************************************            #
+#              ***************************************           #
+#             ****************************************           #
+#               ************************************             #
+#               ***********************************              #
+#                *********************************               #
+#               ************************************             #
+#               *** ************* ************** ***             #
+#                    ***********   ************   **             #
+#                      ********      ********                    #
+#                     **                *   *                    #
+#                                                                #
+#                                                                #
+#                                                                #
+# <<<
diff --git a/src/test/max.ssa b/src/test/max.ssa
new file mode 100644
index 0000000..547e9d4
--- /dev/null
+++ b/src/test/max.ssa
@@ -0,0 +1,33 @@
+# find the maximum value
+# in a nul-terminated array
+# of unsigned bytes
+#
+# the output is stored in $a
+
+data $arr = { b 10, b -60, b 10, b 100, b 200, b 0 }
+
+function $test() {
+@start
+@loop
+	%max =w phi @start -1, @new %byt, @old %max
+	%loc =l phi @start $arr,  @new %loc1, @old %loc1
+	%byt =w loadub %loc
+	%loc1 =l add 1, %loc
+	jnz %byt, @iter, @end
+@iter
+	%cmp =w cslew %max, %byt
+	jnz %cmp, @new, @old
+@new
+	jmp @loop
+@old
+	jmp @loop
+@end
+	storew %max, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 200); }
+# <<<
diff --git a/src/test/prime.ssa b/src/test/prime.ssa
new file mode 100644
index 0000000..12d0273
--- /dev/null
+++ b/src/test/prime.ssa
@@ -0,0 +1,32 @@
+# find the 10,001st prime
+# store it in a
+
+function $test() {
+@start
+@loop
+	%n =w phi @start 5, @tloop %n, @yes %n1
+	%p =w phi @start 13, @tloop %p1, @yes %p1
+	%p1 =w add %p, 2
+@tloop
+	%t =w phi @loop 3, @next %t1
+	%r =w rem %p, %t
+	jnz %r, @next, @loop
+@next
+	%t1 =w add 2, %t
+	%tsq =w mul %t1, %t1
+	%c0 =w csgtw %tsq, %p
+	jnz %c0, @yes, @tloop
+@yes
+	%n1 =w add 1, %n
+	%c1 =w ceqw 10001, %n1
+	jnz %c1, @end, @loop
+@end
+	storew %p, $a
+	ret
+}
+
+# >>> driver
+# extern void test(void);
+# int a;
+# int main() { test(); return !(a == 104743); }
+# <<<
diff --git a/src/test/puts10.ssa b/src/test/puts10.ssa
new file mode 100644
index 0000000..1dcf227
--- /dev/null
+++ b/src/test/puts10.ssa
@@ -0,0 +1,29 @@
+function $main() {
+@start
+	%y  =l alloc4 4
+	%y1 =l add %y, 1
+	storeb 0, %y1
+@loop
+	%n =w phi @start 0, @loop %n1
+	%c =w add %n, 48
+	storeb %c, %y
+	%r =w call $puts(l %y)
+	%n1 =w add %n, 1
+	%cmp =w cslew %n1, 9
+	jnz %cmp, @loop, @end
+@end
+	ret
+}
+
+# >>> output
+# 0
+# 1
+# 2
+# 3
+# 4
+# 5
+# 6
+# 7
+# 8
+# 9
+# <<<
diff --git a/src/test/sum.ssa b/src/test/sum.ssa
new file mode 100644
index 0000000..266054e
--- /dev/null
+++ b/src/test/sum.ssa
@@ -0,0 +1,31 @@
+# Simple test for addressing modes.
+
+function w $sum(l %arr, w %num) {
+@start
+@loop
+	%n1 =w phi @start %num, @loop1 %n2
+	%s0 =w phi @start 0, @loop1 %s1
+	%n2 =w sub %n1, 1
+	%c =w cslew %n1, 0
+	jnz %c, @end, @loop1
+@loop1
+	%idx0 =l extsw %n2
+	%idx1 =l mul 4, %idx0
+	%idx2 =l add %idx1, %arr
+	%w =w loadw %idx2
+	%s1 =w add %w, %s0
+	jmp @loop
+@end
+	ret %s0
+}
+
+# >>> driver
+# extern int sum(int *, int);
+# int arr[] = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 };
+# #define N sizeof arr / sizeof arr[0]
+# int main() {
+# 	int i, s;
+# 	for (s=i=0; i<N; i++) s+=arr[i];
+# 	return !(sum(arr, N) == s);
+# }
+# <<<
diff --git a/src/tools/abi.ml b/src/tools/abi.ml
new file mode 100644
index 0000000..d845c74
--- /dev/null
+++ b/src/tools/abi.ml
@@ -0,0 +1,532 @@
+(* fuzzer *)
+
+type _ bty =
+  | Char: int bty
+  | Short: int bty
+  | Int: int bty
+  | Long: int bty
+  | Float: float bty
+  | Double: float bty
+
+type _ sty =
+  | Field: 'a bty * 'b sty -> ('a * 'b) sty
+  | Empty: unit sty
+
+type _ aty =
+  | Base: 'a bty -> 'a aty
+  | Struct: 'a sty -> 'a aty
+
+type anyb = AB: _ bty -> anyb (* kinda boring... *)
+type anys = AS: _ sty -> anys
+type anya = AA: _ aty -> anya
+type testb = TB: 'a bty * 'a -> testb
+type testa = TA: 'a aty * 'a -> testa
+
+
+let align a x =
+  let m = x mod a in
+  if m <> 0 then x + (a-m) else x
+
+let btysize: type a. a bty -> int = function
+  | Char -> 1
+  | Short -> 2
+  | Int -> 4
+  | Long -> 8
+  | Float -> 4
+  | Double -> 8
+
+let btyalign = btysize
+
+let styempty: type a. a sty -> bool = function
+  | Field _ -> false
+  | Empty -> true
+
+let stysize s =
+  let rec f: type a. int -> a sty -> int =
+    fun sz -> function
+    | Field (b, s) ->
+      let a = btyalign b in
+      f (align a sz + btysize b) s
+    | Empty -> sz in
+  f 0 s
+
+let rec styalign: type a. a sty -> int = function
+  | Field (b, s) -> max (btyalign b) (styalign s)
+  | Empty -> 1
+
+
+(* Generate types and test vectors. *)
+module Gen = struct
+  module R = Random
+
+  let init = function
+    | None ->
+      let f = open_in "/dev/urandom" in
+      let seed =
+        Char.code (input_char f) lsl 8 +
+        Char.code (input_char f) in
+      close_in f;
+      R.init seed;
+      seed
+    | Some seed ->
+      R.init seed;
+      seed
+
+  let int sz =
+    let bound = 1 lsl (8 * min sz 3 - 1) in
+    let i = R.int bound in
+    if R.bool () then - i else i
+
+  let float () =
+    let f = R.float 1000. in
+    if R.bool () then -. f else f
+
+  let testv: type a. a aty -> a =
+    let tb: type a. a bty -> a = function (* eh, dry... *)
+      | Float  -> float ()
+      | Double -> float ()
+      | Char   -> int (btysize Char)
+      | Short  -> int (btysize Short)
+      | Int    -> int (btysize Int)
+      | Long   -> int (btysize Long) in
+    let rec ts: type a. a sty -> a = function
+      | Field (b, s) -> (tb b, ts s)
+      | Empty -> () in
+    function
+    | Base b -> tb b
+    | Struct s -> ts s
+
+  let b () = (* uniform *)
+    match R.int 6 with
+    | 0 -> AB Char
+    | 1 -> AB Short
+    | 2 -> AB Int
+    | 3 -> AB Long
+    | 4 -> AB Float
+    | _ -> AB Double
+
+  let smax = 5      (* max elements in structs *)
+  let structp = 0.3 (* odds of having a struct type *)
+  let amax = 8      (* max function arguments *)
+
+  let s () =
+    let rec f n =
+      if n = 0 then AS Empty else
+      let AB bt = b () in
+      let AS st = f (n-1) in
+      AS (Field (bt, st)) in
+    f (1 + R.int (smax-1))
+
+  let a () =
+    if R.float 1.0 > structp then
+      let AB bt = b () in
+      AA (Base bt)
+    else
+      let AB bt = b () in
+      let AS st = s () in
+      AA (Struct (Field (bt, st)))
+
+  let test () =
+    let AA ty = a () in
+    let t = testv ty in
+    TA (ty, t)
+
+  let tests () =
+    let rec f n =
+      if n = 0 then [] else
+      test () :: f (n-1) in
+    f (R.int amax)
+
+end
+
+
+(* Code generation for C *)
+module OutC = struct
+  open Printf
+
+  let ctypelong oc name =
+    let cb: type a. a bty -> unit = function
+      | Char   -> fprintf oc "char"
+      | Short  -> fprintf oc "short"
+      | Int    -> fprintf oc "int"
+      | Long   -> fprintf oc "long"
+      | Float  -> fprintf oc "float"
+      | Double -> fprintf oc "double" in
+    let rec cs: type a. int -> a sty -> unit =
+      fun i -> function
+      | Field (b, s) ->
+        cb b;
+        fprintf oc " f%d; " i;
+        cs (i+1) s;
+      | Empty -> () in
+    function
+    | Base b ->
+      cb b;
+    | Struct s ->
+      fprintf oc "struct %s { " name;
+      cs 1 s;
+      fprintf oc "}";
+      ()
+
+  let ctype: type a. out_channel -> string -> a aty -> unit =
+    fun oc name -> function
+    | Struct _ -> fprintf oc "struct %s" name
+    | t -> ctypelong oc "" t
+
+  let base: type a. out_channel -> a bty * a -> unit =
+    fun oc -> function
+    | Char, i   -> fprintf oc "%d" i
+    | Short, i  -> fprintf oc "%d" i
+    | Int, i    -> fprintf oc "%d" i
+    | Long, i   -> fprintf oc "%d" i
+    | Float, f  -> fprintf oc "%ff" f
+    | Double, f -> fprintf oc "%f" f
+
+  let init oc name (TA (ty, t)) =
+    let inits s =
+      let rec f: type a. a sty * a -> unit = function
+        | Field (b, s), (tb, ts) ->
+          base oc (b, tb);
+          fprintf oc ", ";
+          f (s, ts)
+        | Empty, () -> () in
+      fprintf oc "{ ";
+      f s;
+      fprintf oc "}"; in
+    ctype oc name ty;
+    fprintf oc " %s = " name;
+    begin match (ty, t) with
+    | Base b, tb -> base oc (b, tb)
+    | Struct s, ts -> inits (s, ts)
+    end;
+    fprintf oc ";\n";
+    ()
+
+  let extension = ".c"
+
+  let comment oc s =
+    fprintf oc "/* %s */\n" s
+
+  let prelude oc = List.iter (fprintf oc "%s\n")
+    [ "#include <stdio.h>"
+    ; "#include <stdlib.h>"
+    ; ""
+    ; "static void fail(char *chk)"
+    ; "{"
+    ; "\tfprintf(stderr, \"fail: checking %s\\n\", chk);"
+    ; "\tabort();"
+    ; "}"
+    ; ""
+    ]
+
+  let typedef oc name = function
+    | TA (Struct ts, _) ->
+      ctypelong oc name (Struct ts);
+      fprintf oc ";\n";
+    | _ -> ()
+
+  let check oc name =
+    let chkbase: type a. string -> a bty * a -> unit =
+      fun name t ->
+        fprintf oc "\tif (%s != " name;
+        base oc t;
+        fprintf oc ")\n\t\tfail(%S);\n" name; in
+    function
+    | TA (Base b, tb) -> chkbase name (b, tb)
+    | TA (Struct s, ts) ->
+      let rec f: type a. int -> a sty * a -> unit =
+        fun i -> function
+        | Field (b, s), (tb, ts) ->
+          chkbase (Printf.sprintf "%s.f%d" name i) (b, tb);
+          f (i+1) (s, ts);
+        | Empty, () -> () in
+      f 1 (s, ts)
+
+  let argname i = "arg" ^ string_of_int (i+1)
+
+  let proto oc (TA (tret, _)) args =
+    ctype oc "ret" tret;
+    fprintf oc " f(";
+    let narg = List.length args in
+    List.iteri (fun i (TA (targ, _)) ->
+      ctype oc (argname i) targ;
+      fprintf oc " %s" (argname i);
+      if i <> narg-1 then
+        fprintf oc ", ";
+    ) args;
+    fprintf oc ")";
+    ()
+
+  let caller oc ret args =
+    let narg = List.length args in
+    prelude oc;
+    typedef oc "ret" ret;
+    List.iteri (fun i arg ->
+      typedef oc (argname i) arg;
+    ) args;
+    proto oc ret args;
+    fprintf oc ";\n\nint main()\n{\n";
+    List.iteri (fun i arg ->
+      fprintf oc "\t";
+      init oc (argname i) arg;
+    ) args;
+    fprintf oc "\t";
+    let TA (tret, _) = ret in
+    ctype oc "ret" tret;
+    fprintf oc " ret;\n\n";
+    fprintf oc "\tret = f(";
+    List.iteri (fun i _ ->
+      fprintf oc "%s" (argname i);
+      if i <> narg-1 then
+        fprintf oc ", ";
+    ) args;
+    fprintf oc ");\n";
+    check oc "ret" ret;
+    fprintf oc "\n\treturn 0;\n}\n";
+    ()
+
+  let callee oc ret args =
+    prelude oc;
+    typedef oc "ret" ret;
+    List.iteri (fun i arg ->
+      typedef oc (argname i) arg;
+    ) args;
+    fprintf oc "\n";
+    proto oc ret args;
+    fprintf oc "\n{\n\t";
+    init oc "ret" ret;
+    fprintf oc "\n";
+    List.iteri (fun i arg ->
+      check oc (argname i) arg;
+    ) args;
+    fprintf oc "\n\treturn ret;\n}\n";
+    ()
+
+end
+
+(* Code generation for QBE *)
+module OutIL = struct
+  open Printf
+
+  let comment oc s =
+    fprintf oc "# %s\n" s
+
+  let tmp, lbl =
+    let next = ref 0 in
+    (fun () -> incr next; "%t" ^ (string_of_int !next)),
+    (fun () -> incr next; "@l" ^ (string_of_int !next))
+
+  let bvalue: type a. a bty * a -> string = function
+    | Char, i   -> sprintf "%d" i
+    | Short, i  -> sprintf "%d" i
+    | Int, i    -> sprintf "%d" i
+    | Long, i   -> sprintf "%d" i
+    | Float, f  -> sprintf "s_%f" f
+    | Double, f -> sprintf "d_%f" f
+
+  let btype: type a. a bty -> string = function
+    | Char   -> "w"
+    | Short  -> "w"
+    | Int    -> "w"
+    | Long   -> "l"
+    | Float  -> "s"
+    | Double -> "d"
+
+  let extension = ".ssa"
+
+  let argname i = "arg" ^ string_of_int (i+1)
+
+  let siter oc base s g =
+    let rec f: type a. int -> int -> a sty * a -> unit =
+      fun id off -> function
+      | Field (b, s), (tb, ts) ->
+        let off = align (btyalign b) off in
+        let addr = tmp () in
+        fprintf oc "\t%s =l add %d, %s\n" addr off base;
+        g id addr (TB (b, tb));
+        f (id + 1) (off + btysize b) (s, ts);
+     | Empty, () -> () in
+   f 0 0 s
+
+  let bmemtype b =
+    if AB b = AB Char  then "b" else
+    if AB b = AB Short then "h" else
+    btype b
+
+  let init oc = function
+    | TA (Base b, tb) -> bvalue (b, tb)
+    | TA (Struct s, ts) ->
+      let base = tmp () in
+      fprintf oc "\t%s =l alloc%d %d\n"
+        base (styalign s) (stysize s);
+      siter oc base (s, ts)
+      begin fun _ addr (TB (b, tb)) ->
+        fprintf oc "\tstore%s %s, %s\n"
+          (bmemtype b) (bvalue (b, tb)) addr;
+      end;
+      base
+
+  let check oc id name =
+    let bcheck = fun id name (b, tb) ->
+      let tcmp = tmp () in
+      let nxtl = lbl () in
+      fprintf oc "\t%s =w ceq%s %s, %s\n"
+        tcmp (btype b) name (bvalue (b, tb));
+      fprintf oc "\tstorew %d, %%failcode\n" id;
+      fprintf oc "\tjnz %s, %s, @fail\n" tcmp nxtl;
+      fprintf oc "%s\n" nxtl; in
+    function
+    | TA (Base Char, i) ->
+      let tval = tmp () in
+      fprintf oc "\t%s =w extsb %s\n" tval name;
+      bcheck id tval (Int, i)
+    | TA (Base Short, i) ->
+      let tval = tmp () in
+      fprintf oc "\t%s =w extsh %s\n" tval name;
+      bcheck id tval (Int, i)
+    | TA (Base b, tb) ->
+      bcheck id name (b, tb)
+    | TA (Struct s, ts) ->
+      siter oc name (s, ts)
+      begin fun id' addr (TB (b, tb)) ->
+        let tval = tmp () in
+        let lsuffix =
+          if AB b = AB Char  then "sb" else
+          if AB b = AB Short then "sh" else
+          "" in
+        fprintf oc "\t%s =%s load%s %s\n"
+          tval (btype b) lsuffix addr;
+        bcheck (100*id + id'+1) tval (b, tb);
+      end;
+      ()
+
+  let ttype name = function
+    | TA (Base b, _)   -> btype b
+    | TA (Struct _, _) -> ":" ^ name
+
+  let typedef oc name =
+    let rec f: type a. a sty -> unit = function
+      | Field (b, s) ->
+        fprintf oc "%s" (bmemtype b);
+        if not (styempty s) then
+          fprintf oc ", ";
+        f s;
+      | Empty -> () in
+    function
+    | TA (Struct ts, _) ->
+      fprintf oc "type :%s = { " name;
+      f ts;
+      fprintf oc " }\n";
+    | _ -> ()
+
+  let postlude oc = List.iter (fprintf oc "%s\n")
+    [ "@fail"
+    ;  "# failure code"
+    ; "\t%fcode =w loadw %failcode"
+    ; "\t%f0 =w call $printf(l $failstr, w %fcode)"
+    ; "\t%f1 =w call $abort()"
+    ; "\tret 0"
+    ; "}"
+    ; ""
+    ; "data $failstr = { b \"fail on check %d\\n\", b 0 }"
+    ]
+
+  let caller oc ret args =
+    let narg = List.length args in
+    List.iteri (fun i arg ->
+      typedef oc (argname i) arg;
+    ) args;
+    typedef oc "ret" ret;
+    fprintf oc "\nfunction w $main() {\n";
+    fprintf oc "@start\n";
+    fprintf oc "\t%%failcode =l alloc4 4\n";
+    let targs = List.mapi (fun i arg ->
+      comment oc ("define argument " ^ (string_of_int (i+1)));
+      (ttype (argname i) arg, init oc arg)
+    ) args in
+    comment oc "call test function";
+    fprintf oc "\t%%ret =%s call $f(" (ttype "ret" ret);
+    List.iteri (fun i (ty, tmp) ->
+      fprintf oc "%s %s" ty tmp;
+      if i <> narg-1 then
+        fprintf oc ", ";
+    ) targs;
+    fprintf oc ")\n";
+    comment oc "check the return value";
+    check oc 0 "%ret" ret;
+    fprintf oc "\tret 0\n";
+    postlude oc;
+    ()
+
+  let callee oc ret args =
+    let narg = List.length args in
+    List.iteri (fun i arg ->
+      typedef oc (argname i) arg;
+    ) args;
+    typedef oc "ret" ret;
+    fprintf oc "\nfunction %s $f(" (ttype "ret" ret);
+    List.iteri (fun i arg ->
+      let a = argname i in
+      fprintf oc "%s %%%s" (ttype a arg) a;
+      if i <> narg-1 then
+        fprintf oc ", ";
+    ) args;
+    fprintf oc ") {\n";
+    fprintf oc "@start\n";
+    fprintf oc "\t%%failcode =l alloc4 4\n";
+    List.iteri (fun i arg ->
+      comment oc ("checking argument " ^ (string_of_int (i+1)));
+      check oc (i+1) ("%" ^ argname i) arg;
+    ) args;
+    comment oc "define the return value";
+    let rettmp = init oc ret in
+    fprintf oc "\tret %s\n" rettmp;
+    postlude oc;
+    ()
+
+end
+
+
+module type OUT = sig
+  val extension: string
+  val comment: out_channel -> string -> unit
+  val caller: out_channel -> testa -> testa list -> unit
+  val callee: out_channel -> testa -> testa list -> unit
+end
+
+let _ =
+  let usage code =
+    Printf.eprintf "usage: abi.ml [-s SEED] DIR {c,ssa} {c,ssa}\n";
+    exit code in
+
+  let outmod = function
+    | "c"   -> (module OutC : OUT)
+    | "ssa" -> (module OutIL: OUT)
+    | _ -> usage 1 in
+
+  let seed, dir, mcaller, mcallee =
+    match Sys.argv with
+    | [| _; "-s"; seed; dir; caller; callee |] ->
+      let seed =
+        try Some (int_of_string seed) with
+        Failure _ -> usage 1 in
+      seed, dir, outmod caller, outmod callee
+    | [| _; dir; caller; callee |] ->
+      None, dir, outmod caller, outmod callee
+    | [| _; "-h" |] ->
+      usage 0
+    | _ ->
+      usage 1 in
+
+  let seed = Gen.init seed in
+  let tret = Gen.test () in
+  let targs = Gen.tests () in
+  let module OCaller = (val mcaller : OUT) in
+  let module OCallee = (val mcallee : OUT) in
+  let ocaller = open_out (dir ^ "/caller" ^ OCaller.extension) in
+  let ocallee = open_out (dir ^ "/callee" ^ OCallee.extension) in
+  OCaller.comment ocaller (Printf.sprintf "seed %d" seed);
+  OCallee.comment ocallee (Printf.sprintf "seed %d" seed);
+  OCaller.caller ocaller tret targs;
+  OCallee.callee ocallee tret targs;
+  ()
diff --git a/src/tools/abitest.sh b/src/tools/abitest.sh
new file mode 100755
index 0000000..d5b16e5
--- /dev/null
+++ b/src/tools/abitest.sh
@@ -0,0 +1,104 @@
+#!/bin/sh
+
+OCAMLC=/usr/bin/ocamlc
+QBE=`pwd`/qbe
+
+failure() {
+	echo "Failure at stage:" $1 >&2
+	exit 1
+}
+
+cleanup() {
+	rm -fr $TMP
+}
+
+init() {
+	cp tools/abi.ml $TMP
+	pushd $TMP > /dev/null
+
+	cat > Makefile << EOM
+
+.PHONY: test
+test: caller.o callee.o
+	c99 -o \$@ caller.o callee.o
+%.o: %.c
+	c99 -c -o \$@ \$<
+%.o: %.ssa
+	$QBE -o \$*.s \$<
+	c99 -c -o \$@ \$*.s
+
+EOM
+
+	if ! $OCAMLC abi.ml -o gentest
+	then
+		popd > /dev/null
+		cleanup
+		failure "abifuzz compilation"
+	fi
+	popd > /dev/null
+}
+
+once() {
+	if test -z "$3"
+	then
+		$TMP/gentest $TMP $1 $2
+	else
+		$TMP/gentest -s $3 $TMP $1 $2
+	fi
+	make -C $TMP test > /dev/null || failure "building"
+	$TMP/test || failure "runtime"
+}
+
+usage() {
+	echo "usage: abitest.sh [-callssa] [-callc] [-s SEED] [-n ITERATIONS]" >&2
+	exit 1
+}
+
+N=1
+CALLER=c
+CALLEE=ssa
+
+while test -n "$1"
+do
+	case "$1" in
+	"-callssa")
+		;;
+	"-callc")
+		CALLER=ssa
+		CALLEE=c
+		;;
+	"-s")
+		test -n "$2" || usage
+		shift
+		SEED="$1"
+		;;
+	"-n")
+		test -n "$2" || usage
+		shift
+		N="$1"
+		;;
+	*)
+		usage
+		;;
+	esac
+	shift
+done
+
+TMP=`mktemp -d abifuzz.XXXXXX`
+
+init
+
+if test -n "$S"
+then
+	once $CALLER $CALLEE $SEED
+else
+	for n in `seq $N`
+	do
+		once $CALLER $CALLEE
+		echo "$n" | grep "00$"
+	done
+fi
+
+echo "All done."
+
+cleanup
diff --git a/src/tools/fptox.c b/src/tools/fptox.c
new file mode 100644
index 0000000..a2bc155
--- /dev/null
+++ b/src/tools/fptox.c
@@ -0,0 +1,18 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main(int ac, char *av[])
+{
+	double d;
+	float f;
+
+	if (ac < 2) {
+	usage:
+		fputs("usage: fptox NUMBER\n", stderr);
+		return 1;
+	}
+	f = d = strtod(av[1], 0);
+	printf("0x%08x 0x%016llx\n", *(unsigned *)&f, *(unsigned long long*)&d);
+	return 0;
+}
diff --git a/src/tools/pmov.c b/src/tools/pmov.c
new file mode 100644
index 0000000..efbecd7
--- /dev/null
+++ b/src/tools/pmov.c
@@ -0,0 +1,252 @@
+/*% rm -f rega.o main.o && cc -g -std=c99 -Wall -DTEST_PMOV -o # % *.o
+ *
+ * This is a test framwork for the dopm() function
+ * in rega.c, use it when you want to modify it or
+ * all the parallel move functions.
+ *
+ * You might need to decrease NIReg to see it
+ * terminate, I used NIReg == 7 at most.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void assert_test(char *, int), fail(void), iexec(int *);
+
+#include "../rega.c"
+
+static RMap mbeg;
+static Ins ins[NIReg], *ip;
+static Blk dummyb = { .ins = ins };
+
+int
+main()
+{
+	Ins *i1;
+	unsigned long long tm, rm, cnt;
+	RMap mend;
+	int reg[NIReg], val[NIReg+1];
+	int t, i, r, nr;
+
+	tmp = (Tmp[Tmp0+NIReg]){{{0}}};
+	for (t=0; t<Tmp0+NIReg; t++)
+		if (t >= Tmp0) {
+			tmp[t].cls = Kw;
+			tmp[t].hint.r = -1;
+			tmp[t].hint.m = 0;
+			tmp[t].slot = -1;
+			sprintf(tmp[t].name, "tmp%d", t-Tmp0+1);
+		}
+
+	bsinit(mbeg.b, Tmp0+NIReg);
+	bsinit(mend.b, Tmp0+NIReg);
+	cnt = 0;
+	for (tm = 0; tm < 1ull << (2*NIReg); tm++) {
+		mbeg.n = 0;
+		bszero(mbeg.b);
+		ip = ins;
+
+		/* find what temporaries are in copy and
+		 * wether or not they are in register
+		 */
+		for (t=0; t<NIReg; t++)
+			switch ((tm >> (2*t)) & 3) {
+			case 0:
+				/* not in copy, not in reg */
+				break;
+			case 1:
+				/* not in copy, in reg */
+				radd(&mbeg, Tmp0+t, t+1);
+				break;
+			case 2:
+				/* in copy, not in reg */
+				*ip++ = (Ins){OCopy, TMP(Tmp0+t), {R, R}, Kw};
+				break;
+			case 3:
+				/* in copy, in reg */
+				*ip++ = (Ins){OCopy, TMP(Tmp0+t), {R, R}, Kw};
+				radd(&mbeg, Tmp0+t, t+1);
+				break;
+			}
+
+		if (ip == ins)
+			/* cancel if the parallel move
+			 * is empty
+			 */
+			goto Nxt;
+
+		/* find registers for temporaries
+		 * in mbeg
+		 */
+		nr = ip - ins;
+		rm = (1ull << (nr+1)) - 1;
+		for (i=0; i<nr; i++)
+			reg[i] = i+1;
+
+		for (;;) {
+			/* set registers on copies
+			 */
+			for (i=0, i1=ins; i1<ip; i1++, i++)
+				i1->arg[0] = TMP(reg[i]);
+
+			/* compile the parallel move
+			 */
+			rcopy(&mend, &mbeg);
+			dopm(&dummyb, ip-1, &mend);
+			cnt++;
+
+			/* check that mend contain mappings for
+			 * source registers and does not map any
+			 * assigned temporary, then check that
+			 * all temporaries in mend are mapped in
+			 * mbeg and not used in the copy
+			 */
+			for (i1=ins; i1<ip; i1++) {
+				r = i1->arg[0].val;
+				assert(rfree(&mend, r) == r);
+				t = i1->to.val;
+				assert(!bshas(mend.b, t));
+			}
+			for (i=0; i<mend.n; i++) {
+				t = mend.t[i];
+				assert(bshas(mbeg.b, t));
+				t -= Tmp0;
+				assert(((tm >> (2*t)) & 3) == 1);
+			}
+
+			/* execute the code generated and check
+			 * that all assigned temporaries got their
+			 * value, and that all live variables's
+			 * content got preserved
+			 */
+			 for (i=1; i<=NIReg; i++)
+			 	val[i] = i;
+			 iexec(val);
+			 for (i1=ins; i1<ip; i1++) {
+			 	t = i1->to.val;
+			 	r = rfind(&mbeg, t);
+			 	if (r != -1)
+			 		assert(val[r] == i1->arg[0].val);
+			 }
+			 for (i=0; i<mend.n; i++) {
+			 	t = mend.t[i];
+			 	r = mend.r[i];
+			 	assert(val[t-Tmp0+1] == r);
+			 }
+
+			/* find the next register assignment */
+			i = nr - 1;
+			for (;;) {
+				r = reg[i];
+				rm &= ~(1ull<<r);
+				do
+					r++;
+				while (r <= NIReg && (rm & (1ull<<r)));
+				if (r == NIReg+1) {
+					if (i == 0)
+						goto Nxt;
+					i--;
+				} else {
+					rm |= (1ull<<r);
+					reg[i++] = r;
+					break;
+				}
+			}
+			for (; i<nr; i++)
+				for (r=1; r<=NIReg; r++)
+					if (!(rm & (1ull<<r))) {
+						rm |= (1ull<<r);
+						reg[i] = r;
+						break;
+					}
+		}
+	Nxt:	;
+	}
+	printf("%llu tests successful!\n", cnt);
+	exit(0);
+}
+
+
+/* execute what pmgen() wrote (swap, copy) */
+
+#define validr(r)           \
+	rtype(r) == RTmp && \
+	r.val > 0 &&        \
+	r.val <= NIReg
+
+static void
+iexec(int val[])
+{
+	Ins *i;
+	int t;
+
+	for (i=insb; i<curi; i++)
+		switch (i->op) {
+		default:
+			assert(!"iexec: missing case\n");
+			exit(1);
+		case OSwap:
+			assert(validr(i->arg[0]));
+			assert(validr(i->arg[1]));
+			t = val[i->arg[0].val];
+			val[i->arg[0].val] = val[i->arg[1].val];
+			val[i->arg[1].val] = t;
+			break;
+		case OCopy:
+			assert(validr(i->to));
+			assert(validr(i->arg[0]));
+			val[i->to.val] = val[i->arg[0].val];
+			break;
+		}
+}
+
+
+/* failure diagnostics */
+
+static int re;
+
+static void
+replay()
+{
+	RMap mend;
+
+	re = 1;
+	bsinit(mend.b, Tmp0+NIReg);
+	rcopy(&mend, &mbeg);
+	dopm(&dummyb, ip-1, &mend);
+}
+
+static void
+fail()
+{
+	Ins *i1;
+	int i;
+
+	printf("\nIn registers: ");
+	for (i=0; i<mbeg.n; i++)
+		printf("%s(r%d) ",
+			tmp[mbeg.t[i]].name,
+			mbeg.r[i]);
+	printf("\n");
+	printf("Parallel move:\n");
+	for (i1=ins; i1<ip; i1++)
+		printf("\t %s <- r%d\n",
+			tmp[i1->to.val].name,
+			i1->arg[0].val);
+	replay();
+	abort();
+}
+
+static void
+assert_test(char *s, int x)
+{
+	if (x)
+		return;
+	if (re)
+		abort();
+	printf("!assertion failure: %s\n", s);
+	fail();
+}
+
+/* symbols required by the linker */
+char debug['Z'+1];
diff --git a/src/tools/regress.sh b/src/tools/regress.sh
new file mode 100755
index 0000000..4106b00
--- /dev/null
+++ b/src/tools/regress.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+for t in test/*
+do
+	printf "Test $t ... "
+
+	./qbe   $t >/tmp/out.0 2>&1
+	./qbe.1 $t >/tmp/out.1 2>&1
+
+	if diff /tmp/out.0 /tmp/out.1 > /dev/null
+	then
+		echo "OK"
+	else
+		echo "KO"
+		break
+	fi
+done
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..65b3ff8
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,329 @@
+#include "all.h"
+
+typedef struct Bitset Bitset;
+typedef struct Vec Vec;
+
+struct Vec {
+	ulong mag;
+	size_t esz;
+	ulong cap;
+	union {
+		long long ll;
+		long double ld;
+		void *ptr;
+	} align[];
+};
+
+enum {
+	VMin = 2,
+	VMag = 0xcabba9e,
+	NPtr = 256,
+};
+
+Typ typ[NTyp];
+Ins insb[NIns], *curi;
+
+static void *ptr[NPtr];
+static void **pool = ptr;
+static int nptr = 1;
+
+void
+diag(char *s)
+{
+	fputs(s, stderr);
+	fputc('\n', stderr);
+	abort();
+}
+
+void *
+emalloc(size_t n)
+{
+	void *p;
+
+	p = calloc(1, n);
+	if (!p)
+		diag("emalloc: out of memory");
+	return p;
+}
+
+void *
+alloc(size_t n)
+{
+	void **pp;
+
+	if (n == 0)
+		return 0;
+	if (nptr >= NPtr) {
+		pp = emalloc(NPtr * sizeof(void *));
+		pp[0] = pool;
+		pool = pp;
+		nptr = 1;
+	}
+	return pool[nptr++] = emalloc(n);
+}
+
+void
+freeall()
+{
+	void **pp;
+
+	for (;;) {
+		for (pp = &pool[1]; pp < &pool[nptr]; pp++)
+			free(*pp);
+		pp = pool[0];
+		if (!pp)
+			break;
+		free(pool);
+		pool = pp;
+		nptr = NPtr;
+	}
+	nptr = 1;
+}
+
+Blk *
+blknew()
+{
+	static Blk z;
+	Blk *b;
+
+	b = alloc(sizeof *b);
+	*b = z;
+	return b;
+}
+
+void
+emit(int op, int k, Ref to, Ref arg0, Ref arg1)
+{
+	if (curi == insb)
+		diag("emit: too many instructions");
+	*--curi = (Ins){
+		.op = op, .cls = k,
+		.to = to, .arg = {arg0, arg1}
+	};
+}
+
+void
+emiti(Ins i)
+{
+	emit(i.op, i.cls, i.to, i.arg[0], i.arg[1]);
+}
+
+void
+idup(Ins **pd, Ins *s, ulong n)
+{
+	*pd = alloc(n * sizeof(Ins));
+	memcpy(*pd, s, n * sizeof(Ins));
+}
+
+Ins *
+icpy(Ins *d, Ins *s, ulong n)
+{
+	memcpy(d, s, n * sizeof(Ins));
+	return d + n;
+}
+
+void *
+vnew(ulong len, size_t esz)
+{
+	ulong cap;
+	Vec *v;
+
+	for (cap=VMin; cap<len; cap*=2)
+		;
+	v = alloc(cap * esz + sizeof(Vec));
+	v->mag = VMag;
+	v->cap = cap;
+	v->esz = esz;
+	return v + 1;
+}
+
+void
+vgrow(void *vp, ulong len)
+{
+	Vec *v;
+	void *v1;
+
+	v = *(Vec **)vp - 1;
+	assert(v+1 && v->mag == VMag);
+	if (v->cap >= len)
+		return;
+	v1 = vnew(len, v->esz);
+	memcpy(v1, v+1, v->cap * v->esz);
+	*(Vec **)vp = v1;
+}
+
+int
+phicls(int t, Tmp *tmp /*, int c*/)
+{
+	if (tmp[t].phi)
+		return tmp[t].phi;
+	return t;
+#if 0
+	int t1;
+
+	t1 = tmp[t].phi;
+	if (!t1)
+		t1 = t;
+	if (t != t1) {
+		t1 = phitmp(t1, tmp, c);
+		if (c)
+			tmp[t].phi = t1;
+	}
+	return t1;
+#endif
+}
+
+Ref
+newtmp(char *prfx, int k,  Fn *fn)
+{
+	static int n;
+	int t;
+
+	t = fn->ntmp++;
+	vgrow(&fn->tmp, fn->ntmp);
+	sprintf(fn->tmp[t].name, "%s%d", prfx, ++n);
+	fn->tmp[t].cls = k;
+	fn->tmp[t].slot = -1;
+	fn->tmp[t].nuse = +1;
+	fn->tmp[t].ndef = +1;
+	return TMP(t);
+}
+
+Ref
+getcon(int64_t val, Fn *fn)
+{
+	int c;
+
+	for (c=0; c<fn->ncon; c++)
+		if (fn->con[c].type == CBits && fn->con[c].bits.i == val)
+			return CON(c);
+	fn->ncon++;
+	vgrow(&fn->con, fn->ncon);
+	fn->con[c] = (Con){.type = CBits, .bits.i = val};
+	return CON(c);
+}
+
+void
+addcon(Con *c0, Con *c1)
+{
+	if (c0->type == CUndef)
+		*c0 = *c1;
+	else {
+		if (c1->type == CAddr) {
+			if (c0->type == CAddr)
+				diag("addcon: adding two addresses");
+			c0->type = CAddr;
+			strcpy(c0->label, c1->label);
+		}
+		c0->bits.i += c1->bits.i;
+	}
+}
+
+void
+bsinit(BSet *bs, uint n)
+{
+	n = (n + NBit-1) / NBit;
+	bs->nt = n;
+	bs->t = alloc(n * sizeof bs->t[0]);
+}
+
+uint
+bscount(BSet *bs)
+{
+	uint i, j, n;
+
+	n = 0;
+	for (i=0; i<bs->nt; i++)
+		for (j=0; j<NBit; j++)
+			if (bs->t[i] & BIT(j))
+				n++;
+	return n;
+}
+
+static inline uint
+bsmax(BSet *bs)
+{
+	return bs->nt * NBit;
+}
+
+void
+bsset(BSet *bs, uint elt)
+{
+	assert(elt < bsmax(bs));
+	bs->t[elt/NBit] |= BIT(elt%NBit);
+}
+
+void
+bsclr(BSet *bs, uint elt)
+{
+	assert(elt < bsmax(bs));
+	bs->t[elt/NBit] &= ~BIT(elt%NBit);
+}
+
+#define BSOP(f, op)                           \
+	void                                  \
+	f(BSet *a, BSet *b)                   \
+	{                                     \
+		uint i;                       \
+		                              \
+		assert(a->nt == b->nt);       \
+		for (i=0; i<a->nt; i++)       \
+			a->t[i] op b->t[i];   \
+	}
+
+BSOP(bscopy, =)
+BSOP(bsunion, |=)
+BSOP(bsinter, &=)
+BSOP(bsdiff, &= ~)
+
+int
+bsequal(BSet *a, BSet *b)
+{
+	uint i;
+
+	assert(a->nt == b->nt);
+	for (i=0; i<a->nt; i++)
+		if (a->t[i] != b->t[i])
+			return 0;
+	return 1;
+}
+
+void
+bszero(BSet *bs)
+{
+	memset(bs->t, 0, bs->nt * sizeof bs->t[0]);
+}
+
+/* iterates on a bitset, use as follows
+ *
+ * 	for (i=0; bsiter(set, &i); i++)
+ * 		use(i);
+ *
+ */
+int
+bsiter(BSet *bs, uint *elt)
+{
+	uint i;
+
+	for (i=*elt;; i++) {
+		while (i < bsmax(bs) && !bs->t[i/NBit])
+			i = (i + NBit) & -NBit;
+		if (i >= bsmax(bs))
+			return 0;
+		if (bshas(bs, i)) {
+			*elt = i;
+			return 1;
+		}
+	}
+}
+
+void
+dumpts(BSet *bs, Tmp *tmp, FILE *f)
+{
+	uint t;
+
+	fprintf(f, "[");
+	for (t=Tmp0; bsiter(bs, &t); t++)
+		fprintf(f, " %s", tmp[t].name);
+	fprintf(f, " ]\n");
+}