summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--Makefile3
-rw-r--r--all.h6
-rw-r--r--isel.c515
-rw-r--r--main.c1
-rw-r--r--parse.c65
-rw-r--r--sysv.c512
-rw-r--r--util.c13
7 files changed, 561 insertions, 554 deletions
diff --git a/Makefile b/Makefile
index 447954b..f6497bd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,10 @@
 BIN = qbe
+ABI = sysv
 
 V = @
 OBJDIR = obj
 
-SRC = main.c util.c parse.c mem.c ssa.c copy.c live.c isel.c spill.c rega.c emit.c
+SRC = main.c util.c parse.c mem.c ssa.c copy.c live.c isel.c spill.c rega.c emit.c $(ABI).c
 OBJ = $(SRC:%.c=$(OBJDIR)/%.o)
 
 CFLAGS += -Wall -Wextra -std=c99 -g -pedantic
diff --git a/all.h b/all.h
index 51b641a..a140ef4 100644
--- a/all.h
+++ b/all.h
@@ -495,6 +495,7 @@ void *vnew(ulong, size_t);
 void vgrow(void *, ulong);
 int phicls(int, Tmp *);
 Ref newtmp(char *, int, Fn *);
+void chuse(Ref, int, Fn *);
 Ref getcon(int64_t, Fn *);
 void addcon(Con *, Con *);
 void dumpts(BSet *, Tmp *, FILE *);
@@ -541,11 +542,14 @@ void copy(Fn *);
 void liveon(BSet *, Blk *, Blk *);
 void filllive(Fn *);
 
-/* isel.c */
+/* abi: sysv.c */
 extern int rsave[/* NRSave */];
 extern int rclob[/* NRClob */];
 bits retregs(Ref, int[2]);
 bits argregs(Ref, int[2]);
+void abi(Fn *);
+
+/* isel.c */
 void isel(Fn *);
 
 /* spill.c */
diff --git a/isel.c b/isel.c
index 6d19ccc..f20efb0 100644
--- a/isel.c
+++ b/isel.c
@@ -20,8 +20,6 @@
  */
 
 typedef struct ANum ANum;
-typedef struct AClass AClass;
-typedef struct RAlloc RAlloc;
 
 struct ANum {
 	char n, l, r;
@@ -157,13 +155,6 @@ fixarg(Ref *r, int k, int phi, Fn *fn)
 }
 
 static void
-chuse(Ref r, int du, Fn *fn)
-{
-	if (rtype(r) == RTmp)
-		fn->tmp[r.val].nuse += du;
-}
-
-static void
 seladdr(Ref *r, ANum *an, Fn *fn)
 {
 	Addr a;
@@ -357,146 +348,6 @@ flagi(Ins *i0, Ins *i)
 	return 0;
 }
 
-struct AClass {
-	int inmem;
-	int align;
-	uint size;
-	int cls[2];
-};
-
-static void
-aclass(AClass *a, Typ *t)
-{
-	int e, s, n, cls;
-	uint sz, al;
-
-	sz = t->size;
-	al = 1u << t->align;
-
-	/* the ABI requires sizes to be rounded
-	 * up to the nearest multiple of 8, moreover
-	 * it makes it easy load and store structures
-	 * in registers
-	 */
-	if (al < 8)
-		al = 8;
-	sz = (sz + al-1) & -al;
-
-	a->size = sz;
-	a->align = t->align;
-
-	if (t->dark || sz > 16) {
-		/* large or unaligned structures are
-		 * required to be passed in memory
-		 */
-		a->inmem = 1;
-		return;
-	}
-
-	a->inmem = 0;
-	for (e=0, s=0; e<2; e++) {
-		cls = -1;
-		for (n=0; n<8 && t->seg[s].len; s++) {
-			if (t->seg[s].ispad) {
-				/* don't change anything */
-			}
-			else if (t->seg[s].isflt) {
-				if (cls == -1)
-					cls = Kd;
-			}
-			else
-				cls = Kl;
-			n += t->seg[s].len;
-		}
-		assert(n <= 8);
-		a->cls[e] = cls;
-	}
-}
-
-static void
-blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn)
-{
-	Ref r, r1;
-	uint boff;
-
-	/* it's an impolite blit, we might go across the end
-	 * of the source object a little bit... */
-	for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) {
-		r = newtmp("abi", Kl, fn);
-		r1 = newtmp("abi", Kl, fn);
-		emit(OStorel, 0, R, r, r1);
-		emit(OAdd, Kl, r1, rstk, getcon(soff, fn));
-		r1 = newtmp("abi", Kl, fn);
-		emit(OLoad, Kl, r, r1, R);
-		emit(OAdd, Kl, r1, rsrc, getcon(boff, fn));
-		chuse(rsrc, +1, fn);
-		chuse(rstk, +1, fn);
-	}
-}
-
-static int
-retr(Ref reg[2], AClass *aret)
-{
-	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
-	int n, k, ca, nr[2];
-
-	nr[0] = nr[1] = 0;
-	ca = 0;
-	for (n=0; aret->cls[n]>=0 && n<2; n++) {
-		k = KBASE(aret->cls[n]);
-		reg[n] = TMP(retreg[k][nr[k]++]);
-		ca += 1 << (2 * k);
-	}
-	return ca;
-}
-
-static void
-selret(Blk *b, Fn *fn)
-{
-	int j, k, ca;
-	Ref r, r0, reg[2];
-	AClass aret;
-
-	j = b->jmp.type;
-
-	if (!isret(j) || j == JRet0)
-		return;
-
-	r0 = b->jmp.arg;
-	b->jmp.type = JRet0;
-
-	if (j == JRetc) {
-		aclass(&aret, &typ[fn->retty]);
-		if (aret.inmem) {
-			assert(rtype(fn->retr) == RTmp);
-			emit(OCopy, Kl, TMP(RAX), fn->retr, R);
-			chuse(fn->retr, +1, fn);
-			blit(fn->retr, 0, r0, aret.size, fn);
-			ca = 1;
-		} else {
-			ca = retr(reg, &aret);
-			if (aret.size > 8) {
-				r = newtmp("abi", Kl, fn);
-				emit(OLoad, Kl, reg[1], r, R);
-				emit(OAdd, Kl, r, r0, getcon(8, fn));
-				chuse(r0, +1, fn);
-			}
-			emit(OLoad, Kl, reg[0], r0, R);
-		}
-	} else {
-		k = j - JRetw;
-		if (KBASE(k) == 0) {
-			emit(OCopy, k, TMP(RAX), r0, R);
-			ca = 1;
-		} else {
-			emit(OCopy, k, TMP(XMM0), r0, R);
-			ca = 1 << 2;
-		}
-	}
-
-	b->jmp.arg = CALL(ca);
-}
-
 static void
 seljmp(Blk *b, Fn *fn)
 {
@@ -557,320 +408,6 @@ seljmp(Blk *b, Fn *fn)
 }
 
 static int
-classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret)
-{
-	int nint, ni, nsse, ns, n, *pn;
-	AClass *a;
-	Ins *i;
-
-	if (aret && aret->inmem)
-		nint = 5; /* hidden argument */
-	else
-		nint = 6;
-	nsse = 8;
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (i->op == op) {
-			if (KBASE(i->cls) == 0)
-				pn = &nint;
-			else
-				pn = &nsse;
-			if (*pn > 0) {
-				--*pn;
-				a->inmem = 0;
-			} else
-				a->inmem = 2;
-			a->align = 3;
-			a->size = 8;
-			a->cls[0] = i->cls;
-		} else {
-			n = i->arg[0].val & AMask;
-			aclass(a, &typ[n]);
-			if (a->inmem)
-				continue;
-			ni = ns = 0;
-			for (n=0; n<2; n++)
-				if (KBASE(a->cls[n]) == 0)
-					ni++;
-				else
-					ns++;
-			if (nint >= ni && nsse >= ns) {
-				nint -= ni;
-				nsse -= ns;
-			} else
-				a->inmem = 1;
-		}
-	}
-
-	return ((6-nint) << 4) | ((8-nsse) << 8);
-}
-
-int rsave[] = {
-	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
-	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14
-};
-int rclob[] = {RBX, R12, R13, R14, R15};
-
-MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int));
-MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int));
-
-bits
-retregs(Ref r, int p[2])
-{
-	bits b;
-	int ni, nf;
-
-	assert(rtype(r) == RACall);
-	b = 0;
-	ni = r.val & 3;
-	nf = (r.val >> 2) & 3;
-	if (ni >= 1)
-		b |= BIT(RAX);
-	if (ni >= 2)
-		b |= BIT(RDX);
-	if (nf >= 1)
-		b |= BIT(XMM0);
-	if (nf >= 2)
-		b |= BIT(XMM1);
-	if (p) {
-		p[0] = ni;
-		p[1] = nf;
-	}
-	return b;
-}
-
-bits
-argregs(Ref r, int p[2])
-{
-	bits b;
-	int j, ni, nf;
-
-	assert(rtype(r) == RACall);
-	b = 0;
-	ni = (r.val >> 4) & 15;
-	nf = (r.val >> 8) & 15;
-	for (j=0; j<ni; j++)
-		b |= BIT(rsave[j]);
-	for (j=0; j<nf; j++)
-		b |= BIT(XMM0+j);
-	if (p) {
-		p[0] = ni + 1;
-		p[1] = nf;
-	}
-	return b | BIT(RAX);
-}
-
-static Ref
-rarg(int ty, int *ni, int *ns)
-{
-	if (KBASE(ty) == 0)
-		return TMP(rsave[(*ni)++]);
-	else
-		return TMP(XMM0 + (*ns)++);
-}
-
-struct RAlloc {
-	Ins i;
-	RAlloc *link;
-};
-
-static void
-selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
-{
-	Ins *i;
-	AClass *ac, *a, aret;
-	int ca, ni, ns;
-	uint stk, off;
-	Ref r, r1, r2, reg[2], regcp[2];
-	RAlloc *ra;
-
-	ac = alloc((i1-i0) * sizeof ac[0]);
-	if (!req(i1->arg[1], R)) {
-		assert(rtype(i1->arg[1]) == RAType);
-		aclass(&aret, &typ[i1->arg[1].val & AMask]);
-		ca = classify(i0, i1, ac, OArg, &aret);
-	} else
-		ca = classify(i0, i1, ac, OArg, 0);
-
-	for (stk=0, a=&ac[i1-i0]; a>ac;)
-		if ((--a)->inmem) {
-			assert(a->align <= 4);
-			stk += a->size;
-			if (a->align == 4)
-				stk += stk & 15;
-		}
-	stk += stk & 15;
-	if (stk) {
-		r = getcon(-(int64_t)stk, fn);
-		emit(OSAlloc, Kl, R, r, R);
-	}
-
-	if (!req(i1->arg[1], R)) {
-		if (aret.inmem) {
-			/* get the return location from eax
-			 * it saves one callee-save reg */
-			r1 = newtmp("abi", Kl, fn);
-			emit(OCopy, Kl, i1->to, TMP(RAX), R);
-			ca += 1;
-		} else {
-			if (aret.size > 8) {
-				r = newtmp("abi", Kl, fn);
-				regcp[1] = newtmp("abi", aret.cls[1], fn);
-				emit(OStorel, 0, R, regcp[1], r);
-				emit(OAdd, Kl, r, i1->to, getcon(8, fn));
-				chuse(i1->to, +1, fn);
-				ca += 1 << (2 * KBASE(aret.cls[1]));
-			}
-			regcp[0] = newtmp("abi", aret.cls[0], fn);
-			emit(OStorel, 0, R, regcp[0], i1->to);
-			ca += 1 << (2 * KBASE(aret.cls[0]));
-			retr(reg, &aret);
-			if (aret.size > 8)
-				emit(OCopy, aret.cls[1], regcp[1], reg[1], R);
-			emit(OCopy, aret.cls[0], regcp[0], reg[0], R);
-			r1 = i1->to;
-		}
-		/* allocate return pad */
-		ra = alloc(sizeof *ra);
-		/* specific to NAlign == 3 */
-		aret.align -= 2;
-		if (aret.align < 0)
-			aret.align = 0;
-		ra->i.op = OAlloc + aret.align;
-		ra->i.cls = Kl;
-		ra->i.to = r1;
-		ra->i.arg[0] = getcon(aret.size, fn);
-		ra->link = (*rap);
-		*rap = ra;
-	} else {
-		ra = 0;
-		if (KBASE(i1->cls) == 0) {
-			emit(OCopy, i1->cls, i1->to, TMP(RAX), R);
-			ca += 1;
-		} else {
-			emit(OCopy, i1->cls, i1->to, TMP(XMM0), R);
-			ca += 1 << 2;
-		}
-	}
-	emit(OCall, i1->cls, R, i1->arg[0], CALL(ca));
-	emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
-
-	ni = ns = 0;
-	if (ra && aret.inmem)
-		emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (a->inmem)
-			continue;
-		r1 = rarg(a->cls[0], &ni, &ns);
-		if (i->op == OArgc) {
-			if (a->size > 8) {
-				r2 = rarg(a->cls[1], &ni, &ns);
-				r = newtmp("abi", Kl, fn);
-				emit(OLoad, a->cls[1], r2, r, R);
-				emit(OAdd, Kl, r, i->arg[1], getcon(8, fn));
-				chuse(i->arg[1], +1, fn);
-			}
-			emit(OLoad, a->cls[0], r1, i->arg[1], R);
-		} else
-			emit(OCopy, i->cls, r1, i->arg[0], R);
-	}
-
-	if (!stk)
-		return;
-
-	r = newtmp("abi", Kl, fn);
-	chuse(r, -1, fn);
-	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
-		if (!a->inmem)
-			continue;
-		if (i->op == OArgc) {
-			if (a->align == 4)
-				off += off & 15;
-			blit(r, off, i->arg[1], a->size, fn);
-		} else {
-			r1 = newtmp("abi", Kl, fn);
-			emit(OStorel, 0, R, i->arg[0], r1);
-			emit(OAdd, Kl, r1, r, getcon(off, fn));
-			chuse(r, +1, fn);
-		}
-		off += a->size;
-	}
-	emit(OSAlloc, Kl, r, getcon(stk, fn), R);
-}
-
-static void
-selpar(Fn *fn, Ins *i0, Ins *i1)
-{
-	AClass *ac, *a, aret;
-	Ins *i;
-	int ni, ns, s, al;
-	Ref r, r1;
-
-	ac = alloc((i1-i0) * sizeof ac[0]);
-	curi = insb;
-	ni = ns = 0;
-
-	if (fn->retty >= 0) {
-		aclass(&aret, &typ[fn->retty]);
-		if (aret.inmem) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl};
-			fn->retr = r;
-		}
-		classify(i0, i1, ac, OPar, &aret);
-	} else
-		classify(i0, i1, ac, OPar, 0);
-
-	/* specific to NAlign == 3 */
-
-	s = 4;
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		switch (a->inmem) {
-		case 1:
-			assert(a->align <= 4);
-			if (a->align == 4)
-				s = (s+3) & -4;
-			fn->tmp[i->to.val].slot = -s; /* HACK! */
-			s += a->size / 4;
-			continue;
-		case 2:
-			*curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls};
-			s += 2;
-			continue;
-		}
-		r1 = rarg(a->cls[0], &ni, &ns);
-		if (i->op == OParc) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OCopy, r, {r1}, Kl};
-			a->cls[0] = r.val;
-			if (a->size > 8) {
-				r1 = rarg(a->cls[1], &ni, &ns);
-				r = newtmp("abi", Kl, fn);
-				*curi++ = (Ins){OCopy, r, {r1}, Kl};
-				a->cls[1] = r.val;
-			}
-		} else
-			*curi++ = (Ins){OCopy, i->to, {r1}, i->cls};
-	}
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (i->op != OParc || a->inmem)
-			continue;
-		for (al=0; a->align >> (al+2); al++)
-			;
-		r = TMP(a->cls[0]);
-		r1 = i->to;
-		*curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl};
-		*curi++ = (Ins){OStorel, R, {r, r1}, 0};
-		if (a->size > 8) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl};
-			r1 = TMP(a->cls[1]);
-			*curi++ = (Ins){OStorel, R, {r1, r}, 0};
-		}
-	}
-}
-
-static int
 aref(Ref r, ANum *ai)
 {
 	switch (rtype(r)) {
@@ -1031,62 +568,12 @@ void
 isel(Fn *fn)
 {
 	Blk *b, **sb;
-	Ins *i, *i0, *ip;
+	Ins *i;
 	Phi *p;
 	uint a;
 	int n, al;
 	int64_t sz;
 	ANum *ainfo;
-	RAlloc *ral;
-
-	for (n=0; n<fn->ntmp; n++)
-		fn->tmp[n].slot = -1;
-	fn->slot = 0;
-
-	/* lower arguments */
-	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
-		if (i->op != OPar && i->op != OParc)
-			break;
-	selpar(fn, b->ins, i);
-	n = b->nins - (i - b->ins) + (curi - insb);
-	i0 = alloc(n * sizeof(Ins));
-	ip = icpy(ip = i0, insb, curi - insb);
-	ip = icpy(ip, i, &b->ins[b->nins] - i);
-	b->nins = n;
-	b->ins = i0;
-
-	/* lower function calls and returns */
-	ral = 0;
-	b = fn->start;
-	do {
-		if (!(b = b->link))
-			b = fn->start; /* do it last */
-		curi = &insb[NIns];
-		selret(b, fn);
-		for (i=&b->ins[b->nins]; i!=b->ins;) {
-			if ((--i)->op == OCall) {
-				for (i0=i; i0>b->ins; i0--)
-					if ((i0-1)->op != OArg)
-					if ((i0-1)->op != OArgc)
-						break;
-				selcall(fn, i0, i, &ral);
-				i = i0;
-				continue;
-			}
-			assert(i->op != OArg && i->op != OArgc);
-			emiti(*i);
-		}
-		if (b == fn->start)
-			for (; ral; ral=ral->link)
-				emiti(ral->i);
-		b->nins = &insb[NIns] - curi;
-		idup(&b->ins, curi, b->nins);
-	} while (b != fn->start);
-
-	if (debug['A']) {
-		fprintf(stderr, "\n> After call lowering:\n");
-		printfn(fn, stderr);
-	}
 
 	/* assign slots to fast allocs */
 	b = fn->start;
diff --git a/main.c b/main.c
index c1664be..b7f5b3f 100644
--- a/main.c
+++ b/main.c
@@ -49,6 +49,7 @@ func(Fn *fn)
 	filluse(fn);
 	copy(fn);
 	filluse(fn);
+	abi(fn);
 	isel(fn);
 	filllive(fn);
 	fillcost(fn);
diff --git a/parse.c b/parse.c
index 1fd032a..521e2bf 100644
--- a/parse.c
+++ b/parse.c
@@ -143,10 +143,7 @@ static struct {
 } tokval;
 static int lnum;
 
-static Tmp *tmp;
-static Con *con;
-static int ntmp;
-static int ncon;
+static Fn *curf;
 static Phi **plink;
 static Blk **bmap;
 static Blk *curb;
@@ -360,11 +357,11 @@ tmpref(char *v)
 {
 	int t;
 
-	for (t=Tmp0; t<ntmp; t++)
-		if (strcmp(v, tmp[t].name) == 0)
+	for (t=Tmp0; t<curf->ntmp; t++)
+		if (strcmp(v, curf->tmp[t].name) == 0)
 			return TMP(t);
-	vgrow(&tmp, ++ntmp);
-	strcpy(tmp[t].name, v);
+	newtmp(0, Kw, curf);
+	strcpy(curf->tmp[t].name, v);
 	return TMP(t);
 }
 
@@ -396,13 +393,13 @@ parseref()
 		c.type = CAddr;
 		strcpy(c.label, tokval.str);
 	Look:
-		for (i=0; i<ncon; i++)
-			if (con[i].type == c.type
-			&& con[i].bits.i == c.bits.i
-			&& strcmp(con[i].label, c.label) == 0)
+		for (i=0; i<curf->ncon; i++)
+			if (curf->con[i].type == c.type
+			&& curf->con[i].bits.i == c.bits.i
+			&& strcmp(curf->con[i].label, c.label) == 0)
 				return CON(i);
-		vgrow(&con, ++ncon);
-		con[i] = c;
+		vgrow(&curf->con, ++curf->ncon);
+		curf->con[i] = c;
 		return CON(i);
 	default:
 		return R;
@@ -648,29 +645,31 @@ DoOp:
 static Fn *
 parsefn(int export)
 {
+	int r;
 	PState ps;
-	Fn *fn;
 
-	ntmp = Tmp0;
-	ncon = 1; /* first constant must be 0 */
 	curb = 0;
 	nblk = 0;
 	curi = insb;
-	tmp = vnew(ntmp, sizeof tmp[0]);
-	con = vnew(ncon, sizeof con[0]);
+	curf = alloc(sizeof *curf);
+	curf->ntmp = 0;
+	curf->ncon = 1; /* first constant must be 0 */
+	curf->tmp = vnew(curf->ntmp, sizeof curf->tmp[0]);
+	curf->con = vnew(curf->ncon, sizeof curf->con[0]);
+	for (r=0; r<Tmp0; r++)
+		newtmp(0, r < XMM0 ? Kl : Kd, curf);
 	bmap = vnew(nblk, sizeof bmap[0]);
-	con[0].type = CBits;
-	fn = alloc(sizeof *fn);
-	fn->export = export;
-	blink = &fn->start;
-	fn->retty = -1;
+	curf->con[0].type = CBits;
+	curf->export = export;
+	blink = &curf->start;
+	curf->retty = -1;
 	if (peek() != TGlo)
-		rcls = parsecls(&fn->retty);
+		rcls = parsecls(&curf->retty);
 	else
 		rcls = 5;
 	if (next() != TGlo)
 		err("function name expected");
-	strcpy(fn->name, tokval.str);
+	strcpy(curf->name, tokval.str);
 	parserefl(0);
 	if (nextnl() != TLBrace)
 		err("function body must start with {");
@@ -682,15 +681,11 @@ parsefn(int export)
 		err("empty file");
 	if (curb->jmp.type == JXXX)
 		err("last block misses jump");
-	fn->tmp = tmp;
-	fn->con = con;
-	fn->mem = vnew(0, sizeof fn->mem[0]);
-	fn->ntmp = ntmp;
-	fn->ncon = ncon;
-	fn->nmem = 0;
-	fn->nblk = nblk;
-	fn->rpo = 0;
-	return fn;
+	curf->mem = vnew(0, sizeof curf->mem[0]);
+	curf->nmem = 0;
+	curf->nblk = nblk;
+	curf->rpo = 0;
+	return curf;
 }
 
 static void
diff --git a/sysv.c b/sysv.c
new file mode 100644
index 0000000..78c2f07
--- /dev/null
+++ b/sysv.c
@@ -0,0 +1,512 @@
+#include "all.h"
+
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+struct AClass {
+	int inmem;
+	int align;
+	uint size;
+	int cls[2];
+};
+
+static void
+aclass(AClass *a, Typ *t)
+{
+	int e, s, n, cls;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	a->inmem = 0;
+	for (e=0, s=0; e<2; e++) {
+		cls = -1;
+		for (n=0; n<8 && t->seg[s].len; s++) {
+			if (t->seg[s].ispad) {
+				/* don't change anything */
+			}
+			else if (t->seg[s].isflt) {
+				if (cls == -1)
+					cls = Kd;
+			}
+			else
+				cls = Kl;
+			n += t->seg[s].len;
+		}
+		assert(n <= 8);
+		a->cls[e] = cls;
+	}
+}
+
+static void
+blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn)
+{
+	Ref r, r1;
+	uint boff;
+
+	/* it's an impolite blit, we might go across the end
+	 * of the source object a little bit... */
+	for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) {
+		r = newtmp("abi", Kl, fn);
+		r1 = newtmp("abi", Kl, fn);
+		emit(OStorel, 0, R, r, r1);
+		emit(OAdd, Kl, r1, rstk, getcon(soff, fn));
+		r1 = newtmp("abi", Kl, fn);
+		emit(OLoad, Kl, r, r1, R);
+		emit(OAdd, Kl, r1, rsrc, getcon(boff, fn));
+		chuse(rsrc, +1, fn);
+		chuse(rstk, +1, fn);
+	}
+}
+
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int n, k, ca, nr[2];
+
+	nr[0] = nr[1] = 0;
+	ca = 0;
+	for (n=0; aret->cls[n]>=0 && n<2; n++) {
+		k = KBASE(aret->cls[n]);
+		reg[n] = TMP(retreg[k][nr[k]++]);
+		ca += 1 << (2 * k);
+	}
+	return ca;
+}
+
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == JRet0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = JRet0;
+
+	if (j == JRetc) {
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			assert(rtype(fn->retr) == RTmp);
+			emit(OCopy, Kl, TMP(RAX), fn->retr, R);
+			chuse(fn->retr, +1, fn);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, Kl, reg[1], r, R);
+				emit(OAdd, Kl, r, r0, getcon(8, fn));
+				chuse(r0, +1, fn);
+			}
+			emit(OLoad, Kl, reg[0], r0, R);
+		}
+	} else {
+		k = j - JRetw;
+		if (KBASE(k) == 0) {
+			emit(OCopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(OCopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+static int
+classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op == op) {
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+		} else {
+			n = i->arg[0].val & AMask;
+			aclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			ni = ns = 0;
+			for (n=0; n<2; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+		}
+	}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+int rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14
+};
+int rclob[] = {RBX, R12, R13, R14, R15};
+
+MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int));
+MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int));
+
+bits
+retregs(Ref r, int p[2])
+{
+	bits b;
+	int ni, nf;
+
+	assert(rtype(r) == RACall);
+	b = 0;
+	ni = r.val & 3;
+	nf = (r.val >> 2) & 3;
+	if (ni >= 1)
+		b |= BIT(RAX);
+	if (ni >= 2)
+		b |= BIT(RDX);
+	if (nf >= 1)
+		b |= BIT(XMM0);
+	if (nf >= 2)
+		b |= BIT(XMM1);
+	if (p) {
+		p[0] = ni;
+		p[1] = nf;
+	}
+	return b;
+}
+
+bits
+argregs(Ref r, int p[2])
+{
+	bits b;
+	int j, ni, nf;
+
+	assert(rtype(r) == RACall);
+	b = 0;
+	ni = (r.val >> 4) & 15;
+	nf = (r.val >> 8) & 15;
+	for (j=0; j<ni; j++)
+		b |= BIT(rsave[j]);
+	for (j=0; j<nf; j++)
+		b |= BIT(XMM0+j);
+	if (p) {
+		p[0] = ni + 1;
+		p[1] = nf;
+	}
+	return b | BIT(RAX);
+}
+
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	if (KBASE(ty) == 0)
+		return TMP(rsave[(*ni)++]);
+	else
+		return TMP(XMM0 + (*ns)++);
+}
+
+struct RAlloc {
+	Ins i;
+	RAlloc *link;
+};
+
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], regcp[2];
+	RAlloc *ra;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RAType);
+		aclass(&aret, &typ[i1->arg[1].val & AMask]);
+		ca = classify(i0, i1, ac, OArg, &aret);
+	} else
+		ca = classify(i0, i1, ac, OArg, 0);
+
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			assert(a->align <= 4);
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		r = getcon(-(int64_t)stk, fn);
+		emit(OSAlloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(OCopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				regcp[1] = newtmp("abi", aret.cls[1], fn);
+				emit(OStorel, 0, R, regcp[1], r);
+				emit(OAdd, Kl, r, i1->to, getcon(8, fn));
+				chuse(i1->to, +1, fn);
+				ca += 1 << (2 * KBASE(aret.cls[1]));
+			}
+			regcp[0] = newtmp("abi", aret.cls[0], fn);
+			emit(OStorel, 0, R, regcp[0], i1->to);
+			ca += 1 << (2 * KBASE(aret.cls[0]));
+			retr(reg, &aret);
+			if (aret.size > 8)
+				emit(OCopy, aret.cls[1], regcp[1], reg[1], R);
+			emit(OCopy, aret.cls[0], regcp[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		/* specific to NAlign == 3 */
+		aret.align -= 2;
+		if (aret.align < 0)
+			aret.align = 0;
+		ra->i.op = OAlloc + aret.align;
+		ra->i.cls = Kl;
+		ra->i.to = r1;
+		ra->i.arg[0] = getcon(aret.size, fn);
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(OCopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(OCopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	emit(OCall, i1->cls, R, i1->arg[0], CALL(ca));
+	emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OArgc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, a->cls[1], r2, r, R);
+				emit(OAdd, Kl, r, i->arg[1], getcon(8, fn));
+				chuse(i->arg[1], +1, fn);
+			}
+			emit(OLoad, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(OCopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	r = newtmp("abi", Kl, fn);
+	chuse(r, -1, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == OArgc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(OStorel, 0, R, i->arg[0], r1);
+			emit(OAdd, Kl, r1, r, getcon(off, fn));
+			chuse(r, +1, fn);
+		}
+		off += a->size;
+	}
+	emit(OSAlloc, Kl, r, getcon(stk, fn), R);
+}
+
+static void
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al;
+	Ref r, r1;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = insb;
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl};
+			fn->retr = r;
+		}
+		classify(i0, i1, ac, OPar, &aret);
+	} else
+		classify(i0, i1, ac, OPar, 0);
+
+	/* specific to NAlign == 3 */
+
+	s = 4;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			assert(a->align <= 4);
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s; /* HACK! */
+			s += a->size / 4;
+			continue;
+		case 2:
+			*curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls};
+			s += 2;
+			continue;
+		}
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OParc) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {r1}, Kl};
+			a->cls[0] = r.val;
+			if (a->size > 8) {
+				r1 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				*curi++ = (Ins){OCopy, r, {r1}, Kl};
+				a->cls[1] = r.val;
+			}
+		} else
+			*curi++ = (Ins){OCopy, i->to, {r1}, i->cls};
+	}
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != OParc || a->inmem)
+			continue;
+		for (al=0; a->align >> (al+2); al++)
+			;
+		r = TMP(a->cls[0]);
+		r1 = i->to;
+		*curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl};
+		*curi++ = (Ins){OStorel, R, {r, r1}, 0};
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl};
+			r1 = TMP(a->cls[1]);
+			*curi++ = (Ins){OStorel, R, {r1, r}, 0};
+		}
+	}
+}
+
+void
+abi(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *i0, *ip;
+	RAlloc *ral;
+	int n;
+
+	/* lower arguments */
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OPar && i->op != OParc)
+			break;
+	selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (curi - insb);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, insb, curi - insb);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower calls and returns */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			if ((--i)->op == OCall) {
+				for (i0=i; i0>b->ins; i0--)
+					if ((i0-1)->op != OArg)
+					if ((i0-1)->op != OArgc)
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				continue;
+			}
+			assert(i->op != OArg && i->op != OArgc);
+			emiti(*i);
+		}
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/util.c b/util.c
index 1654491..238c996 100644
--- a/util.c
+++ b/util.c
@@ -187,7 +187,8 @@ newtmp(char *prfx, int k,  Fn *fn)
 
 	t = fn->ntmp++;
 	vgrow(&fn->tmp, fn->ntmp);
-	sprintf(fn->tmp[t].name, "%s%d", prfx, ++n);
+	if (prfx)
+		sprintf(fn->tmp[t].name, "%s%d", prfx, ++n);
 	fn->tmp[t].cls = k;
 	fn->tmp[t].slot = -1;
 	fn->tmp[t].nuse = +1;
@@ -195,6 +196,13 @@ newtmp(char *prfx, int k,  Fn *fn)
 	return TMP(t);
 }
 
+void
+chuse(Ref r, int du, Fn *fn)
+{
+	if (rtype(r) == RTmp)
+		fn->tmp[r.val].nuse += du;
+}
+
 Ref
 getcon(int64_t val, Fn *fn)
 {
@@ -203,8 +211,7 @@ getcon(int64_t val, Fn *fn)
 	for (c=0; c<fn->ncon; c++)
 		if (fn->con[c].type == CBits && fn->con[c].bits.i == val)
 			return CON(c);
-	fn->ncon++;
-	vgrow(&fn->con, fn->ncon);
+	vgrow(&fn->con, ++fn->ncon);
 	fn->con[c] = (Con){.type = CBits, .bits.i = val};
 	return CON(c);
 }