summary refs log tree commit diff
path: root/amd64/sysv.c
diff options
context:
space:
mode:
authorQuentin Carbonneaux <quentin@c9x.me>2017-04-08 21:06:33 -0400
committerQuentin Carbonneaux <quentin@c9x.me>2017-04-08 21:56:20 -0400
commit49a4593c335126ba279f47328824abfef379725e (patch)
tree2f4cb5e9884ec958ea32a494da302a9aae8ca420 /amd64/sysv.c
parent9d1c38d69547d835f7228651e71e8a7d744c456d (diff)
downloadroux-49a4593c335126ba279f47328824abfef379725e.tar.gz
prepare for multi-target
This big diff does multiple changes to allow
the addition of new targets to qbe.  The
changes are listed below in decreasing order
of impact.

1. Add a new Target structure.

To add support for a given target, one has to
implement all the members of the Target
structure.  All the source files where changed
to use this interface where needed.

2. Single out amd64-specific code.

In this commit, the amd64 target T_amd64_sysv
is the only target available, it is implemented
in the amd64/ directory.  All the non-static
items in this directory are prefixed with either
amd64_ or amd64_sysv (for items that are
specific to the System V ABI).

3. Centralize Ops information.

There is now a file 'ops.h' that must be used to
store all the available operations together with
their metadata.  The various targets will only
select what they need; but it is beneficial that
there is only *one* place to change to add a new
instruction.

One good side effect of this change is that any
operation 'xyz' in the IL now as a corresponding
'Oxyz' in the code.

4. Misc fixes.

One notable change is that instruction selection
now generates generic comparison operations and
the lowering to the target's comparisons is done
in the emitter.

GAS directives for data are the same for many
targets, so data emission was extracted in a
file 'gas.c'.

5. Modularize the Makefile.

The Makefile now has a list of C files that
are target-independent (SRC), and one list
of C files per target.  Each target can also
use its own 'all.h' header (for example to
define registers).
Diffstat (limited to 'amd64/sysv.c')
-rw-r--r--amd64/sysv.c701
1 files changed, 701 insertions, 0 deletions
diff --git a/amd64/sysv.c b/amd64/sysv.c
new file mode 100644
index 0000000..dcaa812
--- /dev/null
+++ b/amd64/sysv.c
@@ -0,0 +1,701 @@
+#include "all.h"
+
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+struct AClass {
+	int inmem;
+	int align;
+	uint size;
+	int cls[2];
+	Ref ref[2];
+};
+
+struct RAlloc {
+	Ins i;
+	RAlloc *link;
+};
+
+static void
+classify(AClass *a, Typ *t, int *pn, int *pe)
+{
+	Seg *seg;
+	int n, s, *cls;
+
+	for (n=0; n<t->nunion; n++) {
+		seg = t->seg[n];
+		for (s=0; *pe<2; (*pe)++) {
+			cls = &a->cls[*pe];
+			for (; *pn<8; s++) {
+				switch (seg[s].type) {
+				case SEnd:
+					goto Done;
+				case SPad:
+					/* don't change anything */
+					break;
+				case SFlt:
+					if (*cls == Kx)
+						*cls = Kd;
+					break;
+				case SInt:
+					*cls = Kl;
+					break;
+				case STyp:
+					classify(a, &typ[seg[s].len], pn, pe);
+					continue;
+				}
+				*pn += seg[s].len;
+			}
+		Done:
+			assert(*pn <= 8);
+			*pn = 0;
+		}
+	}
+}
+
+static void
+typclass(AClass *a, Typ *t)
+{
+	int e, n;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16 || sz == 0) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	a->cls[0] = Kx;
+	a->cls[1] = Kx;
+	a->inmem = 0;
+	n = 0;
+	e = 0;
+	classify(a, t, &n, &e);
+}
+
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int n, k, ca, nr[2];
+
+	nr[0] = nr[1] = 0;
+	ca = 0;
+	for (n=0; (uint)n*8<aret->size; n++) {
+		k = KBASE(aret->cls[n]);
+		reg[n] = TMP(retreg[k][nr[k]++]);
+		ca += 1 << (2 * k);
+	}
+	return ca;
+}
+
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == Jret0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = Jret0;
+
+	if (j == Jretc) {
+		typclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			assert(rtype(fn->retr) == RTmp);
+			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, Kl, reg[1], r, R);
+				emit(Oadd, Kl, r, r0, getcon(8, fn));
+			}
+			emit(Oload, Kl, reg[0], r0, R);
+		}
+	} else {
+		k = j - Jretw;
+		if (KBASE(k) == 0) {
+			emit(Ocopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(Ocopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+static int
+argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++)
+		switch (i->op - op + Oarg) {
+		case Oarg:
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+			break;
+		case Oargc:
+			n = i->arg[0].val;
+			typclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			ni = ns = 0;
+			for (n=0; (uint)n*8<a->size; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+			break;
+		case Oarge:
+			if (op == Opar)
+				*env = i->to;
+			else
+				*env = i->arg[0];
+			break;
+		}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+int amd64_sysv_rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
+};
+int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};
+
+MAKESURE(sysv_arrays_ok,
+	sizeof amd64_sysv_rsave == (NGPS+NFPS+1) * sizeof(int) &&
+	sizeof amd64_sysv_rclob == (NCLR+1) * sizeof(int)
+);
+
+/* layout of call's second argument (RCall)
+ *
+ *  29     12    8    4  3  0
+ *  |0...00|x|xxxx|xxxx|xx|xx|                  range
+ *          |    |    |  |  ` gp regs returned (0..2)
+ *          |    |    |  ` sse regs returned   (0..2)
+ *          |    |    ` gp regs passed         (0..6)
+ *          |    ` sse regs passed             (0..8)
+ *          ` 1 if rax is used to pass data    (0..1)
+ */
+
+bits
+amd64_sysv_retregs(Ref r, int p[2])
+{
+	bits b;
+	int ni, nf;
+
+	assert(rtype(r) == RCall);
+	b = 0;
+	ni = r.val & 3;
+	nf = (r.val >> 2) & 3;
+	if (ni >= 1)
+		b |= BIT(RAX);
+	if (ni >= 2)
+		b |= BIT(RDX);
+	if (nf >= 1)
+		b |= BIT(XMM0);
+	if (nf >= 2)
+		b |= BIT(XMM1);
+	if (p) {
+		p[0] = ni;
+		p[1] = nf;
+	}
+	return b;
+}
+
+bits
+amd64_sysv_argregs(Ref r, int p[2])
+{
+	bits b;
+	int j, ni, nf, ra;
+
+	assert(rtype(r) == RCall);
+	b = 0;
+	ni = (r.val >> 4) & 15;
+	nf = (r.val >> 8) & 15;
+	ra = (r.val >> 12) & 1;
+	for (j=0; j<ni; j++)
+		b |= BIT(amd64_sysv_rsave[j]);
+	for (j=0; j<nf; j++)
+		b |= BIT(XMM0+j);
+	if (p) {
+		p[0] = ni + ra;
+		p[1] = nf;
+	}
+	return b | (ra ? BIT(RAX) : 0);
+}
+
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	if (KBASE(ty) == 0)
+		return TMP(amd64_sysv_rsave[(*ni)++]);
+	else
+		return TMP(XMM0 + (*ns)++);
+}
+
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns, al, varc, envc;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], env;
+	RAlloc *ra;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RType);
+		typclass(&aret, &typ[i1->arg[1].val]);
+		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
+	} else
+		ca = argsclass(i0, i1, ac, Oarg, 0, &env);
+
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		r = getcon(-(int64_t)stk, fn);
+		emit(Osalloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
+				emit(Ostorel, 0, R, aret.ref[1], r);
+				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
+			}
+			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
+			emit(Ostorel, 0, R, aret.ref[0], i1->to);
+			ca += retr(reg, &aret);
+			if (aret.size > 8)
+				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
+			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		/* specific to NAlign == 3 */
+		al = aret.align >= 2 ? aret.align - 2 : 0;
+		ra->i = (Ins){Oalloc+al, r1, {getcon(aret.size, fn)}, Kl};
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	envc = !req(R, env);
+	varc = i1->op == Ovacall;
+	if (varc && envc)
+		err("sysv abi does not support variadic env calls");
+	ca |= (varc | envc) << 12;
+	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
+	if (envc)
+		emit(Ocopy, Kl, TMP(RAX), env, R);
+	if (varc)
+		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oargc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, a->cls[1], r2, r, R);
+				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
+			}
+			emit(Oload, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(Ocopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	r = newtmp("abi", Kl, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == Oargc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, i->arg[0], r1);
+			emit(Oadd, Kl, r1, r, getcon(off, fn));
+		}
+		off += a->size;
+	}
+	emit(Osalloc, Kl, r, getcon(stk, fn), R);
+}
+
+static int
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al, fa;
+	Ref r, env;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = &insb[NIns];
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		typclass(&aret, &typ[fn->retty]);
+		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
+	} else
+		fa = argsclass(i0, i1, ac, Opar, 0, &env);
+
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != Oparc || a->inmem)
+			continue;
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			a->ref[1] = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, a->ref[1], r);
+			emit(Oadd, Kl, r, i->to, getcon(8, fn));
+		}
+		a->ref[0] = newtmp("abi", Kl, fn);
+		emit(Ostorel, 0, R, a->ref[0], i->to);
+		/* specific to NAlign == 3 */
+		al = a->align >= 2 ? a->align - 2 : 0;
+		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
+	}
+
+	if (fn->retty >= 0 && aret.inmem) {
+		r = newtmp("abi", Kl, fn);
+		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
+		fn->retr = r;
+	}
+
+	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s;
+			s += a->size / 4;
+			continue;
+		case 2:
+			emit(Oload, i->cls, i->to, SLOT(-s), R);
+			s += 2;
+			continue;
+		}
+		r = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oparc) {
+			emit(Ocopy, Kl, a->ref[0], r, R);
+			if (a->size > 8) {
+				r = rarg(a->cls[1], &ni, &ns);
+				emit(Ocopy, Kl, a->ref[1], r, R);
+			}
+		} else
+			emit(Ocopy, i->cls, i->to, r, R);
+	}
+
+	if (!req(R, env))
+		emit(Ocopy, Kl, env, TMP(RAX), R);
+
+	return fa | (s*4)<<12;
+}
+
+static Blk *
+split(Fn *fn, Blk *b)
+{
+	Blk *bn;
+
+	++fn->nblk;
+	bn = blknew();
+	bn->nins = &insb[NIns] - curi;
+	idup(&bn->ins, curi, bn->nins);
+	curi = &insb[NIns];
+	bn->visit = ++b->visit;
+	snprintf(bn->name, NString, "%s.%d", b->name, b->visit);
+	bn->loop = b->loop;
+	bn->link = b->link;
+	b->link = bn;
+	return bn;
+}
+
+static void
+chpred(Blk *b, Blk *bp, Blk *bp1)
+{
+	Phi *p;
+	uint a;
+
+	for (p=b->phi; p; p=p->link) {
+		for (a=0; p->blk[a]!=bp; a++)
+			assert(a+1<p->narg);
+		p->blk[a] = bp1;
+	}
+}
+
+static void
+selvaarg(Fn *fn, Blk *b, Ins *i)
+{
+	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
+	Blk *b0, *bstk, *breg;
+	int isint;
+
+	c4 = getcon(4, fn);
+	c8 = getcon(8, fn);
+	c16 = getcon(16, fn);
+	ap = i->arg[0];
+	isint = KBASE(i->cls) == 0;
+
+	/* @b [...]
+	       r0 =l add ap, (0 or 4)
+	       nr =l loadsw r0
+	       r1 =w cultw nr, (48 or 176)
+	       jnz r1, @breg, @bstk
+	   @breg
+	       r0 =l add ap, 16
+	       r1 =l loadl r0
+	       lreg =l add r1, nr
+	       r0 =w add nr, (8 or 16)
+	       r1 =l add ap, (0 or 4)
+	       storew r0, r1
+	   @bstk
+	       r0 =l add ap, 8
+	       lstk =l loadl r0
+	       r1 =l add lstk, 8
+	       storel r1, r0
+	   @b0
+	       %loc =l phi @breg %lreg, @bstk %lstk
+	       i->to =(i->cls) load %loc
+	*/
+
+	loc = newtmp("abi", Kl, fn);
+	emit(Oload, i->cls, i->to, loc, R);
+	b0 = split(fn, b);
+	b0->jmp = b->jmp;
+	b0->s1 = b->s1;
+	b0->s2 = b->s2;
+	if (b->s1)
+		chpred(b->s1, b, b0);
+	if (b->s2 && b->s2 != b->s1)
+		chpred(b->s2, b, b0);
+
+	lreg = newtmp("abi", Kl, fn);
+	nr = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kw, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, r0, r1);
+	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
+	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Oadd, Kl, lreg, r1, nr);
+	emit(Oload, Kl, r1, r0, R);
+	emit(Oadd, Kl, r0, ap, c16);
+	breg = split(fn, b);
+	breg->jmp.type = Jjmp;
+	breg->s1 = b0;
+
+	lstk = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, lstk, c8);
+	emit(Oload, Kl, lstk, r0, R);
+	emit(Oadd, Kl, r0, ap, c8);
+	bstk = split(fn, b);
+	bstk->jmp.type = Jjmp;
+	bstk->s1 = b0;
+
+	b0->phi = alloc(sizeof *b0->phi);
+	*b0->phi = (Phi){
+		.cls = Kl, .to = loc,
+		.narg = 2,
+		.blk = {bstk, breg},
+		.arg = {lstk, lreg},
+	};
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kw, fn);
+	b->jmp.type = Jjnz;
+	b->jmp.arg = r1;
+	b->s1 = breg;
+	b->s2 = bstk;
+	c = getcon(isint ? 48 : 176, fn);
+	emit(Ocmpw+Ciult, Kw, r1, nr, c);
+	emit(Oloadsw, Kl, nr, r0, R);
+	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
+}
+
+static void
+selvastart(Fn *fn, int fa, Ref ap)
+{
+	Ref r0, r1;
+	int gp, fp, sp;
+
+	gp = ((fa >> 4) & 15) * 8;
+	fp = 48 + ((fa >> 8) & 15) * 16;
+	sp = fa >> 12;
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
+	emit(Oadd, Kl, r0, ap, getcon(16, fn));
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
+	emit(Oadd, Kl, r0, ap, getcon(8, fn));
+	r0 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
+	emit(Oadd, Kl, r0, ap, getcon(4, fn));
+	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
+}
+
+void
+amd64_sysv_abi(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *i0, *ip;
+	RAlloc *ral;
+	int n, fa;
+
+	for (b=fn->start; b; b=b->link)
+		b->visit = 0;
+
+	/* lower parameters */
+	for (b=fn->start, i=b->ins; i-b->ins<b->nins; i++)
+		if (!ispar(i->op))
+			break;
+	fa = selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, curi, &insb[NIns] - curi);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower calls, returns, and vararg instructions */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		if (b->visit)
+			continue;
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			switch ((--i)->op) {
+			default:
+				emiti(*i);
+				break;
+			case Ocall:
+			case Ovacall:
+				for (i0=i; i0>b->ins; i0--)
+					if (!isarg((i0-1)->op))
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				break;
+			case Ovastart:
+				selvastart(fn, fa, i->arg[0]);
+				break;
+			case Ovaarg:
+				selvaarg(fn, b, i);
+				break;
+			case Oarg:
+			case Oargc:
+				die("unreachable");
+			}
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}