summary refs log tree commit diff
diff options
context:
space:
mode:
authorQuentin Carbonneaux <quentin@c9x.me>2022-12-10 23:16:21 +0100
committerQuentin Carbonneaux <quentin@c9x.me>2022-12-14 23:18:26 +0100
commit26c1c30b7d96d2170195970a8cdb3b024ba7421a (patch)
tree79c45ec28d63619fbe2a88ec2195f8fe4a95a8a5
parent15e25a61b38b250c7543437a093a9efe076cce0a (diff)
downloadroux-26c1c30b7d96d2170195970a8cdb3b024ba7421a.tar.gz
new blit instruction
-rw-r--r--Makefile2
-rw-r--r--alias.c54
-rw-r--r--all.h11
-rw-r--r--amd64/sysv.c13
-rw-r--r--arm64/abi.c19
-rw-r--r--load.c41
-rw-r--r--main.c1
-rw-r--r--mem.c40
-rw-r--r--ops.h2
-rw-r--r--parse.c69
-rw-r--r--rv64/abi.c25
-rw-r--r--simpl.c82
-rw-r--r--test/load2.ssa75
-rw-r--r--test/mem1.ssa35
-rw-r--r--tools/lexh.c2
-rw-r--r--util.c30
16 files changed, 398 insertions, 103 deletions
diff --git a/Makefile b/Makefile
index 674f850..5fadadc 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 COMMOBJ  = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \
-           copy.o fold.o live.o spill.o rega.o emit.o
+           copy.o fold.o simpl.o live.o spill.o rega.o emit.o
 AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o
 ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o
 RV64OBJ  = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o
diff --git a/alias.c b/alias.c
index 50e659a..3885115 100644
--- a/alias.c
+++ b/alias.c
@@ -28,13 +28,17 @@ getalias(Alias *a, Ref r, Fn *fn)
 }
 
 int
-alias(Ref p, int sp, Ref q, int sq, int *delta, Fn *fn)
+alias(Ref p, int op, int sp, Ref q, int sq, int *delta, Fn *fn)
 {
 	Alias ap, aq;
 	int ovlap;
 
 	getalias(&ap, p, fn);
 	getalias(&aq, q, fn);
+	ap.offset += op;
+	/* when delta is meaningful (ovlap == 1),
+	 * we do not overflow int because sp and
+	 * sq are bounded by 2^28 */
 	*delta = ap.offset - aq.offset;
 	ovlap = ap.offset < aq.offset + sq && aq.offset < ap.offset + sp;
 
@@ -103,13 +107,34 @@ esc(Ref r, Fn *fn)
 	}
 }
 
+static void
+store(Ref r, int sz, Fn *fn)
+{
+	Alias *a;
+	int64_t off;
+	bits m;
+
+	if (rtype(r) == RTmp) {
+		a = &fn->tmp[r.val].alias;
+		if (a->slot) {
+			assert(astack(a->type));
+			off = a->offset;
+			if (sz >= NBit
+			|| (off < 0 || off >= NBit))
+				m = -1;
+			else
+				m = (BIT(sz) - 1) << off;
+			a->slot->u.loc.m |= m;
+		}
+	}
+}
+
 void
 fillalias(Fn *fn)
 {
 	uint n, m;
-	int t;
+	int t, sz;
 	int64_t x;
-	bits w;
 	Blk *b;
 	Phi *p;
 	Ins *i;
@@ -171,26 +196,23 @@ fillalias(Fn *fn)
 					a->offset += a1.offset;
 				}
 			}
-			if (req(i->to, R) || a->type == AUnk) {
+			if (req(i->to, R) || a->type == AUnk)
+			if (i->op != Oblit0) {
 				if (!isload(i->op))
 					esc(i->arg[0], fn);
 				if (!isstore(i->op))
 				if (i->op != Oargc)
 					esc(i->arg[1], fn);
 			}
-			if (isstore(i->op))
-			if (rtype(i->arg[1]) == RTmp) {
-				a = &fn->tmp[i->arg[1].val].alias;
-				if (a->slot) {
-					assert(astack(a->type));
-					x = a->offset;
-					if (0 <= x && x < NBit) {
-						w = BIT(storesz(i)) - 1;
-						a->slot->u.loc.m |= w << x;
-					} else
-						a->slot->u.loc.sz = -1;
-				}
+			if (i->op == Oblit0) {
+				++i;
+				assert(i->op == Oblit1);
+				assert(rtype(i->arg[0]) == RInt);
+				sz = abs(rsval(i->arg[0]));
+				store((i-1)->arg[1], sz, fn);
 			}
+			if (isstore(i->op))
+				store(i->arg[1], storesz(i), fn);
 		}
 		if (b->jmp.type != Jretc)
 			esc(b->jmp.arg, fn);
diff --git a/all.h b/all.h
index fe2b56b..47a61d8 100644
--- a/all.h
+++ b/all.h
@@ -83,7 +83,8 @@ struct Ref {
 enum {
 	RTmp,
 	RCon,
-	RType,
+	RInt,
+	RType, /* last kind to come out of the parser */
 	RSlot,
 	RCall,
 	RMem,
@@ -97,6 +98,7 @@ enum {
 #define TYPE(x)  (Ref){RType, x}
 #define CALL(x)  (Ref){RCall, x}
 #define MEM(x)   (Ref){RMem, x}
+#define INT(x)   (Ref){RInt, (x)&0x1fffffff}
 
 static inline int req(Ref a, Ref b)
 {
@@ -474,8 +476,6 @@ int symeq(Sym, Sym);
 Ref newcon(Con *, Fn *);
 Ref getcon(int64_t, Fn *);
 int addcon(Con *, Con *);
-void blit(Ref, uint, Ref, uint, uint, Fn *);
-void blit0(Ref, Ref, uint, Fn *);
 void salloc(Ref, Ref, Fn *);
 void dumpts(BSet *, Tmp *, FILE *);
 
@@ -528,7 +528,7 @@ void coalesce(Fn *);
 /* alias.c */
 void fillalias(Fn *);
 void getalias(Alias *, Ref, Fn *);
-int alias(Ref, int, Ref, int, int *, Fn *);
+int alias(Ref, int, int, Ref, int, int *, Fn *);
 int escapes(Ref, Fn *);
 
 /* load.c */
@@ -549,6 +549,9 @@ void copy(Fn *);
 /* fold.c */
 void fold(Fn *);
 
+/* simpl.c */
+void simpl(Fn *);
+
 /* live.c */
 void liveon(BSet *, Blk *, Blk *);
 void filllive(Fn *);
diff --git a/amd64/sysv.c b/amd64/sysv.c
index f4e0416..04dfd83 100644
--- a/amd64/sysv.c
+++ b/amd64/sysv.c
@@ -127,7 +127,8 @@ selret(Blk *b, Fn *fn)
 		if (aret.inmem) {
 			assert(rtype(fn->retr) == RTmp);
 			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
-			blit0(fn->retr, r0, aret.type->size, fn);
+			emit(Oblit1, 0, R, INT(aret.type->size), R);
+			emit(Oblit0, 0, R, r0, fn->retr);
 			ca = 1;
 		} else {
 			ca = retr(reg, &aret);
@@ -410,15 +411,15 @@ selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
 	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
 		if (i->op >= Oarge || !a->inmem)
 			continue;
+		r1 = newtmp("abi", Kl, fn);
 		if (i->op == Oargc) {
 			if (a->align == 4)
 				off += off & 15;
-			blit(r, off, i->arg[1], 0, a->type->size, fn);
-		} else {
-			r1 = newtmp("abi", Kl, fn);
+			emit(Oblit1, 0, R, INT(a->type->size), R);
+			emit(Oblit0, 0, R, i->arg[1], r1);
+		} else
 			emit(Ostorel, 0, R, i->arg[0], r1);
-			emit(Oadd, Kl, r1, r, getcon(off, fn));
-		}
+		emit(Oadd, Kl, r1, r, getcon(off, fn));
 		off += a->size;
 	}
 	emit(Osalloc, Kl, r, getcon(stk, fn), R);
diff --git a/arm64/abi.c b/arm64/abi.c
index 7282031..8ba4ffc 100644
--- a/arm64/abi.c
+++ b/arm64/abi.c
@@ -188,7 +188,8 @@ selret(Blk *b, Fn *fn)
 		typclass(&cr, &typ[fn->retty], gpreg, fpreg);
 		if (cr.class & Cptr) {
 			assert(rtype(fn->retr) == RTmp);
-			blit0(fn->retr, r, cr.t->size, fn);
+			emit(Oblit1, 0, R, INT(cr.t->size), R);
+			emit(Oblit0, 0, R, r, fn->retr);
 			cty = 0;
 		} else {
 			ldregs(cr.reg, cr.cls, cr.nreg, r, fn);
@@ -438,8 +439,8 @@ selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp)
 		if ((c->class & Cstk) == 0)
 			continue;
 		off = align(off, c->align);
+		r = newtmp("abi", Kl, fn);
 		if (i->op == Oarg || isargbh(i->op)) {
-			r = newtmp("abi", Kl, fn);
 			switch (c->size) {
 			case 1: op = Ostoreb; break;
 			case 2: op = Ostoreh; break;
@@ -447,18 +448,22 @@ selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp)
 			case 8: op = store[*c->cls]; break;
 			}
 			emit(op, 0, R, i->arg[0], r);
-			emit(Oadd, Kl, r, TMP(SP), getcon(off, fn));
+		} else {
+			assert(i->op == Oargc);
+			emit(Oblit1, 0, R, INT(c->size), R);
+			emit(Oblit0, 0, R, i->arg[1], r);
 		}
-		if (i->op == Oargc)
-			blit(TMP(SP), off, i->arg[1], 0, c->size, fn);
+		emit(Oadd, Kl, r, TMP(SP), getcon(off, fn));
 		off += c->size;
 	}
 	if (stk)
 		emit(Osub, Kl, TMP(SP), TMP(SP), rstk);
 
 	for (i=i0, c=ca; i<i1; i++, c++)
-		if (c->class & Cptr)
-			blit0(i->arg[0], i->arg[1], c->t->size, fn);
+		if (c->class & Cptr) {
+			emit(Oblit1, 0, R, INT(c->t->size), R);
+			emit(Oblit0, 0, R, i->arg[1], i->arg[0]);
+		}
 }
 
 static Params
diff --git a/load.c b/load.c
index b16edd2..551d02e 100644
--- a/load.c
+++ b/load.c
@@ -6,7 +6,6 @@ typedef struct Loc Loc;
 typedef struct Slice Slice;
 typedef struct Insert Insert;
 
-
 struct Loc {
 	enum {
 		LRoot,   /* right above the original load */
@@ -19,6 +18,7 @@ struct Loc {
 
 struct Slice {
 	Ref ref;
+	int off;
 	short sz;
 	short cls; /* load class */
 };
@@ -194,6 +194,7 @@ killsl(Ref r, Slice sl)
 static Ref
 def(Slice sl, bits msk, Blk *b, Ins *i, Loc *il)
 {
+	Slice sl1;
 	Blk *bp;
 	bits msk1, msks;
 	int off, cls, cls1, op, sz, ld;
@@ -244,10 +245,33 @@ def(Slice sl, bits msk, Blk *b, Ins *i, Loc *il)
 			sz = storesz(i);
 			r1 = i->arg[1];
 			r = i->arg[0];
+		} else if (i->op == Oblit1) {
+			assert(rtype(i->arg[0]) == RInt);
+			sz = abs(rsval(i->arg[0]));
+			--i;
+			assert(i->op == Oblit0);
+			r1 = i->arg[1];
 		} else
 			continue;
-		switch (alias(sl.ref, sl.sz, r1, sz, &off, curf)) {
+		switch (alias(sl.ref, sl.off, sl.sz, r1, sz, &off, curf)) {
 		case MustAlias:
+			if (i->op == Oblit0) {
+				sl1 = sl;
+				sl1.ref = i->arg[0];
+				if (off >= 0) {
+					assert(off < sz);
+					sl1.off = off;
+					sz -= off;
+					off = 0;
+				} else {
+					sl1.off = 0;
+					sl1.sz += off;
+				}
+				if (sz > sl1.sz)
+					sz = sl1.sz;
+				assert(sz <= 8);
+				sl1.sz = sz;
+			}
 			if (off < 0) {
 				off = -off;
 				msk1 = (MASK(sz) << 8*off) & msks;
@@ -257,7 +281,12 @@ def(Slice sl, bits msk, Blk *b, Ins *i, Loc *il)
 				op = Oshr;
 			}
 			if ((msk1 & msk) == 0)
-				break;
+				continue;
+			if (i->op == Oblit0) {
+				r = def(sl1, MASK(sz), b, i, il);
+				if (req(r, R))
+					goto Load;
+			}
 			if (off) {
 				cls1 = cls;
 				if (op == Oshr && off + sl.sz > 4)
@@ -279,11 +308,11 @@ def(Slice sl, bits msk, Blk *b, Ins *i, Loc *il)
 			return r;
 		case MayAlias:
 			if (ld)
-				break;
+				continue;
 			else
 				goto Load;
 		case NoAlias:
-			break;
+			continue;
 		default:
 			die("unreachable");
 		}
@@ -397,7 +426,7 @@ loadopt(Fn *fn)
 			if (!isload(i->op))
 				continue;
 			sz = loadsz(i);
-			sl = (Slice){i->arg[0], sz, i->cls};
+			sl = (Slice){i->arg[0], 0, sz, i->cls};
 			l = (Loc){LRoot, i-b->ins, b};
 			i->arg[1] = def(sl, MASK(sz), b, i, &l);
 		}
diff --git a/main.c b/main.c
index 3fcfd7f..abfe03e 100644
--- a/main.c
+++ b/main.c
@@ -78,6 +78,7 @@ func(Fn *fn)
 	filluse(fn);
 	fold(fn);
 	T.abi1(fn);
+	simpl(fn);
 	fillpreds(fn);
 	filluse(fn);
 	T.isel(fn);
diff --git a/mem.c b/mem.c
index 7570f19..5d59b96 100644
--- a/mem.c
+++ b/mem.c
@@ -195,12 +195,13 @@ coalesce(Fn *fn)
 	Range r, *br;
 	Slot *s, *s0, *sl;
 	Blk *b, **ps, *succ[3];
-	Ins *i;
+	Ins *i, **bl;
 	Use *u;
 	Tmp *t, *ts;
 	Ref *arg;
 	bits x;
-	int n, m, nsl, ip, *stk;
+	int64_t off0, off1;
+	int n, m, sz, nsl, nbl, ip, *stk;
 	uint total, freed, fused;
 
 	/* minimize the stack usage
@@ -229,6 +230,8 @@ coalesce(Fn *fn)
 	for (b=fn->start; b; b=b->link)
 		b->loop = -1;
 	loopiter(fn, maxrpo);
+	nbl = 0;
+	bl = vnew(0, sizeof bl[0], PHeap);
 	br = emalloc(fn->nblk * sizeof br[0]);
 	ip = INT_MAX - 1;
 	for (n=fn->nblk-1; n>=0; n--) {
@@ -247,8 +250,11 @@ coalesce(Fn *fn)
 				}
 			}
 		}
+		if (b->jmp.type == Jretc)
+			load(b->jmp.arg, -1, --ip, fn, sl);
 		for (i=&b->ins[b->nins]; i!=b->ins;) {
-			arg = (--i)->arg;
+			--i;
+			arg = i->arg;
 			if (i->op == Oargc) {
 				load(arg[1], -1, --ip, fn, sl);
 			}
@@ -260,6 +266,16 @@ coalesce(Fn *fn)
 				x = BIT(storesz(i)) - 1;
 				store(arg[1], x, ip--, fn, sl);
 			}
+			if (i->op == Oblit0) {
+				assert((i+1)->op == Oblit1);
+				assert(rtype((i+1)->arg[0]) == RInt);
+				sz = abs(rsval((i+1)->arg[0]));
+				x = sz >= NBit ? (bits)-1 : BIT(sz) - 1;
+				store(arg[1], x, ip--, fn, sl);
+				load(arg[0], x, ip, fn, sl);
+				vgrow(&bl, ++nbl);
+				bl[nbl-1] = i;
+			}
 		}
 		for (s=sl; s<&sl[nsl]; s++)
 			if (s->l) {
@@ -321,6 +337,8 @@ coalesce(Fn *fn)
 				stk[n-1] = i->to.val;
 			} else {
 				assert(!isarg(i->op));
+				if (i->op == Oblit0)
+					*(i+1) = (Ins){.op = Onop};
 				*i = (Ins){.op = Onop};
 			}
 		}
@@ -340,7 +358,7 @@ coalesce(Fn *fn)
 			if (s->s || !s->r.b)
 				goto Skip;
 			if (rovlap(r, s->r))
-				/* O(n) can be approximated
+				/* O(n); can be approximated
 				 * by 'goto Skip;' if need be
 				 */
 				for (m=n; &sl[m]<s; m++)
@@ -387,6 +405,20 @@ coalesce(Fn *fn)
 		}
 	}
 
+	/* fix newly overlapping blits */
+	for (n=0; n<nbl; n++) {
+		i = bl[n];
+		if (i->op == Oblit0)
+		if (slot(&s, &off0, i->arg[0], fn, sl))
+		if (slot(&s0, &off1, i->arg[1], fn, sl))
+		if (s->s == s0->s && off0 < off1) {
+			sz = rsval((i+1)->arg[0]);
+			assert(sz >= 0);
+			(i+1)->arg[0] = INT(-sz);
+		}
+	}
+	vfree(bl);
+
 	if (debug['M']) {
 		for (s0=sl; s0<&sl[nsl]; s0++) {
 			if (s0->s != s0)
diff --git a/ops.h b/ops.h
index 3d65081..fbcc2a8 100644
--- a/ops.h
+++ b/ops.h
@@ -129,6 +129,8 @@ O(copy,    T(w,l,s,d, x,x,x,x), 0) X(0, 0, 1) V(0)
 /* Miscellaneous and Architecture-Specific Operations */
 O(nop,     T(x,x,x,x, x,x,x,x), 0) X(0, 0, 1) V(0)
 O(addr,    T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(blit0,   T(m,e,e,e, m,e,e,e), 0) X(0, 1, 0) V(0)
+O(blit1,   T(w,e,e,e, x,e,e,e), 0) X(0, 1, 0) V(0)
 O(swap,    T(w,l,s,d, w,l,s,d), 0) X(1, 0, 0) V(0)
 O(sign,    T(w,l,e,e, x,x,e,e), 0) X(0, 0, 0) V(0)
 O(salloc,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
diff --git a/parse.c b/parse.c
index 0836b9a..68488a2 100644
--- a/parse.c
+++ b/parse.c
@@ -27,7 +27,7 @@ typedef enum {
 	PEnd,
 } PState;
 
-enum {
+enum Token {
 	Txxx = 0,
 
 	/* aliases */
@@ -38,6 +38,7 @@ enum {
 	Talloc1,
 	Talloc2,
 
+	Tblit,
 	Tcall,
 	Tenv,
 	Tphi,
@@ -94,6 +95,7 @@ static char *kwmap[Ntok] = {
 	[Tloadd] = "loadd",
 	[Talloc1] = "alloc1",
 	[Talloc2] = "alloc2",
+	[Tblit] = "blit",
 	[Tcall] = "call",
 	[Tenv] = "env",
 	[Tphi] = "phi",
@@ -481,7 +483,7 @@ parserefl(int arg)
 	expect(Tlparen);
 	while (peek() != Trparen) {
 		if (curi - insb >= NIns)
-			err("too many instructions (1)");
+			err("too many instructions");
 		if (!arg && vararg)
 			err("no parameters allowed after '...'");
 		switch (peek()) {
@@ -578,6 +580,7 @@ parseline(PState ps)
 	Phi *phi;
 	Ref r;
 	Blk *b;
+	Con *c;
 	int t, op, i, k, ty;
 
 	t = nextnl();
@@ -586,6 +589,7 @@ parseline(PState ps)
 	switch (t) {
 	default:
 		if (isstore(t)) {
+		case Tblit:
 		case Tcall:
 		case Ovastart:
 			/* operations without result */
@@ -657,11 +661,6 @@ parseline(PState ps)
 	k = parsecls(&ty);
 	op = next();
 DoOp:
-	if (op == Tphi) {
-		if (ps != PPhi || curb == curf->start)
-			err("unexpected phi instruction");
-		op = -1;
-	}
 	if (op == Tcall) {
 		arg[0] = parseref();
 		parserefl(1);
@@ -686,14 +685,12 @@ DoOp:
 		err("cannot use vastart in non-variadic function");
 	if (k >= Ksb)
 		err("size class must be w, l, s, or d");
-	if (op >= NPubOp)
-		err("invalid instruction");
 	i = 0;
 	if (peek() != Tnl)
 		for (;;) {
 			if (i == NPred)
 				err("too many arguments");
-			if (op == -1) {
+			if (op == Tphi) {
 				expect(Tlbl);
 				blk[i] = findblk(tokval.str);
 			}
@@ -709,18 +706,10 @@ DoOp:
 			next();
 		}
 	next();
-Ins:
-	if (op != -1) {
-		if (curi - insb >= NIns)
-			err("too many instructions (2)");
-		curi->op = op;
-		curi->cls = k;
-		curi->to = r;
-		curi->arg[0] = arg[0];
-		curi->arg[1] = arg[1];
-		curi++;
-		return PIns;
-	} else {
+	switch (op) {
+	case Tphi:
+		if (ps != PPhi || curb == curf->start)
+			err("unexpected phi instruction");
 		phi = alloc(sizeof *phi);
 		phi->to = r;
 		phi->cls = k;
@@ -732,6 +721,39 @@ Ins:
 		*plink = phi;
 		plink = &phi->link;
 		return PPhi;
+	case Tblit:
+		if (curi - insb >= NIns-1)
+			err("too many instructions");
+		memset(curi, 0, 2 * sizeof(Ins));
+		curi->op = Oblit0;
+		curi->arg[0] = arg[0];
+		curi->arg[1] = arg[1];
+		curi++;
+		if (rtype(arg[2]) != RCon)
+			err("blit size must be constant");
+		c = &curf->con[arg[2].val];
+		r = INT(c->bits.i);
+		if (c->type != CBits
+		|| rsval(r) < 0
+		|| rsval(r) != c->bits.i)
+			err("invalid blit size");
+		curi->op = Oblit1;
+		curi->arg[0] = r;
+		curi++;
+		return PIns;
+	default:
+		if (op >= NPubOp)
+			err("invalid instruction");
+	Ins:
+		if (curi - insb >= NIns)
+			err("too many instructions");
+		curi->op = op;
+		curi->cls = k;
+		curi->to = r;
+		curi->arg[0] = arg[0];
+		curi->arg[1] = arg[1];
+		curi++;
+		return PIns;
 	}
 }
 
@@ -1241,6 +1263,9 @@ printref(Ref r, Fn *fn, FILE *f)
 		}
 		fputc(']', f);
 		break;
+	case RInt:
+		fprintf(f, "%d", rsval(r));
+		break;
 	}
 }
 
diff --git a/rv64/abi.c b/rv64/abi.c
index 3a97a6a..e31425c 100644
--- a/rv64/abi.c
+++ b/rv64/abi.c
@@ -222,7 +222,8 @@ selret(Blk *b, Fn *fn)
 		typclass(&cr, &typ[fn->retty], 1, gpreg, fpreg);
 		if (cr.class & Cptr) {
 			assert(rtype(fn->retr) == RTmp);
-			blit0(fn->retr, r, cr.type->size, fn);
+			emit(Oblit1, 0, R, INT(cr.type->size), R);
+			emit(Oblit0, 0, R, r, fn->retr);
 			cty = 0;
 		} else {
 			ldregs(&cr, r, fn);
@@ -341,7 +342,7 @@ selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp)
 	Class *ca, *c, cr;
 	int j, k, cty;
 	uint64_t stk, off;
-	Ref r, r1, tmp[2];
+	Ref r, r1, r2, tmp[2];
 
 	ca = alloc((i1-i0) * sizeof ca[0]);
 	cr.class = 0;
@@ -419,8 +420,10 @@ selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp)
 			k = KWIDE(*c->cls) ? Kl : Kw;
 			emit(Ocast, k, TMP(*c->reg), i->arg[0], R);
 		}
-		if (c->class & Cptr)
-			blit0(i->arg[0], i->arg[1], c->type->size, fn);
+		if (c->class & Cptr) {
+			emit(Oblit1, 0, R, INT(c->type->size), R);
+			emit(Oblit0, 0, R, i->arg[1], i->arg[0]);
+		}
 	}
 
 	if (!stk)
@@ -450,11 +453,21 @@ selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp)
 		}
 		if (i->op == Oargc) {
 			if (c->class & Cstk1) {
-				blit(r, off, i->arg[1], 0, 8, fn);
+				r1 = newtmp("abi", Kl, fn);
+				r2 = newtmp("abi", Kl, fn);
+				emit(Ostorel, 0, R, r2, r1);
+				emit(Oadd, Kl, r1, r, getcon(off, fn));
+				emit(Oload, Kl, r2, i->arg[1], R);
 				off += 8;
 			}
 			if (c->class & Cstk2) {
-				blit(r, off, i->arg[1], 8, 8, fn);
+				r1 = newtmp("abi", Kl, fn);
+				r2 = newtmp("abi", Kl, fn);
+				emit(Ostorel, 0, R, r2, r1);
+				emit(Oadd, Kl, r1, r, getcon(off, fn));
+				r1 = newtmp("abi", Kl, fn);
+				emit(Oload, Kl, r2, r1, R);
+				emit(Oadd, Kl, r1, i->arg[1], getcon(8, fn));
 				off += 8;
 			}
 		}
diff --git a/simpl.c b/simpl.c
new file mode 100644
index 0000000..7001301
--- /dev/null
+++ b/simpl.c
@@ -0,0 +1,82 @@
+#include "all.h"
+
+static void
+blit(Ref sd[2], int sz, Fn *fn)
+{
+	struct { int st, ld, cls, size; } *p, tbl[] = {
+		{ Ostorel, Oload,   Kl, 8 },
+		{ Ostorew, Oload,   Kw, 4 },
+		{ Ostoreh, Oloaduh, Kw, 2 },
+		{ Ostoreb, Oloadub, Kw, 1 }
+	};
+	Ref r, r1, ro;
+	int off, fwd, n;
+
+	fwd = sz >= 0;
+	sz = abs(sz);
+	off = fwd ? sz : 0;
+	for (p=tbl; sz; p++)
+		for (n=p->size; sz>=n; sz-=n) {
+			off -= fwd ? n : 0;
+			r = newtmp("blt", Kl, fn);
+			r1 = newtmp("blt", Kl, fn);
+			ro = getcon(off, fn);
+			emit(p->st, 0, R, r, r1);
+			emit(Oadd, Kl, r1, sd[1], ro);
+			r1 = newtmp("blt", Kl, fn);
+			emit(p->ld, p->cls, r, r1, R);
+			emit(Oadd, Kl, r1, sd[0], ro);
+			off += fwd ? 0 : n;
+		}
+}
+
+static void
+ins(Ins **pi, int *new, Blk *b, Fn *fn)
+{
+	ulong ni;
+	Ins *i;
+
+	i = *pi;
+	/* simplify more instructions here;
+	 * copy 0 into xor, mul 2^n into shift,
+	 * bit rotations, ... */
+	switch (i->op) {
+	case Oblit1:
+		assert(i > b->ins);
+		assert((i-1)->op == Oblit0);
+		if (!*new) {
+			curi = &insb[NIns];
+			ni = &b->ins[b->nins] - (i+1);
+			curi -= ni;
+			icpy(curi, i+1, ni);
+			*new = 1;
+		}
+		blit((i-1)->arg, rsval(i->arg[0]), fn);
+		*pi = i-1;
+		break;
+	default:
+		if (*new)
+			emiti(*i);
+		break;
+	}
+}
+
+void
+simpl(Fn *fn)
+{
+	Blk *b;
+	Ins *i;
+	int new;
+
+	for (b=fn->start; b; b=b->link) {
+		new = 0;
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			--i;
+			ins(&i, &new, b, fn);
+		}
+		if (new) {
+			b->nins = &insb[NIns] - curi;
+			idup(&b->ins, curi, b->nins);
+		}
+	}
+}
diff --git a/test/load2.ssa b/test/load2.ssa
new file mode 100644
index 0000000..05c12a6
--- /dev/null
+++ b/test/load2.ssa
@@ -0,0 +1,75 @@
+# blit & load elimination
+
+export
+function $f() {
+@start
+        %x =l alloc4 12
+        %y =l alloc4 12
+
+	%x1 =l add 1, %x
+	%x2 =l add 1, %x1
+	%x3 =l add 1, %x2
+	%x4 =l add 1, %x3
+	%x5 =l add 1, %x4
+	%x6 =l add 1, %x5
+	%x7 =l add 1, %x6
+	%x8 =l add 1, %x7
+	%x9 =l add 1, %x8
+	%xa =l add 1, %x9
+	%xb =l add 1, %xa
+
+	%y1 =l add 1, %y
+	%y4 =l add 4, %y
+	
+	storew 287454020, %x4   # 0x11223344
+	storew 1432778632, %y   # 0x55667788
+	blit %y, %x5, 1
+	%n =w load %x4
+	call $px(w %n)          # 0x11228844
+
+	storew 287454020, %x4   # 0x11223344
+	storew 1432778632, %y   # 0x55667788
+	blit %y, %x5, 2
+	%n =w load %x4
+	call $px(w %n)          # 0x11778844
+
+	storew 287454020, %x4   # 0x11223344
+	storew 1432778632, %y   # 0x55667788
+	blit %y, %x5, 4
+	%n =w load %x4
+	call $px(w %n)          # 0x66778844
+
+	storew 287454020, %x4   # 0x11223344
+	storew 1432778632, %y   # 0x55667788
+	blit %y, %x2, 4
+	%n =w load %x4
+	call $px(w %n)          # 0x11225566
+
+	storew 287454020, %x4   # 0x11223344
+	storew 0, %y
+	storew 1432778632, %y4  # 0x55667788
+	blit %y1, %x2, 7
+	%n =w load %x4
+	call $px(w %n)          # 0x66778800
+
+	ret
+}
+
+# >>> driver
+# #include <stdio.h>
+# void px(unsigned n) {
+# 	printf("0x%08x\n", n);
+# }
+# int main() {
+# 	extern void f(void);
+# 	f();
+# }
+# <<<
+
+# >>> output
+# 0x11228844
+# 0x11778844
+# 0x66778844
+# 0x11225566
+# 0x66778800
+# <<<
diff --git a/test/mem1.ssa b/test/mem1.ssa
new file mode 100644
index 0000000..b7045a6
--- /dev/null
+++ b/test/mem1.ssa
@@ -0,0 +1,35 @@
+type :i3 = { w 3 } 
+
+export
+function :i3 $blit() {
+@start
+	%l0 =l alloc4 12
+	%l1 =l alloc4 12
+
+	storew 287454020, %l0
+	%l04 =l add %l0, 4
+	storew 1432778632, %l04
+	%l08 =l add %l0, 8
+	storew 2578103244, %l08
+
+	# we expect that %l0 and %l1
+	# are coalesced and the blit
+	# goes backwards
+	%l11 =l add %l1, 1
+	blit %l0, %l11, 11
+
+	storeb 221, %l1
+
+	ret %l1
+}
+
+# >>> driver
+# struct i3 { int a, b, c; };
+# extern struct i3 blit();
+# int main() {
+# 	struct i3 s = blit();
+# 	return !(s.a == 0x223344dd
+# 	      && s.b == 0x66778811
+# 	      && s.c == 0xaabbcc55);
+# }
+# <<<
diff --git a/tools/lexh.c b/tools/lexh.c
index a07514e..5ceb4ee 100644
--- a/tools/lexh.c
+++ b/tools/lexh.c
@@ -26,7 +26,7 @@ char *tok[] = {
 	"vaarg", "vastart", "...", "env",
 
 	"call", "phi", "jmp", "jnz", "ret", "hlt", "export",
-	"function", "type", "data", "section", "align",
+	"function", "type", "data", "section", "align", "blit",
 	"l", "w", "sh", "uh", "h", "sb", "ub", "b",
 	"d", "s", "z", "loadw", "loadl", "loads", "loadd",
 	"alloc1", "alloc2",
diff --git a/util.c b/util.c
index 41b2625..8432b5a 100644
--- a/util.c
+++ b/util.c
@@ -404,36 +404,6 @@ addcon(Con *c0, Con *c1)
 }
 
 void
-blit(Ref rdst, uint doff, Ref rsrc, uint boff, uint sz, Fn *fn)
-{
-	struct { int st, ld, cls, size; } *p, tbl[] = {
-		{ Ostorel, Oload,   Kl, 8 },
-		{ Ostorew, Oload,   Kw, 4 },
-		{ Ostoreh, Oloaduh, Kw, 2 },
-		{ Ostoreb, Oloadub, Kw, 1 }
-	};
-	Ref r, r1;
-	uint s;
-
-	for (p=tbl; sz; p++)
-		for (s=p->size; sz>=s; sz-=s, doff+=s, boff+=s) {
-			r = newtmp("blt", Kl, fn);
-			r1 = newtmp("blt", Kl, fn);
-			emit(p->st, 0, R, r, r1);
-			emit(Oadd, Kl, r1, rdst, getcon(doff, fn));
-			r1 = newtmp("blt", Kl, fn);
-			emit(p->ld, p->cls, r, r1, R);
-			emit(Oadd, Kl, r1, rsrc, getcon(boff, fn));
-		}
-}
-
-void
-blit0(Ref rdst, Ref rsrc, uint sz, Fn *fn)
-{
-	blit(rdst, 0, rsrc, 0, sz, fn);
-}
-
-void
 salloc(Ref rt, Ref rs, Fn *fn)
 {
 	Ref r0, r1;