summary refs log tree commit diff
diff options
context:
space:
mode:
author Michael Forney <mforney@mforney.org> 2022-02-12 02:27:50 -0800
committer Quentin Carbonneaux <quentin@c9x.me> 2022-02-17 22:43:12 +0100
commit 4e93eeaa3b63b6ae50954a29662cc3ea6be48b23 (patch)
tree 42f9dd888d3581ca9758afad53116f95ef790083
parent 8e040d58615e49a63fb50dda5dc695e96a54a7bc (diff)
download roux-4e93eeaa3b63b6ae50954a29662cc3ea6be48b23.tar.gz
add rv64 backend
It is mostly complete, but still has a few ABI bugs when passing
floats in structs, or when structs are passed partly in register,
and partly on stack.
-rw-r--r-- Makefile 14
-rw-r--r-- all.h 2
-rw-r--r-- doc/il.txt 1
-rw-r--r-- doc/rv64.txt 20
-rw-r--r-- main.c 2
-rw-r--r-- ops.h 266
-rw-r--r-- rv64/abi.c 584
-rw-r--r-- rv64/all.h 49
-rw-r--r-- rv64/emit.c 499
-rw-r--r-- rv64/isel.c 278
-rw-r--r-- rv64/targ.c 53
-rw-r--r-- test/dark.ssa 2
-rwxr-xr-x tools/test.sh 24
13 files changed, 1661 insertions, 133 deletions
diff --git a/Makefile b/Makefile
index 1a0074f..711873b 100644
--- a/Makefile
+++ b/Makefile
@@ -7,11 +7,13 @@ SRC      = main.c util.c parse.c cfg.c mem.c ssa.c alias.c load.c copy.c \
            fold.c live.c spill.c rega.c gas.c
 AMD64SRC = amd64/targ.c amd64/sysv.c amd64/isel.c amd64/emit.c
 ARM64SRC = arm64/targ.c arm64/abi.c arm64/isel.c arm64/emit.c
-SRCALL   = $(SRC) $(AMD64SRC) $(ARM64SRC)
+RV64SRC  = rv64/targ.c rv64/abi.c rv64/isel.c rv64/emit.c
+SRCALL   = $(SRC) $(AMD64SRC) $(ARM64SRC) $(RV64SRC)
 
 AMD64OBJ = $(AMD64SRC:%.c=$(OBJDIR)/%.o)
 ARM64OBJ = $(ARM64SRC:%.c=$(OBJDIR)/%.o)
-OBJ      = $(SRC:%.c=$(OBJDIR)/%.o) $(AMD64OBJ) $(ARM64OBJ)
+RV64OBJ  = $(RV64SRC:%.c=$(OBJDIR)/%.o)
+OBJ      = $(SRC:%.c=$(OBJDIR)/%.o) $(AMD64OBJ) $(ARM64OBJ) $(RV64OBJ)
 
 CFLAGS += -Wall -Wextra -std=c99 -g -pedantic
 
@@ -27,11 +29,13 @@ $(OBJDIR)/timestamp:
 	@mkdir -p $(OBJDIR)
 	@mkdir -p $(OBJDIR)/amd64
 	@mkdir -p $(OBJDIR)/arm64
+	@mkdir -p $(OBJDIR)/rv64
 	@touch $@
 
 $(OBJ): all.h ops.h
 $(AMD64OBJ): amd64/all.h
 $(ARM64OBJ): arm64/all.h
+$(RV64OBJ): rv64/all.h
 $(OBJDIR)/main.o: config.h
 
 config.h:
@@ -46,6 +50,9 @@ config.h:
 		*aarch64*)                             \
 			echo "#define Deftgt T_arm64"; \
 			;;                             \
+		*riscv64*)                             \
+			echo "#define Deftgt T_rv64";  \
+			;;                             \
 		*)                                     \
 			echo "#define Deftgt T_amd64_sysv";\
 			;;                             \
@@ -72,6 +79,9 @@ check: $(OBJDIR)/$(BIN)
 check-arm64: $(OBJDIR)/$(BIN)
 	TARGET=arm64 tools/test.sh all
 
+check-rv64: $(OBJDIR)/$(BIN)
+	TARGET=rv64 tools/test.sh all
+
 src:
 	@echo $(SRCALL)
 
diff --git a/all.h b/all.h
index 257d6ba..c19b4ae 100644
--- a/all.h
+++ b/all.h
@@ -179,7 +179,7 @@ enum {
 #define isarg(o) INRANGE(o, Oarg, Oargv)
 #define isret(j) INRANGE(j, Jret0, Jretc)
 
-enum Class {
+enum {
 	Kx = -1, /* "top" class (see usecheck() and clsmerge()) */
 	Kw,
 	Kl,
diff --git a/doc/il.txt b/doc/il.txt
index 994729e..0e05283 100644
--- a/doc/il.txt
+++ b/doc/il.txt
@@ -856,6 +856,7 @@ alignment required by all the targets.
 
     type :valist = align 8 { 24 }  # For amd64_sysv
     type :valist = align 8 { 32 }  # For arm64
+    type :valist = align 8 { 8 }   # For rv64
 
 The following example defines a variadic function adding
 its first three arguments.
diff --git a/doc/rv64.txt b/doc/rv64.txt
new file mode 100644
index 0000000..e696d77
--- /dev/null
+++ b/doc/rv64.txt
@@ -0,0 +1,20 @@
+=========
+RISC-V 64
+=========
+
+- Known issues
+--------------
+
+ABI with structs containing floats is not yet supported.
+
+- Possible improvements
+-----------------------
+
+rv64_isel() could turn compare used only with jnz into b{lt,ge}[u].
+
+- Helpful links
+---------------
+
+RISC-V spec: https://github.com/riscv/riscv-isa-manual/releases/latest/download/riscv-spec.pdf
+ASM manual: https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md
+psABI: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc
diff --git a/main.c b/main.c
index 83f08ba..ee16a75 100644
--- a/main.c
+++ b/main.c
@@ -7,6 +7,7 @@ Target T;
 
 extern Target T_amd64_sysv;
 extern Target T_arm64;
+extern Target T_rv64;
 
 static struct TMap {
 	char *name;
@@ -14,6 +15,7 @@ static struct TMap {
 } tmap[] = {
 	{ "amd64_sysv", &T_amd64_sysv },
 	{ "arm64", &T_arm64 },
+	{ "rv64", &T_rv64 },
 	{ 0, 0 }
 };
 
diff --git a/ops.h b/ops.h
index 9f02262..285bc5c 100644
--- a/ops.h
+++ b/ops.h
@@ -2,6 +2,11 @@
 	#define X(NMemArgs, SetsZeroFlag, LeavesFlags)
 #endif
 
+#ifndef V /* riscv64 */
+	#define V(Imm)
+#endif
+
+
 #define T(a,b,c,d,e,f,g,h) {                          \
 	{[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}, \
 	{[Kw]=K##e, [Kl]=K##f, [Ks]=K##g, [Kd]=K##h}  \
@@ -13,108 +18,108 @@
 /*********************/
 
 /* Arithmetic and Bits */
-O(add,     T(w,l,s,d, w,l,s,d), 1) X(2, 1, 0)
-O(sub,     T(w,l,s,d, w,l,s,d), 1) X(2, 1, 0)
-O(neg,     T(w,l,s,d, x,x,x,x), 1) X(1, 1, 0)
-O(div,     T(w,l,s,d, w,l,s,d), 1) X(0, 0, 0)
-O(rem,     T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0)
-O(udiv,    T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0)
-O(urem,    T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0)
-O(mul,     T(w,l,s,d, w,l,s,d), 1) X(2, 0, 0)
-O(and,     T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0)
-O(or,      T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0)
-O(xor,     T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0)
-O(sar,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0)
-O(shr,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0)
-O(shl,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0)
+O(add,     T(w,l,s,d, w,l,s,d), 1) X(2, 1, 0) V(1)
+O(sub,     T(w,l,s,d, w,l,s,d), 1) X(2, 1, 0) V(0)
+O(neg,     T(w,l,s,d, x,x,x,x), 1) X(1, 1, 0) V(0)
+O(div,     T(w,l,s,d, w,l,s,d), 1) X(0, 0, 0) V(0)
+O(rem,     T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0) V(0)
+O(udiv,    T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0) V(0)
+O(urem,    T(w,l,e,e, w,l,e,e), 1) X(0, 0, 0) V(0)
+O(mul,     T(w,l,s,d, w,l,s,d), 1) X(2, 0, 0) V(0)
+O(and,     T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0) V(1)
+O(or,      T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0) V(1)
+O(xor,     T(w,l,e,e, w,l,e,e), 1) X(2, 1, 0) V(1)
+O(sar,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0) V(1)
+O(shr,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0) V(1)
+O(shl,     T(w,l,e,e, w,w,e,e), 1) X(1, 1, 0) V(1)
 
 /* Comparisons */
-O(ceqw,    T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(cnew,    T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(csgew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(csgtw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(cslew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(csltw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(cugew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(cugtw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(culew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-O(cultw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0)
-
-O(ceql,    T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(cnel,    T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(csgel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(csgtl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(cslel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(csltl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(cugel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(cugtl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(culel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-O(cultl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0)
-
-O(ceqs,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cges,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cgts,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cles,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(clts,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cnes,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cos,     T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-O(cuos,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0)
-
-O(ceqd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cged,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cgtd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cled,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cltd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cned,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cod,     T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
-O(cuod,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0)
+O(ceqw,    T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(cnew,    T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(csgew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(csgtw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(cslew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(csltw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(1)
+O(cugew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(cugtw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(culew,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(0)
+O(cultw,   T(w,w,e,e, w,w,e,e), 1) X(0, 1, 0) V(1)
+
+O(ceql,    T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(cnel,    T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(csgel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(csgtl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(cslel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(csltl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(1)
+O(cugel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(cugtl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(culel,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(0)
+O(cultl,   T(l,l,e,e, l,l,e,e), 1) X(0, 1, 0) V(1)
+
+O(ceqs,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cges,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cgts,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cles,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(clts,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cnes,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cos,     T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+O(cuos,    T(s,s,e,e, s,s,e,e), 1) X(0, 1, 0) V(0)
+
+O(ceqd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cged,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cgtd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cled,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cltd,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cned,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cod,     T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
+O(cuod,    T(d,d,e,e, d,d,e,e), 1) X(0, 1, 0) V(0)
 
 /* Memory */
-O(storeb,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-O(storeh,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-O(storew,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-O(storel,  T(l,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-O(stores,  T(s,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-O(stored,  T(d,e,e,e, m,e,e,e), 0) X(0, 0, 1)
-
-O(loadsb,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(loadub,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(loadsh,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(loaduh,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(loadsw,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(loaduw,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(load,    T(m,m,m,m, x,x,x,x), 0) X(0, 0, 1)
+O(storeb,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+O(storeh,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+O(storew,  T(w,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+O(storel,  T(l,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+O(stores,  T(s,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+O(stored,  T(d,e,e,e, m,e,e,e), 0) X(0, 0, 1) V(0)
+
+O(loadsb,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(loadub,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(loadsh,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(loaduh,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(loadsw,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(loaduw,  T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(load,    T(m,m,m,m, x,x,x,x), 0) X(0, 0, 1) V(0)
 
 /* Extensions and Truncations */
-O(extsb,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(extub,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(extsh,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(extuh,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(extsw,   T(e,w,e,e, e,x,e,e), 1) X(0, 0, 1)
-O(extuw,   T(e,w,e,e, e,x,e,e), 1) X(0, 0, 1)
-
-O(exts,    T(e,e,e,s, e,e,e,x), 1) X(0, 0, 1)
-O(truncd,  T(e,e,d,e, e,e,x,e), 1) X(0, 0, 1)
-O(stosi,   T(s,s,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(stoui,   T(s,s,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(dtosi,   T(d,d,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(dtoui,   T(d,d,e,e, x,x,e,e), 1) X(0, 0, 1)
-O(swtof,   T(e,e,w,w, e,e,x,x), 1) X(0, 0, 1)
-O(uwtof,   T(e,e,w,w, e,e,x,x), 1) X(0, 0, 1)
-O(sltof,   T(e,e,l,l, e,e,x,x), 1) X(0, 0, 1)
-O(ultof,   T(e,e,l,l, e,e,x,x), 1) X(0, 0, 1)
-O(cast,    T(s,d,w,l, x,x,x,x), 1) X(0, 0, 1)
+O(extsb,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(extub,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(extsh,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(extuh,   T(w,w,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(extsw,   T(e,w,e,e, e,x,e,e), 1) X(0, 0, 1) V(0)
+O(extuw,   T(e,w,e,e, e,x,e,e), 1) X(0, 0, 1) V(0)
+
+O(exts,    T(e,e,e,s, e,e,e,x), 1) X(0, 0, 1) V(0)
+O(truncd,  T(e,e,d,e, e,e,x,e), 1) X(0, 0, 1) V(0)
+O(stosi,   T(s,s,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(stoui,   T(s,s,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(dtosi,   T(d,d,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(dtoui,   T(d,d,e,e, x,x,e,e), 1) X(0, 0, 1) V(0)
+O(swtof,   T(e,e,w,w, e,e,x,x), 1) X(0, 0, 1) V(0)
+O(uwtof,   T(e,e,w,w, e,e,x,x), 1) X(0, 0, 1) V(0)
+O(sltof,   T(e,e,l,l, e,e,x,x), 1) X(0, 0, 1) V(0)
+O(ultof,   T(e,e,l,l, e,e,x,x), 1) X(0, 0, 1) V(0)
+O(cast,    T(s,d,w,l, x,x,x,x), 1) X(0, 0, 1) V(0)
 
 /* Stack Allocation */
-O(alloc4,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(alloc8,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(alloc16, T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0)
+O(alloc4,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(alloc8,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(alloc16, T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
 
 /* Variadic Function Helpers */
-O(vaarg,   T(m,m,m,m, x,x,x,x), 0) X(0, 0, 0)
-O(vastart, T(m,e,e,e, x,e,e,e), 0) X(0, 0, 0)
+O(vaarg,   T(m,m,m,m, x,x,x,x), 0) X(0, 0, 0) V(0)
+O(vastart, T(m,e,e,e, x,e,e,e), 0) X(0, 0, 0) V(0)
 
-O(copy,    T(w,l,s,d, x,x,x,x), 0) X(0, 0, 1)
+O(copy,    T(w,l,s,d, x,x,x,x), 0) X(0, 0, 1) V(0)
 
 
 /****************************************/
@@ -122,52 +127,55 @@ O(copy,    T(w,l,s,d, x,x,x,x), 0) X(0, 0, 1)
 /****************************************/
 
 /* Miscellaneous and Architecture-Specific Operations */
-O(nop,     T(x,x,x,x, x,x,x,x), 0) X(0, 0, 1)
-O(addr,    T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(swap,    T(w,l,s,d, w,l,s,d), 0) X(1, 0, 0)
-O(sign,    T(w,l,e,e, x,x,e,e), 0) X(0, 0, 0)
-O(salloc,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(xidiv,   T(w,l,e,e, x,x,e,e), 0) X(1, 0, 0)
-O(xdiv,    T(w,l,e,e, x,x,e,e), 0) X(1, 0, 0)
-O(xcmp,    T(w,l,s,d, w,l,s,d), 0) X(1, 1, 0)
-O(xtest,   T(w,l,e,e, w,l,e,e), 0) X(1, 1, 0)
-O(acmp,    T(w,l,e,e, w,l,e,e), 0) X(0, 0, 0)
-O(acmn,    T(w,l,e,e, w,l,e,e), 0) X(0, 0, 0)
-O(afcmp,   T(e,e,s,d, e,e,s,d), 0) X(0, 0, 0)
+O(nop,     T(x,x,x,x, x,x,x,x), 0) X(0, 0, 1) V(0)
+O(addr,    T(m,m,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(swap,    T(w,l,s,d, w,l,s,d), 0) X(1, 0, 0) V(0)
+O(sign,    T(w,l,e,e, x,x,e,e), 0) X(0, 0, 0) V(0)
+O(salloc,  T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(xidiv,   T(w,l,e,e, x,x,e,e), 0) X(1, 0, 0) V(0)
+O(xdiv,    T(w,l,e,e, x,x,e,e), 0) X(1, 0, 0) V(0)
+O(xcmp,    T(w,l,s,d, w,l,s,d), 0) X(1, 1, 0) V(0)
+O(xtest,   T(w,l,e,e, w,l,e,e), 0) X(1, 1, 0) V(0)
+O(acmp,    T(w,l,e,e, w,l,e,e), 0) X(0, 0, 0) V(0)
+O(acmn,    T(w,l,e,e, w,l,e,e), 0) X(0, 0, 0) V(0)
+O(afcmp,   T(e,e,s,d, e,e,s,d), 0) X(0, 0, 0) V(0)
+O(reqz,    T(w,l,e,e, x,x,e,e), 0) X(0, 0, 0) V(0)
+O(rnez,    T(w,l,e,e, x,x,e,e), 0) X(0, 0, 0) V(0)
 
 /* Arguments, Parameters, and Calls */
-O(par,     T(x,x,x,x, x,x,x,x), 0) X(0, 0, 0)
-O(parc,    T(e,x,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(pare,    T(e,x,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(arg,     T(w,l,s,d, x,x,x,x), 0) X(0, 0, 0)
-O(argc,    T(e,x,e,e, e,l,e,e), 0) X(0, 0, 0)
-O(arge,    T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0)
-O(argv,    T(x,x,x,x, x,x,x,x), 0) X(0, 0, 0)
-O(call,    T(m,m,m,m, x,x,x,x), 0) X(0, 0, 0)
+O(par,     T(x,x,x,x, x,x,x,x), 0) X(0, 0, 0) V(0)
+O(parc,    T(e,x,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(pare,    T(e,x,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(arg,     T(w,l,s,d, x,x,x,x), 0) X(0, 0, 0) V(0)
+O(argc,    T(e,x,e,e, e,l,e,e), 0) X(0, 0, 0) V(0)
+O(arge,    T(e,l,e,e, e,x,e,e), 0) X(0, 0, 0) V(0)
+O(argv,    T(x,x,x,x, x,x,x,x), 0) X(0, 0, 0) V(0)
+O(call,    T(m,m,m,m, x,x,x,x), 0) X(0, 0, 0) V(0)
 
 /* Flags Setting */
-O(flagieq,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagine,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagisge, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagisgt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagisle, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagislt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagiuge, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagiugt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagiule, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagiult, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfeq,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfge,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfgt,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfle,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagflt,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfne,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfo,   T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
-O(flagfuo,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1)
+O(flagieq,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagine,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagisge, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagisgt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagisle, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagislt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagiuge, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagiugt, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagiule, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagiult, T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfeq,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfge,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfgt,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfle,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagflt,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfne,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfo,   T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
+O(flagfuo,  T(x,x,e,e, x,x,e,e), 0) X(0, 0, 1) V(0)
 
 
 #undef T
 #undef X
+#undef V
 #undef O
 
 /*
diff --git a/rv64/abi.c b/rv64/abi.c
new file mode 100644
index 0000000..1dd4fb0
--- /dev/null
+++ b/rv64/abi.c
@@ -0,0 +1,584 @@
+#include "all.h"
+
+typedef struct Class Class;
+typedef struct Insl Insl;
+typedef struct Params Params;
+
+enum { /* bit flags stored in Class.class */
+	Cptr  = 1, /* replaced by a pointer */
+	Cstk1 = 2, /* pass first XLEN on the stack */
+	Cstk2 = 4, /* pass second XLEN on the stack */
+	Cstk = Cstk1 | Cstk2, /* mask: any part is passed on the stack */
+	Cfpint = 8, /* float passed like integer (in a gp register) */
+};
+
+struct Class { /* how one argument, parameter or return value is passed */
+	char class; /* bitwise OR of Cptr/Cstk1/Cstk2/Cfpint */
+	uint size; /* bytes: 8 for scalars and pointers, else the aggregate size rounded up to 8 */
+	Typ *t; /* aggregate type; written only by typclass() */
+	uchar nreg; /* entries of reg[]/cls[] filled in by typclass() */
+	uchar ngp; /* integer registers consumed */
+	uchar nfp; /* floating-point registers consumed */
+	int reg[2]; /* register ids, in passing order */
+	int cls[2]; /* class of the value held in each register */
+};
+
+struct Insl { /* node of a singly linked list of instructions (see stkblob) */
+	Ins i; /* the queued instruction */
+	Insl *link; /* next node, or 0 at the end of the list */
+};
+
+struct Params { /* summary of a function's parameters, produced by selpar() */
+	int ngp; /* gp argument registers accounted for by the parameters */
+	int nfp; /* fp argument registers accounted for by the parameters */
+	int stk; /* stack offset for varargs */
+};
+
+static int gpreg[] = { A0,  A1,  A2,  A3,  A4,  A5,  A6,  A7}; /* integer argument registers, in allocation order */
+static int fpreg[] = {FA0, FA1, FA2, FA3, FA4, FA5, FA6, FA7}; /* floating-point argument registers, in allocation order */
+
+/* layout of call's second argument (RCall)
+ *
+ *  29       8    4  2  0
+ *  |0.00|xxxx|xxxx|xx|xx|                  range
+ *           |    |  |  ` gp regs returned (0..2)
+ *           |    |  ` fp regs returned    (0..2)
+ *           |    ` gp regs passed         (0..8)
+ *            ` fp regs passed             (0..8)
+ */
+
+bits
+rv64_retregs(Ref r, int p[2]) /* set of registers that the RCall ref r marks as returned */
+{
+	bits b;
+	int ngp, nfp;
+
+	assert(rtype(r) == RCall);
+	ngp = r.val & 3; /* bits 0..1: gp regs returned */
+	nfp = (r.val >> 2) & 3; /* bits 2..3: fp regs returned */
+	if (p) { /* optionally report the counts: p[0] = gp, p[1] = fp */
+		p[0] = ngp;
+		p[1] = nfp;
+	}
+	b = 0;
+	while (ngp--) /* sets the bits for A0 .. A0+ngp-1 */
+		b |= BIT(A0+ngp);
+	while (nfp--) /* sets the bits for FA0 .. FA0+nfp-1 */
+		b |= BIT(FA0+nfp);
+	return b;
+}
+
+bits
+rv64_argregs(Ref r, int p[2]) /* set of registers that the RCall ref r marks as passed */
+{
+	bits b;
+	int ngp, nfp;
+
+	assert(rtype(r) == RCall);
+	/* bits 4..7: gp regs passed, bits 8..11: fp regs passed */
+	ngp = (r.val >> 4) & 15;
+	nfp = (r.val >> 8) & 15;
+	if (p) { /* optionally report the counts: p[0] = gp, p[1] = fp */
+		p[0] = ngp;
+		p[1] = nfp;
+	}
+	b = 0;
+	while (ngp--) /* sets the bits for A0 .. A0+ngp-1 */
+		b |= BIT(A0+ngp);
+	while (nfp--) /* sets the bits for FA0 .. FA0+nfp-1 */
+		b |= BIT(FA0+nfp);
+	return b;
+}
+
+static void
+typclass(Class *c, Typ *t, int *gp, int *fp) /* classify aggregate type t into *c, taking registers from gp; fp is not used yet */
+{
+	uint64_t sz;
+	uint n;
+
+	sz = (t->size + 7) & ~7; /* size rounded up to a multiple of 8 */
+	c->t = t;
+	c->class = 0;
+	c->ngp = 0;
+	c->nfp = 0;
+
+	if (t->align > 4) /* align is presumably log2 of the byte alignment (4 -> 16) — TODO confirm */
+		err("alignments larger than 16 are not supported");
+
+	if (t->dark || sz > 16 || sz == 0) {
+		/* dark, empty, or larger-than-16-byte
+		 * types are replaced by a pointer to
+		 * some caller-allocated memory */
+		c->class |= Cptr;
+		c->size = 8;
+		return;
+	}
+
+	c->size = sz;
+
+	/* TODO: float */
+
+	for (n=0; n<sz/8; n++, c->ngp++) { /* one Kl integer register per 8 bytes */
+		c->reg[n] = *gp++; /* NOTE(review): no bound check; advances only the local copy of gp */
+		c->cls[n] = Kl;
+	}
+
+	c->nreg = n;
+}
+
+static void
+sttmps(Ref tmp[], int cls[], uint nreg, Ref mem, Fn *fn) /* create nreg temporaries in tmp[] and emit stores of them at increasing offsets from mem */
+{
+	static int st[] = { /* store opcode for each class */
+		[Kw] = Ostorew, [Kl] = Ostorel,
+		[Ks] = Ostores, [Kd] = Ostored
+	};
+	uint n;
+	uint64_t off;
+	Ref r;
+
+	assert(nreg <= 4);
+	off = 0;
+	for (n=0; n<nreg; n++) {
+		tmp[n] = newtmp("abi", cls[n], fn);
+		r = newtmp("abi", Kl, fn);
+		emit(st[cls[n]], 0, R, tmp[n], r); /* store tmp[n] at address r */
+		emit(Oadd, Kl, r, mem, getcon(off, fn)); /* r = mem + off; emitted after its use, so emit() presumably prepends — TODO confirm */
+		off += KWIDE(cls[n]) ? 8 : 4; /* wide classes take 8 bytes, the others 4 */
+	}
+}
+
+static void
+ldregs(int reg[], int cls[], int n, Ref mem, Fn *fn) /* emit loads of the n registers reg[] from increasing offsets from mem */
+{
+	int i;
+	uint64_t off;
+	Ref r;
+
+	off = 0;
+	for (i=0; i<n; i++) {
+		r = newtmp("abi", Kl, fn);
+		emit(Oload, cls[i], TMP(reg[i]), r, R); /* reg[i] = value at address r */
+		emit(Oadd, Kl, r, mem, getcon(off, fn)); /* r = mem + off; emitted after its use, so emit() presumably prepends — TODO confirm */
+		off += KWIDE(cls[i]) ? 8 : 4; /* wide classes take 8 bytes, the others 4 */
+	}
+}
+
+static void
+selret(Blk *b, Fn *fn) /* lower a value-returning jump of b: place the value in return registers and rewrite the jump as Jret0 */
+{
+	int j, k, cty;
+	Ref r;
+	Class cr;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == Jret0) /* not a return, or no value to return */
+		return;
+
+	r = b->jmp.arg;
+	b->jmp.type = Jret0;
+
+	if (j == Jretc) { /* aggregate return */
+		typclass(&cr, &typ[fn->retty], gpreg, fpreg);
+		cty = (cr.nfp << 2) | cr.ngp; /* returned-register counts in RCall layout */
+		if (cr.class & Cptr) { /* returned in memory: blit into the buffer addressed by fn->retr */
+			assert(rtype(fn->retr) == RTmp);
+			blit(fn->retr, 0, r, cr.t->size, fn);
+		} else { /* returned in registers: load them from the aggregate at r */
+			ldregs(cr.reg, cr.cls, cr.nreg, r, fn);
+		}
+	} else {
+		k = j - Jretw; /* class of the scalar; assumes Jretw.. follow the Kw.. order — TODO confirm */
+		if (KBASE(k) == 0) {
+			emit(Ocopy, k, TMP(A0), r, R);
+			cty = 1; /* one gp register returned */
+		} else {
+			emit(Ocopy, k, TMP(FA0), r, R);
+			cty = 1 << 2; /* one fp register returned */
+		}
+	}
+
+	b->jmp.arg = CALL(cty); /* record which registers carry the result */
+}
+
+static int
+argsclass(Ins *i0, Ins *i1, Class *carg, Ref *env, int retptr) /* classify the arg/par instructions [i0, i1) into carg[]; returns the used-register counts in RCall layout */
+{
+	int ngp, nfp, *gp, *fp, vararg;
+	Class *c;
+	Ins *i;
+
+	gp = gpreg; /* next free integer argument register */
+	fp = fpreg; /* next free floating-point argument register */
+	ngp = 8; /* registers still free in each file */
+	nfp = 8;
+	vararg = 0;
+	if (retptr) { /* the first gp register is reserved when the return value is Cptr */
+		gp++;
+		ngp--;
+	}
+	for (i=i0, c=carg; i<i1; i++, c++) {
+		switch (i->op) {
+		case Opar:
+		case Oarg: /* scalar argument or parameter */
+			c->cls[0] = i->cls;
+			c->size = 8;
+			/* variadic float args are passed in int regs */
+			if (!vararg && KBASE(i->cls) == 1 && nfp > 0) {
+				nfp--;
+				c->reg[0] = *fp++;
+			} else if (ngp > 0) { /* also taken by named floats once the fp registers run out */
+				if (KBASE(i->cls) == 1) /* relies on carg[] starting zeroed — TODO confirm alloc() zeroes */
+					c->class |= Cfpint;
+				ngp--;
+				c->reg[0] = *gp++;
+			} else { /* no register left */
+				c->class |= Cstk1;
+			}
+			break;
+		case Oargv:
+			/* subsequent arguments are variadic */
+			vararg = 1;
+			break;
+		case Oparc:
+		case Oargc: /* aggregate argument or parameter */
+			typclass(c, &typ[i->arg[0].val], gp, fp); /* NOTE(review): may read *gp past the end of gpreg when too few gp registers remain */
+			if (c->class & Cptr) { /* passed by reference: one gp register for the pointer */
+				c->ngp = 1;
+				c->reg[0] = *gp; /* NOTE(review): same out-of-range read when ngp == 0 */
+				c->cls[0] = Kl;
+			}
+			if (c->ngp <= ngp && c->nfp <= nfp) { /* fits in the registers typclass chose */
+				ngp -= c->ngp;
+				nfp -= c->nfp;
+				gp += c->ngp;
+				fp += c->nfp;
+				break;
+			}
+			c->ngp += c->nfp; /* otherwise try integer registers only */
+			c->nfp = 0;
+			if (c->ngp <= ngp) {
+				ngp -= c->ngp;
+				gp += c->ngp;
+				break;
+			}
+			c->class |= Cstk1; /* does not fit in registers */
+			if (c->ngp - 1 > ngp) /* second XLEN goes to the stack too */
+				c->class |= Cstk2;
+			break;
+		case Opare: /* env parameter: only recorded for the caller of argsclass */
+			*env = i->to;
+			break;
+		case Oarge: /* env argument: only recorded for the caller of argsclass */
+			*env = i->arg[0];
+			break;
+		}
+	}
+	return (gp-gpreg) << 4 | (fp-fpreg) << 8; /* gp count in bits 4..7, fp count in bits 8..11 */
+}
+
+static void
+stkblob(Ref r, Class *c, Fn *fn, Insl **ilp) /* queue on *ilp an alloc instruction that defines r as storage for a value of class c */
+{
+	Insl *il;
+	int al;
+	uint64_t sz;
+
+	il = alloc(sizeof *il);
+	al = c->t->align - 2; /* NAlign == 3 */
+	if (al < 0) /* clamp small alignments to the first alloc variant */
+		al = 0;
+	sz = c->class & Cptr ? c->t->size : c->size; /* by-reference values need the full type size, others the rounded size */
+	il->i = (Ins){Oalloc+al, Kl, r, {getcon(sz, fn)}}; /* Oalloc+al presumably selects alloc4/8/16 — TODO confirm */
+	il->link = *ilp; /* push on the front of the list */
+	*ilp = il;
+}
+
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, Insl **ilp) /* lower the call i1 together with its argument instructions [i0, i1) */
+{
+	Ins *i;
+	Class *ca, *c, cr;
+	int k, cty, envc;
+	uint n;
+	uint64_t stk, off;
+	Ref r, r1, env, tmp[2];
+
+	env = R;
+	ca = alloc((i1-i0) * sizeof ca[0]); /* one Class per argument instruction */
+	cr.class = 0;
+
+	if (!req(i1->arg[1], R)) /* arg[1] of the call names the aggregate return type, if any */
+		typclass(&cr, &typ[i1->arg[1].val], gpreg, fpreg);
+
+	cty = argsclass(i0, i1, ca, &env, cr.class & Cptr);
+	stk = 0;
+	for (i=i0, c=ca; i<i1; i++, c++) { /* size the outgoing stack area and set up by-reference copies */
+		if (i->op == Oargv)
+			continue;
+		if (c->class & Cptr) { /* pass the address of a fresh blob instead of the aggregate */
+			i->arg[0] = newtmp("abi", Kl, fn);
+			stkblob(i->arg[0], c, fn, ilp);
+			i->op = Oarg;
+		}
+		if (c->class & Cstk1)
+			stk += 8;
+		if (c->class & Cstk2)
+			stk += 8;
+	}
+	if (stk)
+		emit(Osalloc, Kl, R, getcon(-stk, fn), R); /* pairs with the salloc of +stk emitted at the end of this function */
+
+	if (!req(i1->arg[1], R)) { /* aggregate result */
+		stkblob(i1->to, &cr, fn, ilp); /* i1->to is defined by a queued alloc */
+		cty |= (cr.nfp << 2) | cr.ngp;
+		if (cr.class & Cptr) {
+			cty |= 1; /* mark one gp register as returned */
+			emit(Ocopy, Kw, R, TMP(A0), R); /* copy with no destination; presumably only keeps A0 live — TODO confirm */
+		} else {
+			sttmps(tmp, cr.cls, cr.nreg, i1->to, fn); /* store the returned registers into the result blob */
+			for (n=0; n<cr.nreg; n++) {
+				r = TMP(cr.reg[n]);
+				emit(Ocopy, cr.cls[n], tmp[n], r, R);
+			}
+		}
+	} else if (KBASE(i1->cls) == 0) { /* integer scalar result comes back in A0 */
+		emit(Ocopy, i1->cls, i1->to, TMP(A0), R);
+		cty |= 1;
+	} else { /* floating-point scalar result comes back in FA0 */
+		emit(Ocopy, i1->cls, i1->to, TMP(FA0), R);
+		cty |= 1 << 2;
+	}
+
+	envc = !req(R, env);
+	if (envc)
+		die("todo (rv64 abi): env calls");
+	emit(Ocall, 0, R, i1->arg[0], CALL(cty)); /* the call itself; CALL(cty) carries the register counts */
+
+	if (cr.class & Cptr)
+		/* struct return argument */
+		emit(Ocopy, Kl, TMP(A0), i1->to, R);
+
+	/* move arguments into registers */
+	for (i=i0, c=ca; i<i1; i++, c++) {
+		if (i->op == Oargv || c->class & Cstk1)
+			continue;
+		if (i->op == Oargc) { /* small aggregate: load its registers from the memory at arg[1] */
+			ldregs(c->reg, c->cls, c->nreg, i->arg[1], fn);
+		} else if (c->class & Cfpint) { /* float in a gp register: copy from an integer temp set by the cast below */
+			k = KWIDE(*c->cls) ? Kl : Kw;
+			r = newtmp("abi", k, fn);
+			emit(Ocopy, k, TMP(c->reg[0]), r, R);
+			c->reg[0] = r.val; /* reg[0] now names the temp rather than the register */
+		} else {
+			emit(Ocopy, *c->cls, TMP(*c->reg), i->arg[0], R);
+		}
+	}
+
+	for (i=i0, c=ca; i<i1; i++, c++) {
+		if (c->class & Cfpint) /* cast the float argument into the integer temp created above */
+			emit(Ocast, KWIDE(*c->cls) ? Kl : Kw, TMP(*c->reg), i->arg[0], R);
+		if (c->class & Cptr) /* fill the blob at arg[0] from the aggregate at arg[1] (blit presumably takes dst first) */
+			blit(i->arg[0], 0, i->arg[1], c->t->size, fn);
+	}
+
+	if (!stk)
+		return;
+
+	r = newtmp("abi", Kl, fn); /* defined by the final salloc: base of the outgoing stack area */
+	for (i=i0, c=ca, off=0; i<i1; i++, c++) {
+		if (i->op == Oargv || (c->class & Cstk) == 0)
+			continue;
+		if (i->op != Oargc) {
+			r1 = newtmp("abi", Kl, fn);
+			/* w arguments are stored sign-extended
+			 * to 64-bits
+			 *
+			 * s arguments can just be stored with
+			 * Ostores into the first 32-bits in the
+			 * stack position since the ABI says the
+			 * upper bits are undefined
+			 */
+			emit(i->cls == Kw ? Ostorel : Ostorew+i->cls, 0, R, i->arg[0], r1);
+			if (i->cls == Kw) {
+				/* TODO: we only need this sign extension
+				 * for subtyped l temporaries passed as w
+				 * arguments (see rv64/isel.c:fixarg)
+				 *
+				 * however, we cannot just fix it in isel
+				 * since by that point we have forgotten
+				 * the original argument type
+				 */
+				curi->arg[0] = newtmp("abi", Kl, fn); /* curi presumably points at the store just emitted; retarget it */
+				emit(Oextsw, Kl, curi->arg[0], i->arg[0], R);
+			}
+			emit(Oadd, Kl, r1, r, getcon(off, fn)); /* r1 = r + off */
+		} else
+			blit(r, off, i->arg[1], c->t->size, fn);
+		off += c->size;
+	}
+	emit(Osalloc, Kl, r, getcon(stk, fn), R);
+}
+
+static Params
+selpar(Fn *fn, Ins *i0, Ins *i1) /* lower the parameter instructions [i0, i1) of fn; returns register and stack usage */
+{
+	Class *ca, *c, cr;
+	Insl *il;
+	Ins *i;
+	int n, s, cty;
+	Ref r, env, tmp[16], *t;
+
+	env = R;
+	ca = alloc((i1-i0) * sizeof ca[0]); /* one Class per parameter instruction */
+	cr.class = 0;
+	curi = &insb[NIns]; /* reset the emission cursor to the end of the buffer */
+
+	if (fn->retty >= 0) { /* the function returns an aggregate */
+		typclass(&cr, &typ[fn->retty], gpreg, fpreg);
+		if (cr.class & Cptr) { /* keep the incoming A0 (result pointer) in fn->retr for selret */
+			fn->retr = newtmp("abi", Kl, fn);
+			emit(Ocopy, Kl, fn->retr, TMP(A0), R);
+		}
+	}
+
+	cty = argsclass(i0, i1, ca, &env, cr.class & Cptr);
+	fn->reg = rv64_argregs(CALL(cty), 0); /* registers that carry incoming arguments */
+
+	il = 0;
+	t = tmp;
+	for (i=i0, c=ca; i<i1; i++, c++) { /* aggregates arriving in registers get a stack blob filled from temporaries */
+		if (i->op != Oparc || (c->class & (Cptr|Cstk)))
+			continue;
+		sttmps(t, c->cls, c->nreg, i->to, fn);
+		stkblob(i->to, c, fn, &il);
+		t += c->nreg;
+	}
+	for (; il; il=il->link) /* emit the queued allocs */
+		emiti(il->i);
+
+	t = tmp;
+	for (i=i0, c=ca, s=2 + 8 * fn->vararg; i<i1; i++, c++) { /* s: slot index of the next stack-passed parameter, in 8-byte units */
+		if (i->op == Oparc
+		&& (c->class & Cptr) == 0) {
+			if (c->class & Cstk) { /* aggregate passed on the stack: bind its temp to the incoming slot */
+				fn->tmp[i->to.val].slot = -s;
+				s += c->size / 8;
+			} else { /* copy each argument register into the temp sttmps created for it */
+				for (n=0; n<c->nreg; n++) {
+					r = TMP(c->reg[n]);
+					emit(Ocopy, c->cls[n], *t++, r, R);
+				}
+			}
+		} else if (c->class & Cstk1) { /* one-slot value passed on the stack */
+			emit(Oload, c->cls[0], i->to, SLOT(-s), R);
+			s++;
+		} else { /* value passed in a register */
+			emit(Ocopy, c->cls[0], i->to, TMP(c->reg[0]), R);
+		}
+	}
+
+	if (!req(R, env))
+		die("todo (rv64 abi): env calls");
+
+	return (Params){
+		.stk = s, /* slot index just past the last stack-passed parameter */
+		.ngp = (cty >> 4) & 15,
+		.nfp = (cty >> 8) & 15,
+	};
+}
+
+static void
+selvaarg(Fn *fn, Ins *i) /* lower vaarg: read the next argument through the pointer stored at i->arg[0], then advance that pointer by 8 */
+{
+	Ref loc, newloc; /* the trailing notes read bottom-up if emit() prepends, as the data flow suggests — TODO confirm */
+
+	loc = newtmp("abi", Kl, fn);
+	newloc = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, newloc, i->arg[0]); /* store newloc back through i->arg[0] */
+	emit(Oadd, Kl, newloc, loc, getcon(8, fn)); /* newloc = loc + 8 */
+	emit(Oload, i->cls, i->to, loc, R); /* i->to = value at loc */
+	emit(Oload, Kl, loc, i->arg[0], R); /* loc = pointer stored at i->arg[0] */
+}
+
+static void
+selvastart(Fn *fn, Params p, Ref ap) /* lower vastart: store the address of a stack slot through ap */
+{
+	Ref rsave;
+	int s;
+
+	rsave = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, rsave, ap); /* store rsave through ap */
+	s = p.stk > 2 + 8 * fn->vararg ? p.stk : 2 + p.ngp; /* p.stk if selpar consumed stack slots; else 2 + ngp, presumably into a register save area — TODO confirm */
+	emit(Oaddr, Kl, rsave, SLOT(-s), R); /* rsave = address of slot -s */
+}
+
+void
+rv64_abi(Fn *fn) /* ABI lowering entry point: rewrites parameters, calls, returns and vararg instructions of fn */
+{
+	Blk *b;
+	Ins *i, *i0, *ip;
+	Insl *il;
+	int n;
+	Params p;
+
+	for (b=fn->start; b; b=b->link)
+		b->visit = 0;
+
+	/* lower parameters */
+	for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++) /* find the end of the leading par instructions */
+		if (!ispar(i->op))
+			break;
+	p = selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (&insb[NIns] - curi); /* remaining instructions plus those selpar emitted */
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, curi, &insb[NIns] - curi); /* lowered parameters first ... */
+	ip = icpy(ip, i, &b->ins[b->nins] - i); /* ... then the rest of the start block */
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower calls, returns, and vararg instructions */
+	il = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		if (b->visit)
+			continue;
+		curi = &insb[NIns]; /* start a fresh emission buffer for this block */
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;) /* walk the block from its last instruction to its first */
+			switch ((--i)->op) {
+			default:
+				emiti(*i);
+				break;
+			case Ocall:
+				for (i0=i; i0>b->ins; i0--) /* i0: first of the arg instructions directly before the call */
+					if (!isarg((i0-1)->op))
+						break;
+				selcall(fn, i0, i, &il);
+				i = i0; /* the arguments were consumed by selcall */
+				break;
+			case Ovastart:
+				selvastart(fn, p, i->arg[0]);
+				break;
+			case Ovaarg:
+				selvaarg(fn, i);
+				break;
+			case Oarg:
+			case Oargc:
+				die("unreachable");
+			}
+		if (b == fn->start) /* the start block goes last so it receives every alloc queued by selcall */
+			for (; il; il=il->link)
+				emiti(il->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/rv64/all.h b/rv64/all.h
new file mode 100644
index 0000000..eb2daa9
--- /dev/null
+++ b/rv64/all.h
@@ -0,0 +1,49 @@
+#include "../all.h"
+
+typedef struct Rv64Op Rv64Op;
+
+enum Rv64Reg { /* register numbering of the rv64 backend, starting right after RXX */
+	/* caller-save */
+	T0 = RXX + 1, T1, T2, T3, T4, T5,
+	A0, A1, A2, A3, A4, A5, A6, A7,
+
+	/* callee-save */
+	S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11,
+
+	/* globally live */
+	FP, SP, GP, TP, RA, T6,
+
+	/* FP caller-save */
+	FT0, FT1, FT2, FT3, FT4, FT5, FT6, FT7, FT8, FT9, FT10, FT11,
+	FA0, FA1, FA2, FA3, FA4, FA5, FA6, FA7,
+
+	/* FP callee-save */
+	FS0, FS1, FS2, FS3, FS4, FS5, FS6, FS7, FS8, FS9, FS10, FS11,
+
+	NFPR = FS11 - FT0 + 1, /* number of fp registers (FT0..FS11) */
+	NGPR = T6 - T0 + 1, /* number of gp registers (T0..T6) */
+	NGPS = A7 - T0 + 1, /* number of caller-save gp registers (T0..A7) */
+	NFPS = FA7 - FT0 + 1, /* number of caller-save fp registers (FT0..FA7) */
+	NCLR = (S11 - S1 + 1) + (FS11 - FS0 + 1), /* number of callee-save registers, gp plus fp */
+};
+MAKESURE(reg_not_tmp, FS11 < (int)Tmp0);
+
+struct Rv64Op { /* per-opcode data for rv64, one entry per op in rv64_op[] */
+	char imm; /* presumably the V(Imm) flag from ops.h: the op accepts an immediate — TODO confirm in targ.c */
+};
+
+/* targ.c */
+extern int rv64_rsave[];
+extern int rv64_rclob[];
+extern Rv64Op rv64_op[];
+
+/* abi.c */
+bits rv64_retregs(Ref, int[2]);
+bits rv64_argregs(Ref, int[2]);
+void rv64_abi(Fn *);
+
+/* isel.c */
+void rv64_isel(Fn *);
+
+/* emit.c */
+void rv64_emitfn(Fn *, FILE *);
diff --git a/rv64/emit.c b/rv64/emit.c
new file mode 100644
index 0000000..b34b424
--- /dev/null
+++ b/rv64/emit.c
@@ -0,0 +1,499 @@
+#include "all.h"
+
+enum {
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+static struct {
+	short op; /* IL opcode */
+	short cls; /* class to match: an exact class, or the Ki/Ka wildcards */
+	char *asm; /* template expanded by emitf() */
+} omap[] = {
+	{ Oadd,    Ki, "add%k %=, %0, %1" },
+	{ Oadd,    Ka, "fadd.%k %=, %0, %1" },
+	{ Osub,    Ki, "sub%k %=, %0, %1" },
+	{ Osub,    Ka, "fsub.%k %=, %0, %1" },
+	{ Oneg,    Ki, "neg%k %=, %0" },
+	{ Oneg,    Ka, "fneg.%k %=, %0" },
+	{ Odiv,    Ki, "div%k %=, %0, %1" },
+	{ Odiv,    Ka, "fdiv.%k %=, %0, %1" },
+	{ Orem,    Ki, "rem%k %=, %0, %1" },
+	{ Orem,    Kl, "rem %=, %0, %1" }, /* never selected: the Ki row above matches first */
+	{ Oudiv,   Ki, "divu%k %=, %0, %1" },
+	{ Ourem,   Ki, "remu%k %=, %0, %1" },
+	{ Omul,    Ki, "mul%k %=, %0, %1" },
+	{ Omul,    Ka, "fmul.%k %=, %0, %1" },
+	{ Oand,    Ki, "and %=, %0, %1" },
+	{ Oor,     Ki, "or %=, %0, %1" },
+	{ Oxor,    Ki, "xor %=, %0, %1" },
+	{ Osar,    Ki, "sra%k %=, %0, %1" },
+	{ Oshr,    Ki, "srl%k %=, %0, %1" },
+	{ Oshl,    Ki, "sll%k %=, %0, %1" },
+	{ Ocsltl,  Ki, "slt %=, %0, %1" },
+	{ Ocultl,  Ki, "sltu %=, %0, %1" },
+	{ Oceqs,   Ki, "feq.s %=, %0, %1" },
+	{ Ocges,   Ki, "fge.s %=, %0, %1" },
+	{ Ocgts,   Ki, "fgt.s %=, %0, %1" },
+	{ Ocles,   Ki, "fle.s %=, %0, %1" },
+	{ Oclts,   Ki, "flt.s %=, %0, %1" },
+	{ Oceqd,   Ki, "feq.d %=, %0, %1" },
+	{ Ocged,   Ki, "fge.d %=, %0, %1" },
+	{ Ocgtd,   Ki, "fgt.d %=, %0, %1" },
+	{ Ocled,   Ki, "fle.d %=, %0, %1" },
+	{ Ocltd,   Ki, "flt.d %=, %0, %1" },
+	{ Ostoreb, Kw, "sb %0, %M1" },
+	{ Ostoreh, Kw, "sh %0, %M1" },
+	{ Ostorew, Kw, "sw %0, %M1" },
+	{ Ostorel, Ki, "sd %0, %M1" },
+	{ Ostores, Kw, "fsw %0, %M1" },
+	{ Ostored, Kw, "fsd %0, %M1" },
+	{ Oloadsb, Ki, "lb %=, %M0" },
+	{ Oloadub, Ki, "lbu %=, %M0" },
+	{ Oloadsh, Ki, "lh %=, %M0" },
+	{ Oloaduh, Ki, "lhu %=, %M0" },
+	{ Oloadsw, Ki, "lw %=, %M0" },
+	/* riscv64 always sign-extends 32-bit
+	 * values stored in 64-bit registers,
+	 * so a Kw loaduw is a plain lw */
+	{ Oloaduw, Kw, "lw %=, %M0" },
+	{ Oloaduw, Kl, "lwu %=, %M0" },
+	{ Oload,   Kw, "lw %=, %M0" },
+	{ Oload,   Kl, "ld %=, %M0" },
+	{ Oload,   Ks, "flw %=, %M0" },
+	{ Oload,   Kd, "fld %=, %M0" },
+	{ Oextsb,  Ki, "sext.b %=, %0" },
+	{ Oextub,  Ki, "zext.b %=, %0" },
+	{ Oextsh,  Ki, "sext.h %=, %0" },
+	{ Oextuh,  Ki, "zext.h %=, %0" },
+	{ Oextsw,  Kl, "sext.w %=, %0" },
+	{ Oextuw,  Kl, "zext.w %=, %0" },
+	{ Otruncd, Ks, "fcvt.s.d %=, %0" },
+	{ Oexts,   Kd, "fcvt.d.s %=, %0" },
+	{ Ostosi,  Kw, "fcvt.w.s %=, %0, rtz" },
+	{ Ostosi,  Kl, "fcvt.l.s %=, %0, rtz" },
+	{ Ostoui,  Kw, "fcvt.wu.s %=, %0, rtz" },
+	{ Ostoui,  Kl, "fcvt.lu.s %=, %0, rtz" },
+	{ Odtosi,  Kw, "fcvt.w.d %=, %0, rtz" },
+	{ Odtosi,  Kl, "fcvt.l.d %=, %0, rtz" },
+	{ Odtoui,  Kw, "fcvt.wu.d %=, %0, rtz" },
+	{ Odtoui,  Kl, "fcvt.lu.d %=, %0, rtz" },
+	{ Oswtof,  Ka, "fcvt.%k.w %=, %0" },
+	{ Ouwtof,  Ka, "fcvt.%k.wu %=, %0" },
+	{ Osltof,  Ka, "fcvt.%k.l %=, %0" },
+	{ Oultof,  Ka, "fcvt.%k.lu %=, %0" },
+	{ Ocast,   Kw, "fmv.x.w %=, %0" },
+	{ Ocast,   Kl, "fmv.x.d %=, %0" },
+	{ Ocast,   Ks, "fmv.w.x %=, %0" },
+	{ Ocast,   Kd, "fmv.d.x %=, %0" },
+	{ Ocopy,   Ki, "mv %=, %0" },
+	{ Ocopy,   Ka, "fmv.%k %=, %0" },
+	{ Oswap,   Ki, "mv %?, %0\n\tmv %0, %1\n\tmv %1, %?" },
+	{ Oreqz,   Ki, "seqz %=, %0" },
+	{ Ornez,   Ki, "snez %=, %0" },
+	{ Ocall,   Kw, "jalr %0" },
+	{ NOp, 0, 0 } /* sentinel: ends the search in emitins() */
+};
+
+static char *rname[] = { /* assembly register names, indexed by enum Rv64Reg */
+	[FP] = "fp",
+	[SP] = "sp",
+	[GP] = "gp",
+	[TP] = "tp",
+	[RA] = "ra",
+	[T6] = "t6",
+	[T0] = "t0", "t1", "t2", "t3", "t4", "t5",
+	[A0] = "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7",
+	[S1] = "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11",
+
+	[FT0] = "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "ft11",
+	[FA0] = "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7",
+	[FS0] = "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11",
+};
+
+/* fp-relative byte offset of slot s (a 29-bit signed slot number) */
+static int64_t
+slot(int s, Fn *fn)
+{
+	s = ((s & 0x1fffffff) ^ 0x10000000) - 0x10000000; /* sign-extend, no signed shift */
+	assert(s <= fn->slot);
+	if (s < 0)
+		return 8 * -s; /* negative slots: above fp, 8-byte units */
+	return -4 * (fn->slot - s); /* other slots: below fp, 4-byte units */
+}
+
+static void /* print the symbol of address constant c, plus its offset when non-zero */
+emitaddr(Con *c, FILE *f)
+{
+	char off[32], *p;
+
+	if (c->bits.i)
+		sprintf(off, "+%"PRIi64, c->bits.i); /* a negative offset prints as "+-N" */
+	else
+		off[0] = 0;
+	p = c->local ? ".L" : ""; /* local symbols get the .L prefix */
+	fprintf(f, "%s%s%s", p, str(c->label), off);
+}
+
+static void /* expand template s for instruction i and write the result to f */
+emitf(char *s, Ins *i, Fn *fn, FILE *f)
+{
+	static char clschr[] = {'w', 'l', 's', 'd'};
+	Ref r;
+	int k, c;
+	Con *pc;
+	int64_t offset;
+
+	fputc('\t', f);
+	for (;;) {
+		k = i->cls;
+		while ((c = *s++) != '%') /* copy literal text up to the next escape */
+			if (!c) {
+				fputc('\n', f);
+				return;
+			} else
+				fputc(c, f);
+		switch ((c = *s++)) {
+		default:
+			die("invalid escape");
+		case '?': /* scratch register, integer classes only */
+			if (KBASE(k) == 0)
+				fputs("t6", f);
+			else
+				abort();
+			break;
+		case 'k': /* class suffix letter; nothing is printed for Kl */
+			if (i->cls != Kl)
+				fputc(clschr[i->cls], f);
+			break;
+		case '=': /* destination register */
+		case '0': /* first argument, must be a register */
+			r = c == '=' ? i->to : i->arg[0];
+			assert(isreg(r));
+			fputs(rname[r.val], f);
+			break;
+		case '1': /* second argument: register or 12-bit immediate */
+			r = i->arg[1];
+			switch (rtype(r)) {
+			default:
+				die("invalid second argument");
+			case RTmp:
+				assert(isreg(r));
+				fputs(rname[r.val], f);
+				break;
+			case RCon:
+				pc = &fn->con[r.val];
+				assert(pc->type == CBits);
+				assert(pc->bits.i >= -2048 && pc->bits.i <= 2047);
+				fprintf(f, "%d", (int)pc->bits.i);
+				break;
+			}
+			break;
+		case 'M': /* memory operand taken from argument 0 or 1 */
+			c = *s++;
+			assert(c == '0' || c == '1');
+			r = i->arg[c - '0'];
+			switch (rtype(r)) {
+			default:
+				die("invalid address argument");
+			case RTmp:
+				fprintf(f, "0(%s)", rname[r.val]);
+				break;
+			case RCon:
+				pc = &fn->con[r.val];
+				assert(pc->type == CAddr);
+				emitaddr(pc, f);
+				if (isstore(i->op)
+				|| (isload(i->op) && KBASE(i->cls) == 1)) {
+					/* store (and float load)
+					 * pseudo-instructions need a
+					 * temporary register in which to
+					 * load the address
+					 */
+					fprintf(f, ", t6");
+				}
+				break;
+			case RSlot: /* stack slot, addressed relative to fp */
+				offset = slot(r.val, fn);
+				assert(offset >= -2048 && offset <= 2047);
+				fprintf(f, "%d(fp)", (int)offset);
+				break;
+			}
+			break;
+		}
+	}
+}
+
+/* Emit code loading constant c into register r; a CBits constant
+ * is truncated to 32 bits (sign-extended) when class k is narrow. */
+static void
+loadcon(Con *c, int r, int k, FILE *f)
+{
+	char *rn;
+	int64_t n;
+
+	rn = rname[r];
+	switch (c->type) {
+	case CAddr:
+		fprintf(f, "\tla %s, ", rn);
+		emitaddr(c, f);
+		fputc('\n', f);
+		break;
+	case CBits:
+		n = c->bits.i;
+		if (!KWIDE(k))
+			n = (int32_t)n;
+		fprintf(f, "\tli %s, %"PRId64"\n", rn, n); /* n is signed: print it signed */
+		break;
+	default:
+		die("invalid constant");
+	}
+}
+
+static void /* make slot operand *pr usable where only a 12-bit fp offset is encodable */
+fixslot(Ref *pr, Fn *fn, FILE *f)
+{
+	Ref r;
+	int64_t s;
+
+	r = *pr;
+	if (rtype(r) == RSlot) {
+		s = slot(r.val, fn);
+		if (s < -2048 || s > 2047) { /* out of range: compute fp+s into t6 */
+			fprintf(f, "\tli t6, %"PRId64"\n", s);
+			fprintf(f, "\tadd t6, fp, t6\n");
+			*pr = TMP(T6); /* the operand now addresses through t6 */
+		}
+	}
+}
+
+/* emit assembly for one instruction; may rewrite *i (e.g. a copy into a store) */
+static void
+emitins(Ins *i, Fn *fn, FILE *f)
+{
+	int o;
+	char *rn;
+	int64_t s;
+	Con *con;
+
+	switch (i->op) {
+	default:
+		if (isload(i->op))
+			fixslot(&i->arg[0], fn, f);
+		else if (isstore(i->op))
+			fixslot(&i->arg[1], fn, f);
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary search */
+			if (omap[o].op == NOp)
+				die("no match for %s(%c)",
+					optab[i->op].name, "wlsd"[i->cls]);
+			if (omap[o].op == i->op)
+			if (omap[o].cls == i->cls || omap[o].cls == Ka
+			|| (omap[o].cls == Ki && KBASE(i->cls) == 0))
+				break;
+		}
+		emitf(omap[o].asm, i, fn, f);
+		break;
+	case Ocopy:
+		if (req(i->to, i->arg[0])) /* copy onto itself: nothing to emit */
+			break;
+		if (rtype(i->to) == RSlot) {
+			switch (rtype(i->arg[0])) {
+			case RSlot:
+			case RCon:
+				die("unimplemented");
+				break;
+			default:
+				/* register to slot: rewrite as a store */
+				assert(isreg(i->arg[0]));
+				i->arg[1] = i->to;
+				i->to = R;
+				switch (i->cls) {
+				case Kw: i->op = Ostorew; break;
+				case Kl: i->op = Ostorel; break;
+				case Ks: i->op = Ostores; break;
+				case Kd: i->op = Ostored; break;
+				}
+				fixslot(&i->arg[1], fn, f);
+				goto Table;
+			}
+			break;
+		}
+		assert(isreg(i->to));
+		switch (rtype(i->arg[0])) {
+		case RCon:
+			loadcon(&fn->con[i->arg[0].val], i->to.val, i->cls, f);
+			break;
+		case RSlot: /* slot to register: rewrite as a load */
+			i->op = Oload;
+			fixslot(&i->arg[0], fn, f);
+			goto Table;
+		default:
+			assert(isreg(i->arg[0]));
+			goto Table;
+		}
+		break;
+	case Onop:
+		break;
+	case Oaddr:
+		assert(rtype(i->arg[0]) == RSlot);
+		rn = rname[i->to.val];
+		s = slot(i->arg[0].val, fn);
+		if (s >= -2048 && s <= 2047) { /* offset fits a 12-bit immediate */
+			fprintf(f, "\tadd %s, fp, %"PRId64"\n", rn, s);
+		} else { /* otherwise materialize it in the destination first */
+			fprintf(f,
+				"\tli %s, %"PRId64"\n"
+				"\tadd %s, fp, %s\n",
+				rn, s, rn, rn
+			);
+		}
+		break;
+	case Ocall:
+		switch (rtype(i->arg[0])) {
+		case RCon: /* direct call: only a plain symbol is accepted */
+			con = &fn->con[i->arg[0].val];
+			if (con->type != CAddr || con->bits.i)
+				goto invalid;
+			fprintf(f, "\tcall %s\n", str(con->label));
+			break;
+		case RTmp: /* indirect call through a register */
+			emitf("jalr %0", i, fn, f);
+			break;
+		default:
+		invalid:
+			die("invalid call argument");
+		}
+		break;
+	case Osalloc:
+		emitf("sub sp, sp, %0", i, fn, f);
+		if (!req(i->to, R)) /* hand the new stack pointer to the destination */
+			emitf("mv %=, sp", i, fn, f);
+		break;
+	}
+}
+
+/*
+
+  Stack-frame layout:
+
+  +=============+
+  | varargs     |
+  |  save area  |
+  +-------------+
+  |  saved ra   |
+  |  saved fp   |
+  +-------------+ <- fp
+  |    ...      |
+  | spill slots |
+  |    ...      |
+  +-------------+
+  |    ...      |
+  |   locals    |
+  |    ...      |
+  +-------------+
+  |   padding   |
+  +-------------+
+  | callee-save |
+  |  registers  |
+  +=============+
+
+*/
+
+/* emit the assembly of fn to f: prologue, blocks, and an epilogue at each Jret0 */
+void
+rv64_emitfn(Fn *fn, FILE *f)
+{
+	static int id0; /* label numbering base, advanced by fn->nblk per function */
+	int lbl, neg, off, frame, *pr, r;
+	Blk *b, *s;
+	Ins *i;
+
+	gasemitlnk(fn->name, &fn->lnk, ".text", f);
+
+	if (fn->vararg) {
+		/* TODO: only need space for registers unused by named arguments */
+		fprintf(f, "\tadd sp, sp, -64\n");
+		for (r = A0; r <= A7; r++)
+			fprintf(f, "\tsd %s, %d(sp)\n", rname[r], 8 * (r - A0));
+	}
+	fprintf(f, "\tsd fp, -16(sp)\n");
+	fprintf(f, "\tsd ra, -8(sp)\n");
+	fprintf(f, "\tadd fp, sp, -16\n");
+
+	frame = (16 + 4 * fn->slot + 15) & ~15;
+	for (pr = rv64_rclob; *pr>=0; pr++)
+		if (fn->reg & BIT(*pr)) /* 8 bytes per callee-save register in use */
+			frame += 8;
+	frame = (frame + 15) & ~15;
+
+	if (frame <= 2048)
+		fprintf(f, "\tadd sp, sp, -%d\n", frame);
+	else
+		fprintf(f,
+			"\tli t6, %d\n"
+			"\tsub sp, sp, t6\n",
+			frame);
+	for (pr = rv64_rclob, off = 0; *pr >= 0; pr++) {
+		if (fn->reg & BIT(*pr)) {
+			fprintf(f, "\t%s %s, %d(sp)\n", *pr < FT0 ? "sd" : "fsd", rname[*pr], off);
+			off += 8;
+		}
+	}
+
+	for (lbl = 0, b = fn->start; b; b=b->link) {
+		if (lbl || b->npred > 1)
+			fprintf(f, ".L%d:\n", id0+b->id);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(i, fn, f);
+		lbl = 1;
+		switch (b->jmp.type) {
+		case Jret0:
+			if (fn->dynalloc) { /* sp was moved by salloc: recompute it from fp */
+				if (frame - 16 <= 2048)
+					fprintf(f, "\tadd sp, fp, -%d\n", frame - 16);
+				else
+					fprintf(f,
+						"\tli t6, %d\n"
+						"\tsub sp, fp, t6\n",
+						frame - 16);
+			}
+			for (pr = rv64_rclob, off = 0; *pr >= 0; pr++) {
+				if (fn->reg & BIT(*pr)) {
+					fprintf(f, "\t%s %s, %d(sp)\n", *pr < FT0 ? "ld" : "fld", rname[*pr], off);
+					off += 8;
+				}
+			}
+			fprintf(f,
+				"\tadd sp, fp, %d\n"
+				"\tld ra, 8(fp)\n"
+				"\tld fp, 0(fp)\n"
+				"\tret\n",
+				16 + fn->vararg * 64
+			);
+			break;
+		case Jjmp:
+		Jmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tj .L%d\n", id0+b->s1->id);
+			else
+				lbl = 0; /* falls through: next block needs no label for this edge */
+			break;
+		case Jjnz:
+			neg = 0;
+			if (b->link == b->s2) { /* swap targets so the fallthrough is the next block */
+				s = b->s1;
+				b->s1 = b->s2;
+				b->s2 = s;
+				neg = 1;
+			}
+			assert(isreg(b->jmp.arg));
+			fprintf(f, "\tb%sz %s, .L%d\n", neg ? "ne" : "eq", rname[b->jmp.arg.val], id0+b->s2->id);
+			goto Jmp;
+		}
+	}
+	id0 += fn->nblk;
+}
diff --git a/rv64/isel.c b/rv64/isel.c
new file mode 100644
index 0000000..bb6fb02
--- /dev/null
+++ b/rv64/isel.c
@@ -0,0 +1,278 @@
+#include "all.h"
+
+static int /* true if r is the address operand of a load, a call or a store */
+memarg(Ref *r, int op, Ins *i)
+{
+	return ((isload(op) || op == Ocall) && r == &i->arg[0])
+	|| (isstore(op) && r == &i->arg[1]);
+}
+
+static int /* true if r is the second argument of an op flagged imm in rv64_op[] */
+immarg(Ref *r, int op, Ins *i)
+{
+	return rv64_op[op].imm && r == &i->arg[1];
+}
+
+static void /* rewrite *r into a legal class-k operand of i (i == 0: treated as a copy) */
+fixarg(Ref *r, int k, Ins *i, Fn *fn)
+{
+	char buf[32];
+	Ref r0, r1;
+	int s, n, op;
+	Con *c;
+
+	r0 = r1 = *r;
+	op = i ? i->op : Ocopy;
+	switch (rtype(r0)) {
+	case RCon:
+		c = &fn->con[r0.val];
+		if (c->type == CAddr && memarg(r, op, i)) /* address used directly as memory operand */
+			break;
+		if (c->type == CBits && immarg(r, op, i)
+		&& -2048 <= c->bits.i && c->bits.i < 2048) /* fits as a 12-bit immediate */
+			break;
+		r1 = newtmp("isel", k, fn);
+		if (KBASE(k) == 1) {
+			/* load floating points from memory
+			 * slots, they can't be used as
+			 * immediates
+			 */
+			assert(c->type == CBits);
+			n = gasstash(&c->bits, KWIDE(k) ? 8 : 4);
+			vgrow(&fn->con, ++fn->ncon); /* c is reassigned below, after the array may move */
+			c = &fn->con[fn->ncon-1];
+			sprintf(buf, "fp%d", n);
+			*c = (Con){.type = CAddr, .local = 1};
+			c->label = intern(buf);
+			emit(Oload, k, r1, CON(c-fn->con), R);
+			break;
+		}
+		emit(Ocopy, k, r1, r0, R); /* any other constant is copied into a temporary */
+		break;
+	case RTmp:
+		if (isreg(r0))
+			break;
+		s = fn->tmp[r0.val].slot;
+		if (s != -1) {
+			/* aggregate passed by value on
+			 * stack, or fast local address,
+			 * replace with slot if we can
+			 */
+			if (memarg(r, op, i)) {
+				r1 = SLOT(s);
+				break;
+			}
+			r1 = newtmp("isel", k, fn);
+			emit(Oaddr, k, r1, SLOT(s), R);
+			break;
+		}
+		if (k == Kw && fn->tmp[r0.val].cls == Kl) {
+			/* TODO: this sign extension isn't needed
+			 * for 32-bit arithmetic instructions
+			 */
+			r1 = newtmp("isel", k, fn);
+			emit(Oextsw, Kl, r1, r0, R);
+		} else {
+			assert(k == fn->tmp[r0.val].cls);
+		}
+		break;
+	}
+	*r = r1;
+}
+
+static void /* emit old *pr = r ^ 1 for a fresh temporary r, then make *pr refer to r */
+negate(Ref *pr, Fn *fn)
+{
+	Ref r;
+
+	r = newtmp("isel", Kw, fn);
+	emit(Oxor, Kw, *pr, r, getcon(1, fn));
+	*pr = r; /* the caller now writes its result to r instead */
+}
+
+static void /* lower comparison i (operand class k, comparison code op) */
+selcmp(Ins i, int k, int op, Fn *fn)
+{
+	Ins *icmp;
+	Ref r, r0, r1;
+	int sign, swap, neg;
+
+	switch (op) {
+	case Cieq: /* a == b  is  (a ^ b) == 0 */
+		r = newtmp("isel", k, fn);
+		emit(Oreqz, i.cls, i.to, r, R);
+		emit(Oxor, k, r, i.arg[0], i.arg[1]);
+		icmp = curi;
+		fixarg(&icmp->arg[0], k, icmp, fn);
+		fixarg(&icmp->arg[1], k, icmp, fn);
+		return;
+	case Cine: /* a != b  is  (a ^ b) != 0 */
+		r = newtmp("isel", k, fn);
+		emit(Ornez, i.cls, i.to, r, R);
+		emit(Oxor, k, r, i.arg[0], i.arg[1]);
+		icmp = curi;
+		fixarg(&icmp->arg[0], k, icmp, fn);
+		fixarg(&icmp->arg[1], k, icmp, fn);
+		return;
+	case Cisge: sign = 1, swap = 0, neg = 1; break; /* !(a < b) */
+	case Cisgt: sign = 1, swap = 1, neg = 0; break; /* b < a */
+	case Cisle: sign = 1, swap = 1, neg = 1; break; /* !(b < a) */
+	case Cislt: sign = 1, swap = 0, neg = 0; break; /* a < b */
+	case Ciuge: sign = 0, swap = 0, neg = 1; break; /* unsigned variants of the above */
+	case Ciugt: sign = 0, swap = 1, neg = 0; break;
+	case Ciule: sign = 0, swap = 1, neg = 1; break;
+	case Ciult: sign = 0, swap = 0, neg = 0; break;
+	case NCmpI+Cfeq:
+	case NCmpI+Cfge:
+	case NCmpI+Cfgt:
+	case NCmpI+Cfle:
+	case NCmpI+Cflt:
+		swap = 0, neg = 0; /* emitted unchanged; sign is unused for float codes */
+		break;
+	case NCmpI+Cfuo:
+		negate(&i.to, fn);
+		/* fallthrough */
+	case NCmpI+Cfo: /* ordered: (a == a) & (b == b) */
+		r0 = newtmp("isel", i.cls, fn);
+		r1 = newtmp("isel", i.cls, fn);
+		emit(Oand, i.cls, i.to, r0, r1);
+		op = KWIDE(k) ? Oceqd : Oceqs;
+		emit(op, i.cls, r0, i.arg[0], i.arg[0]);
+		icmp = curi;
+		fixarg(&icmp->arg[0], k, icmp, fn);
+		fixarg(&icmp->arg[1], k, icmp, fn);
+		emit(op, i.cls, r1, i.arg[1], i.arg[1]);
+		icmp = curi;
+		fixarg(&icmp->arg[0], k, icmp, fn);
+		fixarg(&icmp->arg[1], k, icmp, fn);
+		return;
+	case NCmpI+Cfne: /* negated float equality */
+		swap = 0, neg = 1;
+		i.op = KWIDE(k) ? Oceqd : Oceqs;
+		break;
+	default:
+		assert(0 && "unknown comparison");
+	}
+	if (op < NCmpI) /* integer codes all reduce to a set-less-than */
+		i.op = sign ? Ocsltl : Ocultl;
+	if (swap) {
+		r = i.arg[0];
+		i.arg[0] = i.arg[1];
+		i.arg[1] = r;
+	}
+	if (neg)
+		negate(&i.to, fn);
+	emiti(i);
+	icmp = curi;
+	fixarg(&icmp->arg[0], k, icmp, fn);
+	fixarg(&icmp->arg[1], k, icmp, fn);
+}
+
+static void /* select code for one instruction: allocs, comparisons, then the generic case */
+sel(Ins i, Fn *fn)
+{
+	Ref r0, r1;
+	Ins *i0;
+	int ck, cc;
+	int64_t sz;
+
+	switch (i.op) {
+	case Onop:
+		break;
+	case Oalloc4:
+	case Oalloc8:
+	case Oalloc16:
+		/* we need to make sure
+		 * the stack remains aligned
+		 * (sp = 0) mod 16
+		 */
+		fn->dynalloc = 1;
+		if (rtype(i.arg[0]) == RCon) {
+			sz = fn->con[i.arg[0].val].bits.i;
+			if (sz < 0)
+				err("invalid alloc size %"PRId64, sz);
+			sz = (sz + 15) & -16; /* round the constant size up to 16 bytes */
+			emit(Osalloc, Kl, i.to, getcon(sz, fn), R);
+			fixarg(&curi->arg[0], Kl, curi, fn);
+		} else {
+			/* r0 = (i.arg[0] + 15) & -16 */
+			r0 = newtmp("isel", Kl, fn);
+			r1 = newtmp("isel", Kl, fn);
+			emit(Osalloc, Kl, i.to, r0, R);
+			emit(Oand, Kl, r0, r1, getcon(-16, fn));
+			emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn));
+			if (fn->tmp[i.arg[0].val].slot != -1)
+				err("unlikely argument %%%s in %s",
+					fn->tmp[i.arg[0].val].name, optab[i.op].name);
+		}
+		break;
+	default:
+		if (iscmp(i.op, &ck, &cc)) {
+			selcmp(i, ck, cc, fn);
+			break;
+		}
+		emiti(i);
+		i0 = curi; /* fixarg() can change curi */
+		fixarg(&i0->arg[0], argcls(&i, 0), i0, fn);
+		fixarg(&i0->arg[1], argcls(&i, 1), i0, fn);
+	}
+}
+
+static void /* make the condition of a jnz jump a legal Kw operand */
+seljmp(Blk *b, Fn *fn)
+{
+	/* TODO: replace cmp+jnz with beq/bne/blt[u]/bge[u] */
+	if (b->jmp.type == Jjnz)
+		fixarg(&b->jmp.arg, Kw, 0, fn);
+}
+
+void /* instruction selection for fn: fast-alloc slots first, then every block */
+rv64_isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i;
+	Phi *p;
+	uint n;
+	int al;
+	int64_t sz;
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
+	for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
+		for (i=b->ins; i<&b->ins[b->nins]; i++)
+			if (i->op == al) {
+				if (rtype(i->arg[0]) != RCon) /* stop at the first non-constant size */
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-15)
+					err("invalid alloc size %"PRId64, sz);
+				sz = (sz + n-1) & -n; /* round up to the alignment n */
+				sz /= 4; /* slots are counted in 4-byte units */
+				if (sz > INT_MAX - fn->slot)
+					die("alloc too large");
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				*i = (Ins){.op = Onop}; /* the alloc itself is dropped */
+			}
+
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns]; /* instructions are emitted backwards from the buffer end */
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (n=0; p->blk[n] != b; n++) /* find b's argument of this phi */
+					assert(n+1 < p->narg);
+				fixarg(&p->arg[n], p->cls, 0, fn);
+			}
+		seljmp(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			sel(*--i, fn);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/rv64/targ.c b/rv64/targ.c
new file mode 100644
index 0000000..ead8fe2
--- /dev/null
+++ b/rv64/targ.c
@@ -0,0 +1,53 @@
+#include "all.h"
+
+Rv64Op rv64_op[NOp] = { /* one entry per opcode, filled from the V() column of ops.h */
+#define O(op, t, x) [O##op] =
+#define V(imm) { imm },
+#include "../ops.h"
+};
+
+int rv64_rsave[] = { /* caller-save registers, integer then FP, terminated by -1 */
+	T0, T1, T2, T3, T4, T5,
+	A0, A1, A2, A3, A4, A5, A6, A7,
+	FA0, FA1, FA2,  FA3,  FA4, FA5, FA6, FA7,
+	FT0, FT1, FT2,  FT3,  FT4, FT5, FT6, FT7,
+	FT8, FT9, FT10, FT11,
+	-1
+};
+int rv64_rclob[] = { /* callee-save registers, integer then FP, terminated by -1 */
+	     S1,  S2,   S3,   S4,  S5,  S6,  S7,
+	S8,  S9,  S10,  S11,
+	FS0, FS1, FS2,  FS3,  FS4, FS5, FS6, FS7,
+	FS8, FS9, FS10, FS11,
+	-1
+};
+
+/* T6 used as swap register (TODO: is there a better choice?) */
+#define RGLOB (BIT(FP) | BIT(SP) | BIT(GP) | BIT(TP) | BIT(RA) | BIT(T6))
+
+static int /* Target.memargs hook: always 0 here, whatever the opcode */
+rv64_memargs(int op)
+{
+	(void)op; /* parameter deliberately unused */
+	return 0;
+}
+
+Target T_rv64 = { /* target description tying the rv64 tables and passes together */
+	.gpr0 = T0,
+	.ngpr = NGPR,
+	.fpr0 = FT0,
+	.nfpr = NFPR,
+	.rglob = RGLOB,
+	.nrglob = 6, /* number of registers in RGLOB */
+	.rsave = rv64_rsave,
+	.nrsave = {NGPS, NFPS}, /* integer and FP counts within rv64_rsave */
+	.retregs = rv64_retregs,
+	.argregs = rv64_argregs,
+	.memargs = rv64_memargs,
+	.abi = rv64_abi,
+	.isel = rv64_isel,
+	.emitfn = rv64_emitfn,
+};
+
+MAKESURE(rsave_size_ok, sizeof rv64_rsave == (NGPS+NFPS+1) * sizeof(int));
+MAKESURE(rclob_size_ok, sizeof rv64_rclob == (NCLR+1) * sizeof(int));
diff --git a/test/dark.ssa b/test/dark.ssa
index a1b2e60..de58e4c 100644
--- a/test/dark.ssa
+++ b/test/dark.ssa
@@ -1,4 +1,4 @@
-# skip arm64
+# skip arm64 rv64
 # a hack example,
 # we use a dark type to get
 # a pointer to the stack.
diff --git a/tools/test.sh b/tools/test.sh
index c5e5b37..1b23469 100755
--- a/tools/test.sh
+++ b/tools/test.sh
@@ -43,6 +43,30 @@ init() {
 		fi
 		bin="$bin -t arm64"
 		;;
+	rv64)
+		for p in riscv64-linux-musl riscv64-linux-gnu
+		do
+			cc="$p-gcc -no-pie"
+			qemu="qemu-riscv64"
+			if
+				$cc -v >/dev/null 2>&1 &&
+				$qemu -version >/dev/null 2>&1
+			then
+				if sysroot=$($cc -print-sysroot) && test -n "$sysroot"
+				then
+					qemu="$qemu -L $sysroot"
+				fi
+				break
+			fi
+			cc=
+		done
+		if test -z "$cc"
+		then
+			echo "Cannot find riscv64 compiler or qemu."
+			exit 1
+		fi
+		bin="$bin -t rv64"
+		;;
 	"")
 		case `uname` in
 		*Darwin*)