-rw-r--r--  docs/ChangeLog                              |   2
-rw-r--r--  docs/README                                 |   4
-rwxr-xr-x  qemu_mode/build_qemu_support.sh             |   9
-rw-r--r--  qemu_mode/patches/afl-qemu-cpu-inl.h        | 103
-rw-r--r--  qemu_mode/patches/afl-qemu-translate-inl.h  |  41
-rw-r--r--  qemu_mode/patches/configure.diff            |  11
-rw-r--r--  qemu_mode/patches/cpu-exec.diff             |  59
-rw-r--r--  qemu_mode/patches/elfload.diff              |  33
-rw-r--r--  qemu_mode/patches/elfload2.diff             |  37
-rw-r--r--  qemu_mode/patches/memfd.diff                |  12
-rw-r--r--  qemu_mode/patches/syscall.diff              |  22
-rw-r--r--  qemu_mode/patches/tcg.diff                  | 181
-rw-r--r--  qemu_mode/patches/translate-all.diff        |  16
13 files changed, 320 insertions, 210 deletions
diff --git a/docs/ChangeLog b/docs/ChangeLog
index 0393e8c3..28042a47 100644
--- a/docs/ChangeLog
+++ b/docs/ChangeLog
@@ -28,6 +28,8 @@ Version ++2.52c (2019-05-28):
     you use the new -p option :-) - see docs/power_schedules.txt
   - added afl-system-config script to set all system performance options for fuzzing
   - llvm_mode works with llvm 3.9 up to including 6.0.1
+  - qemu_mode got upgraded from qemu 2.1 to 3.1 - incorporated from
+    https://github.com/andreafioraldi/afl, with community patches added
 
 
 ---------------------------
diff --git a/docs/README b/docs/README
index 9279a361..784509c6 100644
--- a/docs/README
+++ b/docs/README
@@ -16,6 +16,10 @@ american fuzzy lop plus plus
   Additionally AFLfast's power schedules by Marcel Boehme from
   github.com/mboehme/aflfast have been incorporated.
 
+  Plus, qemu_mode was upgraded from qemu 2.1 to 3.1, based on the work of
+  https://github.com/andreafioraldi/afl, with the community patches applied
+  on top.
+
   So all in all this is the best-of AFL that is currently out there :-)
 
 
diff --git a/qemu_mode/build_qemu_support.sh b/qemu_mode/build_qemu_support.sh
index 2c5203cc..959ce0b9 100755
--- a/qemu_mode/build_qemu_support.sh
+++ b/qemu_mode/build_qemu_support.sh
@@ -23,9 +23,9 @@
 #
 
 
-VERSION="2.10.0"
+VERSION="3.1.0"
 QEMU_URL="http://download.qemu-project.org/qemu-${VERSION}.tar.xz"
-QEMU_SHA384="68216c935487bc8c0596ac309e1e3ee75c2c4ce898aab796faa321db5740609ced365fedda025678d072d09ac8928105"
+QEMU_SHA384="0318f2b5a36eafbf17bca0f914567dfa5e8a3cd6ff83bb46fe49a0079cd71ddd3ec4267c6c62a03f9e26e05cc80e6d4b"
 
 echo "================================================="
 echo "AFL binary-only instrumentation QEMU build script"
@@ -131,9 +131,8 @@ echo "[*] Applying patches..."
 patch -p1 <../patches/elfload.diff || exit 1
 patch -p1 <../patches/cpu-exec.diff || exit 1
 patch -p1 <../patches/syscall.diff || exit 1
-patch -p1 <../patches/configure.diff || exit 1
-patch -p1 <../patches/memfd.diff || exit 1
 patch -p1 <../patches/translate-all.diff || exit 1
+patch -p1 <../patches/tcg.diff || exit 1
 patch -p1 <../patches/elfload2.diff || exit 1
 
 echo "[+] Patching done."
@@ -149,7 +148,7 @@ echo "[+] Configuration complete."
 
 echo "[*] Attempting to build QEMU (fingers crossed!)..."
 
-make || exit 1
+make -j `nproc` || exit 1
 
 echo "[+] Build process successful!"
 
diff --git a/qemu_mode/patches/afl-qemu-cpu-inl.h b/qemu_mode/patches/afl-qemu-cpu-inl.h
index f7a32c4c..97c6ea35 100644
--- a/qemu_mode/patches/afl-qemu-cpu-inl.h
+++ b/qemu_mode/patches/afl-qemu-cpu-inl.h
@@ -54,8 +54,8 @@
 
 /* This is equivalent to afl-as.h: */
 
-static unsigned char dummy[65536];
-unsigned char *afl_area_ptr = dummy;
+static unsigned char dummy[65536]; /* costs 64kb but saves a few instructions */
+unsigned char *afl_area_ptr = dummy; /* Exported for afl_gen_trace */
 
 /* Exported variables populated by the code patched into elfload.c: */
 
@@ -65,6 +65,7 @@ abi_ulong afl_entry_point, /* ELF entry point (_start) */
 
 /* Set in the child process in forkserver mode: */
 
+static int forkserver_installed = 0;
 static unsigned char afl_fork_child;
 unsigned int afl_forksrv_pid;
 
@@ -78,7 +79,7 @@ static void afl_setup(void);
 static void afl_forkserver(CPUState*);
 
 static void afl_wait_tsl(CPUState*, int);
-static void afl_request_tsl(target_ulong, target_ulong, uint32_t, TranslationBlock*, int);
+static void afl_request_tsl(target_ulong, target_ulong, uint32_t, uint32_t, TranslationBlock*, int);
 
 /* Data structures passed around by the translate handlers: */
 
@@ -86,6 +87,7 @@ struct afl_tb {
   target_ulong pc;
   target_ulong cs_base;
   uint32_t flags;
+  uint32_t cf_mask;
 };
 
 struct afl_tsl {
@@ -95,13 +97,15 @@ struct afl_tsl {
 
 struct afl_chain {
   struct afl_tb last_tb;
+  uint32_t cf_mask;
   int tb_exit;
 };
 
 /* Some forward decls: */
 
-TranslationBlock *tb_htable_lookup(CPUState*, target_ulong, target_ulong, uint32_t);
-static inline TranslationBlock *tb_find(CPUState*, TranslationBlock*, int);
+TranslationBlock *tb_htable_lookup(CPUState*, target_ulong, target_ulong, uint32_t, uint32_t);
+static inline TranslationBlock *tb_find(CPUState*, TranslationBlock*, int, uint32_t);
+static inline void tb_add_jump(TranslationBlock *tb, int n, TranslationBlock *tb_next);
 
 /*************************
  * ACTUAL IMPLEMENTATION *
@@ -161,14 +165,15 @@ static void afl_setup(void) {
 
 
 /* Fork server logic, invoked once we hit _start. */
-static int forkserver_installed = 0;
+
 static void afl_forkserver(CPUState *cpu) {
+
+  static unsigned char tmp[4];
+
   if (forkserver_installed == 1)
     return;
   forkserver_installed = 1;
-
-  static unsigned char tmp[4];
-  //if (!afl_area_ptr) return;
+  //if (!afl_area_ptr) return; // not necessary because of fixed dummy buffer
 
   /* Tell the parent that we're alive. If the parent doesn't want
      to talk, assume that we're not running in forkserver mode. */
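For reference, a minimal sketch of the handshake the fork server performs over
AFL's fixed descriptors (FORKSRV_FD = 198 for control, 199 for status, per the
AFL convention; simplified, and omitting the TSL pipe the real afl_forkserver()
also sets up):

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define FORKSRV_FD 198

static void forkserver_sketch(void) {

  unsigned char tmp[4] = {0};

  /* "Hello" to afl-fuzz; if nobody is listening, run normally. */
  if (write(FORKSRV_FD + 1, tmp, 4) != 4) return;

  for (;;) {

    pid_t child;
    int status;

    /* Block until afl-fuzz asks for a new run. */
    if (read(FORKSRV_FD, tmp, 4) != 4) _exit(2);

    child = fork();
    if (child < 0) _exit(4);

    if (!child) {
      /* Child: close server descriptors and resume the target. */
      close(FORKSRV_FD);
      close(FORKSRV_FD + 1);
      return;
    }

    /* Parent: report the child's pid, then its exit status. */
    if (write(FORKSRV_FD + 1, &child, 4) != 4) _exit(5);
    if (waitpid(child, &status, 0) < 0) _exit(6);
    if (write(FORKSRV_FD + 1, &status, 4) != 4) _exit(7);

  }

}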
@@ -229,84 +234,40 @@ static void afl_forkserver(CPUState *cpu) {
 }
 
 
-#if 0
-/* The equivalent of the tuple logging routine from afl-as.h. */
-
-static inline void afl_maybe_log(abi_ulong cur_loc) {
-
-  static __thread abi_ulong prev_loc;
-
-  /* Optimize for cur_loc > afl_end_code, which is the most likely case on
-     Linux systems. */
-
-  if (cur_loc > afl_end_code || cur_loc < afl_start_code /*|| !afl_area_ptr*/)
-    return;
-
-  /* Looks like QEMU always maps to fixed locations, so ASAN is not a
-     concern. Phew. But instruction addresses may be aligned. Let's mangle
-     the value to get something quasi-uniform. */
-
-  cur_loc  = (cur_loc >> 4) ^ (cur_loc << 8);
-  cur_loc &= MAP_SIZE - 1;
-
-  /* Implement probabilistic instrumentation by looking at scrambled block
-     address. This keeps the instrumented locations stable across runs. */
-
-  if (cur_loc >= afl_inst_rms) return;
-
-  afl_area_ptr[cur_loc ^ prev_loc]++;
-  prev_loc = cur_loc >> 1;
-
-}
-#endif
-
 /* This code is invoked whenever QEMU decides that it doesn't have a
-   translation of a particular block and needs to compute it. When this happens,
-   we tell the parent to mirror the operation, so that the next fork() has a
-   cached copy. */
+   translation of a particular block and needs to compute it, or when it
+   decides to chain two TBs together. When this happens, we tell the parent to
+   mirror the operation, so that the next fork() has a cached copy. */
 
-#if 0
-static void afl_request_tsl(target_ulong pc, target_ulong cb, uint64_t flags) {
+static void afl_request_tsl(target_ulong pc, target_ulong cb, uint32_t flags, uint32_t cf_mask,
+                            TranslationBlock *last_tb, int tb_exit) {
 
   struct afl_tsl t;
+  struct afl_chain c;
 
   if (!afl_fork_child) return;
 
-  t.pc      = pc;
-  t.cs_base = cb;
-  t.flags   = flags;
-
-  if (write(TSL_FD, &t, sizeof(struct afl_tsl)) != sizeof(struct afl_tsl))
-    return;
-
-}
-#else
-static void afl_request_tsl(target_ulong pc, target_ulong cb, uint32_t flags,
-                            TranslationBlock *last_tb, int tb_exit) {
-  struct afl_tsl t;
-  struct afl_chain c;
- 
-  if (!afl_fork_child) return;
- 
   t.tb.pc      = pc;
   t.tb.cs_base = cb;
   t.tb.flags   = flags;
+  t.tb.cf_mask = cf_mask;
   t.is_chain   = (last_tb != NULL);
- 
+
   if (write(TSL_FD, &t, sizeof(struct afl_tsl)) != sizeof(struct afl_tsl))
     return;
- 
+
   if (t.is_chain) {
     c.last_tb.pc      = last_tb->pc;
     c.last_tb.cs_base = last_tb->cs_base;
     c.last_tb.flags   = last_tb->flags;
+    c.cf_mask         = cf_mask;
     c.tb_exit         = tb_exit;
 
     if (write(TSL_FD, &c, sizeof(struct afl_chain)) != sizeof(struct afl_chain))
       return;
   }
- }
-#endif
+
+}
 
 /* This is the other side of the same channel. Since timeouts are handled by
    afl-fuzz simply killing the child, we can just wait until the pipe breaks. */
@@ -324,14 +285,12 @@ static void afl_wait_tsl(CPUState *cpu, int fd) {
     if (read(fd, &t, sizeof(struct afl_tsl)) != sizeof(struct afl_tsl))
       break;
 
-    tb = tb_htable_lookup(cpu, t.tb.pc, t.tb.cs_base, t.tb.flags);
+    tb = tb_htable_lookup(cpu, t.tb.pc, t.tb.cs_base, t.tb.flags, t.tb.cf_mask);
 
     if(!tb) {
       mmap_lock();
-      tb_lock();
       tb = tb_gen_code(cpu, t.tb.pc, t.tb.cs_base, t.tb.flags, 0);
       mmap_unlock();
-      tb_unlock();
     }
 
     if (t.is_chain) {
@@ -339,13 +298,9 @@ static void afl_wait_tsl(CPUState *cpu, int fd) {
         break;
 
       last_tb = tb_htable_lookup(cpu, c.last_tb.pc, c.last_tb.cs_base,
-                                 c.last_tb.flags);
+                                 c.last_tb.flags, c.cf_mask);
       if (last_tb) {
-        tb_lock();
-        if (!tb->invalid) {
-          tb_add_jump(last_tb, c.tb_exit, tb);
-        }
-        tb_unlock();
+        tb_add_jump(last_tb, c.tb_exit, tb);
       }
     }
 
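A hedged sketch of the TSL channel that afl_request_tsl() and afl_wait_tsl()
implement between them: the forked child reports each translated (and
optionally chained) block, and the parent replays the work so the next fork()
inherits a warm translation cache. The struct layouts below are simplified
stand-ins for the afl_tsl/afl_chain definitions above:

#include <unistd.h>

#define TSL_FD (198 - 1)  /* AFL convention: FORKSRV_FD - 1 */

struct tsl_msg   { unsigned long pc, cs_base; unsigned flags, cf_mask; int is_chain; };
struct chain_msg { unsigned long pc, cs_base; unsigned flags, cf_mask; int tb_exit; };

/* Child side: one write per freshly translated block, plus a second
   record when the block was also chained to a predecessor. */
static void tsl_send(struct tsl_msg *t, struct chain_msg *c) {
  if (write(TSL_FD, t, sizeof *t) != sizeof *t) return;
  if (t->is_chain && c)
    (void)write(TSL_FD, c, sizeof *c);
}

/* Parent side: replay until the pipe breaks (the child exited or
   was killed by afl-fuzz on timeout). */
static void tsl_recv_loop(int fd) {
  struct tsl_msg t;
  struct chain_msg c;
  while (read(fd, &t, sizeof t) == sizeof t) {
    /* ...look up or tb_gen_code() the block for t.pc here... */
    if (t.is_chain && read(fd, &c, sizeof c) == sizeof c) {
      /* ...tb_add_jump() the two blocks together here... */
    }
  }
}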
diff --git a/qemu_mode/patches/afl-qemu-translate-inl.h b/qemu_mode/patches/afl-qemu-translate-inl.h
index 9e778a83..543c2e4e 100644
--- a/qemu_mode/patches/afl-qemu-translate-inl.h
+++ b/qemu_mode/patches/afl-qemu-translate-inl.h
@@ -37,17 +37,26 @@ extern unsigned char *afl_area_ptr;
 extern unsigned int afl_inst_rms;
 extern abi_ulong afl_start_code, afl_end_code;
 
+void tcg_gen_afl_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args);
+
+
+void afl_maybe_log(abi_ulong cur_loc) {
+
+  static __thread abi_ulong prev_loc;
+
+  afl_area_ptr[cur_loc ^ prev_loc]++;
+  prev_loc = cur_loc >> 1;
+
+}
+
+
 /* Generates TCG code for AFL's tracing instrumentation. */
-static void afl_gen_trace(target_ulong cur_loc)
-{
-  static __thread target_ulong prev_loc;
-  TCGv index, count, new_prev_loc;
-  TCGv_ptr prev_loc_ptr, count_ptr;
+static void afl_gen_trace(target_ulong cur_loc) {
 
   /* Optimize for cur_loc > afl_end_code, which is the most likely case on
      Linux systems. */
 
-  if (cur_loc > afl_end_code || cur_loc < afl_start_code || !afl_area_ptr)
+  if (cur_loc > afl_end_code || cur_loc < afl_start_code /*|| !afl_area_ptr*/) // not needed because of static dummy buffer
     return;
 
   /* Looks like QEMU always maps to fixed locations, so ASAN is not a
@@ -62,21 +71,7 @@ static void afl_gen_trace(target_ulong cur_loc)
 
   if (cur_loc >= afl_inst_rms) return;
 
-  /* index = prev_loc ^ cur_loc */
-  prev_loc_ptr = tcg_const_ptr(&prev_loc);
-  index = tcg_temp_new();
-  tcg_gen_ld_tl(index, prev_loc_ptr, 0);
-  tcg_gen_xori_tl(index, index, cur_loc);
-
-  /* afl_area_ptr[index]++ */
-  count_ptr = tcg_const_ptr(afl_area_ptr);
-  tcg_gen_add_ptr(count_ptr, count_ptr, TCGV_NAT_TO_PTR(index));
-  count = tcg_temp_new();
-  tcg_gen_ld8u_tl(count, count_ptr, 0);
-  tcg_gen_addi_tl(count, count, 1);
-  tcg_gen_st8_tl(count, count_ptr, 0);
-
-  /* prev_loc = cur_loc >> 1 */
-  new_prev_loc = tcg_const_tl(cur_loc >> 1);
-  tcg_gen_st_tl(new_prev_loc, prev_loc_ptr, 0);
+  TCGTemp *args[1] = { tcgv_i64_temp( tcg_const_tl(cur_loc) ) };
+  tcg_gen_afl_callN(afl_maybe_log, NULL, 1, args);
+  
 }
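Since afl_maybe_log() is now a plain C helper invoked from generated code, the
edge-counting arithmetic is easy to demonstrate stand-alone. A minimal worked
example of the classic AFL encoding it implements (map[cur ^ prev]++, then
prev = cur >> 1 so that the A->B and B->A edges land on different bytes):

#include <stdio.h>

#define MAP_SIZE 65536

static unsigned char map[MAP_SIZE];
static unsigned long prev_loc;

static void log_edge(unsigned long pc) {
  /* Scramble the (aligned) block address into a quasi-uniform index. */
  unsigned long cur = ((pc >> 4) ^ (pc << 8)) & (MAP_SIZE - 1);
  map[cur ^ prev_loc]++;   /* one counter byte per (prev, cur) edge */
  prev_loc = cur >> 1;     /* shifted so direction matters */
}

int main(void) {
  log_edge(0x400100); log_edge(0x400200); log_edge(0x400100);
  unsigned hits = 0;
  for (unsigned i = 0; i < MAP_SIZE; i++) hits += map[i];
  printf("edge hits recorded: %u\n", hits);  /* 3 transitions logged */
  return 0;
}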
diff --git a/qemu_mode/patches/configure.diff b/qemu_mode/patches/configure.diff
deleted file mode 100644
index a9816f87..00000000
--- a/qemu_mode/patches/configure.diff
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/configure
-+++ b/configure
-@@ -3855,7 +3855,7 @@ fi
- # check if memfd is supported
- memfd=no
- cat > $TMPC << EOF
--#include <sys/memfd.h>
-+#include <sys/mman.h>
- 
- int main(void)
- {
diff --git a/qemu_mode/patches/cpu-exec.diff b/qemu_mode/patches/cpu-exec.diff
index 754bf9ef..cd35eef6 100644
--- a/qemu_mode/patches/cpu-exec.diff
+++ b/qemu_mode/patches/cpu-exec.diff
@@ -1,5 +1,7 @@
---- qemu-2.10.0-clean/accel/tcg/cpu-exec.c	2017-08-30 18:50:40.000000000 +0200
-+++ qemu-2.10.0/accel/tcg/cpu-exec.c	2018-09-22 13:21:23.612068407 +0200
+diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
+index 870027d4..841ba557 100644
+--- a/accel/tcg/cpu-exec.c
++++ b/accel/tcg/cpu-exec.c
 @@ -36,6 +36,8 @@
  #include "sysemu/cpus.h"
  #include "sysemu/replay.h"
@@ -9,46 +11,37 @@
  /* -icount align implementation. */
  
  typedef struct SyncClocks {
-@@ -144,6 +146,8 @@
+@@ -144,6 +146,8 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
      int tb_exit;
-     uint8_t *tb_ptr = itb->tc_ptr;
+     uint8_t *tb_ptr = itb->tc.ptr;
  
 +    AFL_QEMU_CPU_SNIPPET2;
 +
      qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
-                            "Trace %p [%d: " TARGET_FMT_lx "] %s\n",
-                            itb->tc_ptr, cpu->cpu_index, itb->pc,
-@@ -337,7 +341,7 @@
+                            "Trace %d: %p ["
+                            TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
+@@ -397,11 +401,13 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
      TranslationBlock *tb;
      target_ulong cs_base, pc;
      uint32_t flags;
--    bool have_tb_lock = false;
-+    bool have_tb_lock = false, was_translated = false, was_chained = false;
++    bool was_translated = false, was_chained = false;
  
-     /* we record a subset of the CPU state. It will
-        always be the same before a given translated block
-@@ -365,6 +369,7 @@
-             if (!tb) {
-                 /* if no translated code available, then translate it now */
-                 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
-+                was_translated = true;
-             }
- 
-             mmap_unlock();
-@@ -390,11 +395,16 @@
-         }
-         if (!tb->invalid) {
-             tb_add_jump(last_tb, tb_exit, tb);
-+            was_chained = true;
-         }
-     }
-     if (have_tb_lock) {
-         tb_unlock();
-     }
-+    if (was_translated || was_chained) {
-+        afl_request_tsl(pc, cs_base, flags, was_chained ? last_tb : NULL,
-+                        tb_exit);
+     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
+     if (tb == NULL) {
+         mmap_lock();
+         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
++        was_translated = true;
+         mmap_unlock();
+         /* We add the TB in the virtual pc hash table for the fast lookup */
+         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
+@@ -418,6 +424,10 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
+     /* See if we can patch the calling TB. */
+     if (last_tb) {
+         tb_add_jump(last_tb, tb_exit, tb);
++        was_chained = true;
 +    }
++    if (was_translated || was_chained) {
++        afl_request_tsl(pc, cs_base, flags, cf_mask, was_chained ? last_tb : NULL, tb_exit);
+     }
      return tb;
  }
- 
diff --git a/qemu_mode/patches/elfload.diff b/qemu_mode/patches/elfload.diff
index 34ec4847..c8f6e80a 100644
--- a/qemu_mode/patches/elfload.diff
+++ b/qemu_mode/patches/elfload.diff
@@ -1,5 +1,7 @@
---- qemu-2.10.0.orig/linux-user/elfload.c	2017-08-30 18:50:41.000000000 +0200
-+++ qemu-2.10.0/linux-user/elfload.c	2018-10-23 12:48:16.421879765 +0200
+diff --git a/linux-user/elfload.c b/linux-user/elfload.c
+index 5bccd2e2..94e928a4 100644
+--- a/linux-user/elfload.c
++++ b/linux-user/elfload.c
 @@ -20,6 +20,8 @@
  
  #define ELF_OSABI   ELFOSABI_SYSV
@@ -9,7 +11,7 @@
  /* from personality.h */
  
  /*
-@@ -2085,6 +2087,8 @@
+@@ -2301,6 +2303,8 @@ static void load_elf_image(const char *image_name, int image_fd,
      info->brk = 0;
      info->elf_flags = ehdr->e_flags;
  
@@ -18,7 +20,7 @@
      for (i = 0; i < ehdr->e_phnum; i++) {
          struct elf_phdr *eppnt = phdr + i;
          if (eppnt->p_type == PT_LOAD) {
-@@ -2118,9 +2122,11 @@
+@@ -2335,9 +2339,11 @@ static void load_elf_image(const char *image_name, int image_fd,
              if (elf_prot & PROT_EXEC) {
                  if (vaddr < info->start_code) {
                      info->start_code = vaddr;
@@ -30,26 +32,3 @@
                  }
              }
              if (elf_prot & PROT_WRITE) {
-@@ -2443,6 +2449,22 @@
-                                 info, (elf_interpreter ? &interp_info : NULL));
-     info->start_stack = bprm->p;
- 
-+#if defined(TARGET_PPC64) && !defined(TARGET_ABI32)
-+    // On PowerPC64 the entry point is the _function descriptor_
-+    // of the entry function. For AFL to properly initialize,
-+    // afl_entry_point needs to be set to the actual first instruction
-+    // as opposed executed by the target program. This as opposed to 
-+    // where the function's descriptor sits in memory.
-+    
-+    // Shameless copy of PPC init_thread
-+    info_report("Adjusting afl_entry_point");
-+    if (afl_entry_point && (get_ppc64_abi(info) < 2)) {
-+        uint64_t val;
-+        get_user_u64(val, afl_entry_point);
-+        afl_entry_point = val + info->load_bias;
-+    }
-+#endif
-+
-     /* If we have an interpreter, set that as the program's entry point.
-        Copy the load_bias as well, to help PPC64 interpret the entry
-        point as a function descriptor.  Do this after creating elf tables
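For context, the bounds captured above (afl_start_code/afl_end_code, taken
while walking executable PT_LOAD segments) are what afl_gen_trace() later
checks, so that only the target binary itself — not QEMU or shared
libraries — gets instrumented. A trivial sketch of that gate:

/* Sketch: only PCs inside the loader-captured range are traced. */
static int in_traced_range(unsigned long pc,
                           unsigned long start, unsigned long end) {
  return pc >= start && pc <= end;
}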
diff --git a/qemu_mode/patches/elfload2.diff b/qemu_mode/patches/elfload2.diff
index e09d11c6..045060d4 100644
--- a/qemu_mode/patches/elfload2.diff
+++ b/qemu_mode/patches/elfload2.diff
@@ -1,6 +1,6 @@
---- qemu-2.10.0/linux-user/elfload.c.after	2019-05-28 15:21:36.931618928 +0200
-+++ qemu-2.10.0/linux-user/elfload.c	2019-05-28 15:22:23.939617556 +0200
-@@ -2087,7 +2087,20 @@
+--- a/linux-user/elfload.c	2019-06-03 13:06:40.755755923 +0200
++++ b/linux-user/elfload.c	2019-06-03 13:33:01.315709801 +0200
+@@ -2303,7 +2303,20 @@
      info->brk = 0;
      info->elf_flags = ehdr->e_flags;
  
@@ -10,15 +10,38 @@
 +      if ((ptr = getenv("AFL_ENTRYPOINT")) != NULL) {
 +        afl_entry_point = strtoul(ptr, NULL, 16);
 +      } else {
-+        if (!afl_entry_point) afl_entry_point = info->entry;
++        afl_entry_point = info->entry;
 +      }
 +#ifdef TARGET_ARM
 +      /* The least significant bit indicates Thumb mode. */
 +      afl_entry_point = afl_entry_point & ~(target_ulong)1;
 +#endif
-+      if (getenv("AFL_DEBUG") != NULL)
-+        fprintf(stderr, "AFL forkserver entrypoint: %p\n", (void*)afl_entry_point);
-+    } while(0);
++    }
++    if (getenv("AFL_DEBUG") != NULL)
++      fprintf(stderr, "AFL forkserver entrypoint: %p\n", (void*)afl_entry_point);
  
      for (i = 0; i < ehdr->e_phnum; i++) {
          struct elf_phdr *eppnt = phdr + i;
+@@ -2668,6 +2681,22 @@
+        change some of these later */
+     bprm->p = setup_arg_pages(bprm, info);
+ 
++    // On PowerPC64 the entry point is the _function descriptor_
++    // of the entry function. For AFL to properly initialize,
++    // afl_entry_point needs to be set to the actual first instruction
++    // executed by the target program, as opposed to where the
++    // function's descriptor sits in memory.
++    // copied from PPC init_thread
++#if defined(TARGET_PPC64) && !defined(TARGET_ABI32)
++    if (get_ppc64_abi(infop) < 2) {
++        uint64_t val;
++        get_user_u64(val, infop->entry + 8);
++        _regs->gpr[2] = val + infop->load_bias;
++        get_user_u64(val, infop->entry);
++        infop->entry = val + infop->load_bias;
++    }
++#endif
++
+     scratch = g_new0(char, TARGET_PAGE_SIZE);
+     if (STACK_GROWS_DOWN) {
+         bprm->p = copy_elf_strings(1, &bprm->filename, scratch,
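A sketch of the entry-point selection this patch implements: AFL_ENTRYPOINT (a
hex address) overrides the ELF entry, the ARM Thumb bit is masked off, and
AFL_DEBUG prints the result. Names here are illustrative; the real code runs
inside load_elf_image():

#include <stdio.h>
#include <stdlib.h>

static unsigned long pick_entry(unsigned long elf_entry) {
  const char *ptr = getenv("AFL_ENTRYPOINT");
  unsigned long ep = ptr ? strtoul(ptr, NULL, 16) : elf_entry;
#ifdef TARGET_ARM
  ep &= ~1UL;  /* the least significant bit only flags Thumb mode */
#endif
  if (getenv("AFL_DEBUG"))
    fprintf(stderr, "AFL forkserver entrypoint: %p\n", (void *)ep);
  return ep;
}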
diff --git a/qemu_mode/patches/memfd.diff b/qemu_mode/patches/memfd.diff
deleted file mode 100644
index 7f68396c..00000000
--- a/qemu_mode/patches/memfd.diff
+++ /dev/null
@@ -1,12 +0,0 @@
---- a/util/memfd.c
-+++ b/util/memfd.c
-@@ -31,9 +31,7 @@
- 
- #include "qemu/memfd.h"
- 
--#ifdef CONFIG_MEMFD
--#include <sys/memfd.h>
--#elif defined CONFIG_LINUX
-+#if defined CONFIG_LINUX && !defined CONFIG_MEMFD
- #include <sys/syscall.h>
- #include <asm/unistd.h>
diff --git a/qemu_mode/patches/syscall.diff b/qemu_mode/patches/syscall.diff
index 55b29140..cb2acfcd 100644
--- a/qemu_mode/patches/syscall.diff
+++ b/qemu_mode/patches/syscall.diff
@@ -1,21 +1,22 @@
---- qemu-2.10.0-rc3-clean/linux-user/syscall.c	2017-08-15 11:39:41.000000000 -0700
-+++ qemu-2.10.0-rc3/linux-user/syscall.c	2017-08-22 14:34:03.193088186 -0700
-@@ -116,6 +116,8 @@
- 
+diff --git a/linux-user/syscall.c b/linux-user/syscall.c
+index 280137da..8c0e749f 100644
+--- a/linux-user/syscall.c
++++ b/linux-user/syscall.c
+@@ -112,6 +112,8 @@
  #include "qemu.h"
+ #include "fd-trans.h"
  
 +extern unsigned int afl_forksrv_pid;
 +
  #ifndef CLONE_IO
  #define CLONE_IO                0x80000000      /* Clone io context */
  #endif
-@@ -11688,8 +11690,21 @@
-         break;
+@@ -10799,8 +10801,19 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
+         return get_errno(safe_tkill((int)arg1, target_to_host_signal(arg2)));
  
      case TARGET_NR_tgkill:
--        ret = get_errno(safe_tgkill((int)arg1, (int)arg2,
--                        target_to_host_signal(arg3)));
-+
+-        return get_errno(safe_tgkill((int)arg1, (int)arg2,
+-                         target_to_host_signal(arg3)));
 +        {
 +          int pid  = (int)arg1,
 +              tgid = (int)arg2,
@@ -29,7 +30,6 @@
 +          ret = get_errno(safe_tgkill(pid, tgid, target_to_host_signal(sig)));
 +
 +        }
-+
-         break;
  
  #ifdef TARGET_NR_set_robust_list
+     case TARGET_NR_set_robust_list:
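The intent of the tgkill change, reconstructed as a hedged sketch (the elided
hunk lines carry the real logic): after fork(), the target still reports the
fork server's pid as its own, so a self-directed SIGABRT via tgkill would land
on the server and tear down the whole fuzzing session; such signals get
redirected to the actual child:

#include <signal.h>
#include <unistd.h>

static unsigned int afl_forksrv_pid;  /* set when the fork server spawns */

/* Rewrite a tgkill aimed at the fork server so it hits the current
   (forked) process instead. An assumption-level reconstruction. */
static void tgkill_fixup(int *pid, int *tgid, int sig) {
  if (afl_forksrv_pid && (unsigned)*pid == afl_forksrv_pid
      && sig == SIGABRT) {
    *pid = *tgid = getpid();
  }
}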
diff --git a/qemu_mode/patches/tcg.diff b/qemu_mode/patches/tcg.diff
new file mode 100644
index 00000000..f53a1acf
--- /dev/null
+++ b/qemu_mode/patches/tcg.diff
@@ -0,0 +1,181 @@
+diff --git a/tcg/tcg.c b/tcg/tcg.c
+index e85133ef..54b9b390 100644
+--- a/tcg/tcg.c
++++ b/tcg/tcg.c
+@@ -1612,6 +1612,176 @@ bool tcg_op_supported(TCGOpcode op)
+     }
+ }
+ 
++
++/* Call the instrumentation function from the TCG IR */
++void tcg_gen_afl_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
++{
++    int i, real_args, nb_rets, pi;
++    unsigned sizemask, flags;
++    TCGOp *op;
++
++    flags = 0;
++    sizemask = 0;
++
++#if defined(__sparc__) && !defined(__arch64__) \
++    && !defined(CONFIG_TCG_INTERPRETER)
++    /* We have 64-bit values in one register, but need to pass as two
++       separate parameters.  Split them.  */
++    int orig_sizemask = sizemask;
++    int orig_nargs = nargs;
++    TCGv_i64 retl, reth;
++    TCGTemp *split_args[MAX_OPC_PARAM];
++
++    retl = NULL;
++    reth = NULL;
++    if (sizemask != 0) {
++        for (i = real_args = 0; i < nargs; ++i) {
++            int is_64bit = sizemask & (1 << (i+1)*2);
++            if (is_64bit) {
++                TCGv_i64 orig = temp_tcgv_i64(args[i]);
++                TCGv_i32 h = tcg_temp_new_i32();
++                TCGv_i32 l = tcg_temp_new_i32();
++                tcg_gen_extr_i64_i32(l, h, orig);
++                split_args[real_args++] = tcgv_i32_temp(h);
++                split_args[real_args++] = tcgv_i32_temp(l);
++            } else {
++                split_args[real_args++] = args[i];
++            }
++        }
++        nargs = real_args;
++        args = split_args;
++        sizemask = 0;
++    }
++#elif defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
++    for (i = 0; i < nargs; ++i) {
++        int is_64bit = sizemask & (1 << (i+1)*2);
++        int is_signed = sizemask & (2 << (i+1)*2);
++        if (!is_64bit) {
++            TCGv_i64 temp = tcg_temp_new_i64();
++            TCGv_i64 orig = temp_tcgv_i64(args[i]);
++            if (is_signed) {
++                tcg_gen_ext32s_i64(temp, orig);
++            } else {
++                tcg_gen_ext32u_i64(temp, orig);
++            }
++            args[i] = tcgv_i64_temp(temp);
++        }
++    }
++#endif /* TCG_TARGET_EXTEND_ARGS */
++
++    op = tcg_emit_op(INDEX_op_call);
++
++    pi = 0;
++    if (ret != NULL) {
++#if defined(__sparc__) && !defined(__arch64__) \
++    && !defined(CONFIG_TCG_INTERPRETER)
++        if (orig_sizemask & 1) {
++            /* The 32-bit ABI is going to return the 64-bit value in
++               the %o0/%o1 register pair.  Prepare for this by using
++               two return temporaries, and reassemble below.  */
++            retl = tcg_temp_new_i64();
++            reth = tcg_temp_new_i64();
++            op->args[pi++] = tcgv_i64_arg(reth);
++            op->args[pi++] = tcgv_i64_arg(retl);
++            nb_rets = 2;
++        } else {
++            op->args[pi++] = temp_arg(ret);
++            nb_rets = 1;
++        }
++#else
++        if (TCG_TARGET_REG_BITS < 64 && (sizemask & 1)) {
++#ifdef HOST_WORDS_BIGENDIAN
++            op->args[pi++] = temp_arg(ret + 1);
++            op->args[pi++] = temp_arg(ret);
++#else
++            op->args[pi++] = temp_arg(ret);
++            op->args[pi++] = temp_arg(ret + 1);
++#endif
++            nb_rets = 2;
++        } else {
++            op->args[pi++] = temp_arg(ret);
++            nb_rets = 1;
++        }
++#endif
++    } else {
++        nb_rets = 0;
++    }
++    TCGOP_CALLO(op) = nb_rets;
++
++    real_args = 0;
++    for (i = 0; i < nargs; i++) {
++        int is_64bit = sizemask & (1 << (i+1)*2);
++        if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
++#ifdef TCG_TARGET_CALL_ALIGN_ARGS
++            /* some targets want aligned 64 bit args */
++            if (real_args & 1) {
++                op->args[pi++] = TCG_CALL_DUMMY_ARG;
++                real_args++;
++            }
++#endif
++           /* If stack grows up, then we will be placing successive
++              arguments at lower addresses, which means we need to
++              reverse the order compared to how we would normally
++              treat either big or little-endian.  For those arguments
++              that will wind up in registers, this still works for
++              HPPA (the only current STACK_GROWSUP target) since the
++              argument registers are *also* allocated in decreasing
++              order.  If another such target is added, this logic may
++              have to get more complicated to differentiate between
++              stack arguments and register arguments.  */
++#if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
++            op->args[pi++] = temp_arg(args[i] + 1);
++            op->args[pi++] = temp_arg(args[i]);
++#else
++            op->args[pi++] = temp_arg(args[i]);
++            op->args[pi++] = temp_arg(args[i] + 1);
++#endif
++            real_args += 2;
++            continue;
++        }
++
++        op->args[pi++] = temp_arg(args[i]);
++        real_args++;
++    }
++    op->args[pi++] = (uintptr_t)func;
++    op->args[pi++] = flags;
++    TCGOP_CALLI(op) = real_args;
++
++    /* Make sure the fields didn't overflow.  */
++    tcg_debug_assert(TCGOP_CALLI(op) == real_args);
++    tcg_debug_assert(pi <= ARRAY_SIZE(op->args));
++
++#if defined(__sparc__) && !defined(__arch64__) \
++    && !defined(CONFIG_TCG_INTERPRETER)
++    /* Free all of the parts we allocated above.  */
++    for (i = real_args = 0; i < orig_nargs; ++i) {
++        int is_64bit = orig_sizemask & (1 << (i+1)*2);
++        if (is_64bit) {
++            tcg_temp_free_internal(args[real_args++]);
++            tcg_temp_free_internal(args[real_args++]);
++        } else {
++            real_args++;
++        }
++    }
++    if (orig_sizemask & 1) {
++        /* The 32-bit ABI returned two 32-bit pieces.  Re-assemble them.
++           Note that describing these as TCGv_i64 eliminates an unnecessary
++           zero-extension that tcg_gen_concat_i32_i64 would create.  */
++        tcg_gen_concat32_i64(temp_tcgv_i64(ret), retl, reth);
++        tcg_temp_free_i64(retl);
++        tcg_temp_free_i64(reth);
++    }
++#elif defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
++    for (i = 0; i < nargs; ++i) {
++        int is_64bit = sizemask & (1 << (i+1)*2);
++        if (!is_64bit) {
++            tcg_temp_free_internal(args[i]);
++        }
++    }
++#endif /* TCG_TARGET_EXTEND_ARGS */
++}
++
++
+ /* Note: we convert the 64 bit args to 32 bit and do some alignment
+    and endian swap. Maybe it would be better to do the alignment
+    and endian swap in tcg_reg_alloc_call(). */
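A sketch of the call-site pattern this helper enables, as used by
afl_gen_trace() in afl-qemu-translate-inl.h (assumes the patched QEMU 3.1
tree and a 64-bit guest, hence tcgv_i64_temp; 32-bit targets would go through
tcgv_i32_temp instead):

/* Emit a TCG call to a plain C function (compiles only inside the
   patched QEMU tree, where these types and helpers exist). */
static void example_emit_trace_call(target_ulong cur_loc)
{
    /* Materialize cur_loc as a TCG constant and pass it as the single
       argument; a NULL return slot means a void function. */
    TCGTemp *args[1] = { tcgv_i64_temp(tcg_const_tl(cur_loc)) };
    tcg_gen_afl_callN(afl_maybe_log, NULL, 1, args);
}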
diff --git a/qemu_mode/patches/translate-all.diff b/qemu_mode/patches/translate-all.diff
index 853a66ad..ca310b11 100644
--- a/qemu_mode/patches/translate-all.diff
+++ b/qemu_mode/patches/translate-all.diff
@@ -1,6 +1,8 @@
---- a/accel/tcg/translate-all.c	2017-08-30 18:50:40.000000000 +0200
-+++ b/accel/tcg/translate-all.c	2018-09-21 10:19:42.328766554 +0200
-@@ -60,6 +60,8 @@
+diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
+index 639f0b27..21a45494 100644
+--- a/accel/tcg/translate-all.c
++++ b/accel/tcg/translate-all.c
+@@ -59,6 +59,8 @@
  #include "exec/log.h"
  #include "sysemu/cpus.h"
  
@@ -9,11 +11,11 @@
  /* #define DEBUG_TB_INVALIDATE */
  /* #define DEBUG_TB_FLUSH */
  /* make various TB consistency checks */
-@@ -1280,6 +1282,7 @@
-     tcg_func_start(&tcg_ctx);
+@@ -1721,6 +1723,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
+     tcg_func_start(tcg_ctx);
  
-     tcg_ctx.cpu = ENV_GET_CPU(env);
+     tcg_ctx->cpu = ENV_GET_CPU(env);
 +    afl_gen_trace(pc);
      gen_intermediate_code(cpu, tb);
-     tcg_ctx.cpu = NULL;
+     tcg_ctx->cpu = NULL;