about summary refs log tree commit diff
path: root/frida_mode/src
diff options
context:
space:
mode:
Diffstat (limited to 'frida_mode/src')
-rw-r--r-- frida_mode/src/cmplog/cmplog.c 84
-rw-r--r-- frida_mode/src/instrument/instrument.c 111
-rw-r--r-- frida_mode/src/main.c 84
-rw-r--r-- frida_mode/src/persistent/persistent.c 13
-rw-r--r-- frida_mode/src/persistent/persistent_arm64.c 12
-rw-r--r-- frida_mode/src/persistent/persistent_x64.c 53
-rw-r--r-- frida_mode/src/persistent/persistent_x86.c 40
7 files changed, 330 insertions, 67 deletions
diff --git a/frida_mode/src/cmplog/cmplog.c b/frida_mode/src/cmplog/cmplog.c
index 7b11c350..3df7d13d 100644
--- a/frida_mode/src/cmplog/cmplog.c
+++ b/frida_mode/src/cmplog/cmplog.c
@@ -1,3 +1,8 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <syscall.h>
+
 #include "frida-gum.h"
 
 #include "debug.h"
@@ -5,10 +10,13 @@
 #include "util.h"
 
 #define DEFAULT_MMAP_MIN_ADDR (32UL << 10)
+#define FD_TMP_MAX_SIZE 65536
 
 extern struct cmp_map *__afl_cmp_map;
 
 static GArray *cmplog_ranges = NULL;
+static int     fd_tmp = -1;
+static ssize_t fd_tmp_size = 0;
 
 static gboolean cmplog_range(const GumRangeDetails *details,
                              gpointer               user_data) {
@@ -27,6 +35,48 @@ static gint cmplog_sort(gconstpointer a, gconstpointer b) {
 
 }
 
+/*
+ * Create an empty scratch file from the "frida-cmplog-XXXXXX" template in the
+ * GLib temporary directory and return its descriptor. The name is unlinked
+ * immediately, so only the descriptor keeps the file alive. Every failure is
+ * reported through FATAL().
+ */
+static int cmplog_create_temp(void) {
+
+  const char *tmpdir = g_get_tmp_dir();
+  OKF("CMPLOG Temporary directory: %s", tmpdir);
+  gchar *fname = g_strdup_printf("%s/frida-cmplog-XXXXXX", tmpdir);
+  OKF("CMPLOG Temporary file template: %s", fname);
+  int fd = mkstemp(fname);
+
+  /* Test the result before logging, the log call could overwrite errno */
+  if (fd < 0) {
+
+    FATAL("Failed to create temp file: %s, errno: %d", fname, errno);
+
+  }
+
+  OKF("CMPLOG Temporary file: %s", fname);
+
+  if (unlink(fname) < 0) {
+
+    FATAL("Failed to unlink temp file: %s (%d), errno: %d", fname, fd, errno);
+
+  }
+
+  if (ftruncate(fd, 0) < 0) {
+
+    FATAL("Failed to ftruncate temp file: %s (%d), errno: %d", fname, fd,
+          errno);
+
+  }
+
+  g_free(fname);
+
+  return fd;
+
+}
+
 void cmplog_init(void) {
 
   if (__afl_cmp_map != NULL) { OKF("CMPLOG mode enabled"); }
@@ -44,6 +86,13 @@ void cmplog_init(void) {
 
   }
 
+  /*
+   * We can't use /dev/null or /dev/zero for this since it appears that they
+   * don't validate the input buffer, presumably as an optimization because they
+   * don't actually write any data. The file will be deleted on close.
+   */
+  fd_tmp = cmplog_create_temp();
+
 }
 
 static gboolean cmplog_contains(GumAddress inner_base, GumAddress inner_limit,
@@ -67,6 +116,9 @@ gboolean cmplog_is_readable(guint64 addr, size_t size) {
    */
   if (addr < DEFAULT_MMAP_MIN_ADDR) { return false; }
 
+  /* Check our address/length don't wrap around */
+  if (SIZE_MAX - addr < size) { return false; }
+
   GumAddress inner_base = addr;
   GumAddress inner_limit = inner_base + size;
 
@@ -81,6 +133,49 @@ gboolean cmplog_is_readable(guint64 addr, size_t size) {
 
   }
 
+  /*
+   * Our address map can change (e.g. stack growth), use write as a fallback to
+   * validate our address.
+   */
+  ssize_t written = syscall(__NR_write, fd_tmp, (void *)addr, size);
+
+  /*
+   * If the write succeeds, then the buffer must be valid otherwise it would
+   * return EFAULT
+   */
+  if (written > 0) {
+
+    fd_tmp_size += written;
+    if (fd_tmp_size > FD_TMP_MAX_SIZE) {
+
+      /*
+       * Truncate the file, we don't want our temp file to continue growing!
+       */
+      if (ftruncate(fd_tmp, 0) < 0) {
+
+        FATAL("Failed to truncate fd_tmp (%d), errno: %d", fd_tmp, errno);
+
+      }
+
+      /*
+       * ftruncate() leaves the file offset where it was, so rewind it too.
+       * Otherwise the next write lands past the old end and the file keeps
+       * growing (as a sparse file).
+       */
+      if (lseek(fd_tmp, 0, SEEK_SET) < 0) {
+
+        FATAL("Failed to rewind fd_tmp (%d), errno: %d", fd_tmp, errno);
+
+      }
+
+      fd_tmp_size = 0;
+
+    }
+
+    if ((size_t)written == size) { return true; }
+
+  }
+
   return false;
 
 }
diff --git a/frida_mode/src/instrument/instrument.c b/frida_mode/src/instrument/instrument.c
index f261e79a..ba82b89f 100644
--- a/frida_mode/src/instrument/instrument.c
+++ b/frida_mode/src/instrument/instrument.c
@@ -1,4 +1,6 @@
 #include <unistd.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
 
 #include "frida-gum.h"
 
@@ -18,44 +20,50 @@
 
 static gboolean               tracing = false;
 static gboolean               optimize = false;
+static gboolean               unique = false;
 static GumStalkerTransformer *transformer = NULL;
 
 __thread uint64_t previous_pc = 0;
 
+static GumAddress previous_rip = 0;
+static u8 *       edges_notified = NULL;
+
+static void trace_debug(char *format, ...) {
+
+  va_list ap;
+  char    buffer[4096] = {0};
+  int     ret;
+  int     len;
+
+  va_start(ap, format);
+  ret = vsnprintf(buffer, sizeof(buffer) - 1, format, ap);
+  va_end(ap);
+
+  if (ret < 0) { return; }
+
+  len = strnlen(buffer, sizeof(buffer));
+
+  IGNORED_RETURN(write(STDOUT_FILENO, buffer, len));
+
+}
+
 __attribute__((hot)) static void on_basic_block(GumCpuContext *context,
                                                 gpointer       user_data) {
 
   UNUSED_PARAMETER(context);
-  /*
-   * This function is performance critical as it is called to instrument every
-   * basic block. By moving our print buffer to a global, we avoid it affecting
-   * the critical path with additional stack adjustments if tracing is not
-   * enabled. If tracing is enabled, then we're printing a load of diagnostic
-   * information so this overhead is unlikely to be noticeable.
-   */
-  static char buffer[200];
-  int         len;
-  GumAddress  current_pc = GUM_ADDRESS(user_data);
-  uint8_t *   cursor;
-  uint64_t    value;
-  if (unlikely(tracing)) {
-
-    /* Avoid any functions which may cause an allocation since the target app
-     * may already be running inside malloc and it isn't designed to be
-     * re-entrant on a single thread */
-    len = snprintf(buffer, sizeof(buffer),
-                   "current_pc: 0x%016" G_GINT64_MODIFIER
-                   "x, previous_pc: 0x%016" G_GINT64_MODIFIER "x\n",
-                   current_pc, previous_pc);
 
-    IGNORED_RETURN(write(STDOUT_FILENO, buffer, len + 1));
+  GumAddress current_rip = GUM_ADDRESS(user_data);
+  GumAddress current_pc;
+  GumAddress edge;
+  uint8_t *  cursor;
+  uint64_t   value;
 
-  }
-
-  current_pc = (current_pc >> 4) ^ (current_pc << 8);
+  current_pc = (current_rip >> 4) ^ (current_rip << 8);
   current_pc &= MAP_SIZE - 1;
 
-  cursor = &__afl_area_ptr[current_pc ^ previous_pc];
+  edge = current_pc ^ previous_pc;
+
+  cursor = &__afl_area_ptr[edge];
   value = *cursor;
 
   if (value == 0xff) {
@@ -71,6 +79,23 @@ __attribute__((hot)) static void on_basic_block(GumCpuContext *context,
   *cursor = value;
   previous_pc = current_pc >> 1;
 
+  if (unlikely(tracing)) {
+
+    if (!unique || edges_notified[edge] == 0) {
+
+      trace_debug("TRACE: edge: %10" G_GINT64_MODIFIER
+                  "d, current_rip: 0x%016" G_GINT64_MODIFIER
+                  "x, previous_rip: 0x%016" G_GINT64_MODIFIER "x\n",
+                  edge, current_rip, previous_rip);
+
+    }
+
+    if (unique) { edges_notified[edge] = 1; }
+
+    previous_rip = current_rip;
+
+  }
+
 }
 
 static void instr_basic_block(GumStalkerIterator *iterator,
@@ -164,18 +189,28 @@ void instrument_init(void) {
 
   optimize = (getenv("AFL_FRIDA_INST_NO_OPTIMIZE") == NULL);
   tracing = (getenv("AFL_FRIDA_INST_TRACE") != NULL);
+  unique = (getenv("AFL_FRIDA_INST_TRACE_UNIQUE") != NULL);
 
   if (!instrument_is_coverage_optimize_supported()) optimize = false;
 
   OKF("Instrumentation - optimize [%c]", optimize ? 'X' : ' ');
   OKF("Instrumentation - tracing [%c]", tracing ? 'X' : ' ');
+  OKF("Instrumentation - unique [%c]", unique ? 'X' : ' ');
 
   if (tracing && optimize) {
 
-    FATAL("AFL_FRIDA_INST_OPTIMIZE and AFL_FRIDA_INST_TRACE are incompatible");
+    FATAL("AFL_FRIDA_INST_TRACE requires AFL_FRIDA_INST_NO_OPTIMIZE");
+
+  }
+
+  if (unique && optimize) {
+
+    FATAL("AFL_FRIDA_INST_TRACE_UNIQUE requires AFL_FRIDA_INST_NO_OPTIMIZE");
 
   }
 
+  if (unique) { tracing = TRUE; }
+
   if (__afl_map_size != 0x10000) {
 
     FATAL("Bad map size: 0x%08x", __afl_map_size);
@@ -185,6 +220,40 @@ void instrument_init(void) {
   transformer =
       gum_stalker_transformer_make_from_callback(instr_basic_block, NULL, NULL);
 
+  /*
+   * Unique tracing needs one flag per edge (MAP_SIZE bytes) recording which
+   * edges have already been reported.
+   */
+  if (unique) {
+
+    int shm_id = shmget(IPC_PRIVATE, MAP_SIZE, IPC_CREAT | IPC_EXCL | 0600);
+    if (shm_id < 0) { FATAL("shm_id < 0 - errno: %d\n", errno); }
+
+    /*
+     * shmat() signals failure with (void *)-1. Check it explicitly, since
+     * g_assert() is compiled out when G_DISABLE_ASSERT is defined.
+     */
+    edges_notified = shmat(shm_id, NULL, 0);
+    if (edges_notified == (void *)-1) {
+
+      FATAL("shmat failed - errno: %d\n", errno);
+
+    }
+
+    /*
+     * Configure the shared memory region to be removed once the process dies.
+     */
+    if (shmctl(shm_id, IPC_RMID, NULL) < 0) {
+
+      FATAL("shmctl (IPC_RMID) < 0 - errno: %d\n", errno);
+
+    }
+
+    /* Clear it, not sure it's necessary, just seems like good practice */
+    memset(edges_notified, '\0', MAP_SIZE);
+
+  }
+
   instrument_debug_init();
   asan_init();
   cmplog_init();
diff --git a/frida_mode/src/main.c b/frida_mode/src/main.c
index 1ab9993f..7ff23755 100644
--- a/frida_mode/src/main.c
+++ b/frida_mode/src/main.c
@@ -1,4 +1,5 @@
 #include <errno.h>
+#include <fcntl.h>
 #include <unistd.h>
 #include <sys/types.h>
 
@@ -27,6 +28,8 @@
 #include "stats.h"
 #include "util.h"
 
+#define PROC_MAX 65536
+
 #ifdef __APPLE__
 extern mach_port_t mach_task_self();
 extern GumAddress  gum_darwin_find_entrypoint(mach_port_t task);
@@ -78,7 +81,7 @@ static void on_main_os(int argc, char **argv, char **envp) {
 
 #endif
 
-static void embedded_init() {
+static void embedded_init(void) {
 
   static gboolean initialized = false;
   if (!initialized) {
@@ -90,7 +93,98 @@ static void embedded_init() {
 
 }
 
-void afl_frida_start() {
+/*
+ * Log every NUL-separated argument found in /proc/<ppid>/cmdline, i.e. the
+ * command line of our parent process. A single read of at most PROC_MAX - 1
+ * bytes is made into a zeroed buffer, so the last string is always terminated
+ * and anything beyond that size is not shown. Failure to open or read the file
+ * is fatal.
+ */
+static void afl_print_cmdline(void) {
+
+  char * buffer = g_malloc0(PROC_MAX);
+  gchar *fname = g_strdup_printf("/proc/%d/cmdline", getppid());
+  int    fd = open(fname, O_RDONLY);
+
+  if (fd < 0) {
+
+    FATAL("Failed to open %s, errno: (%d)", fname, errno);
+
+  }
+
+  ssize_t bytes_read = read(fd, buffer, PROC_MAX - 1);
+  if (bytes_read < 0) {
+
+    FATAL("Failed to read %s, errno: (%d)", fname, errno);
+
+  }
+
+  int idx = 0;
+
+  for (ssize_t i = 0; i < bytes_read; i++) {
+
+    if (i == 0 || buffer[i - 1] == '\0') {
+
+      OKF("AFL - COMMANDLINE: argv[%d] = %s", idx++, &buffer[i]);
+
+    }
+
+  }
+
+  close(fd);
+  g_free(fname);
+  g_free(buffer);
+
+}
+
+/*
+ * Log every NUL-separated entry found in /proc/<ppid>/environ, i.e. the
+ * environment of our parent process. A single read of at most PROC_MAX - 1
+ * bytes is made into a zeroed buffer, so the last string is always terminated
+ * and anything beyond that size is not shown. Failure to open or read the file
+ * is fatal.
+ */
+static void afl_print_env(void) {
+
+  char * buffer = g_malloc0(PROC_MAX);
+  gchar *fname = g_strdup_printf("/proc/%d/environ", getppid());
+  int    fd = open(fname, O_RDONLY);
+
+  if (fd < 0) {
+
+    FATAL("Failed to open %s, errno: (%d)", fname, errno);
+
+  }
+
+  ssize_t bytes_read = read(fd, buffer, PROC_MAX - 1);
+  if (bytes_read < 0) {
+
+    FATAL("Failed to read %s, errno: (%d)", fname, errno);
+
+  }
+
+  int idx = 0;
+
+  for (ssize_t i = 0; i < bytes_read; i++) {
+
+    if (i == 0 || buffer[i - 1] == '\0') {
+
+      OKF("AFL - ENVIRONMENT %3d: %s", idx++, &buffer[i]);
+
+    }
+
+  }
+
+  close(fd);
+  g_free(fname);
+  g_free(buffer);
+
+}
+
+void afl_frida_start(void) {
+
+  afl_print_cmdline();
+  afl_print_env();
 
   embedded_init();
   stalker_init();
diff --git a/frida_mode/src/persistent/persistent.c b/frida_mode/src/persistent/persistent.c
index 2ec5b9cc..243d501d 100644
--- a/frida_mode/src/persistent/persistent.c
+++ b/frida_mode/src/persistent/persistent.c
@@ -13,7 +13,6 @@ afl_persistent_hook_fn hook = NULL;
 guint64                persistent_start = 0;
 guint64                persistent_count = 0;
 guint64                persistent_ret = 0;
-guint64                persistent_ret_offset = 0;
 gboolean               persistent_debug = FALSE;
 
 void persistent_init(void) {
@@ -23,8 +22,6 @@ void persistent_init(void) {
   persistent_start = util_read_address("AFL_FRIDA_PERSISTENT_ADDR");
   persistent_count = util_read_num("AFL_FRIDA_PERSISTENT_CNT");
   persistent_ret = util_read_address("AFL_FRIDA_PERSISTENT_RET");
-  persistent_ret_offset =
-      util_read_address("AFL_FRIDA_PERSISTENT_RETADDR_OFFSET");
 
   if (getenv("AFL_FRIDA_PERSISTENT_DEBUG") != NULL) { persistent_debug = TRUE; }
 
@@ -44,14 +41,6 @@ void persistent_init(void) {
 
   }
 
-  if (persistent_ret_offset != 0 && persistent_ret == 0) {
-
-    FATAL(
-        "AFL_FRIDA_PERSISTENT_RET must be specified if "
-        "AFL_FRIDA_PERSISTENT_RETADDR_OFFSET is");
-
-  }
-
   if (persistent_start != 0 && persistent_count == 0) persistent_count = 1000;
 
   if (persistent_count != 0 && persistent_count < 100)
@@ -68,8 +57,6 @@ void persistent_init(void) {
 
   OKF("Instrumentation - persistent ret [%c] (0x%016" G_GINT64_MODIFIER "X)",
       persistent_ret == 0 ? ' ' : 'X', persistent_ret);
-  OKF("Instrumentation - persistent ret offset [%c] (%" G_GINT64_MODIFIER "d)",
-      persistent_ret_offset == 0 ? ' ' : 'X', persistent_ret_offset);
 
   if (hook_name != NULL) {
 
diff --git a/frida_mode/src/persistent/persistent_arm64.c b/frida_mode/src/persistent/persistent_arm64.c
index b23693fe..d7c6c76b 100644
--- a/frida_mode/src/persistent/persistent_arm64.c
+++ b/frida_mode/src/persistent/persistent_arm64.c
@@ -268,13 +268,15 @@ static void instrument_persitent_restore_regs(GumArm64Writer *   cw,
                                               ARM64_REG_X0, (16 * 14),
                                               GUM_INDEX_SIGNED_OFFSET);
 
-  /* Don't restore RIP or RSP, use x1-x3 as clobber */
-
-  /* LR & Adjusted SP (clobber x1) */
+  /* LR & Adjusted SP (use x1 as clobber) */
   gum_arm64_writer_put_ldp_reg_reg_reg_offset(cw, ARM64_REG_X30, ARM64_REG_X1,
                                               ARM64_REG_X0, (16 * 15),
                                               GUM_INDEX_SIGNED_OFFSET);
 
+  gum_arm64_writer_put_mov_reg_reg(cw, ARM64_REG_SP, ARM64_REG_X1);
+
+  /* Don't restore PC, use x1-x3 as clobber */
+
   /* PC (x2) & CPSR (x1) */
   gum_arm64_writer_put_ldp_reg_reg_reg_offset(cw, ARM64_REG_X2, ARM64_REG_X1,
                                               ARM64_REG_X0, (16 * 16),
@@ -404,7 +406,6 @@ void persistent_prologue(GumStalkerOutput *output) {
 
   gconstpointer loop = cw->code + 1;
 
-  /* Stack must be 16-byte aligned per ABI */
   instrument_persitent_save_regs(cw, &saved_regs);
 
   /* loop: */
@@ -450,9 +451,6 @@ void persistent_epilogue(GumStalkerOutput *output) {
 
   if (persistent_debug) { gum_arm64_writer_put_brk_imm(cw, 0); }
 
-  gum_arm64_writer_put_add_reg_reg_imm(cw, ARM64_REG_SP, ARM64_REG_SP,
-                                       persistent_ret_offset);
-
   gum_arm64_writer_put_ldr_reg_address(cw, ARM64_REG_X0,
                                        GUM_ADDRESS(&saved_lr));
 
diff --git a/frida_mode/src/persistent/persistent_x64.c b/frida_mode/src/persistent/persistent_x64.c
index 858ad38e..653acefe 100644
--- a/frida_mode/src/persistent/persistent_x64.c
+++ b/frida_mode/src/persistent/persistent_x64.c
@@ -43,6 +43,7 @@ struct x86_64_regs {
 typedef struct x86_64_regs arch_api_regs;
 
 static arch_api_regs saved_regs = {0};
+static gpointer      saved_ret = NULL;
 
 gboolean persistent_is_supported(void) {
 
@@ -104,7 +105,7 @@ static void instrument_persitent_save_regs(GumX86Writer *      cw,
 
   /* RED_ZONE + Saved flags, RAX, alignment */
   gum_x86_writer_put_add_reg_imm(cw, GUM_REG_RBX,
-                                 GUM_RED_ZONE_SIZE + (0x8 * 3));
+                                 GUM_RED_ZONE_SIZE + (0x8 * 2));
   gum_x86_writer_put_mov_reg_offset_ptr_reg(cw, GUM_REG_RAX, (0x8 * 16),
                                             GUM_REG_RBX);
 
@@ -159,7 +160,9 @@ static void instrument_persitent_restore_regs(GumX86Writer *      cw,
   gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_R15, GUM_REG_RAX,
                                             (0x8 * 14));
 
-  /* Don't restore RIP or RSP */
+  /* Don't restore RIP */
+  gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_RSP, GUM_REG_RAX,
+                                            (0x8 * 16));
 
   /* Restore RBX, RAX & Flags */
   gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP,
@@ -242,6 +245,31 @@ static void persistent_prologue_hook(GumX86Writer *      cw,
 
 }
 
+static void instrument_persitent_save_ret(GumX86Writer *cw) {
+
+  /* Stack used below; RSP + offset is then the slot copied into saved_ret */
+  gssize offset = GUM_RED_ZONE_SIZE + (3 * 8);
+  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP,
+                                        -(GUM_RED_ZONE_SIZE));
+
+  gum_x86_writer_put_pushfx(cw);
+  gum_x86_writer_put_push_reg(cw, GUM_REG_RAX);
+  gum_x86_writer_put_push_reg(cw, GUM_REG_RBX);
+
+  gum_x86_writer_put_mov_reg_address(cw, GUM_REG_RAX, GUM_ADDRESS(&saved_ret));
+  gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_RBX, GUM_REG_RSP,
+                                            offset);
+  gum_x86_writer_put_mov_reg_ptr_reg(cw, GUM_REG_RAX, GUM_REG_RBX);
+
+  gum_x86_writer_put_pop_reg(cw, GUM_REG_RBX);
+  gum_x86_writer_put_pop_reg(cw, GUM_REG_RAX);
+  gum_x86_writer_put_popfx(cw);
+
+  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP,
+                                        (GUM_RED_ZONE_SIZE));
+
+}
+
 void persistent_prologue(GumStalkerOutput *output) {
 
   /*
@@ -268,11 +296,10 @@ void persistent_prologue(GumStalkerOutput *output) {
 
   gconstpointer loop = cw->code + 1;
 
-  /* Stack must be 16-byte aligned per ABI */
-  instrument_persitent_save_regs(cw, &saved_regs);
+  /* Pop the return value */
+  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP, 8);
 
-  /* pop the return value */
-  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP, (8));
+  instrument_persitent_save_regs(cw, &saved_regs);
 
   /* loop: */
   gum_x86_writer_put_label(cw, loop);
@@ -304,6 +331,8 @@ void persistent_prologue(GumStalkerOutput *output) {
   /* original: */
   gum_x86_writer_put_label(cw, original);
 
+  instrument_persitent_save_ret(cw);
+
   if (persistent_debug) { gum_x86_writer_put_breakpoint(cw); }
 
 }
@@ -314,9 +343,15 @@ void persistent_epilogue(GumStalkerOutput *output) {
 
   if (persistent_debug) { gum_x86_writer_put_breakpoint(cw); }
 
-  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP,
-                                        persistent_ret_offset);
-  gum_x86_writer_put_ret(cw);
+  /* The stack should be aligned when we re-enter our loop */
+  gconstpointer zero = cw->code + 1;
+  gum_x86_writer_put_test_reg_u32(cw, GUM_REG_RSP, 0xF);
+  gum_x86_writer_put_jcc_near_label(cw, X86_INS_JE, zero, GUM_NO_HINT);
+  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_RSP, GUM_REG_RSP, -8);
+  gum_x86_writer_put_label(cw, zero);
+
+  gum_x86_writer_put_mov_reg_address(cw, GUM_REG_RAX, GUM_ADDRESS(&saved_ret));
+  gum_x86_writer_put_jmp_reg_ptr(cw, GUM_REG_RAX);
 
 }
 
diff --git a/frida_mode/src/persistent/persistent_x86.c b/frida_mode/src/persistent/persistent_x86.c
index 0675edf4..7add6e99 100644
--- a/frida_mode/src/persistent/persistent_x86.c
+++ b/frida_mode/src/persistent/persistent_x86.c
@@ -39,6 +39,7 @@ struct x86_regs {
 typedef struct x86_regs arch_api_regs;
 
 static arch_api_regs saved_regs = {0};
+static gpointer      saved_ret = NULL;
 
 gboolean persistent_is_supported(void) {
 
@@ -117,7 +118,9 @@ static void instrument_persitent_restore_regs(GumX86Writer *   cw,
   gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_EBP, GUM_REG_EAX,
                                             (0x4 * 6));
 
-  /* Don't restore RIP or RSP */
+  /* Don't restore RIP */
+  gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_ESP, GUM_REG_EAX,
+                                            (0x4 * 8));
 
   /* Restore RBX, RAX & Flags */
   gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_EBX, GUM_REG_EAX,
@@ -184,6 +187,26 @@ static void persistent_prologue_hook(GumX86Writer *cw, struct x86_regs *regs) {
 
 }
 
+static void instrument_persitent_save_ret(GumX86Writer *cw) {
+
+  /* Stack used below; ESP + offset is then the slot copied into saved_ret */
+  gssize offset = (3 * 4);
+
+  gum_x86_writer_put_pushfx(cw);
+  gum_x86_writer_put_push_reg(cw, GUM_REG_EAX);
+  gum_x86_writer_put_push_reg(cw, GUM_REG_EBX);
+
+  gum_x86_writer_put_mov_reg_address(cw, GUM_REG_EAX, GUM_ADDRESS(&saved_ret));
+  gum_x86_writer_put_mov_reg_reg_offset_ptr(cw, GUM_REG_EBX, GUM_REG_ESP,
+                                            offset);
+  gum_x86_writer_put_mov_reg_ptr_reg(cw, GUM_REG_EAX, GUM_REG_EBX);
+
+  gum_x86_writer_put_pop_reg(cw, GUM_REG_EBX);
+  gum_x86_writer_put_pop_reg(cw, GUM_REG_EAX);
+  gum_x86_writer_put_popfx(cw);
+
+}
+
 void persistent_prologue(GumStalkerOutput *output) {
 
   /*
@@ -210,11 +233,10 @@ void persistent_prologue(GumStalkerOutput *output) {
 
   gconstpointer loop = cw->code + 1;
 
-  /* Stack must be 16-byte aligned per ABI */
-  instrument_persitent_save_regs(cw, &saved_regs);
-
   /* Pop the return value */
-  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_ESP, GUM_REG_ESP, (4));
+  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_ESP, GUM_REG_ESP, 4);
+
+  instrument_persitent_save_regs(cw, &saved_regs);
 
   /* loop: */
   gum_x86_writer_put_label(cw, loop);
@@ -244,6 +266,8 @@ void persistent_prologue(GumStalkerOutput *output) {
   /* original: */
   gum_x86_writer_put_label(cw, original);
 
+  instrument_persitent_save_ret(cw);
+
   if (persistent_debug) { gum_x86_writer_put_breakpoint(cw); }
 
 }
@@ -254,10 +278,8 @@ void persistent_epilogue(GumStalkerOutput *output) {
 
   if (persistent_debug) { gum_x86_writer_put_breakpoint(cw); }
 
-  gum_x86_writer_put_lea_reg_reg_offset(cw, GUM_REG_ESP, GUM_REG_ESP,
-                                        persistent_ret_offset);
-
-  gum_x86_writer_put_ret(cw);
+  gum_x86_writer_put_mov_reg_address(cw, GUM_REG_EAX, GUM_ADDRESS(&saved_ret));
+  gum_x86_writer_put_jmp_reg_ptr(cw, GUM_REG_EAX);
 
 }