12 files changed, 275 insertions, 82 deletions
diff --git a/instrumentation/LLVMInsTrim.so.cc b/instrumentation/LLVMInsTrim.so.cc
index f0de6536..62de6ec5 100644
--- a/instrumentation/LLVMInsTrim.so.cc
+++ b/instrumentation/LLVMInsTrim.so.cc
@@ -38,7 +38,7 @@ typedef long double max_align_t;
 
 #include "MarkNodes.h"
 #include "afl-llvm-common.h"
-#include "llvm-ngram-coverage.h"
+#include "llvm-alternative-coverage.h"
 
 #include "config.h"
 #include "debug.h"
diff --git a/instrumentation/README.cmplog.md b/instrumentation/README.cmplog.md
index 5f855e1f..a796c7a7 100644
--- a/instrumentation/README.cmplog.md
+++ b/instrumentation/README.cmplog.md
@@ -1,10 +1,11 @@
 # CmpLog instrumentation
 
-The CmpLog instrumentation enables the logging of the comparisons operands in a
+The CmpLog instrumentation enables logging of comparison operands in a
 shared memory.
 
 These values can be used by various mutators built on top of it.
-At the moment we support the RedQueen mutator (input-2-state instructions only).
+At the moment we support the RedQueen mutator (input-2-state instructions only), 
+for details see [the RedQueen paper](https://www.syssec.ruhr-uni-bochum.de/media/emma/veroeffentlichungen/2018/12/17/NDSS19-Redqueen.pdf).
 
 ## Build
 
@@ -13,7 +14,7 @@ program.
 
 The first version is built using the regular AFL++ instrumentation.
 
-The second one, the CmpLog binary, with setting AFL_LLVM_CMPLOG during the compilation.
+The second one, the CmpLog binary, is built with setting AFL_LLVM_CMPLOG during the compilation.
 
 For example:
 
@@ -26,11 +27,12 @@ export AFL_LLVM_CMPLOG=1
 ./configure --cc=~/path/to/afl-clang-fast
 make
 cp ./program ./program.cmplog
+unset AFL_LLVM_CMPLOG
 ```
 
 ## Use
 
-AFL++ has the new -c option that needs to be used to specify the CmpLog binary (the second
+AFL++ has the new `-c` option that needs to be used to specify the CmpLog binary (the second
 build).
 
 For example:
diff --git a/instrumentation/README.ctx.md b/instrumentation/README.ctx.md
index 577b3e5f..335e9921 100644
--- a/instrumentation/README.ctx.md
+++ b/instrumentation/README.ctx.md
@@ -29,7 +29,7 @@ many map collisions occur.
 ## Caller Branch Coverage
 
 If the context sensitive coverage introduces too may collisions and becoming
-decremental, the user can choose to augment edge coverage with just the
+detrimental, the user can choose to augment edge coverage with just the
 called function ID, instead of the entire callstack hash.
 
 In math the coverage is collected as follows:
diff --git a/instrumentation/README.gcc_plugin.md b/instrumentation/README.gcc_plugin.md
index 12449efd..230ceb73 100644
--- a/instrumentation/README.gcc_plugin.md
+++ b/instrumentation/README.gcc_plugin.md
@@ -3,16 +3,20 @@
 See [../README.md](../README.md) for the general instruction manual.
 See [README.llvm.md](README.llvm.md) for the LLVM-based instrumentation.
 
+This document describes how to build and use `afl-gcc-fast` and `afl-g++-fast`,
+which instrument the target with the help of gcc plugins.
+
 TLDR:
-  * `apt-get install gcc-VERSION-plugin-dev`
-  * `make`
-  * gcc and g++ must point to the gcc-VERSION you you have to set AFL_CC/AFL_CXX
+  * check the version of your gcc compiler: `gcc --version`
+  * `apt-get install gcc-VERSION-plugin-dev` or similar to install headers for gcc plugins
+  * `gcc` and `g++` must match the gcc-VERSION you installed headers for. You can set `AFL_CC`/`AFL_CXX`
     to point to these!
-  * just use afl-gcc-fast/afl-g++-fast normally like you would afl-clang-fast
+  * `make`
+  * just use `afl-gcc-fast`/`afl-g++-fast` normally like you would do with `afl-clang-fast`
 
 ## 1) Introduction
 
-The code in this directory allows you to instrument programs for AFL using
+The code in this directory allows to instrument programs for AFL using
 true compiler-level instrumentation, instead of the more crude
 assembly-level rewriting approach taken by afl-gcc and afl-clang. This has
 several interesting properties:
@@ -27,10 +31,10 @@ several interesting properties:
 
   - The instrumentation is CPU-independent. At least in principle, you should
     be able to rely on it to fuzz programs on non-x86 architectures (after
-    building afl-fuzz with AFL_NOX86=1).
+    building `afl-fuzz` with `AFL_NOX86=1`).
 
   - Because the feature relies on the internals of GCC, it is gcc-specific
-    and will *not* work with LLVM (see ../llvm_mode for an alternative).
+    and will *not* work with LLVM (see [README.llvm.md](README.llvm.md) for an alternative).
 
 Once this implementation is shown to be sufficiently robust and portable, it
 will probably replace afl-gcc. For now, it can be built separately and
@@ -41,29 +45,32 @@ The idea and much of the implementation comes from Laszlo Szekeres.
 ## 2) How to use
 
 In order to leverage this mechanism, you need to have modern enough GCC
-(>= version 4.5.0) and the plugin headers installed on your system. That
+(>= version 4.5.0) and the plugin development headers installed on your system. That
 should be all you need. On Debian machines, these headers can be acquired by
 installing the `gcc-VERSION-plugin-dev` packages.
 
-To build the instrumentation itself, type 'make'. This will generate binaries
-called afl-gcc-fast and afl-g++-fast in the parent directory. 
+To build the instrumentation itself, type `make`. This will generate binaries
+called `afl-gcc-fast` and `afl-g++-fast` in the parent directory. 
 
 The gcc and g++ compiler links have to point to gcc-VERSION - or set these
-by pointing the environment variables AFL_CC/AFL_CXX to them.
-If the CC/CXX have been overridden, those compilers will be used from
-those wrappers without using AFL_CXX/AFL_CC settings.
+by pointing the environment variables `AFL_CC`/`AFL_CXX` to them.
+If the `CC`/`CXX` environment variables have been set, those compilers will be 
+preferred over those from the `AFL_CC`/`AFL_CXX` settings.
 
 Once this is done, you can instrument third-party code in a way similar to the
 standard operating mode of AFL, e.g.:
-
-  CC=/path/to/afl/afl-gcc-fast ./configure [...options...]
+```
+  CC=/path/to/afl/afl-gcc-fast
+  CXX=/path/to/afl/afl-g++-fast
+  export CC CXX
+  ./configure [...options...]
   make
+```
+Note: We also used `CXX` to set the C++ compiler to `afl-g++-fast` for C++ code.
 
-Be sure to also include CXX set to afl-g++-fast for C++ code.
-
-The tool honors roughly the same environmental variables as afl-gcc (see
-[env_variables.md](../docs/env_variables.md). This includes AFL_INST_RATIO,
-AFL_USE_ASAN, AFL_HARDEN, and AFL_DONT_OPTIMIZE.
+The tool honors roughly the same environmental variables as `afl-gcc` (see
+[env_variables.md](../docs/env_variables.md). This includes `AFL_INST_RATIO`,
+`AFL_USE_ASAN`, `AFL_HARDEN`, and `AFL_DONT_OPTIMIZE`.
 
 Note: if you want the GCC plugin to be installed on your system for all
 users, you need to build it before issuing 'make install' in the parent
@@ -72,7 +79,7 @@ directory.
 ## 3) Gotchas, feedback, bugs
 
 This is an early-stage mechanism, so field reports are welcome. You can send bug
-reports to afl@aflplus.plus
+reports to afl@aflplus.plus.
 
 ## 4) Bonus feature #1: deferred initialization
 
@@ -88,7 +95,7 @@ file before getting to the fuzzed data.
 In such cases, it's beneficial to initialize the forkserver a bit later, once
 most of the initialization work is already done, but before the binary attempts
 to read the fuzzed input and parse it; in some cases, this can offer a 10x+
-performance gain. You can implement delayed initialization in LLVM mode in a
+performance gain. You can implement delayed initialization in GCC mode in a
 fairly simple way.
 
 First, locate a suitable location in the code where the delayed cloning can
@@ -117,7 +124,7 @@ With the location selected, add this code in the appropriate spot:
 ```
 
 You don't need the #ifdef guards, but they will make the program still work as
-usual when compiled with a tool other than afl-gcc-fast/afl-clang-fast.
+usual when compiled with a compiler other than afl-gcc-fast/afl-clang-fast.
 
 Finally, recompile the program with afl-gcc-fast (afl-gcc or afl-clang will
 *not* generate a deferred-initialization binary) - and you should be all set!
@@ -127,7 +134,7 @@ Finally, recompile the program with afl-gcc-fast (afl-gcc or afl-clang will
 Some libraries provide APIs that are stateless, or whose state can be reset in
 between processing different input files. When such a reset is performed, a
 single long-lived process can be reused to try out multiple test cases,
-eliminating the need for repeated fork() calls and the associated OS overhead.
+eliminating the need for repeated `fork()` calls and the associated OS overhead.
 
 The basic structure of the program that does this would be:
 
@@ -160,5 +167,9 @@ wary of memory leaks and the state of file descriptors.
 When running in this mode, the execution paths will inherently vary a bit
 depending on whether the input loop is being entered for the first time or
 executed again. To avoid spurious warnings, the feature implies
-AFL_NO_VAR_CHECK and hides the "variable path" warnings in the UI.
+`AFL_NO_VAR_CHECK` and hides the "variable path" warnings in the UI.
+
+## 6) Bonus feature #3: selective instrumentation
 
+It can be more effective to fuzzing to only instrument parts of the code.
+For details see [README.instrument_list.md](README.instrument_list.md).
diff --git a/instrumentation/README.neverzero.md b/instrumentation/README.neverzero.md
index 5c894d6e..49104e00 100644
--- a/instrumentation/README.neverzero.md
+++ b/instrumentation/README.neverzero.md
@@ -16,7 +16,7 @@ at a very little cost (one instruction per edge).
 (The alternative of saturated counters has been tested also and proved to be
 inferior in terms of path discovery.)
 
-This is implemented in afl-gcc, however for llvm_mode this is optional if
+This is implemented in afl-gcc and afl-gcc-fast, however for llvm_mode this is optional if
 the llvm version is below 9 - as there is a perfomance bug that is only fixed
 in version 9 and onwards.
 
diff --git a/instrumentation/README.ngram.md b/instrumentation/README.ngram.md
index de3ba432..da61ef32 100644
--- a/instrumentation/README.ngram.md
+++ b/instrumentation/README.ngram.md
@@ -10,8 +10,8 @@ by Jinghan Wang, et. al.
 Note that the original implementation (available
 [here](https://github.com/bitsecurerlab/afl-sensitive))
 is built on top of AFL's QEMU mode.
-This is essentially a port that uses LLVM vectorized instructions to achieve
-the same results when compiling source code.
+This is essentially a port that uses LLVM vectorized instructions (available from
+llvm versions 4.0.1 and higher) to achieve the same results when compiling source code.
 
 In math the branch coverage is performed as follows:
 `map[current_location ^ prev_location[0] >> 1 ^ prev_location[1] >> 1 ^ ... up to n-1`] += 1`
diff --git a/instrumentation/README.out_of_line.md b/instrumentation/README.out_of_line.md
index aad215b6..2264f91f 100644
--- a/instrumentation/README.out_of_line.md
+++ b/instrumentation/README.out_of_line.md
@@ -1,18 +1,16 @@
-===========================================
-Using afl++ without inlined instrumentation
-===========================================
+## Using afl++ without inlined instrumentation
 
   This file describes how you can disable inlining of instrumentation.
 
 
 By default, the GCC plugin will duplicate the effects of calling
-__afl_trace (see afl-gcc-rt.o.c) in instrumented code, instead of
+`__afl_trace` (see `afl-gcc-rt.o.c`) in instrumented code, instead of
 issuing function calls.
 
 The calls are presumed to be slower, more so because the rt file
 itself is not optimized by the compiler.
 
-Setting AFL_GCC_OUT_OF_LINE=1 in the environment while compiling code
+Setting `AFL_GCC_OUT_OF_LINE=1` in the environment while compiling code
 with the plugin will disable this inlining, issuing calls to the
 unoptimized runtime instead.
 
diff --git a/instrumentation/README.persistent_mode.md b/instrumentation/README.persistent_mode.md
index 2cf76adf..24f81ea0 100644
--- a/instrumentation/README.persistent_mode.md
+++ b/instrumentation/README.persistent_mode.md
@@ -16,7 +16,7 @@ Examples can be found in [utils/persistent_mode](../utils/persistent_mode).
 ## 2) TLDR;
 
 Example `fuzz_target.c`:
-```
+```c
 #include "what_you_need_for_your_target.h"
 
 __AFL_FUZZ_INIT();
@@ -60,14 +60,14 @@ The speed increase is usually x10 to x20.
 If you want to be able to compile the target without afl-clang-fast/lto then
 add this just after the includes:
 
-```
+```c
 #ifndef __AFL_FUZZ_TESTCASE_LEN
   ssize_t fuzz_len;
   #define __AFL_FUZZ_TESTCASE_LEN fuzz_len
   unsigned char fuzz_buf[1024000];
   #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf
   #define __AFL_FUZZ_INIT() void sync(void);
-  #define __AFL_LOOP(x) ((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ?
+  #define __AFL_LOOP(x) ((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 1 : 0)
   #define __AFL_INIT() sync() 
 #endif
 ```
@@ -75,7 +75,7 @@ add this just after the includes:
 ## 3) Deferred initialization
 
 AFL tries to optimize performance by executing the targeted binary just once,
-stopping it just before main(), and then cloning this "main" process to get
+stopping it just before `main()`, and then cloning this "main" process to get
 a steady supply of targets to fuzz.
 
 Although this approach eliminates much of the OS-, linker- and libc-level
@@ -97,7 +97,7 @@ a location after:
   - The creation of any vital threads or child processes - since the forkserver
     can't clone them easily.
 
-  - The initialization of timers via setitimer() or equivalent calls.
+  - The initialization of timers via `setitimer()` or equivalent calls.
 
   - The creation of temporary files, network sockets, offset-sensitive file
     descriptors, and similar shared-state resources - but only provided that
@@ -150,9 +150,9 @@ the impact of memory leaks and similar glitches; 1000 is a good starting point,
 and going much higher increases the likelihood of hiccups without giving you
 any real performance benefits.
 
-A more detailed template is shown in ../utils/persistent_mode/.
-Similarly to the previous mode, the feature works only with afl-clang-fast; #ifdef
-guards can be used to suppress it when using other compilers.
+A more detailed template is shown in `../utils/persistent_mode/.`
+Similarly to the previous mode, the feature works only with afl-clang-fast; 
+`#ifdef` guards can be used to suppress it when using other compilers.
 
 Note that as with the previous mode, the feature is easy to misuse; if you
 do not fully reset the critical state, you may end up with false positives or
@@ -161,7 +161,7 @@ wary of memory leaks and of the state of file descriptors.
 
 PS. Because there are task switches still involved, the mode isn't as fast as
 "pure" in-process fuzzing offered, say, by LLVM's LibFuzzer; but it is a lot
-faster than the normal fork() model, and compared to in-process fuzzing,
+faster than the normal `fork()` model, and compared to in-process fuzzing,
 should be a lot more robust.
 
 ## 5) Shared memory fuzzing
@@ -174,17 +174,17 @@ Setting this up is very easy:
 
 After the includes set the following macro:
 
-```
+```c
 __AFL_FUZZ_INIT();
 ```
 Directly at the start of main - or if you are using the deferred forkserver
-with `__AFL_INIT()`  then *after* `__AFL_INIT? :
-```
+with `__AFL_INIT()` then *after* `__AFL_INIT()` :
+```c
   unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
 ```
 
 Then as first line after the `__AFL_LOOP` while loop:
-```
+```c
   int len = __AFL_FUZZ_TESTCASE_LEN;
 ```
 and that is all!
diff --git a/instrumentation/afl-compiler-rt.o.c b/instrumentation/afl-compiler-rt.o.c
index a702ec39..cca38cd0 100644
--- a/instrumentation/afl-compiler-rt.o.c
+++ b/instrumentation/afl-compiler-rt.o.c
@@ -20,7 +20,7 @@
 #include "config.h"
 #include "types.h"
 #include "cmplog.h"
-#include "llvm-ngram-coverage.h"
+#include "llvm-alternative-coverage.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -97,10 +97,12 @@ int __afl_selective_coverage_temp = 1;
 
 #if defined(__ANDROID__) || defined(__HAIKU__)
 PREV_LOC_T __afl_prev_loc[NGRAM_SIZE_MAX];
+PREV_LOC_T __afl_prev_caller[CTX_MAX_K];
 u32        __afl_prev_ctx;
 u32        __afl_cmp_counter;
 #else
 __thread PREV_LOC_T __afl_prev_loc[NGRAM_SIZE_MAX];
+__thread PREV_LOC_T __afl_prev_caller[CTX_MAX_K];
 __thread u32        __afl_prev_ctx;
 __thread u32        __afl_cmp_counter;
 #endif
@@ -313,18 +315,23 @@ static void __afl_map_shm(void) {
      early-stage __afl_area_initial region that is needed to allow some really
      hacky .init code to work correctly in projects such as OpenSSL. */
 
-  if (__afl_debug)
+  if (__afl_debug) {
+
     fprintf(stderr,
-            "DEBUG: id_str %s, __afl_area_ptr %p, __afl_area_initial %p, "
-            "__afl_map_addr 0x%llx, MAP_SIZE %u, __afl_final_loc %u, "
+            "DEBUG: (1) id_str %s, __afl_area_ptr %p, __afl_area_initial %p, "
+            "__afl_area_ptr_dummy 0x%p, __afl_map_addr 0x%llx, MAP_SIZE %u, "
+            "__afl_final_loc %u, "
             "max_size_forkserver %u/0x%x\n",
             id_str == NULL ? "<null>" : id_str, __afl_area_ptr,
-            __afl_area_initial, __afl_map_addr, MAP_SIZE, __afl_final_loc,
-            FS_OPT_MAX_MAPSIZE, FS_OPT_MAX_MAPSIZE);
+            __afl_area_initial, __afl_area_ptr_dummy, __afl_map_addr, MAP_SIZE,
+            __afl_final_loc, FS_OPT_MAX_MAPSIZE, FS_OPT_MAX_MAPSIZE);
+
+  }
 
   if (id_str) {
 
-    if (__afl_area_ptr && __afl_area_ptr != __afl_area_initial) {
+    if (__afl_area_ptr && __afl_area_ptr != __afl_area_initial &&
+        __afl_area_ptr != __afl_area_ptr_dummy) {
 
       if (__afl_map_addr) {
 
@@ -459,6 +466,19 @@ static void __afl_map_shm(void) {
 
   __afl_area_ptr_backup = __afl_area_ptr;
 
+  if (__afl_debug) {
+
+    fprintf(stderr,
+            "DEBUG: (2) id_str %s, __afl_area_ptr %p, __afl_area_initial %p, "
+            "__afl_area_ptr_dummy 0x%p, __afl_map_addr 0x%llx, MAP_SIZE "
+            "%u, __afl_final_loc %u, "
+            "max_size_forkserver %u/0x%x\n",
+            id_str == NULL ? "<null>" : id_str, __afl_area_ptr,
+            __afl_area_initial, __afl_area_ptr_dummy, __afl_map_addr, MAP_SIZE,
+            __afl_final_loc, FS_OPT_MAX_MAPSIZE, FS_OPT_MAX_MAPSIZE);
+
+  }
+
   if (__afl_selective_coverage) {
 
     if (__afl_map_size > MAP_INITIAL_SIZE) {
@@ -1097,11 +1117,14 @@ void __afl_manual_init(void) {
     __afl_sharedmem_fuzzing = 0;
     if (__afl_area_ptr == NULL) __afl_area_ptr = __afl_area_ptr_dummy;
 
-    if (__afl_debug)
+    if (__afl_debug) {
+
       fprintf(stderr,
               "DEBUG: disabled instrumentation because of "
               "AFL_DISABLE_LLVM_INSTRUMENTATION\n");
 
+    }
+
   }
 
   if (!init_done) {
@@ -1285,8 +1308,12 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
   if (__afl_already_initialized_forkserver &&
       __afl_final_loc + 1 + stop - start > __afl_map_size) {
 
-    if (__afl_debug)
-      fprintf(stderr, "Warning: new instrumneted code after the forkserver!\n");
+    if (__afl_debug) {
+
+      fprintf(stderr, "Warning: new instrumented code after the forkserver!\n");
+
+    }
+
     __afl_final_loc = 2;
 
     if (1 + stop - start > __afl_map_size) {
@@ -1771,7 +1798,7 @@ void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) {
 
     if (__afl_cmp_map->headers[k].shape < len) {
 
-      __afl_cmp_map->headers[k].shape = len;
+      __afl_cmp_map->headers[k].shape = len - 1;
 
     }
 
diff --git a/instrumentation/afl-llvm-common.cc b/instrumentation/afl-llvm-common.cc
index aa54f4f7..0fd3a011 100644
--- a/instrumentation/afl-llvm-common.cc
+++ b/instrumentation/afl-llvm-common.cc
@@ -62,13 +62,15 @@ bool isIgnoreFunction(const llvm::Function *F) {
       "sancov.",
       "__ubsan_",
       "ign.",
-      "__afl_",
+      "__afl",
       "_fini",
-      "__libc_csu",
+      "__libc_",
       "__asan",
       "__msan",
       "__cmplog",
       "__sancov",
+      "__cxx_",
+      "_GLOBAL",
       "msan.",
       "LLVMFuzzerM",
       "LLVMFuzzerC",
diff --git a/instrumentation/afl-llvm-pass.so.cc b/instrumentation/afl-llvm-pass.so.cc
index 33898aec..0f773aba 100644
--- a/instrumentation/afl-llvm-pass.so.cc
+++ b/instrumentation/afl-llvm-pass.so.cc
@@ -62,7 +62,7 @@ typedef long double max_align_t;
 #endif
 
 #include "afl-llvm-common.h"
-#include "llvm-ngram-coverage.h"
+#include "llvm-alternative-coverage.h"
 
 using namespace llvm;
 
@@ -82,6 +82,7 @@ class AFLCoverage : public ModulePass {
 
  protected:
   uint32_t ngram_size = 0;
+  uint32_t ctx_k = 0;
   uint32_t map_size = MAP_SIZE;
   uint32_t function_minimum_size = 1;
   char *   ctx_str = NULL, *caller_str = NULL, *skip_nozero = NULL;
@@ -183,12 +184,17 @@ bool AFLCoverage::runOnModule(Module &M) {
   skip_nozero = getenv("AFL_LLVM_SKIP_NEVERZERO");
 
   unsigned PrevLocSize = 0;
+  unsigned PrevCallerSize = 0;
 
   char *ngram_size_str = getenv("AFL_LLVM_NGRAM_SIZE");
   if (!ngram_size_str) ngram_size_str = getenv("AFL_NGRAM_SIZE");
+  char *ctx_k_str = getenv("AFL_LLVM_CTX_K");
+  if (!ctx_k_str) ctx_k_str = getenv("AFL_CTX_K");
   ctx_str = getenv("AFL_LLVM_CTX");
   caller_str = getenv("AFL_LLVM_CALLER");
 
+  bool instrument_ctx = ctx_str || caller_str;
+
 #ifdef AFL_HAVE_VECTOR_INTRINSICS
   /* Decide previous location vector size (must be a power of two) */
   VectorType *PrevLocTy = NULL;
@@ -205,6 +211,31 @@ bool AFLCoverage::runOnModule(Module &M) {
   if (ngram_size)
     PrevLocSize = ngram_size - 1;
   else
+    PrevLocSize = 1;
+
+  /* Decide K-ctx vector size (must be a power of two) */
+  VectorType *PrevCallerTy = NULL;
+
+  if (ctx_k_str)
+    if (sscanf(ctx_k_str, "%u", &ctx_k) != 1 || ctx_k < 1 || ctx_k > CTX_MAX_K)
+      FATAL("Bad value of AFL_CTX_K (must be between 1 and CTX_MAX_K (%u))",
+            CTX_MAX_K);
+
+  if (ctx_k == 1) {
+
+    ctx_k = 0;
+    instrument_ctx = true;
+    caller_str = ctx_k_str;  // Enable CALLER instead
+
+  }
+
+  if (ctx_k) {
+
+    PrevCallerSize = ctx_k;
+    instrument_ctx = true;
+
+  }
+
 #else
   if (ngram_size_str)
   #ifndef LLVM_VERSION_PATCH
@@ -218,8 +249,20 @@ bool AFLCoverage::runOnModule(Module &M) {
         "%d.%d.%d!",
         LLVM_VERSION_MAJOR, LLVM_VERSION_MINOR, LLVM_VERSION_PATCH);
   #endif
+  if (ctx_k_str)
+  #ifndef LLVM_VERSION_PATCH
+    FATAL(
+        "Sorry, K-CTX branch coverage is not supported with llvm version "
+        "%d.%d.%d!",
+        LLVM_VERSION_MAJOR, LLVM_VERSION_MINOR, 0);
+  #else
+    FATAL(
+        "Sorry, K-CTX branch coverage is not supported with llvm version "
+        "%d.%d.%d!",
+        LLVM_VERSION_MAJOR, LLVM_VERSION_MINOR, LLVM_VERSION_PATCH);
+  #endif
+  PrevLocSize = 1;
 #endif
-    PrevLocSize = 1;
 
 #ifdef AFL_HAVE_VECTOR_INTRINSICS
   int PrevLocVecSize = PowerOf2Ceil(PrevLocSize);
@@ -232,6 +275,17 @@ bool AFLCoverage::runOnModule(Module &M) {
     );
 #endif
 
+#ifdef AFL_HAVE_VECTOR_INTRINSICS
+  int PrevCallerVecSize = PowerOf2Ceil(PrevCallerSize);
+  if (ctx_k)
+    PrevCallerTy = VectorType::get(IntLocTy, PrevCallerVecSize
+  #if LLVM_VERSION_MAJOR >= 12
+                                   ,
+                                   false
+  #endif
+    );
+#endif
+
   /* Get globals for the SHM region and the previous location. Note that
      __afl_prev_loc is thread-local. */
 
@@ -239,6 +293,7 @@ bool AFLCoverage::runOnModule(Module &M) {
       new GlobalVariable(M, PointerType::get(Int8Ty, 0), false,
                          GlobalValue::ExternalLinkage, 0, "__afl_area_ptr");
   GlobalVariable *AFLPrevLoc;
+  GlobalVariable *AFLPrevCaller;
   GlobalVariable *AFLContext = NULL;
 
   if (ctx_str || caller_str)
@@ -276,6 +331,31 @@ bool AFLCoverage::runOnModule(Module &M) {
 #endif
 
 #ifdef AFL_HAVE_VECTOR_INTRINSICS
+  if (ctx_k)
+  #if defined(__ANDROID__) || defined(__HAIKU__)
+    AFLPrevCaller = new GlobalVariable(
+        M, PrevCallerTy, /* isConstant */ false, GlobalValue::ExternalLinkage,
+        /* Initializer */ nullptr, "__afl_prev_caller");
+  #else
+    AFLPrevCaller = new GlobalVariable(
+        M, PrevCallerTy, /* isConstant */ false, GlobalValue::ExternalLinkage,
+        /* Initializer */ nullptr, "__afl_prev_caller",
+        /* InsertBefore */ nullptr, GlobalVariable::GeneralDynamicTLSModel,
+        /* AddressSpace */ 0, /* IsExternallyInitialized */ false);
+  #endif
+  else
+#endif
+#if defined(__ANDROID__) || defined(__HAIKU__)
+    AFLPrevCaller =
+        new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, 0,
+                           "__afl_prev_caller");
+#else
+  AFLPrevCaller = new GlobalVariable(
+      M, Int32Ty, false, GlobalValue::ExternalLinkage, 0, "__afl_prev_caller",
+      0, GlobalVariable::GeneralDynamicTLSModel, 0, false);
+#endif
+
+#ifdef AFL_HAVE_VECTOR_INTRINSICS
   /* Create the vector shuffle mask for updating the previous block history.
      Note that the first element of the vector will store cur_loc, so just set
      it to undef to allow the optimizer to do its thing. */
@@ -289,13 +369,30 @@ bool AFLCoverage::runOnModule(Module &M) {
     PrevLocShuffle.push_back(ConstantInt::get(Int32Ty, PrevLocSize));
 
   Constant *PrevLocShuffleMask = ConstantVector::get(PrevLocShuffle);
+
+  Constant *                  PrevCallerShuffleMask = NULL;
+  SmallVector<Constant *, 32> PrevCallerShuffle = {UndefValue::get(Int32Ty)};
+
+  if (ctx_k) {
+
+    for (unsigned I = 0; I < PrevCallerSize - 1; ++I)
+      PrevCallerShuffle.push_back(ConstantInt::get(Int32Ty, I));
+
+    for (int I = PrevCallerSize; I < PrevCallerVecSize; ++I)
+      PrevCallerShuffle.push_back(ConstantInt::get(Int32Ty, PrevCallerSize));
+
+    PrevCallerShuffleMask = ConstantVector::get(PrevCallerShuffle);
+
+  }
+
 #endif
 
   // other constants we need
   ConstantInt *Zero = ConstantInt::get(Int8Ty, 0);
   ConstantInt *One = ConstantInt::get(Int8Ty, 1);
 
-  LoadInst *PrevCtx = NULL;  // CTX sensitive coverage
+  Value *   PrevCtx = NULL;     // CTX sensitive coverage
+  LoadInst *PrevCaller = NULL;  // K-CTX coverage
 
   /* Instrument all the things! */
 
@@ -319,12 +416,30 @@ bool AFLCoverage::runOnModule(Module &M) {
       IRBuilder<>          IRB(&(*IP));
 
       // Context sensitive coverage
-      if ((ctx_str || caller_str) && &BB == &F.getEntryBlock()) {
+      if (instrument_ctx && &BB == &F.getEntryBlock()) {
+
+#ifdef AFL_HAVE_VECTOR_INTRINSICS
+        if (ctx_k) {
+
+          PrevCaller = IRB.CreateLoad(AFLPrevCaller);
+          PrevCaller->setMetadata(M.getMDKindID("nosanitize"),
+                                  MDNode::get(C, None));
+          PrevCtx =
+              IRB.CreateZExt(IRB.CreateXorReduce(PrevCaller), IRB.getInt32Ty());
+
+        } else
+
+#endif
+        {
+
+          // load the context ID of the previous function and write to to a
+          // local variable on the stack
+          LoadInst *PrevCtxLoad = IRB.CreateLoad(AFLContext);
+          PrevCtxLoad->setMetadata(M.getMDKindID("nosanitize"),
+                                   MDNode::get(C, None));
+          PrevCtx = PrevCtxLoad;
 
-        // load the context ID of the previous function and write to to a local
-        // variable on the stack
-        PrevCtx = IRB.CreateLoad(AFLContext);
-        PrevCtx->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None));
+        }
 
         // does the function have calls? and is any of the calls larger than one
         // basic block?
@@ -356,10 +471,31 @@ bool AFLCoverage::runOnModule(Module &M) {
         if (has_calls) {
 
           Value *NewCtx = ConstantInt::get(Int32Ty, AFL_R(map_size));
-          if (ctx_str) NewCtx = IRB.CreateXor(PrevCtx, NewCtx);
-          StoreInst *StoreCtx = IRB.CreateStore(NewCtx, AFLContext);
-          StoreCtx->setMetadata(M.getMDKindID("nosanitize"),
-                                MDNode::get(C, None));
+#ifdef AFL_HAVE_VECTOR_INTRINSICS
+          if (ctx_k) {
+
+            Value *ShuffledPrevCaller = IRB.CreateShuffleVector(
+                PrevCaller, UndefValue::get(PrevCallerTy),
+                PrevCallerShuffleMask);
+            Value *UpdatedPrevCaller = IRB.CreateInsertElement(
+                ShuffledPrevCaller, NewCtx, (uint64_t)0);
+
+            StoreInst *Store =
+                IRB.CreateStore(UpdatedPrevCaller, AFLPrevCaller);
+            Store->setMetadata(M.getMDKindID("nosanitize"),
+                               MDNode::get(C, None));
+
+          } else
+
+#endif
+          {
+
+            if (ctx_str) NewCtx = IRB.CreateXor(PrevCtx, NewCtx);
+            StoreInst *StoreCtx = IRB.CreateStore(NewCtx, AFLContext);
+            StoreCtx->setMetadata(M.getMDKindID("nosanitize"),
+                                  MDNode::get(C, None));
+
+          }
 
         }
 
@@ -413,13 +549,20 @@ bool AFLCoverage::runOnModule(Module &M) {
 
         // in CTX mode we have to restore the original context for the caller -
         // she might be calling other functions which need the correct CTX
-        if ((ctx_str || caller_str) && has_calls) {
+        if (instrument_ctx && has_calls) {
 
           Instruction *Inst = BB.getTerminator();
           if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst)) {
 
             IRBuilder<> Post_IRB(Inst);
-            StoreInst * RestoreCtx = Post_IRB.CreateStore(PrevCtx, AFLContext);
+
+            StoreInst *RestoreCtx;
+  #ifdef AFL_HAVE_VECTOR_INTRINSICS
+            if (ctx_k)
+              RestoreCtx = IRB.CreateStore(PrevCaller, AFLPrevCaller);
+            else
+  #endif
+              RestoreCtx = Post_IRB.CreateStore(PrevCtx, AFLContext);
             RestoreCtx->setMetadata(M.getMDKindID("nosanitize"),
                                     MDNode::get(C, None));
 
@@ -460,7 +603,7 @@ bool AFLCoverage::runOnModule(Module &M) {
 #endif
         PrevLocTrans = PrevLoc;
 
-      if (ctx_str || caller_str)
+      if (instrument_ctx)
         PrevLocTrans =
             IRB.CreateZExt(IRB.CreateXor(PrevLocTrans, PrevCtx), Int32Ty);
       else
@@ -547,13 +690,20 @@ bool AFLCoverage::runOnModule(Module &M) {
       // in CTX mode we have to restore the original context for the caller -
       // she might be calling other functions which need the correct CTX.
       // Currently this is only needed for the Ubuntu clang-6.0 bug
-      if ((ctx_str || caller_str) && has_calls) {
+      if (instrument_ctx && has_calls) {
 
         Instruction *Inst = BB.getTerminator();
         if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst)) {
 
           IRBuilder<> Post_IRB(Inst);
-          StoreInst * RestoreCtx = Post_IRB.CreateStore(PrevCtx, AFLContext);
+
+          StoreInst *RestoreCtx;
+#ifdef AFL_HAVE_VECTOR_INTRINSICS
+          if (ctx_k)
+            RestoreCtx = IRB.CreateStore(PrevCaller, AFLPrevCaller);
+          else
+#endif
+            RestoreCtx = Post_IRB.CreateStore(PrevCtx, AFLContext);
           RestoreCtx->setMetadata(M.getMDKindID("nosanitize"),
                                   MDNode::get(C, None));
 
diff --git a/instrumentation/llvm-ngram-coverage.h b/instrumentation/llvm-alternative-coverage.h
index 666839c8..0d7b3957 100644
--- a/instrumentation/llvm-ngram-coverage.h
+++ b/instrumentation/llvm-alternative-coverage.h
@@ -14,5 +14,8 @@ typedef u64 PREV_LOC_T;
 /* Maximum ngram size */
 #define NGRAM_SIZE_MAX 16U
 
+/* Maximum K for top-K context sensitivity */
+#define CTX_MAX_K 32U
+
 #endif