-rw-r--r--  .gitignore | 6
-rw-r--r--  .gitmodules | 6
-rw-r--r--  .travis.yml | 1
-rw-r--r--  Android.bp | 6
-rw-r--r--  GNUmakefile | 231
-rw-r--r--  GNUmakefile.gcc_plugin (renamed from gcc_plugin/GNUmakefile) | 98
-rw-r--r--  GNUmakefile.llvm (renamed from llvm_mode/GNUmakefile) | 212
-rw-r--r--  README.md | 176
-rw-r--r--  TODO.md | 2
-rwxr-xr-x  afl-whatsup | 8
-rw-r--r--  custom_mutators/README.md | 16
-rw-r--r--  docs/Changelog.md | 38
-rw-r--r--  docs/FAQ.md | 12
-rw-r--r--  docs/INSTALL.md | 19
-rw-r--r--  docs/env_variables.md | 121
-rw-r--r--  docs/ideas.md | 57
-rw-r--r--  docs/life_pro_tips.md | 4
-rw-r--r--  docs/perf_tips.md | 8
-rw-r--r--  docs/sister_projects.md | 4
-rw-r--r--  docs/status_screen.md | 2
-rw-r--r--  examples/README.md | 2
-rw-r--r--  examples/aflpp_driver/aflpp_driver.c | 2
-rw-r--r--  examples/aflpp_driver/aflpp_driver_test.c | 13
-rw-r--r--  gcc_plugin/Makefile | 159
-rw-r--r--  gcc_plugin/README.instrument_list.md | 73
-rw-r--r--  gcc_plugin/afl-gcc-fast.c | 406
-rw-r--r--  gcc_plugin/afl-gcc-rt.o.c | 315
-rw-r--r--  include/afl-fuzz.h | 10
-rw-r--r--  include/common.h | 6
-rw-r--r--  include/config.h | 4
-rw-r--r--  include/envs.h | 5
-rw-r--r--  include/xxh3.h | 3187
-rw-r--r--  include/xxhash.h | 3721
-rw-r--r--  instrumentation/LLVMInsTrim.so.cc (renamed from llvm_mode/LLVMInsTrim.so.cc) | 0
-rw-r--r--  instrumentation/MarkNodes.cc (renamed from llvm_mode/MarkNodes.cc) | 0
-rw-r--r--  instrumentation/MarkNodes.h (renamed from llvm_mode/MarkNodes.h) | 0
-rw-r--r--  instrumentation/README.cmplog.md (renamed from llvm_mode/README.cmplog.md) | 4
-rw-r--r--  instrumentation/README.ctx.md (renamed from llvm_mode/README.ctx.md) | 2
-rw-r--r--  instrumentation/README.gcc_plugin.md (renamed from gcc_plugin/README.md) | 2
-rw-r--r--  instrumentation/README.instrim.md | 30
-rw-r--r--  instrumentation/README.instrument_list.md (renamed from llvm_mode/README.instrument_list.md) | 37
-rw-r--r--  instrumentation/README.laf-intel.md (renamed from llvm_mode/README.laf-intel.md) | 18
-rw-r--r--  instrumentation/README.llvm.md (renamed from llvm_mode/README.md) | 44
-rw-r--r--  instrumentation/README.lto.md (renamed from llvm_mode/README.lto.md) | 33
-rw-r--r--  instrumentation/README.neverzero.md (renamed from llvm_mode/README.neverzero.md) | 4
-rw-r--r--  instrumentation/README.ngram.md (renamed from llvm_mode/README.ngram.md) | 0
-rw-r--r--  instrumentation/README.persistent_mode.md (renamed from llvm_mode/README.persistent_mode.md) | 14
-rw-r--r--  instrumentation/README.snapshot.md (renamed from llvm_mode/README.snapshot.md) | 2
-rw-r--r--  instrumentation/SanitizerCoverageLTO.so.cc (renamed from llvm_mode/SanitizerCoverageLTO.so.cc) | 0
-rw-r--r--  instrumentation/afl-compiler-rt.o.c (renamed from llvm_mode/afl-llvm-rt.o.c) | 30
-rw-r--r--  instrumentation/afl-gcc-pass.so.cc (renamed from gcc_plugin/afl-gcc-pass.so.cc) | 0
-rw-r--r--  instrumentation/afl-llvm-common.cc (renamed from llvm_mode/afl-llvm-common.cc) | 0
-rw-r--r--  instrumentation/afl-llvm-common.h (renamed from llvm_mode/afl-llvm-common.h) | 0
-rw-r--r--  instrumentation/afl-llvm-dict2file.so.cc | 599
-rw-r--r--  instrumentation/afl-llvm-lto-instrumentation.so.cc (renamed from llvm_mode/afl-llvm-lto-instrumentation.so.cc) | 0
-rw-r--r--  instrumentation/afl-llvm-lto-instrumentlist.so.cc (renamed from llvm_mode/afl-llvm-lto-instrumentlist.so.cc) | 0
-rw-r--r--  instrumentation/afl-llvm-pass.so.cc (renamed from llvm_mode/afl-llvm-pass.so.cc) | 0
-rw-r--r--  instrumentation/afl-llvm-rt-lto.o.c (renamed from llvm_mode/afl-llvm-rt-lto.o.c) | 0
-rw-r--r--  instrumentation/cmplog-instructions-pass.cc (renamed from llvm_mode/cmplog-instructions-pass.cc) | 0
-rw-r--r--  instrumentation/cmplog-routines-pass.cc (renamed from llvm_mode/cmplog-routines-pass.cc) | 0
-rw-r--r--  instrumentation/compare-transform-pass.so.cc (renamed from llvm_mode/compare-transform-pass.so.cc) | 0
-rw-r--r--  instrumentation/llvm-ngram-coverage.h (renamed from llvm_mode/llvm-ngram-coverage.h) | 0
-rw-r--r--  instrumentation/split-compares-pass.so.cc (renamed from llvm_mode/split-compares-pass.so.cc) | 6
-rw-r--r--  instrumentation/split-switches-pass.so.cc (renamed from llvm_mode/split-switches-pass.so.cc) | 0
-rw-r--r--  llvm_mode/Makefile | 2
-rw-r--r--  llvm_mode/README.instrim.md | 25
-rw-r--r--  llvm_mode/afl-clang-fast.c | 1143
-rw-r--r--  qemu_mode/patches/afl-qemu-cpu-inl.h | 2
-rw-r--r--  src/afl-cc.c | 1544
-rw-r--r--  src/afl-common.c | 33
-rw-r--r--  src/afl-forkserver.c | 31
-rw-r--r--  src/afl-fuzz-extras.c | 113
-rw-r--r--  src/afl-fuzz-init.c | 124
-rw-r--r--  src/afl-fuzz-one.c | 31
-rw-r--r--  src/afl-fuzz-queue.c | 23
-rw-r--r--  src/afl-fuzz-redqueen.c | 23
-rw-r--r--  src/afl-fuzz-stats.c | 89
-rw-r--r--  src/afl-fuzz.c | 38
-rw-r--r--  src/afl-gcc.c | 488
-rw-r--r--  src/afl-ld-lto.c (renamed from llvm_mode/afl-ld-lto.c) | 0
-rw-r--r--  src/afl-performance.c | 5
-rwxr-xr-x  test/test-gcc-plugin.sh | 2
-rwxr-xr-x  test/test-unittests.sh | 2

83 files changed, 6671 insertions, 6802 deletions
diff --git a/.gitignore b/.gitignore
index 0527a0b2..e3adb6ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,12 @@ afl-showmap.8
 afl-system-config.8
 afl-tmin.8
 afl-whatsup.8
+afl-c++
+afl-cc
+afl-lto
+afl-lto++
+afl-lto++.8
+afl-lto.8
 qemu_mode/libcompcov/compcovtest
 qemu_mode/qemu-*
 unicorn_mode/samples/*/\.test-*
diff --git a/.gitmodules b/.gitmodules
index 80752342..a9c181da 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,7 @@
 [submodule "unicorn_mode/unicornafl"]
 	path = unicorn_mode/unicornafl
-	url = https://github.com/AFLplusplus/unicornafl.git
+	url = https://github.com/AFLplusplus/unicornafl
+
+[submodule "custom_mutators/Grammar-Mutator"]
+	path = custom_mutators/Grammar-Mutator
+	url = https://github.com/AFLplusplus/Grammar-Mutator
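
Note: the new Grammar-Mutator submodule is not fetched by a plain clone; assuming a standard git setup, pulling it in looks like this (usage sketch, not part of the patch):

    git clone https://github.com/AFLplusplus/AFLplusplus
    cd AFLplusplus
    git submodule update --init custom_mutators/Grammar-Mutator
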
diff --git a/.travis.yml b/.travis.yml
index e23a84de..8dffc213 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,7 @@ branches:
   only:
     - stable
     - dev
+    - llvm_merge
 
 matrix:
   include:
diff --git a/Android.bp b/Android.bp
index e59129db..2c2114b2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -101,7 +101,7 @@ cc_binary_host {
   ],
 
   srcs: [
-    "llvm_mode/afl-clang-fast.c",
+    "src/afl-cc.c",
   ],
 }
 
@@ -119,7 +119,7 @@ cc_binary_host {
   ],
 
   srcs: [
-    "llvm_mode/afl-clang-fast.c",
+    "src/afl-cc.c",
   ],
 }
 
@@ -136,6 +136,6 @@ cc_library_static {
   ],
 
   srcs: [
-    "llvm_mode/afl-llvm-rt.o.c",
+    "instrumentation/afl-llvm-rt.o.c",
   ],
 }
diff --git a/GNUmakefile b/GNUmakefile
index 61f0ca55..7455483c 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -24,30 +24,31 @@ BIN_PATH    = $(PREFIX)/bin
 HELPER_PATH = $(PREFIX)/lib/afl
 DOC_PATH    = $(PREFIX)/share/doc/afl
 MISC_PATH   = $(PREFIX)/share/afl
-MAN_PATH    = $(PREFIX)/share/man/man8
+MAN_PATH    = $(PREFIX)/man/man8
 
 PROGNAME    = afl
 VERSION     = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f2)
 
 # PROGS intentionally omit afl-as, which gets installed elsewhere.
 
-PROGS       = afl-gcc afl-g++ afl-fuzz afl-showmap afl-tmin afl-gotcpu afl-analyze
+PROGS       = afl-fuzz afl-showmap afl-tmin afl-gotcpu afl-analyze
 SH_PROGS    = afl-plot afl-cmin afl-cmin.bash afl-whatsup afl-system-config
 MANPAGES=$(foreach p, $(PROGS) $(SH_PROGS), $(p).8) afl-as.8
+ASAN_OPTIONS=detect_leaks=0
 
 ifeq "$(findstring android, $(shell $(CC) --version 2>/dev/null))" ""
- ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto=full -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
+ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto=full -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
 	CFLAGS_FLTO ?= -flto=full
- else
-  ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto=thin -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
+else
+ ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto=thin -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
 	CFLAGS_FLTO ?= -flto=thin
-  else
-   ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
+ else
+  ifeq "$(shell echo 'int main() {return 0; }' | $(CC) $(CFLAGS) -Werror -x c - -flto -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
 	CFLAGS_FLTO ?= -flto
-   endif
   endif
  endif
 endif
+endif
 
 ifeq "$(shell echo 'int main() {return 0; }' | $(CC) -fno-move-loop-invariants -fdisable-tree-cunrolli -x c - -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
 	SPECIAL_PERFORMANCE += -fno-move-loop-invariants -fdisable-tree-cunrolli
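
Note: the re-indented block above is the usual Make compile-probe idiom: feed a one-line program to $(CC) with a candidate flag and keep the flag only if the compile succeeds. A minimal standalone sketch of the same idiom, assuming GNU make and a POSIX shell:

    # probe for -flto=full support; .test is a throwaway output file
    HAVE_FLTO := $(shell echo 'int main() {return 0; }' | \
        $(CC) -Werror -x c - -flto=full -o .test 2>/dev/null && echo 1 || echo 0; rm -f .test)
    ifeq "$(HAVE_FLTO)" "1"
      CFLAGS_FLTO ?= -flto=full
    endif
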
@@ -61,10 +62,7 @@ ifneq "$(shell uname)" "Darwin"
    endif
  endif
  # OS X does not like _FORTIFY_SOURCE=2
- # _FORTIFY_SOURCE=2 does not like -O0
- ifndef DEBUG
-  CFLAGS_OPT += -D_FORTIFY_SOURCE=2
- endif
+ CFLAGS_OPT += -D_FORTIFY_SOURCE=2
 endif
 
 ifeq "$(shell uname)" "SunOS"
@@ -206,10 +204,7 @@ else
 endif
 
 ifneq "$(filter Linux GNU%,$(shell uname))" ""
- # _FORTIFY_SOURCE=2 does not like -O0
- ifndef DEBUG
   override CFLAGS += -D_FORTIFY_SOURCE=2
- endif
   LDFLAGS += -ldl -lrt
 endif
 
@@ -223,11 +218,7 @@ ifneq "$(findstring NetBSD, $(shell uname))" ""
   LDFLAGS += -lpthread
 endif
 
-ifeq "$(findstring clang, $(shell $(CC) --version 2>/dev/null))" ""
-  TEST_CC  = afl-gcc
-else
-  TEST_CC  = afl-clang
-endif
+TEST_CC = afl-gcc
 
 COMM_HDR    = include/alloc-inl.h include/config.h include/debug.h include/types.h
 
@@ -277,28 +268,47 @@ ifdef TEST_MMAP
 	LDFLAGS += -Wno-deprecated-declarations
 endif
 
-all:	test_x86 test_shm test_python ready $(PROGS) afl-as test_build all_done
+.PHONY: all
+all:	test_x86 test_shm test_python ready $(PROGS) afl-as llvm gcc_plugin test_build all_done
+
+.PHONY: llvm
+llvm:
+	-$(MAKE) -f GNUmakefile.llvm
+	@test -e afl-cc || { echo "[-] Compiling afl-cc failed. You seem not to have a working compiler." ; exit 1; }
 
-man:    afl-gcc all $(MANPAGES)
+.PHONY: gcc_plugin
+gcc_plugin:
+	-$(MAKE) -f GNUmakefile.gcc_plugin
 
+.PHONY: man
+man:    $(MANPAGES)
+
+.PHONY: test
+test:	tests
+
+.PHONY: tests
 tests:	source-only
 	@cd test ; ./test-all.sh
 	@rm -f test/errors
 
+.PHONY: performance-tests
 performance-tests:	performance-test
+.PHONY: test-performance
 test-performance:	performance-test
 
+.PHONY: performance-test
 performance-test:	source-only
 	@cd test ; ./test-performance.sh
 
 
 # hint: make targets are also listed in the top level README.md
+.PHONY: help
 help:
 	@echo "HELP --- the following make targets exist:"
 	@echo "=========================================="
 	@echo "all: just the main afl++ binaries"
 	@echo "binary-only: everything for binary-only fuzzing: qemu_mode, unicorn_mode, libdislocator, libtokencap"
-	@echo "source-only: everything for source code fuzzing: llvm_mode, gcc_plugin, libdislocator, libtokencap"
+	@echo "source-only: everything for source code fuzzing: gcc_plugin, libdislocator, libtokencap"
 	@echo "distrib: everything (for both binary-only and source code fuzzing)"
 	@echo "man: creates simple man pages from the help option of the programs"
 	@echo "install: installs everything you have compiled with the build option above"
@@ -322,8 +332,8 @@ help:
 	@echo "=========================================="
 	@echo e.g.: make ASAN_BUILD=1
 
+.PHONY: test_x86
 ifndef AFL_NO_X86
-
 test_x86:
 	@echo "[*] Checking for the default compiler cc..."
 	@type $(CC) >/dev/null || ( echo; echo "Oops, looks like there is no compiler '"$(CC)"' in your path."; echo; echo "Don't panic! You can restart with '"$(_)" CC=<yourCcompiler>'."; echo; exit 1 )
@@ -332,148 +342,129 @@ test_x86:
 	@echo "[*] Checking for the ability to compile x86 code..."
 	@echo 'main() { __asm__("xorb %al, %al"); }' | $(CC) $(CFLAGS) -w -x c - -o .test1 || ( echo; echo "Oops, looks like your compiler can't generate x86 code."; echo; echo "Don't panic! You can use the LLVM or QEMU mode, but see docs/INSTALL first."; echo "(To ignore this error, set AFL_NO_X86=1 and try again.)"; echo; exit 1 )
 	@rm -f .test1
-
 else
-
 test_x86:
 	@echo "[!] Note: skipping x86 compilation checks (AFL_NO_X86 set)."
-
 endif
 
-
+.PHONY: test_shm
 ifeq "$(SHMAT_OK)" "1"
-
 test_shm:
 	@echo "[+] shmat seems to be working."
 	@rm -f .test2
-
 else
-
 test_shm:
 	@echo "[-] shmat seems not to be working, switching to mmap implementation"
-
 endif
 
-
+.PHONY: test_python
 ifeq "$(PYTHON_OK)" "1"
-
 test_python:
 	@rm -f .test 2> /dev/null
 	@echo "[+] $(PYTHON_VERSION) support seems to be working."
-
 else
-
 test_python:
 	@echo "[-] You seem to need to install the package python3-dev, python2-dev or python-dev (and perhaps python[23]-apt), but it is optional so we continue"
-
 endif
 
-
+.PHONY: ready
 ready:
 	@echo "[+] Everything seems to be working, ready to compile."
 
-afl-g++: afl-gcc
-
-afl-gcc: src/afl-gcc.c $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(CPPFLAGS) src/$@.c -o $@ $(LDFLAGS)
-	set -e; for i in afl-g++ afl-clang afl-clang++; do ln -sf afl-gcc $$i; done
-
 afl-as: src/afl-as.c include/afl-as.h $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(CPPFLAGS) src/$@.c -o $@ $(LDFLAGS)
-	ln -sf afl-as as
+	$(CC) $(CFLAGS) src/$@.c -o $@ $(LDFLAGS)
+	@ln -sf afl-as as
 
 src/afl-performance.o : $(COMM_HDR) src/afl-performance.c include/hash.h
-	$(CC) $(CFLAGS) $(CPPFLAGS) -Iinclude $(SPECIAL_PERFORMANCE) -O3 -fno-unroll-loops -c src/afl-performance.c -o src/afl-performance.o
+	$(CC) -Iinclude $(SPECIAL_PERFORMANCE) -O3 -fno-unroll-loops -c src/afl-performance.c -o src/afl-performance.o
 
 src/afl-common.o : $(COMM_HDR) src/afl-common.c include/common.h
-	$(CC) $(CFLAGS) $(CFLAGS_FLTO) $(CPPFLAGS) -c src/afl-common.c -o src/afl-common.o
+	$(CC) $(CFLAGS) $(CFLAGS_FLTO) -c src/afl-common.c -o src/afl-common.o
 
 src/afl-forkserver.o : $(COMM_HDR) src/afl-forkserver.c include/forkserver.h
-	$(CC) $(CFLAGS) $(CFLAGS_FLTO) $(CPPFLAGS) -c src/afl-forkserver.c -o src/afl-forkserver.o
+	$(CC) $(CFLAGS) $(CFLAGS_FLTO) -c src/afl-forkserver.c -o src/afl-forkserver.o
 
 src/afl-sharedmem.o : $(COMM_HDR) src/afl-sharedmem.c include/sharedmem.h
-	$(CC) $(CFLAGS) $(CFLAGS_FLTO) $(CPPFLAGS) -c src/afl-sharedmem.c -o src/afl-sharedmem.o
+	$(CC) $(CFLAGS) $(CFLAGS_FLTO) -c src/afl-sharedmem.c -o src/afl-sharedmem.o
 
 afl-fuzz: $(COMM_HDR) include/afl-fuzz.h $(AFL_FUZZ_FILES) src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o | test_x86
-	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(AFL_FUZZ_FILES) $(CPPFLAGS) src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o -o $@ $(PYFLAGS) $(LDFLAGS)
+	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(AFL_FUZZ_FILES) src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o -o $@ $(PYFLAGS) $(LDFLAGS)
 
 afl-showmap: src/afl-showmap.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(CPPFLAGS) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o -o $@ $(LDFLAGS)
+	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o -o $@ $(LDFLAGS)
 
 afl-tmin: src/afl-tmin.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(CPPFLAGS) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o -o $@ $(LDFLAGS)
+	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.o src/afl-performance.o -o $@ $(LDFLAGS)
 
 afl-analyze: src/afl-analyze.c src/afl-common.o src/afl-sharedmem.o src/afl-performance.o $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(CPPFLAGS) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-performance.o -o $@ $(LDFLAGS)
+	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) src/$@.c src/afl-common.o src/afl-sharedmem.o src/afl-performance.o -o $@ $(LDFLAGS)
 
 afl-gotcpu: src/afl-gotcpu.c src/afl-common.o $(COMM_HDR) | test_x86
-	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) $(CPPFLAGS) src/$@.c src/afl-common.o -o $@ $(LDFLAGS)
+	$(CC) $(CFLAGS) $(COMPILE_STATIC) $(CFLAGS_FLTO) src/$@.c src/afl-common.o -o $@ $(LDFLAGS)
 
+.PHONY: document
+document:	afl-fuzz-document
 
 # document all mutations and only do one run (use with only one input file!)
-document: $(COMM_HDR) include/afl-fuzz.h $(AFL_FUZZ_FILES) src/afl-common.o src/afl-sharedmem.o src/afl-performance.o | test_x86
-	$(CC) -D_DEBUG=\"1\" -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS) $(CFLAGS_FLTO) $(AFL_FUZZ_FILES) $(CPPFLAGS) src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.c src/afl-performance.o -o afl-fuzz-document $(PYFLAGS) $(LDFLAGS)
+afl-fuzz-document: $(COMM_HDR) include/afl-fuzz.h $(AFL_FUZZ_FILES) src/afl-common.o src/afl-sharedmem.o src/afl-performance.o | test_x86
+	$(CC) -D_DEBUG=\"1\" -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS) $(CFLAGS_FLTO) $(AFL_FUZZ_FILES) src/afl-common.o src/afl-sharedmem.o src/afl-forkserver.c src/afl-performance.o -o afl-fuzz-document $(PYFLAGS) $(LDFLAGS)
 
 test/unittests/unit_maybe_alloc.o : $(COMM_HDR) include/alloc-inl.h test/unittests/unit_maybe_alloc.c $(AFL_FUZZ_FILES)
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -c test/unittests/unit_maybe_alloc.c -o test/unittests/unit_maybe_alloc.o
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -c test/unittests/unit_maybe_alloc.c -o test/unittests/unit_maybe_alloc.o
 
 unit_maybe_alloc: test/unittests/unit_maybe_alloc.o
-	@$(CC) $(CFLAGS) $(CPPFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_maybe_alloc.o -o test/unittests/unit_maybe_alloc $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
+	@$(CC) $(CFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_maybe_alloc.o -o test/unittests/unit_maybe_alloc $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
 	./test/unittests/unit_maybe_alloc
 
 test/unittests/unit_hash.o : $(COMM_HDR) include/alloc-inl.h test/unittests/unit_hash.c $(AFL_FUZZ_FILES) src/afl-performance.o
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -c test/unittests/unit_hash.c -o test/unittests/unit_hash.o
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -c test/unittests/unit_hash.c -o test/unittests/unit_hash.o
 
 unit_hash: test/unittests/unit_hash.o src/afl-performance.o
-	@$(CC) $(CFLAGS) $(CPPFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf $^ -o test/unittests/unit_hash $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
+	@$(CC) $(CFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf $^ -o test/unittests/unit_hash $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
 	./test/unittests/unit_hash
 
 test/unittests/unit_rand.o : $(COMM_HDR) include/alloc-inl.h test/unittests/unit_rand.c $(AFL_FUZZ_FILES) src/afl-performance.o
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -c test/unittests/unit_rand.c -o test/unittests/unit_rand.o
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -c test/unittests/unit_rand.c -o test/unittests/unit_rand.o
 
 unit_rand: test/unittests/unit_rand.o src/afl-common.o src/afl-performance.o
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf $^ -o test/unittests/unit_rand  $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf $^ -o test/unittests/unit_rand  $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
 	./test/unittests/unit_rand
 
 test/unittests/unit_list.o : $(COMM_HDR) include/list.h test/unittests/unit_list.c $(AFL_FUZZ_FILES)
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -c test/unittests/unit_list.c -o test/unittests/unit_list.o
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -c test/unittests/unit_list.c -o test/unittests/unit_list.o
 
 unit_list: test/unittests/unit_list.o
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_list.o -o test/unittests/unit_list  $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_list.o -o test/unittests/unit_list  $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
 	./test/unittests/unit_list
 
 test/unittests/unit_preallocable.o : $(COMM_HDR) include/alloc-inl.h test/unittests/unit_preallocable.c $(AFL_FUZZ_FILES)
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -c test/unittests/unit_preallocable.c -o test/unittests/unit_preallocable.o
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -c test/unittests/unit_preallocable.c -o test/unittests/unit_preallocable.o
 
 unit_preallocable: test/unittests/unit_preallocable.o
-	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) $(CPPFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_preallocable.o -o test/unittests/unit_preallocable $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
+	@$(CC) $(CFLAGS) $(ASAN_CFLAGS) -Wl,--wrap=exit -Wl,--wrap=printf test/unittests/unit_preallocable.o -o test/unittests/unit_preallocable $(LDFLAGS) $(ASAN_LDFLAGS) -lcmocka
 	./test/unittests/unit_preallocable
 
+.PHONY: unit_clean
 unit_clean:
 	@rm -f ./test/unittests/unit_preallocable ./test/unittests/unit_list ./test/unittests/unit_maybe_alloc test/unittests/*.o
 
+.PHONY: unit
 ifneq "$(shell uname)" "Darwin"
-
-unit: unit_maybe_alloc unit_preallocable unit_list unit_clean unit_rand unit_hash
-
+unit:	unit_maybe_alloc unit_preallocable unit_list unit_clean unit_rand unit_hash
 else
-
 unit:
 	@echo [-] unit tests are skipped on Darwin \(lacks GNU linker feature --wrap\)
-
 endif
 
+.PHONY: code-format
 code-format:
 	./.custom-format.py -i src/*.c
 	./.custom-format.py -i include/*.h
 	./.custom-format.py -i libdislocator/*.c
 	./.custom-format.py -i libtokencap/*.c
-	./.custom-format.py -i llvm_mode/*.c
-	./.custom-format.py -i llvm_mode/*.h
-	./.custom-format.py -i llvm_mode/*.cc
-	./.custom-format.py -i gcc_plugin/*.c
-	@#./.custom-format.py -i gcc_plugin/*.h
-	./.custom-format.py -i gcc_plugin/*.cc
+	./.custom-format.py -i instrumentation/*.h
+	./.custom-format.py -i instrumentation/*.cc
+	./.custom-format.py -i instrumentation/*.c
 	./.custom-format.py -i custom_mutators/*/*.c
 	@#./.custom-format.py -i custom_mutators/*/*.h # destroys input.h :-(
 	./.custom-format.py -i examples/*/*.c
@@ -489,38 +480,40 @@ code-format:
 	./.custom-format.py -i *.c
 
 
+.PHONY: test_build
 ifndef AFL_NO_X86
-
-test_build: afl-gcc afl-as afl-showmap
+test_build: afl-cc afl-as afl-showmap
 	@echo "[*] Testing the CC wrapper and instrumentation output..."
-	@unset AFL_USE_ASAN AFL_USE_MSAN AFL_CC; AFL_DEBUG=1 AFL_INST_RATIO=100 AFL_AS_FORCE_INSTRUMENT=1 AFL_PATH=. ./$(TEST_CC) $(CFLAGS) test-instr.c -o test-instr $(LDFLAGS) 2>&1 | grep 'afl-as' >/dev/null || (echo "Oops, afl-as did not get called from "$(TEST_CC)". This is normally achieved by "$(CC)" honoring the -B option."; exit 1 )
+	@unset AFL_USE_ASAN AFL_USE_MSAN AFL_CC; AFL_DEBUG=1 AFL_INST_RATIO=100 AFL_PATH=. ./$(TEST_CC) $(CFLAGS) test-instr.c -o test-instr $(LDFLAGS) 2>&1 | grep 'afl-as' >/dev/null || (echo "Oops, afl-as did not get called from "$(TEST_CC)". This is normally achieved by "$(CC)" honoring the -B option."; exit 1 )
 	ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr0 ./test-instr < /dev/null
 	echo 1 | ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr1 ./test-instr
 	@rm -f test-instr
 	@cmp -s .test-instr0 .test-instr1; DR="$$?"; rm -f .test-instr0 .test-instr1; if [ "$$DR" = "0" ]; then echo; echo "Oops, the instrumentation does not seem to be behaving correctly!"; echo; echo "Please post to https://github.com/AFLplusplus/AFLplusplus/issues to troubleshoot the issue."; echo; exit 1; fi
+	@echo
 	@echo "[+] All right, the instrumentation seems to be working!"
-
 else
-
-test_build: afl-gcc afl-as afl-showmap
+test_build: afl-cc afl-as afl-showmap
 	@echo "[!] Note: skipping build tests (you may need to use LLVM or QEMU mode)."
-
 endif
 
-
+.PHONY: all_done
 all_done: test_build
-	@if [ ! "`type clang 2>/dev/null`" = "" ]; then echo "[+] LLVM users: see llvm_mode/README.md for a faster alternative to afl-gcc."; fi
+	@test -e afl-cc && echo "[+] Main compiler 'afl-cc' successfully built!" || { echo "[-] Main compiler 'afl-cc' failed to build, set up a working build environment first!" ; exit 1 ; }
+	@test -e cmplog-instructions-pass.so && echo "[+] LLVM mode for 'afl-cc' successfully built!" || echo "[-] LLVM mode for 'afl-cc' failed to build, likely you either do not have llvm installed or have not set LLVM_CONFIG to point to e.g. llvm-config-11. See instrumentation/README.llvm.md for how to do this. Highly recommended!"
+	@test -e SanitizerCoverageLTO.so && echo "[+] LLVM LTO mode for 'afl-cc' successfully built!" || echo "[-] LLVM LTO mode for 'afl-cc' failed to build, this needs LLVM 11+, see instrumentation/README.lto.md for how to build it"
+	@test -e afl-gcc-pass.so && echo "[+] gcc_plugin for 'afl-cc' successfully built!" || echo "[-] gcc_plugin for 'afl-cc' failed to build, unless you really need it that is fine - or read instrumentation/README.gcc_plugin.md for how to build it"
 	@echo "[+] All done! Be sure to review the README.md - it's pretty short and useful."
 	@if [ "`uname`" = "Darwin" ]; then printf "\nWARNING: Fuzzing on MacOS X is slow because of the unusually high overhead of\nfork() on this OS. Consider using Linux or *BSD. You can also use VirtualBox\n(virtualbox.org) to put AFL inside a Linux or *BSD VM.\n\n"; fi
 	@! tty <&1 >/dev/null || printf "\033[0;30mNOTE: If you can read this, your terminal probably uses white background.\nThis will make the UI hard to read. See docs/status_screen.md for advice.\033[0m\n" 2>/dev/null
 
 .NOTPARALLEL: clean all
 
+.PHONY: clean
 clean:
-	rm -f $(PROGS) libradamsa.so afl-fuzz-document afl-as as afl-g++ afl-clang afl-clang++ *.o src/*.o *~ a.out core core.[1-9][0-9]* *.stackdump .test .test1 .test2 test-instr .test-instr0 .test-instr1 afl-qemu-trace afl-gcc-fast afl-gcc-pass.so afl-gcc-rt.o afl-g++-fast ld *.so *.8 test/unittests/*.o test/unittests/unit_maybe_alloc test/unittests/preallocable .afl-*
+	rm -f $(PROGS) libradamsa.so afl-fuzz-document afl-as as afl-g++ afl-clang afl-clang++ *.o src/*.o *~ a.out core core.[1-9][0-9]* *.stackdump .test .test1 .test2 test-instr .test-instr0 .test-instr1 afl-qemu-trace afl-gcc-fast afl-gcc-pass.so afl-g++-fast ld *.so *.8 test/unittests/*.o test/unittests/unit_maybe_alloc test/unittests/preallocable .afl-* afl-gcc afl-g++
 	rm -rf out_dir qemu_mode/qemu-3.1.1 *.dSYM */*.dSYM
-	-$(MAKE) -C llvm_mode clean
-	-$(MAKE) -C gcc_plugin clean
+	-$(MAKE) -f GNUmakefile.llvm clean
+	-$(MAKE) -f GNUmakefile.gcc_plugin clean
 	$(MAKE) -C libdislocator clean
 	$(MAKE) -C libtokencap clean
 	$(MAKE) -C examples/afl_network_proxy clean
@@ -536,14 +529,16 @@ else
 	rm -rf unicorn_mode/unicornafl
 endif
 
+.PHONY: deepclean
 deepclean:	clean
 	rm -rf qemu_mode/qemu-3.1.1.tar.xz
 	rm -rf unicorn_mode/unicornafl
-	git reset --hard >/dev/null 2>&1 || true
+	# NEVER EVER ACTIVATE THAT!!!!! git reset --hard >/dev/null 2>&1 || true
 
+.PHONY: distrib
 distrib: all
-	-$(MAKE) -C llvm_mode
-	-$(MAKE) -C gcc_plugin
+	-$(MAKE) -f GNUmakefile.llvm
+	-$(MAKE) -f GNUmakefile.gcc_plugin
 	$(MAKE) -C libdislocator
 	$(MAKE) -C libtokencap
 	$(MAKE) -C examples/afl_network_proxy
@@ -552,6 +547,7 @@ distrib: all
 	-cd qemu_mode && sh ./build_qemu_support.sh
 	cd unicorn_mode && unset CFLAGS && sh ./build_unicorn_support.sh
 
+.PHONY: binary-only
 binary-only: all
 	$(MAKE) -C libdislocator
 	$(MAKE) -C libtokencap
@@ -561,9 +557,10 @@ binary-only: all
 	-cd qemu_mode && sh ./build_qemu_support.sh
 	cd unicorn_mode && unset CFLAGS && sh ./build_unicorn_support.sh
 
+.PHONY: source-only
 source-only: all
-	-$(MAKE) -C llvm_mode
-	-$(MAKE) -C gcc_plugin
+	-$(MAKE) -f GNUmakefile.llvm
+	-$(MAKE) -f GNUmakefile.gcc_plugin
 	$(MAKE) -C libdislocator
 	$(MAKE) -C libtokencap
 	@#$(MAKE) -C examples/afl_network_proxy
@@ -573,8 +570,7 @@ source-only: all
 %.8:	%
 	@echo .TH $* 8 $(BUILD_DATE) "afl++" > $@
 	@echo .SH NAME >> $@
-	@printf "%s" ".B $* \- " >> $@
-	@./$* -h 2>&1 | head -n 1 | sed -e "s/$$(printf '\e')[^m]*m//g" >> $@
+	@echo .B $* >> $@
 	@echo >> $@
 	@echo .SH SYNOPSIS >> $@
 	@./$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> $@
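
Note: the %.8 pattern rule synthesizes each man page from the tool's own -h output (the change above drops the one-line NAME description), so the pages can be regenerated with (usage sketch):

    make man          # writes afl-fuzz.8, afl-showmap.8, ... into the source tree
    man ./afl-fuzz.8  # view one of the generated pages
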
@@ -590,30 +586,29 @@ source-only: all
 	@echo .SH LICENSE >> $@
 	@echo Apache License Version 2.0, January 2004 >> $@
 
+.PHONY: install
 install: all $(MANPAGES)
-	install -d -m 755 $${DESTDIR}$(BIN_PATH) $${DESTDIR}$(HELPER_PATH) $${DESTDIR}$(DOC_PATH) $${DESTDIR}$(MISC_PATH)
-	rm -f $${DESTDIR}$(BIN_PATH)/afl-plot.sh
+	@install -d -m 755 $${DESTDIR}$(BIN_PATH) $${DESTDIR}$(HELPER_PATH) $${DESTDIR}$(DOC_PATH) $${DESTDIR}$(MISC_PATH)
+	@rm -f $${DESTDIR}$(BIN_PATH)/afl-plot.sh
+	@rm -f $${DESTDIR}$(BIN_PATH)/afl-as
+	@rm -f $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt.o $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt-32.o $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt-64.o $${DESTDIR}$(HELPER_PATH)/afl-gcc-rt.o
 	install -m 755 $(PROGS) $(SH_PROGS) $${DESTDIR}$(BIN_PATH)
-	rm -f $${DESTDIR}$(BIN_PATH)/afl-as
-	if [ -f afl-qemu-trace ]; then install -m 755 afl-qemu-trace $${DESTDIR}$(BIN_PATH); fi
-	if [ -f afl-gcc-fast ]; then set e; install -m 755 afl-gcc-fast $${DESTDIR}$(BIN_PATH); ln -sf afl-gcc-fast $${DESTDIR}$(BIN_PATH)/afl-g++-fast; install -m 755 afl-gcc-pass.so afl-gcc-rt.o $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f afl-clang-fast ]; then $(MAKE) -C llvm_mode install; fi
-	if [ -f libdislocator.so ]; then set -e; install -m 755 libdislocator.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f libtokencap.so ]; then set -e; install -m 755 libtokencap.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f libcompcov.so ]; then set -e; install -m 755 libcompcov.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f afl-fuzz-document ]; then set -e; install -m 755 afl-fuzz-document $${DESTDIR}$(BIN_PATH); fi
-	if [ -f socketfuzz32.so -o -f socketfuzz64.so ]; then $(MAKE) -C examples/socket_fuzzing install; fi
-	if [ -f argvfuzz32.so -o -f argvfuzz64.so ]; then $(MAKE) -C examples/argv_fuzzing install; fi
-	if [ -f examples/afl_network_proxy/afl-network-server ]; then $(MAKE) -C examples/afl_network_proxy install; fi
-	if [ -f libAFLDriver.a ]; then install -m 644 libAFLDriver.a $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f libAFLQemuDriver.a ]; then install -m 644 libAFLQemuDriver.a $${DESTDIR}$(HELPER_PATH); fi
-
-	set -e; ln -sf afl-gcc $${DESTDIR}$(BIN_PATH)/afl-g++
-	set -e; if [ -f afl-clang-fast ] ; then ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang ; ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang++ ; else ln -sf afl-gcc $${DESTDIR}$(BIN_PATH)/afl-clang ; ln -sf afl-gcc $${DESTDIR}$(BIN_PATH)/afl-clang++; fi
-
-	mkdir -m 0755 -p ${DESTDIR}$(MAN_PATH)
+	@if [ -f afl-qemu-trace ]; then install -m 755 afl-qemu-trace $${DESTDIR}$(BIN_PATH); fi
+	@if [ -f libdislocator.so ]; then set -e; install -m 755 libdislocator.so $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f libtokencap.so ]; then set -e; install -m 755 libtokencap.so $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f libcompcov.so ]; then set -e; install -m 755 libcompcov.so $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f afl-fuzz-document ]; then set -e; install -m 755 afl-fuzz-document $${DESTDIR}$(BIN_PATH); fi
+	@if [ -f socketfuzz32.so -o -f socketfuzz64.so ]; then $(MAKE) -C examples/socket_fuzzing install; fi
+	@if [ -f argvfuzz32.so -o -f argvfuzz64.so ]; then $(MAKE) -C examples/argv_fuzzing install; fi
+	@if [ -f examples/afl_network_proxy/afl-network-server ]; then $(MAKE) -C examples/afl_network_proxy install; fi
+	@if [ -f examples/aflpp_driver/libAFLDriver.a ]; then install -m 644 examples/aflpp_driver/libAFLDriver.a $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f examples/aflpp_driver/libAFLQemuDriver.a ]; then install -m 644 examples/aflpp_driver/libAFLQemuDriver.a $${DESTDIR}$(HELPER_PATH); fi
+	-$(MAKE) -f GNUmakefile.llvm install
+	-$(MAKE) -f GNUmakefile.gcc_plugin install
+	ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-gcc
+	ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-g++
+	@mkdir -m 0755 -p ${DESTDIR}$(MAN_PATH)
 	install -m0644 *.8 ${DESTDIR}$(MAN_PATH)
-
 	install -m 755 afl-as $${DESTDIR}$(HELPER_PATH)
 	ln -sf afl-as $${DESTDIR}$(HELPER_PATH)/as
 	install -m 644 docs/*.md $${DESTDIR}$(DOC_PATH)
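
Note: with llvm_mode/ and gcc_plugin/ folded into the top-level build, the sub-makefiles are now driven from the repository root. A typical build and install reads (usage sketch; the llvm-config-11 name is an example):

    make source-only                                      # afl-cc plus LLVM and gcc plugin instrumentation
    make -f GNUmakefile.llvm LLVM_CONFIG=llvm-config-11   # rebuild only the LLVM parts
    sudo make install
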
diff --git a/gcc_plugin/GNUmakefile b/GNUmakefile.gcc_plugin
index 625b55fb..b73fcfda 100644
--- a/gcc_plugin/GNUmakefile
+++ b/GNUmakefile.gcc_plugin
@@ -26,10 +26,10 @@ BIN_PATH    ?= $(PREFIX)/bin
 DOC_PATH    ?= $(PREFIX)/share/doc/afl
 MAN_PATH    ?= $(PREFIX)/share/man/man8
 
-VERSION     = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f2)
+VERSION     = $(shell grep '^$(HASH)define VERSION ' ./config.h | cut -d '"' -f2)
 
 CFLAGS          ?= -O3 -g -funroll-loops -D_FORTIFY_SOURCE=2
-CFLAGS_SAFE     := -Wall -I../include -Wno-pointer-sign \
+CFLAGS_SAFE     := -Wall -Iinclude -Wno-pointer-sign \
                    -DAFL_PATH=\"$(HELPER_PATH)\" -DBIN_PATH=\"$(BIN_PATH)\" \
                    -DGCC_VERSION=\"$(GCCVER)\" -DGCC_BINDIR=\"$(GCCBINDIR)\" \
                    -Wno-unused-function
@@ -80,25 +80,22 @@ ifeq "$(shell uname -s)" "SunOS"
 endif
 
 
-PROGS        = ../afl-gcc-fast ../afl-gcc-pass.so ../afl-gcc-rt.o
+PROGS        = ./afl-gcc-pass.so
 
+.PHONY: all
+all: test_shm test_deps $(PROGS) test_build all_done
 
-all: test_shm test_deps $(PROGS) afl-gcc-fast.8 test_build all_done
-
+.PHONY: test_shm
 ifeq "$(SHMAT_OK)" "1"
-
 test_shm:
 	@echo "[+] shmat seems to be working."
 	@rm -f .test2
-
 else
-
 test_shm:
 	@echo "[-] shmat seems not to be working, switching to mmap implementation"
-
 endif
 
-
+.PHONY: test_deps
 test_deps:
 	@echo "[*] Checking for working '$(CC)'..."
 	@type $(CC) >/dev/null 2>&1 || ( echo "[-] Oops, can't find '$(CC)'. Make sure that it's in your \$$PATH (or set \$$CC and \$$CXX)."; exit 1 )
@@ -106,65 +103,66 @@ test_deps:
 #	@$(CC) -v 2>&1 | grep -q -- --enable-plugin || ( echo "[-] Oops, this gcc has not been configured with plugin support."; exit 1 )
 	@echo "[*] Checking for gcc plugin development header files..."
 	@test -d `$(CC) -print-file-name=plugin`/include || ( echo "[-] Oops, can't find gcc header files. Be sure to install 'gcc-X-plugin-dev'."; exit 1 )
-	@echo "[*] Checking for '../afl-showmap'..."
-	@test -f ../afl-showmap || ( echo "[-] Oops, can't find '../afl-showmap'. Be sure to compile AFL first."; exit 1 )
+	@echo "[*] Checking for './afl-showmap'..."
+	@test -f ./afl-showmap || ( echo "[-] Oops, can't find './afl-showmap'. Be sure to compile AFL first."; exit 1 )
 	@echo "[+] All set and ready to build."
 
-afl-common.o: ../src/afl-common.c
+afl-common.o: ./src/afl-common.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@ $(LDFLAGS)
 
-../afl-gcc-fast: afl-gcc-fast.c afl-common.o | test_deps
-	$(CC) -DAFL_GCC_CC=\"$(CC)\" -DAFL_GCC_CXX=\"$(CXX)\" $(CFLAGS) $(CPPFLAGS) $< afl-common.o -o $@ $(LDFLAGS)
-	ln -sf afl-gcc-fast ../afl-g++-fast
-
-../afl-gcc-pass.so: afl-gcc-pass.so.cc | test_deps
+./afl-gcc-pass.so: instrumentation/afl-gcc-pass.so.cc | test_deps
 	$(CXX) $(CXXEFLAGS) $(PLUGIN_FLAGS) -shared $< -o $@
+	ln -sf afl-cc afl-gcc-fast
+	ln -sf afl-cc afl-g++-fast
+	ln -sf afl-cc.8 afl-gcc-fast.8
+	ln -sf afl-cc.8 afl-g++-fast.8
 
-../afl-gcc-rt.o: afl-gcc-rt.o.c | test_deps
-	$(CC) $(CFLAGS_SAFE) $(CPPFLAGS) -fPIC -c $< -o $@
-
+.PHONY: test_build
 test_build: $(PROGS)
 	@echo "[*] Testing the CC wrapper and instrumentation output..."
-	unset AFL_USE_ASAN AFL_USE_MSAN; AFL_QUIET=1 AFL_INST_RATIO=100 AFL_PATH=. AFL_CC=$(CC) ../afl-gcc-fast $(CFLAGS) $(CPPFLAGS) ../test-instr.c -o test-instr $(LDFLAGS)
-#	unset AFL_USE_ASAN AFL_USE_MSAN;             AFL_INST_RATIO=100 AFL_PATH=. AFL_CC=$(CC) ../afl-gcc-fast $(CFLAGS) ../test-instr.c -o test-instr $(LDFLAGS)
-	ASAN_OPTIONS=detect_leaks=0 ../afl-showmap -m none -q -o .test-instr0 ./test-instr </dev/null
-	echo 1 | ASAN_OPTIONS=detect_leaks=0 ../afl-showmap -m none -q -o .test-instr1 ./test-instr
+	unset AFL_USE_ASAN AFL_USE_MSAN; AFL_QUIET=1 AFL_INST_RATIO=100 AFL_PATH=. AFL_CC=$(CC) ./afl-gcc-fast $(CFLAGS) $(CPPFLAGS) ./test-instr.c -o test-instr $(LDFLAGS)
+	ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr0 ./test-instr </dev/null
+	echo 1 | ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr1 ./test-instr
 	@rm -f test-instr
 	@cmp -s .test-instr0 .test-instr1; DR="$$?"; rm -f .test-instr0 .test-instr1; if [ "$$DR" = "0" ]; then echo; echo "Oops, the instrumentation does not seem to be behaving correctly!"; echo; echo "Please post to https://github.com/AFLplusplus/AFLplusplus/issues to troubleshoot the issue."; echo; exit 1; fi
 	@echo "[+] All right, the instrumentation seems to be working!"
 
+.PHONY: all_done
 all_done: test_build
-	@echo "[+] All done! You can now use '../afl-gcc-fast' to compile programs."
+	@echo "[+] All done! You can now use './afl-gcc-fast' to compile programs."
 
 .NOTPARALLEL: clean
 
 vpath  % ..
 %.8: %
-	@echo .TH $* 8 `date "+%Y-%m-%d"` "afl++" > ../$@
-	@echo .SH NAME >> ../$@
-	@echo .B $* >> ../$@
-	@echo >> ../$@
-	@echo .SH SYNOPSIS >> ../$@
-	@../$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> ../$@
-	@echo >> ../$@
-	@echo .SH OPTIONS >> ../$@
-	@echo .nf >> ../$@
-	@../$* -h 2>&1 | tail -n +4 >> ../$@
-	@echo >> ../$@
-	@echo .SH AUTHOR >> ../$@
-	@echo "afl++ was written by Michal \"lcamtuf\" Zalewski and is maintained by Marc \"van Hauser\" Heuse <mh@mh-sec.de>, Heiko \"hexcoder-\" Eissfeldt <heiko.eissfeldt@hexco.de>, Andrea Fioraldi <andreafioraldi@gmail.com> and Dominik Maier <domenukk@gmail.com>" >> ../$@
-	@echo  The homepage of afl++ is: https://github.com/AFLplusplus/AFLplusplus >> ../$@
-	@echo >> ../$@
-	@echo .SH LICENSE >> ../$@
-	@echo Apache License Version 2.0, January 2004 >> ../$@
-	ln -sf afl-gcc-fast.8 ../afl-g++-fast.8
-
+	@echo .TH $* 8 `date "+%Y-%m-%d"` "afl++" > ./$@
+	@echo .SH NAME >> ./$@
+	@echo .B $* >> ./$@
+	@echo >> ./$@
+	@echo .SH SYNOPSIS >> ./$@
+	@./$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> ./$@
+	@echo >> ./$@
+	@echo .SH OPTIONS >> ./$@
+	@echo .nf >> ./$@
+	@./$* -h 2>&1 | tail -n +4 >> ./$@
+	@echo >> ./$@
+	@echo .SH AUTHOR >> ./$@
+	@echo "afl++ was written by Michal \"lcamtuf\" Zalewski and is maintained by Marc \"van Hauser\" Heuse <mh@mh-sec.de>, Heiko \"hexcoder-\" Eissfeldt <heiko.eissfeldt@hexco.de>, Andrea Fioraldi <andreafioraldi@gmail.com> and Dominik Maier <domenukk@gmail.com>" >> ./$@
+	@echo  The homepage of afl++ is: https://github.com/AFLplusplus/AFLplusplus >> ./$@
+	@echo >> ./$@
+	@echo .SH LICENSE >> ./$@
+	@echo Apache License Version 2.0, January 2004 >> ./$@
+	ln -sf afl-cc.8 ./afl-g++-fast.8
+
+.PHONY: install
 install: all
-	install -m 755 ../afl-gcc-fast $${DESTDIR}$(BIN_PATH)
-	install -m 755 ../afl-gcc-pass.so ../afl-gcc-rt.o $${DESTDIR}$(HELPER_PATH)
-	install -m 644 -T README.md $${DESTDIR}$(DOC_PATH)/README.gcc_plugin.md
-	install -m 644 -T README.instrument_list.md $${DESTDIR}$(DOC_PATH)/README.gcc_plugin.instrument_file.md
+	ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-gcc-fast
+	ln -sf afl-c++ $${DESTDIR}$(BIN_PATH)/afl-g++-fast
+	ln -sf afl-compiler-rt.o $${DESTDIR}$(HELPER_PATH)/afl-gcc-rt.o
+	install -m 755 ./afl-gcc-pass.so $${DESTDIR}$(HELPER_PATH)
+	install -m 644 -T instrumentation/README.gcc_plugin.md $${DESTDIR}$(DOC_PATH)/README.gcc_plugin.md
 
+.PHONY: clean
 clean:
 	rm -f *.o *.so *~ a.out core core.[1-9][0-9]* test-instr .test-instr0 .test-instr1 .test2
-	rm -f $(PROGS) afl-common.o ../afl-g++-fast ../afl-g*-fast.8
+	rm -f $(PROGS) afl-common.o ./afl-g++-fast ./afl-g*-fast.8 instrumentation/*.o
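
Note: afl-gcc-fast and afl-g++-fast are now symlinks to afl-cc, which loads the pass through GCC's plugin interface and links in the shared runtime. Mechanically that is roughly equivalent to the following, assuming the default install prefix (a sketch, not the exact command line afl-cc emits):

    gcc -fplugin=/usr/local/lib/afl/afl-gcc-pass.so \
        test-instr.c /usr/local/lib/afl/afl-compiler-rt.o -o test-instr
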
diff --git a/llvm_mode/GNUmakefile b/GNUmakefile.llvm
index 55c488f9..1bb3d265 100644
--- a/llvm_mode/GNUmakefile
+++ b/GNUmakefile.llvm
@@ -26,10 +26,10 @@ DOC_PATH    ?= $(PREFIX)/share/doc/afl
 MISC_PATH   ?= $(PREFIX)/share/afl
 MAN_PATH    ?= $(PREFIX)/share/man/man8
 
-VERSION     = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f2)
-
 BUILD_DATE  ?= $(shell date -u -d "@$(SOURCE_DATE_EPOCH)" "+%Y-%m-%d" 2>/dev/null || date -u -r "$(SOURCE_DATE_EPOCH)" "+%Y-%m-%d" 2>/dev/null || date -u "+%Y-%m-%d")
 
+VERSION     = $(shell grep '^$(HASH)define VERSION ' ./config.h | cut -d '"' -f2)
+
 ifeq "$(shell uname)" "OpenBSD"
   LLVM_CONFIG ?= $(BIN_PATH)/llvm-config
   HAS_OPT = $(shell test -x $(BIN_PATH)/opt && echo 0 || echo 1)
@@ -41,6 +41,7 @@ else
 endif
 
 LLVMVER  = $(shell $(LLVM_CONFIG) --version 2>/dev/null | sed 's/git//' )
+LLVM_MAJOR = $(shell $(LLVM_CONFIG) --version 2>/dev/null | sed 's/\..*//' )
 LLVM_UNSUPPORTED = $(shell $(LLVM_CONFIG) --version 2>/dev/null | egrep -q '^3\.[0-3]|^19' && echo 1 || echo 0 )
 LLVM_NEW_API = $(shell $(LLVM_CONFIG) --version 2>/dev/null | egrep -q '^1[0-9]' && echo 1 || echo 0 )
 LLVM_HAVE_LTO = $(shell $(LLVM_CONFIG) --version 2>/dev/null | egrep -q '^1[1-9]' && echo 1 || echo 0 )
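
Note: the new LLVM_MAJOR variable truncates the llvm-config version at the first dot, e.g. (illustrative output):

    $ llvm-config --version
    11.0.1
    $ llvm-config --version | sed 's/\..*//'
    11
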
@@ -78,13 +79,13 @@ ifeq "$(LLVM_TOO_OLD)" "1"
 endif
 
 ifeq "$(LLVM_HAVE_LTO)" "1"
-  $(info [+] llvm_mode detected llvm 11+, enabling afl-clang-lto LTO implementation)
+  $(info [+] llvm_mode detected llvm 11+, enabling afl-lto LTO implementation)
   LLVM_LTO = 1
   #TEST_MMAP = 1
 endif
 
 ifeq "$(LLVM_LTO)" "0"
-  $(info [+] llvm_mode detected llvm < 11, afl-clang-lto LTO will not be build.)
+  $(info [+] llvm_mode detected llvm < 11, afl-lto LTO will not be built.)
 endif
 
 ifeq "$(LLVM_APPLE_XCODE)" "1"
@@ -217,8 +218,14 @@ ifeq "$(LLVM_LTO)" "1"
   endif
 endif
 
+ifeq "$(shell echo 'int main() {return 0; }' | $(CLANG_BIN) -x c - -fdebug-prefix-map=$(CURDIR)=llvm_mode -o .test 2>/dev/null && echo 1 || echo 0 ; rm -f .test )" "1"
+        AFL_CLANG_DEBUG_PREFIX = -fdebug-prefix-map="$(CURDIR)=llvm_mode"
+else
+        AFL_CLANG_DEBUG_PREFIX = ""
+endif
+
 CFLAGS          ?= -O3 -funroll-loops -fPIC -D_FORTIFY_SOURCE=2
-CFLAGS_SAFE     := -Wall -g -Wno-pointer-sign -I ../include/ \
+CFLAGS_SAFE     := -Wall -g -Wno-pointer-sign -I ./include/ -I ./instrumentation/ \
                    -DAFL_PATH=\"$(HELPER_PATH)\" -DBIN_PATH=\"$(BIN_PATH)\" \
                    -DLLVM_BINDIR=\"$(LLVM_BINDIR)\" -DVERSION=\"$(VERSION)\" \
                    -DLLVM_LIBDIR=\"$(LLVM_LIBDIR)\" -DLLVM_VERSION=\"$(LLVMVER)\" \
@@ -227,7 +234,7 @@ CFLAGS_SAFE     := -Wall -g -Wno-pointer-sign -I ../include/ \
                    -DAFL_CLANG_LDPATH=\"$(AFL_CLANG_LDPATH)\" \
                    -DAFL_CLANG_FUSELD=\"$(AFL_CLANG_FUSELD)\" \
                    -DCLANG_BIN=\"$(CLANG_BIN)\" -DCLANGPP_BIN=\"$(CLANGPP_BIN)\" -DUSE_BINDIR=$(USE_BINDIR) -Wno-unused-function \
-                   -fdebug-prefix-map="$(CURDIR)=llvm_mode"
+                   $(AFL_CLANG_DEBUG_PREFIX)
 override CFLAGS += $(CFLAGS_SAFE)
 
 ifdef AFL_TRACE_PC
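
Note: the probe above makes -fdebug-prefix-map conditional because older clang releases reject the flag. When supported, it rewrites the build directory recorded in debug info so the runtime objects stay reproducible, e.g. (illustrative):

    clang -g -fdebug-prefix-map="$PWD=llvm_mode" -fPIC \
        -c instrumentation/afl-compiler-rt.o.c -o afl-compiler-rt.o
    # debug info now refers to llvm_mode/... instead of the absolute build path
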
@@ -235,7 +242,7 @@ ifdef AFL_TRACE_PC
 endif
 
 CXXFLAGS          ?= -O3 -funroll-loops -fPIC -D_FORTIFY_SOURCE=2
-override CXXFLAGS += -Wall -g -I ../include/ \
+override CXXFLAGS += -Wall -g -I ./include/ \
                      -DVERSION=\"$(VERSION)\" -Wno-variadic-macros
 
 ifneq "$(shell $(LLVM_CONFIG) --includedir) 2> /dev/null" ""
@@ -277,7 +284,8 @@ ifeq "$(TEST_MMAP)" "1"
         LDFLAGS += -Wno-deprecated-declarations
 endif
 
-PROGS      = ../afl-clang-fast ../afl-llvm-pass.so ../afl-ld-lto ../afl-llvm-lto-instrumentlist.so ../afl-llvm-lto-instrumentation.so ../libLLVMInsTrim.so ../afl-llvm-rt.o ../afl-llvm-rt-32.o ../afl-llvm-rt-64.o ../compare-transform-pass.so ../split-compares-pass.so ../split-switches-pass.so ../cmplog-routines-pass.so ../cmplog-instructions-pass.so ../SanitizerCoverageLTO.so
+PROGS_ALWAYS = ./afl-cc ./afl-compiler-rt.o ./afl-compiler-rt-32.o ./afl-compiler-rt-64.o 
+PROGS        = $(PROGS_ALWAYS) ./afl-llvm-pass.so ./split-compares-pass.so ./split-switches-pass.so ./cmplog-routines-pass.so ./cmplog-instructions-pass.so ./afl-llvm-dict2file.so ./compare-transform-pass.so ./libLLVMInsTrim.so ./afl-ld-lto ./afl-llvm-lto-instrumentlist.so ./afl-llvm-lto-instrumentation.so ./SanitizerCoverageLTO.so
 
 # If prerequisites are not given, warn, do not build anything, and exit with code 0
 ifeq "$(LLVMVER)" ""
@@ -289,31 +297,31 @@ ifneq "$(LLVM_UNSUPPORTED)$(LLVM_APPLE_XCODE)" "00"
 endif
 
 ifeq "$(NO_BUILD)" "1"
-  TARGETS = no_build
+  TARGETS = test_shm $(PROGS_ALWAYS) afl-cc.8
 else
-  TARGETS = test_shm test_deps $(PROGS) afl-clang-fast.8 test_build all_done
+  TARGETS = test_shm test_deps $(PROGS) afl-cc.8 test_build all_done
 endif
 
 LLVM_MIN_4_0_1 = $(shell awk 'function tonum(ver, a) {split(ver,a,"."); return a[1]*1000000+a[2]*1000+a[3]} BEGIN { exit tonum(ARGV[1]) >= tonum(ARGV[2]) }' $(LLVMVER) 4.0.1; echo $$?)
 
+.PHONY: all
 all: $(TARGETS)
 
+.PHONY: test_shm
 ifeq "$(SHMAT_OK)" "1"
-
 test_shm:
 	@echo "[+] shmat seems to be working."
 	@rm -f .test2
-
 else
-
 test_shm:
 	@echo "[-] shmat seems not to be working, switching to mmap implementation"
-
 endif
 
+.PHONY: no_build
 no_build:
 	@printf "%b\\n" "\\033[0;31mPrerequisites are not met, skipping build llvm_mode\\033[0m"
 
+.PHONY: test_deps
 test_deps:
 	@echo "[*] Checking for working 'llvm-config'..."
  ifneq "$(LLVM_APPLE_XCODE)" "1"
@@ -327,148 +335,164 @@ ifneq "$(CLANGVER)" "$(LLVMVER)"
 else
 	@echo "[*] We have llvm-config version $(LLVMVER) with a clang version $(CLANGVER), good."
 endif
-	@echo "[*] Checking for '../afl-showmap'..."
-	@test -f ../afl-showmap || ( echo "[-] Oops, can't find '../afl-showmap'. Be sure to compile AFL first."; exit 1 )
+	@echo "[*] Checking for './afl-showmap'..."
+	@test -f ./afl-showmap || ( echo "[-] Oops, can't find './afl-showmap'. Be sure to compile AFL first."; exit 1 )
 	@echo "[+] All set and ready to build."
 
-afl-common.o: ../src/afl-common.c
+instrumentation/afl-common.o: ./src/afl-common.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@ $(LDFLAGS)
 
-../afl-clang-fast: afl-clang-fast.c afl-common.o | test_deps
-	$(CC) $(CLANG_CFL) $(CFLAGS) $(CPPFLAGS) $< afl-common.o -o $@ $(LDFLAGS) -DCFLAGS_OPT=\"$(CFLAGS_OPT)\"
-	ln -sf afl-clang-fast ../afl-clang-fast++
+./afl-cc: src/afl-cc.c instrumentation/afl-common.o | test_deps
+	$(CC) $(CLANG_CFL) $(CFLAGS) $(CPPFLAGS) $< instrumentation/afl-common.o -o $@ -DLLVM_MAJOR=$(LLVM_MAJOR) $(LDFLAGS) -DCFLAGS_OPT=\"$(CFLAGS_OPT)\"
+	@ln -sf afl-cc ./afl-c++
+	@ln -sf afl-cc ./afl-gcc
+	@ln -sf afl-cc ./afl-g++
+	@ln -sf afl-cc ./afl-clang-fast
+	@ln -sf afl-cc ./afl-clang-fast++
 ifneq "$(AFL_CLANG_FLTO)" ""
 ifeq "$(LLVM_LTO)" "1"
-	ln -sf afl-clang-fast ../afl-clang-lto
-	ln -sf afl-clang-fast ../afl-clang-lto++
+	@ln -sf afl-cc ./afl-clang-lto
+	@ln -sf afl-cc ./afl-clang-lto++
+	@ln -sf afl-cc ./afl-lto
+	@ln -sf afl-cc ./afl-lto++
 endif
 endif
 
-afl-llvm-common.o: afl-llvm-common.cc afl-llvm-common.h
+instrumentation/afl-llvm-common.o: instrumentation/afl-llvm-common.cc instrumentation/afl-llvm-common.h
 	$(CXX) $(CFLAGS) $(CPPFLAGS) `$(LLVM_CONFIG) --cxxflags` -fno-rtti -fPIC -std=$(LLVM_STDCXX) -c $< -o $@ 
 
-../libLLVMInsTrim.so: LLVMInsTrim.so.cc MarkNodes.cc afl-llvm-common.o | test_deps
-	-$(CXX) $(CLANG_CPPFL) -DLLVMInsTrim_EXPORTS -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< MarkNodes.cc -o $@ $(CLANG_LFL) afl-llvm-common.o
+./libLLVMInsTrim.so: instrumentation/LLVMInsTrim.so.cc instrumentation/MarkNodes.cc instrumentation/afl-llvm-common.o | test_deps
+	-$(CXX) $(CLANG_CPPFL) -DLLVMInsTrim_EXPORTS -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< instrumentation/MarkNodes.cc -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 
-../afl-llvm-pass.so: afl-llvm-pass.so.cc afl-llvm-common.o | test_deps
+./afl-llvm-pass.so: instrumentation/afl-llvm-pass.so.cc instrumentation/afl-llvm-common.o | test_deps
 ifeq "$(LLVM_MIN_4_0_1)" "0"
 	$(info [!] N-gram branch coverage instrumentation is not available for llvm version $(LLVMVER))
 endif
-	$(CXX) $(CLANG_CPPFL) -DLLVMInsTrim_EXPORTS -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+	$(CXX) $(CLANG_CPPFL) -DLLVMInsTrim_EXPORTS -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 
-../afl-llvm-lto-instrumentlist.so: afl-llvm-lto-instrumentlist.so.cc afl-llvm-common.o
+./afl-llvm-lto-instrumentlist.so: instrumentation/afl-llvm-lto-instrumentlist.so.cc instrumentation/afl-llvm-common.o
 ifeq "$(LLVM_LTO)" "1"
-	$(CXX) $(CLANG_CPPFL) -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+	$(CXX) $(CLANG_CPPFL) -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 endif
 
-../afl-ld-lto: afl-ld-lto.c
+./afl-ld-lto: src/afl-ld-lto.c
 ifeq "$(LLVM_LTO)" "1"
 	$(CC) $(CFLAGS) $(CPPFLAGS) $< -o $@
 endif
 
-../SanitizerCoverageLTO.so: SanitizerCoverageLTO.so.cc
+./SanitizerCoverageLTO.so: instrumentation/SanitizerCoverageLTO.so.cc
 ifeq "$(LLVM_LTO)" "1"
-	$(CXX) $(CLANG_CPPFL) -Wno-writable-strings -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+	$(CXX) $(CLANG_CPPFL) -Wno-writable-strings -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 endif
 
-../afl-llvm-lto-instrumentation.so: afl-llvm-lto-instrumentation.so.cc afl-llvm-common.o
+./afl-llvm-lto-instrumentation.so: instrumentation/afl-llvm-lto-instrumentation.so.cc instrumentation/afl-llvm-common.o
 ifeq "$(LLVM_LTO)" "1"
-	$(CXX) $(CLANG_CPPFL) -Wno-writable-strings -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
-	$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -fPIC -c afl-llvm-rt-lto.o.c -o ../afl-llvm-rt-lto.o
-	@$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -m64 -fPIC -c afl-llvm-rt-lto.o.c -o ../afl-llvm-rt-lto-64.o 2>/dev/null; if [ "$$?" = "0" ]; then : ; fi
-	@$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -m32 -fPIC -c afl-llvm-rt-lto.o.c -o ../afl-llvm-rt-lto-32.o 2>/dev/null; if [ "$$?" = "0" ]; then : ; fi
+	$(CXX) $(CLANG_CPPFL) -Wno-writable-strings -fno-rtti -fPIC -std=$(LLVM_STDCXX) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
+	$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -fPIC -c instrumentation/afl-llvm-rt-lto.o.c -o ./afl-llvm-rt-lto.o
+	@$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -m64 -fPIC -c instrumentation/afl-llvm-rt-lto.o.c -o ./afl-llvm-rt-lto-64.o 2>/dev/null; if [ "$$?" = "0" ]; then : ; fi
+	@$(CLANG_BIN) $(CFLAGS_SAFE) $(CPPFLAGS) -Wno-unused-result -O0 $(AFL_CLANG_FLTO) -m32 -fPIC -c instrumentation/afl-llvm-rt-lto.o.c -o ./afl-llvm-rt-lto-32.o 2>/dev/null; if [ "$$?" = "0" ]; then : ; fi
 endif
 
 # laf
-../split-switches-pass.so:	split-switches-pass.so.cc afl-llvm-common.o | test_deps
-	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
-../compare-transform-pass.so:	compare-transform-pass.so.cc afl-llvm-common.o | test_deps
-	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
-../split-compares-pass.so:	split-compares-pass.so.cc afl-llvm-common.o | test_deps
-	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+./split-switches-pass.so:	instrumentation/split-switches-pass.so.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
+./compare-transform-pass.so:	instrumentation/compare-transform-pass.so.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
+./split-compares-pass.so:	instrumentation/split-compares-pass.so.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 # /laf
 
-../cmplog-routines-pass.so:	cmplog-routines-pass.cc afl-llvm-common.o | test_deps
-	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+./cmplog-routines-pass.so:	instrumentation/cmplog-routines-pass.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
+
+./cmplog-instructions-pass.so:	instrumentation/cmplog-instructions-pass.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 
-../cmplog-instructions-pass.so:	cmplog-instructions-pass.cc afl-llvm-common.o | test_deps
-	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) afl-llvm-common.o
+./afl-llvm-dict2file.so:	instrumentation/afl-llvm-dict2file.so.cc instrumentation/afl-llvm-common.o | test_deps
+	$(CXX) $(CLANG_CPPFL) -shared $< -o $@ $(CLANG_LFL) instrumentation/afl-llvm-common.o
 
+.PHONY: document
 document:
-	$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -fPIC -c afl-llvm-rt.o.c -o ../afl-llvm-rt.o
-	@$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -m32 -fPIC -c afl-llvm-rt.o.c -o ../afl-llvm-rt-32.o 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
-	@$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -m64 -fPIC -c afl-llvm-rt.o.c -o ../afl-llvm-rt-64.o 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
+	$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -fPIC -c instrumentation/afl-compiler-rt.o.c -o ./afl-compiler-rt.o
+	@$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -m32 -fPIC -c instrumentation/afl-compiler-rt.o.c -o ./afl-compiler-rt-32.o 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
+	@$(CLANG_BIN) -D_AFL_DOCUMENT_MUTATIONS $(CFLAGS_SAFE) $(CPPFLAGS) $(CLANG_CFL) -O3 -Wno-unused-result -m64 -fPIC -c instrumentation/afl-compiler-rt.o.c -o ./afl-compiler-rt-64.o 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
 
-../afl-llvm-rt.o: afl-llvm-rt.o.c | test_deps
+./afl-compiler-rt.o: instrumentation/afl-compiler-rt.o.c | test_deps
 	$(CLANG_BIN) $(CLANG_CFL) $(CFLAGS_SAFE) $(CPPFLAGS) -O3 -Wno-unused-result -fPIC -c $< -o $@
 
-../afl-llvm-rt-32.o: afl-llvm-rt.o.c | test_deps
+./afl-compiler-rt-32.o: instrumentation/afl-compiler-rt.o.c | test_deps
 	@printf "[*] Building 32-bit variant of the runtime (-m32)... "
 	@$(CLANG_BIN) $(CLANG_CFL) $(CFLAGS_SAFE) $(CPPFLAGS) -O3 -Wno-unused-result -m32 -fPIC -c $< -o $@ 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
+	@test -e afl-compiler-rt-32.o && ln -sf afl-compiler-rt-32.o afl-llvm-rt-32.o
 
-../afl-llvm-rt-64.o: afl-llvm-rt.o.c | test_deps
+./afl-compiler-rt-64.o: instrumentation/afl-compiler-rt.o.c | test_deps
 	@printf "[*] Building 64-bit variant of the runtime (-m64)... "
 	@$(CLANG_BIN) $(CLANG_CFL) $(CFLAGS_SAFE) $(CPPFLAGS) -O3 -Wno-unused-result -m64 -fPIC -c $< -o $@ 2>/dev/null; if [ "$$?" = "0" ]; then echo "success!"; else echo "failed (that's fine)"; fi
+	@test -e afl-compiler-rt-64.o && ln -sf afl-compiler-rt-64.o afl-llvm-rt-64.o
 
+.PHONY: test_build
 test_build: $(PROGS)
 	@echo "[*] Testing the CC wrapper and instrumentation output..."
-	unset AFL_USE_ASAN AFL_USE_MSAN AFL_INST_RATIO; AFL_QUIET=1 AFL_PATH=. AFL_LLVM_LAF_SPLIT_SWITCHES=1 AFL_LLVM_LAF_TRANSFORM_COMPARES=1 AFL_LLVM_LAF_SPLIT_COMPARES=1 ../afl-clang-fast $(CFLAGS) ../test-instr.c -o test-instr $(LDFLAGS)
-	ASAN_OPTIONS=detect_leaks=0 ../afl-showmap -m none -q -o .test-instr0 ./test-instr < /dev/null
-	echo 1 | ASAN_OPTIONS=detect_leaks=0 ../afl-showmap -m none -q -o .test-instr1 ./test-instr
+	unset AFL_USE_ASAN AFL_USE_MSAN AFL_INST_RATIO; AFL_QUIET=1 AFL_PATH=. AFL_LLVM_LAF_ALL=1 ./afl-cc $(CFLAGS) $(CPPFLAGS) ./test-instr.c -o test-instr $(LDFLAGS)
+	ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr0 ./test-instr < /dev/null
+	echo 1 | ASAN_OPTIONS=detect_leaks=0 ./afl-showmap -m none -q -o .test-instr1 ./test-instr
 	@rm -f test-instr
 	@cmp -s .test-instr0 .test-instr1; DR="$$?"; rm -f .test-instr0 .test-instr1; if [ "$$DR" = "0" ]; then echo; echo "Oops, the instrumentation does not seem to be behaving correctly!"; echo; echo "Please post to https://github.com/AFLplusplus/AFLplusplus/issues to troubleshoot the issue."; echo; exit 1; fi
 	@echo "[+] All right, the instrumentation seems to be working!"
 
+.PHONY: all_done
 all_done: test_build
-	@echo "[+] All done! You can now use '../afl-clang-fast' to compile programs."
+	@echo "[+] All done! You can now use './afl-cc' to compile programs."
 
 .NOTPARALLEL: clean
 
+.PHONY: install
 install: all
-	install -d -m 755 $${DESTDIR}$(BIN_PATH) $${DESTDIR}$(HELPER_PATH) $${DESTDIR}$(DOC_PATH) $${DESTDIR}$(MISC_PATH)
-	if [ -f ../afl-clang-fast -a -f ../libLLVMInsTrim.so -a -f ../afl-llvm-rt.o ]; then set -e; install -m 755 ../afl-clang-fast $${DESTDIR}$(BIN_PATH); ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang-fast++; install -m 755 ../libLLVMInsTrim.so ../afl-llvm-pass.so ../afl-llvm-rt.o $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../afl-clang-lto ]; then set -e; ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang-lto; ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang-lto++; install -m 755 ../afl-llvm-lto-instrumentation.so ../afl-llvm-rt-lto*.o ../afl-llvm-lto-instrumentlist.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../afl-ld-lto ]; then set -e; install -m 755 ../afl-ld-lto $${DESTDIR}$(BIN_PATH); fi
-	if [ -f ../afl-llvm-rt-32.o ]; then set -e; install -m 755 ../afl-llvm-rt-32.o $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../afl-llvm-rt-64.o ]; then set -e; install -m 755 ../afl-llvm-rt-64.o $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../compare-transform-pass.so ]; then set -e; install -m 755 ../compare-transform-pass.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../split-compares-pass.so ]; then set -e; install -m 755 ../split-compares-pass.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../split-switches-pass.so ]; then set -e; install -m 755 ../split-switches-pass.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../cmplog-instructions-pass.so ]; then set -e; install -m 755 ../cmplog-*-pass.so $${DESTDIR}$(HELPER_PATH); fi
-	if [ -f ../SanitizerCoverageLTO.so ]; then set -e; install -m 755 ../SanitizerCoverageLTO.so $${DESTDIR}$(HELPER_PATH); fi
-	set -e; install -m 644 ../dynamic_list.txt $${DESTDIR}$(HELPER_PATH)
-	set -e; if [ -f ../afl-clang-fast ] ; then ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang ; ln -sf afl-clang-fast $${DESTDIR}$(BIN_PATH)/afl-clang++ ; else ln -sf afl-gcc $${DESTDIR}$(BIN_PATH)/afl-clang ; ln -sf afl-gcc $${DESTDIR}$(BIN_PATH)/afl-clang++; fi
-	install -m 644 README.*.md $${DESTDIR}$(DOC_PATH)/
-	install -m 644 README.md $${DESTDIR}$(DOC_PATH)/README.llvm_mode.md
+	@install -d -m 755 $${DESTDIR}$(BIN_PATH) $${DESTDIR}$(HELPER_PATH) $${DESTDIR}$(DOC_PATH) $${DESTDIR}$(MISC_PATH)
+	@if [ -f ./afl-cc ]; then set -e; install -m 755 ./afl-cc $${DESTDIR}$(BIN_PATH); ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-c++; fi
+	@if [ -f ./afl-compiler-rt.o ]; then set -e; install -m 755 ./afl-compiler-rt.o $${DESTDIR}$(HELPER_PATH); ln -sf afl-compiler-rt.o $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt.o; fi
+	@if [ -f ./afl-lto ]; then set -e; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-lto; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-lto++; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-clang-lto; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-clang-lto++; install -m 755 ./afl-llvm-lto-instrumentation.so ./afl-llvm-rt-lto*.o ./afl-llvm-lto-instrumentlist.so $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f ./afl-ld-lto ]; then set -e; install -m 755 ./afl-ld-lto $${DESTDIR}$(BIN_PATH); fi
+	@if [ -f ./afl-compiler-rt-32.o ]; then set -e; install -m 755 ./afl-compiler-rt-32.o $${DESTDIR}$(HELPER_PATH); ln -sf afl-compiler-rt-32.o $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt-32.o; fi
+	@if [ -f ./afl-compiler-rt-64.o ]; then set -e; install -m 755 ./afl-compiler-rt-64.o $${DESTDIR}$(HELPER_PATH); ln -sf afl-compiler-rt-64.o $${DESTDIR}$(HELPER_PATH)/afl-llvm-rt-64.o; fi
+	@if [ -f ./compare-transform-pass.so ]; then set -e; install -m 755 ./*.so $${DESTDIR}$(HELPER_PATH); fi
+	@if [ -f ./compare-transform-pass.so ]; then set -e; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-clang-fast ; ln -sf afl-c++ $${DESTDIR}$(BIN_PATH)/afl-clang-fast++ ; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-clang ; ln -sf afl-c++ $${DESTDIR}$(BIN_PATH)/afl-clang++ ; fi
+	@if [ -f ./SanitizerCoverageLTO.so ]; then set -e; ln -sf afl-cc $${DESTDIR}$(BIN_PATH)/afl-clang-lto ; ln -sf afl-c++ $${DESTDIR}$(BIN_PATH)/afl-clang-lto++ ; fi
+	set -e; install -m 644 ./dynamic_list.txt $${DESTDIR}$(HELPER_PATH)
+	install -m 644 instrumentation/README.*.md $${DESTDIR}$(DOC_PATH)/
 
 vpath  % ..
 %.8: %
-	@echo .TH $* 8 $(BUILD_DATE) "afl++" > ../$@
-	@echo .SH NAME >> ../$@
-	@echo -n ".B $* \- " >> ../$@
-	@../$* -h 2>&1 | head -n 1 | sed -e "s/$$(printf '\e')[^m]*m//g" >> ../$@
-	@echo >> ../$@
-	@echo .SH SYNOPSIS >> ../$@
-	@../$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> ../$@
-	@echo >> ../$@
-	@echo .SH OPTIONS >> ../$@
-	@echo .nf >> ../$@
-	@../$* -h 2>&1 | tail -n +4 >> ../$@
-	@echo >> ../$@
-	@echo .SH AUTHOR >> ../$@
-	@echo "afl++ was written by Michal \"lcamtuf\" Zalewski and is maintained by Marc \"van Hauser\" Heuse <mh@mh-sec.de>, Heiko \"hexcoder-\" Eissfeldt <heiko.eissfeldt@hexco.de>, Andrea Fioraldi <andreafioraldi@gmail.com> and Dominik Maier <domenukk@gmail.com>" >> ../$@
-	@echo  The homepage of afl++ is: https://github.com/AFLplusplus/AFLplusplus >> ../$@
-	@echo >> ../$@
-	@echo .SH LICENSE >> ../$@
-	@echo Apache License Version 2.0, January 2004 >> ../$@
-	ln -sf afl-clang-fast.8 ../afl-clang-fast++.8
+	@echo .TH $* 8 $(BUILD_DATE) "afl++" > ./$@
+	@echo .SH NAME >> ./$@
+	@printf ".B $* \- " >> ./$@
+	@./$* -h 2>&1 | head -n 1 | sed -e "s/$$(printf '\e')[^m]*m//g" >> ./$@
+	@echo .B $* >> ./$@
+	@echo >> ./$@
+	@echo .SH SYNOPSIS >> ./$@
+	@./$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> ./$@
+	@echo >> ./$@
+	@echo .SH OPTIONS >> ./$@
+	@echo .nf >> ./$@
+	@./$* -h 2>&1 | tail -n +4 >> ./$@
+	@echo >> ./$@
+	@echo .SH AUTHOR >> ./$@
+	@echo "afl++ was written by Michal \"lcamtuf\" Zalewski and is maintained by Marc \"van Hauser\" Heuse <mh@mh-sec.de>, Heiko \"hexcoder-\" Eissfeldt <heiko.eissfeldt@hexco.de>, Andrea Fioraldi <andreafioraldi@gmail.com> and Dominik Maier <domenukk@gmail.com>" >> ./$@
+	@echo  The homepage of afl++ is: https://github.com/AFLplusplus/AFLplusplus >> ./$@
+	@echo >> ./$@
+	@echo .SH LICENSE >> ./$@
+	@echo Apache License Version 2.0, January 2004 >> ./$@
+	@ln -sf afl-cc.8 ./afl-c++.8
 ifneq "$(AFL_CLANG_FLTO)" ""
 ifeq "$(LLVM_LTO)" "1"
-	ln -sf afl-clang-fast.8 ../afl-clang-lto.8
-	ln -sf afl-clang-fast.8 ../afl-clang-lto++.8
+	@ln -sf afl-cc.8 ./afl-clang-lto.8
+	@ln -sf afl-cc.8 ./afl-clang-lto++.8
+	@ln -sf afl-cc.8 ./afl-lto.8
+	@ln -sf afl-cc.8 ./afl-lto++.8
 endif
 endif
 
+.PHONY: clean
 clean:
 	rm -f *.o *.so *~ a.out core core.[1-9][0-9]* .test2 test-instr .test-instr0 .test-instr1 *.dwo
-	rm -f $(PROGS) afl-common.o ../afl-clang-fast++ ../afl-clang-lto ../afl-clang-lto++ ../afl-clang*.8 ../ld ../afl-ld ../afl-llvm-rt*.o
+	rm -f $(PROGS) afl-common.o ./afl-c++ ./afl-lto ./afl-lto++ ./afl-clang-lto* ./afl-clang-fast* ./afl-clang*.8 ./ld ./afl-ld ./afl-llvm-rt*.o ./afl-compiler-rt*.o instrumentation/*.o
diff --git a/README.md b/README.md
index 6e5d9c1f..c886489d 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
   Release Version: [2.67c](https://github.com/AFLplusplus/AFLplusplus/releases)
 
-  Github Version: 2.67d
+  Github Version: 3.00a
 
   Repository: [https://github.com/AFLplusplus/AFLplusplus](https://github.com/AFLplusplus/AFLplusplus)
 
@@ -22,6 +22,26 @@
   afl++ is a superior fork to Google's afl - more speed, more and better
   mutations, more and better instrumentation, custom module support, etc.
 
+## Major changes in afl++ 3.0
+
+With afl++ 3.0 we introduced changes that break some previous afl and afl++
+behaviours:
+
+  * There are no llvm_mode and gcc_plugin subdirectories anymore and there is
+    only one compiler: afl-cc. All previous compilers now symlink to this one
+    compiler. All instrumentation source code is now in the `instrumentation/`
+    folder.
+  * qemu_mode got upgraded to QEMU 5.1; building it requires a current ninja
+    build tool version and python3 setuptools.
+    qemu_mode also got new options like snapshotting, instrumenting specific
+    shared libraries, etc., and QEMU 5.1 supports more CPU targets, so the
+    upgrade is worth it.
+  * When instrumenting targets, afl-cc no longer overrides optimization flags.
+    This allows you to fuzz targets exactly as they are built for debug or
+    release.
+  * afl-fuzz' `-i` option now descends into subdirectories.
+  * afl-fuzz will skip over empty dictionaries and overly large test cases
+    instead of failing.
+
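+As a quick, hypothetical sanity check on an installed afl++ 3.0 (the install
+prefix may differ on your system), the old compiler names now simply resolve
+to afl-cc:
+
+```shell
+$ readlink /usr/local/bin/afl-clang-fast
+afl-cc
+```
+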
 ## Contents
 
   1. [Features](#important-features-of-afl)
@@ -39,7 +59,7 @@
   with laf-intel and redqueen, unicorn mode, gcc plugin, full *BSD, Solaris and
   Android support and much, much, much more.
 
-  | Feature/Instrumentation  | afl-gcc | llvm_mode | gcc_plugin | qemu_mode        | unicorn_mode |
+  | Feature/Instrumentation  | afl-gcc | llvm      | gcc_plugin | qemu_mode        | unicorn_mode |
   | -------------------------|:-------:|:---------:|:----------:|:----------------:|:------------:|
   | NeverZero                | x86[_64]|     x(1)  |      (2)   |         x        |       x      |
   | Persistent Mode          |         |     x     |     x      | x86[_64]/arm[64] |       x      |
@@ -47,9 +67,8 @@
   | CmpLog                   |         |     x     |            | x86[_64]/arm[64] |              |
   | Selective Instrumentation|         |     x     |     x      |        (x)(3)    |              |
   | Non-Colliding Coverage   |         |     x(4)  |            |        (x)(5)    |              |
-  | InsTrim                  |         |     x     |            |                  |              |
   | Ngram prev_loc Coverage  |         |     x(6)  |            |                  |              |
-  | Context Coverage         |         |     x     |            |                  |              |
+  | Context Coverage         |         |     x(6)  |            |                  |              |
   | Auto Dictionary          |         |     x(7)  |            |                  |              |
   | Snapshot LKM Support     |         |     x     |            |        (x)(5)    |              |
 
@@ -59,11 +78,11 @@
   4. with pcguard mode and LTO mode for LLVM >= 11
   5. upcoming, development in the branch
   6. not compatible with LTO instrumentation and needs at least LLVM >= 4.1
-  7. only in LTO mode with LLVM >= 11
+  7. automatic in LTO mode with LLVM >= 11; an extra pass for all LLVM versions that writes a dictionary file to use with afl-fuzz' `-x`
 
   Among others, the following features and patches have been integrated:
 
-  * NeverZero patch for afl-gcc, llvm_mode, qemu_mode and unicorn_mode which prevents a wrapping map value to zero, increases coverage
+  * NeverZero patch for afl-gcc, instrumentation, qemu_mode and unicorn_mode which prevents a wrapping map value to zero, increases coverage
   * Persistent mode, deferred forkserver and in-memory fuzzing for qemu_mode
   * Unicorn mode which allows fuzzing of binaries from completely different platforms (integration provided by domenukk)
   * The new CmpLog instrumentation for LLVM and QEMU inspired by [Redqueen](https://www.syssec.ruhr-uni-bochum.de/media/emma/veroeffentlichungen/2018/12/17/NDSS19-Redqueen.pdf)
@@ -71,10 +90,9 @@
   * AFLfast's power schedules by Marcel Böhme: [https://github.com/mboehme/aflfast](https://github.com/mboehme/aflfast)
   * The MOpt mutator: [https://github.com/puppet-meteor/MOpt-AFL](https://github.com/puppet-meteor/MOpt-AFL)
   * LLVM mode Ngram coverage by Adrian Herrera [https://github.com/adrianherrera/afl-ngram-pass](https://github.com/adrianherrera/afl-ngram-pass)
-  * InsTrim, a CFG llvm_mode instrumentation implementation: [https://github.com/csienslab/instrim](https://github.com/csienslab/instrim)
   * C. Holler's afl-fuzz Python mutator module: [https://github.com/choller/afl](https://github.com/choller/afl)
   * Custom mutator by a library (instead of Python) by kyakdan
-  * LAF-Intel/CompCov support for llvm_mode, qemu_mode and unicorn_mode (with enhanced capabilities)
+  * LAF-Intel/CompCov support for instrumentation, qemu_mode and unicorn_mode (with enhanced capabilities)
   * Radamsa and honggfuzz mutators (as custom mutators).
   * QBDI mode to fuzz android native libraries via Quarkslab's [QBDI](https://github.com/QBDI/QBDI) framework
   * Frida and ptrace mode to fuzz binary-only libraries, etc.
@@ -88,7 +106,7 @@
   send a mail to <afl-users+subscribe@googlegroups.com>.
 
   See [docs/QuickStartGuide.md](docs/QuickStartGuide.md) if you don't have time to
-  read this file.
+  read this file - however this is not recommended!
 
 ## Branches
 
@@ -105,13 +123,14 @@
 
 ## Help wanted
 
-We are happy to be part of [Google Summer of Code 2020](https://summerofcode.withgoogle.com/organizations/5100744400699392/)! :-)
+We were happy to be part of [Google Summer of Code 2020](https://summerofcode.withgoogle.com/organizations/5100744400699392/)
+and we will try to participate again in 2021!
 
 We have several ideas we would like to see in AFL++ to make it even better.
 However, we already work on so many things that we do not have the time for
 all the big ideas.
 
-This can be your way to support and contribute to AFL++ - extend it to
+This can be your way to support and contribute to AFL++ - extend it to do
 something cool.
 
 We have an idea list in [docs/ideas.md](docs/ideas.md).
@@ -132,7 +151,7 @@ This image is automatically generated when a push to the stable repo happens.
 You will find your target source code in /src in the container.
 
 If you want to build afl++ yourself you have many options.
-The easiest is to build and install everything:
+The easiest choice is to build and install everything:
 
 ```shell
 sudo apt install build-essential libtool-bin python3-dev automake flex bison libglib2.0-dev libpixman-1-dev clang python3-setuptools llvm
@@ -142,9 +161,9 @@ sudo make install
 It is recommended to install the newest available gcc, clang and llvm-dev
 possible in your distribution!
 
-Note that "make distrib" also builds llvm_mode, qemu_mode, unicorn_mode and
+Note that "make distrib" also builds instrumentation, qemu_mode, unicorn_mode and
 more. If you just want plain afl++ then do "make all", however compiling and
-using at least llvm_mode is highly recommended for much better results -
+using at least instrumentation is highly recommended for much better results -
 hence in this case
 
 ```shell
@@ -156,7 +175,7 @@ These build targets exist:
 
 * all: just the main afl++ binaries
 * binary-only: everything for binary-only fuzzing: qemu_mode, unicorn_mode, libdislocator, libtokencap
-* source-only: everything for source code fuzzing: llvm_mode, libdislocator, libtokencap
+* source-only: everything for source code fuzzing: instrumentation, libdislocator, libtokencap
 * distrib: everything (for both binary-only and source code fuzzing)
 * man: creates simple man pages from the help option of the programs
 * install: installs everything you have compiled with the build options above
@@ -212,18 +231,19 @@ If you have a binary-only target please skip to [#Instrumenting binary-only apps
 
 Fuzzing source code is a three-step process.
 
-1. compile the target with a special compiler that prepares the target to be
+1. Compile the target with a special compiler that prepares the target to be
    fuzzed efficiently. This step is called "instrumenting a target".
 2. Prepare the fuzzing by selecting and optimizing the input corpus for the
    target.
-3. perform the fuzzing of the target by randomly mutating input and assessing
+3. Perform the fuzzing of the target by randomly mutating input and assessing
    if a generated input was processed in a new path in the target binary.
 
 ### 1. Instrumenting that target
 
 #### a) Selecting the best afl++ compiler for instrumenting the target
 
-afl++ comes with different compilers and instrumentation options.
+afl++ comes with a central compiler `afl-cc` that incorporates various
+compiler targets and instrumentation options.
 The following evaluation flow will help you to select the best possible.
 
 It is highly recommended to have the newest llvm version possible installed,
@@ -231,49 +251,62 @@ anything below 9 is not recommended.
 
 ```
 +--------------------------------+
-| clang/clang++ 11+ is available | --> use afl-clang-lto and afl-clang-lto++
-+--------------------------------+     see [llvm/README.lto.md](llvm/README.lto.md)
+| clang/clang++ 11+ is available | --> use LTO mode (afl-clang-lto/afl-clang-lto++)
++--------------------------------+     see [instrumentation/README.lto.md](instrumentation/README.lto.md)
     |
-    | if not, or if the target fails with afl-clang-lto/++
+    | if not, or if the target fails with LTO afl-clang-lto/++
     |
     v
 +---------------------------------+
-| clang/clang++ 3.3+ is available | --> use afl-clang-fast and afl-clang-fast++
-+---------------------------------+     see [llvm/README.md](llvm/README.md)
+| clang/clang++ 3.3+ is available | --> use LLVM mode (afl-clang-fast/afl-clang-fast++)
++---------------------------------+     see [instrumentation/README.md](instrumentation/README.md)
     |
-    | if not, or if the target fails with afl-clang-fast/++
+    | if not, or if the target fails with LLVM afl-clang-fast/++
     |
     v
  +--------------------------------+
- | if you want to instrument only | -> use afl-gcc-fast and afl-gcc-fast++
- | parts of the target            |    see [gcc_plugin/README.md](gcc_plugin/README.md) and
- +--------------------------------+    [gcc_plugin/README.instrument_list.md](gcc_plugin/README.instrument_list.md)
+ | if you want to instrument only | -> use GCC_PLUGIN mode (afl-gcc-fast/afl-g++-fast)
+ | parts of the target            |    see [instrumentation/README.gcc_plugin.md](instrumentation/README.gcc_plugin.md) and
+ +--------------------------------+    [instrumentation/README.instrument_list.md](instrumentation/README.instrument_list.md)
     |
     | if not, or if you do not have a gcc with plugin support
     |
     v
-   use afl-gcc and afl-g++ (or afl-clang and afl-clang++)
+   use GCC mode (afl-gcc/afl-g++) (or afl-clang/afl-clang++ for clang)
 ```
 
 Clickable README links for the chosen compiler:
 
-  * [afl-clang-lto](llvm/README.lto.md)
-  * [afl-clang-fast](llvm/README.md)
-  * [afl-gcc-fast](gcc_plugin/README.md)
-  * afl-gcc has no README as it has no features
+  * [LTO mode - afl-clang-lto](instrumentation/README.lto.md)
+  * [LLVM mode - afl-clang-fast](instrumentation/README.md)
+  * [GCC_PLUGIN mode - afl-gcc-fast](instrumentation/README.gcc_plugin.md)
+  * GCC mode (afl-gcc) has no README as it has no features of its own
+
+You can select the mode for the afl-cc compiler by:
+  1. passing the --afl-MODE command line option to the compiler via
+     CFLAGS/CXXFLAGS/CPPFLAGS
+  2. using a symlink to afl-cc: afl-gcc, afl-g++, afl-clang, afl-clang++,
+     afl-clang-fast, afl-clang-fast++, afl-clang-lto, afl-clang-lto++,
+     afl-gcc-fast, afl-g++-fast
+  3. setting the environment variable AFL_CC_COMPILER to MODE
+
+MODE can be one of: LTO (afl-clang-lto*), LLVM (afl-clang-fast*), GCC_PLUGIN
+(afl-g*-fast) or GCC (afl-gcc/afl-g++).
+
+Because no afl specific command-line options are accepted (besides the
+--afl-MODE option), the compile-time tools make fairly broad use of
+environment variables, which can be listed with `afl-cc -hh` or by reading
+[docs/env_variables.md](docs/env_variables.md).
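+
+A minimal sketch of the three selection methods (assuming the lowercase
+spelling of MODE for the command line option; target names are placeholders):
+
+```shell
+# 1. command line option:
+afl-cc --afl-lto -o fuzz_target fuzz_target.c
+# 2. symlink name:
+afl-clang-lto -o fuzz_target fuzz_target.c
+# 3. environment variable:
+AFL_CC_COMPILER=LTO afl-cc -o fuzz_target fuzz_target.c
+```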
 
 #### b) Selecting instrumentation options
 
-The following options are available when you instrument with afl-clang-fast or
-afl-clang-lto:
+The following options are available when you instrument with LLVM or LTO mode
+(afl-clang-fast/afl-clang-lto):
 
  * Splitting integer, string, float and switch comparisons so afl++ can easier
    solve these. This is an important option if you do not have a very good
    and large input corpus. This technique is called laf-intel or COMPCOV.
    To use this set the following environment variable before compiling the
    target: `export AFL_LLVM_LAF_ALL=1`
-   You can read more about this in [llvm/README.laf-intel.md](llvm/README.laf-intel.md)
- * A different technique (and usually a better than laf-intel) is to
+   You can read more about this in [instrumentation/README.laf-intel.md](instrumentation/README.laf-intel.md)
+ * A different technique (and usually a better one than laf-intel) is to
    instrument the target so that any compare values in the target are sent to
    afl++ which then tries to put these values into the fuzzing data at different
    locations. This technique is very fast and good - if the target does not
@@ -282,12 +315,13 @@ afl-clang-lto:
    If you want to use this technique, then you have to compile the target
    twice, once specifically with/for this mode, and pass this binary to afl-fuzz
    via the `-c` parameter.
-   Not that you can compile also just a cmplog binary and use that for both
-   however there will a performance penality.
-   You can read more about this in [llvm_mode/README.cmplog.md](llvm_mode/README.cmplog.md)
+   Note that you can also compile just a cmplog binary and use that for both,
+   however there will be a performance penalty. (A build sketch for both
+   techniques follows after this list.)
+   You can read more about this in [instrumentation/README.cmplog.md](instrumentation/README.cmplog.md)
 
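+As a rough sketch of both techniques (target names and paths are
+placeholders):
+
+```shell
+# laf-intel / COMPCOV: split comparisons at compile time
+AFL_LLVM_LAF_ALL=1 afl-cc -o fuzz_target fuzz_target.c
+# CmpLog: build a second, specially instrumented binary ...
+AFL_LLVM_CMPLOG=1 afl-cc -o fuzz_target.cmplog fuzz_target.c
+# ... and pass it to afl-fuzz via -c
+afl-fuzz -i input -o output -c ./fuzz_target.cmplog -- ./fuzz_target @@
+```
+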
-If you use afl-clang-fast, afl-clang-lto or afl-gcc-fast you have the option to
-selectively only instrument parts of the target that you are interested in:
+If you use LTO, LLVM or GCC_PLUGIN mode (afl-clang-fast/afl-clang-lto/afl-gcc-fast)
+you have the option to selectively instrument only the parts of the target
+that you are interested in:
 
  * To instrument only those parts of the target that you are interested in
    create a file with all the filenames of the source code that should be
@@ -299,29 +333,29 @@ selectively only instrument parts of the target that you are interested in:
    `export AFL_LLVM_DENYLIST=denylist.txt` - depending on if you want per
    default to instrument unless noted (DENYLIST) or not perform instrumentation
    unless requested (ALLOWLIST).
-   **NOTE:** In optimization functions might be inlined and then not match!
-   see [llvm_mode/README.instrument_list.md](llvm_mode/README.instrument_list.md)
+   **NOTE:** During optimization, functions might be inlined and would then no longer match!
+   See [instrumentation/README.instrument_list.md](instrumentation/README.instrument_list.md)
    For afl-clang-fast > 6.0 or if PCGUARD instrumentation is used then use the
    llvm sancov allow-list feature: [http://clang.llvm.org/docs/SanitizerCoverage.html](http://clang.llvm.org/docs/SanitizerCoverage.html)
    The llvm sancov format works with the allowlist/denylist feature of afl++
-   however afl++ is more flexible in the format.
+   however afl++'s format is more flexible.
 
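+A short sketch of the allowlist workflow (all filenames are placeholders):
+
+```shell
+# instrument only the listed source files
+printf "parser.c\ndecode.c\n" > allowlist.txt
+export AFL_LLVM_ALLOWLIST=$(pwd)/allowlist.txt
+afl-cc -o fuzz_target fuzz_target.c parser.c decode.c
+```
+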
 There are many more options and modes available however these are most of the
 time less effective. See:
- * [llvm_mode/README.ctx.md](llvm_mode/README.ctx.md)
- * [llvm_mode/README.ngram.md](llvm_mode/README.ngram.md)
- * [llvm_mode/README.instrim.md](llvm_mode/README.instrim.md)
+ * [instrumentation/README.ctx.md](instrumentation/README.ctx.md)
+ * [instrumentation/README.ngram.md](instrumentation/README.ngram.md)
+ * [instrumentation/README.instrim.md](instrumentation/README.instrim.md)
 
-afl++ employs never zero counting in its bitmap. You can read more about this
+afl++ performs "never zero" counting in its bitmap. You can read more about this
 here:
- * [llvm_mode/README.neverzero.md](llvm_mode/README.neverzero.md)
+ * [instrumentation/README.neverzero.md](instrumentation/README.neverzero.md)
 
 #### c) Modify the target
 
 If the target has features that make fuzzing more difficult, e.g.
 checksums, HMAC, etc. then modify the source code so that this is
 removed.
-This can even be done for productional source code be eliminating
+This can even be done for production source code by eliminating
 these checks within this specific defines:
 
 ```
@@ -332,13 +366,15 @@ these checks within this specific defines:
 #endif
 ```
 
+All afl++ compilers will set this preprocessor definition automatically.
+
 #### d) Instrument the target
 
 In this step the target source code is compiled so that it can be fuzzed.
 
 Basically you have to tell the target build system that the selected afl++
 compiler is used. Also - if possible - you should always configure the
-build system that the target is compiled statically and not dynamically.
+build system such that the target is compiled statically and not dynamically.
 How to do this is described below.
 
 Then build the target. (Usually with `make`)
@@ -349,20 +385,22 @@ For `configure` build systems this is usually done by:
 `CC=afl-clang-fast CXX=afl-clang-fast++ ./configure --disable-shared`
 
 Note that if you are using the (better) afl-clang-lto compiler you also have to
-set AR to llvm-ar[-VERSION] and RANLIB to llvm-ranlib[-VERSION] - as it is
-described in [llvm/README.lto.md](llvm/README.lto.md)
+set AR to llvm-ar[-VERSION] and RANLIB to llvm-ranlib[-VERSION] - as is
+described in [instrumentation/README.lto.md](instrumentation/README.lto.md).
 
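+A hedged sketch for an LTO configure build (the llvm tools may carry a
+version suffix on your system):
+
+```shell
+CC=afl-clang-lto CXX=afl-clang-lto++ AR=llvm-ar RANLIB=llvm-ranlib \
+  ./configure --disable-shared
+```
+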
 ##### cmake
 
-For `configure` build systems this is usually done by:
-`mkdir build; cd build; CC=afl-clang-fast CXX=afl-clang-fast++ cmake ..`
-
-Some cmake scripts require something like `-DCMAKE_CC=... -DCMAKE_CXX=...`
-or `-DCMAKE_C_COMPILER=... DCMAKE_CPP_COMPILER=...` instead.
+For `cmake` build systems this is usually done by:
+`mkdir build; cd build; cmake -DCMAKE_C_COMPILER=afl-cc -DCMAKE_CXX_COMPILER=afl-c++ ..`
 
 Note that if you are using the (better) afl-clang-lto compiler you also have to
-set AR to llvm-ar[-VERSION] and RANLIB to llvm-ranlib[-VERSION] - as it is
-described in [llvm/README.lto.md](llvm/README.lto.md)
+set AR to llvm-ar[-VERSION] and RANLIB to llvm-ranlib[-VERSION] - as is
+described in [instrumentation/README.lto.md](instrumentation/README.lto.md).
+
+##### meson
+
+For meson you have to set the afl++ compiler with the very first command!
+`CC=afl-cc CXX=afl-c++ meson`
 
 ##### other build systems or if configure/cmake didn't work
 
@@ -370,7 +408,7 @@ Sometimes cmake and configure do not pick up the afl++ compiler, or the
 ranlib/ar that is needed - because this was just not foreseen by the developer
 of the target. Or they have non-standard options. Figure out if there is a 
 non-standard way to set this, otherwise set up the build normally and edit the
-generated build environment afterwards manually to point to the right compiler
+generated build environment afterwards manually to point it to the right compiler
 (and/or ranlib and ar).
 
 #### d) Better instrumentation
@@ -383,12 +421,12 @@ This requires the usage of afl-clang-lto or afl-clang-fast.
 This is the so-called `persistent mode`, which is much, much faster but
 requires that you code a source file that is specifically calling the target
 functions that you want to fuzz, plus a few specific afl++ functions around
-it. See [llvm_mode/README.persistent_mode.md](llvm_mode/README.persistent_mode.md) for details.
+it. See [instrumentation/README.persistent_mode.md](instrumentation/README.persistent_mode.md) for details.
 
 Basically if you do not fuzz a target in persistent mode then you are just
 doing it for a hobby and not professionally :-)
 
-### 2. Preparing the fuzzing
+### 2. Preparing the fuzzing campaign
 
 As you fuzz the target with mutated input, having as diverse inputs for the
 target as possible improves the efficiency a lot.
@@ -401,7 +439,7 @@ reported bugs, test suites, random downloads from the internet, unit test
 case data - from all kind of PNG software.
 
 If the input format is not known, you can also modify a target program to write
-away normal data it receives and processes to a file and use these.
+the data it receives and processes to a file, and use those files as seeds.
 
 #### b) Making the input corpus unique
 
@@ -415,7 +453,7 @@ the run afl-cmin like this:
 `afl-cmin -i INPUTS -o INPUTS_UNIQUE -- bin/target -d @@`
 Note that the INPUTFILE argument that the target program would read from has to be set as `@@`.
 
-If the target reads from stdin instead, just omit  the `@@` as this is the
+If the target reads from stdin instead, just omit the `@@` as this is the
 default.
 
 #### c) Minimizing all corpus files
@@ -432,7 +470,7 @@ for i in *; do
 done
 ```
 
-This can also be parallelized, e.g. with `parallel`
+This step can also be parallelized, e.g. with `parallel`.
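+
+A hedged sketch with GNU parallel (paths are placeholders):
+
+```shell
+mkdir -p ../INPUTS_MINIMIZED
+ls -- * | parallel afl-tmin -i {} -o ../INPUTS_MINIMIZED/{} -- bin/target -d @@
+```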
 
 #### Done!
 
@@ -456,7 +494,7 @@ before the start of afl-fuzz as this improves performance by a x2 speed increase
 
 #### a) Running afl-fuzz
 
-Before to do even a test run of afl-fuzz execute `sudo afl-system-config` (on
+Before you do even a test run of afl-fuzz execute `sudo afl-system-config` (on
 the host if you execute afl-fuzz in a docker container). This reconfigures the
 system for optimal speed - which afl-fuzz checks and bails otherwise.
 Set `export AFL_SKIP_CPUFREQ=1` for afl-fuzz to skip this check if you cannot
@@ -588,7 +626,7 @@ then terminate it. The main node will pick it up and make it available to the
 other secondary nodes over time. Set `export AFL_NO_AFFINITY=1` if you have no
 free core.
 
-Note that you in nearly all cases you can never reach full coverage. A lot of
+Note that in nearly all cases you can never reach full coverage. A lot of
 functionality is usually behind options that were not activated or fuzz e.g.
 if you fuzz a library to convert image formats and your target is the png to
 tiff API then you will not touch any of the other library APIs and features.
@@ -607,7 +645,7 @@ switch or honggfuzz.
 
 #### f) Improve the speed!
 
- * Use [persistent mode](llvm_mode/README.persistent_mode.md) (x2-x20 speed increase)
+ * Use [persistent mode](instrumentation/README.persistent_mode.md) (x2-x20 speed increase)
  * If you do not use shmem persistent mode, use `AFL_TMPDIR` to point the input file on a tempfs location, see [docs/env_variables.md](docs/env_variables.md)
  * Linux: Use the [afl++ snapshot module](https://github.com/AFLplusplus/AFL-Snapshot-LKM) (x2 speed increase)
  * Linux: Improve kernel performance: modify `/etc/default/grub`, set `GRUB_CMDLINE_LINUX_DEFAULT="ibpb=off ibrs=off kpti=off l1tf=off mds=off mitigations=off no_stf_barrier noibpb noibrs nopcid nopti nospec_store_bypass_disable nospectre_v1 nospectre_v2 pcid=off pti=off spec_store_bypass_disable=off spectre_v2=off stf_barrier=off"`; then `update-grub` and `reboot` (warning: makes the system more insecure)
diff --git a/TODO.md b/TODO.md
index 52065bb0..bb420518 100644
--- a/TODO.md
+++ b/TODO.md
@@ -7,7 +7,6 @@
  - afl-plot to support multiple plot_data
  - afl_custom_fuzz_splice_optin()
  - intel-pt tracer
- - honor -O flags and -fno-unroll-loops in afl-cc
 
 ## Further down the road
 
@@ -22,7 +21,6 @@ gcc_plugin:
  - (wait for submission then decide)
 
 qemu_mode:
- - update to 5.x (if the performance bug is gone)
  - non colliding instrumentation
  - rename qemu specific envs to AFL_QEMU (AFL_ENTRYPOINT, AFL_CODE_START/END,
    AFL_COMPCOV_LEVEL?)
diff --git a/afl-whatsup b/afl-whatsup
index abcddbf1..e92b24bd 100755
--- a/afl-whatsup
+++ b/afl-whatsup
@@ -99,7 +99,7 @@ fi
 fmt_duration()
 {
   DUR_STRING=
-  if [ $1 -eq 0 ]; then
+  if [ $1 -le 0 ]; then
     return 1
   fi
 
@@ -109,7 +109,11 @@ fmt_duration()
   local minutes=$(((duration / 60) % 60))
   local seconds=$((duration % 60))
 
-  if [ $days -gt 0 ]; then
+  if [ $duration -le 0 ]; then
+    DUR_STRING="0 seconds"
+  elif [ $duration -eq 1 ]; then
+    DUR_STRING="1 second"
+  elif [ $days -gt 0 ]; then
     DUR_STRING="$days days, $hours hours"
   elif [ $hours -gt 0 ]; then
     DUR_STRING="$hours hours, $minutes minutes"
diff --git a/custom_mutators/README.md b/custom_mutators/README.md
index 3abcfef3..993ccaa1 100644
--- a/custom_mutators/README.md
+++ b/custom_mutators/README.md
@@ -3,6 +3,22 @@
 Custom mutators enhance and alter the mutation strategies of afl++.
 For further information and documentation on how to write your own, read [the docs](../docs/custom_mutators.md).
 
+## The afl++ Grammar Mutator
+
+If you use git to clone afl++, then the following will incorporate our
+excellent grammar custom mutator:
+```
+git submodule init
+git submodule update
+```
+
+otherwise just check out the repository with either
+`git clone https://github.com/AFLplusplus/Grammar-Mutator` or
+`svn co https://github.com/AFLplusplus/Grammar-Mutator`.
+
+Read the [Grammar-Mutator/README.md](Grammar-Mutator/README.md) on how to use
+it.
+
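+As a rough sketch of wiring it up (the library name and path depend on how
+you build the Grammar-Mutator; see its README):
+
+```
+export AFL_CUSTOM_MUTATOR_LIBRARY=/path/to/libgrammarmutator.so
+afl-fuzz -i in -o out -- ./target @@
+```
+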
 ## Production-Ready Custom Mutators
 
 This directory holds ready to use custom mutators.
diff --git a/docs/Changelog.md b/docs/Changelog.md
index 72c8952c..9de03e78 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -9,23 +9,43 @@ Want to stay in the loop on major new features? Join our mailing list by
 sending a mail to <afl-users+subscribe@googlegroups.com>.
 
 
-### Version ++2.67d (develop)
+### Version ++3.00a (develop)
+  - llvm_mode/ and gcc_plugin/ moved to instrumentation/
+  - all compilers combined to afl-cc which emulates the previous ones
+  - afl-llvm/gcc-rt.o merged into afl-compiler-rt.o
+  - afl-fuzz
+    - reading testcases from -i now descends into subdirectories
+    - allow up to 4 -x command line options
+    - loaded extras now have duplicate protection
+  - instrumentation
+    - new llvm pass: dict2file via AFL_LLVM_DICT2FILE, creates an afl-fuzz
+      -x dictionary of string comparisons found during compilation
+    - not overriding -Ox or -fno-unroll-loops anymore
+
+
+### Version ++2.68c (release)
+  - added the GSoC excellent afl++ grammar mutator by Shengtuo to our
+    custom_mutators/ (see custom_mutators/README.md) - or get it here:
+    https://github.com/AFLplusplus/Grammar-Mutator
   - a few QOL changes for Apple and its outdated gmake
   - afl-fuzz:
-    - Fix for auto dictionary entries found during fuzzing to not throw out
+    - fix for auto dictionary entries found during fuzzing to not throw out
       a -x dictionary
     - added total execs done to plot file
-    - AFL_MAX_DET_EXTRAS env variable added to control the amount of deterministic
-      dict entries without recompiling.
-    - AFL_FORKSRV_INIT_TMOUT env variable added to control the time to wait for
-      the forkserver to come up without the need to increase the overall timeout.
+    - AFL_MAX_DET_EXTRAS env variable added to control the amount of
+      deterministic dict entries without recompiling.
+    - AFL_FORKSRV_INIT_TMOUT env variable added to control the time to wait
+      for the forkserver to come up without the need to increase the overall
+      timeout.
     - bugfix for cmplog that results in a heap overflow based on target data
       (thanks to the magma team for reporting!)
+    - write fuzzing setup into out/fuzzer_setup (environment variables and
+      command line)
   - custom mutators:
-    - added afl_custom_fuzz_count/fuzz_count function to allow specifying the 
-      number of fuzz attempts for custom_fuzz
+    - added afl_custom_fuzz_count/fuzz_count function to allow specifying
+      the number of fuzz attempts for custom_fuzz
   - llvm_mode:
-    - Ported SanCov to LTO, and made it the default for LTO. better
+    - ported SanCov to LTO, and made it the default for LTO. better
       instrumentation locations
     - Further llvm 12 support (fast moving target like afl++ :-) )
     - deprecated LLVM SKIPSINGLEBLOCK env environment
diff --git a/docs/FAQ.md b/docs/FAQ.md
index 93a87a72..24942492 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -37,8 +37,8 @@ e.g. [Fuzzbench Report](https://www.fuzzbench.com/reports/2020-08-03/index.html)
 
 ## How to improve the fuzzing speed
 
-  1. use [llvm_mode](docs/llvm_mode/README.md): afl-clang-lto (llvm >= 11) or afl-clang-fast (llvm >= 9 recommended)
-  2. Use [persistent mode](llvm_mode/README.persistent_mode.md) (x2-x20 speed increase)
+  1. Use [instrumentation](../instrumentation/README.llvm.md): afl-clang-lto (llvm >= 11) or afl-clang-fast (llvm >= 9 recommended)
+  2. Use [persistent mode](../instrumentation/README.persistent_mode.md) (x2-x20 speed increase)
   3. Use the [afl++ snapshot module](https://github.com/AFLplusplus/AFL-Snapshot-LKM) (x2 speed increase)
   4. If you do not use shmem persistent mode, use `AFL_TMPDIR` to point the input file on a tempfs location, see [docs/env_variables.md](docs/env_variables.md)
   5. Improve kernel performance: modify `/etc/default/grub`, set `GRUB_CMDLINE_LINUX_DEFAULT="ibpb=off ibrs=off kpti=off l1tf=off mds=off mitigations=off no_stf_barrier noibpb noibrs nopcid nopti nospec_store_bypass_disable nospectre_v1 nospectre_v2 pcid=off pti=off spec_store_bypass_disable=off spectre_v2=off stf_barrier=off"`; then `update-grub` and `reboot` (warning: makes the system more insecure)
@@ -55,7 +55,7 @@ which is totally unsupported by most coverage aware fuzzers.
 
 The established method to fuzz network services is to modify the source code
 to read from a file or stdin (fd 0) (or even faster via shared memory, combine
-this with persistent mode [llvm_mode/README.persistent_mode.md](llvm_mode/README.persistent_mode.md)
+this with persistent mode [instrumentation/README.persistent_mode.md](../instrumentation/README.persistent_mode.md)
 and you have a performance gain of x10 instead of a performance loss of over
 x10 - that is a x100 difference!
 
@@ -180,10 +180,10 @@ afl-clang-fast PCGUARD and afl-clang-lto LTO instrumentation.
 
      b) For PCGUARD instrumented binaries it is much more difficult. Here you
         can either modify the __sanitizer_cov_trace_pc_guard function in
-        llvm_mode/afl-llvm-rt.o.c to write a backtrace to a file if the ID in
+        instrumentation/afl-compiler-rt.o.c to write a backtrace to a file if the ID in
         __afl_area_ptr[*guard] is one of the unstable edge IDs.
         (Example code is already there).
-        Then recompile and reinstall llvm_mode and rebuild your target.
+        Then recompile and reinstall the instrumentation and rebuild your target.
         Run the recompiled target with afl-fuzz for a while and then check the
         file that you wrote with the backtrace information.
         Alternatively you can use `gdb` to hook __sanitizer_cov_trace_pc_guard_init
@@ -200,7 +200,7 @@ afl-clang-fast PCGUARD and afl-clang-lto LTO instrumentation.
      remove from instrumentation, or just specify the functions you want to
      skip instrumenting. Note that optimization might inline functions!
 
-     Simply follow this document on how to do this: [llvm_mode/README.instrument_list.md](llvm_mode/README.instrument_list.md)
+     Simply follow this document on how to do this: [instrumentation/README.instrument_list.md](../instrumentation/README.instrument_list.md)
      If PCGUARD is used, then you need to follow this guide (needs llvm 12+!):
      [http://clang.llvm.org/docs/SanitizerCoverage.html#partially-disabling-instrumentation](http://clang.llvm.org/docs/SanitizerCoverage.html#partially-disabling-instrumentation)
 
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
index 766f24d7..fb7b5642 100644
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@@ -24,7 +24,7 @@ There are no special dependencies to speak of; you will need GNU make and a
 working compiler (gcc or clang). Some of the optional scripts bundled with the
 program may depend on bash, gdb, and similar basic tools.
 
-If you are using clang, please review llvm_mode/README.md; the LLVM
+If you are using clang, please review README.llvm.md; the LLVM
 integration mode can offer substantial performance gains compared to the
 traditional approach.
 
@@ -52,10 +52,10 @@ sudo gmake install
 Keep in mind that if you are using csh as your shell, the syntax of some of the
 shell commands given in the README.md and other docs will be different.
 
-The `llvm_mode` requires a dynamically linked, fully-operational installation of
+The `llvm` instrumentation requires a dynamically linked, fully-operational installation of
 clang. At least on FreeBSD, the clang binaries are static and do not include
 some of the essential tools, so if you want to make it work, you may need to
-follow the instructions in llvm_mode/README.md.
+follow the instructions in README.llvm.md.
 
 Beyond that, everything should work as advertised.
 
@@ -97,27 +97,24 @@ and definitely don't look POSIX-compliant. This means two things:
 User emulation mode of QEMU does not appear to be supported on MacOS X, so
 black-box instrumentation mode (`-Q`) will not work.
 
-The llvm_mode requires a fully-operational installation of clang. The one that
+The llvm instrumentation requires a fully-operational installation of clang. The one that
 comes with Xcode is missing some of the essential headers and helper tools.
-See llvm_mode/README.md for advice on how to build the compiler from scratch.
+See README.llvm.md for advice on how to build the compiler from scratch.
 
 ## 4. Linux or *BSD on non-x86 systems
 
 Standard build will fail on non-x86 systems, but you should be able to
 leverage two other options:
 
-  - The LLVM mode (see llvm_mode/README.md), which does not rely on
+  - The LLVM mode (see README.llvm.md), which does not rely on
     x86-specific assembly shims. It's fast and robust, but requires a
     complete installation of clang.
   - The QEMU mode (see qemu_mode/README.md), which can be also used for
     fuzzing cross-platform binaries. It's slower and more fragile, but
     can be used even when you don't have the source for the tested app.
 
-If you're not sure what you need, you need the LLVM mode. To get it, try:
-
-```bash
-AFL_NO_X86=1 gmake && gmake -C llvm_mode
-```
+If you're not sure what you need, you need the LLVM mode, which is built by
+default.
 
 ...and compile your target program with afl-clang-fast or afl-clang-fast++
 instead of the traditional afl-gcc or afl-clang wrappers.
diff --git a/docs/env_variables.md b/docs/env_variables.md
index c47d10e8..9d289f6d 100644
--- a/docs/env_variables.md
+++ b/docs/env_variables.md
@@ -5,13 +5,25 @@
   users or for some types of custom fuzzing setups. See README.md for the general
   instruction manual.
 
-## 1) Settings for afl-gcc, afl-clang, and afl-as - and gcc_plugin afl-gcc-fast
+## 1) Settings for all compilers
 
-Because they can't directly accept command-line options, the compile-time
-tools make fairly broad use of environmental variables:
+Starting with afl++ 3.0 there is only one compiler: afl-cc.
+The instrumentation mode can be selected by
+  1. passing the --afl-MODE command line option to the compiler
+  2. using a symlink to afl-cc: afl-gcc, afl-g++, afl-clang, afl-clang++,
+     afl-clang-fast, afl-clang-fast++, afl-clang-lto, afl-clang-lto++,
+     afl-gcc-fast, afl-g++-fast
+  3. setting the environment variable AFL_CC_COMPILER to MODE
 
-  - Most afl tools do not print any output if stdout/stderr are redirected.
-    If you want to save the output in a file then set the AFL_DEBUG
+MODE can be one of LTO (afl-clang-lto*), LLVM (afl-clang-fast*), GCC_PLUGIN
+(afl-g*-fast) or GCC (afl-gcc/afl-g++).
+
+Because besides the --afl-MODE option no afl specific command-line options
+are accepted, the compile-time tools make fairly broad use of environment
+variables:
+
+  - Most afl tools do not print any output if stdout/stderr are redirected.
+    If you want to save the output to a file then set the AFL_DEBUG
     environment variable.
     This is sadly necessary for various build processes which fail otherwise.
 
@@ -24,6 +36,8 @@ tools make fairly broad use of environmental variables:
     will cause problems in programs built with -Werror, simply because -O3
     enables more thorough code analysis and can spew out additional warnings.
     To disable optimizations, set AFL_DONT_OPTIMIZE.
+    However if -O... and/or -fno-unroll-loops are set, these are not
+    overridden.
 
   - Setting AFL_USE_ASAN automatically enables ASAN, provided that your
     compiler supports that. Note that fuzzing with ASAN is mildly challenging
@@ -44,7 +58,7 @@ tools make fairly broad use of environmental variables:
     you instrument hand-written assembly when compiling clang code by plugging
     a normalizer into the chain. (There is no equivalent feature for GCC.)
 
-  - Setting AFL_INST_RATIO to a percentage between 0% and 100% controls the
+  - Setting AFL_INST_RATIO to a percentage between 0 and 100% controls the
     probability of instrumenting every branch. This is (very rarely) useful
     when dealing with exceptionally complex programs that saturate the output
     bitmap. Examples include v8, ffmpeg, and perl.
@@ -55,19 +69,16 @@ tools make fairly broad use of environmental variables:
     Setting AFL_INST_RATIO to 0 is a valid choice. This will instrument only
     the transitions between function entry points, but not individual branches.
 
+    Note that this is an outdated variable. A few instances (e.g. afl-gcc)
+    still support it, but state-of-the-art modes (e.g. LLVM LTO and LLVM
+    PCGUARD) do not need it.
+
   - AFL_NO_BUILTIN causes the compiler to generate code suitable for use with
     libtokencap.so (but perhaps running a bit slower than without the flag).
 
   - TMPDIR is used by afl-as for temporary files; if this variable is not set,
     the tool defaults to /tmp.
 
-  - Setting AFL_KEEP_ASSEMBLY prevents afl-as from deleting instrumented
-    assembly files. Useful for troubleshooting problems or understanding how
-    the tool works. To get them in a predictable place, try something like:
-
-    mkdir assembly_here
-    TMPDIR=$PWD/assembly_here AFL_KEEP_ASSEMBLY=1 make clean all
-
   - If you are a weird person that wants to compile and instrument asm
     text files then use the AFL_AS_FORCE_INSTRUMENT variable:
       AFL_AS_FORCE_INSTRUMENT=1 afl-gcc foo.s -o foo
@@ -78,19 +89,24 @@ tools make fairly broad use of environmental variables:
   - Setting AFL_CAL_FAST will speed up the initial calibration, if the
     application is very slow
 
-## 2) Settings for afl-clang-fast / afl-clang-fast++ / afl-gcc-fast / afl-g++-fast
+## 2) Settings for LLVM and LTO: afl-clang-fast / afl-clang-fast++ / afl-clang-lto / afl-clang-lto++
 
-The native instrumentation helpers (llvm_mode and gcc_plugin) accept a subset
+The native instrumentation helpers (LLVM modes and gcc_plugin) accept a subset
 of the settings discussed in section #1, with the exception of:
 
+  - LLVM modes support `AFL_LLVM_DICT2FILE=/absolute/path/file.txt` which will
+    write all constant string comparisons to this file, to be used with
+    afl-fuzz' `-x` option (a sketch follows after this list).
+
   - AFL_AS, since this toolchain does not directly invoke GNU as.
 
   - TMPDIR and AFL_KEEP_ASSEMBLY, since no temporary assembly files are
     created.
 
-  - AFL_INST_RATIO, as we by default use collision free instrumentation.
+  - AFL_INST_RATIO, as by default collision free instrumentation is used.
+    Not all passes support this option, though, as it is an outdated feature.
 
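+A hedged sketch of the dict2file workflow (paths are placeholders):
+
+    AFL_LLVM_DICT2FILE=/tmp/target.dict afl-clang-fast -o target target.c
+    afl-fuzz -i in -o out -x /tmp/target.dict -- ./target
+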
-Then there are a few specific features that are only available in llvm_mode:
+Then there are a few specific features that are only available in the instrumentation modes:
 
 ### Select the instrumentation mode
 
@@ -121,7 +137,7 @@ Then there are a few specific features that are only available in llvm_mode:
 
     None of the following options are necessary to be used and are rather for
     manual use (which only ever the author of this LTO implementation will use).
-    These are used if several seperated instrumentations are performed which
+    These are used if several separate instrumentation runs are performed which
     are then later combined.
 
    - AFL_LLVM_DOCUMENT_IDS=file will document to a file which edge ID was given
@@ -136,7 +152,7 @@ Then there are a few specific features that are only available in llvm_mode:
    - AFL_LLVM_LTO_DONTWRITEID prevents that the highest location ID written
      into the instrumentation is set in a global variable
 
-    See llvm_mode/README.LTO.md for more information.
+    See instrumentation/README.LTO.md for more information.
 
 ### INSTRIM
 
@@ -154,7 +170,7 @@ Then there are a few specific features that are only available in llvm_mode:
       afl-fuzz will only be able to see the path the loop took, but not how
       many times it was called (unless it is a complex loop).
 
-    See llvm_mode/README.instrim.md
+    See instrumentation/README.instrim.md
 
 ### NGRAM
 
@@ -165,7 +181,7 @@ Then there are a few specific features that are only available in llvm_mode:
       config.h to at least 18 and maybe up to 20 for this as otherwise too
       many map collisions occur.
 
-    See llvm_mode/README.ctx.md
+    See instrumentation/README.ctx.md
 
 ### CTX
 
@@ -176,7 +192,7 @@ Then there are a few specific features that are only available in llvm_mode:
       config.h to at least 18 and maybe up to 20 for this as otherwise too
       many map collisions occur.
 
-    See llvm_mode/README.ngram.md
+    See instrumentation/README.ngram.md
 
 ### LAF-INTEL
 
@@ -196,17 +212,17 @@ Then there are a few specific features that are only available in llvm_mode:
 
     - Setting AFL_LLVM_LAF_ALL sets all of the above
 
-    See llvm_mode/README.laf-intel.md for more information.
+    See instrumentation/README.laf-intel.md for more information.
 
 ### INSTRUMENT LIST (selectively instrument files and functions)
 
-    This feature allows selective instrumentation of the source
+    This feature allows selective instrumentation of the source
 
     - Setting AFL_LLVM_ALLOWLIST or AFL_LLVM_DENYLIST with a filenames and/or
       function will only instrument (or skip) those files that match the names
       listed in the specified file.
 
-    See llvm_mode/README.instrument_list.md for more information.
+    See instrumentation/README.instrument_list.md for more information.
 
 ### NOT_ZERO
 
@@ -220,27 +236,34 @@ Then there are a few specific features that are only available in llvm_mode:
       test. If the target performs only few loops then this will give a
       small performance boost.
 
-    See llvm_mode/README.neverzero.md
+    See instrumentation/README.neverzero.md
 
 ### CMPLOG
 
     - Setting AFL_LLVM_CMPLOG=1 during compilation will tell afl-clang-fast to
-      produce a CmpLog binary. See llvm_mode/README.cmplog.md
+      produce a CmpLog binary. See instrumentation/README.cmplog.md
 
-    See llvm_mode/README.neverzero.md
+    See instrumentation/README.neverzero.md
 
-Then there are a few specific features that are only available in the gcc_plugin:
+## 3) Settings for GCC / GCC_PLUGIN modes
 
-### INSTRUMENT_FILE
+There are a few specific features that are only available in GCC and
+GCC_PLUGIN mode.
 
-    This feature allows selective instrumentation of the source
+  - Setting AFL_KEEP_ASSEMBLY prevents afl-as from deleting instrumented
+    assembly files. Useful for troubleshooting problems or understanding how
+    the tool works. (GCC mode only)
+    To get them in a predictable place, try something like:
 
-    - Setting AFL_GCC_INSTRUMENT_FILE with a filename will only instrument those
-      files that match the names listed in this file (one filename per line).
+    mkdir assembly_here
+    TMPDIR=$PWD/assembly_here AFL_KEEP_ASSEMBLY=1 make clean all
 
+  - Setting AFL_GCC_INSTRUMENT_FILE with a filename will only instrument those
+    files that match the names listed in this file (one filename per line).
     See gcc_plugin/README.instrument_list.md for more information.
+    (GCC_PLUGIN mode only)
 
-## 3) Settings for afl-fuzz
+## 4) Settings for afl-fuzz
 
 The main fuzzer binary accepts several options that disable a couple of sanity
 checks or alter some of the more exotic semantics of the tool:
@@ -278,14 +301,6 @@ checks or alter some of the more exotic semantics of the tool:
     don't want AFL to spend too much time classifying that stuff and just
     rapidly put all timeouts in that bin.
 
-  - Setting AFL_FORKSRV_INIT_TMOUT allows yout to specify a different timeout
-    to wait for the forkserver to spin up. The default is the `-t` value times
-    `FORK_WAIT_MULT` from `config.h` (usually 10), so for a `-t 100`, the
-    default would wait `1000` milis. Setting a different time here is useful
-    if the target has a very slow startup time, for example when doing
-    full-system fuzzing or emulation, but you don't want the actual runs
-    to wait too long for timeouts.
-
   - AFL_NO_ARITH causes AFL to skip most of the deterministic arithmetics.
     This can be useful to speed up the fuzzing of text-based file formats.
 
@@ -377,22 +392,12 @@ checks or alter some of the more exotic semantics of the tool:
     Note that this setting inhibits some of the user-friendly diagnostics
     normally done when starting up the forkserver and causes a pretty
     significant performance drop.
-  
-  - Setting AFL_MAX_DET_EXTRAS changes the count of dictionary entries/extras
-    (default 200), after which the entries will be used probabilistically.
-    So, if the dict/extras file (`-x`) contains more tokens than this threshold,
-    not all of the tokens will be used in each fuzzing step, every time.
-    Instead, there is a chance that the entry will be skipped during fuzzing.
-    This makes sure that the fuzzer doesn't spend all its time only inserting
-    the extras, but will still do other mutations. However, it decreases the
-    likelihood for each token to be inserted, before the next queue entry is fuzzed.
-    Either way, all tokens will be used eventually, in a longer fuzzing campaign.
 
   - Outdated environment variables that are that not supported anymore:
     AFL_DEFER_FORKSRV
     AFL_PERSISTENT
 
-## 4) Settings for afl-qemu-trace
+## 5) Settings for afl-qemu-trace
 
 The QEMU wrapper used to instrument binary-only code supports several settings:
 
@@ -446,7 +451,7 @@ The QEMU wrapper used to instrument binary-only code supports several settings:
     stack pointer in which QEMU can find the return address when `start addr` is
     hitted.
 
-## 5) Settings for afl-cmin
+## 6) Settings for afl-cmin
 
 The corpus minimization script offers very little customization:
 
@@ -472,12 +477,12 @@ to match when minimizing crashes. This will make minimization less useful, but
 may prevent the tool from "jumping" from one crashing condition to another in
 very buggy software. You probably want to combine it with the -e flag.
 
-## 7) Settings for afl-analyze
+## 8) Settings for afl-analyze
 
 You can set AFL_ANALYZE_HEX to get file offsets printed as hexadecimal instead
 of decimal.
 
-## 8) Settings for libdislocator
+## 9) Settings for libdislocator
 
 The library honors these environmental variables:
 
@@ -499,12 +504,12 @@ The library honors these environmental variables:
   - AFL_ALIGNED_ALLOC=1 will force the alignment of the allocation size to
     max_align_t to be compliant with the C standard.
 
-## 9) Settings for libtokencap
+## 10) Settings for libtokencap
 
 This library accepts AFL_TOKEN_FILE to indicate the location to which the
 discovered tokens should be written.
 
-## 10) Third-party variables set by afl-fuzz & other tools
+## 11) Third-party variables set by afl-fuzz & other tools
 
 Several variables are not directly interpreted by afl-fuzz, but are set to
 optimal values if not already present in the environment:
diff --git a/docs/ideas.md b/docs/ideas.md
index 65e2e8e6..a5d40963 100644
--- a/docs/ideas.md
+++ b/docs/ideas.md
@@ -3,49 +3,6 @@
 In the following, we describe a variety of ideas that could be implemented
 for future AFL++ versions.
 
-For GSOC2020 interested students please see
-[https://github.com/AFLplusplus/AFLplusplus/issues/208](https://github.com/AFLplusplus/AFLplusplus/issues/208)
-
-## Flexible Grammar Mutator (currently in development)
-
-Currently, AFL++'s mutation does not have deeper knowledge about the fuzzed
-binary, apart from feedback, even though the developer may have insights
-about the target.
-
-A developer may choose to provide dictionaries and implement own mutations
-in python or C, but an easy mutator that behaves according to a given grammar,
-does not exist.
-
-State-of-the-art research on grammar fuzzing has some problems in their
-implementations like code quality, scalability, or ease of use and other
-common issues of the academic code.
-
-We aim to develop a pluggable grammar mutator for afl++ that combines
-various results.
-
-Mentor: andreafioraldi 
-
-## perf-fuzz Linux Kernel Module
-
-Expand on [snapshot LKM](https://github.com/AFLplusplus/AFL-Snapshot-LKM)
-To make it thread safe, can snapshot several processes at once and increase
-overall performance.
-
-Mentor: any
-
-## QEMU 5-based Instrumentation
-
-First tests to use QEMU 4 for binary-only AFL++ showed that caching behavior
-changed, which vastly decreases fuzzing speeds.
-
-In this task test if QEMU 5 performs better and port the afl++ QEMU 3.1
-patches to QEMU 5.
-
-Understanding the current instrumentation and fixing the current caching
-issues will be needed.
-
-Mentor: andreafioraldi
-
 ## WASM Instrumentation
 
 Currently, AFL++ can be used for source code fuzzing and traditional binaries.
 Either improve a single mutator through learning of many different bugs
 
 Mentor: domenukk
 
-## Reengineer `afl-fuzz` as Thread Safe, Embeddable Library (currently in development)
-
-Right now, afl-fuzz is single threaded, cannot safely be embedded in tools,
-and not multi-threaded. It makes use of a large number of globals, must always
-be the parent process and exec child processes. 
-Instead, afl-fuzz could be refactored to contain no global state and globals.
-This allows for different use cases that could be implemented during this
-project.
-Note that in the mean time a lot has happened here already, but e.g. making
-it all work and implement multithreading in afl-fuzz ... there is still quite
-some work to do.
-
-Mentor: hexcoder- or vanhauser-thc
-
 ## Collision-free Binary-Only Maps
 
 AFL++ supports collision-free maps using an LTO (link-time-optimization) pass.
diff --git a/docs/life_pro_tips.md b/docs/life_pro_tips.md
index a5bd7286..0004c297 100644
--- a/docs/life_pro_tips.md
+++ b/docs/life_pro_tips.md
@@ -30,10 +30,10 @@ Check out the `fuzzer_stats` file in the AFL output dir or try `afl-whatsup`.
 It could be important - consult docs/status_screen.md right away!
 
 ## Know your target? Convert it to persistent mode for a huge performance gain!
-Consult section #5 in llvm_mode/README.md for tips.
+Consult section #5 in instrumentation/README.llvm.md for tips.
 
 ## Using clang? 
-Check out llvm_mode/ for a faster alternative to afl-gcc!
+Check out instrumentation/ for a faster alternative to afl-gcc!
 
 ## Did you know that AFL can fuzz closed-source or cross-platform binaries?
 Check out qemu_mode/README.md and unicorn_mode/README.md for more.
diff --git a/docs/perf_tips.md b/docs/perf_tips.md
index 731dc238..fbcb4d8d 100644
--- a/docs/perf_tips.md
+++ b/docs/perf_tips.md
@@ -51,7 +51,7 @@ a file.
 ## 3. Use LLVM instrumentation
 
 When fuzzing slow targets, you can gain 20-100% performance improvement by
-using the LLVM-based instrumentation mode described in [the llvm_mode README](../llvm_mode/README.md).
+using the LLVM-based instrumentation mode described in [the instrumentation README](../instrumentation/README.llvm.md).
 Note that this mode requires the use of clang and will not work with GCC.
 
 The LLVM mode also offers a "persistent", in-process fuzzing mode that can
@@ -62,12 +62,12 @@ modes require you to edit the source code of the fuzzed program, but the
 changes often amount to just strategically placing a single line or two.
 
 If there are important data comparisons performed (e.g. `strcmp(ptr, MAGIC_HDR)`)
-then using laf-intel (see llvm_mode/README.laf-intel.md) will help `afl-fuzz` a lot
+then using laf-intel (see instrumentation/README.laf-intel.md) will help `afl-fuzz` a lot
 to get to the important parts in the code.
 
 If you are only interested in specific parts of the code being fuzzed, you can
 instrument just the files that are actually relevant. This improves the speed and
-accuracy of afl. See llvm_mode/README.instrument_list.md
+accuracy of afl. See instrumentation/README.instrument_list.md
 
 Also use the InsTrim mode on larger binaries; this improves performance and
 coverage a lot.
@@ -110,7 +110,7 @@ e.g.:
   https://launchpad.net/libeatmydata
 
 In programs that are slow due to unavoidable initialization overhead, you may
-want to try the LLVM deferred forkserver mode (see llvm_mode/README.md),
+want to try the LLVM deferred forkserver mode (see instrumentation/README.llvm.md),
 which can give you speed gains up to 10x, as mentioned above.
 
 Last but not least, if you are using ASAN and the performance is unacceptable,
diff --git a/docs/sister_projects.md b/docs/sister_projects.md
index a501ecbd..640e59f7 100644
--- a/docs/sister_projects.md
+++ b/docs/sister_projects.md
@@ -52,7 +52,7 @@ options.
 Provides an evolutionary instrumentation-guided fuzzing harness that allows
 some programs to be fuzzed without the fork / execve overhead. (Similar
 functionality is now available as the "persistent" feature described in
-[the llvm_mode readme](../llvm_mode/README.md))
+[the instrumentation readme](../instrumentation/README.llvm.md))
 
 http://llvm.org/docs/LibFuzzer.html
 
@@ -245,7 +245,7 @@ https://code.google.com/p/address-sanitizer/wiki/AsanCoverage#Coverage_counters
 ### AFL JS (Han Choongwoo)
 
 One-off optimizations to speed up the fuzzing of JavaScriptCore (now likely
-superseded by LLVM deferred forkserver init - see llvm_mode/README.md).
+superseded by LLVM deferred forkserver init - see instrumentation/README.llvm.md).
 
 https://github.com/tunz/afl-fuzz-js
 
diff --git a/docs/status_screen.md b/docs/status_screen.md
index b89468ce..2eeb8f3f 100644
--- a/docs/status_screen.md
+++ b/docs/status_screen.md
@@ -324,7 +324,7 @@ there are several things to look at:
   - Multiple threads executing at once in semi-random order. This is harmless
     when the 'stability' metric stays over 90% or so, but can become an issue
     if not. Here's what to try:
-    * Use afl-clang-fast from [llvm_mode](../llvm_mode/) - it uses a thread-local tracking
+    * Use afl-clang-fast from [instrumentation](../instrumentation/) - it uses a thread-local tracking
       model that is less prone to concurrency issues,
     * See if the target can be compiled or run without threads. Common
       `./configure` options include `--without-threads`, `--disable-pthreads`, or
diff --git a/examples/README.md b/examples/README.md
index d28aadbe..46a92c6e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -47,7 +47,7 @@ Here's a quick overview of the stuff you can find in this directory:
 
 Note that the minimize_corpus.sh tool has graduated from the examples/
 directory and is now available as ../afl-cmin. The LLVM mode has likewise
-graduated to ../llvm_mode/*.
+graduated to ../instrumentation/*.
 
 Most of the tools in this directory are meant chiefly as examples that need to
 be tweaked for your specific needs. They come with some basic documentation,
diff --git a/examples/aflpp_driver/aflpp_driver.c b/examples/aflpp_driver/aflpp_driver.c
index ff5446e9..82e55fc4 100644
--- a/examples/aflpp_driver/aflpp_driver.c
+++ b/examples/aflpp_driver/aflpp_driver.c
@@ -27,7 +27,7 @@ EOF
 # Build your target with -fsanitize-coverage=trace-pc-guard using fresh clang.
 clang -g -fsanitize-coverage=trace-pc-guard test_fuzzer.cc -c
 # Build afl-llvm-rt.o.c from the AFL distribution.
-clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c
+clang -c -w $AFL_HOME/instrumentation/afl-llvm-rt.o.c
 # Build this file, link it with afl-llvm-rt.o.o and the target code.
 clang++ afl_driver.cpp test_fuzzer.o afl-llvm-rt.o.o
 # Run AFL:
diff --git a/examples/aflpp_driver/aflpp_driver_test.c b/examples/aflpp_driver/aflpp_driver_test.c
index ddc3effb..b4ff6bc6 100644
--- a/examples/aflpp_driver/aflpp_driver_test.c
+++ b/examples/aflpp_driver/aflpp_driver_test.c
@@ -6,6 +6,8 @@
 
 void __attribute__((noinline)) crashme(const uint8_t *Data, size_t Size) {
 
+  if (Size < 5) return;
+
   if (Data[0] == 'F')
     if (Data[1] == 'A')
       if (Data[2] == '$')
@@ -16,12 +18,11 @@ void __attribute__((noinline)) crashme(const uint8_t *Data, size_t Size) {
 
 int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
 
-  fprintf(stderr, "FUNC crc: %016llx len: %lu\n",
-          hash64((u8 *)Data, (unsigned int)Size,
-                 (unsigned long long int)0xa5b35705),
-          Size);
-
-  if (Size < 5) return 0;
+  if (Size)
+    fprintf(stderr, "FUNC crc: %016llx len: %lu\n",
+            hash64((u8 *)Data, (unsigned int)Size,
+                   (unsigned long long int)0xa5b35705),
+            Size);
 
   crashme(Data, Size);
 
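With this reordering, `crashme()` itself now rejects inputs shorter than 5 bytes before comparing any of them, and the CRC log line is only emitted for non-empty inputs. A quick standalone sketch of the new short-input behavior (hypothetical `main`, not part of the driver; it assumes the test object above is linked in):

```c
/* Hypothetical check: a 2-byte buffer now returns from crashme()
   before any of the byte comparisons run. */
#include <stddef.h>
#include <stdint.h>

extern int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);

int main(void) {

  const uint8_t tiny[] = {'F', 'A'};            /* Size == 2, i.e. < 5 */
  return LLVMFuzzerTestOneInput(tiny, sizeof(tiny));

}
```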
diff --git a/gcc_plugin/Makefile b/gcc_plugin/Makefile
deleted file mode 100644
index 23477e22..00000000
--- a/gcc_plugin/Makefile
+++ /dev/null
@@ -1,159 +0,0 @@
-#
-# american fuzzy lop++ - GCC plugin instrumentation
-# -----------------------------------------------
-#
-# Written by Austin Seipp <aseipp@pobox.com> and
-#            Laszlo Szekeres <lszekeres@google.com> and
-#            Michal Zalewski and
-#            Heiko Eißfeldt  <heiko@hexco.de>
-#
-# GCC integration design is based on the LLVM design, which comes
-# from Laszlo Szekeres.
-#
-# Copyright 2015 Google Inc. All rights reserved.
-# Copyright 2019-2020 AFLplusplus Project. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-
-PREFIX      ?= /usr/local
-HELPER_PATH ?= $(PREFIX)/lib/afl
-BIN_PATH    ?= $(PREFIX)/bin
-DOC_PATH    ?= $(PREFIX)/share/doc/afl
-MAN_PATH    ?= $(PREFIX)/share/man/man8
-
-VERSION     = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f2)
-VERSION:sh= grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f2
-
-CFLAGS      ?= -O3 -g -funroll-loops -D_FORTIFY_SOURCE=2
-CFLAGS = -Wall -I../include -Wno-pointer-sign \
-               -DAFL_PATH=\"$(HELPER_PATH)\" -DBIN_PATH=\"$(BIN_PATH)\" \
-               -DGCC_VERSION=\"$(GCCVER)\" -DGCC_BINDIR=\"$(GCCBINDIR)\" \
-               -Wno-unused-function
-
-CXXFLAGS    = -O3 -g -funroll-loops -D_FORTIFY_SOURCE=2
-CXXEFLAGS   = $(CXXFLAGS) -Wall
-
-CC          = gcc
-CXX         = g++
-
-MYCC=$(CC:clang=gcc)
-MYCXX=$(CXX:clang++=g++)
-
-PLUGIN_PATH = $(shell $(MYCC) -print-file-name=plugin)
-PLUGIN_PATH:sh= $(MYCC) -print-file-name=plugin
-PLUGIN_FLAGS = -fPIC -fno-rtti -I"$(PLUGIN_PATH)/include"
-HASH=\#
-
-GCCVER    = $(shell $(MYCC) --version 2>/dev/null | awk 'NR == 1 {print $$NF}')
-GCCVER:sh= gcc --version 2>/dev/null | awk 'NR == 1 {print $$NF}'
-GCCBINDIR = $(shell dirname `command -v $(MYCC)` 2>/dev/null )
-GCCBINDIR:sh= dirname `command -v $(MYCC)` 2>/dev/null
-
-_SHMAT_OK= $(shell echo '$(HASH)include <sys/ipc.h>@$(HASH)include <sys/shm.h>@int main() { int _id = shmget(IPC_PRIVATE, 65536, IPC_CREAT | IPC_EXCL | 0600); shmctl(_id, IPC_RMID, 0); return 0;}' | tr @ '\n' | $(MYCC) -x c - -o .test2 2>/dev/null && echo 1 || echo 0 ; rm -f .test2 )
-_SHMAT_OK:sh= echo '$(HASH)include <sys/ipc.h>@$(HASH)include <sys/shm.h>@int main() { int _id = shmget(IPC_PRIVATE, 65536, IPC_CREAT | IPC_EXCL | 0600); shmctl(_id, IPC_RMID, 0); return 0;}' | tr @ '\n' | $(MYCC) -x c - -o .test2 2>/dev/null && echo 1 || echo 0 ; rm -f .test2
-
-IGNORE_MMAP=$(TEST_MMAP:1=0)
-__SHMAT_OK=$(_SHMAT_OK)$(IGNORE_MMAP)
-___SHMAT_OK=$(__SHMAT_OK:10=0)
-SHMAT_OK=$(___SHMAT_OK:1=1)
-_CFLAGS_ADD=$(SHMAT_OK:1=)
-CFLAGS_ADD=$(_CFLAGS_ADD:0=-DUSEMMAP=1)
-
-_LDFLAGS_ADD=$(SHMAT_OK:1=)
-LDFLAGS_ADD=$(_LDFLAGS_ADD:0=-lrt)
-
-CFLAGS += $(CFLAGS_ADD)
-LDFLAGS += $(LDFLAGS_ADD)
-
-PROGS        = ../afl-gcc-pass.so ../afl-gcc-fast ../afl-gcc-rt.o
-
-all: test_shm test_deps $(PROGS) ../afl-gcc-fast.8 test_build all_done
-
-debug:
-	@echo _SHMAT_OK = $(_SHMAT_OK)
-	@echo IGNORE_MMAP = $(IGNORE_MMAP)
-	@echo __SHMAT_OK = $(__SHMAT_OK)
-	@echo ___SHMAT_OK = $(___SHMAT_OK)
-	@echo SHMAT_OK = $(SHMAT_OK)
-
-test_shm:
-	@if [ "$(SHMAT_OK)" == "1" ]; then \
-	  echo "[+] shmat seems to be working."; \
-	  rm -f .test2; \
-	else \
-	  echo "[-] shmat seems not to be working, switching to mmap implementation"; \
-	fi
-
-test_deps:
-	@echo "[*] Checking for working '$(MYCC)'..."
-	@type $(MYCC) >/dev/null 2>&1 || ( echo "[-] Oops, can't find '$(MYCC)'. Make sure that it's in your \$$PATH (or set \$$CC and \$$CXX)."; exit 1 )
-#	@echo "[*] Checking for gcc for plugin support..."
-#	@$(MYCC) -v 2>&1 | grep -q -- --enable-plugin || ( echo "[-] Oops, this gcc has not been configured with plugin support."; exit 1 )
-	@echo "[*] Checking for gcc plugin development header files..."
-	@test -d `$(MYCC) -print-file-name=plugin`/include || ( echo "[-] Oops, can't find gcc header files. Be sure to install 'gcc-X-plugin-dev'."; exit 1 )
-	@echo "[*] Checking for '../afl-showmap'..."
-	@test -f ../afl-showmap || ( echo "[-] Oops, can't find '../afl-showmap'. Be sure to compile AFL first."; exit 1 )
-	@echo "[+] All set and ready to build."
-
-afl-common.o: ../src/afl-common.c
-	$(MYCC) $(CFLAGS) -c $< -o $@ $(LDFLAGS)
-
-../afl-gcc-fast: afl-gcc-fast.c afl-common.o
-	$(MYCC) -DAFL_GCC_CC=\"$(MYCC)\" -DAFL_GCC_CXX=\"$(MYCXX)\" $(CFLAGS) afl-gcc-fast.c afl-common.o -o $@ $(LDFLAGS)
-	ln -sf afl-gcc-fast ../afl-g++-fast
-
-../afl-gcc-pass.so: afl-gcc-pass.so.cc
-	$(MYCXX) $(CXXEFLAGS) $(PLUGIN_FLAGS) -shared afl-gcc-pass.so.cc -o $@
-
-../afl-gcc-rt.o: afl-gcc-rt.o.c
-	$(MYCC) $(CFLAGS) -fPIC -c afl-gcc-rt.o.c -o $@
-
-test_build: $(PROGS)
-	@echo "[*] Testing the CC wrapper and instrumentation output..."
-	@unset AFL_USE_ASAN AFL_USE_MSAN; AFL_QUIET=1 AFL_INST_RATIO=100 AFL_PATH=. AFL_CC=$(CC) ../afl-gcc-fast $(CFLAGS) ../test-instr.c -o test-instr $(LDFLAGS)
-#	unset AFL_USE_ASAN AFL_USE_MSAN;             AFL_INST_RATIO=100 AFL_PATH=. AFL_CC=$(CC) ../afl-gcc-fast $(CFLAGS) ../test-instr.c -o test-instr $(LDFLAGS)
-	@ASAN_OPTIONS=detect_leaks=0 ../afl-showmap -m none -q -o .test-instr0 ./test-instr </dev/null
-	@ASAN_OPTIONS=detect_leaks=0 echo 1 | ../afl-showmap -m none -q -o .test-instr1 ./test-instr
-	@rm -f test-instr
-	@trap 'rm .test-instr0 .test-instr1' 0;if cmp -s .test-instr0 .test-instr1; then echo; echo "Oops, the instrumentation does not seem to be behaving correctly!"; echo; echo "Please post to https://github.com/AFLplusplus/AFLplusplus/issues to troubleshoot the issue."; echo; exit 1; fi
-	@echo "[+] All right, the instrumentation seems to be working!"
-
-all_done: test_build
-	@echo "[+] All done! You can now use '../afl-gcc-fast' to compile programs."
-
-.NOTPARALLEL: clean
-
-../afl-gcc-fast.8: ../afl-gcc-fast
-	@echo .TH $* 8 `date "+%Y-%m-%d"` "afl++" > ../$@
-	@echo .SH NAME >> ../$@
-	@echo .B $* >> ../$@
-	@echo >> ../$@
-	@echo .SH SYNOPSIS >> ../$@
-	@../$* -h 2>&1 | head -n 3 | tail -n 1 | sed 's/^\.\///' >> ../$@
-	@echo >> ../$@
-	@echo .SH OPTIONS >> ../$@
-	@echo .nf >> ../$@
-	@../$* -h 2>&1 | tail -n +4 >> ../$@
-	@echo >> ../$@
-	@echo .SH AUTHOR >> ../$@
-	@echo "afl++ was written by Michal \"lcamtuf\" Zalewski and is maintained by Marc \"van Hauser\" Heuse <mh@mh-sec.de>, Heiko \"hexcoder-\" Eissfeldt <heiko.eissfeldt@hexco.de>, Andrea Fioraldi <andreafioraldi@gmail.com> and Dominik Maier <domenukk@gmail.com>" >> ../$@
-	@echo  The homepage of afl++ is: https://github.com/AFLplusplus/AFLplusplus >> ../$@
-	@echo >> ../$@
-	@echo .SH LICENSE >> ../$@
-	@echo Apache License Version 2.0, January 2004 >> ../$@
-	ln -sf afl-gcc-fast.8 ../afl-g++-fast.8
-
-install: all
-	install -m 755 ../afl-gcc-fast $${DESTDIR}$(BIN_PATH)
-	install -m 755 ../afl-gcc-pass.so ../afl-gcc-rt.o $${DESTDIR}$(HELPER_PATH)
-	install -m 644 -T README.md $${DESTDIR}$(DOC_PATH)/README.gcc_plugin.md
-	install -m 644 -T README.instrument_list.md $${DESTDIR}$(DOC_PATH)/README.gcc_plugin.instrument_file.md
-
-clean:
-	rm -f *.o *.so *~ a.out core core.[1-9][0-9]* test-instr .test-instr0 .test-instr1 .test2
-	rm -f $(PROGS) afl-common.o ../afl-g++-fast ../afl-g*-fast.8
diff --git a/gcc_plugin/README.instrument_list.md b/gcc_plugin/README.instrument_list.md
deleted file mode 100644
index d0eaf6ff..00000000
--- a/gcc_plugin/README.instrument_list.md
+++ /dev/null
@@ -1,73 +0,0 @@
-========================================
-Using afl++ with partial instrumentation
-========================================
-
-  This file describes how you can selectively instrument only the source files
-  that are interesting to you using the gcc instrumentation provided by
-  afl++.
-
-  Plugin by hexcoder-.
-
-
-## 1) Description and purpose
-
-When building and testing complex programs where only a part of the program is
-the fuzzing target, it often helps to only instrument the necessary parts of
-the program, leaving the rest uninstrumented. This helps to focus the fuzzer
-on the important parts of the program, avoiding undesired noise and
-disturbance by uninteresting code being exercised.
-
-For this purpose, I have added a "partial instrumentation" support to the gcc
-plugin of AFLFuzz that allows you to specify on a source file level which files
-should be compiled with or without instrumentation.
-
-
-## 2) Building the gcc plugin
-
-The new code is part of the existing afl++ gcc plugin in the gcc_plugin/
-subdirectory. There is nothing specifically to do :)
-
-
-## 3) How to use the partial instrumentation mode
-
-In order to build with partial instrumentation, you need to build with
-afl-gcc-fast and afl-g++-fast respectively. The only required change is
-that you need to set the environment variable AFL_GCC_INSTRUMENT_FILE when calling
-the compiler.
-
-The environment variable must point to a file containing all the filenames
-that should be instrumented. For matching, the filename that is being compiled
-must end in the filename entry contained in this instrument list (to avoid breaking
-the matching when absolute paths are used during compilation).
-
-For example if your source tree looks like this:
-
-```
-project/
-project/feature_a/a1.cpp
-project/feature_a/a2.cpp
-project/feature_b/b1.cpp
-project/feature_b/b2.cpp
-```
-
-and you only want to test feature_a, then create a instrument list file containing:
-
-```
-feature_a/a1.cpp
-feature_a/a2.cpp
-```
-
-However if the instrument list file contains only this, it works as well:
-
-```
-a1.cpp
-a2.cpp
-```
-
-but it might lead to files being unwantedly instrumented if the same filename
-exists somewhere else in the project directories.
-
-The created instrument list file is then set to AFL_GCC_INSTRUMENT_FILE when you compile
-your program. For each file that didn't match the instrument list, the compiler will
-issue a warning at the end stating that no blocks were instrumented. If you
-didn't intend to instrument that file, then you can safely ignore that warning.
diff --git a/gcc_plugin/afl-gcc-fast.c b/gcc_plugin/afl-gcc-fast.c
deleted file mode 100644
index b1bacfbd..00000000
--- a/gcc_plugin/afl-gcc-fast.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
-   american fuzzy lop++ - GCC wrapper for GCC plugin
-   ------------------------------------------------
-
-   Written by Austin Seipp <aseipp@pobox.com> and
-              Laszlo Szekeres <lszekeres@google.com> and
-              Michal Zalewski
-
-   GCC integration design is based on the LLVM design, which comes
-   from Laszlo Szekeres.
-
-   Copyright 2015 Google Inc. All rights reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at:
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   This program is a drop-in replacement for gcc, similar in most
-   respects to ../afl-gcc, but with compiler instrumentation through a
-   plugin. It tries to figure out compilation mode, adds a bunch of
-   flags, and then calls the real compiler.
-
- */
-
-#define AFL_MAIN
-
-#include "config.h"
-#include "types.h"
-#include "debug.h"
-#include "common.h"
-#include "alloc-inl.h"
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-
-static u8 * obj_path;                  /* Path to runtime libraries         */
-static u8 **cc_params;                 /* Parameters passed to the real CC  */
-static u32  cc_par_cnt = 1;            /* Param count, including argv0      */
-u8          use_stdin = 0;                                         /* dummy */
-
-/* Try to find the runtime libraries. If that fails, abort. */
-
-static void find_obj(u8 *argv0) {
-
-  u8 *afl_path = getenv("AFL_PATH");
-  u8 *slash, *tmp;
-
-  if (afl_path) {
-
-    tmp = alloc_printf("%s/afl-gcc-rt.o", afl_path);
-
-    if (!access(tmp, R_OK)) {
-
-      obj_path = afl_path;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-
-  }
-
-  slash = strrchr(argv0, '/');
-
-  if (slash) {
-
-    u8 *dir;
-
-    *slash = 0;
-    dir = ck_strdup(argv0);
-    *slash = '/';
-
-    tmp = alloc_printf("%s/afl-gcc-rt.o", dir);
-
-    if (!access(tmp, R_OK)) {
-
-      obj_path = dir;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-    ck_free(dir);
-
-  }
-
-  if (!access(AFL_PATH "/afl-gcc-rt.o", R_OK)) {
-
-    obj_path = AFL_PATH;
-    return;
-
-  }
-
-  FATAL(
-      "Unable to find 'afl-gcc-rt.o' or 'afl-gcc-pass.so'. Please set "
-      "AFL_PATH");
-
-}
-
-/* Copy argv to cc_params, making the necessary edits. */
-
-static void edit_params(u32 argc, char **argv) {
-
-  u8  fortify_set = 0, asan_set = 0, x_set = 0, maybe_linking = 1;
-  u8 *name;
-
-  cc_params = ck_alloc((argc + 128) * sizeof(u8 *));
-
-  name = strrchr(argv[0], '/');
-  if (!name)
-    name = argv[0];
-  else
-    ++name;
-
-  if (!strcmp(name, "afl-g++-fast")) {
-
-    u8 *alt_cxx = getenv("AFL_CXX");
-    cc_params[0] = alt_cxx && *alt_cxx ? alt_cxx : (u8 *)AFL_GCC_CXX;
-
-  } else if (!strcmp(name, "afl-gcc-fast")) {
-
-    u8 *alt_cc = getenv("AFL_CC");
-    cc_params[0] = alt_cc && *alt_cc ? alt_cc : (u8 *)AFL_GCC_CC;
-
-  } else {
-
-    fprintf(stderr, "Name of the binary: %s\n", argv[0]);
-    FATAL(
-        "Name of the binary is not a known name, expected afl-(gcc|g++)-fast");
-
-  }
-
-  char *fplugin_arg = alloc_printf("-fplugin=%s/afl-gcc-pass.so", obj_path);
-  cc_params[cc_par_cnt++] = fplugin_arg;
-
-  /* Detect stray -v calls from ./configure scripts. */
-
-  if (argc == 1 && !strcmp(argv[1], "-v")) maybe_linking = 0;
-
-  while (--argc) {
-
-    u8 *cur = *(++argv);
-
-#if defined(__x86_64__)
-    if (!strcmp(cur, "-m32")) FATAL("-m32 is not supported");
-#endif
-
-    if (!strcmp(cur, "-x")) x_set = 1;
-
-    if (!strcmp(cur, "-c") || !strcmp(cur, "-S") || !strcmp(cur, "-E") ||
-        !strcmp(cur, "-v"))
-      maybe_linking = 0;
-
-    if (!strcmp(cur, "-fsanitize=address") || !strcmp(cur, "-fsanitize=memory"))
-      asan_set = 1;
-
-    if (strstr(cur, "FORTIFY_SOURCE")) fortify_set = 1;
-
-    if (!strcmp(cur, "-shared")) maybe_linking = 0;
-
-    cc_params[cc_par_cnt++] = cur;
-
-  }
-
-  if (getenv("AFL_HARDEN")) {
-
-    cc_params[cc_par_cnt++] = "-fstack-protector-all";
-
-    if (!fortify_set) cc_params[cc_par_cnt++] = "-D_FORTIFY_SOURCE=2";
-
-  }
-
-  if (!asan_set) {
-
-    if (getenv("AFL_USE_ASAN")) {
-
-      if (getenv("AFL_USE_MSAN")) FATAL("ASAN and MSAN are mutually exclusive");
-
-      if (getenv("AFL_HARDEN"))
-        FATAL("ASAN and AFL_HARDEN are mutually exclusive");
-
-      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-      cc_params[cc_par_cnt++] = "-fsanitize=address";
-
-    } else if (getenv("AFL_USE_MSAN")) {
-
-      if (getenv("AFL_USE_ASAN")) FATAL("ASAN and MSAN are mutually exclusive");
-
-      if (getenv("AFL_HARDEN"))
-        FATAL("MSAN and AFL_HARDEN are mutually exclusive");
-
-      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-      cc_params[cc_par_cnt++] = "-fsanitize=memory";
-
-    }
-
-  }
-
-  if (getenv("AFL_USE_UBSAN")) {
-
-    cc_params[cc_par_cnt++] = "-fsanitize=undefined";
-    cc_params[cc_par_cnt++] = "-fsanitize-undefined-trap-on-error";
-    cc_params[cc_par_cnt++] = "-fno-sanitize-recover=all";
-
-  }
-
-  if (!getenv("AFL_DONT_OPTIMIZE")) {
-
-    cc_params[cc_par_cnt++] = "-g";
-    cc_params[cc_par_cnt++] = "-O3";
-    cc_params[cc_par_cnt++] = "-funroll-loops";
-
-  }
-
-  if (getenv("AFL_NO_BUILTIN")) {
-
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-memcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-bcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strstr";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasestr";
-
-  }
-
-#if defined(USEMMAP) && !defined(__HAIKU__)
-  cc_params[cc_par_cnt++] = "-lrt";
-#endif
-
-  cc_params[cc_par_cnt++] = "-D__AFL_HAVE_MANUAL_CONTROL=1";
-  cc_params[cc_par_cnt++] = "-D__AFL_COMPILER=1";
-  cc_params[cc_par_cnt++] = "-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1";
-
-  /* When the user tries to use persistent or deferred forkserver modes by
-     appending a single line to the program, we want to reliably inject a
-     signature into the binary (to be picked up by afl-fuzz) and we want
-     to call a function from the runtime .o file. This is unnecessarily
-     painful for three reasons:
-
-     1) We need to convince the compiler not to optimize out the signature.
-        This is done with __attribute__((used)).
-
-     2) We need to convince the linker, when called with -Wl,--gc-sections,
-        not to do the same. This is done by forcing an assignment to a
-        'volatile' pointer.
-
-     3) We need to declare __afl_persistent_loop() in the global namespace,
-        but doing this within a method in a class is hard - :: and extern "C"
-        are forbidden and __attribute__((alias(...))) doesn't work. Hence the
-        __asm__ aliasing trick.
-
-   */
-
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_LOOP(_A)="
-      "({ static volatile char *_B __attribute__((used)); "
-      " _B = (char*)\"" PERSIST_SIG
-      "\"; "
-#ifdef __APPLE__
-      "int _L(unsigned int) __asm__(\"___afl_persistent_loop\"); "
-#else
-      "int _L(unsigned int) __asm__(\"__afl_persistent_loop\"); "
-#endif                                                        /* ^__APPLE__ */
-      "_L(_A); })";
-
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_INIT()="
-      "do { static volatile char *_A __attribute__((used)); "
-      " _A = (char*)\"" DEFER_SIG
-      "\"; "
-#ifdef __APPLE__
-      "void _I(void) __asm__(\"___afl_manual_init\"); "
-#else
-      "void _I(void) __asm__(\"__afl_manual_init\"); "
-#endif                                                        /* ^__APPLE__ */
-      "_I(); } while (0)";
-
-  if (maybe_linking) {
-
-    if (x_set) {
-
-      cc_params[cc_par_cnt++] = "-x";
-      cc_params[cc_par_cnt++] = "none";
-
-    }
-
-    cc_params[cc_par_cnt++] = alloc_printf("%s/afl-gcc-rt.o", obj_path);
-
-  }
-
-  cc_params[cc_par_cnt] = NULL;
-
-}
-
-/* Main entry point */
-
-int main(int argc, char **argv, char **envp) {
-
-  if (argc < 2 || strcmp(argv[1], "-h") == 0) {
-
-    printf(cCYA
-           "afl-gcc-fast" VERSION cRST
-           " initially by <aseipp@pobox.com>, maintainer: hexcoder-\n"
-           "\n"
-           "afl-gcc-fast [options]\n"
-           "\n"
-           "This is a helper application for afl-fuzz. It serves as a drop-in "
-           "replacement\n"
-           "for gcc, letting you recompile third-party code with the required "
-           "runtime\n"
-           "instrumentation. A common use pattern would be one of the "
-           "following:\n\n"
-
-           "  CC=%s/afl-gcc-fast ./configure\n"
-           "  CXX=%s/afl-g++-fast ./configure\n\n"
-
-           "In contrast to the traditional afl-gcc tool, this version is "
-           "implemented as\n"
-           "a GCC plugin and tends to offer improved performance with slow "
-           "programs\n"
-           "(similarly to the LLVM plugin used by afl-clang-fast).\n\n"
-
-           "Environment variables used:\n"
-           "AFL_CC: path to the C compiler to use\n"
-           "AFL_CXX: path to the C++ compiler to use\n"
-           "AFL_PATH: path to instrumenting pass and runtime (afl-gcc-rt.*o)\n"
-           "AFL_DONT_OPTIMIZE: disable optimization instead of -O3\n"
-           "AFL_NO_BUILTIN: compile for use with libtokencap.so\n"
-           "AFL_INST_RATIO: percentage of branches to instrument\n"
-           "AFL_QUIET: suppress verbose output\n"
-           "AFL_DEBUG: enable developer debugging output\n"
-           "AFL_HARDEN: adds code hardening to catch memory bugs\n"
-           "AFL_USE_ASAN: activate address sanitizer\n"
-           "AFL_USE_MSAN: activate memory sanitizer\n"
-           "AFL_USE_UBSAN: activate undefined behaviour sanitizer\n"
-           "AFL_GCC_INSTRUMENT_FILE: enable selective instrumentation by "
-           "filename\n"
-
-           "\nafl-gcc-fast was built for gcc %s with the gcc binary path of "
-           "\"%s\".\n\n",
-           BIN_PATH, BIN_PATH, GCC_VERSION, GCC_BINDIR);
-
-    exit(1);
-
-  } else if ((isatty(2) && !getenv("AFL_QUIET")) ||
-
-             getenv("AFL_DEBUG") != NULL) {
-
-    SAYF(cCYA "afl-gcc-fast" VERSION cRST
-              " initially by <aseipp@pobox.com>, maintainer: hexcoder-\n");
-
-    if (getenv("AFL_GCC_INSTRUMENT_FILE") == NULL &&
-        getenv("AFL_GCC_WHITELIST") == NULL) {
-
-      SAYF(
-          cYEL
-          "Warning:" cRST
-          " using afl-gcc-fast without using AFL_GCC_INSTRUMENT_FILE currently "
-          "produces worse results than afl-gcc. Even better, use "
-          "llvm_mode for now.\n");
-
-    }
-
-  } else
-
-    be_quiet = 1;
-
-  u8 *ptr;
-  if (!be_quiet &&
-      ((ptr = getenv("AFL_MAP_SIZE")) || (ptr = getenv("AFL_MAPSIZE")))) {
-
-    u32 map_size = atoi(ptr);
-    if (map_size != MAP_SIZE)
-      WARNF("AFL_MAP_SIZE is not supported by afl-gcc-fast");
-
-  }
-
-  check_environment_vars(envp);
-
-  find_obj(argv[0]);
-
-  edit_params(argc, argv);
-  /*if (isatty(2) && !getenv("AFL_QUIET")) {
-
-            printf("Calling \"%s\" with:\n", cc_params[0]);
-            for(int i=1; i<cc_par_cnt; i++) printf("%s\n", cc_params[i]);
-
-    }
-
-  */
-  execvp(cc_params[0], (char **)cc_params);
-
-  FATAL("Oops, failed to execute '%s' - check your PATH", cc_params[0]);
-
-  return 0;
-
-}
-
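The `-D__AFL_LOOP(...)` and `-D__AFL_INIT()` definitions injected above are why a target needs only a line or two of changes to opt into persistent or deferred forkserver mode. A minimal harness sketch, assuming it is compiled with afl-gcc-fast (the buffer size and iteration count are arbitrary choices here):

```c
/* Minimal persistent-mode sketch; the two macros expand to the
   signature-planting snippets built in edit_params() above. */
#include <unistd.h>

int main(void) {

  static unsigned char buf[4096];

  __AFL_INIT();                   /* defer the forkserver start to here */

  while (__AFL_LOOP(1000)) {      /* up to 1000 inputs per child process */

    ssize_t len = read(0, buf, sizeof(buf));
    if (len > 0) { /* ... feed buf/len to the code under test ... */ }

  }

  return 0;

}
```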
diff --git a/gcc_plugin/afl-gcc-rt.o.c b/gcc_plugin/afl-gcc-rt.o.c
deleted file mode 100644
index 49a03cae..00000000
--- a/gcc_plugin/afl-gcc-rt.o.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
-   american fuzzy lop++ - GCC plugin instrumentation bootstrap
-   ---------------------------------------------------------
-
-   Written by Austin Seipp <aseipp@pobox.com> and
-              Laszlo Szekeres <lszekeres@google.com> and
-              Michal Zalewski
-
-   GCC integration design is based on the LLVM design, which comes
-   from Laszlo Szekeres.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at:
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   This code is the rewrite of afl-as.h's main_payload.
-
-*/
-
-#ifdef __ANDROID__
-  #include "android-ashmem.h"
-#endif
-#include "../config.h"
-#include "../types.h"
-
-#ifdef USEMMAP
-  #include <stdio.h>
-#endif
-#include <stdlib.h>
-#include <signal.h>
-#include <unistd.h>
-#include <string.h>
-#include <assert.h>
-
-#include <sys/mman.h>
-#ifndef USEMMAP
-  #include <sys/shm.h>
-#endif
-#include <sys/wait.h>
-#include <sys/types.h>
-
-#include <sys/mman.h>
-#include <fcntl.h>
-
-/* Globals needed by the injected instrumentation. The __afl_area_initial region
-   is used for instrumentation output before __afl_map_shm() has a chance to
-   run. It will end up as .comm, so it shouldn't be too wasteful. */
-
-u8  __afl_area_initial[MAP_SIZE];
-u8 *__afl_area_ptr = __afl_area_initial;
-
-#ifdef __ANDROID__
-u32 __afl_prev_loc;
-u32 __afl_final_loc;
-#else
-__thread u32 __afl_prev_loc;
-__thread u32 __afl_final_loc;
-#endif
-
-/* Trace a basic block with some ID */
-void __afl_trace(const u32 x) {
-
-#if 1                                      /* enable for neverZero feature. */
-  __afl_area_ptr[__afl_prev_loc ^ x] +=
-      1 + ((u8)(1 + __afl_area_ptr[__afl_prev_loc ^ x]) == 0);
-#else
-  ++__afl_area_ptr[__afl_prev_loc ^ x];
-#endif
-
-  __afl_prev_loc = (x >> 1);
-  return;
-
-}
-
-/* Running in persistent mode? */
-
-static u8 is_persistent;
-
-/* SHM setup. */
-
-static void __afl_map_shm(void) {
-
-  u8 *id_str = getenv(SHM_ENV_VAR);
-
-  /* If we're running under AFL, attach to the appropriate region, replacing the
-     early-stage __afl_area_initial region that is needed to allow some really
-     hacky .init code to work correctly in projects such as OpenSSL. */
-
-  if (id_str) {
-
-#ifdef USEMMAP
-    const char *   shm_file_path = id_str;
-    int            shm_fd = -1;
-    unsigned char *shm_base = NULL;
-
-    /* create the shared memory segment as if it was a file */
-    shm_fd = shm_open(shm_file_path, O_RDWR, 0600);
-    if (shm_fd == -1) {
-
-      fprintf(stderr, "shm_open() failed\n");
-      exit(1);
-
-    }
-
-    /* map the shared memory segment to the address space of the process */
-    shm_base = mmap(0, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
-    if (shm_base == MAP_FAILED) {
-
-      close(shm_fd);
-      shm_fd = -1;
-
-      fprintf(stderr, "mmap() failed\n");
-      exit(2);
-
-    }
-
-    __afl_area_ptr = shm_base;
-#else
-    u32 shm_id = atoi(id_str);
-
-    __afl_area_ptr = shmat(shm_id, NULL, 0);
-#endif
-
-    /* Whooooops. */
-
-    if (__afl_area_ptr == (void *)-1) exit(1);
-
-    /* Write something into the bitmap so that even with low AFL_INST_RATIO,
-       our parent doesn't give up on us. */
-
-    __afl_area_ptr[0] = 1;
-
-  }
-
-}
-
-/* Fork server logic. */
-
-static void __afl_start_forkserver(void) {
-
-  u8  tmp[4] = {0, 0, 0, 0};
-  u32 map_size = MAP_SIZE;
-  s32 child_pid;
-
-  u8 child_stopped = 0;
-
-  void (*old_sigchld_handler)(int) = signal(SIGCHLD, SIG_DFL);
-
-  /* Phone home and tell the parent that we're OK. If parent isn't there,
-     assume we're not running in forkserver mode and just execute program. */
-
-  if (MAP_SIZE <= 0x800000) {
-
-    map_size = (FS_OPT_ENABLED | FS_OPT_MAPSIZE | FS_OPT_SET_MAPSIZE(MAP_SIZE));
-    memcpy(tmp, &map_size, 4);
-
-  }
-
-  if (write(FORKSRV_FD + 1, tmp, 4) != 4) return;
-
-  while (1) {
-
-    u32 was_killed;
-    int status;
-
-    /* Wait for parent by reading from the pipe. Abort if read fails. */
-
-    if (read(FORKSRV_FD, &was_killed, 4) != 4) exit(1);
-
-    /* If we stopped the child in persistent mode, but there was a race
-       condition and afl-fuzz already issued SIGKILL, write off the old
-       process. */
-
-    if (child_stopped && was_killed) {
-
-      child_stopped = 0;
-      if (waitpid(child_pid, &status, 0) < 0) exit(1);
-
-    }
-
-    if (!child_stopped) {
-
-      /* Once woken up, create a clone of our process. */
-
-      child_pid = fork();
-      if (child_pid < 0) exit(1);
-
-      /* In child process: close fds, resume execution. */
-
-      if (!child_pid) {
-
-        signal(SIGCHLD, old_sigchld_handler);
-
-        close(FORKSRV_FD);
-        close(FORKSRV_FD + 1);
-        return;
-
-      }
-
-    } else {
-
-      /* Special handling for persistent mode: if the child is alive but
-         currently stopped, simply restart it with SIGCONT. */
-
-      kill(child_pid, SIGCONT);
-      child_stopped = 0;
-
-    }
-
-    /* In parent process: write PID to pipe, then wait for child. */
-
-    if (write(FORKSRV_FD + 1, &child_pid, 4) != 4) exit(1);
-
-    if (waitpid(child_pid, &status, is_persistent ? WUNTRACED : 0) < 0) exit(1);
-
-    /* In persistent mode, the child stops itself with SIGSTOP to indicate
-       a successful run. In this case, we want to wake it up without forking
-       again. */
-
-    if (WIFSTOPPED(status)) child_stopped = 1;
-
-    /* Relay wait status to pipe, then loop back. */
-
-    if (write(FORKSRV_FD + 1, &status, 4) != 4) exit(1);
-
-  }
-
-}
-
-/* A simplified persistent mode handler, used as explained in README.md. */
-
-int __afl_persistent_loop(unsigned int max_cnt) {
-
-  static u8  first_pass = 1;
-  static u32 cycle_cnt;
-
-  if (first_pass) {
-
-    /* Make sure that every iteration of __AFL_LOOP() starts with a clean slate.
-       On subsequent calls, the parent will take care of that, but on the first
-       iteration, it's our job to erase any trace of whatever happened
-       before the loop. */
-
-    if (is_persistent) {
-
-      memset(__afl_area_ptr, 0, MAP_SIZE);
-      __afl_area_ptr[0] = 1;
-      __afl_prev_loc = 0;
-
-    }
-
-    cycle_cnt = max_cnt;
-    first_pass = 0;
-    return 1;
-
-  }
-
-  if (is_persistent) {
-
-    if (--cycle_cnt) {
-
-      raise(SIGSTOP);
-
-      __afl_area_ptr[0] = 1;
-      __afl_prev_loc = 0;
-
-      return 1;
-
-    } else {
-
-      /* When exiting __AFL_LOOP(), make sure that the subsequent code that
-         follows the loop is not traced. We do that by pivoting back to the
-         dummy output region. */
-
-      __afl_area_ptr = __afl_area_initial;
-
-    }
-
-  }
-
-  return 0;
-
-}
-
-/* This one can be called from user code when deferred forkserver mode
-    is enabled. */
-
-void __afl_manual_init(void) {
-
-  static u8 init_done;
-
-  if (!init_done) {
-
-    __afl_map_shm();
-    __afl_start_forkserver();
-    init_done = 1;
-
-  }
-
-}
-
-/* Proper initialization routine. */
-
-__attribute__((constructor(101))) void __afl_auto_init(void) {
-
-  is_persistent = !!getenv(PERSIST_ENV_VAR);
-
-  if (getenv(DEFER_ENV_VAR)) return;
-
-  __afl_manual_init();
-
-}
-
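The fork server loop above is one half of a small pipe protocol; after the initial 4-byte hello (which may encode the map size, as shown), every execution is one round trip. A hedged sketch of the fuzzer-side counterpart, assuming `ctl_fd` feeds what the target reads as FORKSRV_FD and `st_fd` drains FORKSRV_FD + 1 (an illustration only, not AFL++'s actual forkserver client):

```c
/* Hypothetical fuzzer-side view of one run; the real client code
   handles timeouts, signals, and many more corner cases. */
#include <stdint.h>
#include <unistd.h>

static int run_once(int ctl_fd, int st_fd, uint32_t prev_was_killed) {

  uint32_t child_pid;
  int      status;

  /* Wake the server; the value says whether the previous (possibly
     stopped persistent) child was already SIGKILLed by the fuzzer. */
  if (write(ctl_fd, &prev_was_killed, 4) != 4) return -1;

  /* The server forks, or SIGCONTs a stopped persistent child, and
     reports which PID is now executing the test case. */
  if (read(st_fd, &child_pid, 4) != 4) return -1;
  (void)child_pid;

  /* The wait status is relayed back last; WIFSTOPPED(status) means
     a persistent child paused itself and will be resumed next time. */
  if (read(st_fd, &status, 4) != 4) return -1;

  return status;

}
```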
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index f3a76492..9e469864 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -162,8 +162,7 @@ struct queue_entry {
   u8 *trace_mini;                       /* Trace bytes, if kept             */
   u32 tc_ref;                           /* Trace bytes ref count            */
 
-  struct queue_entry *next,             /* Next element, if any             */
-      *next_100;                        /* 100 elements ahead               */
+  struct queue_entry *next;             /* Next element, if any             */
 
 };
 
@@ -575,8 +574,7 @@ typedef struct afl_state {
 
   struct queue_entry *queue,            /* Fuzzing queue (linked list)      */
       *queue_cur,                       /* Current offset within the queue  */
-      *queue_top,                       /* Top of the list                  */
-      *q_prev100;                       /* Previous 100 marker              */
+      *queue_top;                       /* Top of the list                  */
 
   // growing buf
   struct queue_entry **queue_buf;
@@ -937,6 +935,7 @@ u8 has_new_bits(afl_state_t *, u8 *);
 
 void load_extras_file(afl_state_t *, u8 *, u32 *, u32 *, u32);
 void load_extras(afl_state_t *, u8 *);
+void dedup_extras(afl_state_t *);
 void add_extra(afl_state_t *afl, u8 *mem, u32 len);
 void maybe_add_auto(afl_state_t *, u8 *, u32);
 void save_auto(afl_state_t *);
@@ -945,6 +944,7 @@ void destroy_extras(afl_state_t *);
 
 /* Stats */
 
+void write_setup_file(afl_state_t *, u32, char **);
 void write_stats_file(afl_state_t *, double, double, double);
 void maybe_update_plot_file(afl_state_t *, double, double);
 void show_stats(afl_state_t *);
@@ -973,7 +973,7 @@ u8   fuzz_one(afl_state_t *);
 void bind_to_free_cpu(afl_state_t *);
 #endif
 void   setup_post(afl_state_t *);
-void   read_testcases(afl_state_t *);
+void   read_testcases(afl_state_t *, u8 *);
 void   perform_dry_run(afl_state_t *);
 void   pivot_inputs(afl_state_t *);
 u32    find_start_position(afl_state_t *);
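With `next_100` and the `q_prev100` marker gone, the fuzzing queue is a plain singly linked list and any full pass is a simple `next` walk. A trivial sketch (hypothetical helper, assuming the declarations in this header):

```c
#include "afl-fuzz.h"

/* Hypothetical helper: visit every queue entry through the single
   remaining link; there is no 100-entry skip pointer to maintain. */
static u32 count_queue_entries(afl_state_t *afl) {

  u32                 n = 0;
  struct queue_entry *q;

  for (q = afl->queue; q; q = q->next)
    ++n;

  return n;

}
```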
diff --git a/include/common.h b/include/common.h
index 87a7425b..c364ade0 100644
--- a/include/common.h
+++ b/include/common.h
@@ -110,5 +110,11 @@ u8 *u_stringify_time_diff(u8 *buf, u64 cur_ms, u64 event_ms);
 /* Reads the map size from ENV */
 u32 get_map_size(void);
 
+/* create a stream file */
+FILE *create_ffile(u8 *fn);
+
+/* create a file */
+s32 create_file(u8 *fn);
+
 #endif
 
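The two helpers are only declared here. One plausible shape, loudly hypothetical (the real definitions live in src/afl-common.c and may differ; `u8`/`s32` are the usual AFL++ typedefs):

```c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "types.h"

/* Guessed semantics: create/truncate fn and return a raw descriptor. */
s32 create_file(u8 *fn) {

  s32 fd = open((char *)fn, O_WRONLY | O_CREAT | O_TRUNC, 0600);
  if (fd < 0) { perror((char *)fn); exit(1); }
  return fd;

}

/* The same, wrapped in a buffered stdio stream for formatted output. */
FILE *create_ffile(u8 *fn) {

  FILE *f = fdopen(create_file(fn), "w");
  if (!f) { perror((char *)fn); exit(1); }
  return f;

}
```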
diff --git a/include/config.h b/include/config.h
index 6c73bced..8cc70075 100644
--- a/include/config.h
+++ b/include/config.h
@@ -28,7 +28,7 @@
 /* Version string: */
 
 // c = release, d = volatile github dev, e = experimental branch
-#define VERSION "++2.67d"
+#define VERSION "++3.00a"
 
 /******************************************************
  *                                                    *
@@ -195,7 +195,7 @@
    steps; past this point, the "extras/user" step will be still carried out,
    but with proportionally lower odds: */
 
-#define MAX_DET_EXTRAS 200
+#define MAX_DET_EXTRAS 256
 
 /* Maximum number of auto-extracted dictionary tokens to actually use in fuzzing
    (first value), and to keep in memory as candidates. The latter should be much
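The raised cap feeds into the behavior this comment describes: past `MAX_DET_EXTRAS`, dictionary entries are not dropped, they are just applied with proportionally lower odds. A hedged sketch of that idea (illustration only, not afl-fuzz's exact code):

```c
/* Illustrative only: once the dictionary outgrows the cap, keep each
   extra with probability MAX_DET_EXTRAS / extras_cnt, so the expected
   number of deterministic insertions stays roughly bounded. */
#include <stdlib.h>

#include "config.h"

static int try_extra_deterministically(unsigned extras_cnt) {

  if (extras_cnt <= MAX_DET_EXTRAS) return 1;
  return (unsigned)(rand() % extras_cnt) < MAX_DET_EXTRAS;

}
```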
diff --git a/include/envs.h b/include/envs.h
index c7761e19..d9968fcd 100644
--- a/include/envs.h
+++ b/include/envs.h
@@ -69,6 +69,7 @@ static char *afl_environment_variables[] = {
     "AFL_LLVM_CMPLOG",
     "AFL_LLVM_INSTRIM",
     "AFL_LLVM_CTX",
+    "AFL_LLVM_DICT2FILE",
     "AFL_LLVM_DOCUMENT_IDS",
     "AFL_LLVM_INSTRUMENT",
     "AFL_LLVM_INSTRIM_LOOPHEAD",
@@ -112,12 +113,16 @@ static char *afl_environment_variables[] = {
     "AFL_QEMU_COMPCOV_DEBUG",
     "AFL_QEMU_DEBUG_MAPS",
     "AFL_QEMU_DISABLE_CACHE",
+    "AFL_QEMU_DRIVER_NO_HOOK",
     "AFL_QEMU_PERSISTENT_ADDR",
     "AFL_QEMU_PERSISTENT_CNT",
     "AFL_QEMU_PERSISTENT_GPR",
     "AFL_QEMU_PERSISTENT_HOOK",
     "AFL_QEMU_PERSISTENT_RET",
     "AFL_QEMU_PERSISTENT_RETADDR_OFFSET",
+    "AFL_QEMU_PERSISTENT_EXITS",
+    "AFL_QEMU_INST_RANGES",
+    "AFL_QEMU_SNAPSHOT",
     "AFL_QUIET",
     "AFL_RANDOM_ALLOC_CANARY",
     "AFL_REAL_PATH",
diff --git a/include/xxh3.h b/include/xxh3.h
deleted file mode 100644
index 2354bde9..00000000
--- a/include/xxh3.h
+++ /dev/null
@@ -1,3187 +0,0 @@
-/*
- * xxHash - Extremely Fast Hash algorithm
- * Development source file for `xxh3`
- * Copyright (C) 2019-2020 Yann Collet
- *
- * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above
- *      copyright notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other materials provided with the
- *      distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You can contact the author at:
- *   - xxHash homepage: https://www.xxhash.com
- *   - xxHash source repository: https://github.com/Cyan4973/xxHash
- */
-
-/*
- * Note: This file is separated for development purposes.
- * It will be integrated into `xxhash.h` when development stage is completed.
- *
- * Credit: most of the work on vectorial and asm variants comes from
- * @easyaspi314
- */
-
-#ifndef XXH3_H_1397135465
-#define XXH3_H_1397135465
-
-/* ===   Dependencies   === */
-#ifndef XXHASH_H_5627135585666179
-  /* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
-  #undef XXH_INLINE_ALL                               /* avoid redefinition */
-  #define XXH_INLINE_ALL
-#endif
-#include "xxhash.h"
-
-/* ===   Compiler specifics   === */
-
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L      /* >= C99 */
-  #define XXH_RESTRICT restrict
-#else
-  /* Note: it might be useful to define __restrict or __restrict__ for some C++
-   * compilers */
-  #define XXH_RESTRICT                                           /* disable */
-#endif
-
-#if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
-    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
-    defined(__clang__)
-  #define XXH_likely(x) __builtin_expect(x, 1)
-  #define XXH_unlikely(x) __builtin_expect(x, 0)
-#else
-  #define XXH_likely(x) (x)
-  #define XXH_unlikely(x) (x)
-#endif
-
-#if defined(__GNUC__)
-  #if defined(__AVX2__)
-    #include <immintrin.h>
-  #elif defined(__SSE2__)
-    #include <emmintrin.h>
-  #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-    #define inline __inline__                                  /* clang bug */
-    #include <arm_neon.h>
-    #undef inline
-  #endif
-#elif defined(_MSC_VER)
-  #include <intrin.h>
-#endif
-
-/*
- * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
- * remaining a true 64-bit/128-bit hash function.
- *
- * This is done by prioritizing a subset of 64-bit operations that can be
- * emulated without too many steps on the average 32-bit machine.
- *
- * For example, these two lines seem similar, and run equally fast on 64-bit:
- *
- *   xxh_u64 x;
- *   x ^= (x >> 47); // good
- *   x ^= (x >> 13); // bad
- *
- * However, to a 32-bit machine, there is a major difference.
- *
- * x ^= (x >> 47) looks like this:
- *
- *   x.lo ^= (x.hi >> (47 - 32));
- *
- * while x ^= (x >> 13) looks like this:
- *
- *   // note: funnel shifts are not usually cheap.
- *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
- *   x.hi ^= (x.hi >> 13);
- *
- * The first one is significantly faster than the second, simply because the
- * shift is larger than 32. This means:
- *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
- *    32 bits in the shift.
- *  - The shift result will always fit in the lower 32 bits, and therefore,
- *    we can ignore the upper 32 bits in the xor.
- *
- * Thanks to this optimization, XXH3 only requires these features to be
- * efficient:
- *
- *  - Usable unaligned access
- *  - A 32-bit or 64-bit ALU
- *      - If 32-bit, a decent ADC instruction
- *  - A 32 or 64-bit multiply with a 64-bit result
- *  - For the 128-bit variant, a decent byteswap helps short inputs.
- *
- * The first two are already required by XXH32, and almost all 32-bit and 64-bit
- * platforms which can run XXH32 can run XXH3 efficiently.
- *
- * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
- * notable exception.
- *
- * First of all, Thumb-1 lacks support for the UMULL instruction which
- * performs the important long multiply. This means numerous __aeabi_lmul
- * calls.
- *
- * Second of all, the 8 functional registers are just not enough.
- * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
- * Lo registers, and this shuffling results in thousands more MOVs than A32.
- *
- * A32 and T32 don't have this limitation. They can access all 14 registers,
- * do a 32->64 multiply with UMULL, and the flexible operand allowing free
- * shifts is helpful, too.
- *
- * Therefore, we do a quick sanity check.
- *
- * If compiling Thumb-1 for a target which supports ARM instructions, we will
- * emit a warning, as it is not a "sane" platform to compile for.
- *
- * Usually, if this happens, it is because of an accident and you probably need
- * to specify -march, as you likely meant to compile for a newer architecture.
- */
-#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
-  #warning "XXH3 is highly inefficient without ARM or Thumb-2."
-#endif
-
-/* ==========================================
- * Vectorization detection
- * ========================================== */
-#define XXH_SCALAR 0                             /* Portable scalar version */
-#define XXH_SSE2 1                     /* SSE2 for Pentium 4 and all x86_64 */
-#define XXH_AVX2 2                        /* AVX2 for Haswell and Bulldozer */
-#define XXH_AVX512 3                      /* AVX512 for Skylake and Icelake */
-#define XXH_NEON 4                 /* NEON for most ARMv7-A and all AArch64 */
-#define XXH_VSX 5                         /* VSX and ZVector for POWER8/z13 */
-
-#ifndef XXH_VECTOR                        /* can be defined on command line */
-  #if defined(__AVX512F__)
-    #define XXH_VECTOR XXH_AVX512
-  #elif defined(__AVX2__)
-    #define XXH_VECTOR XXH_AVX2
-  #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
-      (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-    #define XXH_VECTOR XXH_SSE2
-  #elif defined(__GNUC__) /* msvc support maybe later */                   \
-      && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&                 \
-      (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
-       ||                                                                  \
-       (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
-    #define XXH_VECTOR XXH_NEON
-  #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
-      (defined(__s390x__) && defined(__VEC__)) &&             \
-          defined(__GNUC__)                                 /* TODO: IBM XL */
-    #define XXH_VECTOR XXH_VSX
-  #else
-    #define XXH_VECTOR XXH_SCALAR
-  #endif
-#endif
-
-/*
- * Controls the alignment of the accumulator,
- * for compatibility with aligned vector loads, which are usually faster.
- */
-#ifndef XXH_ACC_ALIGN
-  #if defined(XXH_X86DISPATCH)
-    #define XXH_ACC_ALIGN 64               /* for compatibility with avx512 */
-  #elif XXH_VECTOR == XXH_SCALAR                                  /* scalar */
-    #define XXH_ACC_ALIGN 8
-  #elif XXH_VECTOR == XXH_SSE2                                      /* sse2 */
-    #define XXH_ACC_ALIGN 16
-  #elif XXH_VECTOR == XXH_AVX2                                      /* avx2 */
-    #define XXH_ACC_ALIGN 32
-  #elif XXH_VECTOR == XXH_NEON                                      /* neon */
-    #define XXH_ACC_ALIGN 16
-  #elif XXH_VECTOR == XXH_VSX                                        /* vsx */
-    #define XXH_ACC_ALIGN 16
-  #elif XXH_VECTOR == XXH_AVX512                                  /* avx512 */
-    #define XXH_ACC_ALIGN 64
-  #endif
-#endif
-
-#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
-    XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
-  #define XXH_SEC_ALIGN XXH_ACC_ALIGN
-#else
-  #define XXH_SEC_ALIGN 8
-#endif
-
-/*
- * UGLY HACK:
- * GCC usually generates the best code with -O3 for xxHash.
- *
- * However, when targeting AVX2, it is overzealous in its unrolling resulting
- * in code roughly 3/4 the speed of Clang.
- *
- * There are other issues, such as GCC splitting _mm256_loadu_si256 into
- * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
- * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
- *
- * That is why when compiling the AVX2 version, it is recommended to use either
- *   -O2 -mavx2 -march=haswell
- * or
- *   -O2 -mavx2 -mno-avx256-split-unaligned-load
- * for decent performance, or to use Clang instead.
- *
- * Fortunately, we can control the first one with a pragma that forces GCC into
- * -O2, but the other one we can't control without "failed to inline always
- * inline function due to target mismatch" warnings.
- */
-#if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-    && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-    && defined(__OPTIMIZE__) &&                                      \
-    !defined(__OPTIMIZE_SIZE__)                      /* respect -O0 and -Os */
-  #pragma GCC push_options
-  #pragma GCC optimize("-O2")
-#endif
-
-#if XXH_VECTOR == XXH_NEON
-  /*
-   * NEON's setup for vmlal_u32 is a little more complicated than it is on
-   * SSE2, AVX2, and VSX.
-   *
-   * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
-   * upcast.
-   *
-   * To do the same operation, the 128-bit 'Q' register needs to be split into
-   * two 64-bit 'D' registers, performing this operation::
-   *
-   *   [                a                 |                 b                ]
-   *            |              '---------. .--------'                |
-   *            |                         x                          |
-   *            |              .---------' '--------.                |
-   *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
-   *
-   * Due to significant changes in aarch64, the fastest method for aarch64 is
-   * completely different than the fastest method for ARMv7-A.
-   *
-   * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
-   * D11 will modify the high half of Q5. This is similar to how modifying AH
-   * will only affect bits 8-15 of AX on x86.
-   *
-   * VZIP takes two registers, and puts even lanes in one register and odd lanes
-   * in the other.
-   *
-   * On ARMv7-A, this strangely modifies both parameters in place instead of
-   * taking the usual 3-operand form.
-   *
-   * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
-   * lower and upper halves of the Q register to end up with the high and low
-   * halves where we want - all in one instruction.
-   *
-   *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1],
-   * d11[1] }
-   *
-   * Unfortunately we need inline assembly for this: Instructions modifying two
-   * registers at once is not possible in GCC or Clang's IR, and they have to
-   * create a copy.
-   *
-   * aarch64 requires a different approach.
-   *
-   * In order to make it easier to write a decent compiler for aarch64, many
-   * quirks were removed, such as conditional execution.
-   *
-   * NEON was also affected by this.
-   *
-   * aarch64 cannot access the high bits of a Q-form register, and writes to a
-   * D-form register zero the high bits, similar to how writes to W-form scalar
-   * registers (or DWORD registers on x86_64) work.
-   *
-   * The formerly free vget_high intrinsics now require a vext (with a few
-   * exceptions)
-   *
-   * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
-   * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
-   * operand.
-   *
-   * The equivalent of the VZIP.32 on the lower and upper halves would be this
-   * mess:
-   *
-   *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
-   *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
-   *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
-   *
-   * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
-   * (SHRN):
-   *
-   *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
-   *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
-   *
-   * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
-   */
-
-  /*
-   * Function-like macro:
-   * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
-   * &outHi)
-   * {
-
-   *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
-   *     outHi = (uint32x2_t)(in >> 32);
-   *     in = UNDEFINED;
-   * }
-   */
-  #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-      && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
-    #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
-      do {                                                                                         \
-                                                                                                   \
-        /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 =                      \
-         * upper D half */                                                                         \
-        /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
-         */                                                                                        \
-        /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
-         */                                                                                        \
-        __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
-        (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
-        (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
-                                                                                                   \
-      } while (0)
-
-  #else
-    #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-      do {                                       \
-                                                 \
-        (outLo) = vmovn_u64(in);                 \
-        (outHi) = vshrn_n_u64((in), 32);         \
-                                                 \
-      } while (0)
-
-  #endif
-#endif                                            /* XXH_VECTOR == XXH_NEON */
-
-/*
- * VSX and Z Vector helpers.
- *
- * This is very messy, and any pull requests to clean this up are welcome.
- *
- * There are a lot of problems with supporting VSX and s390x, due to
- * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
- */
-#if XXH_VECTOR == XXH_VSX
-  #if defined(__s390x__)
-    #include <s390intrin.h>
-  #else
-    #include <altivec.h>
-  #endif
-
-  #undef vector                                       /* Undo the pollution */
-
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char      xxh_u8x16;
-typedef __vector unsigned           xxh_u32x4;
-
-  #ifndef XXH_VSX_BE
-    #if defined(__BIG_ENDIAN__) || \
-        (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-      #define XXH_VSX_BE 1
-    #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
-        __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
-      #warning "-maltivec=be is not recommended. Please use native endianness."
-      #define XXH_VSX_BE 1
-    #else
-      #define XXH_VSX_BE 0
-    #endif
-  #endif                                            /* !defined(XXH_VSX_BE) */
-
-  #if XXH_VSX_BE
-    /* A wrapper for POWER9's vec_revb. */
-    #if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
-      #define XXH_vec_revb vec_revb
-    #else
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
-
-  xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
-                               0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
-  return vec_perm(val, val, vByteSwap);
-
-}
-
-    #endif
-  #endif                                                      /* XXH_VSX_BE */
-
-/*
- * Performs an unaligned load and byte swaps it on big endian.
- */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
-
-  xxh_u64x2 ret;
-  memcpy(&ret, ptr, sizeof(xxh_u64x2));
-  #if XXH_VSX_BE
-  ret = XXH_vec_revb(ret);
-  #endif
-  return ret;
-
-}
-
-  /*
-   * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
-   *
-   * These intrinsics weren't added until GCC 8, despite existing for a while,
-   * and they are endian dependent. Also, their meaning swaps depending on
-   * the version.
-   */
-  #if defined(__s390x__)
-    /* s390x is always big endian, no issue on this platform */
-    #define XXH_vec_mulo vec_mulo
-    #define XXH_vec_mule vec_mule
-  #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
-    /* Clang has a better way to control this, we can just use the builtin which
-     * doesn't swap. */
-    #define XXH_vec_mulo __builtin_altivec_vmulouw
-    #define XXH_vec_mule __builtin_altivec_vmuleuw
-  #else
-/* gcc needs inline assembly */
-/* Adapted from
- * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
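-/* For reference, in the Power ISA's big-endian word numbering, vmuleuw
- * multiplies the even-numbered 32-bit words of its inputs into 64-bit
- * products and vmulouw the odd-numbered ones; the portable intrinsics may
- * swap the two on little-endian targets, which is why the instructions are
- * pinned via inline assembly here. */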
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
-
-  xxh_u64x2 result;
-  __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
-  return result;
-
-}
-
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
-
-  xxh_u64x2 result;
-  __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
-  return result;
-
-}
-
-  #endif                                      /* XXH_vec_mulo, XXH_vec_mule */
-#endif                                             /* XXH_VECTOR == XXH_VSX */
-
-/* prefetch
- * can be disabled by defining the XXH_NO_PREFETCH build macro */
-#if defined(XXH_NO_PREFETCH)
-  #define XXH_PREFETCH(ptr) (void)(ptr)                         /* disabled */
-#else
-  #if defined(_MSC_VER) && \
-      (defined(_M_X64) ||  \
-       defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
-    #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-    #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
-  #elif defined(__GNUC__) && \
-      ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
-    #define XXH_PREFETCH(ptr) \
-      __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-  #else
-    #define XXH_PREFETCH(ptr) (void)(ptr)                       /* disabled */
-  #endif
-#endif                                                   /* XXH_NO_PREFETCH */
-
-/* ==========================================
- * XXH3 default settings
- * ========================================== */
-
-#define XXH_SECRET_DEFAULT_SIZE 192      /* must be >= XXH3_SECRET_SIZE_MIN */
-
-#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
-  #error "default keyset is not large enough"
-#endif
-
-/* Pseudorandom secret taken directly from FARSH */
-XXH_ALIGN(64)
-static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
-
-    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c,
-    0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb,
-    0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e,
-    0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
-    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6,
-    0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb,
-    0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97,
-    0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
-    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7,
-    0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31,
-    0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
-
-    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff,
-    0xfa, 0x13, 0x63, 0xeb, 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49,
-    0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58,
-    0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
-    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca,
-    0xbb, 0x4b, 0x40, 0x7e,
-
-};
-
-#ifdef XXH_OLD_NAMES
-  #define kSecret XXH3_kSecret
-#endif
-
-/*
- * Calculates a 32-bit to 64-bit long multiply.
- *
- * Wraps __emulu on MSVC x86 because MSVC tends to call __allmul when it
- * doesn't need to (and it shouldn't need to anyway: a full 64x64 multiply
- * is only about 7 instructions). Since we know that this will _always_ emit
- * MULL, we use that instead of the normal method.
- *
- * If you are compiling for platforms like Thumb-1 and don't have a better
- * option, you may also want to write your own long multiply routine here.
- *
- * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
- * {
- *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
- * }
- */
-#if defined(_MSC_VER) && defined(_M_IX86)
-  #include <intrin.h>
-  #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
-#else
-  /*
-   * Downcast + upcast is usually better than masking on older compilers like
-   * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
-   *
-   * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
-   * operands and perform a full 64x64 multiply -- entirely redundant on 32-bit.
-   */
-  #define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
-#endif
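-
-/*
- * Either way, XXH_mult32to64 must return the full 64-bit product of its two
- * truncated 32-bit operands; a quick sanity check of that contract:
- *
- *   XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
- */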
-
-/*
- * Calculates a 64->128-bit long multiply.
- *
- * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
- */
-static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
-
-  /*
-   * GCC/Clang __uint128_t method.
-   *
-   * On most 64-bit targets, GCC and Clang define a __uint128_t type.
-   * This is usually the best way as it usually uses a native long 64-bit
-   * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
-   *
-   * Usually.
-   *
- * However, on 32-bit targets such as wasm, Clang (and Emscripten) define
- * this type even though they lack the native arithmetic for it. This
- * results in a slow compiler-builtin call which calculates a full 128-bit
- * multiply. In that case it is best to use the portable one.
-   * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
-   */
-#if defined(__GNUC__) && !defined(__wasm__) && defined(__SIZEOF_INT128__) || \
-    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
-
-  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
-  XXH128_hash_t     r128;
-  r128.low64 = (xxh_u64)(product);
-  r128.high64 = (xxh_u64)(product >> 64);
-  return r128;
-
-    /*
-     * MSVC for x64's _umul128 method.
-     *
-     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
-     * *HighProduct);
-     *
-     * This compiles to single operand MUL on x64.
-     */
-#elif defined(_M_X64) || defined(_M_IA64)
-
-  #ifndef _MSC_VER
-    #pragma intrinsic(_umul128)
-  #endif
-  xxh_u64       product_high;
-  xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
-  XXH128_hash_t r128;
-  r128.low64 = product_low;
-  r128.high64 = product_high;
-  return r128;
-
-#else
-  /*
-   * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
-   *
-   * This is a fast and simple grade school multiply, which is shown below
-   * with base 10 arithmetic instead of base 0x100000000.
-   *
-   *           9 3 // D2 lhs = 93
-   *         x 7 5 // D2 rhs = 75
-   *     ----------
-   *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
-   *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
-   *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
-   *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
-   *     ---------
-   *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
-   *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
-   *     ---------
-   *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
-   *
-   * The reasons for adding the products like this are:
-   *  1. It avoids manual carry tracking. Just like how
-   *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
-   *     This avoids a lot of complexity.
-   *
-   *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
-   *     instruction available in ARM's Digital Signal Processing extension
-   *     in 32-bit ARMv6 and later, which is shown below:
-   *
-   *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
-   *         {
-   *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
-   *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
-   *             *RdHi = (xxh_u32)(product >> 32);
-   *         }
-   *
-   *     This instruction was designed for efficient long multiplication, and
-   *     allows this to be calculated in only 4 instructions at speeds
-   *     comparable to some 64-bit ALUs.
-   *
-   *  3. It isn't terrible on other platforms. Usually this will be a couple
-   *     of 32-bit ADD/ADCs.
-   */
-
-  /* First calculate all of the cross products. */
-  xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
-  xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
-  xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
-  xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
-
-  /* Now add the products together. These will never overflow. */
-  xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
-  xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
-  xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
-
-  XXH128_hash_t r128;
-  r128.low64 = lower;
-  r128.high64 = upper;
-  return r128;
-#endif
-
-}
-
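-/*
- * A concrete check of XXH_mult64to128, independent of which path above was
- * compiled: multiplying 0x9E3779B185EBCA87ULL by 2 must give
- * low64 == 0x3C6EF3630BD7950EULL and high64 == 1.
- */
-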
-/*
- * Does a 64-bit to 128-bit multiply, then XOR folds it.
- *
- * The reason for the separate function is to prevent passing too many structs
- * around by value. This will hopefully inline the multiply, but we don't force
- * it.
- */
-static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
-
-  XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
-  return product.low64 ^ product.high64;
-
-}
-
-/* Seems to produce slightly better code on GCC for some reason. */
-XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
-
-  XXH_ASSERT(0 <= shift && shift < 64);
-  return v64 ^ (v64 >> shift);
-
-}
-
-/*
- * We don't need to (or want to) mix as much as XXH64.
- *
- * Short hashes are more evenly distributed, so it isn't necessary.
- */
-static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
-
-  h64 = XXH_xorshift64(h64, 37);
-  h64 *= 0x165667919E3779F9ULL;
-  h64 = XXH_xorshift64(h64, 32);
-  return h64;
-
-}
-
-/* ==========================================
- * Short keys
- * ==========================================
- * One of the shortcomings of XXH32 and XXH64 was that their performance was
- * sub-optimal on short lengths. They used an iterative algorithm which
- * strongly favored lengths that were a multiple of 4 or 8.
- *
- * Instead of iterating over individual inputs, we use a set of single shot
- * functions which piece together a range of lengths and operate in constant
- * time.
- *
- * Additionally, the number of multiplies has been significantly reduced. This
- * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
- *
- * Depending on the platform, this may or may not be faster than XXH32, but it
- * is almost guaranteed to be faster than XXH64.
- */
-
-/*
- * At very short lengths, there isn't enough input to fully hide secrets, or use
- * the entire secret.
- *
- * There is also only a limited amount of mixing we can do before significantly
- * impacting performance.
- *
- * Therefore, we use different sections of the secret and always mix two secret
- * samples with an XOR. This should have no effect on performance on the
- * seedless or withSeed variants because everything _should_ be constant folded
- * by modern compilers.
- *
- * The XOR mixing hides individual parts of the secret and increases entropy.
- *
- * This adds an extra layer of strength for custom secrets.
- */
-XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len,
-                                                const xxh_u8 *secret,
-                                                XXH64_hash_t  seed) {
-
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(1 <= len && len <= 3);
-  XXH_ASSERT(secret != NULL);
-  /*
-   * len = 1: combined = { input[0], 0x01, input[0], input[0] }
-   * len = 2: combined = { input[1], 0x02, input[0], input[1] }
-   * len = 3: combined = { input[2], 0x03, input[0], input[1] }
-   */
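-  /*
-   * For example, with input "ab" (len = 2): c1 = 0x61 ('a'), c2 = c3 = 0x62
-   * ('b'), giving combined == 0x62610262.
-   */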
-  {
-
-    xxh_u8 const  c1 = input[0];
-    xxh_u8 const  c2 = input[len >> 1];
-    xxh_u8 const  c3 = input[len - 1];
-    xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
-                             ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
-    xxh_u64 const bitflip =
-        (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
-    xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
-    xxh_u64 const mixed = keyed * XXH_PRIME64_1;
-    return XXH3_avalanche(mixed);
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
-                                                const xxh_u8 *secret,
-                                                XXH64_hash_t  seed) {
-
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(4 <= len && len < 8);
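-  /*
-   * XOR the byte-swapped low 32 bits of the seed into its upper 32 bits, so
-   * that a seed with only its low half set still affects all 64 bits of the
-   * bitflip below.
-   */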
-  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
-  {
-
-    xxh_u32 const input1 = XXH_readLE32(input);
-    xxh_u32 const input2 = XXH_readLE32(input + len - 4);
-    xxh_u64 const bitflip =
-        (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
-    xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
-    xxh_u64       x = input64 ^ bitflip;
-    /* this mix is inspired by Pelle Evensen's rrmxmx */
-    x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24);
-    x *= 0x9FB21C651E98DF25ULL;
-    x ^= (x >> 35) + len;
-    x *= 0x9FB21C651E98DF25ULL;
-    return XXH_xorshift64(x, 28);
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
-                                                 size_t        len,
-                                                 const xxh_u8 *secret,
-                                                 XXH64_hash_t  seed) {
-
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(8 <= len && len <= 16);
-  {
-
-    xxh_u64 const bitflip1 =
-        (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
-    xxh_u64 const bitflip2 =
-        (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
-    xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
-    xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
-    xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi +
-                        XXH3_mul128_fold64(input_lo, input_hi);
-    return XXH3_avalanche(acc);
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input,
-                                                 size_t        len,
-                                                 const xxh_u8 *secret,
-                                                 XXH64_hash_t  seed) {
-
-  XXH_ASSERT(len <= 16);
-  {
-
-    if (XXH_likely(len > 8))
-      return XXH3_len_9to16_64b(input, len, secret, seed);
-    if (XXH_likely(len >= 4))
-      return XXH3_len_4to8_64b(input, len, secret, seed);
-    if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
-    return XXH3_avalanche((XXH_PRIME64_1 + seed) ^ (XXH_readLE64(secret + 56) ^
-                                                    XXH_readLE64(secret + 64)));
-
-  }
-
-}
-
-/*
- * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
- * multiplication by zero, affecting hashes of lengths 17 to 240.
- *
- * However, they are very unlikely.
- *
- * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
- * unseeded non-cryptographic hashes, it does not attempt to defend itself
- * against specially crafted inputs, only random inputs.
- *
- * Compared to classic UMAC, where a 1 in 2^31 chance of 4 consecutive bytes
- * cancelling out the secret is taken an arbitrary number of times (this is
- * addressed in XXH3_accumulate_512), such a collision is very unlikely with
- * random inputs and/or proper seeding:
- *
- * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
- * function that is only called up to 16 times per hash with up to 240 bytes of
- * input.
- *
- * This is not too bad for a non-cryptographic hash function, especially with
- * only 64 bit outputs.
- *
- * The 128-bit variant (which trades some speed for strength) is NOT affected
- * by this, although it is always a good idea to use a proper seed if you care
- * about strength.
- */
-XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
-                                     const xxh_u8 *XXH_RESTRICT secret,
-                                     xxh_u64                    seed64) {
-
-#if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
-    && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
-    &&                                                             \
-    !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
-  /*
-   * UGLY HACK:
-   * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
-   * slower code.
-   *
-   * By forcing seed64 into a register, we disrupt the cost model and
-   * cause it to scalarize. See `XXH32_round()`
-   *
-   * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
-   * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
-   * GCC 9.2, despite both emitting scalar code.
-   *
-   * GCC generates much better scalar code than Clang for the rest of XXH3,
-   * which is why finding a more optimal codepath is an interest.
-   */
-  __asm__("" : "+r"(seed64));
-#endif
-  {
-
-    xxh_u64 const input_lo = XXH_readLE64(input);
-    xxh_u64 const input_hi = XXH_readLE64(input + 8);
-    return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64),
-                              input_hi ^ (XXH_readLE64(secret + 8) - seed64));
-
-  }
-
-}
-
-/* For mid range keys, XXH3 uses a Mum-hash variant. */
-XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
-
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-  (void)secretSize;
-  XXH_ASSERT(16 < len && len <= 128);
-
-  {
-
-    xxh_u64 acc = len * XXH_PRIME64_1;
-    if (len > 32) {
-
-      if (len > 64) {
-
-        if (len > 96) {
-
-          acc += XXH3_mix16B(input + 48, secret + 96, seed);
-          acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
-
-        }
-
-        acc += XXH3_mix16B(input + 32, secret + 64, seed);
-        acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
-
-      }
-
-      acc += XXH3_mix16B(input + 16, secret + 32, seed);
-      acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
-
-    }
-
-    acc += XXH3_mix16B(input + 0, secret + 0, seed);
-    acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
-
-    return XXH3_avalanche(acc);
-
-  }
-
-}
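-
-/*
- * For illustration: with len == 100, the pairs mixed above are
- * input[ 0..15] with input[84..99], input[16..31] with input[68..83],
- * input[32..47] with input[52..67], and input[48..63] with input[36..51],
- * so every input byte is mixed at least once and the middle bytes twice.
- */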
-
-#define XXH3_MIDSIZE_MAX 240
-
-XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
-
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-  (void)secretSize;
-  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
-
-#define XXH3_MIDSIZE_STARTOFFSET 3
-#define XXH3_MIDSIZE_LASTOFFSET 17
-
-  {
-
-    xxh_u64   acc = len * XXH_PRIME64_1;
-    int const nbRounds = (int)len / 16;
-    int       i;
-    for (i = 0; i < 8; i++) {
-
-      acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
-
-    }
-
-    acc = XXH3_avalanche(acc);
-    XXH_ASSERT(nbRounds >= 8);
-#if defined(__clang__)                                /* Clang */ \
-    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)              /* Define to disable */
-  /*
-   * UGLY HACK:
-   * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
-   * Everywhere else, it uses scalar code.
-   *
-   * For 64->128-bit multiplies, even if the NEON code were 100% optimal, it
-   * would still be slower than UMAAL (see XXH_mult64to128).
-   *
-   * Unfortunately, Clang doesn't handle the long multiplies properly and
-   * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
-   * scalarized into an ugly mess of VMOV.32 instructions.
-   *
-   * These messes are difficult to avoid without turning autovectorization
-   * off completely, but they are usually relatively minor and not worth
-   * fixing.
-   *
-   * This loop is the easiest to fix, as unlike XXH32, this pragma
-   * _actually works_ because it is a loop vectorization instead of an
-   * SLP vectorization.
-   */
-  #pragma clang loop vectorize(disable)
-#endif
-    for (i = 8; i < nbRounds; i++) {
-
-      acc +=
-          XXH3_mix16B(input + (16 * i),
-                      secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
-
-    }
-
-    /* last bytes */
-    acc += XXH3_mix16B(input + len - 16,
-                       secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET,
-                       seed);
-    return XXH3_avalanche(acc);
-
-  }
-
-}
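-
-/*
- * For illustration, assuming XXH3_SECRET_SIZE_MIN == 136: with len == 240,
- * nbRounds == 15; rounds 0-7 read secret[0..127], rounds 8-14 read
- * secret[3..114] (shifted by XXH3_MIDSIZE_STARTOFFSET), and the final
- * XXH3_mix16B reads secret[119..134].
- */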
-
-/* =======     Long Keys     ======= */
-
-#define XXH_STRIPE_LEN 64
-#define XXH_SECRET_CONSUME_RATE \
-  8                     /* nb of secret bytes consumed at each accumulation */
-#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
-
-#ifdef XXH_OLD_NAMES
-  #define STRIPE_LEN XXH_STRIPE_LEN
-  #define ACC_NB XXH_ACC_NB
-#endif
-
-typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
-
-XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
-
-  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
-  memcpy(dst, &v64, sizeof(v64));
-
-}
-
-/* Several intrinsic functions below are supposed to accept __int64 as their
- * argument, as documented in
- * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
- * several environments do not define the __int64 type, requiring a
- * workaround.
- */
-#if !defined(__VMS) &&       \
-    (defined(__cplusplus) || \
-     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-typedef int64_t xxh_i64;
-#else
-/* the following type must have a width of 64-bit */
-typedef long long xxh_i64;
-#endif
-
-/*
- * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most
- * optimized.
- *
- * It is a hardened version of UMAC, based on FARSH's implementation.
- *
- * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
- * implementations, and it is ridiculously fast.
- *
- * We harden it by mixing the original input to the accumulators as well as the
- * product.
- *
- * This means that in the (relatively likely) case of a multiply by zero, the
- * original input is preserved.
- *
- * For the 128-bit variant, we swap 64-bit pairs when we add the input to
- * improve cross-pollination, as otherwise the upper and lower halves would
- * be essentially independent.
- *
- * This doesn't matter on 64-bit hashes since they all get merged together in
- * the end, so we skip the extra step.
- *
- * Both XXH3_64bits and XXH3_128bits use this subroutine.
- */
-
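-/*
- * Per 64-bit lane, every SIMD variant below implements the same operation,
- * sketched here as scalar pseudocode (cf. XXH3_accumulate_512_scalar):
- *
- *   data_key = input[i] ^ secret[i];
- *   acc[i]  += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
- *   acc[i]  += (accWidth == XXH3_acc_64bits) ? input[i] : input[i ^ 1];
- */
-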
-#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
-
-  #ifndef XXH_TARGET_AVX512
-    #define XXH_TARGET_AVX512                   /* disable attribute target */
-  #endif
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
-    const void *XXH_RESTRICT secret, XXH3_accWidth_e accWidth) {
-
-  XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
-  XXH_ASSERT((((size_t)acc) & 63) == 0);
-  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
-
-  {
-
-    /* data_vec    = input[0]; */
-    __m512i const data_vec = _mm512_loadu_si512(input);
-    /* key_vec     = secret[0]; */
-    __m512i const key_vec = _mm512_loadu_si512(secret);
-    /* data_key    = data_vec ^ key_vec; */
-    __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
-    /* data_key_lo = data_key >> 32; */
-    __m512i const data_key_lo =
-        _mm512_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-    /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-    __m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
-    if (accWidth == XXH3_acc_128bits) {
-
-      /* xacc[0] += swap(data_vec); */
-      __m512i const data_swap =
-          _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
-      __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
-      /* xacc[0] += product; */
-      *xacc = _mm512_add_epi64(product, sum);
-
-    } else {                                             /* XXH3_acc_64bits */
-
-      /* xacc[0] += data_vec; */
-      __m512i const sum = _mm512_add_epi64(*xacc, data_vec);
-      /* xacc[0] += product; */
-      *xacc = _mm512_add_epi64(product, sum);
-
-    }
-
-  }
-
-}
-
-/*
- * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
- *
- * Multiplication isn't perfect, as explained by Google in HighwayHash:
- *
- *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
- *  // varying degrees. In descending order of goodness, bytes
- *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
- *  // As expected, the upper and lower bytes are much worse.
- *
- * Source:
- * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
- *
- * Since our algorithm uses a pseudorandom secret to add some variance into the
- * mix, we don't need to (or want to) mix as often or as much as HighwayHash
- * does.
- *
- * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
- * extraction.
- *
- * Both XXH3_64bits and XXH3_128bits use this subroutine.
- */
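-
-/*
- * Per 64-bit lane, the scramble below boils down to (cf.
- * XXH3_scrambleAcc_scalar):
- *
- *   acc[i] = ((acc[i] ^ (acc[i] >> 47)) ^ secret[i]) * XXH_PRIME32_1;
- */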
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
-
-  XXH_ASSERT((((size_t)acc) & 63) == 0);
-  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
-  {
-
-    XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
-    const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
-
-    /* xacc[0] ^= (xacc[0] >> 47) */
-    __m512i const acc_vec = *xacc;
-    __m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
-    __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
-    /* xacc[0] ^= secret; */
-    __m512i const key_vec = _mm512_loadu_si512(secret);
-    __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
-
-    /* xacc[0] *= XXH_PRIME32_1; */
-    __m512i const data_key_hi =
-        _mm512_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-    __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
-    __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
-    *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
-    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
-
-  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
-  XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
-  XXH_ASSERT(((size_t)customSecret & 63) == 0);
-  (void)(&XXH_writeLE64);
-  {
-
-    int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
-    __m512i const seed = _mm512_mask_set1_epi64(
-        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
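-    /* mask 0xAA selects lanes 1, 3, 5 and 7, giving the same alternating
-     * { +seed64, -seed64 } pattern per 16 bytes as
-     * XXH3_initCustomSecret_scalar produces */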
-
-    XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m512i *const       dest = (__m512i *)customSecret;
-    int                                i;
-    for (i = 0; i < nbRounds; ++i) {
-
-      // GCC has a bug: _mm512_stream_load_si512 accepts 'void *', not
-      // 'void const *', so passing a const pointer would warn "discards
-      // 'const' qualifier". The union below casts the const away.
-      union {
-
-        XXH_ALIGN(64) const __m512i *const cp;
-        XXH_ALIGN(64) void *const p;
-
-      } const remote_const_void = {.cp = src + i};
-
-      dest[i] =
-          _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
-
-    }
-
-  }
-
-}
-
-#endif
-
-#if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
-
-  #ifndef XXH_TARGET_AVX2
-    #define XXH_TARGET_AVX2                     /* disable attribute target */
-  #endif
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
-    const void *XXH_RESTRICT secret, XXH3_accWidth_e accWidth) {
-
-  XXH_ASSERT((((size_t)acc) & 31) == 0);
-  {
-
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason.
-     */
-    const __m256i *const xinput = (const __m256i *)input;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
-    const __m256i *const xsecret = (const __m256i *)secret;
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
-
-      /* data_vec    = xinput[i]; */
-      __m256i const data_vec = _mm256_loadu_si256(xinput + i);
-      /* key_vec     = xsecret[i]; */
-      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
-      /* data_key    = data_vec ^ key_vec; */
-      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
-      /* data_key_lo = data_key >> 32; */
-      __m256i const data_key_lo =
-          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-      /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-      __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
-      if (accWidth == XXH3_acc_128bits) {
-
-        /* xacc[i] += swap(data_vec); */
-        __m256i const data_swap =
-            _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
-        __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
-        /* xacc[i] += product; */
-        xacc[i] = _mm256_add_epi64(product, sum);
-
-      } else {                                           /* XXH3_acc_64bits */
-
-        /* xacc[i] += data_vec; */
-        __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
-        /* xacc[i] += product; */
-        xacc[i] = _mm256_add_epi64(product, sum);
-
-      }
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
-
-  XXH_ASSERT((((size_t)acc) & 31) == 0);
-  {
-
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
-    const __m256i *const xsecret = (const __m256i *)secret;
-    const __m256i        prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
-
-      /* xacc[i] ^= (xacc[i] >> 47) */
-      __m256i const acc_vec = xacc[i];
-      __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
-      __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
-      /* xacc[i] ^= xsecret; */
-      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
-      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
-
-      /* xacc[i] *= XXH_PRIME32_1; */
-      __m256i const data_key_hi =
-          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-      __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
-      __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
-      xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
-    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
-
-  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
-  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
-  XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
-  (void)(&XXH_writeLE64);
-  XXH_PREFETCH(customSecret);
-  {
-
-    __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
-                                           -(xxh_i64)seed64, (xxh_i64)seed64);
-
-    XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m256i *            dest = (__m256i *)customSecret;
-
-  #if defined(__GNUC__) || defined(__clang__)
-    /*
-     * On GCC & Clang, marking 'dest' as modified causes the compiler to:
-     *   - not extract the secret from SSE registers in the internal loop
-     *   - use fewer common registers, and avoid pushing these registers
-     *     onto the stack
-     * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
-     * customSecret, and on aarch64, this prevented LDP from merging two
-     * loads together for free. Putting the loads together before the stores
-     * properly generates LDP.
-     */
-    __asm__("" : "+r"(dest));
-  #endif
-
-    /* GCC at -O2 needs the loop unrolled manually */
-    dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
-    dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
-    dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
-    dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
-    dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
-    dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
-
-  }
-
-}
-
-#endif
-
-#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
-
-  #ifndef XXH_TARGET_SSE2
-    #define XXH_TARGET_SSE2                     /* disable attribute target */
-  #endif
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
-    const void *XXH_RESTRICT secret, XXH3_accWidth_e accWidth) {
-
-  /* SSE2 is just a half-scale version of the AVX2 version. */
-  XXH_ASSERT((((size_t)acc) & 15) == 0);
-  {
-
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-    const __m128i *const xinput = (const __m128i *)input;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-    const __m128i *const xsecret = (const __m128i *)secret;
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
-
-      /* data_vec    = xinput[i]; */
-      __m128i const data_vec = _mm_loadu_si128(xinput + i);
-      /* key_vec     = xsecret[i]; */
-      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
-      /* data_key    = data_vec ^ key_vec; */
-      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
-      /* data_key_lo = data_key >> 32; */
-      __m128i const data_key_lo =
-          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-      /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-      __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
-      if (accWidth == XXH3_acc_128bits) {
-
-        /* xacc[i] += swap(data_vec); */
-        __m128i const data_swap =
-            _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
-        __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
-        /* xacc[i] += product; */
-        xacc[i] = _mm_add_epi64(product, sum);
-
-      } else {                                           /* XXH3_acc_64bits */
-
-        /* xacc[i] += data_vec; */
-        __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
-        /* xacc[i] += product; */
-        xacc[i] = _mm_add_epi64(product, sum);
-
-      }
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
-
-  XXH_ASSERT((((size_t)acc) & 15) == 0);
-  {
-
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
-    /* Unaligned. This is mainly for pointer arithmetic, and because
-     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-    const __m128i *const xsecret = (const __m128i *)secret;
-    const __m128i        prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
-
-      /* xacc[i] ^= (xacc[i] >> 47) */
-      __m128i const acc_vec = xacc[i];
-      __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
-      __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
-      /* xacc[i] ^= xsecret[i]; */
-      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
-      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
-
-      /* xacc[i] *= XXH_PRIME32_1; */
-      __m128i const data_key_hi =
-          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-      __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
-      __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
-      xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
-    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
-
-  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
-  (void)(&XXH_writeLE64);
-  {
-
-    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
-
-  #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-    // MSVC 32-bit mode does not support _mm_set_epi64x before VS2015
-    XXH_ALIGN(16)
-    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
-    __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
-  #else
-    __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
-  #endif
-    int i;
-
-    XXH_ALIGN(64) const float *const  src = (float const *)XXH3_kSecret;
-    XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
-  #if defined(__GNUC__) || defined(__clang__)
-    /*
-     * On GCC & Clang, marking 'dest' as modified causes the compiler to:
-     *   - not extract the secret from SSE registers in the internal loop
-     *   - use fewer common registers, and avoid pushing these registers
-     *     onto the stack
-     */
-    __asm__("" : "+r"(dest));
-  #endif
-
-    for (i = 0; i < nbRounds; ++i) {
-
-      dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
-
-    }
-
-  }
-
-}
-
-#endif
-
-#if (XXH_VECTOR == XXH_NEON)
-
-XXH_FORCE_INLINE void XXH3_accumulate_512_neon(void *XXH_RESTRICT       acc,
-                                               const void *XXH_RESTRICT input,
-                                               const void *XXH_RESTRICT secret,
-                                               XXH3_accWidth_e accWidth) {
-
-  XXH_ASSERT((((size_t)acc) & 15) == 0);
-  {
-
-    XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
-    /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
-     */
-    uint8_t const *const xinput = (const uint8_t *)input;
-    uint8_t const *const xsecret = (const uint8_t *)secret;
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
-
-      /* data_vec = xinput[i]; */
-      uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
-      /* key_vec  = xsecret[i];  */
-      uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
-      uint64x2_t data_key;
-      uint32x2_t data_key_lo, data_key_hi;
-      if (accWidth == XXH3_acc_64bits) {
-
-        /* xacc[i] += data_vec; */
-        xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
-
-      } else {                                          /* XXH3_acc_128bits */
-
-        /* xacc[i] += swap(data_vec); */
-        uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
-        uint64x2_t const swapped = vextq_u64(data64, data64, 1);
-        xacc[i] = vaddq_u64(xacc[i], swapped);
-
-      }
-
-      /* data_key = data_vec ^ key_vec; */
-      data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
-      /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
-       * data_key_hi = (uint32x2_t) (data_key >> 32);
-       * data_key = UNDEFINED; */
-      XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
-      /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
-      xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
-                                            const void *XXH_RESTRICT secret) {
-
-  XXH_ASSERT((((size_t)acc) & 15) == 0);
-
-  {
-
-    uint64x2_t *   xacc = (uint64x2_t *)acc;
-    uint8_t const *xsecret = (uint8_t const *)secret;
-    uint32x2_t     prime = vdup_n_u32(XXH_PRIME32_1);
-
-    size_t i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
-
-      /* xacc[i] ^= (xacc[i] >> 47); */
-      uint64x2_t acc_vec = xacc[i];
-      uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
-      uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
-
-      /* xacc[i] ^= xsecret[i]; */
-      uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
-      uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
-
-      /* xacc[i] *= XXH_PRIME32_1 */
-      uint32x2_t data_key_lo, data_key_hi;
-      /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
-       * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
-       * xacc[i] = UNDEFINED; */
-      XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
-      { /*
-         * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
-         *
-         * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
-         * incorrectly "optimize" this:
-         *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
-         *   shifted = vshll_n_u32(tmp, 32);
-         * to this:
-         *   tmp     = "vmulq_u64"(a, b); // no such thing!
-         *   shifted = vshlq_n_u64(tmp, 32);
-         *
-         * However, unlike SSE, Clang lacks a 64-bit multiply routine
-         * for NEON, and it scalarizes two 64-bit multiplies instead.
-         *
-         * vmull_u32 has the same timing as vmul_u32, and it avoids
-         * this bug completely.
-         * See https://bugs.llvm.org/show_bug.cgi?id=39967
-         */
-        uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
-        /* xacc[i] = prod_hi << 32; */
-        xacc[i] = vshlq_n_u64(prod_hi, 32);
-        /* xacc[i] += (data_key & 0xFFFFFFFF) * XXH_PRIME32_1; */
-        xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
-
-      }
-
-    }
-
-  }
-
-}
-
-#endif
-
-#if (XXH_VECTOR == XXH_VSX)
-
-XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
-                                              const void *XXH_RESTRICT input,
-                                              const void *XXH_RESTRICT secret,
-                                              XXH3_accWidth_e accWidth) {
-
-  xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;       /* presumed aligned */
-  xxh_u64x2 const *const xinput =
-      (xxh_u64x2 const *)input;                 /* no alignment restriction */
-  xxh_u64x2 const *const xsecret =
-      (xxh_u64x2 const *)secret;                /* no alignment restriction */
-  xxh_u64x2 const v32 = {32, 32};
-  size_t          i;
-  for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
-
-    /* data_vec = xinput[i]; */
-    xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
-    /* key_vec = xsecret[i]; */
-    xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
-    xxh_u64x2 const data_key = data_vec ^ key_vec;
-    /* shuffled = (data_key << 32) | (data_key >> 32); */
-    xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
-    /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled &
-     * 0xFFFFFFFF); */
-    xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
-    xacc[i] += product;
-
-    if (accWidth == XXH3_acc_64bits) {
-
-      xacc[i] += data_vec;
-
-    } else {                                            /* XXH3_acc_128bits */
-
-        /* swap high and low halves */
-  #ifdef __s390x__
-      xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);
-  #else
-      xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
-  #endif
-      xacc[i] += data_swapped;
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
-                                           const void *XXH_RESTRICT secret) {
-
-  XXH_ASSERT((((size_t)acc) & 15) == 0);
-
-  {
-
-    xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;
-    const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret;
-    /* constants */
-    xxh_u64x2 const v32 = {32, 32};
-    xxh_u64x2 const v47 = {47, 47};
-    xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
-                             XXH_PRIME32_1};
-    size_t          i;
-    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
-
-      /* xacc[i] ^= (xacc[i] >> 47); */
-      xxh_u64x2 const acc_vec = xacc[i];
-      xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
-
-      /* xacc[i] ^= xsecret[i]; */
-      xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
-      xxh_u64x2 const data_key = data_vec ^ key_vec;
-
-      /* xacc[i] *= XXH_PRIME32_1 */
-      /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime &
-       * 0xFFFFFFFF);  */
-      xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
-      /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
-      xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
-      xacc[i] = prod_odd + (prod_even << v32);
-
-    }
-
-  }
-
-}
-
-#endif
-
-/* scalar variants - universal */
-
-XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
-    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
-    const void *XXH_RESTRICT secret, XXH3_accWidth_e accWidth) {
-
-  XXH_ALIGN(XXH_ACC_ALIGN)
-  xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
-  const xxh_u8 *const xinput =
-      (const xxh_u8 *)input;                    /* no alignment restriction */
-  const xxh_u8 *const xsecret =
-      (const xxh_u8 *)secret;                   /* no alignment restriction */
-  size_t i;
-  XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
-  for (i = 0; i < XXH_ACC_NB; i++) {
-
-    xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
-    xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
-
-    if (accWidth == XXH3_acc_64bits) {
-
-      xacc[i] += data_val;
-
-    } else {
-
-      xacc[i ^ 1] += data_val;                       /* swap adjacent lanes */
-
-    }
-
-    xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
-
-  }
-
-}
-
-XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
-                                              const void *XXH_RESTRICT secret) {
-
-  XXH_ALIGN(XXH_ACC_ALIGN)
-  xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
-  const xxh_u8 *const xsecret =
-      (const xxh_u8 *)secret;                   /* no alignment restriction */
-  size_t i;
-  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
-  for (i = 0; i < XXH_ACC_NB; i++) {
-
-    xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
-    xxh_u64       acc64 = xacc[i];
-    acc64 = XXH_xorshift64(acc64, 47);
-    acc64 ^= key64;
-    acc64 *= XXH_PRIME32_1;
-    xacc[i] = acc64;
-
-  }
-
-}
-
-XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
-    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
-
-  /*
-   * We need a separate pointer for the hack below,
-   * which requires a non-const pointer.
-   * Any decent compiler will optimize this out otherwise.
-   */
-  const xxh_u8 *kSecretPtr = XXH3_kSecret;
-  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
-
-#if defined(__clang__) && defined(__aarch64__)
-  /*
-   * UGLY HACK:
-   * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
-   * placed sequentially, in order, at the top of the unrolled loop.
-   *
-   * While MOVK is great for generating constants (2 cycles for a 64-bit
-   * constant compared to 4 cycles for LDR), long MOVK chains stall the
-   * integer pipelines:
-   *   I   L   S
-   * MOVK
-   * MOVK
-   * MOVK
-   * MOVK
-   * ADD
-   * SUB      STR
-   *          STR
-   * By forcing loads from memory (as the asm line causes Clang to assume
-   * that kSecretPtr has been changed), the pipelines are used more
-   * efficiently:
-   *   I   L   S
-   *      LDR
-   *  ADD LDR
-   *  SUB     STR
-   *          STR
-   * XXH3_64bits_withSeed, len == 256, Snapdragon 835
-   *   without hack: 2654.4 MB/s
-   *   with hack:    3202.9 MB/s
-   */
-  __asm__("" : "+r"(kSecretPtr));
-#endif
-  /*
-   * Note: in debug mode, this overrides the asm optimization
-   * and Clang will emit MOVK chains again.
-   */
-  XXH_ASSERT(kSecretPtr == XXH3_kSecret);
-
-  {
-
-    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
-    int       i;
-    for (i = 0; i < nbRounds; i++) {
-
-      /*
-       * The asm hack causes Clang to assume that kSecretPtr aliases with
-       * customSecret, and on aarch64, this prevented LDP from merging two
-       * loads together for free. Putting the loads together before the stores
-       * properly generates LDP.
-       */
-      xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
-      xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
-      XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo);
-      XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi);
-
-    }
-
-  }
-
-}
-
-typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
-                                      const void *, XXH3_accWidth_e);
-typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
-typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
-
-#if (XXH_VECTOR == XXH_AVX512)
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
-
-#elif (XXH_VECTOR == XXH_AVX2)
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
-
-#elif (XXH_VECTOR == XXH_SSE2)
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
-
-#elif (XXH_VECTOR == XXH_NEON)
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_neon
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#elif (XXH_VECTOR == XXH_VSX)
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#else                                                             /* scalar */
-
-  #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
-  #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
-  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#endif
-
-#ifndef XXH_PREFETCH_DIST
-  #ifdef __clang__
-    #define XXH_PREFETCH_DIST 320
-  #else
-    #if (XXH_VECTOR == XXH_AVX512)
-      #define XXH_PREFETCH_DIST 512
-    #else
-      #define XXH_PREFETCH_DIST 384
-    #endif
-  #endif                                                       /* __clang__ */
-#endif                                                 /* XXH_PREFETCH_DIST */
-
-/*
- * XXH3_accumulate()
- * Loops over XXH3_accumulate_512().
- * Assumption: nbStripes will not read past the end of the secret
- */
-XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT      acc,
-                                      const xxh_u8 *XXH_RESTRICT input,
-                                      const xxh_u8 *XXH_RESTRICT secret,
-                                      size_t                     nbStripes,
-                                      XXH3_accWidth_e            accWidth,
-                                      XXH3_f_accumulate_512      f_acc512) {
-
-  size_t n;
-  for (n = 0; n < nbStripes; n++) {
-
-    const xxh_u8 *const in = input + n * XXH_STRIPE_LEN;
-    XXH_PREFETCH(in + XXH_PREFETCH_DIST);
-    f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE, accWidth);
-
-  }
-
-}
-
-XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
-    xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
-    XXH3_accWidth_e accWidth, XXH3_f_accumulate_512 f_acc512,
-    XXH3_f_scrambleAcc f_scramble) {
-
-  size_t const nb_rounds =
-      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
-  size_t const block_len = XXH_STRIPE_LEN * nb_rounds;
-  size_t const nb_blocks = len / block_len;
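-
-  /* e.g. with the 192-byte default secret: nb_rounds == (192 - 64) / 8 == 16
-   * stripes per block, so block_len == 16 * 64 == 1024 bytes of input */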
-
-  size_t n;
-
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-
-  for (n = 0; n < nb_blocks; n++) {
-
-    XXH3_accumulate(acc, input + n * block_len, secret, nb_rounds, accWidth,
-                    f_acc512);
-    f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
-
-  }
-
-  /* last partial block */
-  XXH_ASSERT(len > XXH_STRIPE_LEN);
-  {
-
-    size_t const nbStripes = (len - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
-    XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
-    XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes,
-                    accWidth, f_acc512);
-
-    /* last stripe */
-    if (len & (XXH_STRIPE_LEN - 1)) {
-
-      const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
-      /* Do not align on 8, so that the secret is different from the scrambler
-       */
-#define XXH_SECRET_LASTACC_START 7
-      f_acc512(acc, p,
-               secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START,
-               accWidth);
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc,
-                                       const xxh_u8 *XXH_RESTRICT  secret) {
-
-  return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret),
-                            acc[1] ^ XXH_readLE64(secret + 8));
-
-}
-
-static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
-                                   const xxh_u8 *XXH_RESTRICT  secret,
-                                   xxh_u64                     start) {
-
-  xxh_u64 result64 = start;
-  size_t  i = 0;
-
-  for (i = 0; i < 4; i++) {
-
-    result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
-#if defined(__clang__)                                /* Clang */ \
-    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
-    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)              /* Define to disable */
-    /*
-     * UGLY HACK:
-     * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
-     * the one in XXH3_len_129to240_64b. Speeds up the shorter keys above
-     * 240 bytes.
-     * XXH3_64bits, len == 256, Snapdragon 835:
-     *   without hack: 2063.7 MB/s
-     *   with hack:    2560.7 MB/s
-     */
-    __asm__("" : "+r"(result64));
-#endif
-
-  }
-
-  return XXH3_avalanche(result64);
-
-}
-
-#define XXH3_INIT_ACC                                                          \
-  {                                                                            \
-                                                                               \
-    XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, XXH_PRIME64_4, \
-        XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1                            \
-                                                                               \
-  }
-
-XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
-    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
-
-  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-  XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize,
-                              XXH3_acc_64bits, f_acc512, f_scramble);
-
-  /* converge into final hash */
-  XXH_STATIC_ASSERT(sizeof(acc) == 64);
-  /* do not align on 8, so that the secret is different from the accumulator */
-#define XXH_SECRET_MERGEACCS_START 11
-  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-  return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
-                        (xxh_u64)len * XXH_PRIME64_1);
-
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(
-    const xxh_u8 *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
-
-  (void)seed64;
-  return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
-                                    XXH3_accumulate_512, XXH3_scrambleAcc);
-
-}
-
-/*
- * XXH3_hashLong_64b_withSeed():
- * Generate a custom key based on alteration of default XXH3_kSecret with the
- * seed, and then use this key for long mode hashing.
- *
- * This operation is decently fast but nonetheless costs a little bit of time.
- * Try to avoid it whenever possible (typically when seed==0).
- *
- * It's important for performance that XXH3_hashLong is not inlined. Not sure
- * why (uop cache maybe?), but the difference is large and easily measurable.
- */
-XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(
-    const xxh_u8 *input, size_t len, XXH64_hash_t seed,
-    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
-    XXH3_f_initCustomSecret f_initSec) {
-
-  if (seed == 0)
-    return XXH3_hashLong_64b_internal(
-        input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
-  {
-
-    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-    f_initSec(secret, seed);
-    return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
-                                      f_acc512, f_scramble);
-
-  }
-
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const xxh_u8 *input,
-                                                      size_t        len,
-                                                      XXH64_hash_t  seed,
-                                                      const xxh_u8 *secret,
-                                                      size_t        secretLen) {
-
-  (void)secret;
-  (void)secretLen;
-  return XXH3_hashLong_64b_withSeed_internal(
-      input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc,
-      XXH3_initCustomSecret);
-
-}
-
-typedef XXH64_hash_t (*XXH3_hashLong64_f)(const xxh_u8 *XXH_RESTRICT, size_t,
-                                          XXH64_hash_t,
-                                          const xxh_u8 *XXH_RESTRICT, size_t);
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
-                     XXH64_hash_t seed64, const void *XXH_RESTRICT secret,
-                     size_t secretLen, XXH3_hashLong64_f f_hashLong) {
-
-  XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
-  /*
-   * Any handling of an invalid `secretLen` would have to happen here.
-   * For now, it's a contract pre-condition: adding a check and a branch
-   * would cost performance at every hash, and the function signature
-   * doesn't offer room to return an error anyway.
-   */
-  if (len <= 16)
-    return XXH3_len_0to16_64b((const xxh_u8 *)input, len,
-                              (const xxh_u8 *)secret, seed64);
-  if (len <= 128)
-    return XXH3_len_17to128_64b((const xxh_u8 *)input, len,
-                                (const xxh_u8 *)secret, secretLen, seed64);
-  if (len <= XXH3_MIDSIZE_MAX)
-    return XXH3_len_129to240_64b((const xxh_u8 *)input, len,
-                                 (const xxh_u8 *)secret, secretLen, seed64);
-  return f_hashLong((const xxh_u8 *)input, len, seed64, (const xxh_u8 *)secret,
-                    secretLen);
-
-}
-
-/* ===   Public entry point   === */
-
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
-
-  return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
-                              XXH3_hashLong_64b_withSecret);
-
-}
-
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
-                                                   size_t      len,
-                                                   const void *secret,
-                                                   size_t      secretSize) {
-
-  return XXH3_64bits_internal(input, len, 0, secret, secretSize,
-                              XXH3_hashLong_64b_withSecret);
-
-}
-
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
-                                                 XXH64_hash_t seed) {
-
-  return XXH3_64bits_internal(input, len, seed, XXH3_kSecret,
-                              sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
-
-}
-
-/* ===   XXH3 streaming   === */
-
-/*
- * Allocates a pointer that is always aligned to `align`.
- *
- * This must be freed with `XXH_alignedFree()`.
- *
- * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
- * alignment on 32-bit. This isn't enough for the 32 byte aligned loads used
- * by AVX2 or, on 32-bit, for the 16 byte aligned loads used by SSE2 and NEON.
- *
- * This underalignment previously caused a rather obvious crash which went
- * completely unnoticed due to XXH3_createState() not actually being tested.
- * Credit to RedSpah for noticing this bug.
- *
- * The alignment is done manually: functions like posix_memalign or _mm_malloc
- * are avoided because, to stay portable, we would have to write a fallback
- * like this anyway, and testing for the existence of library functions
- * without relying on external build tools is impossible.
- *
- * The method is simple: Overallocate, manually align, and store the offset
- * to the original behind the returned pointer.
- *
- * Align must be a power of 2 and 8 <= align <= 128.
- */
-static void *XXH_alignedMalloc(size_t s, size_t align) {
-
-  XXH_ASSERT(align <= 128 && align >= 8);                    /* range check */
-  XXH_ASSERT((align & (align - 1)) == 0);                     /* power of 2 */
-  XXH_ASSERT(s != 0 && s < (s + align));                  /* empty/overflow */
-  {  /* Overallocate to make room for manual realignment and an offset byte */
-    xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
-    if (base != NULL) {
-
-      /*
-       * Get the offset needed to align this pointer.
-       *
-       * Even when base is already aligned, offset == align, so there is
-       * always at least one byte before ptr to store the offset.
-       */
-      size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
-      /* Add the offset for the now-aligned pointer */
-      xxh_u8 *ptr = base + offset;
-
-      XXH_ASSERT((size_t)ptr % align == 0);
-
-      /* Store the offset immediately before the returned pointer. */
-      ptr[-1] = (xxh_u8)offset;
-      return ptr;
-
-    }
-
-    return NULL;
-
-  }
-
-}
-
-/*
- * Frees an aligned pointer allocated by XXH_alignedMalloc(). Do not pass
- * normal malloc'd pointers; XXH_alignedMalloc has a specific data layout.
- */
-static void XXH_alignedFree(void *p) {
-
-  if (p != NULL) {
-
-    xxh_u8 *ptr = (xxh_u8 *)p;
-    /* Get the offset byte we added in XXH_alignedMalloc(). */
-    xxh_u8 offset = ptr[-1];
-    /* Free the original malloc'd pointer */
-    xxh_u8 *base = ptr - offset;
-    XXH_free(base);
-
-  }
-
-}
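
/*
 * A worked layout example for the two helpers above (hypothetical numbers:
 * align == 64, XXH_malloc returns base == 0x1008):
 *
 *   offset  = 64 - (0x1008 & 63) = 56
 *   ptr     = 0x1008 + 56 = 0x1040          (64-byte aligned)
 *   ptr[-1] = 56                            (lets XXH_alignedFree find base)
 *
 * XXH_alignedFree(ptr) recomputes base = 0x1040 - 56 = 0x1008 and frees it.
 */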
-
-XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
-
-  return (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
-
-  XXH_alignedFree(statePtr);
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
-                                   const XXH3_state_t *src_state) {
-
-  memcpy(dst_state, src_state, sizeof(*dst_state));
-
-}
-
-static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
-                                       XXH64_hash_t seed, const xxh_u8 *secret,
-                                       size_t secretSize) {
-
-  XXH_ASSERT(statePtr != NULL);
-  memset(statePtr, 0, sizeof(*statePtr));
-  statePtr->acc[0] = XXH_PRIME32_3;
-  statePtr->acc[1] = XXH_PRIME64_1;
-  statePtr->acc[2] = XXH_PRIME64_2;
-  statePtr->acc[3] = XXH_PRIME64_3;
-  statePtr->acc[4] = XXH_PRIME64_4;
-  statePtr->acc[5] = XXH_PRIME32_2;
-  statePtr->acc[6] = XXH_PRIME64_5;
-  statePtr->acc[7] = XXH_PRIME32_1;
-  statePtr->seed = seed;
-  XXH_ASSERT(secret != NULL);
-  statePtr->extSecret = secret;
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-  statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
-  statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                             XXH_SECRET_DEFAULT_SIZE);
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
-    XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8 *)secret, secretSize);
-  if (secret == NULL) return XXH_ERROR;
-  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
-                                                        XXH64_hash_t  seed) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, seed, XXH3_kSecret,
-                             XXH_SECRET_DEFAULT_SIZE);
-  XXH3_initCustomSecret(statePtr->customSecret, seed);
-  statePtr->extSecret = NULL;
-  return XXH_OK;
-
-}
-
-XXH_FORCE_INLINE void XXH3_consumeStripes(
-    xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr,
-    size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input,
-    size_t totalStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit,
-    XXH3_accWidth_e accWidth, XXH3_f_accumulate_512 f_acc512,
-    XXH3_f_scrambleAcc f_scramble) {
-
-  XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
-  if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
-
-    /* need a scrambling operation */
-    size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
-    XXH3_accumulate(acc, input,
-                    secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
-                    nbStripes, accWidth, f_acc512);
-    f_scramble(acc, secret + secretLimit);
-    XXH3_accumulate(acc, input + nbStripes * XXH_STRIPE_LEN, secret,
-                    totalStripes - nbStripes, accWidth, f_acc512);
-    *nbStripesSoFarPtr = totalStripes - nbStripes;
-
-  } else {
-
-    XXH3_accumulate(acc, input,
-                    secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
-                    totalStripes, accWidth, f_acc512);
-    *nbStripesSoFarPtr += totalStripes;
-
-  }
-
-}
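
/*
 * A worked trace of the branch above, with hypothetical state:
 * nbStripesPerBlock == 16, *nbStripesSoFarPtr == 14, totalStripes == 4.
 * Since 16 - 14 == 2 <= 4: the first 2 stripes finish the current block
 * (consuming the last secret offsets), the accumulator is scrambled, and
 * the remaining 2 stripes start the next block from the top of the secret,
 * leaving *nbStripesSoFarPtr == 2.
 */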
-
-/*
- * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
- */
-XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
-                                           const xxh_u8 *input, size_t len,
-                                           XXH3_accWidth_e       accWidth,
-                                           XXH3_f_accumulate_512 f_acc512,
-                                           XXH3_f_scrambleAcc    f_scramble) {
-
-  if (input == NULL)
-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
-    (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
-    return XXH_OK;
-#else
-    return XXH_ERROR;
-#endif
-
-  {
-
-    const xxh_u8 *const        bEnd = input + len;
-    const unsigned char *const secret =
-        (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-
-    state->totalLen += len;
-
-    if (state->bufferedSize + len <=
-        XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
-      XXH_memcpy(state->buffer + state->bufferedSize, input, len);
-      state->bufferedSize += (XXH32_hash_t)len;
-      return XXH_OK;
-
-    }
-
-    /* total input is now > XXH3_INTERNALBUFFER_SIZE */
-
-#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
-    XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
-                      0);                                 /* clean multiple */
-
-    /*
-     * There is some input left inside the internal buffer.
-     * Fill it, then consume it.
-     */
-    if (state->bufferedSize) {
-
-      size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
-      XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
-      input += loadSize;
-      XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
-                          state->nbStripesPerBlock, state->buffer,
-                          XXH3_INTERNALBUFFER_STRIPES, secret,
-                          state->secretLimit, accWidth, f_acc512, f_scramble);
-      state->bufferedSize = 0;
-
-    }
-
-    /* Consume input by full buffer quantities */
-    if (input + XXH3_INTERNALBUFFER_SIZE <= bEnd) {
-
-      const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
-      do {
-
-        XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
-                            state->nbStripesPerBlock, input,
-                            XXH3_INTERNALBUFFER_STRIPES, secret,
-                            state->secretLimit, accWidth, f_acc512, f_scramble);
-        input += XXH3_INTERNALBUFFER_SIZE;
-
-      } while (input <= limit);
-
-      /* for last partial stripe */
-      memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN,
-             input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-
-    }
-
-    if (input < bEnd) {                  /* Some remaining input: buffer it */
-      XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
-      state->bufferedSize = (XXH32_hash_t)(bEnd - input);
-
-    }
-
-  }
-
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
-                                                const void *input, size_t len) {
-
-  return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_acc_64bits,
-                     XXH3_accumulate_512, XXH3_scrambleAcc);
-
-}
-
-XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *       acc,
-                                       const XXH3_state_t * state,
-                                       const unsigned char *secret,
-                                       XXH3_accWidth_e      accWidth) {
-
-  /*
-   * Digest on a local copy. This way, the state remains unaltered, and it can
-   * continue ingesting more input afterwards.
-   */
-  memcpy(acc, state->acc, sizeof(state->acc));
-  if (state->bufferedSize >= XXH_STRIPE_LEN) {
-
-    size_t const nbStripes = state->bufferedSize / XXH_STRIPE_LEN;
-    size_t       nbStripesSoFar = state->nbStripesSoFar;
-    XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock,
-                        state->buffer, nbStripes, secret, state->secretLimit,
-                        accWidth, XXH3_accumulate_512, XXH3_scrambleAcc);
-    if (state->bufferedSize % XXH_STRIPE_LEN) {  /* one last partial stripe */
-      XXH3_accumulate_512(
-          acc, state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
-          secret + state->secretLimit - XXH_SECRET_LASTACC_START, accWidth);
-
-    }
-
-  } else {                                 /* bufferedSize < XXH_STRIPE_LEN */
-
-    if (state->bufferedSize) {                           /* one last stripe */
-      xxh_u8       lastStripe[XXH_STRIPE_LEN];
-      size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
-      memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize,
-             catchupSize);
-      memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
-      XXH3_accumulate_512(
-          acc, lastStripe,
-          secret + state->secretLimit - XXH_SECRET_LASTACC_START, accWidth);
-
-    }
-
-  }
-
-}
-
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
-
-  const unsigned char *const secret =
-      (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-  if (state->totalLen > XXH3_MIDSIZE_MAX) {
-
-    XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
-    XXH3_digest_long(acc, state, secret, XXH3_acc_64bits);
-    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
-                          (xxh_u64)state->totalLen * XXH_PRIME64_1);
-
-  }
-
-  /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
-  if (state->seed)
-    return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen,
-                                state->seed);
-  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
-                                secret, state->secretLimit + XXH_STRIPE_LEN);
-
-}
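
/*
 * A minimal usage sketch of the streaming API above. The chunked loop and
 * helper name are illustrative; any split of the input yields the same
 * digest as one-shot XXH3_64bits(). Error handling is elided for brevity.
 */
static XXH64_hash_t hash_in_chunks(const void *data, size_t len) {

  XXH3_state_t *state = XXH3_createState();
  XXH64_hash_t  h;
  size_t        pos = 0;
  XXH3_64bits_reset(state);
  while (pos < len) {                            /* feed 4 KiB at a time */

    size_t chunk = (len - pos) < 4096 ? (len - pos) : 4096;
    XXH3_64bits_update(state, (const char *)data + pos, chunk);
    pos += chunk;

  }

  h = XXH3_64bits_digest(state);     /* state remains valid for more input */
  XXH3_freeState(state);
  return h;

}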
-
-#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
-
-XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
-                                        const void *customSeed,
-                                        size_t      customSeedSize) {
-
-  XXH_ASSERT(secretBuffer != NULL);
-  if (customSeedSize == 0) {
-
-    memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
-    return;
-
-  }
-
-  XXH_ASSERT(customSeed != NULL);
-
-  {
-
-    size_t const       segmentSize = sizeof(XXH128_hash_t);
-    size_t const       nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
-    XXH128_canonical_t scrambler;
-    XXH64_hash_t       seeds[12];
-    size_t             segnb;
-    XXH_ASSERT(nbSegments == 12);
-    XXH_ASSERT(segmentSize * nbSegments ==
-               XXH_SECRET_DEFAULT_SIZE);                  /* exact multiple */
-    XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
-
-    /*
-     * Copy customSeed to seeds[], truncating or repeating as necessary.
-     */
-    {
-
-      size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
-      size_t filled = toFill;
-      memcpy(seeds, customSeed, toFill);
-      while (filled < sizeof(seeds)) {
-
-        toFill = XXH_MIN(filled, sizeof(seeds) - filled);
-        memcpy((char *)seeds + filled, seeds, toFill);
-        filled += toFill;
-
-      }
-
-    }
-
-    /* generate secret */
-    memcpy(secretBuffer, &scrambler, sizeof(scrambler));
-    for (segnb = 1; segnb < nbSegments; segnb++) {
-
-      size_t const       segmentStart = segnb * segmentSize;
-      XXH128_canonical_t segment;
-      XXH128_canonicalFromHash(&segment,
-                               XXH128(&scrambler, sizeof(scrambler),
-                                      XXH_readLE64(seeds + segnb) + segnb));
-      memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment));
-
-    }
-
-  }
-
-}
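
/*
 * A minimal sketch of the intended use of XXH3_generateSecret(): derive a
 * high-entropy secret from arbitrary (possibly low-entropy) seed material,
 * then hash with it. The seed string is hypothetical.
 */
static XXH64_hash_t hash_with_derived_secret(const void *data, size_t len) {

  static const char seed_material[] = "my-application-v1";     /* any blob */
  xxh_u8            secret[XXH_SECRET_DEFAULT_SIZE];
  XXH3_generateSecret(secret, seed_material, sizeof(seed_material) - 1);
  return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));

}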
-
-/* ==========================================
- * XXH3 128 bits (a.k.a. XXH128)
- * ==========================================
- * XXH3's 128-bit variant has better mixing and strength than the 64-bit
- * variant, even without counting the significantly larger output size.
- *
- * For example, extra steps are taken to avoid the seed-dependent collisions
- * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
- *
- * This strength naturally comes at the cost of some speed, especially on short
- * lengths. Note that on longer inputs it is about as fast as the 64-bit
- * version, since it uses only a slight modification of the 64-bit loop.
- *
- * XXH128 is also more oriented towards 64-bit machines. It is still extremely
- * fast for a _128-bit_ hash on 32-bit (it usually beats XXH64).
- */
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input,
-                                                  size_t        len,
-                                                  const xxh_u8 *secret,
-                                                  XXH64_hash_t  seed) {
-
-  /* A doubled version of 1to3_64b with different constants. */
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(1 <= len && len <= 3);
-  XXH_ASSERT(secret != NULL);
-  /*
-   * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
-   * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
-   * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
-   */
-  {
-
-    xxh_u8 const  c1 = input[0];
-    xxh_u8 const  c2 = input[len >> 1];
-    xxh_u8 const  c3 = input[len - 1];
-    xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
-                              ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
-    xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
-    xxh_u64 const bitflipl =
-        (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
-    xxh_u64 const bitfliph =
-        (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
-    xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
-    xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
-    xxh_u64 const mixedl = keyed_lo * XXH_PRIME64_1;
-    xxh_u64 const mixedh = keyed_hi * XXH_PRIME64_5;
-    XXH128_hash_t h128;
-    h128.low64 = XXH3_avalanche(mixedl);
-    h128.high64 = XXH3_avalanche(mixedh);
-    return h128;
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input,
-                                                  size_t        len,
-                                                  const xxh_u8 *secret,
-                                                  XXH64_hash_t  seed) {
-
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(4 <= len && len <= 8);
-  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
-  {
-
-    xxh_u32 const input_lo = XXH_readLE32(input);
-    xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
-    xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
-    xxh_u64 const bitflip =
-        (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
-    xxh_u64 const keyed = input_64 ^ bitflip;
-
-    /* Shift len left so the added term is even: an odd prime plus an even
-     * term keeps the multiplier odd, which avoids lossy even multiplies. */
-    XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
-
-    m128.high64 += (m128.low64 << 1);
-    m128.low64 ^= (m128.high64 >> 3);
-
-    m128.low64 = XXH_xorshift64(m128.low64, 35);
-    m128.low64 *= 0x9FB21C651E98DF25ULL;
-    m128.low64 = XXH_xorshift64(m128.low64, 28);
-    m128.high64 = XXH3_avalanche(m128.high64);
-    return m128;
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input,
-                                                   size_t        len,
-                                                   const xxh_u8 *secret,
-                                                   XXH64_hash_t  seed) {
-
-  XXH_ASSERT(input != NULL);
-  XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(9 <= len && len <= 16);
-  {
-
-    xxh_u64 const bitflipl =
-        (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
-    xxh_u64 const bitfliph =
-        (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
-    xxh_u64 const input_lo = XXH_readLE64(input);
-    xxh_u64       input_hi = XXH_readLE64(input + len - 8);
-    XXH128_hash_t m128 =
-        XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
-    /*
-     * Put len in the middle of m128 to ensure that the length gets mixed to
-     * both the low and high bits in the 128x64 multiply below.
-     */
-    m128.low64 += (xxh_u64)(len - 1) << 54;
-    input_hi ^= bitfliph;
-    /*
-     * Add the high 32 bits of input_hi to the high 32 bits of m128, then
-     * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
-     * the high 64 bits of m128.
-     *
-     * The best approach to this operation is different on 32-bit and 64-bit.
-     */
-    if (sizeof(void *) < sizeof(xxh_u64)) {                       /* 32-bit */
-      /*
-       * 32-bit optimized version, which is more readable.
-       *
-       * On 32-bit, it removes an ADC and delays a dependency between the two
-       * halves of m128.high64, but it generates an extra mask on 64-bit.
-       */
-      m128.high64 += (input_hi & 0xFFFFFFFF00000000) +
-                     XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
-
-    } else {
-
-      /*
-       * 64-bit optimized (albeit more confusing) version.
-       *
-       * Uses some properties of addition and multiplication to remove the mask:
-       *
-       * Let:
-       *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
-       *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
-       *    c = XXH_PRIME32_2
-       *
-       *    a + (b * c)
-       * Inverse Property: x + y - x == y
-       *    a + (b * (1 + c - 1))
-       * Distributive Property: x * (y + z) == (x * y) + (x * z)
-       *    a + (b * 1) + (b * (c - 1))
-       * Identity Property: x * 1 == x
-       *    a + b + (b * (c - 1))
-       *
-       * Substitute a, b, and c:
-       *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
-       *
-       * Since input_hi.hi + input_hi.lo == input_hi, we get this:
-       *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
-       */
-      m128.high64 +=
-          input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
-
-    }
-
-    /* m128 ^= XXH_swap64(m128 >> 64); */
-    m128.low64 ^= XXH_swap64(m128.high64);
-
-    {                      /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
-      XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
-      h128.high64 += m128.high64 * XXH_PRIME64_2;
-
-      h128.low64 = XXH3_avalanche(h128.low64);
-      h128.high64 = XXH3_avalanche(h128.high64);
-      return h128;
-
-    }
-
-  }
-
-}
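
/*
 * A standalone check of the algebraic identity used in the 64-bit branch
 * above: with a == low 32 bits and b == high 32 bits (so a + b == input_hi),
 * b + a*c == input_hi + a*(c - 1). The sample value is arbitrary.
 */
static int identity_check(void) {

  unsigned long long input_hi = 0x123456789ABCDEF0ULL;
  unsigned long long a = input_hi & 0xFFFFFFFFULL;          /* low 32 bits */
  unsigned long long b = input_hi & 0xFFFFFFFF00000000ULL; /* high 32 bits */
  unsigned long long c = 0x85EBCA77ULL;                   /* XXH_PRIME32_2 */
  return b + a * c == input_hi + a * (c - 1);               /* 1 == holds */

}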
-
-/*
- * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
- */
-XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input,
-                                                   size_t        len,
-                                                   const xxh_u8 *secret,
-                                                   XXH64_hash_t  seed) {
-
-  XXH_ASSERT(len <= 16);
-  {
-
-    if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
-    if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
-    if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
-    {
-
-      XXH128_hash_t h128;
-      xxh_u64 const bitflipl =
-          XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
-      xxh_u64 const bitfliph =
-          XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
-      h128.low64 = XXH3_avalanche((XXH_PRIME64_1 + seed) ^ bitflipl);
-      h128.high64 = XXH3_avalanche((XXH_PRIME64_2 - seed) ^ bitfliph);
-      return h128;
-
-    }
-
-  }
-
-}
-
-/*
- * A bit slower than XXH3_mix16B, but handles multiply by zero better.
- */
-XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc,
-                                             const xxh_u8 *input_1,
-                                             const xxh_u8 *input_2,
-                                             const xxh_u8 *secret,
-                                             XXH64_hash_t  seed) {
-
-  acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
-  acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
-  acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
-  acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
-  return acc;
-
-}
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
-
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-  (void)secretSize;
-  XXH_ASSERT(16 < len && len <= 128);
-
-  {
-
-    XXH128_hash_t acc;
-    acc.low64 = len * XXH_PRIME64_1;
-    acc.high64 = 0;
-    if (len > 32) {
-
-      if (len > 64) {
-
-        if (len > 96) {
-
-          acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96,
-                              seed);
-
-        }
-
-        acc =
-            XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
-
-      }
-
-      acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
-
-    }
-
-    acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
-    {
-
-      XXH128_hash_t h128;
-      h128.low64 = acc.low64 + acc.high64;
-      h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
-                    ((len - seed) * XXH_PRIME64_2);
-      h128.low64 = XXH3_avalanche(h128.low64);
-      h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-      return h128;
-
-    }
-
-  }
-
-}
-
-XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
-
-  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-  (void)secretSize;
-  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
-
-  {
-
-    XXH128_hash_t acc;
-    int const     nbRounds = (int)len / 32;
-    int           i;
-    acc.low64 = len * XXH_PRIME64_1;
-    acc.high64 = 0;
-    for (i = 0; i < 4; i++) {
-
-      acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
-                          secret + (32 * i), seed);
-
-    }
-
-    acc.low64 = XXH3_avalanche(acc.low64);
-    acc.high64 = XXH3_avalanche(acc.high64);
-    XXH_ASSERT(nbRounds >= 4);
-    for (i = 4; i < nbRounds; i++) {
-
-      acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
-                          secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
-                          seed);
-
-    }
-
-    /* last bytes */
-    acc = XXH128_mix32B(
-        acc, input + len - 16, input + len - 32,
-        secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
-        0ULL - seed);
-
-    {
-
-      XXH128_hash_t h128;
-      h128.low64 = acc.low64 + acc.high64;
-      h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
-                    ((len - seed) * XXH_PRIME64_2);
-      h128.low64 = XXH3_avalanche(h128.low64);
-      h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-      return h128;
-
-    }
-
-  }
-
-}
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
-    const xxh_u8 *XXH_RESTRICT input, size_t len,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
-    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
-
-  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-  XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize,
-                              XXH3_acc_128bits, f_acc512, f_scramble);
-
-  /* converge into final hash */
-  XXH_STATIC_ASSERT(sizeof(acc) == 64);
-  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-  {
-
-    XXH128_hash_t h128;
-    h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
-                                (xxh_u64)len * XXH_PRIME64_1);
-    h128.high64 = XXH3_mergeAccs(
-        acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-        ~((xxh_u64)len * XXH_PRIME64_2));
-    return h128;
-
-  }
-
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_defaultSecret(
-    const xxh_u8 *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
-
-  (void)seed64;
-  (void)secret;
-  (void)secretLen;
-  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
-                                     sizeof(XXH3_kSecret), XXH3_accumulate_512,
-                                     XXH3_scrambleAcc);
-
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
-    const xxh_u8 *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
-
-  (void)seed64;
-  return XXH3_hashLong_128b_internal(input, len, secret, secretLen,
-                                     XXH3_accumulate_512, XXH3_scrambleAcc);
-
-}
-
-XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
-    const xxh_u8 *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
-    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
-    XXH3_f_initCustomSecret f_initSec) {
-
-  if (seed64 == 0)
-    return XXH3_hashLong_128b_internal(
-        input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
-  {
-
-    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-    f_initSec(secret, seed64);
-    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret),
-                                       f_acc512, f_scramble);
-
-  }
-
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed(
-    const xxh_u8 *input, size_t len, XXH64_hash_t seed64,
-    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
-
-  (void)secret;
-  (void)secretLen;
-  return XXH3_hashLong_128b_withSeed_internal(
-      input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
-      XXH3_initCustomSecret);
-
-}
-
-typedef XXH128_hash_t (*XXH3_hashLong128_f)(const xxh_u8 *XXH_RESTRICT, size_t,
-                                            XXH64_hash_t,
-                                            const xxh_u8 *XXH_RESTRICT, size_t);
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
-                      const xxh_u8 *XXH_RESTRICT secret, size_t secretLen,
-                      XXH3_hashLong128_f f_hl128) {
-
-  XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
-  /*
-   * If an action is to be taken if `secret` conditions are not respected,
-   * it should be done here.
-   * For now, it's a contract pre-condition.
-   * Adding a check and a branch here would cost performance at every hash.
-   */
-  if (len <= 16)
-    return XXH3_len_0to16_128b((const xxh_u8 *)input, len, secret, seed64);
-  if (len <= 128)
-    return XXH3_len_17to128_128b((const xxh_u8 *)input, len, secret, secretLen,
-                                 seed64);
-  if (len <= XXH3_MIDSIZE_MAX)
-    return XXH3_len_129to240_128b((const xxh_u8 *)input, len, secret, secretLen,
-                                  seed64);
-  return f_hl128((const xxh_u8 *)input, len, seed64, secret, secretLen);
-
-}
-
-/* ===   Public XXH128 API   === */
-
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
-
-  return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
-                               sizeof(XXH3_kSecret),
-                               XXH3_hashLong_128b_defaultSecret);
-
-}
-
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
-                                                     size_t      len,
-                                                     const void *secret,
-                                                     size_t      secretSize) {
-
-  return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
-                               secretSize, XXH3_hashLong_128b_withSecret);
-
-}
-
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
-                                                   size_t       len,
-                                                   XXH64_hash_t seed) {
-
-  return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
-                               sizeof(XXH3_kSecret),
-                               XXH3_hashLong_128b_withSeed);
-
-}
-
-XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
-                                    XXH64_hash_t seed) {
-
-  return XXH3_128bits_withSeed(input, len, seed);
-
-}
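
/*
 * Convenience equivalences for the wrapper above:
 *   XXH128(p, n, seed) == XXH3_128bits_withSeed(p, n, seed)
 *   XXH128(p, n, 0)    == XXH3_128bits(p, n)
 */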
-
-/* ===   XXH3 128-bit streaming   === */
-
-/*
- * All the functions are actually the same as for the 64-bit streaming variant.
- * The only difference is the finalization routine.
- */
-
-static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
-                                        XXH64_hash_t seed, const xxh_u8 *secret,
-                                        size_t secretSize) {
-
-  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                              XXH_SECRET_DEFAULT_SIZE);
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
-    XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8 *)secret, secretSize);
-  if (secret == NULL) return XXH_ERROR;
-  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
-                                                         XXH64_hash_t  seed) {
-
-  if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, seed, XXH3_kSecret,
-                              XXH_SECRET_DEFAULT_SIZE);
-  XXH3_initCustomSecret(statePtr->customSecret, seed);
-  statePtr->extSecret = NULL;
-  return XXH_OK;
-
-}
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
-                                                 const void *  input,
-                                                 size_t        len) {
-
-  return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_acc_128bits,
-                     XXH3_accumulate_512, XXH3_scrambleAcc);
-
-}
-
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
-
-  const unsigned char *const secret =
-      (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-  if (state->totalLen > XXH3_MIDSIZE_MAX) {
-
-    XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
-    XXH3_digest_long(acc, state, secret, XXH3_acc_128bits);
-    XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
-               sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-    {
-
-      XXH128_hash_t h128;
-      h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
-                                  (xxh_u64)state->totalLen * XXH_PRIME64_1);
-      h128.high64 =
-          XXH3_mergeAccs(acc,
-                         secret + state->secretLimit + XXH_STRIPE_LEN -
-                             sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
-      return h128;
-
-    }
-
-  }
-
-  /* len <= XXH3_MIDSIZE_MAX : short code */
-  if (state->seed)
-    return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
-                                 state->seed);
-  return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
-                                 secret, state->secretLimit + XXH_STRIPE_LEN);
-
-}
-
-/* 128-bit utility functions */
-
-#include <string.h>                                       /* memcmp, memcpy */
-
-/* return : 1 if equal, 0 if different */
-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
-
-  /* note : XXH128_hash_t is compact, it has no padding byte */
-  return !(memcmp(&h1, &h2, sizeof(h1)));
-
-}
-
-/* This prototype is compatible with stdlib's qsort().
- * return : >0 if *h128_1  > *h128_2
- *          <0 if *h128_1  < *h128_2
- *          =0 if *h128_1 == *h128_2  */
-XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
-
-  XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
-  XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
-  int const           hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
-  /* note : assumes that, in most cases, hash values are different */
-  if (hcmp) return hcmp;
-  return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
-
-}
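
/*
 * A minimal sketch of the comparator's intended use with qsort() (assumes
 * <stdlib.h>): hashes end up ordered by high64 first, then low64.
 */
static void sort_hashes(XXH128_hash_t *hashes, size_t count) {

  qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);

}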
-
-/*======   Canonical representation   ======*/
-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
-                                             XXH128_hash_t       hash) {
-
-  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
-  if (XXH_CPU_LITTLE_ENDIAN) {
-
-    hash.high64 = XXH_swap64(hash.high64);
-    hash.low64 = XXH_swap64(hash.low64);
-
-  }
-
-  memcpy(dst, &hash.high64, sizeof(hash.high64));
-  memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
-
-}
-
-XXH_PUBLIC_API XXH128_hash_t
-XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
-
-  XXH128_hash_t h;
-  h.high64 = XXH_readBE64(src);
-  h.low64 = XXH_readBE64(src->digest + 8);
-  return h;
-
-}
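
/*
 * A round-trip sketch: the canonical form is big-endian and byte-stable
 * across platforms, so a hash written to disk in canonical form can be
 * read back and compared safely.
 */
static int canonical_roundtrip_ok(XXH128_hash_t h) {

  XXH128_canonical_t canon;
  XXH128_canonicalFromHash(&canon, h);               /* 16 portable bytes */
  return XXH128_isEqual(h, XXH128_hashFromCanonical(&canon));

}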
-
-/* Pop our optimization override from above */
-#if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-    && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-    && defined(__OPTIMIZE__) &&                                      \
-    !defined(__OPTIMIZE_SIZE__)                      /* respect -O0 and -Os */
-  #pragma GCC pop_options
-#endif
-
-#endif                                                 /* XXH3_H_1397135465 */
-
diff --git a/include/xxhash.h b/include/xxhash.h
index 826f39bd..0472f881 100644
--- a/include/xxhash.h
+++ b/include/xxhash.h
@@ -197,6 +197,7 @@ extern "C" {
     #define XXH_CAT(A, B) A##B
     #define XXH_NAME2(A, B) XXH_CAT(A, B)
     #define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+    /* XXH32 */
     #define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
     #define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
     #define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
@@ -208,6 +209,7 @@ extern "C" {
       XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
     #define XXH32_hashFromCanonical \
       XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+    /* XXH64 */
     #define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
     #define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
     #define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
@@ -219,14 +221,50 @@ extern "C" {
       XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
     #define XXH64_hashFromCanonical \
       XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+    /* XXH3_64bits */
+    #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+    #define XXH3_64bits_withSecret \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+    #define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+    #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+    #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+    #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+    #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+    #define XXH3_64bits_reset_withSeed \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+    #define XXH3_64bits_reset_withSecret \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+    #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+    #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+    #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+    /* XXH3_128bits */
+    #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+    #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+    #define XXH3_128bits_withSeed \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+    #define XXH3_128bits_withSecret \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+    #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+    #define XXH3_128bits_reset_withSeed \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+    #define XXH3_128bits_reset_withSecret \
+      XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+    #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+    #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+    #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+    #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+    #define XXH128_canonicalFromHash \
+      XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+    #define XXH128_hashFromCanonical \
+      XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
   #endif
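
    /*
     * Example: building with -DXXH_NAMESPACE=mylib_ routes every public
     * symbol through XXH_NAME2, so XXH3_64bits becomes mylib_XXH3_64bits
     * and two copies of xxHash can coexist in one binary.
     */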
 
   /* *************************************
    *  Version
    ***************************************/
   #define XXH_VERSION_MAJOR 0
-  #define XXH_VERSION_MINOR 7
-  #define XXH_VERSION_RELEASE 4
+  #define XXH_VERSION_MINOR 8
+  #define XXH_VERSION_RELEASE 0
   #define XXH_VERSION_NUMBER                                   \
     (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
      XXH_VERSION_RELEASE)
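
  /*
   * For this release: XXH_VERSION_NUMBER == 0*10000 + 8*100 + 0 == 800,
   * so client code can gate on `XXH_VERSION_NUMBER >= 800` for the
   * stabilized XXH3/XXH128 return values.
   */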
@@ -401,145 +439,56 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
 XXH_PUBLIC_API XXH64_hash_t
 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
 
-  #endif                                                /* XXH_NO_LONG_LONG */
-
-#endif                                         /* XXHASH_H_5627135585666179 */
-
-#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
-  #define XXHASH_H_STATIC_13879238742
-/* ****************************************************************************
- * This section contains declarations which are not guaranteed to remain stable.
- * They may change in future versions, becoming incompatible with a different
- * version of the library.
- * These declarations should only be used with static linking.
- * Never use them in association with dynamic linking!
- *****************************************************************************
- */
+/*-**********************************************************************
+ *  XXH3 64-bit variant
+ ************************************************************************/
 
-/*
- * These definitions are only present to allow static allocation of an XXH
- * state, for example, on the stack or in a struct.
- * Never **ever** access members directly.
+/* ************************************************************************
+ * XXH3 is a new hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * In general, expect XXH3 to run about 2x faster on large inputs and >3x
+ * faster on small ones compared to XXH64, though exact differences depend on
+ * the platform.
+ *
+ * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
+ * on all platforms.
+ *
+ * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
+ *
+ * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
+ * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
+ * explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ *
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The 128-bit version adds additional strength, but it is slightly slower.
+ *
+ * Return values of XXH3 and XXH128 are officially finalized starting
+ * with v0.8.0 and will no longer change in future versions.
+ * Avoid storing values from before that release in long-term storage.
+ *
+ * Results produced by v0.7.x are not comparable with results from v0.7.y.
+ * However, the API is completely stable, and it can safely be used for
+ * ephemeral data (local sessions).
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
  */
 
-struct XXH32_state_s {
-
-  XXH32_hash_t total_len_32;
-  XXH32_hash_t large_len;
-  XXH32_hash_t v1;
-  XXH32_hash_t v2;
-  XXH32_hash_t v3;
-  XXH32_hash_t v4;
-  XXH32_hash_t mem32[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t
-      reserved; /* never read nor write, might be removed in a future version */
-
-};                                            /* typedef'd to XXH32_state_t */
-
-  #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
-
-struct XXH64_state_s {
-
-  XXH64_hash_t total_len;
-  XXH64_hash_t v1;
-  XXH64_hash_t v2;
-  XXH64_hash_t v3;
-  XXH64_hash_t v4;
-  XXH64_hash_t mem64[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t reserved32;                   /* required for padding anyway */
-  XXH64_hash_t reserved64; /* never read nor write, might be removed in a future
-                              version */
-
-};                                            /* typedef'd to XXH64_state_t */
-
-  /*-**********************************************************************
-   *  XXH3
-   *  New experimental hash
-   ************************************************************************/
-
-  /* ************************************************************************
-   * XXH3 is a new hash algorithm featuring:
-   *  - Improved speed for both small and large inputs
-   *  - True 64-bit and 128-bit outputs
-   *  - SIMD acceleration
-   *  - Improved 32-bit viability
-   *
-   * Speed analysis methodology is explained here:
-   *
-   *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
-   *
-   * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
-   * faster on small ones compared to XXH64, though exact differences depend on
-   * the platform.
-   *
-   * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
-   * on all platforms.
-   *
-   * It benefits greatly from SIMD and 64-bit arithmetic, but does not require
-   * it.
-   *
-   * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
-   * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
-   * explained in the implementation.
-   *
-   * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON,
-   * POWER8, ZVector and scalar targets. This can be controlled with the
-   * XXH_VECTOR macro.
-   *
-   * XXH3 offers 2 variants, _64bits and _128bits.
-   * When only 64 bits are needed, prefer calling the _64bits variant, as it
-   * reduces the amount of mixing, resulting in faster speed on small inputs.
-   *
-   * It's also generally simpler to manipulate a scalar return type than a
-   * struct.
-   *
-   * The 128-bit version adds additional strength, but it is slightly slower.
-   *
-   * The XXH3 algorithm is still in development.
-   * The results it produces may still change in future versions.
-   *
-   * Results produced by v0.7.x are not comparable with results from v0.7.y.
-   * However, the API is completely stable, and it can safely be used for
-   * ephemeral data (local sessions).
-   *
-   * Avoid storing values in long-term storage until the algorithm is finalized.
-   *
-   * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if
-   * everything remains fine, its current format will be "frozen" and become the
-   * final one.
-   *
-   * After which, return values of XXH3 and XXH128 will no longer change in
-   * future versions.
-   *
-   * XXH3's return values will be officially finalized upon reaching v0.8.0.
-   *
-   * The API supports one-shot hashing, streaming mode, and custom secrets.
-   */
-
-    #ifdef XXH_NAMESPACE
-      #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
-      #define XXH3_64bits_withSecret \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
-      #define XXH3_64bits_withSeed \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
-
-      #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
-      #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
-      #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
-
-      #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
-      #define XXH3_64bits_reset_withSeed \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
-      #define XXH3_64bits_reset_withSecret \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
-      #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
-      #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
-
-      #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
-    #endif
-
 /* XXH3_64bits():
  * default 64-bit variant, using default secret and default seed of 0.
  * It's the fastest variant. */
@@ -547,8 +496,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
 
 /*
  * XXH3_64bits_withSeed():
- * This variant generates a custom secret on the fly based on the default
- * secret, altered using the `seed` value.
+ * This variant generates a custom secret on the fly
+ * based on the default secret, altered using the `seed` value.
  * While this operation is decently fast, note that it's not completely free.
  * Note: seed==0 produces the same results as XXH3_64bits().
  */
@@ -559,74 +508,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
      * XXH3_64bits_withSecret():
      * It's possible to provide any blob of bytes as a "secret" to generate the
      * hash. This makes it more difficult for an external actor to prepare an
-     * intentional collision. secretSize *must* be large enough (>=
-     * XXH3_SECRET_SIZE_MIN). The hash quality depends on the secret's high
-     * entropy, meaning that the secret should look like a bunch of random
-     * bytes. Avoid "trivial" sequences such as text or a bunch of repeated
-     * characters. If you are unsure of the "randonmess" of the blob of bytes,
-     * consider making it a "custom seed" instead,
-     * and use "XXH_generateSecret()" to generate a high quality secret.
+     * intentional collision. The main condition is that secretSize *must* be
+     * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced
+     * hash values depends on secret's entropy. Technically, the secret must
+     * look like a bunch of random bytes. Avoid "trivial" or structured data
+     * such as repeated sequences or a text document. Whenever unsure about the
+     * "randomness" of the blob of bytes, consider relabelling it as a "custom
+     * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate
+     * a high entropy secret derived from the custom seed.
      */
     #define XXH3_SECRET_SIZE_MIN 136
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
                                                    const void *secret,
                                                    size_t      secretSize);
 
-  /* streaming 64-bit */
-
-    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)  /* C11+ */
-      #include <stdalign.h>
-      #define XXH_ALIGN(n) alignas(n)
-    #elif defined(__GNUC__)
-      #define XXH_ALIGN(n) __attribute__((aligned(n)))
-    #elif defined(_MSC_VER)
-      #define XXH_ALIGN(n) __declspec(align(n))
-    #else
-      #define XXH_ALIGN(n)                                      /* disabled */
-    #endif
-
-    /* Old GCC versions only accept the attribute after the type in structures.
-     */
-    #if !(defined(__STDC_VERSION__) &&              \
-          (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
-        && defined(__GNUC__)
-      #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
-    #else
-      #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
-    #endif
-
-typedef struct XXH3_state_s XXH3_state_t;
-
-    #define XXH3_INTERNALBUFFER_SIZE 256
-    #define XXH3_SECRET_DEFAULT_SIZE 192
-struct XXH3_state_s {
-
-  XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-  /* used to store a custom secret generated from a seed */
-  XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
-  XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
-  XXH32_hash_t         bufferedSize;
-  XXH32_hash_t         reserved32;
-  size_t               nbStripesPerBlock;
-  size_t               nbStripesSoFar;
-  size_t               secretLimit;
-  XXH64_hash_t         totalLen;
-  XXH64_hash_t         seed;
-  XXH64_hash_t         reserved64;
-  const unsigned char *extSecret; /* reference to external secret;
-                                   * if == NULL, use .customSecret instead */
-  /* note: there may be some padding at the end due to alignment on 64 bytes */
-
-};                                             /* typedef'd to XXH3_state_t */
-
-    #undef XXH_ALIGN_MEMBER
-
+/*******   Streaming   *******/
 /*
  * Streaming requires state maintenance.
  * This operation costs memory and CPU.
  * As a consequence, streaming is slower than one-shot hashing.
- * For better performance, prefer one-shot functions whenever possible.
+ * For better performance, prefer one-shot functions whenever applicable.
  */
+typedef struct XXH3_state_s XXH3_state_t;
 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
 XXH_PUBLIC_API void          XXH3_copyState(XXH3_state_t *      dst_state,
@@ -634,8 +537,8 @@ XXH_PUBLIC_API void          XXH3_copyState(XXH3_state_t *      dst_state,
 
 /*
  * XXH3_64bits_reset():
- * Initialize with the default parameters.
- * The result will be equivalent to `XXH3_64bits()`.
+ * Initialize with default parameters.
+ * The digest will be equivalent to `XXH3_64bits()`.
  */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr);
 /*
@@ -647,9 +550,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
                                                         XXH64_hash_t  seed);
 /*
  * XXH3_64bits_reset_withSecret():
- * `secret` is referenced, and must outlive the hash streaming session, so
- * be careful when using stack arrays.
- * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
+ * `secret` is referenced, not copied: it _must outlive_ the hash streaming
+ * session. Similar to the one-shot API, `secretSize` must be
+ * >= `XXH3_SECRET_SIZE_MIN`, and the quality of produced hash values depends
+ * on the secret's entropy (the secret's content should look like a bunch of
+ * random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
  */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
     XXH3_state_t *statePtr, const void *secret, size_t secretSize);
@@ -659,31 +565,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *statePtr,
                                                 size_t        length);
 XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest(const XXH3_state_t *statePtr);
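
 /*
  * A minimal streaming sketch (illustrative only; `read_chunk()` is a
  * hypothetical data source returning the number of bytes written into
  * `chunk`, 0 at end of input):
  *
  *   XXH3_state_t *const state = XXH3_createState();
  *   if (state == NULL) abort();                 // allocation failure
  *   if (XXH3_64bits_reset(state) == XXH_ERROR) abort();
  *   char   chunk[4096];
  *   size_t n;
  *   while ((n = read_chunk(chunk, sizeof(chunk))) > 0)
  *     XXH3_64bits_update(state, chunk, n);
  *   XXH64_hash_t const hash = XXH3_64bits_digest(state);
  *   XXH3_freeState(state);
  */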
 
-  /* 128-bit */
-
-    #ifdef XXH_NAMESPACE
-      #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
-      #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
-      #define XXH3_128bits_withSeed \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
-      #define XXH3_128bits_withSecret \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
-
-      #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
-      #define XXH3_128bits_reset_withSeed \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
-      #define XXH3_128bits_reset_withSecret \
-        XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
-      #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
-      #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
-
-      #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
-      #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
-      #define XXH128_canonicalFromHash \
-        XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
-      #define XXH128_hashFromCanonical \
-        XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
-    #endif
+/* Note: the canonical representation of XXH3_64bits() is the same as XXH64's,
+ * since they both produce XXH64_hash_t values */
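+
+/* A serialization sketch (illustrative only; `buf` and `len` stand for
+ * caller-provided data):
+ *
+ *   XXH64_canonical_t c;
+ *   XXH64_canonicalFromHash(&c, XXH3_64bits(buf, len));  // big-endian bytes
+ */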
+
+/*-**********************************************************************
+ *  XXH3 128-bit variant
+ ************************************************************************/
 
 typedef struct {
 
@@ -692,16 +579,28 @@ typedef struct {
 
 } XXH128_hash_t;
 
-XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
-                                    XXH64_hash_t seed);
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *data, size_t len);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(
-    const void *data, size_t len, XXH64_hash_t seed);        /* == XXH128() */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *data, size_t len,
+                                                   XXH64_hash_t seed);
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *data,
                                                      size_t      len,
                                                      const void *secret,
                                                      size_t      secretSize);
 
+/*******   Streaming   *******/
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use the already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have the same meaning as their 64-bit
+ * counterparts.
+ */
+
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr);
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
                                                          XXH64_hash_t  seed);
@@ -713,7 +612,10 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *statePtr,
                                                  size_t        length);
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *statePtr);
 
-/* Note: For better performance, these functions can be inlined using
+/* The following helper functions make it possible to compare XXH128_hash_t
+ * values. Since XXH128_hash_t is a structure, this capability is not offered
+ * by the language.
+ * Note: For better performance, these functions can be inlined using
  * XXH_INLINE_ALL */
 
 /*!
@@ -745,6 +647,116 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
 XXH_PUBLIC_API XXH128_hash_t
 XXH128_hashFromCanonical(const XXH128_canonical_t *src);
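
 /*
  * A comparison sketch (illustrative only; `a`, `lenA`, `b` and `lenB` stand
  * for caller-provided data):
  *
  *   XXH128_hash_t const h1 = XXH3_128bits(a, lenA);
  *   XXH128_hash_t const h2 = XXH3_128bits(b, lenB);
  *   int const same  = XXH128_isEqual(h1, h2);  // 1 if equal, 0 otherwise
  *   int const order = XXH128_cmp(&h1, &h2);    // memcmp()-like ordering
  */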
 
+  #endif                                                /* XXH_NO_LONG_LONG */
+
+#endif                                         /* XXHASH_H_5627135585666179 */
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+  #define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ *****************************************************************************
+*/
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+struct XXH32_state_s {
+
+  XXH32_hash_t total_len_32;
+  XXH32_hash_t large_len;
+  XXH32_hash_t v1;
+  XXH32_hash_t v2;
+  XXH32_hash_t v3;
+  XXH32_hash_t v4;
+  XXH32_hash_t mem32[4];
+  XXH32_hash_t memsize;
+  XXH32_hash_t
+      reserved; /* never read nor written; may be removed in a future version */
+
+};                                            /* typedef'd to XXH32_state_t */
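+
+/* A stack-allocation sketch (illustrative only; requires defining
+ * XXH_STATIC_LINKING_ONLY before including this header; `buf` and `len`
+ * stand for caller-provided data):
+ *
+ *   XXH32_state_t state;             // no XXH32_createState() call needed,
+ *   XXH32_reset(&state, 0);          // but members must never be touched
+ *   XXH32_update(&state, buf, len);
+ *   XXH32_hash_t const h = XXH32_digest(&state);
+ */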
+
+  #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
+
+struct XXH64_state_s {
+
+  XXH64_hash_t total_len;
+  XXH64_hash_t v1;
+  XXH64_hash_t v2;
+  XXH64_hash_t v3;
+  XXH64_hash_t v4;
+  XXH64_hash_t mem64[4];
+  XXH32_hash_t memsize;
+  XXH32_hash_t reserved32;                   /* required for padding anyway */
+  XXH64_hash_t reserved64; /* never read nor written; may be removed in a
+                              future version */
+
+};                                            /* typedef'd to XXH64_state_t */
+
+    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)  /* C11+ */
+      #include <stdalign.h>
+      #define XXH_ALIGN(n) alignas(n)
+    #elif defined(__GNUC__)
+      #define XXH_ALIGN(n) __attribute__((aligned(n)))
+    #elif defined(_MSC_VER)
+      #define XXH_ALIGN(n) __declspec(align(n))
+    #else
+      #define XXH_ALIGN(n)                                      /* disabled */
+    #endif
+
+    /* Old GCC versions only accept the attribute after the type in structures.
+     */
+    #if !(defined(__STDC_VERSION__) &&              \
+          (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+        && defined(__GNUC__)
+      #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+    #else
+      #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+    #endif
+
+    #define XXH3_INTERNALBUFFER_SIZE 256
+    #define XXH3_SECRET_DEFAULT_SIZE 192
+struct XXH3_state_s {
+
+  XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+  /* used to store a custom secret generated from a seed */
+  XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+  XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+  XXH32_hash_t         bufferedSize;
+  XXH32_hash_t         reserved32;
+  size_t               nbStripesSoFar;
+  XXH64_hash_t         totalLen;
+  size_t               nbStripesPerBlock;
+  size_t               secretLimit;
+  XXH64_hash_t         seed;
+  XXH64_hash_t         reserved64;
+  const unsigned char *extSecret; /* reference to external secret;
+                                   * if == NULL, use .customSecret instead */
+  /* note: there may be some padding at the end due to alignment on 64 bytes */
+
+};                                             /* typedef'd to XXH3_state_t */
+
+    #undef XXH_ALIGN_MEMBER
+
+    /* When the XXH3_state_t structure is merely allocated on the stack,
+     * it should be initialized with XXH3_INITSTATE() or a memset()
+     * if its first reset uses XXH3_NNbits_reset_withSeed().
+     * This init can be omitted if the first reset uses default or _withSecret
+     * mode. This operation isn't necessary when the state is created with
+     * XXH3_createState(). Note that this doesn't prepare the state for a
+     * streaming operation, it's still necessary to use XXH3_NNbits_reset*()
+     * afterwards.
+     */
+    #define XXH3_INITSTATE(XXH3_state_ptr) \
+      { (XXH3_state_ptr)->seed = 0; }
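+
+    /* A sketch of the on-stack pattern described above (illustrative only;
+     * `buf`, `len` and `seed` stand for caller-provided values):
+     *
+     *   XXH3_state_t state;
+     *   XXH3_INITSTATE(&state);
+     *   XXH3_64bits_reset_withSeed(&state, seed);
+     *   XXH3_64bits_update(&state, buf, len);
+     *   XXH64_hash_t const h = XXH3_64bits_digest(&state);
+     */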
+
 /* ===   Experimental API   === */
 /* Symbols defined below must be considered tied to a specific library version.
  */
@@ -752,17 +764,19 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
 /*
  * XXH3_generateSecret():
  *
- * Derive a secret for use with `*_withSecret()` prototypes of XXH3.
- * Use this if you need a higher level of security than the one provided by
- * 64bit seed.
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()`
+ * functions. The `_withSecret()` variants are useful to provide a higher
+ * level of protection than a 64-bit seed, as it becomes much more difficult
+ * for an external actor to guess how to impact the calculation logic.
  *
- * Take as input a custom seed of any length and any content,
- * generate from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
- * into already allocated buffer secretBuffer.
- * The generated secret ALWAYS is XXH_SECRET_DEFAULT_SIZE bytes long.
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
+ * into an already allocated buffer secretBuffer.
+ * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
  *
  * The generated secret can then be used with any `*_withSecret()` variant.
- * The functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
+ * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
  * are part of this list. They all accept a `secret` parameter
  * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
@@ -771,8 +785,8 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
  * this function can be used to generate a secret of proper quality.
  *
  * customSeed can be anything. It can have any size, even small ones,
- * and its content can be anything, even some "low entropy" source such as a
- * bunch of zeroes. The resulting `secret` will nonetheless respect all expected
+ * and its content can be anything, even a "low entropy" source such as a
+ * bunch of zeroes. The resulting `secret` will nonetheless provide all expected
  * qualities.
  *
  * Supplying NULL as the customSeed copies the default secret into
@@ -783,6 +797,10 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
                                         const void *customSeed,
                                         size_t      customSeedSize);
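
 /*
  * A derivation sketch (illustrative only; `passphrase` is a hypothetical
  * low-entropy custom seed, `buf` and `len` stand for caller-provided data):
  *
  *   unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
  *   XXH3_generateSecret(secret, passphrase, strlen(passphrase));
  *   XXH64_hash_t const h =
  *       XXH3_64bits_withSecret(buf, len, secret, sizeof(secret));
  */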
 
+/* Simple shortcut to the pre-selected XXH3_128bits variant */
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
+                                    XXH64_hash_t seed);
+
   #endif                                                /* XXH_NO_LONG_LONG */
 
   #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
@@ -799,17 +817,23 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
 /*-**********************************************************************
  * xxHash implementation
  *-**********************************************************************
- * xxHash's implementation used to be found in xxhash.c.
+ * xxHash's implementation used to be hosted inside xxhash.c.
  *
- * However, code inlining requires the implementation to be visible to the
- * compiler, usually within the header.
+ * However, inlining requires the implementation to be visible to the
+ * compiler, hence to be included alongside the header.
+ * Previously, the implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in the /include directory.
  *
- * As a workaround, xxhash.c used to be included within xxhash.h. This caused
- * some issues with some build systems, especially ones which treat .c files
- * as source files.
+ * The xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
  *
- * Therefore, the implementation is now directly integrated within xxhash.h.
- * Another small advantage is that xxhash.c is no longer needed in /include.
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
  ************************************************************************/
 
 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || \
@@ -828,10 +852,10 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
    * Unfortunately, on some target/compiler combinations, the generated assembly
    * is sub-optimal.
    *
-   * The below switch allow to select a different access method for improved
-   * performance.
+   * The switch below allows selection of a different access method
+   * in the search for improved performance.
    * Method 0 (default):
-   *     Use `memcpy()`. Safe and portable.
+   *     Use `memcpy()`. Safe and portable. Default.
    * Method 1:
    *     `__attribute__((packed))` statement. It depends on compiler extensions
    *     and is therefore not portable.
@@ -843,7 +867,7 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
    *     It can generate buggy code on targets which do not support unaligned
    *     memory accesses.
    *     But in some circumstances, it's the only known way to get the most
-   *     performance (ie GCC + ARMv6)
+   *     performance (example: GCC + ARMv6)
    * Method 3:
    *     Byteshift. This can generate the best code on old compilers which don't
    *     inline small `memcpy()` calls, and it might also be faster on
@@ -924,7 +948,8 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
    * -fno-inline with GCC or Clang, this will automatically be defined.
    */
   #ifndef XXH_NO_INLINE_HINTS
-    #if defined(__OPTIMIZE_SIZE__) || defined(__NO_INLINE__)
+    #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+        || defined(__NO_INLINE__)                       /* -O0, -fno-inline */
       #define XXH_NO_INLINE_HINTS 1
     #else
       #define XXH_NO_INLINE_HINTS 0
@@ -950,8 +975,8 @@ XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
    *  Includes & Memory related functions
    ***************************************/
   /*!
-   * Modify the local functions below should you wish to use some other memory
-   * routines for malloc() and free()
+   * Modify the local functions below should you wish to use
+   * different memory routines for malloc() and free()
    */
   #include <stdlib.h>
 
@@ -1137,7 +1162,8 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
      * Try to detect endianness automatically, to avoid the nonstandard behavior
      * in `XXH_isLittleEndian()`
      */
-    #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || \
+    #if defined(_WIN32) /* Windows is always little endian */ \
+        || defined(__LITTLE_ENDIAN__) ||                      \
         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
       #define XXH_CPU_LITTLE_ENDIAN 1
     #elif defined(__BIG_ENDIAN__) || \
@@ -1163,7 +1189,7 @@ static int XXH_isLittleEndian(void) {
   return one.c[0];
 
 }
-
+\
       #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
     #endif
   #endif
@@ -1371,9 +1397,7 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
 
 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
-
-  /* dummy comment */
-
+\
   #define XXH_PROCESS1                           \
     do {                                         \
                                                  \
@@ -1778,13 +1802,16 @@ typedef XXH64_hash_t xxh_u64;
      * rerolled.
      */
     #ifndef XXH_REROLL_XXH64
-      #if (defined(__ILP32__) || defined(_ILP32)) ||                           \
-          !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) ||     \
-            defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) || \
-            defined(__PPC64__) || defined(__PPC64LE__) ||                      \
-            defined(__ppc64__) || defined(__powerpc64__) ||                    \
-            defined(__mips64__) || defined(__mips64)) ||                       \
-          (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX)
+      #if (defined(__ILP32__) ||                                              \
+           defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+          || !(defined(__x86_64__) || defined(_M_X64) ||                      \
+               defined(_M_AMD64) /* x86-64 */                                 \
+               || defined(_M_ARM64) || defined(__aarch64__) ||                \
+               defined(__arm64__) /* aarch64 */                               \
+               || defined(__PPC64__) || defined(__PPC64LE__) ||               \
+               defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */       \
+               || defined(__mips64__) || defined(__mips64)) /* mips64 */      \
+          || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX)  /* check limits */
         #define XXH_REROLL_XXH64 1
       #else
         #define XXH_REROLL_XXH64 0
@@ -1923,21 +1950,16 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void *  ptr,
 
 /*******   xxh64   *******/
 
-static const xxh_u64 XXH_PRIME64_1 =
-    0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
-                            */
-static const xxh_u64 XXH_PRIME64_2 =
-    0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
-                            */
-static const xxh_u64 XXH_PRIME64_3 =
-    0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
-                            */
-static const xxh_u64 XXH_PRIME64_4 =
-    0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
-                            */
-static const xxh_u64 XXH_PRIME64_5 =
-    0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
-                            */
+static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
+                                                             */
+static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
+                                                             */
+static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
+                                                             */
+static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
+                                                             */
+static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
+                                                             */
 
     #ifdef XXH_OLD_NAMES
       #define PRIME64_1 XXH_PRIME64_1
@@ -1980,9 +2002,7 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
 
 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
-
-    /* dummy comment */
-
+\
     #define XXH_PROCESS1_64                        \
       do {                                         \
                                                    \
@@ -2428,7 +2448,3132 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
    *  New generation hash designed for speed on small keys and vectorization
    ************************************************************************ */
 
-    #include "xxh3.h"
+  /* ===   Compiler specifics   === */
+
+    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  /* >= C99 */
+      #define XXH_RESTRICT restrict
+    #else
+      /* Note: it might be useful to define __restrict or __restrict__ for some
+       * C++ compilers */
+      #define XXH_RESTRICT                                       /* disable */
+    #endif
+
+    #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
+        (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
+        defined(__clang__)
+      #define XXH_likely(x) __builtin_expect(x, 1)
+      #define XXH_unlikely(x) __builtin_expect(x, 0)
+    #else
+      #define XXH_likely(x) (x)
+      #define XXH_unlikely(x) (x)
+    #endif
+
+    #if defined(__GNUC__)
+      #if defined(__AVX2__)
+        #include <immintrin.h>
+      #elif defined(__SSE2__)
+        #include <emmintrin.h>
+      #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+        #define inline __inline__                 /* circumvent a clang bug */
+        #include <arm_neon.h>
+        #undef inline
+      #endif
+    #elif defined(_MSC_VER)
+      #include <intrin.h>
+    #endif
+
+    /*
+     * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+     * remaining a true 64-bit/128-bit hash function.
+     *
+     * This is done by prioritizing a subset of 64-bit operations that can be
+     * emulated without too many steps on the average 32-bit machine.
+     *
+     * For example, these two lines seem similar, and run equally fast on
+     * 64-bit:
+     *
+     *   xxh_u64 x;
+     *   x ^= (x >> 47); // good
+     *   x ^= (x >> 13); // bad
+     *
+     * However, to a 32-bit machine, there is a major difference.
+     *
+     * x ^= (x >> 47) looks like this:
+     *
+     *   x.lo ^= (x.hi >> (47 - 32));
+     *
+     * while x ^= (x >> 13) looks like this:
+     *
+     *   // note: funnel shifts are not usually cheap.
+     *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+     *   x.hi ^= (x.hi >> 13);
+     *
+     * The first one is significantly faster than the second, simply because the
+     * shift is larger than 32. This means:
+     *  - All the bits we need are in the upper 32 bits, so we can ignore the
+     * lower 32 bits in the shift.
+     *  - The shift result will always fit in the lower 32 bits, and therefore,
+     *    we can ignore the upper 32 bits in the xor.
+     *
+     * Thanks to this optimization, XXH3 only requires these features to be
+     * efficient:
+     *
+     *  - Usable unaligned access
+     *  - A 32-bit or 64-bit ALU
+     *      - If 32-bit, a decent ADC instruction
+     *  - A 32 or 64-bit multiply with a 64-bit result
+     *  - For the 128-bit variant, a decent byteswap helps short inputs.
+     *
+     * The first two are already required by XXH32, and almost all 32-bit and
+     * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
+     *
+     * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+     * notable exception.
+     *
+     * First of all, Thumb-1 lacks support for the UMULL instruction which
+     * performs the important long multiply. This means numerous __aeabi_lmul
+     * calls.
+     *
+     * Second of all, the 8 functional registers are just not enough.
+     * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
+     * need Lo registers, and this shuffling results in thousands more MOVs than
+     * A32.
+     *
+     * A32 and T32 don't have this limitation. They can access all 14 registers,
+     * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+     * shifts is helpful, too.
+     *
+     * Therefore, we do a quick sanity check.
+     *
+     * If compiling Thumb-1 for a target which supports ARM instructions, we
+     * will emit a warning, as it is not a "sane" platform to compile for.
+     *
+     * Usually, if this happens, it is because of an accident and you probably
+     * need to specify -march, as you likely meant to compile for a newer
+     * architecture.
+     *
+     * Credit: large sections of the vectorial and asm source code paths
+     *         have been contributed by @easyaspi314
+     */
+    #if defined(__thumb__) && !defined(__thumb2__) && \
+        defined(__ARM_ARCH_ISA_ARM)
+      #warning "XXH3 is highly inefficient without ARM or Thumb-2."
+    #endif
+
+    /* ==========================================
+     * Vectorization detection
+     * ========================================== */
+    #define XXH_SCALAR 0                         /* Portable scalar version */
+    #define XXH_SSE2 1                 /* SSE2 for Pentium 4 and all x86_64 */
+    #define XXH_AVX2 2                    /* AVX2 for Haswell and Bulldozer */
+    #define XXH_AVX512 3                  /* AVX512 for Skylake and Icelake */
+    #define XXH_NEON 4             /* NEON for most ARMv7-A and all AArch64 */
+    #define XXH_VSX 5                     /* VSX and ZVector for POWER8/z13 */
+
+    #ifndef XXH_VECTOR                    /* can be defined on command line */
+      #if defined(__AVX512F__)
+        #define XXH_VECTOR XXH_AVX512
+      #elif defined(__AVX2__)
+        #define XXH_VECTOR XXH_AVX2
+      #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
+          (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+        #define XXH_VECTOR XXH_SSE2
+      #elif defined(__GNUC__) /* msvc support maybe later */                   \
+          && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&                 \
+          (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+           || (defined(__BYTE_ORDER__) &&                                      \
+               __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+        #define XXH_VECTOR XXH_NEON
+      #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
+          (defined(__s390x__) && defined(__VEC__)) &&             \
+              defined(__GNUC__)                             /* TODO: IBM XL */
+        #define XXH_VECTOR XXH_VSX
+      #else
+        #define XXH_VECTOR XXH_SCALAR
+      #endif
+    #endif
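+
+    /* A build-time override sketch (illustrative only): the detection above
+     * can be bypassed from the compiler command line, e.g.
+     *
+     *   cc -O3 -DXXH_VECTOR=XXH_SCALAR -c xxhash.c
+     *
+     * forces the portable scalar code path regardless of available ISA
+     * extensions. */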
+
+    /*
+     * Controls the alignment of the accumulator,
+     * for compatibility with aligned vector loads, which are usually faster.
+     */
+    #ifndef XXH_ACC_ALIGN
+      #if defined(XXH_X86DISPATCH)
+        #define XXH_ACC_ALIGN 64           /* for compatibility with avx512 */
+      #elif XXH_VECTOR == XXH_SCALAR                              /* scalar */
+        #define XXH_ACC_ALIGN 8
+      #elif XXH_VECTOR == XXH_SSE2                                  /* sse2 */
+        #define XXH_ACC_ALIGN 16
+      #elif XXH_VECTOR == XXH_AVX2                                  /* avx2 */
+        #define XXH_ACC_ALIGN 32
+      #elif XXH_VECTOR == XXH_NEON                                  /* neon */
+        #define XXH_ACC_ALIGN 16
+      #elif XXH_VECTOR == XXH_VSX                                    /* vsx */
+        #define XXH_ACC_ALIGN 16
+      #elif XXH_VECTOR == XXH_AVX512                              /* avx512 */
+        #define XXH_ACC_ALIGN 64
+      #endif
+    #endif
+
+    #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
+        XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+      #define XXH_SEC_ALIGN XXH_ACC_ALIGN
+    #else
+      #define XXH_SEC_ALIGN 8
+    #endif
+
+    /*
+     * UGLY HACK:
+     * GCC usually generates the best code with -O3 for xxHash.
+     *
+     * However, when targeting AVX2, it is overzealous in its unrolling
+     * resulting in code roughly 3/4 the speed of Clang.
+     *
+     * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+     * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+     * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+     *
+     * That is why when compiling the AVX2 version, it is recommended to use
+     * either -O2 -mavx2 -march=haswell or -O2 -mavx2
+     * -mno-avx256-split-unaligned-load for decent performance, or to use Clang
+     * instead.
+     *
+     * Fortunately, we can control the first one with a pragma that forces GCC
+     * into -O2, but the other one we can't control without "failed to inline
+     * always inline function due to target mismatch" warnings.
+     */
+    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+        && defined(__OPTIMIZE__) &&                                      \
+        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
+      #pragma GCC push_options
+      #pragma GCC optimize("-O2")
+    #endif
+
+    #if XXH_VECTOR == XXH_NEON
+      /*
+       * NEON's setup for vmlal_u32 is a little more complicated than it is on
+       * SSE2, AVX2, and VSX.
+       *
+       * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
+       * upcast.
+       *
+       * To do the same operation, the 128-bit 'Q' register needs to be split
+       * into two 64-bit 'D' registers, performing this operation::
+       *
+       *   [                a                 |                 b                ]
+       *            |              '---------. .--------'                |
+       *            |                         x                          |
+       *            |              .---------' '--------.                |
+       *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
+       *
+       * Due to significant changes in aarch64, the fastest method for aarch64
+       * is completely different than the fastest method for ARMv7-A.
+       *
+       * ARMv7-A treats D registers as unions overlaying Q registers, so
+       * modifying D11 will modify the high half of Q5. This is similar to how
+       * modifying AH will only affect bits 8-15 of AX on x86.
+       *
+       * VZIP takes two registers, and puts even lanes in one register and odd
+       * lanes in the other.
+       *
+       * On ARMv7-A, this strangely modifies both parameters in place instead of
+       * taking the usual 3-operand form.
+       *
+       * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on
+       * the lower and upper halves of the Q register to end up with the high
+       * and low halves where we want - all in one instruction.
+       *
+       *   vzip.32   d10, d11    @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
+       *
+       * Unfortunately we need inline assembly for this: instructions modifying
+       * two registers at once are not expressible in GCC or Clang's IR, and
+       * the compilers would have to create a copy.
+       *
+       * aarch64 requires a different approach.
+       *
+       * In order to make it easier to write a decent compiler for aarch64, many
+       * quirks were removed, such as conditional execution.
+       *
+       * NEON was also affected by this.
+       *
+       * aarch64 cannot access the high bits of a Q-form register, and writes to
+       * a D-form register zero the high bits, similar to how writes to W-form
+       * scalar registers (or DWORD registers on x86_64) work.
+       *
+       * The formerly free vget_high intrinsics now require a vext (with a few
+       * exceptions).
+       *
+       * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
+       * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
+       * only modify one operand.
+       *
+       * The equivalent of the VZIP.32 on the lower and upper halves would be
+       * this mess:
+       *
+       *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+       *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+       *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+       *
+       * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
+       * (SHRN):
+       *
+       *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+       *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+       *
+       * This is available on ARMv7-A, but is less efficient than a single
+       * VZIP.32.
+       */
+
+      /*
+       * Function-like macro:
+       * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo,
+       *                         uint32x2_t &outHi)
+       * {
+       *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+       *     outHi = (uint32x2_t)(in >> 32);
+       *     in = UNDEFINED;
+       * }
+       */
+      #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+          && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
+        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
+          do {                                                                                         \
+                                                                                                       \
+            /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
+             * %f0 = upper D half */                                                                   \
+            /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
+             */                                                                                        \
+            /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
+             */                                                                                        \
+            __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
+            (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
+            (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
+                                                                                                       \
+          } while (0)
+      #else
+        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+          do {                                       \
+                                                     \
+            (outLo) = vmovn_u64(in);                 \
+            (outHi) = vshrn_n_u64((in), 32);         \
+                                                     \
+          } while (0)
+      #endif
+    #endif                                        /* XXH_VECTOR == XXH_NEON */
+
+    /*
+     * VSX and Z Vector helpers.
+     *
+     * This is very messy, and any pull requests to clean this up are welcome.
+     *
+     * There are a lot of problems with supporting VSX and s390x, due to
+     * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+     */
+    #if XXH_VECTOR == XXH_VSX
+      #if defined(__s390x__)
+        #include <s390intrin.h>
+      #else
+        /* gcc's altivec.h can have the unwanted consequence of unconditionally
+         * #define-ing the bool, vector, and pixel keywords,
+         * with bad consequences for programs already using these keywords for
+         * other purposes. The paragraph defining these macros is skipped when
+         * __APPLE_ALTIVEC__ is defined.
+         * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
+         * compiler, but it seems that, in some cases, it isn't. Force the build
+         * macro to be defined, so that keywords are not altered.
+         */
+        #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+          #define __APPLE_ALTIVEC__
+        #endif
+        #include <altivec.h>
+      #endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char      xxh_u8x16;
+typedef __vector unsigned           xxh_u32x4;
+
+      #ifndef XXH_VSX_BE
+        #if defined(__BIG_ENDIAN__) ||  \
+            (defined(__BYTE_ORDER__) && \
+             __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+          #define XXH_VSX_BE 1
+        #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
+            __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+          #warning \
+              "-maltivec=be is not recommended. Please use native endianness."
+          #define XXH_VSX_BE 1
+        #else
+          #define XXH_VSX_BE 0
+        #endif
+      #endif                                        /* !defined(XXH_VSX_BE) */
+
+      #if XXH_VSX_BE
+        /* A wrapper for POWER9's vec_revb. */
+        #if defined(__POWER9_VECTOR__) || \
+            (defined(__clang__) && defined(__s390x__))
+          #define XXH_vec_revb vec_revb
+        #else
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
+
+  xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                               0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
+  return vec_perm(val, val, vByteSwap);
+
+}
+
+        #endif
+      #endif                                                  /* XXH_VSX_BE */
+
+/*
+ * Performs an unaligned load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
+
+  xxh_u64x2 ret;
+  memcpy(&ret, ptr, sizeof(xxh_u64x2));
+      #if XXH_VSX_BE
+  ret = XXH_vec_revb(ret);
+      #endif
+  return ret;
+
+}
+
+      /*
+       * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
+       *
+       * These intrinsics weren't added until GCC 8, despite existing for a
+       * while, and they are endian dependent. Also, their meaning swaps
+       * depending on the version.
+       */
+      #if defined(__s390x__)
+      /* s390x is always big endian, no issue on this platform */
+        #define XXH_vec_mulo vec_mulo
+        #define XXH_vec_mule vec_mule
+      #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+        /* Clang has a better way to control this, we can just use the builtin
+         * which doesn't swap. */
+        #define XXH_vec_mulo __builtin_altivec_vmulouw
+        #define XXH_vec_mule __builtin_altivec_vmuleuw
+      #else
+/* gcc needs inline assembly */
+/* Adapted from
+ * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
+
+  xxh_u64x2 result;
+  __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+
+}
+
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
+
+  xxh_u64x2 result;
+  __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+
+}
+
+      #endif                                  /* XXH_vec_mulo, XXH_vec_mule */
+    #endif                                         /* XXH_VECTOR == XXH_VSX */
+
+    /* prefetch
+     * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+    #if defined(XXH_NO_PREFETCH)
+      #define XXH_PREFETCH(ptr) (void)(ptr)                     /* disabled */
+    #else
+      #if defined(_MSC_VER) && \
+          (defined(_M_X64) ||  \
+           defined(            \
+               _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+        #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+        #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
+      #elif defined(__GNUC__) && \
+          ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+        #define XXH_PREFETCH(ptr) \
+          __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+      #else
+        #define XXH_PREFETCH(ptr) (void)(ptr)                   /* disabled */
+      #endif
+    #endif                                               /* XXH_NO_PREFETCH */
+
+  /* ==========================================
+   * XXH3 default settings
+   * ========================================== */
+
+    #define XXH_SECRET_DEFAULT_SIZE 192 /* must be >= XXH3_SECRET_SIZE_MIN */
+
+    #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+      #error "default keyset is not large enough"
+    #endif
+
+/* Pseudorandom secret taken directly from FARSH */
+XXH_ALIGN(64)
+static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c,
+    0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb,
+    0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e,
+    0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6,
+    0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb,
+    0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97,
+    0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7,
+    0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31,
+    0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83,
+    0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26,
+    0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc,
+    0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f,
+    0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+
+};
+
+    #ifdef XXH_OLD_NAMES
+      #define kSecret XXH3_kSecret
+    #endif
+
+    /*
+     * Calculates a 32-bit to 64-bit long multiply.
+     *
+     * Wraps __emulu on MSVC x86 because it tends to call __allmul when it
+     * doesn't need to (but it shouldn't need to anyway; it is about 7
+     * instructions to do a 64x64 multiply...). Since we know that this will
+     * _always_ emit MULL, we use that instead of the normal method.
+     *
+     * If you are compiling for platforms like Thumb-1 and don't have a better
+     * option, you may also want to write your own long multiply routine here.
+     *
+     * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+     * {
+     *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+     * }
+     */
+    #if defined(_MSC_VER) && defined(_M_IX86)
+      #include <intrin.h>
+      #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+    #else
+      /*
+       * Downcast + upcast is usually better than masking on older compilers
+       * like GCC 4.2 (especially 32-bit ones), all without affecting newer
+       * compilers.
+       *
+       * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
+       * operands and perform a full 64x64 multiply -- entirely redundant on
+       * 32-bit.
+       */
+      #define XXH_mult32to64(x, y) \
+        ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+    #endif
+
+/*
+ * Calculates a 64->128-bit long multiply.
+ *
+ * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ */
+static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
+
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite targeting a 32-bit platform, Clang (and emscripten) define this
+     * type even though the native arithmetic for it is missing. This results
+     * in a laggy compiler builtin call which calculates a full 128-bit
+     * multiply. In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+    #if defined(__GNUC__) && !defined(__wasm__) && \
+            defined(__SIZEOF_INT128__) ||          \
+        (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+  XXH128_hash_t     r128;
+  r128.low64 = (xxh_u64)(product);
+  r128.high64 = (xxh_u64)(product >> 64);
+  return r128;
+
+      /*
+       * MSVC for x64's _umul128 method.
+       *
+       * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
+       * *HighProduct);
+       *
+       * This compiles to single operand MUL on x64.
+       */
+    #elif defined(_M_X64) || defined(_M_IA64)
+
+      #ifndef _MSC_VER
+        #pragma intrinsic(_umul128)
+      #endif
+  xxh_u64       product_high;
+  xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+  XXH128_hash_t r128;
+  r128.low64 = product_low;
+  r128.high64 = product_high;
+  return r128;
+
+    #else
+  /*
+   * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+   *
+   * This is a fast and simple grade school multiply, which is shown below
+   * with base 10 arithmetic instead of base 0x100000000.
+   *
+   *           9 3 // D2 lhs = 93
+   *         x 7 5 // D2 rhs = 75
+   *     ----------
+   *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+   *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+   *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+   *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+   *     ---------
+   *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+   *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+   *     ---------
+   *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+   *
+   * The reasons for adding the products like this are:
+   *  1. It avoids manual carry tracking. Just like how
+   *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+   *     This avoids a lot of complexity.
+   *
+   *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+   *     instruction available in ARM's Digital Signal Processing extension
+   *     in 32-bit ARMv6 and later, which is shown below:
+   *
+   *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+   *         {
+   *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+   *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+   *             *RdHi = (xxh_u32)(product >> 32);
+   *         }
+   *
+   *     This instruction was designed for efficient long multiplication, and
+   *     allows this to be calculated in only 4 instructions at speeds
+   *     comparable to some 64-bit ALUs.
+   *
+   *  3. It isn't terrible on other platforms. Usually this will be a couple
+   *     of 32-bit ADD/ADCs.
+   */
+
+  /* First calculate all of the cross products. */
+  xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+  xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+  xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+  xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
+
+  /* Now add the products together. These will never overflow. */
+  xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+  xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+  xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+  XXH128_hash_t r128;
+  r128.low64 = lower;
+  r128.high64 = upper;
+  return r128;
+    #endif
+
+}
+
+/*
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force
+ * it.
+ */
+static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
+
+  XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+  return product.low64 ^ product.high64;
+
+}
+
+/* Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
+
+  XXH_ASSERT(0 <= shift && shift < 64);
+  return v64 ^ (v64 >> shift);
+
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
+
+  h64 = XXH_xorshift64(h64, 37);
+  h64 *= 0x165667919E3779F9ULL;
+  h64 = XXH_xorshift64(h64, 32);
+  return h64;
+
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx
+ * preferable when input has not been previously mixed
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) {
+
+  /* this mix is inspired by Pelle Evensen's rrmxmx */
+  h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+  h64 *= 0x9FB21C651E98DF25ULL;
+  h64 ^= (h64 >> 35) + len;
+  h64 *= 0x9FB21C651E98DF25ULL;
+  return XXH_xorshift64(h64, 28);
+
+}
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which
+ * strongly favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant
+ * time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len,
+                                                const xxh_u8 *secret,
+                                                XXH64_hash_t  seed) {
+
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(1 <= len && len <= 3);
+  XXH_ASSERT(secret != NULL);
+  /*
+   * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+   * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+   * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+   */
+  {
+
+    xxh_u8 const  c1 = input[0];
+    xxh_u8 const  c2 = input[len >> 1];
+    xxh_u8 const  c3 = input[len - 1];
+    xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
+                             ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+    xxh_u64 const bitflip =
+        (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
+    xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+    return XXH64_avalanche(keyed);
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
+                                                const xxh_u8 *secret,
+                                                XXH64_hash_t  seed) {
+
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(secret != NULL);
+  XXH_ASSERT(4 <= len && len <= 8); /* len==8 arrives via XXH3_len_0to16_64b */
+  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+  {
+
+    xxh_u32 const input1 = XXH_readLE32(input);
+    xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+    xxh_u64 const bitflip =
+        (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
+    xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+    xxh_u64 const keyed = input64 ^ bitflip;
+    return XXH3_rrmxmx(keyed, len);
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
+                                                 size_t        len,
+                                                 const xxh_u8 *secret,
+                                                 XXH64_hash_t  seed) {
+
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(secret != NULL);
+  XXH_ASSERT(8 <= len && len <= 16);
+  {
+
+    xxh_u64 const bitflip1 =
+        (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
+    xxh_u64 const bitflip2 =
+        (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
+    xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
+    xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+    xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi +
+                        XXH3_mul128_fold64(input_lo, input_hi);
+    return XXH3_avalanche(acc);
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input,
+                                                 size_t        len,
+                                                 const xxh_u8 *secret,
+                                                 XXH64_hash_t  seed) {
+
+  XXH_ASSERT(len <= 16);
+  {
+
+    if (XXH_likely(len > 8))
+      return XXH3_len_9to16_64b(input, len, secret, seed);
+    if (XXH_likely(len >= 4))
+      return XXH3_len_4to8_64b(input, len, secret, seed);
+    if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+    return XXH64_avalanche(
+        seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64)));
+
+  }
+
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC, where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is incurred an arbitrary number of times
+ * (addressed in XXH3_accumulate_512), this collision is very unlikely with
+ * random inputs and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
+                                     const xxh_u8 *XXH_RESTRICT secret,
+                                     xxh_u64                    seed64) {
+
+    #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
+        && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
+        && !defined(XXH_ENABLE_AUTOVECTORIZE) /* define it to disable hack */
+  /*
+   * UGLY HACK:
+   * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+   * slower code.
+   *
+   * By forcing seed64 into a register, we disrupt the cost model and
+   * cause it to scalarize. See `XXH32_round()`.
+   *
+   * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+   * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+   * GCC 9.2, despite both emitting scalar code.
+   *
+   * GCC generates much better scalar code than Clang for the rest of XXH3,
+   * which is why finding a more optimal codepath is of interest.
+   */
+  __asm__("" : "+r"(seed64));
+    #endif
+  {
+
+    xxh_u64 const input_lo = XXH_readLE64(input);
+    xxh_u64 const input_hi = XXH_readLE64(input + 8);
+    return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64),
+                              input_hi ^ (XXH_readLE64(secret + 8) - seed64));
+
+  }
+
+}
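+
+/*
+ * In other words (sketch, relying on XXH3_mul128_fold64() folding the full
+ * 128-bit product by XORing its two 64-bit halves):
+ *
+ *   mix16B(in, sec, seed) = fold64( (in[0..7]  ^ (sec[0..7]  + seed))
+ *                                 * (in[8..15] ^ (sec[8..15] - seed)) )
+ *
+ * 16 input bytes and 16 secret bytes are compressed into 8 output bytes.
+ */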
+
+/* For mid range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
+    const xxh_u8 *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
+
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+  (void)secretSize;
+  XXH_ASSERT(16 < len && len <= 128);
+
+  {
+
+    xxh_u64 acc = len * XXH_PRIME64_1;
+    if (len > 32) {
+
+      if (len > 64) {
+
+        if (len > 96) {
+
+          acc += XXH3_mix16B(input + 48, secret + 96, seed);
+          acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
+
+        }
+
+        acc += XXH3_mix16B(input + 32, secret + 64, seed);
+        acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
+
+      }
+
+      acc += XXH3_mix16B(input + 16, secret + 32, seed);
+      acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
+
+    }
+
+    acc += XXH3_mix16B(input + 0, secret + 0, seed);
+    acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
+
+    return XXH3_avalanche(acc);
+
+  }
+
+}
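+
+/*
+ * Summary of the ladder above (derived from the branches, illustrative):
+ *   17 <= len <= 32  : 2 calls to XXH3_mix16B
+ *   33 <= len <= 64  : 4 calls
+ *   65 <= len <= 96  : 6 calls
+ *   97 <= len <= 128 : 8 calls
+ * Each pair covers one 16-byte slice from the front and one from the back,
+ * so the slices overlap whenever len is not a multiple of 32.
+ */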
+
+    #define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
+    const xxh_u8 *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
+
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+  (void)secretSize;
+  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET 17
+
+  {
+
+    xxh_u64   acc = len * XXH_PRIME64_1;
+    int const nbRounds = (int)len / 16;
+    int       i;
+    for (i = 0; i < 8; i++) {
+
+      acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
+
+    }
+
+    acc = XXH3_avalanche(acc);
+    XXH_ASSERT(nbRounds >= 8);
+    #if defined(__clang__)                                /* Clang */ \
+        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
+      /*
+       * UGLY HACK:
+       * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+       * Everywhere else, it uses scalar code.
+       *
+       * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+       * would still be slower than UMAAL (see XXH_mult64to128).
+       *
+       * Unfortunately, Clang doesn't handle the long multiplies properly and
+       * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+       * scalarized into an ugly mess of VMOV.32 instructions.
+       *
+       * Such messes are difficult to avoid without turning autovectorization
+       * off completely, but they are usually relatively minor and/or not
+       * worth fixing.
+       *
+       * This loop is the easiest to fix, as unlike XXH32, this pragma
+       * _actually works_ because it is a loop vectorization instead of an
+       * SLP vectorization.
+       */
+      #pragma clang loop vectorize(disable)
+    #endif
+    for (i = 8; i < nbRounds; i++) {
+
+      acc +=
+          XXH3_mix16B(input + (16 * i),
+                      secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+
+    }
+
+    /* last bytes */
+    acc += XXH3_mix16B(input + len - 16,
+                       secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET,
+                       seed);
+    return XXH3_avalanche(acc);
+
+  }
+
+}
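+
+/*
+ * Worked example (illustrative): for len == 240, nbRounds == 15. The first
+ * 8 rounds read secret bytes [0, 128); the remaining 7 rounds restart the
+ * secret at XXH3_MIDSIZE_STARTOFFSET, reading bytes [3, 115); the final mix
+ * reads 16 bytes at offset XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET
+ * == 136 - 17 == 119, all within the minimum 136-byte secret.
+ */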
+
+  /* =======     Long Keys     ======= */
+
+    #define XXH_STRIPE_LEN 64
+    #define XXH_SECRET_CONSUME_RATE \
+      8                 /* nb of secret bytes consumed at each accumulation */
+    #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+    #ifdef XXH_OLD_NAMES
+      #define STRIPE_LEN XXH_STRIPE_LEN
+      #define ACC_NB XXH_ACC_NB
+    #endif
+
+XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
+
+  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+  memcpy(dst, &v64, sizeof(v64));
+
+}
+
+    /* Several intrinsic functions below are supposed to accept __int64 as
+     * argument, as documented in
+     * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
+     * several environments do not define __int64 type, requiring a workaround.
+     */
+    #if !defined(__VMS) &&                                     \
+        (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
+                                  (__STDC_VERSION__ >= 199901L) /* C99 */))
+typedef int64_t xxh_i64;
+    #else
+/* the following type must have a width of 64-bit */
+typedef long long xxh_i64;
+    #endif
+
+  /*
+   * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
+   * most optimized.
+   *
+   * It is a hardened version of UMAC, based on FARSH's implementation.
+   *
+   * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+   * implementations, and it is ridiculously fast.
+   *
+   * We harden it by mixing the original input to the accumulators as well as
+   * the product.
+   *
+   * This means that in the (relatively likely) case of a multiply by zero, the
+   * original input is preserved.
+   *
+   * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+   * cross-pollination, as otherwise the upper and lower halves would be
+   * essentially independent.
+   *
+   * This doesn't matter on 64-bit hashes since they all get merged together in
+   * the end, so we skip the extra step.
+   *
+   * Both XXH3_64bits and XXH3_128bits use this subroutine.
+   */
+
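+/*
+ * Per 64-bit lane, one accumulation step is (see the reference formulation
+ * in XXH3_accumulate_512_scalar below):
+ *
+ *   data_key    = input64 ^ secret64;
+ *   acc[i ^ 1] += input64;                    (original input is preserved)
+ *   acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);
+ */
+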
+    #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
+
+      #ifndef XXH_TARGET_AVX512
+        #define XXH_TARGET_AVX512               /* disable attribute target */
+      #endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
+    const void *XXH_RESTRICT secret) {
+
+  XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
+  XXH_ASSERT((((size_t)acc) & 63) == 0);
+  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+
+  {
+
+    /* data_vec    = input[0]; */
+    __m512i const data_vec = _mm512_loadu_si512(input);
+    /* key_vec     = secret[0]; */
+    __m512i const key_vec = _mm512_loadu_si512(secret);
+    /* data_key    = data_vec ^ key_vec; */
+    __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
+    /* data_key_lo = data_key >> 32; */
+    __m512i const data_key_lo =
+        _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+    /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+    __m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
+    /* xacc[0] += swap(data_vec); */
+    __m512i const data_swap =
+        _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+    __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
+    /* xacc[0] += product; */
+    *xacc = _mm512_add_epi64(product, sum);
+
+  }
+
+}
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source:
+ * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash
+ * does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 63) == 0);
+  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+  {
+
+    XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
+    const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+    /* xacc[0] ^= (xacc[0] >> 47) */
+    __m512i const acc_vec = *xacc;
+    __m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
+    __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
+    /* xacc[0] ^= secret; */
+    __m512i const key_vec = _mm512_loadu_si512(secret);
+    __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
+
+    /* xacc[0] *= XXH_PRIME32_1; */
+    __m512i const data_key_hi =
+        _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+    __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
+    __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
+    *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
+    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
+
+  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+  XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+  XXH_ASSERT(((size_t)customSecret & 63) == 0);
+  (void)(&XXH_writeLE64);
+  {
+
+    int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+    __m512i const seed = _mm512_mask_set1_epi64(
+        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
+
+    XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
+    XXH_ALIGN(64) __m512i *const       dest = (__m512i *)customSecret;
+    int                                i;
+    for (i = 0; i < nbRounds; ++i) {
+
+      /* GCC has a bug: _mm512_stream_load_si512 accepts 'void *', not
+       * 'void const *', so passing 'src' directly would warn "discards
+       * ‘const’ qualifier". */
+      union {
+
+        XXH_ALIGN(64) const __m512i *cp;
+        XXH_ALIGN(64) void *p;
+
+      } remote_const_void;
+
+      remote_const_void.cp = src + i;
+      dest[i] =
+          _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+
+    }
+
+  }
+
+}
+
+    #endif
+
+    #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
+
+      #ifndef XXH_TARGET_AVX2
+        #define XXH_TARGET_AVX2                 /* disable attribute target */
+      #endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
+    const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 31) == 0);
+  {
+
+    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
+     */
+    const __m256i *const xinput = (const __m256i *)input;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+    const __m256i *const xsecret = (const __m256i *)secret;
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+
+      /* data_vec    = xinput[i]; */
+      __m256i const data_vec = _mm256_loadu_si256(xinput + i);
+      /* key_vec     = xsecret[i]; */
+      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
+      /* data_key    = data_vec ^ key_vec; */
+      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
+      /* data_key_lo = data_key >> 32; */
+      __m256i const data_key_lo =
+          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
+      /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+      __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
+      /* xacc[i] += swap(data_vec); */
+      __m256i const data_swap =
+          _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+      __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
+      /* xacc[i] += product; */
+      xacc[i] = _mm256_add_epi64(product, sum);
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 31) == 0);
+  {
+
+    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+    const __m256i *const xsecret = (const __m256i *)secret;
+    const __m256i        prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+
+      /* xacc[i] ^= (xacc[i] >> 47) */
+      __m256i const acc_vec = xacc[i];
+      __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
+      __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
+      /* xacc[i] ^= xsecret; */
+      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
+      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
+
+      /* xacc[i] *= XXH_PRIME32_1; */
+      __m256i const data_key_hi =
+          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
+      __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
+      __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
+      xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
+    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
+
+  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+  XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+  (void)(&XXH_writeLE64);
+  XXH_PREFETCH(customSecret);
+  {
+
+    __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
+                                           -(xxh_i64)seed64, (xxh_i64)seed64);
+
+    XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
+    XXH_ALIGN(64) __m256i *            dest = (__m256i *)customSecret;
+
+      #if defined(__GNUC__) || defined(__clang__)
+    /*
+     * On GCC & Clang, marking 'dest' as modified will cause the compiler to:
+     *   - not extract the secret from SSE registers in the internal loop
+     *   - use fewer registers, and avoid pushing them onto the stack
+     * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
+     * customSecret, and on aarch64, this prevented LDP from merging two
+     * loads together for free. Putting the loads together before the stores
+     * properly generates LDP.
+     */
+    __asm__("" : "+r"(dest));
+      #endif
+
+    /* GCC at -O2 needs the loop unrolled manually */
+    dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
+    dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
+    dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
+    dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
+    dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
+    dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
+
+  }
+
+}
+
+    #endif
+
+    #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+      #ifndef XXH_TARGET_SSE2
+        #define XXH_TARGET_SSE2                 /* disable attribute target */
+      #endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
+    const void *XXH_RESTRICT secret) {
+
+  /* SSE2 is just a half-scale version of the AVX2 version. */
+  XXH_ASSERT((((size_t)acc) & 15) == 0);
+  {
+
+    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+    const __m128i *const xinput = (const __m128i *)input;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+    const __m128i *const xsecret = (const __m128i *)secret;
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
+
+      /* data_vec    = xinput[i]; */
+      __m128i const data_vec = _mm_loadu_si128(xinput + i);
+      /* key_vec     = xsecret[i]; */
+      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
+      /* data_key    = data_vec ^ key_vec; */
+      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
+      /* data_key_lo = data_key >> 32; */
+      __m128i const data_key_lo =
+          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
+      /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+      __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
+      /* xacc[i] += swap(data_vec); */
+      __m128i const data_swap =
+          _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+      __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
+      /* xacc[i] += product; */
+      xacc[i] = _mm_add_epi64(product, sum);
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 15) == 0);
+  {
+
+    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    /* Unaligned. This is mainly for pointer arithmetic, and because
+     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+    const __m128i *const xsecret = (const __m128i *)secret;
+    const __m128i        prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
+
+      /* xacc[i] ^= (xacc[i] >> 47) */
+      __m128i const acc_vec = xacc[i];
+      __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
+      __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
+      /* xacc[i] ^= xsecret[i]; */
+      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
+      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
+
+      /* xacc[i] *= XXH_PRIME32_1; */
+      __m128i const data_key_hi =
+          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
+      __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
+      __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
+      xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
+    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
+
+  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+  (void)(&XXH_writeLE64);
+  {
+
+    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+      #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+    /* MSVC 32-bit mode does not support _mm_set_epi64x before VS2015 */
+    XXH_ALIGN(16)
+    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
+    __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
+      #else
+    __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
+      #endif
+    int i;
+
+    XXH_ALIGN(64) const float *const  src = (float const *)XXH3_kSecret;
+    XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
+      #if defined(__GNUC__) || defined(__clang__)
+    /*
+     * On GCC & Clang, marking 'dest' as modified will cause the compiler to:
+     *   - not extract the secret from SSE registers in the internal loop
+     *   - use fewer registers, and avoid pushing them onto the stack
+     */
+    __asm__("" : "+r"(dest));
+      #endif
+
+    for (i = 0; i < nbRounds; ++i) {
+
+      dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
+
+    }
+
+  }
+
+}
+
+    #endif
+
+    #if (XXH_VECTOR == XXH_NEON)
+
+XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
+    const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 15) == 0);
+  {
+
+    XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
+    /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
+     */
+    uint8_t const *const xinput = (const uint8_t *)input;
+    uint8_t const *const xsecret = (const uint8_t *)secret;
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
+
+      /* data_vec = xinput[i]; */
+      uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
+      /* key_vec  = xsecret[i];  */
+      uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
+      uint64x2_t data_key;
+      uint32x2_t data_key_lo, data_key_hi;
+      /* xacc[i] += swap(data_vec); */
+      uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+      uint64x2_t const swapped = vextq_u64(data64, data64, 1);
+      xacc[i] = vaddq_u64(xacc[i], swapped);
+      /* data_key = data_vec ^ key_vec; */
+      data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+      /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
+       * data_key_hi = (uint32x2_t) (data_key >> 32);
+       * data_key = UNDEFINED; */
+      XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+      /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+      xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
+                                            const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+  {
+
+    uint64x2_t *   xacc = (uint64x2_t *)acc;
+    uint8_t const *xsecret = (uint8_t const *)secret;
+    uint32x2_t     prime = vdup_n_u32(XXH_PRIME32_1);
+
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
+
+      /* xacc[i] ^= (xacc[i] >> 47); */
+      uint64x2_t acc_vec = xacc[i];
+      uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+      uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+      /* xacc[i] ^= xsecret[i]; */
+      uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
+      uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
+
+      /* xacc[i] *= XXH_PRIME32_1 */
+      uint32x2_t data_key_lo, data_key_hi;
+      /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
+       * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
+       * xacc[i] = UNDEFINED; */
+      XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+      { /*
+         * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+         *
+         * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+         * incorrectly "optimize" this:
+         *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+         *   shifted = vshll_n_u32(tmp, 32);
+         * to this:
+         *   tmp     = "vmulq_u64"(a, b); // no such thing!
+         *   shifted = vshlq_n_u64(tmp, 32);
+         *
+         * However, unlike SSE, Clang lacks a 64-bit multiply routine
+         * for NEON, and it scalarizes two 64-bit multiplies instead.
+         *
+         * vmull_u32 has the same timing as vmul_u32, and it avoids
+         * this bug completely.
+         * See https://bugs.llvm.org/show_bug.cgi?id=39967
+         */
+        uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
+        /* xacc[i] = prod_hi << 32; */
+        xacc[i] = vshlq_n_u64(prod_hi, 32);
+        /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+        xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
+
+      }
+
+    }
+
+  }
+
+}
+
+    #endif
+
+    #if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
+                                              const void *XXH_RESTRICT input,
+                                              const void *XXH_RESTRICT secret) {
+
+  xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;       /* presumed aligned */
+  xxh_u64x2 const *const xinput =
+      (xxh_u64x2 const *)input;                 /* no alignment restriction */
+  xxh_u64x2 const *const xsecret =
+      (xxh_u64x2 const *)secret;                /* no alignment restriction */
+  xxh_u64x2 const v32 = {32, 32};
+  size_t          i;
+  for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+
+    /* data_vec = xinput[i]; */
+    xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+    /* key_vec = xsecret[i]; */
+    xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+    xxh_u64x2 const data_key = data_vec ^ key_vec;
+    /* shuffled = (data_key << 32) | (data_key >> 32); */
+    xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+    /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled &
+     * 0xFFFFFFFF); */
+    xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+    xacc[i] += product;
+
+        /* swap high and low halves */
+      #ifdef __s390x__
+    xacc[i] += vec_permi(data_vec, data_vec, 2);
+      #else
+    xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
+      #endif
+
+  }
+
+}
+
+XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
+                                           const void *XXH_RESTRICT secret) {
+
+  XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+  {
+
+    xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;
+    const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret;
+    /* constants */
+    xxh_u64x2 const v32 = {32, 32};
+    xxh_u64x2 const v47 = {47, 47};
+    xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
+                             XXH_PRIME32_1};
+    size_t          i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+
+      /* xacc[i] ^= (xacc[i] >> 47); */
+      xxh_u64x2 const acc_vec = xacc[i];
+      xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+      /* xacc[i] ^= xsecret[i]; */
+      xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+      xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+      /* xacc[i] *= XXH_PRIME32_1 */
+      /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime &
+       * 0xFFFFFFFF);  */
+      xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
+      /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+      xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+      xacc[i] = prod_odd + (prod_even << v32);
+
+    }
+
+  }
+
+}
+
+    #endif
+
+/* scalar variants - universal */
+
+XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
+    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
+    const void *XXH_RESTRICT secret) {
+
+  XXH_ALIGN(XXH_ACC_ALIGN)
+  xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
+  const xxh_u8 *const xinput =
+      (const xxh_u8 *)input;                    /* no alignment restriction */
+  const xxh_u8 *const xsecret =
+      (const xxh_u8 *)secret;                   /* no alignment restriction */
+  size_t i;
+  XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
+  for (i = 0; i < XXH_ACC_NB; i++) {
+
+    xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
+    xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
+    xacc[i ^ 1] += data_val;                         /* swap adjacent lanes */
+    xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+
+  }
+
+}
+
+XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
+                                              const void *XXH_RESTRICT secret) {
+
+  XXH_ALIGN(XXH_ACC_ALIGN)
+  xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
+  const xxh_u8 *const xsecret =
+      (const xxh_u8 *)secret;                   /* no alignment restriction */
+  size_t i;
+  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
+  for (i = 0; i < XXH_ACC_NB; i++) {
+
+    xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
+    xxh_u64       acc64 = xacc[i];
+    acc64 = XXH_xorshift64(acc64, 47);
+    acc64 ^= key64;
+    acc64 *= XXH_PRIME32_1;
+    xacc[i] = acc64;
+
+  }
+
+}
+
+XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
+    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
+
+  /*
+   * We need a separate pointer for the hack below,
+   * which requires a non-const pointer.
+   * Any decent compiler will optimize this out otherwise.
+   */
+  const xxh_u8 *kSecretPtr = XXH3_kSecret;
+  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+    #if defined(__clang__) && defined(__aarch64__)
+  /*
+   * UGLY HACK:
+   * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+   * placed sequentially, in order, at the top of the unrolled loop.
+   *
+   * While MOVK is great for generating constants (2 cycles for a 64-bit
+   * constant compared to 4 cycles for LDR), long MOVK chains stall the
+   * integer pipelines:
+   *   I   L   S
+   * MOVK
+   * MOVK
+   * MOVK
+   * MOVK
+   * ADD
+   * SUB      STR
+   *          STR
+   * By forcing loads from memory (as the asm line causes Clang to assume
+   * that XXH3_kSecretPtr has been changed), the pipelines are used more
+   * efficiently:
+   *   I   L   S
+   *      LDR
+   *  ADD LDR
+   *  SUB     STR
+   *          STR
+   * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+   *   without hack: 2654.4 MB/s
+   *   with hack:    3202.9 MB/s
+   */
+  __asm__("" : "+r"(kSecretPtr));
+    #endif
+  /*
+   * Note: in debug mode, this assertion defeats the asm optimization above,
+   * and Clang will emit MOVK chains again.
+   */
+  XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+
+  {
+
+    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+    int       i;
+    for (i = 0; i < nbRounds; i++) {
+
+      /*
+       * The asm hack causes Clang to assume that kSecretPtr aliases with
+       * customSecret, and on aarch64, this prevented LDP from merging two
+       * loads together for free. Putting the loads together before the stores
+       * properly generates LDP.
+       */
+      xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
+      xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
+      XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo);
+      XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi);
+
+    }
+
+  }
+
+}
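+
+/*
+ * In other words, for each 16-byte segment i of the default secret
+ * (little-endian 64-bit lanes):
+ *   customSecret64[2i]     = kSecret64[2i]     + seed64
+ *   customSecret64[2i + 1] = kSecret64[2i + 1] - seed64
+ * This matches the +seed/-seed interleaving used by the SIMD variants above.
+ */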
+
+typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
+                                      const void *);
+typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
+typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
+
+    #if (XXH_VECTOR == XXH_AVX512)
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+    #elif (XXH_VECTOR == XXH_AVX2)
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+    #elif (XXH_VECTOR == XXH_SSE2)
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+    #elif (XXH_VECTOR == XXH_NEON)
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+    #elif (XXH_VECTOR == XXH_VSX)
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+    #else                                                         /* scalar */
+
+      #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+      #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+    #endif
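+
+/*
+ * Note: XXH_VECTOR is normally auto-detected earlier in this file, but it
+ * can be pinned at build time if the detection misfires, e.g. (assuming the
+ * XXH_SCALAR constant defined alongside the other backend IDs):
+ *
+ *   cc -DXXH_VECTOR=XXH_SCALAR ...   forces the portable scalar paths
+ */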
+
+    #ifndef XXH_PREFETCH_DIST
+      #ifdef __clang__
+        #define XXH_PREFETCH_DIST 320
+      #else
+        #if (XXH_VECTOR == XXH_AVX512)
+          #define XXH_PREFETCH_DIST 512
+        #else
+          #define XXH_PREFETCH_DIST 384
+        #endif
+      #endif                                                   /* __clang__ */
+    #endif                                             /* XXH_PREFETCH_DIST */
+
+/*
+ * XXH3_accumulate()
+ * Loops over XXH3_accumulate_512().
+ * Assumption: nbStripes will not overflow the secret size
+ */
+XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT      acc,
+                                      const xxh_u8 *XXH_RESTRICT input,
+                                      const xxh_u8 *XXH_RESTRICT secret,
+                                      size_t                     nbStripes,
+                                      XXH3_f_accumulate_512      f_acc512) {
+
+  size_t n;
+  for (n = 0; n < nbStripes; n++) {
+
+    const xxh_u8 *const in = input + n * XXH_STRIPE_LEN;
+    XXH_PREFETCH(in + XXH_PREFETCH_DIST);
+    f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
+
+  }
+
+}
+
+XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
+    xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
+    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
+
+  size_t const nbStripesPerBlock =
+      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+  size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+  size_t const nb_blocks = (len - 1) / block_len;
+
+  size_t n;
+
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+  for (n = 0; n < nb_blocks; n++) {
+
+    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock,
+                    f_acc512);
+    f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+
+  }
+
+  /* last partial block */
+  XXH_ASSERT(len > XXH_STRIPE_LEN);
+  {
+
+    size_t const nbStripes =
+        ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+    XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+    XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes,
+                    f_acc512);
+
+    /* last stripe */
+    {
+
+      const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
+    #define XXH_SECRET_LASTACC_START \
+      7  /* not aligned on 8, last secret is different from acc & scrambler */
+      f_acc512(acc, p,
+               secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+
+    }
+
+  }
+
+}
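+
+/*
+ * Worked sizes (assuming the default 192-byte secret,
+ * XXH_SECRET_DEFAULT_SIZE == 192):
+ *   nbStripesPerBlock = (192 - 64) / 8 = 16
+ *   block_len         = 64 * 16       = 1024 bytes
+ * A 4096-byte input therefore runs 3 full blocks (each followed by a
+ * scramble), then 15 stripes of the partial block, then the final stripe
+ * over the last 64 bytes.
+ */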
+
+XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc,
+                                       const xxh_u8 *XXH_RESTRICT  secret) {
+
+  return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret),
+                            acc[1] ^ XXH_readLE64(secret + 8));
+
+}
+
+static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
+                                   const xxh_u8 *XXH_RESTRICT  secret,
+                                   xxh_u64                     start) {
+
+  xxh_u64 result64 = start;
+  size_t  i = 0;
+
+  for (i = 0; i < 4; i++) {
+
+    result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
+    #if defined(__clang__)                                /* Clang */ \
+        && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
+    /*
+     * UGLY HACK:
+     * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+     * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+     * XXH3_64bits, len == 256, Snapdragon 835:
+     *   without hack: 2063.7 MB/s
+     *   with hack:    2560.7 MB/s
+     */
+    __asm__("" : "+r"(result64));
+    #endif
+
+  }
+
+  return XXH3_avalanche(result64);
+
+}
+
+    #define XXH3_INIT_ACC                                              \
+      {                                                                \
+                                                                       \
+        XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
+            XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
+                                                                       \
+      }
+
+XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
+    const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
+    size_t secretSize, XXH3_f_accumulate_512 f_acc512,
+    XXH3_f_scrambleAcc f_scramble) {
+
+  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+  XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len,
+                              (const xxh_u8 *)secret, secretSize, f_acc512,
+                              f_scramble);
+
+  /* converge into final hash */
+  XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator
+     */
+    #define XXH_SECRET_MERGEACCS_START 11
+  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+  return XXH3_mergeAccs(acc,
+                        (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
+                        (xxh_u64)len * XXH_PRIME64_1);
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(
+    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
+
+  (void)seed64;
+  return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
+                                    XXH3_accumulate_512, XXH3_scrambleAcc);
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ * Since the function is not inlined, the compiler may not be able to understand
+ * that, in some scenarios, its `secret` argument is actually a compile time
+ * constant. This variant enforces that the compiler can detect that, and uses
+ * this opportunity to streamline the generated code for better performance.
+ */
+XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_default(
+    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
+
+  (void)seed64;
+  (void)secret;
+  (void)secretLen;
+  return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret,
+                                    sizeof(XXH3_kSecret), XXH3_accumulate_512,
+                                    XXH3_scrambleAcc);
+
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the
+ * seed, and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(
+    const void *input, size_t len, XXH64_hash_t seed,
+    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
+    XXH3_f_initCustomSecret f_initSec) {
+
+  if (seed == 0)
+    return XXH3_hashLong_64b_internal(
+        input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
+  {
+
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    f_initSec(secret, seed);
+    return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                      f_acc512, f_scramble);
+
+  }
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void *  input,
+                                                      size_t        len,
+                                                      XXH64_hash_t  seed,
+                                                      const xxh_u8 *secret,
+                                                      size_t        secretLen) {
+
+  (void)secret;
+  (void)secretLen;
+  return XXH3_hashLong_64b_withSeed_internal(
+      input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc,
+      XXH3_initCustomSecret);
+
+}
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void *XXH_RESTRICT, size_t,
+                                          XXH64_hash_t,
+                                          const xxh_u8 *XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void *XXH_RESTRICT secret,
+                     size_t secretLen, XXH3_hashLong64_f f_hashLong) {
+
+  XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+  /*
+   * If an action is to be taken if `secretLen` condition is not respected,
+   * it should be done here.
+   * For now, it's a contract pre-condition.
+   * Adding a check and a branch here would cost performance at every hash.
+   * Also, note that function signature doesn't offer room to return an error.
+   */
+  if (len <= 16)
+    return XXH3_len_0to16_64b((const xxh_u8 *)input, len,
+                              (const xxh_u8 *)secret, seed64);
+  if (len <= 128)
+    return XXH3_len_17to128_64b((const xxh_u8 *)input, len,
+                                (const xxh_u8 *)secret, secretLen, seed64);
+  if (len <= XXH3_MIDSIZE_MAX)
+    return XXH3_len_129to240_64b((const xxh_u8 *)input, len,
+                                 (const xxh_u8 *)secret, secretLen, seed64);
+  return f_hashLong(input, len, seed64, (const xxh_u8 *)secret, secretLen);
+
+}
+
+/* ===   Public entry point   === */
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
+
+  return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
+                              XXH3_hashLong_64b_default);
+
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
+                                                   size_t      len,
+                                                   const void *secret,
+                                                   size_t      secretSize) {
+
+  return XXH3_64bits_internal(input, len, 0, secret, secretSize,
+                              XXH3_hashLong_64b_withSecret);
+
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
+                                                 XXH64_hash_t seed) {
+
+  return XXH3_64bits_internal(input, len, seed, XXH3_kSecret,
+                              sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+
+}
+
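+/*
+ * Usage sketch (illustrative only, not compiled; 'buf', 'len', 'secret' and
+ * 'secretSize' are placeholders):
+ *
+ *   XXH64_hash_t h1 = XXH3_64bits(buf, len);
+ *   XXH64_hash_t h2 = XXH3_64bits_withSeed(buf, len, 1234);
+ *   XXH64_hash_t h3 = XXH3_64bits_withSecret(buf, len, secret, secretSize);
+ *
+ * The secret variant requires secretSize >= XXH3_SECRET_SIZE_MIN (136) and
+ * a high-entropy secret; see XXH3_generateSecret() below.
+ */
+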
+/* ===   XXH3 streaming   === */
+
+/*
+ * Allocates a block of memory that is always aligned to `align`.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16-byte alignment on 64-bit systems and 8-byte
+ * alignment on 32-bit systems. This isn't enough for AVX2's 32-byte aligned
+ * loads, nor, on 32-bit systems, for the 16-byte aligned loads of SSE2 and
+ * NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: functions like posix_memalign or _mm_malloc
+ * are avoided because, to maintain portability, we would have to write a
+ * fallback like this anyway, and testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void *XXH_alignedMalloc(size_t s, size_t align) {
+
+  XXH_ASSERT(align <= 128 && align >= 8);                    /* range check */
+  XXH_ASSERT((align & (align - 1)) == 0);                     /* power of 2 */
+  XXH_ASSERT(s != 0 && s < (s + align));                  /* empty/overflow */
+  {  /* Overallocate to make room for manual realignment and an offset byte */
+    xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
+    if (base != NULL) {
+
+      /*
+       * Get the offset needed to align this pointer.
+       *
+       * Even if the returned pointer is aligned, there will always be
+       * at least one byte to store the offset to the original pointer.
+       */
+      size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+      /* Add the offset for the now-aligned pointer */
+      xxh_u8 *ptr = base + offset;
+
+      XXH_ASSERT((size_t)ptr % align == 0);
+
+      /* Store the offset immediately before the returned pointer. */
+      ptr[-1] = (xxh_u8)offset;
+      return ptr;
+
+    }
+
+    return NULL;
+
+  }
+
+}
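+
+/*
+ * Resulting layout (illustrative); since 1 <= offset <= align, there is
+ * always room for the offset byte:
+ *
+ *   base              ptr = base + offset   (returned, aligned)
+ *   v                 v
+ *   [ padding | offset byte | s usable bytes ... ]
+ *               stored at ptr[-1], read back by XXH_alignedFree()
+ */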
+
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void *p) {
+
+  if (p != NULL) {
+
+    xxh_u8 *ptr = (xxh_u8 *)p;
+    /* Get the offset byte we added in XXH_alignedMalloc(). */
+    xxh_u8 offset = ptr[-1];
+    /* Free the original malloc'd pointer */
+    xxh_u8 *base = ptr - offset;
+    XXH_free(base);
+
+  }
+
+}
+
+XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
+
+  XXH3_state_t *const state =
+      (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+  if (state == NULL) return NULL;
+  XXH3_INITSTATE(state);
+  return state;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
+
+  XXH_alignedFree(statePtr);
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
+                                   const XXH3_state_t *src_state) {
+
+  memcpy(dst_state, src_state, sizeof(*dst_state));
+
+}
+
+static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
+                                       XXH64_hash_t seed, const void *secret,
+                                       size_t secretSize) {
+
+  size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+  size_t const initLength =
+      offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+  XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+  XXH_ASSERT(statePtr != NULL);
+  /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+  memset((char *)statePtr + initStart, 0, initLength);
+  statePtr->acc[0] = XXH_PRIME32_3;
+  statePtr->acc[1] = XXH_PRIME64_1;
+  statePtr->acc[2] = XXH_PRIME64_2;
+  statePtr->acc[3] = XXH_PRIME64_3;
+  statePtr->acc[4] = XXH_PRIME64_4;
+  statePtr->acc[5] = XXH_PRIME32_2;
+  statePtr->acc[6] = XXH_PRIME64_5;
+  statePtr->acc[7] = XXH_PRIME32_1;
+  statePtr->seed = seed;
+  statePtr->extSecret = (const unsigned char *)secret;
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+  statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+  statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
+
+  if (statePtr == NULL) return XXH_ERROR;
+  XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
+                             XXH_SECRET_DEFAULT_SIZE);
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
+    XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
+
+  /* validate arguments before touching the state */
+  if (statePtr == NULL) return XXH_ERROR;
+  if (secret == NULL) return XXH_ERROR;
+  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+  XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
+                                                        XXH64_hash_t  seed) {
+
+  if (statePtr == NULL) return XXH_ERROR;
+  if (seed == 0) return XXH3_64bits_reset(statePtr);
+  if (seed != statePtr->seed)
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+  XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  return XXH_OK;
+
+}
+
+/* Note: when XXH3_consumeStripes() is invoked, the caller must guarantee
+ * that at least one more byte of input remains to be consumed afterwards,
+ * so that the function can blindly consume all stripes using the "normal"
+ * secret segment */
+XXH_FORCE_INLINE void XXH3_consumeStripes(
+    xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr,
+    size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input,
+    size_t nbStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit,
+    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
+
+  XXH_ASSERT(nbStripes <=
+             nbStripesPerBlock); /* can handle max 1 scramble per invocation */
+  XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+  if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
+
+    /* need a scrambling operation */
+    size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
+    size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
+    XXH3_accumulate(acc, input,
+                    secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
+                    nbStripesToEndofBlock, f_acc512);
+    f_scramble(acc, secret + secretLimit);
+    XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret,
+                    nbStripesAfterBlock, f_acc512);
+    *nbStripesSoFarPtr = nbStripesAfterBlock;
+
+  } else {
+
+    XXH3_accumulate(acc, input,
+                    secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
+                    nbStripes, f_acc512);
+    *nbStripesSoFarPtr += nbStripes;
+
+  }
+
+}
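+
+/*
+ * Example (illustrative): with nbStripesPerBlock == 16, *nbStripesSoFarPtr
+ * == 14 and nbStripes == 4, the first branch consumes 2 stripes to finish
+ * the current block, scrambles the accumulators, then consumes the
+ * remaining 2 stripes at the start of the next block, leaving
+ * *nbStripesSoFarPtr == 2.
+ */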
+
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
+                                           const xxh_u8 *input, size_t len,
+                                           XXH3_f_accumulate_512 f_acc512,
+                                           XXH3_f_scrambleAcc    f_scramble) {
+
+  if (input == NULL)
+    #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
+        (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
+    return XXH_OK;
+    #else
+    return XXH_ERROR;
+    #endif
+
+  {
+
+    const xxh_u8 *const        bEnd = input + len;
+    const unsigned char *const secret =
+        (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+
+    state->totalLen += len;
+
+    if (state->bufferedSize + len <=
+        XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
+      XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+      state->bufferedSize += (XXH32_hash_t)len;
+      return XXH_OK;
+
+    }
+
+      /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+
+    #define XXH3_INTERNALBUFFER_STRIPES \
+      (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+    XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
+                      0);                                 /* clean multiple */
+
+    /*
+     * Internal buffer is partially filled (always, except at beginning)
+     * Complete it, then consume it.
+     */
+    if (state->bufferedSize) {
+
+      size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+      XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+      input += loadSize;
+      XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
+                          state->nbStripesPerBlock, state->buffer,
+                          XXH3_INTERNALBUFFER_STRIPES, secret,
+                          state->secretLimit, f_acc512, f_scramble);
+      state->bufferedSize = 0;
+
+    }
+
+    XXH_ASSERT(input < bEnd);
+
+    /* Consume input by a multiple of internal buffer size */
+    if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) {
+
+      const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+      do {
+
+        XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
+                            state->nbStripesPerBlock, input,
+                            XXH3_INTERNALBUFFER_STRIPES, secret,
+                            state->secretLimit, f_acc512, f_scramble);
+        input += XXH3_INTERNALBUFFER_SIZE;
+
+      } while (input < limit);
+
+      /* for last partial stripe */
+      memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN,
+             input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+    }
+
+    XXH_ASSERT(input < bEnd);
+
+    /* Some remaining input (always) : buffer it */
+    XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
+    state->bufferedSize = (XXH32_hash_t)(bEnd - input);
+
+  }
+
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
+                                                const void *input, size_t len) {
+
+  return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
+                     XXH3_scrambleAcc);
+
+}
+
+XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *       acc,
+                                       const XXH3_state_t * state,
+                                       const unsigned char *secret) {
+
+  /*
+   * Digest on a local copy. This way, the state remains unaltered, and it can
+   * continue ingesting more input afterwards.
+   */
+  memcpy(acc, state->acc, sizeof(state->acc));
+  if (state->bufferedSize >= XXH_STRIPE_LEN) {
+
+    size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+    size_t       nbStripesSoFar = state->nbStripesSoFar;
+    XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock,
+                        state->buffer, nbStripes, secret, state->secretLimit,
+                        XXH3_accumulate_512, XXH3_scrambleAcc);
+    /* last stripe */
+    XXH3_accumulate_512(acc,
+                        state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+
+  } else {                                 /* bufferedSize < XXH_STRIPE_LEN */
+
+    xxh_u8       lastStripe[XXH_STRIPE_LEN];
+    size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+    XXH_ASSERT(state->bufferedSize >
+               0);                   /* there is always some input buffered */
+    memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize,
+           catchupSize);
+    memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+    XXH3_accumulate_512(acc, lastStripe,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+
+  }
+
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
+
+  const unsigned char *const secret =
+      (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+  if (state->totalLen > XXH3_MIDSIZE_MAX) {
+
+    XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+    XXH3_digest_long(acc, state, secret);
+    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
+                          (xxh_u64)state->totalLen * XXH_PRIME64_1);
+
+  }
+
+  /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+  if (state->seed)
+    return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen,
+                                state->seed);
+  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                secret, state->secretLimit + XXH_STRIPE_LEN);
+
+}
+
+    #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
+                                        const void *customSeed,
+                                        size_t      customSeedSize) {
+
+  XXH_ASSERT(secretBuffer != NULL);
+  if (customSeedSize == 0) {
+
+    memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return;
+
+  }
+
+  XXH_ASSERT(customSeed != NULL);
+
+  {
+
+    size_t const       segmentSize = sizeof(XXH128_hash_t);
+    size_t const       nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
+    XXH128_canonical_t scrambler;
+    XXH64_hash_t       seeds[12];
+    size_t             segnb;
+    XXH_ASSERT(nbSegments == 12);
+    XXH_ASSERT(segmentSize * nbSegments ==
+               XXH_SECRET_DEFAULT_SIZE);                  /* exact multiple */
+    XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+
+    /*
+     * Copy customSeed to seeds[], truncating or repeating as necessary.
+     */
+    {
+
+      size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
+      size_t filled = toFill;
+      memcpy(seeds, customSeed, toFill);
+      while (filled < sizeof(seeds)) {
+
+        toFill = XXH_MIN(filled, sizeof(seeds) - filled);
+        memcpy((char *)seeds + filled, seeds, toFill);
+        filled += toFill;
+
+      }
+
+    }
+
+    /* generate secret */
+    memcpy(secretBuffer, &scrambler, sizeof(scrambler));
+    for (segnb = 1; segnb < nbSegments; segnb++) {
+
+      size_t const       segmentStart = segnb * segmentSize;
+      XXH128_canonical_t segment;
+      XXH128_canonicalFromHash(&segment,
+                               XXH128(&scrambler, sizeof(scrambler),
+                                      XXH_readLE64(seeds + segnb) + segnb));
+      memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment));
+
+    }
+
+  }
+
+}
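+
+/*
+ * Usage sketch (illustrative, not part of the library): derive a full-size
+ * secret from a short seed, then hash with it; `data` and `size` stand in
+ * for the caller's buffer and length:
+ *
+ *   unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
+ *   XXH3_generateSecret(secret, "my seed", 7);
+ *   XXH64_hash_t h = XXH3_64bits_withSecret(data, size, secret, sizeof(secret));
+ */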
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit
+ * variant, even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input,
+                                                  size_t        len,
+                                                  const xxh_u8 *secret,
+                                                  XXH64_hash_t  seed) {
+
+  /* A doubled version of 1to3_64b with different constants. */
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(1 <= len && len <= 3);
+  XXH_ASSERT(secret != NULL);
+  /*
+   * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+   * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+   * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+   */
+  {
+
+    xxh_u8 const  c1 = input[0];
+    xxh_u8 const  c2 = input[len >> 1];
+    xxh_u8 const  c3 = input[len - 1];
+    xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
+                              ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+    xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+    xxh_u64 const bitflipl =
+        (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
+    xxh_u64 const bitfliph =
+        (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
+    xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+    xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+    XXH128_hash_t h128;
+    h128.low64 = XXH64_avalanche(keyed_lo);
+    h128.high64 = XXH64_avalanche(keyed_hi);
+    return h128;
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input,
+                                                  size_t        len,
+                                                  const xxh_u8 *secret,
+                                                  XXH64_hash_t  seed) {
+
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(secret != NULL);
+  XXH_ASSERT(4 <= len && len <= 8);
+  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+  {
+
+    xxh_u32 const input_lo = XXH_readLE32(input);
+    xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+    xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+    xxh_u64 const bitflip =
+        (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
+    xxh_u64 const keyed = input_64 ^ bitflip;
+
+    /* Shift len to the left to ensure it is even, this avoids even multiplies.
+     */
+    XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+    m128.high64 += (m128.low64 << 1);
+    m128.low64 ^= (m128.high64 >> 3);
+
+    m128.low64 = XXH_xorshift64(m128.low64, 35);
+    m128.low64 *= 0x9FB21C651E98DF25ULL;
+    m128.low64 = XXH_xorshift64(m128.low64, 28);
+    m128.high64 = XXH3_avalanche(m128.high64);
+    return m128;
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input,
+                                                   size_t        len,
+                                                   const xxh_u8 *secret,
+                                                   XXH64_hash_t  seed) {
+
+  XXH_ASSERT(input != NULL);
+  XXH_ASSERT(secret != NULL);
+  XXH_ASSERT(9 <= len && len <= 16);
+  {
+
+    xxh_u64 const bitflipl =
+        (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
+    xxh_u64 const bitfliph =
+        (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
+    xxh_u64 const input_lo = XXH_readLE64(input);
+    xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+    XXH128_hash_t m128 =
+        XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+    /*
+     * Put len in the middle of m128 to ensure that the length gets mixed to
+     * both the low and high bits in the 128x64 multiply below.
+     */
+    m128.low64 += (xxh_u64)(len - 1) << 54;
+    input_hi ^= bitfliph;
+    /*
+     * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+     * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+     * the high 64 bits of m128.
+     *
+     * The best approach to this operation is different on 32-bit and 64-bit.
+     */
+    if (sizeof(void *) < sizeof(xxh_u64)) {                       /* 32-bit */
+      /*
+       * 32-bit optimized version, which is more readable.
+       *
+       * On 32-bit, it removes an ADC and delays a dependency between the two
+       * halves of m128.high64, but it generates an extra mask on 64-bit.
+       */
+      m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
+                     XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+
+    } else {
+
+      /*
+       * 64-bit optimized (albeit more confusing) version.
+       *
+       * Uses some properties of addition and multiplication to remove the mask:
+       *
+       * Let:
+       *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+       *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+       *    c = XXH_PRIME32_2
+       *
+       *    a + (b * c)
+       * Inverse Property: x + y - x == y
+       *    a + (b * (1 + c - 1))
+       * Distributive Property: x * (y + z) == (x * y) + (x * z)
+       *    a + (b * 1) + (b * (c - 1))
+       * Identity Property: x * 1 == x
+       *    a + b + (b * (c - 1))
+       *
+       * Substitute a, b, and c:
+       *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 -
+       * 1))
+       *
+       * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+       *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+       */
+      m128.high64 +=
+          input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+
+    }
+
+    /* m128 ^= XXH_swap64(m128 >> 64); */
+    m128.low64 ^= XXH_swap64(m128.high64);
+
+    {                      /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+      XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+      h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+      h128.low64 = XXH3_avalanche(h128.low64);
+      h128.high64 = XXH3_avalanche(h128.high64);
+      return h128;
+
+    }
+
+  }
+
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input,
+                                                   size_t        len,
+                                                   const xxh_u8 *secret,
+                                                   XXH64_hash_t  seed) {
+
+  XXH_ASSERT(len <= 16);
+  {
+
+    if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+    if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+    if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+    {
+
+      XXH128_hash_t h128;
+      xxh_u64 const bitflipl =
+          XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
+      xxh_u64 const bitfliph =
+          XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
+      h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+      h128.high64 = XXH64_avalanche(seed ^ bitfliph);
+      return h128;
+
+    }
+
+  }
+
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc,
+                                             const xxh_u8 *input_1,
+                                             const xxh_u8 *input_2,
+                                             const xxh_u8 *secret,
+                                             XXH64_hash_t  seed) {
+
+  acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
+  acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+  acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
+  acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+  return acc;
+
+}
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(
+    const xxh_u8 *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
+
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+  (void)secretSize;
+  XXH_ASSERT(16 < len && len <= 128);
+
+  {
+
+    XXH128_hash_t acc;
+    acc.low64 = len * XXH_PRIME64_1;
+    acc.high64 = 0;
+    if (len > 32) {
+
+      if (len > 64) {
+
+        if (len > 96) {
+
+          acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96,
+                              seed);
+
+        }
+
+        acc =
+            XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
+
+      }
+
+      acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
+
+    }
+
+    acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
+    {
+
+      XXH128_hash_t h128;
+      h128.low64 = acc.low64 + acc.high64;
+      h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
+                    ((len - seed) * XXH_PRIME64_2);
+      h128.low64 = XXH3_avalanche(h128.low64);
+      h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+      return h128;
+
+    }
+
+  }
+
+}
+
+XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(
+    const xxh_u8 *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
+
+  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+  (void)secretSize;
+  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+  {
+
+    XXH128_hash_t acc;
+    int const     nbRounds = (int)len / 32;
+    int           i;
+    acc.low64 = len * XXH_PRIME64_1;
+    acc.high64 = 0;
+    for (i = 0; i < 4; i++) {
+
+      acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
+                          secret + (32 * i), seed);
+
+    }
+
+    acc.low64 = XXH3_avalanche(acc.low64);
+    acc.high64 = XXH3_avalanche(acc.high64);
+    XXH_ASSERT(nbRounds >= 4);
+    for (i = 4; i < nbRounds; i++) {
+
+      acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
+                          secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
+                          seed);
+
+    }
+
+    /* last bytes */
+    acc = XXH128_mix32B(
+        acc, input + len - 16, input + len - 32,
+        secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+        0ULL - seed);
+
+    {
+
+      XXH128_hash_t h128;
+      h128.low64 = acc.low64 + acc.high64;
+      h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
+                    ((len - seed) * XXH_PRIME64_2);
+      h128.low64 = XXH3_avalanche(h128.low64);
+      h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+      return h128;
+
+    }
+
+  }
+
+}
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
+    const void *XXH_RESTRICT input, size_t len,
+    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
+    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
+
+  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+  XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, secret,
+                              secretSize, f_acc512, f_scramble);
+
+  /* converge into final hash */
+  XXH_STATIC_ASSERT(sizeof(acc) == 64);
+  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+  {
+
+    XXH128_hash_t h128;
+    h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
+                                (xxh_u64)len * XXH_PRIME64_1);
+    h128.high64 = XXH3_mergeAccs(
+        acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+        ~((xxh_u64)len * XXH_PRIME64_2));
+    return h128;
+
+  }
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(
+    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
+    const void *XXH_RESTRICT secret, size_t secretLen) {
+
+  (void)seed64;
+  (void)secret;
+  (void)secretLen;
+  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
+                                     sizeof(XXH3_kSecret), XXH3_accumulate_512,
+                                     XXH3_scrambleAcc);
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
+    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
+    const void *XXH_RESTRICT secret, size_t secretLen) {
+
+  (void)seed64;
+  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
+                                     secretLen, XXH3_accumulate_512,
+                                     XXH3_scrambleAcc);
+
+}
+
+XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
+    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
+    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
+    XXH3_f_initCustomSecret f_initSec) {
+
+  if (seed64 == 0)
+    return XXH3_hashLong_128b_internal(
+        input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
+  {
+
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    f_initSec(secret, seed64);
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
+                                       sizeof(secret), f_acc512, f_scramble);
+
+  }
+
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void *input, size_t len, XXH64_hash_t seed64,
+                            const void *XXH_RESTRICT secret, size_t secretLen) {
+
+  (void)secret;
+  (void)secretLen;
+  return XXH3_hashLong_128b_withSeed_internal(
+      input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
+      XXH3_initCustomSecret);
+
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void *XXH_RESTRICT, size_t,
+                                            XXH64_hash_t,
+                                            const void *XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
+                      const void *XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128) {
+
+  XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+  /*
+   * If an action is to be taken if `secret` conditions are not respected,
+   * it should be done here.
+   * For now, it's a contract pre-condition.
+   * Adding a check and a branch here would cost performance at every hash.
+   */
+  if (len <= 16)
+    return XXH3_len_0to16_128b((const xxh_u8 *)input, len,
+                               (const xxh_u8 *)secret, seed64);
+  if (len <= 128)
+    return XXH3_len_17to128_128b((const xxh_u8 *)input, len,
+                                 (const xxh_u8 *)secret, secretLen, seed64);
+  if (len <= XXH3_MIDSIZE_MAX)
+    return XXH3_len_129to240_128b((const xxh_u8 *)input, len,
+                                  (const xxh_u8 *)secret, secretLen, seed64);
+  return f_hl128(input, len, seed64, secret, secretLen);
+
+}
+
+/* ===   Public XXH128 API   === */
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
+
+  return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
+                               sizeof(XXH3_kSecret),
+                               XXH3_hashLong_128b_default);
+
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
+                                                     size_t      len,
+                                                     const void *secret,
+                                                     size_t      secretSize) {
+
+  return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
+                               secretSize, XXH3_hashLong_128b_withSecret);
+
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
+                                                   size_t       len,
+                                                   XXH64_hash_t seed) {
+
+  return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
+                               sizeof(XXH3_kSecret),
+                               XXH3_hashLong_128b_withSeed);
+
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
+                                    XXH64_hash_t seed) {
+
+  return XXH3_128bits_withSeed(input, len, seed);
+
+}
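+
+/*
+ * One-shot usage sketch (illustrative; `data` and `size` stand in for the
+ * caller's buffer and length):
+ *
+ *   XXH128_hash_t h = XXH128(data, size, 0);          // seed = 0
+ *   // h.low64 and h.high64 hold the two 64-bit halves of the digest
+ */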
+
+/* ===   XXH3 128-bit streaming   === */
+
+/*
+ * All the functions are actually the same as for the 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
+                                        XXH64_hash_t seed, const void *secret,
+                                        size_t secretSize) {
+
+  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
+
+  if (statePtr == NULL) return XXH_ERROR;
+  XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
+                              XXH_SECRET_DEFAULT_SIZE);
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
+    XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
+
+  if (statePtr == NULL) return XXH_ERROR;
+  XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
+  if (secret == NULL) return XXH_ERROR;
+  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
+                                                         XXH64_hash_t  seed) {
+
+  if (statePtr == NULL) return XXH_ERROR;
+  if (seed == 0) return XXH3_128bits_reset(statePtr);
+  if (seed != statePtr->seed)
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+  XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  return XXH_OK;
+
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
+                                                 const void *  input,
+                                                 size_t        len) {
+
+  return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
+                     XXH3_scrambleAcc);
+
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
+
+  const unsigned char *const secret =
+      (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+  if (state->totalLen > XXH3_MIDSIZE_MAX) {
+
+    XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+    XXH3_digest_long(acc, state, secret);
+    XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
+               sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {
+
+      XXH128_hash_t h128;
+      h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
+                                  (xxh_u64)state->totalLen * XXH_PRIME64_1);
+      h128.high64 =
+          XXH3_mergeAccs(acc,
+                         secret + state->secretLimit + XXH_STRIPE_LEN -
+                             sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+      return h128;
+
+    }
+
+  }
+
+  /* len <= XXH3_MIDSIZE_MAX : short code */
+  if (state->seed)
+    return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
+                                 state->seed);
+  return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                 secret, state->secretLimit + XXH_STRIPE_LEN);
+
+}
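+
+/*
+ * Streaming usage sketch (illustrative), assuming a state obtained from
+ * XXH3_createState():
+ *
+ *   XXH3_state_t *st = XXH3_createState();
+ *   XXH3_128bits_reset(st);
+ *   XXH3_128bits_update(st, chunk1, len1);
+ *   XXH3_128bits_update(st, chunk2, len2);
+ *   XXH128_hash_t h = XXH3_128bits_digest(st);
+ *   XXH3_freeState(st);
+ */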
+
+  /* 128-bit utility functions */
+
+    #include <string.h>                                   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
+
+  /* note : XXH128_hash_t is compact, it has no padding byte */
+  return !(memcmp(&h1, &h2, sizeof(h1)));
+
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
+
+  XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
+  XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
+  int const           hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+  /* note : bets that, in most cases, hash values are different */
+  if (hcmp) return hcmp;
+  return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+
+}
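+
+/*
+ * Example (illustrative): the comparator plugs straight into stdlib qsort:
+ *
+ *   XXH128_hash_t hashes[16];
+ *   ...
+ *   qsort(hashes, 16, sizeof(XXH128_hash_t), XXH128_cmp);
+ */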
+
+/*======   Canonical representation   ======*/
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
+                                             XXH128_hash_t       hash) {
+
+  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+  if (XXH_CPU_LITTLE_ENDIAN) {
+
+    hash.high64 = XXH_swap64(hash.high64);
+    hash.low64 = XXH_swap64(hash.low64);
+
+  }
+
+  memcpy(dst, &hash.high64, sizeof(hash.high64));
+  memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
+
+  XXH128_hash_t h;
+  h.high64 = XXH_readBE64(src);
+  h.low64 = XXH_readBE64(src->digest + 8);
+  return h;
+
+}
+
+    /* Pop our optimization override from above */
+    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+        && defined(__OPTIMIZE__) &&                                      \
+        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
+      #pragma GCC pop_options
+    #endif
 
   #endif                                                /* XXH_NO_LONG_LONG */
 
diff --git a/llvm_mode/LLVMInsTrim.so.cc b/instrumentation/LLVMInsTrim.so.cc
index 61a420ba..61a420ba 100644
--- a/llvm_mode/LLVMInsTrim.so.cc
+++ b/instrumentation/LLVMInsTrim.so.cc
diff --git a/llvm_mode/MarkNodes.cc b/instrumentation/MarkNodes.cc
index 20a7df35..20a7df35 100644
--- a/llvm_mode/MarkNodes.cc
+++ b/instrumentation/MarkNodes.cc
diff --git a/llvm_mode/MarkNodes.h b/instrumentation/MarkNodes.h
index 8ddc978d..8ddc978d 100644
--- a/llvm_mode/MarkNodes.h
+++ b/instrumentation/MarkNodes.h
diff --git a/llvm_mode/README.cmplog.md b/instrumentation/README.cmplog.md
index 7f426ec8..5f855e1f 100644
--- a/llvm_mode/README.cmplog.md
+++ b/instrumentation/README.cmplog.md
@@ -30,7 +30,7 @@ cp ./program ./program.cmplog
 
 ## Use
 
-AFL++ has the new -c option that can be used to specify a CmpLog binary (the second
+AFL++ has the new -c option that needs to be used to specify the CmpLog binary (the second
 build).
 
 For example:
@@ -39,4 +39,4 @@ For example:
 afl-fuzz -i input -o output -c ./program.cmplog -m none -- ./program.afl @@
 ```
 
-Be careful to use -m none because CmpLog maps a lot of pages.
+Be sure to use `-m none` because CmpLog can map a lot of pages.
diff --git a/llvm_mode/README.ctx.md b/instrumentation/README.ctx.md
index 14255313..caf2c09a 100644
--- a/llvm_mode/README.ctx.md
+++ b/instrumentation/README.ctx.md
@@ -4,7 +4,7 @@
 
 This is an LLVM-based implementation of the context sensitive branch coverage.
 
-Basically every function gets it's own ID and that ID is combined with the
+Basically every function gets its own ID and that ID is combined with the
 edges of the called functions.
 
 So if both function A and function B call a function C, the coverage
diff --git a/gcc_plugin/README.md b/instrumentation/README.gcc_plugin.md
index f762131e..9d6bc200 100644
--- a/gcc_plugin/README.md
+++ b/instrumentation/README.gcc_plugin.md
@@ -1,7 +1,7 @@
 # GCC-based instrumentation for afl-fuzz
 
   (See [../README.md](../README.md) for the general instruction manual.)
-  (See [../llvm_mode/README.md](../llvm_mode/README.md) for the LLVM-based instrumentation.)
+  (See [README.llvm.md](README.llvm.md) for the LLVM-based instrumentation.)
 
 !!! TODO items are:
 !!!  => inline instrumentation has to work!
diff --git a/instrumentation/README.instrim.md b/instrumentation/README.instrim.md
new file mode 100644
index 00000000..99f6477a
--- /dev/null
+++ b/instrumentation/README.instrim.md
@@ -0,0 +1,30 @@
+# InsTrim
+
+InsTrim: Lightweight Instrumentation for Coverage-guided Fuzzing
+
+## Introduction
+
+InsTrim is the work of Chin-Chia Hsu, Che-Yu Wu, Hsu-Chun Hsiao and Shih-Kun Huang.
+
+It uses a CFG (control flow graph) and markers to instrument just what
+is necessary in the binary (i.e. less than normal llvm_mode). As a result
+the binary is about 10-15% faster than normal llvm_mode, however with some
+coverage loss.
+It requires at least llvm version 3.8.0 to build.
+If you have LLVM 7+ we recommend PCGUARD instead.
+
+## Usage
+
+Set the environment variable `AFL_LLVM_INSTRUMENT=CFG` or `AFL_LLVM_INSTRIM=1`
+during compilation of the target.
+
+There is also a special mode which instruments loops in a way that lets
+afl-fuzz see which loop path has been selected, but not how often the loop
+has been rerun.
+This again trades path information for speed.
+To enable this mode set `AFL_LLVM_INSTRIM_LOOPHEAD=1`.
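+
+For example (a minimal sketch; the target name is illustrative):
+
+```
+export AFL_LLVM_INSTRUMENT=CFG
+afl-clang-fast -o target target.c
+```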
+
+## Background
+
+The paper from Chin-Chia Hsu, Che-Yu Wu, Hsu-Chun Hsiao and Shih-Kun Huang:
+[InsTrim: Lightweight Instrumentation for Coverage-guided Fuzzing](https://www.ndss-symposium.org/wp-content/uploads/2018/07/bar2018_14_Hsu_paper.pdf)
diff --git a/llvm_mode/README.instrument_list.md b/instrumentation/README.instrument_list.md
index 1fc06414..60474ec6 100644
--- a/llvm_mode/README.instrument_list.md
+++ b/instrumentation/README.instrument_list.md
@@ -1,8 +1,8 @@
 # Using afl++ with partial instrumentation
 
-  This file describes how you can selectively instrument only the source files
-  or functions that are interesting to you using the LLVM instrumentation
-  provided by afl++
+  This file describes how to selectively instrument only source files
+  or functions that are of interest to you using the LLVM instrumentation
+  provided by afl++.
 
 ## 1) Description and purpose
 
@@ -13,10 +13,10 @@ on the important parts of the program, avoiding undesired noise and
 disturbance by uninteresting code being exercised.
 
 For this purpose, a "partial instrumentation" support en par with llvm sancov
-is provided by afl++ that allows you to specify on a source file and function
+is provided by afl++ that lets you specify on a source file and function
 level which function should be compiled with or without instrumentation.
 
-Note: When using PCGUARD mode - and have llvm 12+ - you can use this instead:
+Note: When using PCGUARD mode - and llvm 12+ - you can use this instead:
 https://clang.llvm.org/docs/SanitizerCoverage.html#partially-disabling-instrumentation
 
 The llvm sancov list format is fully supported by afl++, however afl++ has
@@ -24,8 +24,8 @@ more flexibility.
 
 ## 2) Building the LLVM module
 
-The new code is part of the existing afl++ LLVM module in the llvm_mode/
-subdirectory. There is nothing specifically to do :)
+The new code is part of the existing afl++ LLVM module in the instrumentation/
+subdirectory. There is nothing specifically to do for the build :)
 
 ## 3) How to use the partial instrumentation mode
 
@@ -34,14 +34,14 @@ afl-clang-fast/afl-clang-fast++ or afl-clang-lto/afl-clang-lto++.
 The only required change is that you need to set either the environment variable
 AFL_LLVM_ALLOWLIST or AFL_LLVM_DENYLIST to a filename.
 
-That file then contains the filenames or functions that should be instrumented
-(AFL_LLVM_ALLOWLIST) or should specifically NOT be instrumented (AFL_LLVM_DENYLIST).
+That file should contain the file names or functions that are to be instrumented
+(AFL_LLVM_ALLOWLIST) or are specifically NOT to be instrumented (AFL_LLVM_DENYLIST).
 
-For matching, the function/filename that is being compiled must end in the
-function/filename entry contained in this instrument file list (to avoid
-breaking the matching when absolute paths are used during compilation).
+For matching to succeed, the function/file name that is being compiled must end in the
+function/file name entry contained in this instrument file list. That is to avoid
+breaking the match when absolute paths are used during compilation.
 
-**NOTE:** In builds with optimization enabled functions might be inlined and would not match!
+**NOTE:** In builds with optimization enabled, functions might be inlined and would not match!
 
 For example if your source tree looks like this:
 ```
@@ -52,13 +52,13 @@ project/feature_b/b1.cpp
 project/feature_b/b2.cpp
 ```
 
-and you only want to test feature_a, then create a instrument file list file containing:
+and you only want to test feature_a, then create an "instrument file list" file containing:
 ```
 feature_a/a1.cpp
 feature_a/a2.cpp
 ```
 
-However if the instrument file list file contains only this, it works as well:
+However if the "instrument file list" file contains only this, it works as well:
 ```
 a1.cpp
 a2.cpp
@@ -67,9 +67,9 @@ but it might lead to files being unwantedly instrumented if the same filename
 exists somewhere else in the project directories.
 
 You can also specify function names. Note that for C++ the function names
-must be mangled to match!
+must be mangled to match! `nm` can print these names.
 
-afl++ is able to identify if an entry is a filename or a function.
+afl++ is able to identify whether an entry is a filename or a function.
 However if you want to be sure (and compliant to the sancov allow/blocklist
 format), you can specify source file entries like this:
 ```
@@ -82,5 +82,6 @@ fun: MallocFoo
 Note that whitespace is ignored and comments (`# foo`) are supported.
 
 ## 4) UNIX-style pattern matching
-You can add UNIX-style pattern matching in the the instrument file list entries.
+
+You can add UNIX-style pattern matching in the "instrument file list" entries.
 See `man fnmatch` for the syntax. We do not set any of the `fnmatch` flags.
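+
+For example (a minimal sketch; the patterns are illustrative):
+
+```
+# instrument all sources below feature_a and any mangled function
+# containing "Foo"
+src: *feature_a/*.cpp
+fun: *Foo*
+```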
diff --git a/llvm_mode/README.laf-intel.md b/instrumentation/README.laf-intel.md
index f63ab2bb..c50a6979 100644
--- a/llvm_mode/README.laf-intel.md
+++ b/instrumentation/README.laf-intel.md
@@ -1,5 +1,15 @@
 # laf-intel instrumentation
 
+## Introduction
+
+This is originally the work of an individual nicknamed laf-intel.
+His blog [Circumventing Fuzzing Roadblocks with Compiler Transformations](https://lafintel.wordpress.com/)
+and gitlab repo [laf-llvm-pass](https://gitlab.com/laf-intel/laf-llvm-pass/)
+describe some code transformations that help afl++ to enter conditional
+blocks where the conditions consist of comparisons of large values.
+
 ## Usage
 
 By default these passes will not run when you compile programs using 
@@ -24,18 +34,22 @@ Enables the split-compares pass.
 By default it will 
 1. simplify operators >= (and <=) into chains of > (<) and == comparisons
 2. change signed integer comparisons to a chain of sign-only comparison
-and unsigned comparisons
+and unsigned integer comparisons
 3. split all unsigned integer comparisons with bit widths of
 64, 32 or 16 bits to chains of 8 bit comparisons.
 
 You can change the behaviour of the last step by setting
 `export AFL_LLVM_LAF_SPLIT_COMPARES_BITW=<bit_width>`, where 
-bit_width may be 64, 32 or 16.
+bit_width may be 64, 32 or 16. For example, a bit_width of 16
+would split larger comparisons down to 16 bit comparisons.
 
 A new experimental feature is splitting floating point comparisons into a
 series of sign, exponent and mantissa comparisons followed by splitting each
 of them into 8 bit comparisons when necessary.
 It is activated with the `AFL_LLVM_LAF_SPLIT_FLOATS` setting.
+Please note that full IEEE 754 functionality is not preserved, that is,
+values of NaN and infinity will probably behave differently.
+
 Note that setting this automatically activates `AFL_LLVM_LAF_SPLIT_COMPARES`
 
 You can also set `AFL_LLVM_LAF_ALL` and have all of the above enabled :-)
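+
+For example (a minimal sketch; the target name is illustrative):
+
+```
+export AFL_LLVM_LAF_ALL=1
+afl-clang-fast -o target target.c
+```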
diff --git a/llvm_mode/README.md b/instrumentation/README.llvm.md
index f23d7150..51e9995b 100644
--- a/llvm_mode/README.md
+++ b/instrumentation/README.llvm.md
@@ -1,8 +1,8 @@
 # Fast LLVM-based instrumentation for afl-fuzz
 
-  (See [../README](../README.md) for the general instruction manual.)
+  (See [../README.md](../README.md) for the general instruction manual.)
 
-  (See [../gcc_plugin/README](../gcc_plugin/README.md) for the GCC-based instrumentation.)
+  (See [README.gcc_plugin.md](README.gcc_plugin.md) for the GCC-based instrumentation.)
 
 ## 1) Introduction
 
@@ -93,15 +93,16 @@ operating mode of AFL, e.g.:
 
 Be sure to also include CXX set to afl-clang-fast++ for C++ code.
 
+Note that afl-clang-fast/afl-clang-fast++ are just pointers to afl-cc.
+You can also use afl-cc/afl-c++ and instead direct it to use LLVM
+instrumentation by either setting `AFL_CC_COMPILER=LLVM` or passing the parameter
+`--afl-llvm` via CFLAGS/CXXFLAGS/CPPFLAGS.
+
 The tool honors roughly the same environmental variables as afl-gcc (see
 [docs/env_variables.md](../docs/env_variables.md)). This includes AFL_USE_ASAN,
 AFL_HARDEN, and AFL_DONT_OPTIMIZE. However AFL_INST_RATIO is not honored
-as it does not serve a good purpose with the more effective instrim CFG
-analysis.
-
-Note: if you want the LLVM helper to be installed on your system for all
-users, you need to build it before issuing 'make install' in the parent
-directory.
+as it does not serve a good purpose with the more effective PCGUARD, LTO and
+instrim CFG analysis.
 
 ## 3) Options
 
@@ -109,9 +110,9 @@ Several options are present to make llvm_mode faster or help it rearrange
 the code to make afl-fuzz path discovery easier.
 
 If you need to instrument just specific parts of the code, you can use an instrument file list to specify
-which C/C++ files to actually instrument. See [README.instrument_list](README.instrument_list.md)
+which C/C++ files to actually instrument. See [README.instrument_list.md](README.instrument_list.md)
 
-For splitting memcmp, strncmp, etc. please see [README.laf-intel](README.laf-intel.md)
+For splitting memcmp, strncmp, etc. please see [README.laf-intel.md](README.laf-intel.md)
 
 Then there are different ways of instrumenting the target:
 
@@ -119,42 +120,42 @@ Then there are different ways of instrumenting the target:
 markers to just instrument what is needed. This increases speed by 10-15%
 without any disadvantages.
 If you want to use this, set AFL_LLVM_INSTRUMENT=CFG or AFL_LLVM_INSTRIM=1
-See [README.instrim](README.instrim.md)
+See [README.instrim.md](README.instrim.md)
 
 2. An even better instrumentation strategy uses LTO and link time
 instrumentation. Note that not all targets can compile in this mode, however
 if it works it is the best option you can use.
 Simply use afl-clang-lto/afl-clang-lto++ to use this option.
-See [README.lto](README.lto.md)
+See [README.lto.md](README.lto.md)
 
 3. Alternatively you can choose a completely different coverage method:
 
 3a. N-GRAM coverage - which combines the previous visited edges with the
 current one. This explodes the map but on the other hand has proven to be
 effective for fuzzing.
-See [README.ngram](README.ngram.md)
+See [README.ngram.md](README.ngram.md)
 
 3b. Context sensitive coverage - which combines the visited edges with an
 individual caller ID (the function that called the current one)
-[README.ctx](README.ctx.md)
+See [README.ctx.md](README.ctx.md)
 
 Then - in addition to one of the instrumentation options above - there is
 a very effective new instrumentation option called CmpLog as an alternative to
 laf-intel that allows AFL++ to apply mutations similar to Redqueen.
-See [README.cmplog](README.cmplog.md)
+See [README.cmplog.md](README.cmplog.md)
 
 Finally if your llvm version is 8 or lower, you can activate a mode that
 prevents a counter overflow from resulting in a 0 value. This is good for
 path discovery, but the llvm implementation for x86 for this functionality
 is not optimal and was only fixed in llvm 9.
 You can set this with AFL_LLVM_NOT_ZERO=1
-See [README.neverzero](README.neverzero.md)
+See [README.neverzero.md](README.neverzero.md)
 
 ## 4) Snapshot feature
 
 To speed up fuzzing you can use a linux loadable kernel module which enables
 a snapshot feature.
-See [README.snapshot](README.snapshot.md)
+See [README.snapshot.md](README.snapshot.md)
 
 ## 5) Gotchas, feedback, bugs
 
@@ -182,5 +183,12 @@ targets this way:
 AFL_LLVM_INSTRUMENT=PCGUARD  make
 ```
 
-Note that this us currently the default, as it is the best mode.
+Note that this is currently the default if you use LLVM >= 7, as it is the best
+mode. Recommended is LLVM >= 9.
 If you have llvm 11+ and compiled afl-clang-lto - this is the only better mode.
+
+## 8) Bonus feature: 'dict2file' pass
+
+Just specify `AFL_LLVM_DICT2FILE=/absolute/path/file.txt` and during compilation
+all constant string compare parameters will be written to this file to be
+used with afl-fuzz' `-x` option.
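+
+For example (a minimal sketch; the paths and target name are illustrative):
+
+```
+AFL_LLVM_DICT2FILE=/tmp/target.dict afl-clang-fast -o target target.c
+afl-fuzz -i input -o output -x /tmp/target.dict -- ./target @@
+```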
diff --git a/llvm_mode/README.lto.md b/instrumentation/README.lto.md
index 9046c5a8..abdbd2ac 100644
--- a/llvm_mode/README.lto.md
+++ b/instrumentation/README.lto.md
@@ -95,10 +95,7 @@ export PATH=`pwd`/bin:$PATH
 export LLVM_CONFIG=`pwd`/bin/llvm-config
 cd /path/to/AFLplusplus/
 make
-cd llvm_mode
-make
-cd ..
-make install
+sudo make install
 ```
 
 ## How to use afl-clang-lto
@@ -115,18 +112,18 @@ make
 ```
 
 NOTE: some targets also need to set the linker, try both `afl-clang-lto` and
-`afl-ld-lto` for this for `LD=` for `configure`.
+`afl-ld-lto` for `LD=` before `configure`.
 
 ## AUTODICTIONARY feature
 
-While compiling, automatically a dictionary based on string comparisons is
-generated put into the target binary. This dictionary is transfered to afl-fuzz
+While compiling, a dictionary based on string comparisons is automatically
+generated and put into the target binary. This dictionary is transfered to afl-fuzz
 on start. This improves coverage statistically by 5-10% :)
 
 ## Fixed memory map
 
 To speed up fuzzing, it is possible to set a fixed shared memory map.
-Recommened is the value 0x10000.
+Recommended is the value 0x10000.
 In most cases this will work without any problems. However if a target uses
 early constructors, ifuncs or a deferred forkserver this can crash the target.
 On unusual operating systems/processors/kernels or weird libraries this might
@@ -136,14 +133,14 @@ to be dynamic - the original afl way, which is slower).
 
 ## Document edge IDs
 
-Setting `export AFL_LLVM_DOCUMENT_IDS=file` will document to a file which edge
+Setting `export AFL_LLVM_DOCUMENT_IDS=file` will document in a file which edge
 ID was given to which function. This helps to identify functions with variable
 bytes or which functions were touched by an input.
 
 ## Solving difficult targets
 
 Some targets are difficult because the configure script does unusual stuff that
-is unexpected for afl. See the next chapter `Potential issues` how to solve
+is unexpected for afl. See the next chapter `Potential issues` for how to solve
 these.
 
 ### Example: ffmpeg
@@ -151,7 +148,7 @@ these.
 An example of a hard to solve target is ffmpeg. Here is how to successfully
 instrument it:
 
-1. Get and extract the current ffmpeg and change to it's directory
+1. Get and extract the current ffmpeg and change to its directory
 
 2. Running configure with --cc=clang fails and various other items will fail
    when compiling, so we have to trick configure:
@@ -221,13 +218,13 @@ If you see this message:
 /bin/ld: libfoo.a: error adding symbols: archive has no index; run ranlib to add one
 ```
 This is because usually gnu gcc ranlib is being called which cannot deal with clang LTO files.
-The solution is simple: when you ./configure you have also have to set RANLIB=llvm-ranlib and AR=llvm-ar
+The solution is simple: when you ./configure you also have to set RANLIB=llvm-ranlib and AR=llvm-ar
 
 Solution:
 ```
 AR=llvm-ar RANLIB=llvm-ranlib CC=afl-clang-lto CXX=afl-clang-lto++ ./configure --disable-shared
 ```
-and on some target you have to to AR=/RANLIB= even for make as the configure script does not save it.
+and on some targets you have to set AR=/RANLIB= even for make as the configure script does not save it.
 Other targets ignore environment variables and need the parameters set via
 `./configure --cc=... --cxx= --ranlib= ...` etc. (I am looking at you ffmpeg!).
 
@@ -246,8 +243,8 @@ AS=llvm-as  ...
 afl-clang-lto is still work in progress.
 
 Known issues:
-  * Anything that llvm 11+ cannot compile, afl-clang-lto can not compile either - obviously
-  * Anything that does not compile with LTO, afl-clang-lto can not compile either - obviously
+  * Anything that llvm 11+ cannot compile, afl-clang-lto cannot compile either - obviously
+  * Anything that does not compile with LTO, afl-clang-lto cannot compile either - obviously
 
 Hence if building a target with afl-clang-lto fails try to build it with llvm12
 and LTO enabled (`CC=clang-12` `CXX=clang++-12` `CFLAGS=-flto=full` and
@@ -267,14 +264,14 @@ for this in the PassManager: EP_FullLinkTimeOptimizationLast
 ("Fun" info - nobody knows what this is doing. And the developer who
 implemented this didn't respond to emails.)
 
-In December came then the idea to implement this as a pass that is run via
+In December then came the idea to implement this as a pass that is run via
 the llvm "opt" program, which is performed via an own linker that afterwards
 calls the real linker.
 This was first implemented in January and worked ... kinda.
-The LTO time instrumentation worked, however the "how" the basic blocks were
+The LTO time instrumentation worked, however "how" the basic blocks were
 instrumented was a problem, as reducing duplicates turned out to be very,
 very difficult with a program that has so many paths and therefore so many
-dependencies. At lot of strategies were implemented - and failed.
+dependencies. A lot of strategies were implemented - and failed.
 And then SAT solvers were tried, but with over 10,000 variables that turned
 out to be a dead-end too.
 
diff --git a/llvm_mode/README.neverzero.md b/instrumentation/README.neverzero.md
index 903e5bd3..5c894d6e 100644
--- a/llvm_mode/README.neverzero.md
+++ b/instrumentation/README.neverzero.md
@@ -2,8 +2,8 @@
 
 ## Usage
 
-In larger, complex or reiterative programs the counters that collect the edge
-coverage can easily fill up and wrap around.
+In larger, complex or reiterative programs the byte-sized counters that collect
+the edge coverage can easily fill up and wrap around.
 This is not that much of an issue - unless by chance it wraps just to a value
 of zero when the program execution ends.
 In this case afl-fuzz is not able to see that the edge has been accessed and
diff --git a/llvm_mode/README.ngram.md b/instrumentation/README.ngram.md
index de3ba432..de3ba432 100644
--- a/llvm_mode/README.ngram.md
+++ b/instrumentation/README.ngram.md
diff --git a/llvm_mode/README.persistent_mode.md b/instrumentation/README.persistent_mode.md
index 7d2fd93b..e095f036 100644
--- a/llvm_mode/README.persistent_mode.md
+++ b/instrumentation/README.persistent_mode.md
@@ -4,11 +4,11 @@
 
 The most effective way is to fuzz in persistent mode, as the speed can easily
 be 10 or 20 times faster without any disadvantages.
-*All professionel fuzzing is using this mode.*
+*All professional fuzzing is using this mode.*
 
 This requires that the target can be called in a (or several) function(s),
-and that the state can be resetted so that multiple calls be be performed
-without memory leaking and former runs having no impact on following runs
+and that its state can be reset so that multiple calls can be performed
+without resource leaks, and with former runs having no impact on following runs
 (this can be seen by the `stability` indicator in the `afl-fuzz` UI).
 
 Examples can be found in [examples/persistent_mode](../examples/persistent_mode).
@@ -67,7 +67,7 @@ add this just after the includes:
 #endif
 ```
 
-## 3) deferred initialization
+## 3) Deferred initialization
 
 AFL tries to optimize performance by executing the targeted binary just once,
 stopping it just before main(), and then cloning this "main" process to get
@@ -112,7 +112,7 @@ With the location selected, add this code in the appropriate spot:
 You don't need the #ifdef guards, but including them ensures that the program
 will keep working normally when compiled with a tool other than afl-clang-fast.
 
-Finally, recompile the program with afl-clang-fast (afl-gcc or afl-clang will
+Finally, recompile the program with afl-clang-fast/lto (afl-gcc or afl-clang will
 *not* generate a deferred-initialization binary) - and you should be all set!
 
 *NOTE:* Between `main` and `__AFL_INIT()` there should not be any code
@@ -142,7 +142,7 @@ and just before `__AFL_INIT()`:
   __afl_area_ptr = NULL;
 ```
 
-## 4) persistent mode
+## 4) Persistent mode
 
 Some libraries provide APIs that are stateless, or whose state can be reset in
 between processing different input files. When such a reset is performed, a
@@ -183,7 +183,7 @@ PS. Because there are task switches still involved, the mode isn't as fast as
 faster than the normal fork() model, and compared to in-process fuzzing,
 should be a lot more robust.
 
-## 5) shared memory fuzzing
+## 5) Shared memory fuzzing
 
 You can speed up the fuzzing process even more by receiving the fuzzing data
 via shared memory instead of stdin or files.
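+
+A minimal harness sketch (assuming the `__AFL_FUZZ_*` shared memory macros
+provided by afl-clang-fast; `process_input` is a hypothetical target
+function):
+
+```
+__AFL_FUZZ_INIT();
+
+int main(void) {
+
+  __AFL_INIT();
+  unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;  // shared memory test case
+
+  while (__AFL_LOOP(10000)) {
+
+    int len = __AFL_FUZZ_TESTCASE_LEN;  // read the length inside the loop
+    process_input(buf, len);
+
+  }
+
+  return 0;
+
+}
+```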
diff --git a/llvm_mode/README.snapshot.md b/instrumentation/README.snapshot.md
index 9c12a8ba..c40a956a 100644
--- a/llvm_mode/README.snapshot.md
+++ b/instrumentation/README.snapshot.md
@@ -1,7 +1,7 @@
 # AFL++ snapshot feature
 
 Snapshotting is a feature that makes a snapshot from a process and then
-restores it's state, which is faster then forking it again.
+restores its state, which is faster than forking it again.
 
 All targets compiled with llvm_mode are automatically enabled for the
 snapshot feature.
diff --git a/llvm_mode/SanitizerCoverageLTO.so.cc b/instrumentation/SanitizerCoverageLTO.so.cc
index 1dd65188..1dd65188 100644
--- a/llvm_mode/SanitizerCoverageLTO.so.cc
+++ b/instrumentation/SanitizerCoverageLTO.so.cc
diff --git a/llvm_mode/afl-llvm-rt.o.c b/instrumentation/afl-compiler-rt.o.c
index bdafbe0b..a3d75b15 100644
--- a/llvm_mode/afl-llvm-rt.o.c
+++ b/instrumentation/afl-compiler-rt.o.c
@@ -1,11 +1,6 @@
 /*
-   american fuzzy lop++ - LLVM instrumentation bootstrap
-   ---------------------------------------------------
-
-   Written by Laszlo Szekeres <lszekeres@google.com> and
-              Michal Zalewski
-
-   LLVM integration design comes from Laszlo Szekeres.
+   american fuzzy lop++ - instrumentation bootstrap
+   ------------------------------------------------
 
    Copyright 2015, 2016 Google Inc. All rights reserved.
    Copyright 2019-2020 AFLplusplus Project. All rights reserved.
@@ -16,7 +11,6 @@
 
      http://www.apache.org/licenses/LICENSE-2.0
 
-   This code is the rewrite of afl-as.h's main_payload.
 
 */
 
@@ -111,6 +105,22 @@ static u8 is_persistent;
 
 static u8 _is_sancov;
 
+/* Uninspired gcc plugin instrumentation */
+
+void __afl_trace(const u32 x) {
+
+#if 1                                      /* enable for neverZero feature. */
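+  /* Standard counter increment plus the neverZero trick: if the increment
+     would wrap the byte to 0, add 1 more so a hit edge never reads 0. */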
+  __afl_area_ptr[__afl_prev_loc[0] ^ x] +=
+      1 + ((u8)(1 + __afl_area_ptr[__afl_prev_loc[0] ^ x]) == 0);
+#else
+  ++__afl_area_ptr[__afl_prev_loc[0] ^ x];
+#endif
+
+  __afl_prev_loc[0] = (x >> 1);
+  return;
+
+}
+
 /* Error reporting to forkserver controller */
 
 void send_forkserver_error(int error) {
@@ -808,7 +818,7 @@ static void __afl_start_forkserver(void) {
 }
 
 /* A simplified persistent mode handler, used as explained in
- * llvm_mode/README.md. */
+ * README.llvm.md. */
 
 int __afl_persistent_loop(unsigned int max_cnt) {
 
@@ -958,7 +968,7 @@ __attribute__((constructor(0))) void __afl_auto_first(void) {
 
 /* The following stuff deals with supporting -fsanitize-coverage=trace-pc-guard.
    It remains non-operational in the traditional, plugin-backed LLVM mode.
-   For more info about 'trace-pc-guard', see llvm_mode/README.md.
+   For more info about 'trace-pc-guard', see README.llvm.md.
 
    The first function (__sanitizer_cov_trace_pc_guard) is called back on every
    edge (as opposed to every basic block). */
diff --git a/gcc_plugin/afl-gcc-pass.so.cc b/instrumentation/afl-gcc-pass.so.cc
index c5614aca..c5614aca 100644
--- a/gcc_plugin/afl-gcc-pass.so.cc
+++ b/instrumentation/afl-gcc-pass.so.cc
diff --git a/llvm_mode/afl-llvm-common.cc b/instrumentation/afl-llvm-common.cc
index 189b4ec6..189b4ec6 100644
--- a/llvm_mode/afl-llvm-common.cc
+++ b/instrumentation/afl-llvm-common.cc
diff --git a/llvm_mode/afl-llvm-common.h b/instrumentation/afl-llvm-common.h
index a1561d9c..a1561d9c 100644
--- a/llvm_mode/afl-llvm-common.h
+++ b/instrumentation/afl-llvm-common.h
diff --git a/instrumentation/afl-llvm-dict2file.so.cc b/instrumentation/afl-llvm-dict2file.so.cc
new file mode 100644
index 00000000..e87ecce8
--- /dev/null
+++ b/instrumentation/afl-llvm-dict2file.so.cc
@@ -0,0 +1,599 @@
+/*
+   american fuzzy lop++ - LLVM dict2file instrumentation pass
+   -----------------------------------------------------------
+
+   Written by Marc Heuse <mh@mh-sec.de>
+
+   Copyright 2019-2020 AFLplusplus Project. All rights reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at:
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   This library is plugged into LLVM when invoking clang through afl-clang-fast
+   with AFL_LLVM_DICT2FILE set.
+
+ */
+
+#define AFL_LLVM_PASS
+
+#include "config.h"
+#include "debug.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+#include <list>
+#include <string>
+#include <fstream>
+#include <set>
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Constants.h"
+
+#include "afl-llvm-common.h"
+
+#ifndef O_DSYNC
+  #define O_DSYNC O_SYNC
+#endif
+
+using namespace llvm;
+
+namespace {
+
+class AFLdict2filePass : public ModulePass {
+
+ public:
+  static char ID;
+
+  AFLdict2filePass() : ModulePass(ID) {
+
+    if (getenv("AFL_DEBUG")) debug = 1;
+
+  }
+
+  bool runOnModule(Module &M) override;
+
+};
+
+}  // namespace
+
+void dict2file(int fd, u8 *mem, u32 len) {
+
+  int  i, j, binary = 0;
+  char line[MAX_AUTO_EXTRA * 8], tmp[8];
+
+  strcpy(line, "\"");
+  j = 1;
+  for (i = 0; i < len; i++) {
+
+    if (isprint(mem[i])) {
+
+      line[j++] = mem[i];
+
+    } else {
+
+      if (i + 1 != len || mem[i] != 0 || binary || len == 4 || len == 8) {
+
+        line[j] = 0;
+        sprintf(tmp, "\\x%02x", (u8)mem[i]);
+        strcat(line, tmp);
+        j = strlen(line);
+
+      }
+
+      binary = 1;
+
+    }
+
+  }
+
+  line[j] = 0;
+  strcat(line, "\"\n");
+  if (write(fd, line, strlen(line)) <= 0)
+    PFATAL("Could not write to dictionary file");
+  fsync(fd);
+
+  if (!be_quiet) fprintf(stderr, "Found dictionary token: %s", line);
+
+}
+
+bool AFLdict2filePass::runOnModule(Module &M) {
+
+  DenseMap<Value *, std::string *> valueMap;
+  char *                           ptr;
+  int                              fd, found = 0;
+
+  /* Show a banner */
+  setvbuf(stdout, NULL, _IONBF, 0);
+
+  if ((isatty(2) && !getenv("AFL_QUIET")) || debug) {
+
+    SAYF(cCYA "afl-llvm-dict2file" VERSION cRST
+              " by Marc \"vanHauser\" Heuse <mh@mh-sec.de>\n");
+
+  } else
+
+    be_quiet = 1;
+
+  scanForDangerousFunctions(&M);
+
+  ptr = getenv("AFL_LLVM_DICT2FILE");
+
+  if (!ptr || *ptr != '/')
+    FATAL("AFL_LLVM_DICT2FILE is not set to an absolute path: %s", ptr);
+
+  if ((fd = open(ptr, O_WRONLY | O_APPEND | O_CREAT | O_DSYNC, 0644)) < 0)
+    PFATAL("Could not open/create %s.", ptr);
+
+  /* Instrument all the things! */
+
+  for (auto &F : M) {
+
+    if (isIgnoreFunction(&F)) continue;
+
+    /*  Some implementation notes.
+     *
+     *  We try to handle 3 cases:
+     *  - memcmp("foo", arg, 3) <- literal string
+     *  - static char globalvar[] = "foo";
+     *    memcmp(globalvar, arg, 3) <- global variable
+     *  - char localvar[] = "foo";
+     *    memcmp(localvar, arg, 3) <- local variable
+     *
+     *  The local variable case is the hardest. We can only detect that
+     *  case if there is no reassignment or change to the variable,
+     *  and it might not work across llvm versions.
+     *  What we do is hook the initializer function for local variables
+     *  (llvm.memcpy.p0i8.p0i8.i64) and note the string and the variable
+     *  it is assigned to. If that variable is later used in a compare
+     *  function, we use the noted string.
+     *  This does not seem to work for tokens with a size <= 4 :-(
+     *
+     *  - if the compared length is smaller than the string length we
+     *    save the full string. This is likely better for fuzzing but
+     *    might be wrong in a few cases depending on optimizers
+     *
+     *  - not using StringRef because there is a bug in the llvm 11
+     *    checkout I am using which sometimes points to wrong strings
+     *
+     *  Over and out. Took me a full day. damn. mh/vh
+     */
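+
+    /*  A hypothetical illustration of the local variable case (all
+     *  names are made up):
+     *
+     *      char localvar[] = "SECRETKEY";
+     *      if (memcmp(localvar, arg, 9) == 0) ...
+     *
+     *  clang lowers the initialization to a call to
+     *  llvm.memcpy.p0i8.p0i8.i64(localvar, @.str, 10, ...); the pass
+     *  notes localvar -> "SECRETKEY" in valueMap, and when localvar
+     *  later shows up as a memcmp operand, "SECRETKEY" is emitted as
+     *  the token.
+     */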
+
+    for (auto &BB : F) {
+
+      for (auto &IN : BB) {
+
+        CallInst *callInst = nullptr;
+        CmpInst * cmpInst = nullptr;
+
+        if ((cmpInst = dyn_cast<CmpInst>(&IN))) {
+
+          Value *      op = cmpInst->getOperand(1);
+          ConstantInt *ilen = dyn_cast<ConstantInt>(op);
+
+          if (ilen) {
+
+            u64 val2 = 0, val = ilen->getZExtValue();
+            u32 len = 0;
+            if (val > 0x10000 && val < 0xffffffff) len = 4;
+            if (val > 0x100000001 && val < 0xffffffffffffffff) len = 8;
+
+            if (len) {
+
+              auto c = cmpInst->getPredicate();
+
+              switch (c) {
+
+                case CmpInst::FCMP_OGT:  // fall through
+                case CmpInst::FCMP_OLE:  // fall through
+                case CmpInst::ICMP_SLE:  // fall through
+                case CmpInst::ICMP_SGT:
+
+                  // signed comparison and it is a negative constant
+                  if ((len == 4 && (val & 0x80000000)) ||
+                      (len == 8 && (val & 0x8000000000000000))) {
+
+                    if ((val & 0xffff) != 1) val2 = val - 1;
+                    break;
+
+                  }
+
+                  // fall through
+
+                case CmpInst::FCMP_UGT:  // fall through
+                case CmpInst::FCMP_ULE:  // fall through
+                case CmpInst::ICMP_UGT:  // fall through
+                case CmpInst::ICMP_ULE:
+                  if ((val & 0xffff) != 0xfffe) val2 = val + 1;
+                  break;
+
+                case CmpInst::FCMP_OLT:  // fall through
+                case CmpInst::FCMP_OGE:  // fall through
+                case CmpInst::ICMP_SLT:  // fall through
+                case CmpInst::ICMP_SGE:
+
+                  // signed comparison and it is a negative constant
+                  if ((len == 4 && (val & 0x80000000)) ||
+                      (len == 8 && (val & 0x8000000000000000))) {
+
+                    if ((val & 0xffff) != 1) val2 = val - 1;
+                    break;
+
+                  }
+
+                  // fall through
+
+                case CmpInst::FCMP_ULT:  // fall through
+                case CmpInst::FCMP_UGE:  // fall through
+                case CmpInst::ICMP_ULT:  // fall through
+                case CmpInst::ICMP_UGE:
+                  if ((val & 0xffff) != 1) val2 = val - 1;
+                  break;
+
+                default:
+                  val2 = 0;
+
+              }
+
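+              // for illustration: for "if (x < 0x11223344)" both
+              // 0x11223344 and 0x11223343 are written to the dictionary,
+              // so the fuzzer can reach both sides of the boundary
+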
+              dict2file(fd, (u8 *)&val, len);
+              found++;
+              if (val2) {
+
+                dict2file(fd, (u8 *)&val2, len);
+                found++;
+
+              }
+
+            }
+
+          }
+
+        }
+
+        if ((callInst = dyn_cast<CallInst>(&IN))) {
+
+          bool   isStrcmp = true;
+          bool   isMemcmp = true;
+          bool   isStrncmp = true;
+          bool   isStrcasecmp = true;
+          bool   isStrncasecmp = true;
+          bool   isIntMemcpy = true;
+          bool   addedNull = false;
+          size_t optLen = 0;
+
+          Function *Callee = callInst->getCalledFunction();
+          if (!Callee) continue;
+          if (callInst->getCallingConv() != llvm::CallingConv::C) continue;
+          std::string FuncName = Callee->getName().str();
+          isStrcmp &= !FuncName.compare("strcmp");
+          isMemcmp &= !FuncName.compare("memcmp");
+          isStrncmp &= !FuncName.compare("strncmp");
+          isStrcasecmp &= !FuncName.compare("strcasecmp");
+          isStrncasecmp &= !FuncName.compare("strncasecmp");
+          isIntMemcpy &= !FuncName.compare("llvm.memcpy.p0i8.p0i8.i64");
+
+          if (!isStrcmp && !isMemcmp && !isStrncmp && !isStrcasecmp &&
+              !isStrncasecmp && !isIntMemcpy)
+            continue;
+
+          /* Verify the strcmp/memcmp/strncmp/strcasecmp/strncasecmp function
+           * prototype */
+          FunctionType *FT = Callee->getFunctionType();
+
+          isStrcmp &=
+              FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) &&
+              FT->getParamType(0) == FT->getParamType(1) &&
+              FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext());
+          isStrcasecmp &=
+              FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) &&
+              FT->getParamType(0) == FT->getParamType(1) &&
+              FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext());
+          isMemcmp &= FT->getNumParams() == 3 &&
+                      FT->getReturnType()->isIntegerTy(32) &&
+                      FT->getParamType(0)->isPointerTy() &&
+                      FT->getParamType(1)->isPointerTy() &&
+                      FT->getParamType(2)->isIntegerTy();
+          isStrncmp &= FT->getNumParams() == 3 &&
+                       FT->getReturnType()->isIntegerTy(32) &&
+                       FT->getParamType(0) == FT->getParamType(1) &&
+                       FT->getParamType(0) ==
+                           IntegerType::getInt8PtrTy(M.getContext()) &&
+                       FT->getParamType(2)->isIntegerTy();
+          isStrncasecmp &= FT->getNumParams() == 3 &&
+                           FT->getReturnType()->isIntegerTy(32) &&
+                           FT->getParamType(0) == FT->getParamType(1) &&
+                           FT->getParamType(0) ==
+                               IntegerType::getInt8PtrTy(M.getContext()) &&
+                           FT->getParamType(2)->isIntegerTy();
+
+          if (!isStrcmp && !isMemcmp && !isStrncmp && !isStrcasecmp &&
+              !isStrncasecmp && !isIntMemcpy)
+            continue;
+
+          /* is a str{n,}{case,}cmp/memcmp, check if we have
+           * str{case,}cmp(x, "const") or str{case,}cmp("const", x)
+           * strn{case,}cmp(x, "const", ..) or strn{case,}cmp("const", x, ..)
+           * memcmp(x, "const", ..) or memcmp("const", x, ..) */
+          Value *Str1P = callInst->getArgOperand(0),
+                *Str2P = callInst->getArgOperand(1);
+          std::string Str1, Str2;
+          StringRef   TmpStr;
+          bool        HasStr1 = getConstantStringInfo(Str1P, TmpStr);
+          if (TmpStr.empty()) {
+
+            HasStr1 = false;
+
+          } else {
+
+            HasStr1 = true;
+            Str1 = TmpStr.str();
+
+          }
+
+          bool HasStr2 = getConstantStringInfo(Str2P, TmpStr);
+          if (TmpStr.empty()) {
+
+            HasStr2 = false;
+
+          } else {
+
+            HasStr2 = true;
+            Str2 = TmpStr.str();
+
+          }
+
+          if (debug)
+            fprintf(stderr, "F:%s %p(%s)->\"%s\"(%s) %p(%s)->\"%s\"(%s)\n",
+                    FuncName.c_str(), Str1P, Str1P->getName().str().c_str(),
+                    Str1.c_str(), HasStr1 == true ? "true" : "false", Str2P,
+                    Str2P->getName().str().c_str(), Str2.c_str(),
+                    HasStr2 == true ? "true" : "false");
+
+          // we handle the 2nd parameter first because of llvm memcpy
+          if (!HasStr2) {
+
+            auto *Ptr = dyn_cast<ConstantExpr>(Str2P);
+            if (Ptr && Ptr->isGEPWithNoNotionalOverIndexing()) {
+
+              if (auto *Var = dyn_cast<GlobalVariable>(Ptr->getOperand(0))) {
+
+                if (Var->hasInitializer()) {
+
+                  if (auto *Array =
+                          dyn_cast<ConstantDataArray>(Var->getInitializer())) {
+
+                    HasStr2 = true;
+                    Str2 = Array->getAsString().str();
+
+                  }
+
+                }
+
+              }
+
+            }
+
+          }
+
+          // for the internal memcpy routine we only care about the second
+          // parameter and do not report anything.
+          if (isIntMemcpy == true) {
+
+            if (HasStr2 == true) {
+
+              Value *      op2 = callInst->getArgOperand(2);
+              ConstantInt *ilen = dyn_cast<ConstantInt>(op2);
+              if (ilen) {
+
+                uint64_t literalLength = Str2.size();
+                uint64_t optLength = ilen->getZExtValue();
+                if (literalLength + 1 == optLength) {
+
+                  Str2.append("\0", 1);  // add null byte
+                  addedNull = true;
+
+                }
+
+              }
+
+              valueMap[Str1P] = new std::string(Str2);
+
+              if (debug)
+                fprintf(stderr, "Saved: %s for %p\n", Str2.c_str(), Str1P);
+              continue;
+
+            }
+
+            continue;
+
+          }
+
+          // Neither a literal nor a global variable?
+          // maybe it is a local variable that we saved
+          if (!HasStr2) {
+
+            std::string *strng = valueMap[Str2P];
+            if (strng && !strng->empty()) {
+
+              Str2 = *strng;
+              HasStr2 = true;
+              if (debug)
+                fprintf(stderr, "Filled2: %s for %p\n", strng->c_str(), Str2P);
+
+            }
+
+          }
+
+          if (!HasStr1) {
+
+            auto Ptr = dyn_cast<ConstantExpr>(Str1P);
+
+            if (Ptr && Ptr->isGEPWithNoNotionalOverIndexing()) {
+
+              if (auto *Var = dyn_cast<GlobalVariable>(Ptr->getOperand(0))) {
+
+                if (Var->hasInitializer()) {
+
+                  if (auto *Array =
+                          dyn_cast<ConstantDataArray>(Var->getInitializer())) {
+
+                    HasStr1 = true;
+                    Str1 = Array->getAsString().str();
+
+                  }
+
+                }
+
+              }
+
+            }
+
+          }
+
+          // Neither a literal nor a global variable?
+          // maybe it is a local variable that we saved
+          if (!HasStr1) {
+
+            std::string *strng = valueMap[Str1P];
+            if (strng && !strng->empty()) {
+
+              Str1 = *strng;
+              HasStr1 = true;
+              if (debug)
+                fprintf(stderr, "Filled1: %s for %p\n", strng->c_str(), Str1P);
+
+            }
+
+          }
+
+          /* handle the case where one string is constant and the other variable */
+          if (!(HasStr1 ^ HasStr2)) continue;
+
+          std::string thestring;
+
+          if (HasStr1)
+            thestring = Str1;
+          else
+            thestring = Str2;
+
+          optLen = thestring.length();
+
+          if (isMemcmp || isStrncmp || isStrncasecmp) {
+
+            Value *      op2 = callInst->getArgOperand(2);
+            ConstantInt *ilen = dyn_cast<ConstantInt>(op2);
+            if (ilen) {
+
+              uint64_t literalLength = optLen;
+              optLen = ilen->getZExtValue();
+              if (literalLength + 1 == optLen) {  // add null byte
+                thestring.append("\0", 1);
+                addedNull = true;
+
+              }
+
+            }
+
+          }
+
+          // add null byte if this is a string compare function and a null
+          // was not already added
+          if (!isMemcmp) {
+
+            if (addedNull == false) {
+
+              thestring.append("\0", 1);  // add null byte
+              optLen++;
+
+            }
+
+            // ensure we do not have garbage
+            size_t offset = thestring.find('\0', 0);
+            if (offset + 1 < optLen) optLen = offset + 1;
+            thestring = thestring.substr(0, optLen);
+
+          }
+
+          // we take the longer string, even if the compare was only
+          // against a shorter part. Depending on the compiler's optimizer
+          // this can be wrong, but it is more likely that this helps the
+          // fuzzer
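+          // (e.g. for memcmp(x, "IHDR", 2) the full "IHDR" token is
+          // emitted, not just "IH")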
+          if (optLen != thestring.length()) optLen = thestring.length();
+          if (optLen > MAX_AUTO_EXTRA) optLen = MAX_AUTO_EXTRA;
+          if (optLen < 3)  // too short? skip
+            continue;
+
+          ptr = (char *)thestring.c_str();
+
+          dict2file(fd, (u8 *)ptr, optLen);
+          found++;
+
+        }
+
+      }
+
+    }
+
+  }
+
+  close(fd);
+
+  /* Say something nice. */
+
+  if (!be_quiet) {
+
+    if (!found)
+      OKF("No dictionary entries found.");
+    else
+      OKF("Wrote %d entries to the dictionary file.", found);
+
+  }
+
+  return true;
+
+}
+
+char AFLdict2filePass::ID = 0;
+
+static void registerAFLdict2filePass(const PassManagerBuilder &,
+                                     legacy::PassManagerBase &PM) {
+
+  PM.add(new AFLdict2filePass());
+
+}
+
+static RegisterPass<AFLdict2filePass> X("afl-dict2file",
+                                        "afl++ dict2file instrumentation pass",
+                                        false, false);
+
+static RegisterStandardPasses RegisterAFLdict2filePass(
+    PassManagerBuilder::EP_OptimizerLast, registerAFLdict2filePass);
+
+static RegisterStandardPasses RegisterAFLdict2filePass0(
+    PassManagerBuilder::EP_EnabledOnOptLevel0, registerAFLdict2filePass);
+
diff --git a/llvm_mode/afl-llvm-lto-instrumentation.so.cc b/instrumentation/afl-llvm-lto-instrumentation.so.cc
index 125db229..125db229 100644
--- a/llvm_mode/afl-llvm-lto-instrumentation.so.cc
+++ b/instrumentation/afl-llvm-lto-instrumentation.so.cc
diff --git a/llvm_mode/afl-llvm-lto-instrumentlist.so.cc b/instrumentation/afl-llvm-lto-instrumentlist.so.cc
index a7331444..a7331444 100644
--- a/llvm_mode/afl-llvm-lto-instrumentlist.so.cc
+++ b/instrumentation/afl-llvm-lto-instrumentlist.so.cc
diff --git a/llvm_mode/afl-llvm-pass.so.cc b/instrumentation/afl-llvm-pass.so.cc
index 8c8c987a..8c8c987a 100644
--- a/llvm_mode/afl-llvm-pass.so.cc
+++ b/instrumentation/afl-llvm-pass.so.cc
diff --git a/llvm_mode/afl-llvm-rt-lto.o.c b/instrumentation/afl-llvm-rt-lto.o.c
index e53785ff..e53785ff 100644
--- a/llvm_mode/afl-llvm-rt-lto.o.c
+++ b/instrumentation/afl-llvm-rt-lto.o.c
diff --git a/llvm_mode/cmplog-instructions-pass.cc b/instrumentation/cmplog-instructions-pass.cc
index d5de3dbb..d5de3dbb 100644
--- a/llvm_mode/cmplog-instructions-pass.cc
+++ b/instrumentation/cmplog-instructions-pass.cc
diff --git a/llvm_mode/cmplog-routines-pass.cc b/instrumentation/cmplog-routines-pass.cc
index c44f38c4..c44f38c4 100644
--- a/llvm_mode/cmplog-routines-pass.cc
+++ b/instrumentation/cmplog-routines-pass.cc
diff --git a/llvm_mode/compare-transform-pass.so.cc b/instrumentation/compare-transform-pass.so.cc
index acdd0f3b..acdd0f3b 100644
--- a/llvm_mode/compare-transform-pass.so.cc
+++ b/instrumentation/compare-transform-pass.so.cc
diff --git a/llvm_mode/llvm-ngram-coverage.h b/instrumentation/llvm-ngram-coverage.h
index 12b666e9..12b666e9 100644
--- a/llvm_mode/llvm-ngram-coverage.h
+++ b/instrumentation/llvm-ngram-coverage.h
diff --git a/llvm_mode/split-compares-pass.so.cc b/instrumentation/split-compares-pass.so.cc
index 2e57a30a..2fb90e5e 100644
--- a/llvm_mode/split-compares-pass.so.cc
+++ b/instrumentation/split-compares-pass.so.cc
@@ -356,6 +356,8 @@ bool SplitComparesTransform::simplifyIntSignedness(Module &M) {
    * all signed compares to icomps vector */
   for (auto &F : M) {
 
+    if (!isInInstrumentList(&F)) continue;
+
     for (auto &BB : F) {
 
       for (auto &IN : BB) {
@@ -542,6 +544,8 @@ size_t SplitComparesTransform::splitFPCompares(Module &M) {
    * functions were executed only these four predicates should exist */
   for (auto &F : M) {
 
+    if (!isInInstrumentList(&F)) continue;
+
     for (auto &BB : F) {
 
       for (auto &IN : BB) {
@@ -1052,6 +1056,8 @@ size_t SplitComparesTransform::splitIntCompares(Module &M, unsigned bitw) {
    * were executed only these four predicates should exist */
   for (auto &F : M) {
 
+    if (!isInInstrumentList(&F)) continue;
+
     for (auto &BB : F) {
 
       for (auto &IN : BB) {
diff --git a/llvm_mode/split-switches-pass.so.cc b/instrumentation/split-switches-pass.so.cc
index a79d4114..a79d4114 100644
--- a/llvm_mode/split-switches-pass.so.cc
+++ b/instrumentation/split-switches-pass.so.cc
diff --git a/llvm_mode/Makefile b/llvm_mode/Makefile
deleted file mode 100644
index 3666a74d..00000000
--- a/llvm_mode/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-all:
-	@gmake all || echo please install GNUmake
diff --git a/llvm_mode/README.instrim.md b/llvm_mode/README.instrim.md
deleted file mode 100644
index 7758091b..00000000
--- a/llvm_mode/README.instrim.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# InsTrim
-
-InsTrim: Lightweight Instrumentation for Coverage-guided Fuzzing
-
-## Introduction
-
-InsTrim uses CFG and markers to instrument just what is necessary in the
-binary in llvm_mode. It is about 10-15% faster without disadvantages.
-It requires at least llvm version 3.8.0.
-
-## Usage
-
-Set the environment variable `AFL_LLVM_INSTRUMENT=CFG` or `AFL_LLVM_INSTRIM=1`
-during compilation of the target.
-
-There is also an advanced mode which instruments loops in a way so that
-afl-fuzz can see which loop path has been selected but not being able to
-see how often the loop has been rerun.
-This again is a tradeoff for speed for less path information.
-To enable this mode set `AFL_LLVM_INSTRIM_LOOPHEAD=1`.
-
-## Background
-
-The paper: [InsTrim: Lightweight Instrumentation for Coverage-guided Fuzzing]
-(https://www.ndss-symposium.org/wp-content/uploads/2018/07/bar2018_14_Hsu_paper.pdf)
diff --git a/llvm_mode/afl-clang-fast.c b/llvm_mode/afl-clang-fast.c
deleted file mode 100644
index ccdbca9d..00000000
--- a/llvm_mode/afl-clang-fast.c
+++ /dev/null
@@ -1,1143 +0,0 @@
-/*
-   american fuzzy lop++ - LLVM-mode wrapper for clang
-   ------------------------------------------------
-
-   Written by Laszlo Szekeres <lszekeres@google.com> and
-              Michal Zalewski
-
-   LLVM integration design comes from Laszlo Szekeres.
-
-   Copyright 2015, 2016 Google Inc. All rights reserved.
-   Copyright 2019-2020 AFLplusplus Project. All rights reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at:
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   This program is a drop-in replacement for clang, similar in most respects
-   to ../afl-gcc. It tries to figure out compilation mode, adds a bunch
-   of flags, and then calls the real compiler.
-
- */
-
-#define AFL_MAIN
-
-#include "common.h"
-#include "config.h"
-#include "types.h"
-#include "debug.h"
-#include "alloc-inl.h"
-#include "llvm-ngram-coverage.h"
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "llvm/Config/llvm-config.h"
-
-static u8 * obj_path;                  /* Path to runtime libraries         */
-static u8 **cc_params;                 /* Parameters passed to the real CC  */
-static u32  cc_par_cnt = 1;            /* Param count, including argv0      */
-static u8   llvm_fullpath[PATH_MAX];
-static u8  instrument_mode, instrument_opt_mode, ngram_size, lto_mode, cpp_mode;
-static u8 *lto_flag = AFL_CLANG_FLTO;
-static u8  debug;
-static u8  cwd[4096];
-static u8  cmplog_mode;
-u8         use_stdin = 0;                                          /* dummy */
-// static u8 *march_opt = CFLAGS_OPT;
-
-enum {
-
-  INSTURMENT_DEFAULT = 0,
-  INSTRUMENT_CLASSIC = 1,
-  INSTRUMENT_AFL = 1,
-  INSTRUMENT_PCGUARD = 2,
-  INSTRUMENT_INSTRIM = 3,
-  INSTRUMENT_CFG = 3,
-  INSTRUMENT_LTO = 4,
-  INSTRUMENT_OPT_CTX = 8,
-  INSTRUMENT_OPT_NGRAM = 16
-
-};
-
-char instrument_mode_string[18][18] = {
-
-    "DEFAULT", "CLASSIC", "PCGUARD", "CFG", "LTO", "", "",      "", "CTX", "",
-    "",        "",        "",        "",    "",    "", "NGRAM", ""
-
-};
-
-u8 *getthecwd() {
-
-  static u8 fail[] = "";
-  if (getcwd(cwd, sizeof(cwd)) == NULL) return fail;
-  return cwd;
-
-}
-
-/* Try to find the runtime libraries. If that fails, abort. */
-
-static void find_obj(u8 *argv0) {
-
-  u8 *afl_path = getenv("AFL_PATH");
-  u8 *slash, *tmp;
-
-  if (afl_path) {
-
-#ifdef __ANDROID__
-    tmp = alloc_printf("%s/afl-llvm-rt.so", afl_path);
-#else
-    tmp = alloc_printf("%s/afl-llvm-rt.o", afl_path);
-#endif
-
-    if (!access(tmp, R_OK)) {
-
-      obj_path = afl_path;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-
-  }
-
-  slash = strrchr(argv0, '/');
-
-  if (slash) {
-
-    u8 *dir;
-
-    *slash = 0;
-    dir = ck_strdup(argv0);
-    *slash = '/';
-
-#ifdef __ANDROID__
-    tmp = alloc_printf("%s/afl-llvm-rt.so", dir);
-#else
-    tmp = alloc_printf("%s/afl-llvm-rt.o", dir);
-#endif
-
-    if (!access(tmp, R_OK)) {
-
-      obj_path = dir;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-    ck_free(dir);
-
-  }
-
-#ifdef __ANDROID__
-  if (!access(AFL_PATH "/afl-llvm-rt.so", R_OK)) {
-
-#else
-  if (!access(AFL_PATH "/afl-llvm-rt.o", R_OK)) {
-
-#endif
-
-    obj_path = AFL_PATH;
-    return;
-
-  }
-
-  FATAL(
-      "Unable to find 'afl-llvm-rt.o' or 'afl-llvm-pass.so'. Please set "
-      "AFL_PATH");
-
-}
-
-/* Copy argv to cc_params, making the necessary edits. */
-
-static void edit_params(u32 argc, char **argv, char **envp) {
-
-  u8 fortify_set = 0, asan_set = 0, x_set = 0, bit_mode = 0, shared_linking = 0,
-     preprocessor_only = 0;
-  u8  have_pic = 0;
-  u8 *name;
-
-  cc_params = ck_alloc((argc + 128) * sizeof(u8 *));
-
-  name = strrchr(argv[0], '/');
-  if (!name)
-    name = argv[0];
-  else
-    ++name;
-
-  if (lto_mode)
-    if (lto_flag[0] != '-')
-      FATAL(
-          "Using afl-clang-lto is not possible because Makefile magic did not "
-          "identify the correct -flto flag");
-
-  if (!strcmp(name, "afl-clang-fast++") || !strcmp(name, "afl-clang-lto++") ||
-      !strcmp(name, "afl-clang++")) {
-
-    u8 *alt_cxx = getenv("AFL_CXX");
-    if (USE_BINDIR)
-      snprintf(llvm_fullpath, sizeof(llvm_fullpath), "%s/clang++", LLVM_BINDIR);
-    else
-      sprintf(llvm_fullpath, CLANGPP_BIN);
-    cc_params[0] = alt_cxx && *alt_cxx ? alt_cxx : (u8 *)llvm_fullpath;
-    cpp_mode = 1;
-
-  } else if (!strcmp(name, "afl-clang-fast") ||
-
-             !strcmp(name, "afl-clang-lto") || !strcmp(name, "afl-clang")) {
-
-    u8 *alt_cc = getenv("AFL_CC");
-    if (USE_BINDIR)
-      snprintf(llvm_fullpath, sizeof(llvm_fullpath), "%s/clang", LLVM_BINDIR);
-    else
-      sprintf(llvm_fullpath, CLANG_BIN);
-    cc_params[0] = alt_cc && *alt_cc ? alt_cc : (u8 *)llvm_fullpath;
-
-  } else {
-
-    fprintf(stderr, "Name of the binary: %s\n", argv[0]);
-    FATAL(
-        "Name of the binary is not a known name, expected afl-clang-fast(++) "
-        "or afl-clang-lto(++)");
-
-  }
-
-  cc_params[cc_par_cnt++] = "-Wno-unused-command-line-argument";
-
-  if (lto_mode && cpp_mode)
-    cc_params[cc_par_cnt++] = "-lc++";  // needed by fuzzbench, early
-
-  /* There are several ways to compile with afl-clang-fast. In the traditional
-     mode, we use afl-llvm-pass.so, then there is libLLVMInsTrim.so which is
-     faster and creates less map pollution.
-     Then there is the 'trace-pc-guard' mode, we use native LLVM
-     instrumentation callbacks instead. For trace-pc-guard see:
-     http://clang.llvm.org/docs/SanitizerCoverage.html#tracing-pcs-with-guards
-     The best instrumentatation is with the LTO modes, the classic and
-     InsTrimLTO, the latter is faster. The LTO modes are activated by using
-     afl-clang-lto(++)
-   */
-
-  if (lto_mode) {
-
-    if (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
-        getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
-        getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/afl-llvm-lto-instrumentlist.so", obj_path);
-
-    }
-
-  }
-
-  // laf
-  if (getenv("LAF_SPLIT_SWITCHES") || getenv("AFL_LLVM_LAF_SPLIT_SWITCHES")) {
-
-    if (lto_mode) {
-
-      cc_params[cc_par_cnt++] =
-          alloc_printf("-Wl,-mllvm=-load=%s/split-switches-pass.so", obj_path);
-
-    } else {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/split-switches-pass.so", obj_path);
-
-    }
-
-  }
-
-  if (getenv("LAF_TRANSFORM_COMPARES") ||
-      getenv("AFL_LLVM_LAF_TRANSFORM_COMPARES")) {
-
-    if (lto_mode) {
-
-      cc_params[cc_par_cnt++] = alloc_printf(
-          "-Wl,-mllvm=-load=%s/compare-transform-pass.so", obj_path);
-
-    } else {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/compare-transform-pass.so", obj_path);
-
-    }
-
-  }
-
-  if (getenv("LAF_SPLIT_COMPARES") || getenv("AFL_LLVM_LAF_SPLIT_COMPARES") ||
-      getenv("AFL_LLVM_LAF_SPLIT_FLOATS")) {
-
-    if (lto_mode) {
-
-      cc_params[cc_par_cnt++] =
-          alloc_printf("-Wl,-mllvm=-load=%s/split-compares-pass.so", obj_path);
-
-    } else {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/split-compares-pass.so", obj_path);
-
-    }
-
-  }
-
-  // /laf
-
-  unsetenv("AFL_LD");
-  unsetenv("AFL_LD_CALLER");
-  if (cmplog_mode) {
-
-    if (lto_mode) {
-
-      cc_params[cc_par_cnt++] =
-          alloc_printf("-Wl,-mllvm=-load=%s/cmplog-routines-pass.so", obj_path);
-      cc_params[cc_par_cnt++] =
-          alloc_printf("-Wl,-mllvm=-load=%s/split-switches-pass.so", obj_path);
-      cc_params[cc_par_cnt++] = alloc_printf(
-          "-Wl,-mllvm=-load=%s/cmplog-instructions-pass.so", obj_path);
-
-    } else {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/cmplog-routines-pass.so", obj_path);
-
-      // reuse split switches from laf
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/split-switches-pass.so", obj_path);
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] =
-          alloc_printf("%s/cmplog-instructions-pass.so", obj_path);
-
-    }
-
-    cc_params[cc_par_cnt++] = "-fno-inline";
-
-  }
-
-  if (lto_mode) {
-
-#if defined(AFL_CLANG_LDPATH) && LLVM_VERSION_MAJOR >= 12
-    u8 *ld_ptr = strrchr(AFL_REAL_LD, '/');
-    if (!ld_ptr) ld_ptr = "ld.lld";
-    cc_params[cc_par_cnt++] = alloc_printf("-fuse-ld=%s", ld_ptr);
-    cc_params[cc_par_cnt++] = alloc_printf("--ld-path=%s", AFL_REAL_LD);
-#else
-    cc_params[cc_par_cnt++] = alloc_printf("-fuse-ld=%s", AFL_REAL_LD);
-#endif
-
-    cc_params[cc_par_cnt++] = "-Wl,--allow-multiple-definition";
-
-    if (instrument_mode == INSTRUMENT_CFG)
-      cc_params[cc_par_cnt++] =
-          alloc_printf("-Wl,-mllvm=-load=%s/SanitizerCoverageLTO.so", obj_path);
-    else
-
-      cc_params[cc_par_cnt++] = alloc_printf(
-          "-Wl,-mllvm=-load=%s/afl-llvm-lto-instrumentation.so", obj_path);
-    cc_params[cc_par_cnt++] = lto_flag;
-
-  } else {
-
-    if (instrument_mode == INSTRUMENT_PCGUARD) {
-
-#if LLVM_VERSION_MAJOR > 4 ||   \
-    (LLVM_VERSION_MAJOR == 4 && \
-     (LLVM_VERSION_MINOR > 0 || LLVM_VERSION_PATCH >= 1))
-      cc_params[cc_par_cnt++] =
-          "-fsanitize-coverage=trace-pc-guard";  // edge coverage by default
-#else
-      FATAL("pcguard instrumentation requires llvm 4.0.1+");
-#endif
-
-    } else {
-
-      cc_params[cc_par_cnt++] = "-Xclang";
-      cc_params[cc_par_cnt++] = "-load";
-      cc_params[cc_par_cnt++] = "-Xclang";
-      if (instrument_mode == INSTRUMENT_CFG)
-        cc_params[cc_par_cnt++] =
-            alloc_printf("%s/libLLVMInsTrim.so", obj_path);
-      else
-        cc_params[cc_par_cnt++] = alloc_printf("%s/afl-llvm-pass.so", obj_path);
-
-    }
-
-  }
-
-  // cc_params[cc_par_cnt++] = "-Qunused-arguments";
-
-  // in case LLVM is installed not via a package manager or "make install"
-  // e.g. compiled download or compiled from github then it's ./lib directory
-  // might not be in the search path. Add it if so.
-  u8 *libdir = strdup(LLVM_LIBDIR);
-  if (cpp_mode && strlen(libdir) && strncmp(libdir, "/usr", 4) &&
-      strncmp(libdir, "/lib", 4)) {
-
-    cc_params[cc_par_cnt++] = "-rpath";
-    cc_params[cc_par_cnt++] = libdir;
-
-  } else {
-
-    free(libdir);
-
-  }
-
-  u32 idx;
-  if (lto_mode && argc > 1) {
-
-    for (idx = 1; idx < argc; idx++) {
-
-      if (!strncasecmp(argv[idx], "-fpic", 5)) have_pic = 1;
-
-    }
-
-    if (!have_pic) cc_params[cc_par_cnt++] = "-fPIC";
-
-  }
-
-  /* Detect stray -v calls from ./configure scripts. */
-
-  while (--argc) {
-
-    u8 *cur = *(++argv);
-
-    if (!strcmp(cur, "-m32")) bit_mode = 32;
-    if (!strcmp(cur, "armv7a-linux-androideabi")) bit_mode = 32;
-    if (!strcmp(cur, "-m64")) bit_mode = 64;
-
-    if (!strcmp(cur, "-x")) x_set = 1;
-
-    if (!strcmp(cur, "-fsanitize=address") || !strcmp(cur, "-fsanitize=memory"))
-      asan_set = 1;
-
-    if (strstr(cur, "FORTIFY_SOURCE")) fortify_set = 1;
-
-    if (!strcmp(cur, "-Wl,-z,defs") || !strcmp(cur, "-Wl,--no-undefined"))
-      continue;
-
-    if (lto_mode && !strncmp(cur, "-fuse-ld=", 9)) continue;
-    if (lto_mode && !strncmp(cur, "--ld-path=", 10)) continue;
-
-    if (!strcmp(cur, "-E")) preprocessor_only = 1;
-    if (!strcmp(cur, "-shared")) shared_linking = 1;
-
-    cc_params[cc_par_cnt++] = cur;
-
-  }
-
-  if (getenv("AFL_HARDEN")) {
-
-    cc_params[cc_par_cnt++] = "-fstack-protector-all";
-
-    if (!fortify_set) cc_params[cc_par_cnt++] = "-D_FORTIFY_SOURCE=2";
-
-  }
-
-  if (!asan_set) {
-
-    if (getenv("AFL_USE_ASAN")) {
-
-      if (getenv("AFL_USE_MSAN")) FATAL("ASAN and MSAN are mutually exclusive");
-
-      if (getenv("AFL_HARDEN"))
-        FATAL("ASAN and AFL_HARDEN are mutually exclusive");
-
-      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-      cc_params[cc_par_cnt++] = "-fsanitize=address";
-
-    } else if (getenv("AFL_USE_MSAN")) {
-
-      if (getenv("AFL_USE_ASAN")) FATAL("ASAN and MSAN are mutually exclusive");
-
-      if (getenv("AFL_HARDEN"))
-        FATAL("MSAN and AFL_HARDEN are mutually exclusive");
-
-      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-      cc_params[cc_par_cnt++] = "-fsanitize=memory";
-
-    }
-
-  }
-
-  if (getenv("AFL_USE_UBSAN")) {
-
-    cc_params[cc_par_cnt++] = "-fsanitize=undefined";
-    cc_params[cc_par_cnt++] = "-fsanitize-undefined-trap-on-error";
-    cc_params[cc_par_cnt++] = "-fno-sanitize-recover=all";
-
-  }
-
-  if (getenv("AFL_USE_CFISAN")) {
-
-    if (!lto_mode) {
-
-      uint32_t i = 0, found = 0;
-      while (envp[i] != NULL && !found)
-        if (strncmp("-flto", envp[i++], 5) == 0) found = 1;
-      if (!found) cc_params[cc_par_cnt++] = "-flto";
-
-    }
-
-    cc_params[cc_par_cnt++] = "-fsanitize=cfi";
-    cc_params[cc_par_cnt++] = "-fvisibility=hidden";
-
-  }
-
-  if (!getenv("AFL_DONT_OPTIMIZE")) {
-
-    cc_params[cc_par_cnt++] = "-g";
-    cc_params[cc_par_cnt++] = "-O3";
-    cc_params[cc_par_cnt++] = "-funroll-loops";
-    // if (strlen(march_opt) > 1 && march_opt[0] == '-')
-    //  cc_params[cc_par_cnt++] = march_opt;
-
-  }
-
-  if (getenv("AFL_NO_BUILTIN") || getenv("AFL_LLVM_LAF_TRANSFORM_COMPARES") ||
-      getenv("LAF_TRANSFORM_COMPARES") || lto_mode) {
-
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-memcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-bcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strstr";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasestr";
-
-  }
-
-#if defined(USEMMAP) && !defined(__HAIKU__)
-  cc_params[cc_par_cnt++] = "-lrt";
-#endif
-
-  cc_params[cc_par_cnt++] = "-D__AFL_HAVE_MANUAL_CONTROL=1";
-  cc_params[cc_par_cnt++] = "-D__AFL_COMPILER=1";
-  cc_params[cc_par_cnt++] = "-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1";
-
-  /* When the user tries to use persistent or deferred forkserver modes by
-     appending a single line to the program, we want to reliably inject a
-     signature into the binary (to be picked up by afl-fuzz) and we want
-     to call a function from the runtime .o file. This is unnecessarily
-     painful for three reasons:
-
-     1) We need to convince the compiler not to optimize out the signature.
-        This is done with __attribute__((used)).
-
-     2) We need to convince the linker, when called with -Wl,--gc-sections,
-        not to do the same. This is done by forcing an assignment to a
-        'volatile' pointer.
-
-     3) We need to declare __afl_persistent_loop() in the global namespace,
-        but doing this within a method in a class is hard - :: and extern "C"
-        are forbidden and __attribute__((alias(...))) doesn't work. Hence the
-        __asm__ aliasing trick.
-
-   */
-
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_FUZZ_INIT()="
-      "int __afl_sharedmem_fuzzing = 1;"
-      "extern unsigned int *__afl_fuzz_len;"
-      "extern unsigned char *__afl_fuzz_ptr;"
-      "unsigned char __afl_fuzz_alt[1024000];"
-      "unsigned char *__afl_fuzz_alt_ptr = __afl_fuzz_alt;";
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_FUZZ_TESTCASE_BUF=(__afl_fuzz_ptr ? __afl_fuzz_ptr : "
-      "__afl_fuzz_alt_ptr)";
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_FUZZ_TESTCASE_LEN=(__afl_fuzz_ptr ? *__afl_fuzz_len : "
-      "(*__afl_fuzz_len = read(0, __afl_fuzz_alt_ptr, 1024000)) == 0xffffffff "
-      "? 0 : *__afl_fuzz_len)";
-
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_LOOP(_A)="
-      "({ static volatile char *_B __attribute__((used)); "
-      " _B = (char*)\"" PERSIST_SIG
-      "\"; "
-#ifdef __APPLE__
-      "__attribute__((visibility(\"default\"))) "
-      "int _L(unsigned int) __asm__(\"___afl_persistent_loop\"); "
-#else
-      "__attribute__((visibility(\"default\"))) "
-      "int _L(unsigned int) __asm__(\"__afl_persistent_loop\"); "
-#endif                                                        /* ^__APPLE__ */
-      "_L(_A); })";
-
-  cc_params[cc_par_cnt++] =
-      "-D__AFL_INIT()="
-      "do { static volatile char *_A __attribute__((used)); "
-      " _A = (char*)\"" DEFER_SIG
-      "\"; "
-#ifdef __APPLE__
-      "__attribute__((visibility(\"default\"))) "
-      "void _I(void) __asm__(\"___afl_manual_init\"); "
-#else
-      "__attribute__((visibility(\"default\"))) "
-      "void _I(void) __asm__(\"__afl_manual_init\"); "
-#endif                                                        /* ^__APPLE__ */
-      "_I(); } while (0)";
-
-  if (x_set) {
-
-    cc_params[cc_par_cnt++] = "-x";
-    cc_params[cc_par_cnt++] = "none";
-
-  }
-
-  if (preprocessor_only) {
-
-    /* In the preprocessor_only case (-E), we are not actually compiling at
-       all but requesting the compiler to output preprocessed sources only.
-       We must not add the runtime in this case because the compiler will
-       simply output its binary content back on stdout, breaking any build
-       systems that rely on a separate source preprocessing step. */
-    cc_params[cc_par_cnt] = NULL;
-    return;
-
-  }
-
-#ifndef __ANDROID__
-  switch (bit_mode) {
-
-    case 0:
-      cc_params[cc_par_cnt++] = alloc_printf("%s/afl-llvm-rt.o", obj_path);
-      if (lto_mode)
-        cc_params[cc_par_cnt++] =
-            alloc_printf("%s/afl-llvm-rt-lto.o", obj_path);
-      break;
-
-    case 32:
-      cc_params[cc_par_cnt++] = alloc_printf("%s/afl-llvm-rt-32.o", obj_path);
-      if (access(cc_params[cc_par_cnt - 1], R_OK))
-        FATAL("-m32 is not supported by your compiler");
-      if (lto_mode) {
-
-        cc_params[cc_par_cnt++] =
-            alloc_printf("%s/afl-llvm-rt-lto-32.o", obj_path);
-        if (access(cc_params[cc_par_cnt - 1], R_OK))
-          FATAL("-m32 is not supported by your compiler");
-
-      }
-
-      break;
-
-    case 64:
-      cc_params[cc_par_cnt++] = alloc_printf("%s/afl-llvm-rt-64.o", obj_path);
-      if (access(cc_params[cc_par_cnt - 1], R_OK))
-        FATAL("-m64 is not supported by your compiler");
-      if (lto_mode) {
-
-        cc_params[cc_par_cnt++] =
-            alloc_printf("%s/afl-llvm-rt-lto-64.o", obj_path);
-        if (access(cc_params[cc_par_cnt - 1], R_OK))
-          FATAL("-m64 is not supported by your compiler");
-
-      }
-
-      break;
-
-  }
-
-  #ifndef __APPLE__
-  if (!shared_linking)
-    cc_params[cc_par_cnt++] =
-        alloc_printf("-Wl,--dynamic-list=%s/dynamic_list.txt", obj_path);
-  #endif
-
-#endif
-
-  cc_params[cc_par_cnt] = NULL;
-
-}
-
-/* Main entry point */
-
-int main(int argc, char **argv, char **envp) {
-
-  int   i;
-  char *callname = "afl-clang-fast", *ptr = NULL;
-
-  if (getenv("AFL_DEBUG")) {
-
-    debug = 1;
-    if (strcmp(getenv("AFL_DEBUG"), "0") == 0) unsetenv("AFL_DEBUG");
-
-  } else if (getenv("AFL_QUIET"))
-
-    be_quiet = 1;
-
-  if (getenv("USE_TRACE_PC") || getenv("AFL_USE_TRACE_PC") ||
-      getenv("AFL_LLVM_USE_TRACE_PC") || getenv("AFL_TRACE_PC")) {
-
-    if (instrument_mode == 0)
-      instrument_mode = INSTRUMENT_PCGUARD;
-    else if (instrument_mode != INSTRUMENT_PCGUARD)
-      FATAL("you can not set AFL_LLVM_INSTRUMENT and AFL_TRACE_PC together");
-
-  }
-
-  if ((getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
-       getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
-       getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) &&
-      getenv("AFL_DONT_OPTIMIZE"))
-    WARNF(
-        "AFL_LLVM_ALLOWLIST/DENYLIST and AFL_DONT_OPTIMIZE cannot be combined "
-        "for file matching, only function matching!");
-
-  if (getenv("AFL_LLVM_INSTRIM") || getenv("INSTRIM") ||
-      getenv("INSTRIM_LIB")) {
-
-    if (instrument_mode == 0)
-      instrument_mode = INSTRUMENT_CFG;
-    else if (instrument_mode != INSTRUMENT_CFG)
-      FATAL(
-          "you can not set AFL_LLVM_INSTRUMENT and AFL_LLVM_INSTRIM together");
-
-  }
-
-  if (getenv("AFL_LLVM_CTX")) instrument_opt_mode |= INSTRUMENT_OPT_CTX;
-
-  if (getenv("AFL_LLVM_NGRAM_SIZE")) {
-
-    instrument_opt_mode |= INSTRUMENT_OPT_NGRAM;
-    ngram_size = atoi(getenv("AFL_LLVM_NGRAM_SIZE"));
-    if (ngram_size < 2 || ngram_size > NGRAM_SIZE_MAX)
-      FATAL(
-          "NGRAM instrumentation mode must be between 2 and NGRAM_SIZE_MAX "
-          "(%u)",
-          NGRAM_SIZE_MAX);
-
-  }
-
-  if (getenv("AFL_LLVM_INSTRUMENT")) {
-
-    u8 *ptr = strtok(getenv("AFL_LLVM_INSTRUMENT"), ":,;");
-
-    while (ptr) {
-
-      if (strncasecmp(ptr, "afl", strlen("afl")) == 0 ||
-          strncasecmp(ptr, "classic", strlen("classic")) == 0) {
-
-        if (instrument_mode == INSTRUMENT_LTO) {
-
-          instrument_mode = INSTRUMENT_CLASSIC;
-          lto_mode = 1;
-
-        } else if (!instrument_mode || instrument_mode == INSTRUMENT_AFL)
-
-          instrument_mode = INSTRUMENT_AFL;
-        else
-          FATAL("main instrumentation mode already set with %s",
-                instrument_mode_string[instrument_mode]);
-
-      }
-
-      if (strncasecmp(ptr, "pc-guard", strlen("pc-guard")) == 0 ||
-          strncasecmp(ptr, "pcguard", strlen("pcguard")) == 0) {
-
-        if (!instrument_mode || instrument_mode == INSTRUMENT_PCGUARD)
-          instrument_mode = INSTRUMENT_PCGUARD;
-        else
-          FATAL("main instrumentation mode already set with %s",
-                instrument_mode_string[instrument_mode]);
-
-      }
-
-      if (strncasecmp(ptr, "cfg", strlen("cfg")) == 0 ||
-          strncasecmp(ptr, "instrim", strlen("instrim")) == 0) {
-
-        if (instrument_mode == INSTRUMENT_LTO) {
-
-          instrument_mode = INSTRUMENT_CFG;
-          lto_mode = 1;
-
-        } else if (!instrument_mode || instrument_mode == INSTRUMENT_CFG)
-
-          instrument_mode = INSTRUMENT_CFG;
-        else
-          FATAL("main instrumentation mode already set with %s",
-                instrument_mode_string[instrument_mode]);
-
-      }
-
-      if (strncasecmp(ptr, "lto", strlen("lto")) == 0) {
-
-        lto_mode = 1;
-        if (!instrument_mode || instrument_mode == INSTRUMENT_LTO)
-          instrument_mode = INSTRUMENT_LTO;
-        else if (instrument_mode != INSTRUMENT_CFG)
-          FATAL("main instrumentation mode already set with %s",
-                instrument_mode_string[instrument_mode]);
-
-      }
-
-      if (strncasecmp(ptr, "ctx", strlen("ctx")) == 0) {
-
-        instrument_opt_mode |= INSTRUMENT_OPT_CTX;
-        setenv("AFL_LLVM_CTX", "1", 1);
-
-      }
-
-      if (strncasecmp(ptr, "ngram", strlen("ngram")) == 0) {
-
-        ptr += strlen("ngram");
-        while (*ptr && (*ptr < '0' || *ptr > '9')) {
-
-          ptr++;
-
-        }
-
-        if (!*ptr) {
-
-          ptr = getenv("AFL_LLVM_NGRAM_SIZE");
-          if (!ptr || !*ptr) {
-
-            FATAL(
-                "you must set the NGRAM size with (e.g. for value 2) "
-                "AFL_LLVM_INSTRUMENT=ngram-2");
-
-          }
-
-        }
-
-        ngram_size = atoi(ptr);
-        if (ngram_size < 2 || ngram_size > NGRAM_SIZE_MAX)
-          FATAL(
-              "NGRAM instrumentation option must be between 2 and "
-              "NGRAM_SIZE_MAX "
-              "(%u)",
-              NGRAM_SIZE_MAX);
-        instrument_opt_mode |= (INSTRUMENT_OPT_NGRAM);
-        ptr = alloc_printf("%u", ngram_size);
-        setenv("AFL_LLVM_NGRAM_SIZE", ptr, 1);
-
-      }
-
-      ptr = strtok(NULL, ":,;");
-
-    }
-
-  }
-
-  if (strstr(argv[0], "afl-clang-lto") != NULL) {
-
-    if (instrument_mode == 0 || instrument_mode == INSTRUMENT_LTO ||
-        instrument_mode == INSTRUMENT_CFG) {
-
-      lto_mode = 1;
-      callname = "afl-clang-lto";
-      if (!instrument_mode) {
-
-        instrument_mode = INSTRUMENT_CFG;
-        ptr = instrument_mode_string[instrument_mode];
-
-      }
-
-    } else if (instrument_mode == INSTRUMENT_LTO ||
-
-               instrument_mode == INSTRUMENT_CLASSIC) {
-
-      lto_mode = 1;
-      callname = "afl-clang-lto";
-
-    } else {
-
-      if (!be_quiet)
-        WARNF("afl-clang-lto called with mode %s, using that mode instead",
-              instrument_mode_string[instrument_mode]);
-
-    }
-
-  }
-
-  if (instrument_mode == 0) {
-
-#if LLVM_VERSION_MAJOR <= 6
-    instrument_mode = INSTRUMENT_AFL;
-#else
-    if (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
-        getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
-        getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) {
-
-      instrument_mode = INSTRUMENT_AFL;
-      WARNF(
-          "switching to classic instrumentation because "
-          "AFL_LLVM_ALLOWLIST/DENYLIST does not work with PCGUARD. Use "
-          "-fsanitize-coverage-allowlist=allowlist.txt or "
-          "-fsanitize-coverage-blocklist=denylist.txt if you want to use "
-          "PCGUARD. Requires llvm 12+. See https://clang.llvm.org/docs/ "
-          "SanitizerCoverage.html#partially-disabling-instrumentation");
-
-    } else
-
-      instrument_mode = INSTRUMENT_PCGUARD;
-#endif
-
-  }
-
-  if (instrument_opt_mode && lto_mode)
-    FATAL(
-        "CTX and NGRAM can not be used in LTO mode (and would make LTO "
-        "useless)");
-
-  if (!instrument_opt_mode) {
-
-    if (lto_mode && instrument_mode == INSTRUMENT_CFG)
-      ptr = alloc_printf("InsTrimLTO");
-    else
-      ptr = instrument_mode_string[instrument_mode];
-
-  } else if (instrument_opt_mode == INSTRUMENT_OPT_CTX)
-
-    ptr = alloc_printf("%s + CTX", instrument_mode_string[instrument_mode]);
-  else if (instrument_opt_mode == INSTRUMENT_OPT_NGRAM)
-    ptr = alloc_printf("%s + NGRAM-%u", instrument_mode_string[instrument_mode],
-                       ngram_size);
-  else
-    ptr = alloc_printf("%s + CTX + NGRAM-%u",
-                       instrument_mode_string[instrument_mode], ngram_size);
-
-#ifndef AFL_CLANG_FLTO
-  if (lto_mode)
-    FATAL(
-        "instrumentation mode LTO specified but LLVM support not available "
-        "(requires LLVM 11 or higher)");
-#endif
-
-  if (instrument_opt_mode && instrument_mode != INSTRUMENT_CLASSIC &&
-      instrument_mode != INSTRUMENT_CFG)
-    FATAL(
-        "CTX and NGRAM instrumentation options can only be used with CFG "
-        "(recommended) and CLASSIC instrumentation modes!");
-
-  if (getenv("AFL_LLVM_SKIP_NEVERZERO") && getenv("AFL_LLVM_NOT_ZERO"))
-    FATAL(
-        "AFL_LLVM_NOT_ZERO and AFL_LLVM_SKIP_NEVERZERO can not be set "
-        "together");
-
-  if (instrument_mode == INSTRUMENT_PCGUARD &&
-      (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
-       getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
-       getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")))
-    FATAL(
-        "Instrumentation type PCGUARD does not support "
-        "AFL_LLVM_ALLOWLIST/DENYLIST! Use "
-        "-fsanitize-coverage-allowlist=allowlist.txt or "
-        "-fsanitize-coverage-blocklist=denylist.txt instead (requires llvm "
-        "12+), see "
-        "https://clang.llvm.org/docs/"
-        "SanitizerCoverage.html#partially-disabling-instrumentation");
-
-  if (argc < 2 || strcmp(argv[1], "-h") == 0) {
-
-    if (!lto_mode)
-      printf("afl-clang-fast" VERSION " by <lszekeres@google.com> in %s mode\n",
-             ptr);
-    else
-      printf("afl-clang-lto" VERSION
-             "  by Marc \"vanHauser\" Heuse <mh@mh-sec.de> in %s mode\n",
-             ptr);
-
-    SAYF(
-        "\n"
-        "%s[++] [options]\n"
-        "\n"
-        "This is a helper application for afl-fuzz. It serves as a drop-in "
-        "replacement\n"
-        "for clang, letting you recompile third-party code with the "
-        "required "
-        "runtime\n"
-        "instrumentation. A common use pattern would be one of the "
-        "following:\n\n"
-
-        "  CC=%s/afl-clang-fast ./configure\n"
-        "  CXX=%s/afl-clang-fast++ ./configure\n\n"
-
-        "In contrast to the traditional afl-clang tool, this version is "
-        "implemented as\n"
-        "an LLVM pass and tends to offer improved performance with slow "
-        "programs.\n\n"
-
-        "Environment variables used:\n"
-        "AFL_CC: path to the C compiler to use\n"
-        "AFL_CXX: path to the C++ compiler to use\n"
-        "AFL_DEBUG: enable developer debugging output\n"
-        "AFL_DONT_OPTIMIZE: disable optimization instead of -O3\n"
-        "AFL_HARDEN: adds code hardening to catch memory bugs\n"
-        "AFL_INST_RATIO: percentage of branches to instrument\n"
-#if LLVM_VERSION_MAJOR < 9
-        "AFL_LLVM_NOT_ZERO: use cycling trace counters that skip zero\n"
-#else
-        "AFL_LLVM_SKIP_NEVERZERO: do not skip zero on trace counters\n"
-#endif
-        "AFL_LLVM_LAF_SPLIT_COMPARES: enable cascaded comparisons\n"
-        "AFL_LLVM_LAF_SPLIT_COMPARES_BITW: size limit (default 8)\n"
-        "AFL_LLVM_LAF_SPLIT_SWITCHES: casc. comp. in 'switch'\n"
-        " to cascaded comparisons\n"
-        "AFL_LLVM_LAF_SPLIT_FLOATS: transform floating point comp. to "
-        "cascaded comp.\n"
-        "AFL_LLVM_LAF_TRANSFORM_COMPARES: transform library comparison "
-        "function calls\n"
-        "AFL_LLVM_LAF_ALL: enables all LAF splits/transforms\n"
-        "AFL_LLVM_INSTRUMENT_ALLOW/AFL_LLVM_INSTRUMENT_DENY: enable instrument"
-        "allow/deny listing (selective instrumentation)\n"
-        "AFL_NO_BUILTIN: compile for use with libtokencap.so\n"
-        "AFL_PATH: path to instrumenting pass and runtime "
-        "(afl-llvm-rt.*o)\n"
-        "AFL_LLVM_DOCUMENT_IDS: document edge IDs given to which function (LTO "
-        "only)\n"
-        "AFL_QUIET: suppress verbose output\n"
-        "AFL_USE_ASAN: activate address sanitizer\n"
-        "AFL_USE_CFISAN: activate control flow sanitizer\n"
-        "AFL_USE_MSAN: activate memory sanitizer\n"
-        "AFL_USE_UBSAN: activate undefined behaviour sanitizer\n",
-        callname, BIN_PATH, BIN_PATH);
-
-    SAYF(
-        "\nafl-clang-fast specific environment variables:\n"
-        "AFL_LLVM_CMPLOG: log operands of comparisons (RedQueen mutator)\n"
-        "AFL_LLVM_INSTRUMENT: set instrumentation mode: AFL, CFG "
-        "(INSTRIM), PCGUARD [DEFAULT], LTO, CTX, NGRAM-2 ... NGRAM-16\n"
-        " You can also use the old environment variables instead:\n"
-        "  AFL_LLVM_USE_TRACE_PC: use LLVM trace-pc-guard instrumentation "
-        "[DEFAULT]\n"
-        "  AFL_LLVM_INSTRIM: use light weight instrumentation InsTrim\n"
-        "  AFL_LLVM_INSTRIM_LOOPHEAD: optimize loop tracing for speed ("
-        "option to INSTRIM)\n"
-        "  AFL_LLVM_CTX: use context sensitive coverage\n"
-        "  AFL_LLVM_NGRAM_SIZE: use ngram prev_loc count coverage\n");
-
-#ifdef AFL_CLANG_FLTO
-    SAYF(
-        "\nafl-clang-lto specific environment variables:\n"
-        "AFL_LLVM_MAP_ADDR: use a fixed coverage map address (speed), e.g. "
-        "0x10000\n"
-        "AFL_LLVM_DOCUMENT_IDS: write all edge IDs and the corresponding "
-        "functions they are in into this file\n"
-        "AFL_LLVM_LTO_DONTWRITEID: don't write the highest ID used to a "
-        "global var\n"
-        "AFL_LLVM_LTO_STARTID: from which ID to start counting from for a "
-        "bb\n"
-        "AFL_REAL_LD: use this lld linker instead of the compiled in path\n"
-        "\nafl-clang-lto was built with linker target \"%s\" and LTO flags "
-        "\"%s\"\n"
-        "If anything fails - be sure to read README.lto.md!\n",
-        AFL_REAL_LD, AFL_CLANG_FLTO);
-#endif
-
-    SAYF(
-        "\nafl-clang-fast was built for llvm %s with the llvm binary path "
-        "of \"%s\".\n",
-        LLVM_VERSION, LLVM_BINDIR);
-
-    SAYF("\n");
-
-    exit(1);
-
-  } else if ((isatty(2) && !be_quiet) ||
-
-             getenv("AFL_DEBUG") != NULL) {
-
-    if (!lto_mode)
-
-      SAYF(cCYA "afl-clang-fast" VERSION cRST
-                " by <lszekeres@google.com> in %s mode\n",
-           ptr);
-
-    else
-
-      SAYF(cCYA "afl-clang-lto" VERSION cRST
-                " by Marc \"vanHauser\" Heuse <mh@mh-sec.de> in mode %s\n",
-           ptr);
-
-  }
-
-  u8 *ptr2;
-  if (!be_quiet && !lto_mode &&
-      ((ptr2 = getenv("AFL_MAP_SIZE")) || (ptr2 = getenv("AFL_MAPSIZE")))) {
-
-    u32 map_size = atoi(ptr2);
-    if (map_size != MAP_SIZE)
-      WARNF("AFL_MAP_SIZE is not supported by afl-clang-fast");
-
-  }
-
-  if (debug) {
-
-    SAYF(cMGN "[D]" cRST " cd \"%s\";", getthecwd());
-    for (i = 0; i < argc; i++)
-      SAYF(" \"%s\"", argv[i]);
-    SAYF("\n");
-
-  }
-
-  check_environment_vars(envp);
-
-  if (getenv("AFL_LLVM_LAF_ALL")) {
-
-    setenv("AFL_LLVM_LAF_SPLIT_SWITCHES", "1", 1);
-    setenv("AFL_LLVM_LAF_SPLIT_COMPARES", "1", 1);
-    setenv("AFL_LLVM_LAF_SPLIT_FLOATS", "1", 1);
-    setenv("AFL_LLVM_LAF_TRANSFORM_COMPARES", "1", 1);
-
-  }
-
-  cmplog_mode = getenv("AFL_CMPLOG") || getenv("AFL_LLVM_CMPLOG");
-  if (!be_quiet && cmplog_mode)
-    printf("CmpLog mode by <andreafioraldi@gmail.com>\n");
-
-#ifndef __ANDROID__
-  find_obj(argv[0]);
-#endif
-
-  edit_params(argc, argv, envp);
-
-  if (debug) {
-
-    SAYF(cMGN "[D]" cRST " cd \"%s\";", getthecwd());
-    for (i = 0; i < cc_par_cnt; i++)
-      SAYF(" \"%s\"", cc_params[i]);
-    SAYF("\n");
-
-  }
-
-  execvp(cc_params[0], (char **)cc_params);
-
-  FATAL("Oops, failed to execute '%s' - check your PATH", cc_params[0]);
-
-  return 0;
-
-}
-
diff --git a/qemu_mode/patches/afl-qemu-cpu-inl.h b/qemu_mode/patches/afl-qemu-cpu-inl.h
index 63b7581d..0e38f38b 100644
--- a/qemu_mode/patches/afl-qemu-cpu-inl.h
+++ b/qemu_mode/patches/afl-qemu-cpu-inl.h
@@ -466,7 +466,7 @@ void afl_forkserver(CPUState *cpu) {
 }
 
 /* A simplified persistent mode handler, used as explained in
- * llvm_mode/README.md. */
+ * instrumentation/README.llvm.md. */
 
 void afl_persistent_loop(void) {
 
diff --git a/src/afl-cc.c b/src/afl-cc.c
new file mode 100644
index 00000000..e11ce40a
--- /dev/null
+++ b/src/afl-cc.c
@@ -0,0 +1,1544 @@
+/*
+   american fuzzy lop++ - compiler instrumentation wrapper
+   -------------------------------------------------------
+
+   Written by Michal Zalewski, Laszlo Szekeres and Marc Heuse
+
+   Copyright 2015, 2016 Google Inc. All rights reserved.
+   Copyright 2019-2020 AFLplusplus Project. All rights reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at:
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ */
+
+#define AFL_MAIN
+
+#include "common.h"
+#include "config.h"
+#include "types.h"
+#include "debug.h"
+#include "alloc-inl.h"
+#include "llvm-ngram-coverage.h"
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <limits.h>
+#include <assert.h>
+
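+/* If LLVM_MAJOR is defined but expands to nothing (or to 0), normalize
+   it to 0 so numeric comparisons on it stay valid. */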
+#if (LLVM_MAJOR - 0 == 0)
+  #undef LLVM_MAJOR
+#endif
+#if !defined(LLVM_MAJOR)
+  #define LLVM_MAJOR 0
+#endif
+
+static u8 * obj_path;                  /* Path to runtime libraries         */
+static u8 **cc_params;                 /* Parameters passed to the real CC  */
+static u32  cc_par_cnt = 1;            /* Param count, including argv0      */
+static u8   llvm_fullpath[PATH_MAX];
+static u8   instrument_mode, instrument_opt_mode, ngram_size, lto_mode,
+    compiler_mode, plusplus_mode;
+static u8  have_gcc, have_llvm, have_gcc_plugin, have_lto;
+static u8 *lto_flag = AFL_CLANG_FLTO, *argvnull;
+static u8  debug;
+static u8  cwd[4096];
+static u8  cmplog_mode;
+u8         use_stdin;                                              /* dummy */
+// static u8 *march_opt = CFLAGS_OPT;
+
+enum {
+
+  INSTURMENT_DEFAULT = 0,
+  INSTRUMENT_CLASSIC = 1,
+  INSTRUMENT_AFL = 1,
+  INSTRUMENT_PCGUARD = 2,
+  INSTRUMENT_INSTRIM = 3,
+  INSTRUMENT_CFG = 3,
+  INSTRUMENT_LTO = 4,
+  INSTRUMENT_OPT_CTX = 8,
+  INSTRUMENT_OPT_NGRAM = 16
+
+};
+
+char instrument_mode_string[18][18] = {
+
+    "DEFAULT", "CLASSIC", "PCGUARD", "CFG", "LTO", "", "",      "", "CTX", "",
+    "",        "",        "",        "",    "",    "", "NGRAM", ""
+
+};
+
+enum {
+
+  UNSET = 0,
+  LTO = 1,
+  LLVM = 2,
+  GCC_PLUGIN = 3,
+  GCC = 4
+
+};
+
+char compiler_mode_string[6][12] = {
+
+    "AUTOSELECT", "LLVM-LTO", "LLVM", "GCC_PLUGIN",
+    "GCC",        ""
+
+};
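+
+/* A sketch of the mode selection (the dispatch logic follows further
+   down in this file): invoking the wrapper as afl-clang-lto(++) selects
+   LTO, afl-clang-fast(++) selects LLVM, afl-gcc-fast/afl-g++-fast
+   selects GCC_PLUGIN and afl-gcc/afl-g++ selects GCC; UNSET means
+   autoselect the best mode available at build time. */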
+
+u8 *getthecwd() {
+
+  static u8 fail[] = "";
+  if (getcwd(cwd, sizeof(cwd)) == NULL) return fail;
+  return cwd;
+
+}
+
+/* Try to find a specific runtime library; returns NULL on failure. */
+
+static u8 *find_object(u8 *obj, u8 *argv0) {
+
+  u8 *afl_path = getenv("AFL_PATH");
+  u8 *slash = NULL, *tmp;
+
+  if (afl_path) {
+
+    tmp = alloc_printf("%s/%s", afl_path, obj);
+
+    if (!access(tmp, R_OK)) {
+
+      obj_path = afl_path;
+      return tmp;
+
+    }
+
+    ck_free(tmp);
+
+  }
+
+  if (argv0) slash = strrchr(argv0, '/');
+
+  if (slash) {
+
+    u8 *dir;
+
+    *slash = 0;
+    dir = ck_strdup(argv0);
+    *slash = '/';
+
+    tmp = alloc_printf("%s/%s", dir, obj);
+
+    if (!access(tmp, R_OK)) {
+
+      obj_path = dir;
+      return tmp;
+
+    }
+
+    ck_free(tmp);
+    ck_free(dir);
+
+  }
+
+  tmp = alloc_printf("%s/%s", AFL_PATH, obj);
+  if (!access(tmp, R_OK)) {
+
+    obj_path = AFL_PATH;
+    return tmp;
+
+  }
+
+  ck_free(tmp);
+  return NULL;
+
+}
+
+/* Try to find the runtime libraries. If that fails, abort. */
+
+static void find_obj(u8 *argv0) {
+
+  u8 *afl_path = getenv("AFL_PATH");
+  u8 *slash, *tmp;
+
+  if (afl_path) {
+
+#ifdef __ANDROID__
+    tmp = alloc_printf("%s/afl-compiler-rt.so", afl_path);
+#else
+    tmp = alloc_printf("%s/afl-compiler-rt.o", afl_path);
+#endif
+
+    if (!access(tmp, R_OK)) {
+
+      obj_path = afl_path;
+      ck_free(tmp);
+      return;
+
+    }
+
+    ck_free(tmp);
+
+  }
+
+  slash = strrchr(argv0, '/');
+
+  if (slash) {
+
+    u8 *dir;
+
+    *slash = 0;
+    dir = ck_strdup(argv0);
+    *slash = '/';
+
+#ifdef __ANDROID__
+    tmp = alloc_printf("%s/afl-compiler-rt.so", dir);
+#else
+    tmp = alloc_printf("%s/afl-compiler-rt.o", dir);
+#endif
+
+    if (!access(tmp, R_OK)) {
+
+      obj_path = dir;
+      ck_free(tmp);
+      return;
+
+    }
+
+    ck_free(tmp);
+    ck_free(dir);
+
+  }
+
+#ifdef __ANDROID__
+  if (!access(AFL_PATH "/afl-compiler-rt.so", R_OK)) {
+
+#else
+  if (!access(AFL_PATH "/afl-compiler-rt.o", R_OK)) {
+
+#endif
+
+    obj_path = AFL_PATH;
+    return;
+
+  }
+
+  FATAL(
+      "Unable to find 'afl-compiler-rt.o'. Please set "
+      "AFL_PATH");
+
+}
+
+/* Copy argv to cc_params, making the necessary edits. */
+
+static void edit_params(u32 argc, char **argv, char **envp) {
+
+  u8 fortify_set = 0, asan_set = 0, x_set = 0, bit_mode = 0, shared_linking = 0,
+     preprocessor_only = 0, have_unroll = 0, have_o = 0, have_pic = 0;
+  u8 *name;
+
+  cc_params = ck_alloc((argc + 128) * sizeof(u8 *));
+
+  name = strrchr(argv[0], '/');
+  if (!name)
+    name = argv[0];
+  else
+    ++name;
+
+  if (lto_mode) {
+
+    if (lto_flag[0] != '-')
+      FATAL(
+          "Using afl-clang-lto is not possible because Makefile magic did not "
+          "identify the correct -flto flag");
+    else
+      compiler_mode = LTO;
+
+  }
+
+  if (plusplus_mode) {
+
+    u8 *alt_cxx = getenv("AFL_CXX");
+
+    if (!alt_cxx) {
+
+      if (compiler_mode >= GCC_PLUGIN) {
+
+        alt_cxx = "g++";
+
+      } else {
+
+        if (USE_BINDIR)
+          snprintf(llvm_fullpath, sizeof(llvm_fullpath), "%s/clang++",
+                   LLVM_BINDIR);
+        else
+          snprintf(llvm_fullpath, sizeof(llvm_fullpath), CLANGPP_BIN);
+        alt_cxx = llvm_fullpath;
+
+      }
+
+    }
+
+    cc_params[0] = alt_cxx;
+
+  } else {
+
+    u8 *alt_cc = getenv("AFL_CC");
+
+    if (!alt_cc) {
+
+      if (compiler_mode >= GCC_PLUGIN) {
+
+        alt_cc = "gcc";
+
+      } else {
+
+        if (USE_BINDIR)
+          snprintf(llvm_fullpath, sizeof(llvm_fullpath), "%s/clang",
+                   LLVM_BINDIR);
+        else
+          snprintf(llvm_fullpath, sizeof(llvm_fullpath), CLANG_BIN);
+        alt_cc = llvm_fullpath;
+
+      }
+
+    }
+
+    cc_params[0] = alt_cc;
+
+  }
+
+  if (compiler_mode == GCC) {
+
+    cc_params[cc_par_cnt++] = "-B";
+    cc_params[cc_par_cnt++] = obj_path;
+
+  }
+
+  if (compiler_mode == GCC_PLUGIN) {
+
+    char *fplugin_arg =
+        alloc_printf("-fplugin=%s", find_object("afl-gcc-pass.so", argvnull));
+    cc_params[cc_par_cnt++] = fplugin_arg;
+
+  }
+
+  if (compiler_mode == LLVM || compiler_mode == LTO) {
+
+    cc_params[cc_par_cnt++] = "-Wno-unused-command-line-argument";
+
+    if (lto_mode && plusplus_mode)
+      cc_params[cc_par_cnt++] = "-lc++";  // needed by fuzzbench, must be early
+
+    if (lto_mode) {
+
+      if (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
+          getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
+          getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/afl-llvm-lto-instrumentlist.so", obj_path);
+
+      }
+
+    }
+
+    if (getenv("AFL_LLVM_DICT2FILE")) {
+
+      cc_params[cc_par_cnt++] = "-Xclang";
+      cc_params[cc_par_cnt++] = "-load";
+      cc_params[cc_par_cnt++] = "-Xclang";
+      cc_params[cc_par_cnt++] =
+          alloc_printf("%s/afl-llvm-dict2file.so", obj_path);
+
+    }
+
+    // laf
+    if (getenv("LAF_SPLIT_SWITCHES") || getenv("AFL_LLVM_LAF_SPLIT_SWITCHES")) {
+
+      if (lto_mode) {
+
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/split-switches-pass.so", obj_path);
+
+      } else {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/split-switches-pass.so", obj_path);
+
+      }
+
+    }
+
+    if (getenv("LAF_TRANSFORM_COMPARES") ||
+        getenv("AFL_LLVM_LAF_TRANSFORM_COMPARES")) {
+
+      if (lto_mode) {
+
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/compare-transform-pass.so", obj_path);
+
+      } else {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/compare-transform-pass.so", obj_path);
+
+      }
+
+    }
+
+    if (getenv("LAF_SPLIT_COMPARES") || getenv("AFL_LLVM_LAF_SPLIT_COMPARES") ||
+        getenv("AFL_LLVM_LAF_SPLIT_FLOATS")) {
+
+      if (lto_mode) {
+
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/split-compares-pass.so", obj_path);
+
+      } else {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/split-compares-pass.so", obj_path);
+
+      }
+
+    }
+
+    // /laf
+
+    unsetenv("AFL_LD");
+    unsetenv("AFL_LD_CALLER");
+    if (cmplog_mode) {
+
+      if (lto_mode) {
+
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/cmplog-routines-pass.so", obj_path);
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/split-switches-pass.so", obj_path);
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/cmplog-instructions-pass.so", obj_path);
+
+      } else {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/cmplog-routines-pass.so", obj_path);
+
+        // reuse split switches from laf
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/split-switches-pass.so", obj_path);
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/cmplog-instructions-pass.so", obj_path);
+
+      }
+
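+      /* presumably inlining is disabled here so that calls to comparison
+         functions such as strcmp remain visible for the cmplog routines
+         pass to hook */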
+      cc_params[cc_par_cnt++] = "-fno-inline";
+
+    }
+
+    if (lto_mode) {
+
+      u8 *ld_path = strdup(AFL_REAL_LD);
+      if (!*ld_path) ld_path = "ld.lld";
+#if defined(AFL_CLANG_LDPATH) && LLVM_MAJOR >= 12
+      cc_params[cc_par_cnt++] = alloc_printf("--ld-path=%s", ld_path);
+#else
+      cc_params[cc_par_cnt++] = alloc_printf("-fuse-ld=%s", ld_path);
+#endif
+
+      cc_params[cc_par_cnt++] = "-Wl,--allow-multiple-definition";
+
+      if (instrument_mode == INSTRUMENT_CFG)
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/SanitizerCoverageLTO.so", obj_path);
+      else
+
+        cc_params[cc_par_cnt++] = alloc_printf(
+            "-Wl,-mllvm=-load=%s/afl-llvm-lto-instrumentation.so", obj_path);
+      cc_params[cc_par_cnt++] = lto_flag;
+
+    } else {
+
+      if (instrument_mode == INSTRUMENT_PCGUARD) {
+
+#if LLVM_MAJOR >= 4
+        cc_params[cc_par_cnt++] =
+            "-fsanitize-coverage=trace-pc-guard";  // edge coverage by default
+#else
+        FATAL("pcguard instrumentation requires llvm 4.0.1+");
+#endif
+
+      } else {
+
+        cc_params[cc_par_cnt++] = "-Xclang";
+        cc_params[cc_par_cnt++] = "-load";
+        cc_params[cc_par_cnt++] = "-Xclang";
+        if (instrument_mode == INSTRUMENT_CFG)
+          cc_params[cc_par_cnt++] =
+              alloc_printf("%s/libLLVMInsTrim.so", obj_path);
+        else
+          cc_params[cc_par_cnt++] =
+              alloc_printf("%s/afl-llvm-pass.so", obj_path);
+
+      }
+
+    }
+
+    // cc_params[cc_par_cnt++] = "-Qunused-arguments";
+
+    // If LLVM was not installed via a package manager or "make install",
+    // e.g. it was built from a downloaded tarball or from github, its ./lib
+    // directory might not be in the search path. Add it if so.
+    u8 *libdir = strdup(LLVM_LIBDIR);
+    if (plusplus_mode && strlen(libdir) && strncmp(libdir, "/usr", 4) &&
+        strncmp(libdir, "/lib", 4)) {
+
+      cc_params[cc_par_cnt++] = "-rpath";
+      cc_params[cc_par_cnt++] = libdir;
+
+    } else {
+
+      free(libdir);
+
+    }
+
+    u32 idx;
+    if (lto_mode && argc > 1) {
+
+      for (idx = 1; idx < argc; idx++) {
+
+        if (!strncasecmp(argv[idx], "-fpic", 5)) have_pic = 1;
+
+      }
+
+      if (!have_pic) cc_params[cc_par_cnt++] = "-fPIC";
+
+    }
+
+  }
+
+  /* Scan the remaining arguments: note flags we must react to, drop a few
+     we must not pass on, and copy everything else through to the real
+     compiler. */
+
+  while (--argc) {
+
+    u8 *cur = *(++argv);
+
+    if (!strncmp(cur, "--afl", 5)) continue;
+    if (lto_mode && !strncmp(cur, "-fuse-ld=", 9)) continue;
+    if (lto_mode && !strncmp(cur, "--ld-path=", 10)) continue;
+    if (!strcmp(cur, "-Wl,-z,defs") || !strcmp(cur, "-Wl,--no-undefined"))
+      continue;
+
+    if (!strcmp(cur, "-m32")) bit_mode = 32;
+    if (!strcmp(cur, "armv7a-linux-androideabi")) bit_mode = 32;
+    if (!strcmp(cur, "-m64")) bit_mode = 64;
+
+    if (!strcmp(cur, "-fsanitize=address") || !strcmp(cur, "-fsanitize=memory"))
+      asan_set = 1;
+
+    if (strstr(cur, "FORTIFY_SOURCE")) fortify_set = 1;
+
+    if (!strcmp(cur, "-x")) x_set = 1;
+    if (!strcmp(cur, "-E")) preprocessor_only = 1;
+    if (!strcmp(cur, "-shared")) shared_linking = 1;
+
+    if (!strncmp(cur, "-O", 2)) have_o = 1;
+    if (!strncmp(cur, "-f", 2) && strstr(cur, "unroll-loop")) have_unroll = 1;
+
+    cc_params[cc_par_cnt++] = cur;
+
+  }
+
+  if (getenv("AFL_HARDEN")) {
+
+    cc_params[cc_par_cnt++] = "-fstack-protector-all";
+
+    if (!fortify_set) cc_params[cc_par_cnt++] = "-D_FORTIFY_SOURCE=2";
+
+  }
+
+  if (!asan_set) {
+
+    if (getenv("AFL_USE_ASAN")) {
+
+      if (getenv("AFL_USE_MSAN")) FATAL("ASAN and MSAN are mutually exclusive");
+
+      if (getenv("AFL_HARDEN"))
+        FATAL("ASAN and AFL_HARDEN are mutually exclusive");
+
+      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
+      cc_params[cc_par_cnt++] = "-fsanitize=address";
+
+    } else if (getenv("AFL_USE_MSAN")) {
+
+      if (getenv("AFL_USE_ASAN")) FATAL("ASAN and MSAN are mutually exclusive");
+
+      if (getenv("AFL_HARDEN"))
+        FATAL("MSAN and AFL_HARDEN are mutually exclusive");
+
+      cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
+      cc_params[cc_par_cnt++] = "-fsanitize=memory";
+
+    }
+
+  }
+
+  if (getenv("AFL_USE_UBSAN")) {
+
+    cc_params[cc_par_cnt++] = "-fsanitize=undefined";
+    cc_params[cc_par_cnt++] = "-fsanitize-undefined-trap-on-error";
+    cc_params[cc_par_cnt++] = "-fno-sanitize-recover=all";
+
+  }
+
+  if (getenv("AFL_USE_CFISAN")) {
+
+    if (!lto_mode) {
+
+      uint32_t i = 0, found = 0;
+      while (envp[i] != NULL && !found)
+        if (strncmp("-flto", envp[i++], 5) == 0) found = 1;
+      if (!found) cc_params[cc_par_cnt++] = "-flto";
+
+    }
+
+    cc_params[cc_par_cnt++] = "-fsanitize=cfi";
+    cc_params[cc_par_cnt++] = "-fvisibility=hidden";
+
+  }
+
+  if (!getenv("AFL_DONT_OPTIMIZE")) {
+
+    cc_params[cc_par_cnt++] = "-g";
+    if (!have_o) cc_params[cc_par_cnt++] = "-O3";
+    if (!have_unroll) cc_params[cc_par_cnt++] = "-funroll-loops";
+    // if (strlen(march_opt) > 1 && march_opt[0] == '-')
+    //  cc_params[cc_par_cnt++] = march_opt;
+
+  }
+
+  if (getenv("AFL_NO_BUILTIN") || getenv("AFL_LLVM_LAF_TRANSFORM_COMPARES") ||
+      getenv("LAF_TRANSFORM_COMPARES") || lto_mode) {
+
+    cc_params[cc_par_cnt++] = "-fno-builtin-strcmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-strncmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-strcasecmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-strncasecmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-memcmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-bcmp";
+    cc_params[cc_par_cnt++] = "-fno-builtin-strstr";
+    cc_params[cc_par_cnt++] = "-fno-builtin-strcasestr";
+
+  }
+
+#if defined(USEMMAP) && !defined(__HAIKU__)
+  cc_params[cc_par_cnt++] = "-lrt";
+#endif
+
+  cc_params[cc_par_cnt++] = "-D__AFL_HAVE_MANUAL_CONTROL=1";
+  cc_params[cc_par_cnt++] = "-D__AFL_COMPILER=1";
+  cc_params[cc_par_cnt++] = "-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1";
+
+  /* When the user tries to use persistent or deferred forkserver modes by
+     appending a single line to the program, we want to reliably inject a
+     signature into the binary (to be picked up by afl-fuzz) and we want
+     to call a function from the runtime .o file. This is unnecessarily
+     painful for three reasons:
+
+     1) We need to convince the compiler not to optimize out the signature.
+        This is done with __attribute__((used)).
+
+     2) We need to convince the linker, when called with -Wl,--gc-sections,
+        not to do the same. This is done by forcing an assignment to a
+        'volatile' pointer.
+
+     3) We need to declare __afl_persistent_loop() in the global namespace,
+        but doing this within a method in a class is hard - :: and extern "C"
+        are forbidden and __attribute__((alias(...))) doesn't work. Hence the
+        __asm__ aliasing trick.
+
+   */
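+
+  /* For illustration, a target harness typically uses the macros defined
+     below as in this minimal sketch (target_function is a hypothetical
+     entry point; see instrumentation/README.persistent_mode.md):
+
+       __AFL_FUZZ_INIT();
+
+       int main(void) {
+
+         __AFL_INIT();                         // deferred forkserver start
+         unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
+
+         while (__AFL_LOOP(10000)) {           // persistent mode, 10k iters
+
+           int len = __AFL_FUZZ_TESTCASE_LEN;
+           target_function(buf, len);
+
+         }
+
+         return 0;
+
+       }
+
+   */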
+
+  cc_params[cc_par_cnt++] =
+      "-D__AFL_FUZZ_INIT()="
+      "int __afl_sharedmem_fuzzing = 1;"
+      "extern unsigned int *__afl_fuzz_len;"
+      "extern unsigned char *__afl_fuzz_ptr;"
+      "unsigned char __afl_fuzz_alt[1024000];"
+      "unsigned char *__afl_fuzz_alt_ptr = __afl_fuzz_alt;";
+  cc_params[cc_par_cnt++] =
+      "-D__AFL_FUZZ_TESTCASE_BUF=(__afl_fuzz_ptr ? __afl_fuzz_ptr : "
+      "__afl_fuzz_alt_ptr)";
+  cc_params[cc_par_cnt++] =
+      "-D__AFL_FUZZ_TESTCASE_LEN=(__afl_fuzz_ptr ? *__afl_fuzz_len : "
+      "(*__afl_fuzz_len = read(0, __afl_fuzz_alt_ptr, 1024000)) == 0xffffffff "
+      "? 0 : *__afl_fuzz_len)";
+
+  cc_params[cc_par_cnt++] =
+      "-D__AFL_LOOP(_A)="
+      "({ static volatile char *_B __attribute__((used)); "
+      " _B = (char*)\"" PERSIST_SIG
+      "\"; "
+#ifdef __APPLE__
+      "__attribute__((visibility(\"default\"))) "
+      "int _L(unsigned int) __asm__(\"___afl_persistent_loop\"); "
+#else
+      "__attribute__((visibility(\"default\"))) "
+      "int _L(unsigned int) __asm__(\"__afl_persistent_loop\"); "
+#endif                                                        /* ^__APPLE__ */
+      "_L(_A); })";
+
+  cc_params[cc_par_cnt++] =
+      "-D__AFL_INIT()="
+      "do { static volatile char *_A __attribute__((used)); "
+      " _A = (char*)\"" DEFER_SIG
+      "\"; "
+#ifdef __APPLE__
+      "__attribute__((visibility(\"default\"))) "
+      "void _I(void) __asm__(\"___afl_manual_init\"); "
+#else
+      "__attribute__((visibility(\"default\"))) "
+      "void _I(void) __asm__(\"__afl_manual_init\"); "
+#endif                                                        /* ^__APPLE__ */
+      "_I(); } while (0)";
+
+  if (x_set) {
+
+    cc_params[cc_par_cnt++] = "-x";
+    cc_params[cc_par_cnt++] = "none";
+
+  }
+
+  if (preprocessor_only) {
+
+    /* In the preprocessor_only case (-E), we are not actually compiling at
+       all but requesting the compiler to output preprocessed sources only.
+       We must not add the runtime in this case because the compiler will
+       simply output its binary content back on stdout, breaking any build
+       systems that rely on a separate source preprocessing step. */
+    cc_params[cc_par_cnt] = NULL;
+    return;
+
+  }
+
+#ifndef __ANDROID__
+
+  if (compiler_mode != GCC) {
+
+    switch (bit_mode) {
+
+      case 0:
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/afl-compiler-rt.o", obj_path);
+        if (lto_mode)
+          cc_params[cc_par_cnt++] =
+              alloc_printf("%s/afl-llvm-rt-lto.o", obj_path);
+        break;
+
+      case 32:
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/afl-compiler-rt-32.o", obj_path);
+        if (access(cc_params[cc_par_cnt - 1], R_OK))
+          FATAL("-m32 is not supported by your compiler");
+        if (lto_mode) {
+
+          cc_params[cc_par_cnt++] =
+              alloc_printf("%s/afl-llvm-rt-lto-32.o", obj_path);
+          if (access(cc_params[cc_par_cnt - 1], R_OK))
+            FATAL("-m32 is not supported by your compiler");
+
+        }
+
+        break;
+
+      case 64:
+        cc_params[cc_par_cnt++] =
+            alloc_printf("%s/afl-compiler-rt-64.o", obj_path);
+        if (access(cc_params[cc_par_cnt - 1], R_OK))
+          FATAL("-m64 is not supported by your compiler");
+        if (lto_mode) {
+
+          cc_params[cc_par_cnt++] =
+              alloc_printf("%s/afl-llvm-rt-lto-64.o", obj_path);
+          if (access(cc_params[cc_par_cnt - 1], R_OK))
+            FATAL("-m64 is not supported by your compiler");
+
+        }
+
+        break;
+
+    }
+
+  #ifndef __APPLE__
+    if (!shared_linking)
+      cc_params[cc_par_cnt++] =
+          alloc_printf("-Wl,--dynamic-list=%s/dynamic_list.txt", obj_path);
+  #endif
+
+  }
+
+#endif
+
+  cc_params[cc_par_cnt] = NULL;
+
+}
+
+/* Main entry point */
+
+int main(int argc, char **argv, char **envp) {
+
+  int   i;
+  char *callname = argv[0], *ptr = NULL;
+
+  if (getenv("AFL_DEBUG")) {
+
+    debug = 1;
+    if (strcmp(getenv("AFL_DEBUG"), "0") == 0) unsetenv("AFL_DEBUG");
+
+  } else if (getenv("AFL_QUIET"))
+
+    be_quiet = 1;
+
+  if ((ptr = strrchr(callname, '/')) != NULL) callname = ptr + 1;
+  argvnull = (u8 *)argv[0];
+  check_environment_vars(envp);
+
+  if ((ptr = find_object("as", argv[0])) != NULL) {
+
+    have_gcc = 1;
+    ck_free(ptr);
+
+  }
+
+#if (LLVM_MAJOR > 2)
+
+  if ((ptr = find_object("SanitizerCoverageLTO.so", argv[0])) != NULL) {
+
+    have_lto = 1;
+    ck_free(ptr);
+
+  }
+
+  if ((ptr = find_object("cmplog-routines-pass.so", argv[0])) != NULL) {
+
+    have_llvm = 1;
+    ck_free(ptr);
+
+  }
+
+#endif
+
+  if ((ptr = find_object("afl-gcc-pass.so", argv[0])) != NULL) {
+
+    have_gcc_plugin = 1;
+    ck_free(ptr);
+
+  }
+
+#if (LLVM_MAJOR > 2)
+
+  if (strncmp(callname, "afl-clang-fast", 14) == 0) {
+
+    compiler_mode = LLVM;
+
+  } else if (strncmp(callname, "afl-clang-lto", 13) == 0 ||
+
+             strncmp(callname, "afl-lto", 7) == 0) {
+
+    compiler_mode = LTO;
+
+  } else
+
+#endif
+      if (strncmp(callname, "afl-gcc-fast", 12) == 0 ||
+
+          strncmp(callname, "afl-g++-fast", 12) == 0) {
+
+    compiler_mode = GCC_PLUGIN;
+
+  } else if (strncmp(callname, "afl-gcc", 7) == 0 ||
+
+             strncmp(callname, "afl-g++", 7) == 0) {
+
+    compiler_mode = GCC;
+
+  }
+
+  if ((ptr = getenv("AFL_CC_COMPILER"))) {
+
+    if (compiler_mode) {
+
+      WARNF(
+          "\"AFL_CC_COMPILER\" is set but a specific compiler was already "
+          "selected by command line parameter or symlink, ignoring the "
+          "environment variable!");
+
+    } else {
+
+      if (strncasecmp(ptr, "LTO", 3) == 0) {
+
+        compiler_mode = LTO;
+
+      } else if (strncasecmp(ptr, "LLVM", 4) == 0) {
+
+        compiler_mode = LLVM;
+
+      } else if (strncasecmp(ptr, "GCC_P", 5) == 0 ||
+
+                 strncasecmp(ptr, "GCC-P", 5) == 0 ||
+                 strncasecmp(ptr, "GCCP", 4) == 0) {
+
+        compiler_mode = GCC_PLUGIN;
+
+      } else if (strcasecmp(ptr, "GCC") == 0) {
+
+        compiler_mode = GCC;
+
+      } else
+
+        FATAL("Unknown AFL_CC_COMPILER mode: %s\n", ptr);
+
+    }
+
+  }
+
+  for (i = 1; i < argc; i++) {
+
+    if (strncmp(argv[i], "--afl", 5) == 0) {
+
+      if (compiler_mode)
+        WARNF(
+            "--afl-... compiler mode supersedes the AFL_CC_COMPILER and "
+            "symlink compiler selection!");
+
+      ptr = argv[i];
+      ptr += 5;
+      while (*ptr == '-')
+        ptr++;
+
+      if (strncasecmp(ptr, "LTO", 3) == 0) {
+
+        compiler_mode = LTO;
+
+      } else if (strncasecmp(ptr, "LLVM", 4) == 0) {
+
+        compiler_mode = LLVM;
+
+      } else if (strncasecmp(ptr, "GCC_P", 5) == 0 ||
+
+                 strncasecmp(ptr, "GCC-P", 5) == 0 ||
+                 strncasecmp(ptr, "GCCP", 4) == 0) {
+
+        compiler_mode = GCC_PLUGIN;
+
+      } else if (strcasecmp(ptr, "GCC") == 0) {
+
+        compiler_mode = GCC;
+
+      } else
+
+        FATAL("Unknown --afl-... compiler mode: %s\n", argv[i]);
+
+    }
+
+  }
+
+  if (strlen(callname) > 2 &&
+      (strncmp(callname + strlen(callname) - 2, "++", 2) == 0 ||
+       strstr(callname, "-g++") != NULL))
+    plusplus_mode = 1;
+
+  if (getenv("USE_TRACE_PC") || getenv("AFL_USE_TRACE_PC") ||
+      getenv("AFL_LLVM_USE_TRACE_PC") || getenv("AFL_TRACE_PC")) {
+
+    if (instrument_mode == 0)
+      instrument_mode = INSTRUMENT_PCGUARD;
+    else if (instrument_mode != INSTRUMENT_PCGUARD)
+      FATAL("you can not set AFL_LLVM_INSTRUMENT and AFL_TRACE_PC together");
+
+  }
+
+  if ((getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
+       getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
+       getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) &&
+      getenv("AFL_DONT_OPTIMIZE"))
+    WARNF(
+        "AFL_LLVM_ALLOWLIST/DENYLIST and AFL_DONT_OPTIMIZE cannot be combined "
+        "for file matching, only function matching!");
+
+  if (getenv("AFL_LLVM_INSTRIM") || getenv("INSTRIM") ||
+      getenv("INSTRIM_LIB")) {
+
+    if (instrument_mode == 0)
+      instrument_mode = INSTRUMENT_CFG;
+    else if (instrument_mode != INSTRUMENT_CFG)
+      FATAL(
+          "you can not set AFL_LLVM_INSTRUMENT and AFL_LLVM_INSTRIM together");
+
+  }
+
+  if (getenv("AFL_LLVM_CTX")) instrument_opt_mode |= INSTRUMENT_OPT_CTX;
+
+  if (getenv("AFL_LLVM_NGRAM_SIZE")) {
+
+    instrument_opt_mode |= INSTRUMENT_OPT_NGRAM;
+    ngram_size = atoi(getenv("AFL_LLVM_NGRAM_SIZE"));
+    if (ngram_size < 2 || ngram_size > NGRAM_SIZE_MAX)
+      FATAL(
+          "NGRAM size must be between 2 and NGRAM_SIZE_MAX (%u)",
+          NGRAM_SIZE_MAX);
+
+  }
+
+  if (getenv("AFL_LLVM_INSTRUMENT")) {
+
+    u8 *ptr = strtok(getenv("AFL_LLVM_INSTRUMENT"), ":,;");
+
+    while (ptr) {
+
+      if (strncasecmp(ptr, "afl", strlen("afl")) == 0 ||
+          strncasecmp(ptr, "classic", strlen("classic")) == 0) {
+
+        if (instrument_mode == INSTRUMENT_LTO) {
+
+          instrument_mode = INSTRUMENT_CLASSIC;
+          lto_mode = 1;
+
+        } else if (!instrument_mode || instrument_mode == INSTRUMENT_AFL)
+
+          instrument_mode = INSTRUMENT_AFL;
+        else
+          FATAL("main instrumentation mode already set with %s",
+                instrument_mode_string[instrument_mode]);
+
+      }
+
+      if (strncasecmp(ptr, "pc-guard", strlen("pc-guard")) == 0 ||
+          strncasecmp(ptr, "pcguard", strlen("pcguard")) == 0) {
+
+        if (!instrument_mode || instrument_mode == INSTRUMENT_PCGUARD)
+          instrument_mode = INSTRUMENT_PCGUARD;
+        else
+          FATAL("main instrumentation mode already set with %s",
+                instrument_mode_string[instrument_mode]);
+
+      }
+
+      if (strncasecmp(ptr, "cfg", strlen("cfg")) == 0 ||
+          strncasecmp(ptr, "instrim", strlen("instrim")) == 0) {
+
+        if (instrument_mode == INSTRUMENT_LTO) {
+
+          instrument_mode = INSTRUMENT_CFG;
+          lto_mode = 1;
+
+        } else if (!instrument_mode || instrument_mode == INSTRUMENT_CFG)
+
+          instrument_mode = INSTRUMENT_CFG;
+        else
+          FATAL("main instrumentation mode already set with %s",
+                instrument_mode_string[instrument_mode]);
+
+      }
+
+      if (strncasecmp(ptr, "lto", strlen("lto")) == 0) {
+
+        lto_mode = 1;
+        if (!instrument_mode || instrument_mode == INSTRUMENT_LTO)
+          instrument_mode = INSTRUMENT_LTO;
+        else if (instrument_mode != INSTRUMENT_CFG)
+          FATAL("main instrumentation mode already set with %s",
+                instrument_mode_string[instrument_mode]);
+
+      }
+
+      if (strncasecmp(ptr, "ctx", strlen("ctx")) == 0) {
+
+        instrument_opt_mode |= INSTRUMENT_OPT_CTX;
+        setenv("AFL_LLVM_CTX", "1", 1);
+
+      }
+
+      if (strncasecmp(ptr, "ngram", strlen("ngram")) == 0) {
+
+        ptr += strlen("ngram");
+        while (*ptr && (*ptr < '0' || *ptr > '9'))
+          ptr++;
+
+        if (!*ptr) {
+
+          if ((ptr = getenv("AFL_LLVM_NGRAM_SIZE")) == NULL)
+            FATAL(
+                "you must set the NGRAM size with (e.g. for value 2) "
+                "AFL_LLVM_INSTRUMENT=ngram-2");
+
+        }
+
+        ngram_size = atoi(ptr);
+        if (ngram_size < 2 || ngram_size > NGRAM_SIZE_MAX)
+          FATAL(
+              "NGRAM size must be between 2 and NGRAM_SIZE_MAX (%u)",
+              NGRAM_SIZE_MAX);
+        instrument_opt_mode |= (INSTRUMENT_OPT_NGRAM);
+        ptr = alloc_printf("%u", ngram_size);
+        setenv("AFL_LLVM_NGRAM_SIZE", ptr, 1);
+
+      }
+
+      ptr = strtok(NULL, ":,;");
+
+    }
+
+  }
+
+  if (!compiler_mode) {
+
+    // LTO is not the default because, outside of afl-cc, RANLIB and AR
+    // also have to be set to their llvm versions for LTO to work
+    if (have_llvm)
+      compiler_mode = LLVM;
+    else if (have_gcc_plugin)
+      compiler_mode = GCC_PLUGIN;
+    else if (have_gcc)
+      compiler_mode = GCC;
+    else if (have_lto)
+      compiler_mode = LTO;
+    else
+      FATAL("no compiler mode available");
+
+  }
+
+  if (argc < 2 || strncmp(argv[1], "-h", 2) == 0) {
+
+    printf("afl-cc" VERSION
+           " by Michal Zalewski, Laszlo Szekeres, Marc Heuse\n");
+
+    SAYF(
+        "\n"
+        "afl-cc/afl-c++ [options]\n"
+        "\n"
+        "This is a helper application for afl-fuzz. It serves as a drop-in "
+        "replacement\n"
+        "for gcc and clang, letting you recompile third-party code with the "
+        "required\n"
+        "runtime instrumentation. A common use pattern would be one of the "
+        "following:\n\n"
+
+        "  CC=afl-cc CXX=afl-c++ ./configure --disable-shared\n"
+        "  cmake -DCMAKE_C_COMPILERC=afl-cc -DCMAKE_CXX_COMPILER=afl-c++ .\n"
+        "  CC=afl-cc CXX=afl-c++ meson\n\n");
+
+    SAYF(
+        "                                     |---------------- FEATURES "
+        "---------------|\n"
+        "MODES:                                NCC PERSIST SNAP DICT   LAF "
+        "CMPLOG SELECT\n"
+        "  [LTO] llvm LTO:          %s%s\n"
+        "      PCGUARD              DEFAULT    yes yes     yes  yes    yes yes "
+        "   yes\n"
+        "      CLASSIC                         yes yes     yes  yes    yes yes "
+        "   yes\n"
+        "  [LLVM] llvm:             %s%s\n"
+        "      PCGUARD              %s    yes yes     yes  module yes yes    "
+        "extern\n"
+        "      CLASSIC              %s    no  yes     yes  module yes yes    "
+        "yes\n"
+        "        - NORMAL\n"
+        "        - CTX\n"
+        "        - NGRAM-{2-16}\n"
+        "      INSTRIM                         no  yes     yes  module yes yes "
+        "   yes\n"
+        "        - NORMAL\n"
+        "        - CTX\n"
+        "        - NGRAM-{2-16}\n"
+        "  [GCC_PLUGIN] gcc plugin: %s%s\n"
+        "      CLASSIC              DEFAULT    no  yes     yes  no     no  no  "
+        "   simple\n"
+        "  [GCC] simple gcc:        %s%s\n"
+        "      CLASSIC              DEFAULT    no  no      no   no     no  no  "
+        "   no\n\n",
+        have_lto ? "AVAILABLE" : "unavailable!",
+        compiler_mode == LTO ? " [SELECTED]" : "",
+        have_llvm ? "AVAILABLE" : "unavailable!",
+        compiler_mode == LLVM ? " [SELECTED]" : "",
+        LLVM_MAJOR > 6 ? "DEFAULT" : "       ",
+        LLVM_MAJOR > 6 ? "       " : "DEFAULT",
+        have_gcc_plugin ? "AVAILABLE" : "unavailable!",
+        compiler_mode == GCC_PLUGIN ? " [SELECTED]" : "",
+        have_gcc ? "AVAILABLE" : "unavailable!",
+        compiler_mode == GCC ? " [SELECTED]" : "");
+
+    SAYF(
+        "Modes:\n"
+        "  To select the compiler mode use a symlink version (e.g. "
+        "afl-clang-fast), set\n"
+        "  the environment variable AFL_CC_COMPILER to a mode (e.g. LLVM) or "
+        "use the\n"
+        "  command line parameter --afl-MODE (e.g. --afl-llvm). If none is "
+        "selected,\n"
+        "  afl-cc will select the best available (LLVM -> GCC_PLUGIN -> GCC).\n"
+        "  The best is LTO but it often needs RANLIB and AR settings outside "
+        "of afl-cc.\n\n");
+
+    SAYF(
+        "Sub-Modes: (set via env AFL_LLVM_INSTRUMENT, afl-cc selects the best "
+        "available)\n"
+        "  PCGUARD: Dominator tree instrumentation (best!) (README.llvm.md)\n"
+        "  CLASSIC: decision target instrumentation (README.llvm.md)\n"
+        "  CTX:     CLASSIC + callee context (instrumentation/README.ctx.md)\n"
+        "  NGRAM-x: CLASSIC + previous path "
+        "((instrumentation/README.ngram.md)\n"
+        "  INSTRIM: Dominator tree (for LLVM <= 6.0) "
+        "(instrumentation/README.instrim.md)\n\n");
+
+    SAYF(
+        "Features: (see documentation links)\n"
+        "  NCC:    non-colliding coverage [automatic] (that is an amazing "
+        "thing!)\n"
+        "          (instrumentation/README.lto.md)\n"
+        "  PERSIST: persistent mode support [code] (huge speed increase!)\n"
+        "          (instrumentation/README.persistent_mode.md)\n"
+        "  SNAP:   linux lkm snapshot module support [automatic] (speed "
+        "increase)\n"
+        "          (https://github.com/AFLplusplus/AFL-Snapshot-LKM/)\n"
+        "  DICT:   dictionary in the target [yes=automatic or llvm module "
+        "pass]\n"
+        "          (instrumentation/README.lto.md + "
+        "instrumentation/README.llvm.md)\n"
+        "  LAF:    comparison splitting [env] "
+        "(instrumentation/README.laf-intel.md)\n"
+        "  CMPLOG: input2state exploration [env] "
+        "(instrumentation/README.cmplog.md)\n"
+        "  SELECT: selective instrumentation (allow/deny) on filename or "
+        "function [env]\n"
+        "          (instrumentation/README.instrument_list.md)\n\n");
+
+    if (argc < 2 || strncmp(argv[1], "-hh", 3)) {
+
+      SAYF(
+          "To see all environment variables for the configuration of afl-cc "
+          "use \"-hh\".\n");
+
+    } else {
+
+      SAYF(
+          "Environment variables used:\n"
+          "  AFL_CC: path to the C compiler to use\n"
+          "  AFL_CXX: path to the C++ compiler to use\n"
+          "  AFL_DEBUG: enable developer debugging output\n"
+          "  AFL_DONT_OPTIMIZE: disable optimization instead of -O3\n"
+          "  AFL_HARDEN: adds code hardening to catch memory bugs\n"
+          "  AFL_INST_RATIO: percentage of branches to instrument\n"
+#if LLVM_MAJOR < 9
+          "  AFL_LLVM_NOT_ZERO: use cycling trace counters that skip zero\n"
+#else
+          "  AFL_LLVM_SKIP_NEVERZERO: do not skip zero on trace counters\n"
+#endif
+          "  AFL_LLVM_DICT2FILE: generate an afl dictionary based on found "
+          "comparisons\n"
+          "  AFL_LLVM_LAF_ALL: enables all LAF splits/transforms\n"
+          "  AFL_LLVM_LAF_SPLIT_COMPARES: enable cascaded comparisons\n"
+          "  AFL_LLVM_LAF_SPLIT_COMPARES_BITW: size limit (default 8)\n"
+          "  AFL_LLVM_LAF_SPLIT_SWITCHES: cascaded comparisons on switches\n"
+          "  AFL_LLVM_LAF_SPLIT_FLOATS: cascaded comparisons on floats\n"
+          "  AFL_LLVM_LAF_TRANSFORM_COMPARES: cascade comparisons for string "
+          "functions\n"
+          "  AFL_LLVM_INSTRUMENT_ALLOW/AFL_LLVM_INSTRUMENT_DENY: enable "
+          "instrument allow/\n"
+          "    deny listing (selective instrumentation)\n"
+          "  AFL_NO_BUILTIN: no builtins for string compare functions (for "
+          "libtokencap.so)\n"
+          "  AFL_PATH: path to instrumenting pass and runtime  "
+          "(afl-compiler-rt.*o)\n"
+          "  AFL_LLVM_DOCUMENT_IDS: document edge IDs given to which function "
+          "(LTO only)\n"
+          "  AFL_QUIET: suppress verbose output\n"
+          "  AFL_USE_ASAN: activate address sanitizer\n"
+          "  AFL_USE_CFISAN: activate control flow sanitizer\n"
+          "  AFL_USE_MSAN: activate memory sanitizer\n"
+          "  AFL_USE_UBSAN: activate undefined behaviour sanitizer\n",
+          BIN_PATH, BIN_PATH);
+
+      SAYF(
+          "\nLLVM/LTO/afl-clang-fast/afl-clang-lto specific environment "
+          "variables:\n"
+          "  AFL_LLVM_CMPLOG: log operands of comparisons (RedQueen mutator)\n"
+          "  AFL_LLVM_INSTRUMENT: set instrumentation mode: CLASSIC, INSTRIM, "
+          "PCGUARD, LTO, CTX, NGRAM-2 ... NGRAM-16\n"
+          " You can also use the old environment variables instead:\n"
+          "  AFL_LLVM_USE_TRACE_PC: use LLVM trace-pc-guard instrumentation\n"
+          "  AFL_LLVM_INSTRIM: use light weight instrumentation InsTrim\n"
+          "  AFL_LLVM_INSTRIM_LOOPHEAD: optimize loop tracing for speed "
+          "(option to INSTRIM)\n"
+          "  AFL_LLVM_CTX: use context sensitive coverage (for CLASSIC and "
+          "INSTRIM)\n"
+          "  AFL_LLVM_NGRAM_SIZE: use ngram prev_loc count coverage (for "
+          "CLASSIC and INSTRIM)\n");
+
+#ifdef AFL_CLANG_FLTO
+      SAYF(
+          "\nLTO/afl-clang-lto specific environment variables:\n"
+          "AFL_LLVM_MAP_ADDR: use a fixed coverage map address (speed), e.g. "
+          "0x10000\n"
+          "AFL_LLVM_DOCUMENT_IDS: write all edge IDs and the corresponding "
+          "functions they are in into this file\n"
+          "AFL_LLVM_LTO_DONTWRITEID: don't write the highest ID used to a "
+          "global var\n"
+          "AFL_LLVM_LTO_STARTID: from which ID to start counting from for a "
+          "bb\n"
+          "AFL_REAL_LD: use this lld linker instead of the compiled in path\n"
+          "\nafl-clang-lto was built with linker target \"%s\" and LTO flags "
+          "\"%s\"\n"
+          "If anything fails - be sure to read README.lto.md!\n",
+          AFL_REAL_LD, AFL_CLANG_FLTO);
+#endif
+
+    }
+
+    SAYF(
+        "For any information on the available instrumentations and options "
+        "please \n"
+        "consult the README.md, especially section 3.1 about instrumenting "
+        "targets.\n\n");
+
+#if (LLVM_MAJOR > 2)
+    if (have_lto)
+      SAYF("afl-cc LTO with ld=%s %s\n", AFL_REAL_LD, AFL_CLANG_FLTO);
+    if (have_llvm)
+      SAYF("afl-cc LLVM version %d with the the binary path \"%s\".\n",
+           LLVM_MAJOR, LLVM_BINDIR);
+    if (have_lto || have_llvm) SAYF("\n");
+#endif
+
+    SAYF(
+        "Do not be overwhelmed :) afl-cc uses good defaults if no options are "
+        "selected.\n"
+        "Read the documentation for FEATURES though, all are good but few are "
+        "defaults.\n\n");
+
+    exit(1);
+
+  }
+
+  if (compiler_mode == LTO) {
+
+    if (instrument_mode == 0 || instrument_mode == INSTRUMENT_LTO ||
+        instrument_mode == INSTRUMENT_CFG) {
+
+      lto_mode = 1;
+      if (!instrument_mode) {
+
+        instrument_mode = INSTRUMENT_CFG;
+        ptr = instrument_mode_string[instrument_mode];
+
+      }
+
+    } else if (instrument_mode == INSTRUMENT_CLASSIC) {
+
+      lto_mode = 1;
+
+    } else {
+
+      if (!be_quiet)
+        WARNF("afl-clang-lto called with mode %s, using that mode instead",
+              instrument_mode_string[instrument_mode]);
+
+    }
+
+  }
+
+  if (instrument_mode == 0 && compiler_mode < GCC_PLUGIN) {
+
+#if LLVM_MAJOR <= 6
+    instrument_mode = INSTRUMENT_AFL;
+#else
+    if (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
+        getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
+        getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")) {
+
+      instrument_mode = INSTRUMENT_AFL;
+      WARNF(
+          "switching to classic instrumentation because "
+          "AFL_LLVM_ALLOWLIST/DENYLIST does not work with PCGUARD. Use "
+          "-fsanitize-coverage-allowlist=allowlist.txt or "
+          "-fsanitize-coverage-blocklist=denylist.txt if you want to use "
+          "PCGUARD. Requires llvm 12+. See https://clang.llvm.org/docs/ "
+          "SanitizerCoverage.html#partially-disabling-instrumentation");
+
+    } else
+
+      instrument_mode = INSTRUMENT_PCGUARD;
+#endif
+
+  }
+
+  if (instrument_opt_mode && compiler_mode != LLVM)
+    FATAL("CTX and NGRAM can only be used in LLVM mode");
+
+  if (!instrument_opt_mode) {
+
+    if (lto_mode && instrument_mode == INSTRUMENT_CFG)
+      instrument_mode = INSTRUMENT_PCGUARD;
+    ptr = instrument_mode_string[instrument_mode];
+
+  } else {
+
+    if (instrument_opt_mode == INSTRUMENT_OPT_CTX)
+
+      ptr = alloc_printf("%s + CTX", instrument_mode_string[instrument_mode]);
+    else if (instrument_opt_mode == INSTRUMENT_OPT_NGRAM)
+      ptr = alloc_printf("%s + NGRAM-%u",
+                         instrument_mode_string[instrument_mode], ngram_size);
+    else
+      ptr = alloc_printf("%s + CTX + NGRAM-%u",
+                         instrument_mode_string[instrument_mode], ngram_size);
+
+  }
+
+#ifndef AFL_CLANG_FLTO
+  if (lto_mode)
+    FATAL(
+        "instrumentation mode LTO specified but LLVM support not available "
+        "(requires LLVM 11 or higher)");
+#endif
+
+  if (instrument_opt_mode && instrument_mode != INSTRUMENT_CLASSIC &&
+      instrument_mode != INSTRUMENT_CFG)
+    FATAL(
+        "CTX and NGRAM instrumentation options can only be used with CFG "
+        "(recommended) and CLASSIC instrumentation modes!");
+
+  if (getenv("AFL_LLVM_SKIP_NEVERZERO") && getenv("AFL_LLVM_NOT_ZERO"))
+    FATAL(
+        "AFL_LLVM_NOT_ZERO and AFL_LLVM_SKIP_NEVERZERO can not be set "
+        "together");
+
+  if (instrument_mode == INSTRUMENT_PCGUARD &&
+      (getenv("AFL_LLVM_INSTRUMENT_FILE") != NULL ||
+       getenv("AFL_LLVM_WHITELIST") || getenv("AFL_LLVM_ALLOWLIST") ||
+       getenv("AFL_LLVM_DENYLIST") || getenv("AFL_LLVM_BLOCKLIST")))
+    FATAL(
+        "Instrumentation type PCGUARD does not support "
+        "AFL_LLVM_ALLOWLIST/DENYLIST! Use "
+        "-fsanitize-coverage-allowlist=allowlist.txt or "
+        "-fsanitize-coverage-blocklist=denylist.txt instead (requires llvm "
+        "12+), see "
+        "https://clang.llvm.org/docs/"
+        "SanitizerCoverage.html#partially-disabling-instrumentation");
+
+  u8 *ptr2;
+
+  if ((ptr2 = getenv("AFL_LLVM_DICT2FILE")) != NULL && *ptr2 != '/')
+    FATAL("AFL_LLVM_DICT2FILE must be set to an absolute file path");
+
+  if ((isatty(2) && !be_quiet) || debug) {
+
+    SAYF(cCYA
+         "afl-cc " VERSION cRST
+         " by Michal Zalewski, Laszlo Szekeres, Marc Heuse - mode: %s-%s\n",
+         compiler_mode_string[compiler_mode], ptr);
+
+  }
+
+  if (!be_quiet && !lto_mode &&
+      ((ptr2 = getenv("AFL_MAP_SIZE")) || (ptr2 = getenv("AFL_MAPSIZE")))) {
+
+    u32 map_size = atoi(ptr2);
+    if (map_size != MAP_SIZE)
+      WARNF("AFL_MAP_SIZE is not supported by afl-clang-fast");
+
+  }
+
+  if (debug) {
+
+    SAYF(cMGN "[D]" cRST " cd \"%s\";", getthecwd());
+    for (i = 0; i < argc; i++)
+      SAYF(" \"%s\"", argv[i]);
+    SAYF("\n");
+
+  }
+
+  if (getenv("AFL_LLVM_LAF_ALL")) {
+
+    setenv("AFL_LLVM_LAF_SPLIT_SWITCHES", "1", 1);
+    setenv("AFL_LLVM_LAF_SPLIT_COMPARES", "1", 1);
+    setenv("AFL_LLVM_LAF_SPLIT_FLOATS", "1", 1);
+    setenv("AFL_LLVM_LAF_TRANSFORM_COMPARES", "1", 1);
+
+  }
+
+  cmplog_mode = getenv("AFL_CMPLOG") || getenv("AFL_LLVM_CMPLOG");
+  if (!be_quiet && cmplog_mode)
+    printf("CmpLog mode by <andreafioraldi@gmail.com>\n");
+
+#ifndef __ANDROID__
+  find_obj(argv[0]);
+#endif
+
+  edit_params(argc, argv, envp);
+
+  if (debug) {
+
+    SAYF(cMGN "[D]" cRST " cd \"%s\";", getthecwd());
+    for (i = 0; i < cc_par_cnt; i++)
+      SAYF(" \"%s\"", cc_params[i]);
+    SAYF("\n");
+
+  }
+
+  execvp(cc_params[0], (char **)cc_params);
+
+  FATAL("Oops, failed to execute '%s' - check your PATH", cc_params[0]);
+
+  return 0;
+
+}
+
diff --git a/src/afl-common.c b/src/afl-common.c
index 367dec72..d66440aa 100644
--- a/src/afl-common.c
+++ b/src/afl-common.c
@@ -877,3 +877,36 @@ u32 get_map_size(void) {
 
 }
 
+/* Create a stream file */
+
+FILE *create_ffile(u8 *fn) {
+
+  s32   fd;
+  FILE *f;
+
+  fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+
+  if (fd < 0) { PFATAL("Unable to create '%s'", fn); }
+
+  f = fdopen(fd, "w");
+
+  if (!f) { PFATAL("fdopen() failed"); }
+
+  return f;
+
+}
+
+/* Create a file */
+
+s32 create_file(u8 *fn) {
+
+  s32 fd;
+
+  fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+
+  if (fd < 0) { PFATAL("Unable to create '%s'", fn); }
+
+  return fd;
+
+}
+
diff --git a/src/afl-forkserver.c b/src/afl-forkserver.c
index 93203cb2..58932bc4 100644
--- a/src/afl-forkserver.c
+++ b/src/afl-forkserver.c
@@ -240,6 +240,23 @@ static void afl_fauxsrv_execv(afl_forkserver_t *fsrv, char **argv) {
 
     if (!child_pid) {  // New child
 
+      close(fsrv->out_dir_fd);
+      close(fsrv->dev_null_fd);
+      close(fsrv->dev_urandom_fd);
+
+      if (fsrv->plot_file != NULL) {
+
+        fclose(fsrv->plot_file);
+        fsrv->plot_file = NULL;
+
+      }
+
+      // enable terminating on SIGPIPE in the children
+      struct sigaction sa;
+      memset((char *)&sa, 0, sizeof(sa));
+      sa.sa_handler = SIG_DFL;
+      sigaction(SIGPIPE, &sa, NULL);
+
       signal(SIGCHLD, old_sigchld_handler);
       // FORKSRV_FD is for communication with AFL, we don't need it in the
       // child.
@@ -361,11 +378,16 @@ void afl_fsrv_start(afl_forkserver_t *fsrv, char **argv,
 
     /* CHILD PROCESS */
 
+    // enable terminating on SIGPIPE in the children
+    struct sigaction sa;
+    memset((char *)&sa, 0, sizeof(sa));
+    sa.sa_handler = SIG_DFL;
+    sigaction(SIGPIPE, &sa, NULL);
+
     struct rlimit r;
 
     /* Umpf. On OpenBSD, the default fd limit for root users is set to
        soft 128. Let's try to fix that... */
-
     if (!getrlimit(RLIMIT_NOFILE, &r) && r.rlim_cur < FORKSRV_FD + 2) {
 
       r.rlim_cur = FORKSRV_FD + 2;
@@ -432,7 +454,12 @@ void afl_fsrv_start(afl_forkserver_t *fsrv, char **argv,
     close(fsrv->dev_null_fd);
     close(fsrv->dev_urandom_fd);
 
-    if (fsrv->plot_file != NULL) { fclose(fsrv->plot_file); }
+    if (fsrv->plot_file != NULL) {
+
+      fclose(fsrv->plot_file);
+      fsrv->plot_file = NULL;
+
+    }
 
     /* This should improve performance a bit, since it stops the linker from
        doing extra work post-fork(). */
diff --git a/src/afl-fuzz-extras.c b/src/afl-fuzz-extras.c
index d6c368d1..58ce5b6f 100644
--- a/src/afl-fuzz-extras.c
+++ b/src/afl-fuzz-extras.c
@@ -101,7 +101,8 @@ void load_extras_file(afl_state_t *afl, u8 *fname, u32 *min_len, u32 *max_len,
 
     if (rptr < lptr || *rptr != '"') {
 
-      FATAL("Malformed name=\"value\" pair in line %u.", cur_line);
+      WARNF("Malformed name=\"value\" pair in line %u.", cur_line);
+      continue;
 
     }
 
@@ -141,13 +142,19 @@ void load_extras_file(afl_state_t *afl, u8 *fname, u32 *min_len, u32 *max_len,
 
     if (*lptr != '"') {
 
-      FATAL("Malformed name=\"keyword\" pair in line %u.", cur_line);
+      WARNF("Malformed name=\"keyword\" pair in line %u.", cur_line);
+      continue;
 
     }
 
     ++lptr;
 
-    if (!*lptr) { FATAL("Empty keyword in line %u.", cur_line); }
+    if (!*lptr) {
+
+      WARNF("Empty keyword in line %u.", cur_line);
+      continue;
+
+    }
 
     /* Okay, let's allocate memory and copy data between "...", handling
        \xNN escaping, \\, and \". */
@@ -169,7 +176,9 @@ void load_extras_file(afl_state_t *afl, u8 *fname, u32 *min_len, u32 *max_len,
 
         case 1 ... 31:
         case 128 ... 255:
-          FATAL("Non-printable characters in line %u.", cur_line);
+          WARNF("Non-printable characters in line %u.", cur_line);
+          continue;
+          break;
 
         case '\\':
 
@@ -185,7 +194,8 @@ void load_extras_file(afl_state_t *afl, u8 *fname, u32 *min_len, u32 *max_len,
 
           if (*lptr != 'x' || !isxdigit(lptr[1]) || !isxdigit(lptr[2])) {
 
-            FATAL("Invalid escaping (not \\xNN) in line %u.", cur_line);
+            WARNF("Invalid escaping (not \\xNN) in line %u.", cur_line);
+            continue;
 
           }
 
@@ -209,10 +219,11 @@ void load_extras_file(afl_state_t *afl, u8 *fname, u32 *min_len, u32 *max_len,
 
     if (afl->extras[afl->extras_cnt].len > MAX_DICT_FILE) {
 
-      FATAL(
+      WARNF(
           "Keyword too big in line %u (%s, limit is %s)", cur_line,
           stringify_mem_size(val_bufs[0], sizeof(val_bufs[0]), klen),
           stringify_mem_size(val_bufs[1], sizeof(val_bufs[1]), MAX_DICT_FILE));
+      continue;
 
     }
 
@@ -232,14 +243,19 @@ static void extras_check_and_sort(afl_state_t *afl, u32 min_len, u32 max_len,
 
   u8 val_bufs[2][STRINGIFY_VAL_SIZE_MAX];
 
-  if (!afl->extras_cnt) { FATAL("No usable files in '%s'", dir); }
+  if (!afl->extras_cnt) {
+
+    WARNF("No usable data in '%s'", dir);
+    return;
+
+  }
 
   qsort(afl->extras, afl->extras_cnt, sizeof(struct extra_data),
         compare_extras_len);
 
-  OKF("Loaded %u extra tokens, size range %s to %s.", afl->extras_cnt,
-      stringify_mem_size(val_bufs[0], sizeof(val_bufs[0]), min_len),
-      stringify_mem_size(val_bufs[1], sizeof(val_bufs[1]), max_len));
+  ACTF("Loaded %u extra tokens, size range %s to %s.", afl->extras_cnt,
+       stringify_mem_size(val_bufs[0], sizeof(val_bufs[0]), min_len),
+       stringify_mem_size(val_bufs[1], sizeof(val_bufs[1]), max_len));
 
   if (max_len > 32) {
 
@@ -250,8 +266,8 @@ static void extras_check_and_sort(afl_state_t *afl, u32 min_len, u32 max_len,
 
   if (afl->extras_cnt > afl->max_det_extras) {
 
-    OKF("More than %d tokens - will use them probabilistically.",
-        afl->max_det_extras);
+    WARNF("More than %d tokens - will use them probabilistically.",
+          afl->max_det_extras);
 
   }
 
@@ -320,9 +336,10 @@ void load_extras(afl_state_t *afl, u8 *dir) {
     if (st.st_size > MAX_DICT_FILE) {
 
       WARNF(
-          "Extra '%s' is very big (%s, limit is %s)", fn,
+          "Extra '%s' is too big (%s, limit is %s)", fn,
           stringify_mem_size(val_bufs[0], sizeof(val_bufs[0]), st.st_size),
           stringify_mem_size(val_bufs[1], sizeof(val_bufs[1]), MAX_DICT_FILE));
+      continue;
 
     }
 
@@ -370,16 +387,74 @@ static inline u8 memcmp_nocase(u8 *m1, u8 *m2, u32 len) {
 
 }
 
-/* Adds a new extra / dict entry. Used for LTO autodict. */
+/* Removes duplicates from the loaded extras. This can happen if multiple files
+   are loaded */
+
+void dedup_extras(afl_state_t *afl) {
+
+  if (afl->extras_cnt < 2) return;
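+
+  /* note: the loops below rely on the extras list being sorted by token
+     length (see the qsort in extras_check_and_sort), so all candidate
+     duplicates of an entry are adjacent in length */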
+
+  u32 i, j, orig_cnt = afl->extras_cnt;
+
+  for (i = 0; i < afl->extras_cnt - 1; i++) {
+
+    for (j = i + 1; j < afl->extras_cnt; j++) {
+
+    restart_dedup:
+
+      // if the goto was used we could be at the end of the list
+      if (j >= afl->extras_cnt || afl->extras[i].len != afl->extras[j].len)
+        break;
+
+      if (memcmp(afl->extras[i].data, afl->extras[j].data,
+                 afl->extras[i].len) == 0) {
+
+        ck_free(afl->extras[j].data);
+        if (j + 1 < afl->extras_cnt)  // not at the end of the list?
+          memmove((char *)&afl->extras[j], (char *)&afl->extras[j + 1],
+                  (afl->extras_cnt - j - 1) * sizeof(struct extra_data));
+        afl->extras_cnt--;
+        goto restart_dedup;  // restart if several duplicates are in a row
+
+      }
+
+    }
+
+  }
+
+  if (afl->extras_cnt != orig_cnt)
+    afl->extras = afl_realloc((void **)&afl->extras,
+                              afl->extras_cnt * sizeof(struct extra_data));
+
+}
+
+/* Adds a new extra / dict entry. */
 void add_extra(afl_state_t *afl, u8 *mem, u32 len) {
 
-  u8 val_bufs[2][STRINGIFY_VAL_SIZE_MAX];
+  u8  val_bufs[2][STRINGIFY_VAL_SIZE_MAX];
+  u32 i, found = 0;
+
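+  /* scan for an identical existing entry first; the early break below
+     relies on the list being kept sorted by token length */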
+  for (i = 0; i < afl->extras_cnt; i++) {
+
+    if (afl->extras[i].len == len) {
+
+      if (memcmp(afl->extras[i].data, mem, len) == 0) return;
+      found = 1;
+
+    } else {
+
+      if (found) break;
+
+    }
+
+  }
 
   if (len > MAX_DICT_FILE) {
 
-    WARNF("Extra '%.*s' is very big (%s, limit is %s)", (int)len, mem,
+    WARNF("Extra '%.*s' is too big (%s, limit is %s)", (int)len, mem,
           stringify_mem_size(val_bufs[0], sizeof(val_bufs[0]), len),
           stringify_mem_size(val_bufs[1], sizeof(val_bufs[1]), MAX_DICT_FILE));
+    return;
 
   } else if (len > 32) {
 
@@ -405,8 +480,8 @@ void add_extra(afl_state_t *afl, u8 *mem, u32 len) {
 
   if (afl->extras_cnt == afl->max_det_extras + 1) {
 
-    OKF("More than %d tokens - will use them probabilistically.",
-        afl->max_det_extras);
+    WARNF("More than %d tokens - will use them probabilistically.",
+          afl->max_det_extras);
 
   }
 
@@ -609,7 +684,7 @@ void load_auto(afl_state_t *afl) {
 
   } else {
 
-    OKF("No auto-generated dictionary tokens to reuse.");
+    ACTF("No auto-generated dictionary tokens to reuse.");
 
   }
 
diff --git a/src/afl-fuzz-init.c b/src/afl-fuzz-init.c
index 102f04b9..c834e5db 100644
--- a/src/afl-fuzz-init.c
+++ b/src/afl-fuzz-init.c
@@ -611,37 +611,43 @@ void read_foreign_testcases(afl_state_t *afl, int first) {
 /* Read all testcases from the input directory, then queue them for testing.
    Called at startup. */
 
-void read_testcases(afl_state_t *afl) {
+void read_testcases(afl_state_t *afl, u8 *directory) {
 
   struct dirent **nl;
-  s32             nl_cnt;
+  s32             nl_cnt, subdirs = 1;
   u32             i;
-  u8 *            fn1;
-
-  u8 val_buf[2][STRINGIFY_VAL_SIZE_MAX];
+  u8 *            fn1, *dir = directory;
+  u8              val_buf[2][STRINGIFY_VAL_SIZE_MAX];
 
   /* Auto-detect non-in-place resumption attempts. */
 
-  fn1 = alloc_printf("%s/queue", afl->in_dir);
-  if (!access(fn1, F_OK)) {
+  if (dir == NULL) {
 
-    afl->in_dir = fn1;
+    fn1 = alloc_printf("%s/queue", afl->in_dir);
+    if (!access(fn1, F_OK)) {
 
-  } else {
+      afl->in_dir = fn1;
+      subdirs = 0;
 
-    ck_free(fn1);
+    } else {
+
+      ck_free(fn1);
+
+    }
+
+    dir = afl->in_dir;
 
   }
 
-  ACTF("Scanning '%s'...", afl->in_dir);
+  ACTF("Scanning '%s'...", dir);
 
   /* We use scandir() + alphasort() rather than readdir() because otherwise,
      the ordering of test cases would vary somewhat randomly and would be
      difficult to control. */
 
-  nl_cnt = scandir(afl->in_dir, &nl, NULL, alphasort);
+  nl_cnt = scandir(dir, &nl, NULL, alphasort);
 
-  if (nl_cnt < 0) {
+  if (nl_cnt < 0 && directory == NULL) {
 
     if (errno == ENOENT || errno == ENOTDIR) {
 
@@ -656,7 +662,7 @@ void read_testcases(afl_state_t *afl) {
 
     }
 
-    PFATAL("Unable to open '%s'", afl->in_dir);
+    PFATAL("Unable to open '%s'", dir);
 
   }
 
@@ -674,19 +680,29 @@ void read_testcases(afl_state_t *afl) {
     u8 dfn[PATH_MAX];
     snprintf(dfn, PATH_MAX, "%s/.state/deterministic_done/%s", afl->in_dir,
              nl[i]->d_name);
-    u8 *fn2 = alloc_printf("%s/%s", afl->in_dir, nl[i]->d_name);
+    u8 *fn2 = alloc_printf("%s/%s", dir, nl[i]->d_name);
 
     u8 passed_det = 0;
 
-    free(nl[i]);                                             /* not tracked */
-
     if (lstat(fn2, &st) || access(fn2, R_OK)) {
 
       PFATAL("Unable to access '%s'", fn2);
 
     }
 
-    /* This also takes care of . and .. */
+    /* We obviously want to skip "descending" into . and .. directories;
+       it is also a good idea to skip any other directory that starts with
+       a dot */
+    if (subdirs && S_ISDIR(st.st_mode) && nl[i]->d_name[0] != '.') {
+
+      free(nl[i]);                                           /* not tracked */
+      read_testcases(afl, fn2);
+      ck_free(fn2);
+      continue;
+
+    }
+
+    free(nl[i]);
 
     if (!S_ISREG(st.st_mode) || !st.st_size || strstr(fn2, "/README.txt")) {
 
@@ -718,7 +734,7 @@ void read_testcases(afl_state_t *afl) {
 
   free(nl);                                                  /* not tracked */
 
-  if (!afl->queued_paths) {
+  if (!afl->queued_paths && directory == NULL) {
 
     SAYF("\n" cLRD "[-] " cRST
          "Looks like there are no valid test cases in the input directory! The "
@@ -985,6 +1001,76 @@ void perform_dry_run(afl_state_t *afl) {
 
   }
 
+  /* Now we remove all entries from the queue that have a duplicate trace map */
+
+  q = afl->queue;
+  struct queue_entry *p, *prev = NULL;
+  int                 duplicates = 0;
+
+restart_outer_cull_loop:
+
+  while (q) {
+
+    if (q->cal_failed || !q->exec_cksum) {
+
+      prev = q;
+      q = q->next;
+      continue;
+
+    }
+
+  restart_inner_cull_loop:
+
+    p = q->next;
+
+    while (p) {
+
+      if (!p->cal_failed && p->exec_cksum == q->exec_cksum) {
+
+        duplicates = 1;
+        --afl->pending_not_fuzzed;
+
+        // We do not remove any of the memory allocated because for
+        // splicing the data might still be interesting.
+        // We only decouple them from the linked list.
+        // This will result in some leaks at exit, but who cares.
+
+        // we keep the shorter file
+        if (p->len >= q->len) {
+
+          q->next = p->next;
+          goto restart_inner_cull_loop;
+
+        } else {
+
+          if (prev)
+            prev->next = q = p;
+          else
+            afl->queue = q = p;
+          goto restart_outer_cull_loop;
+
+        }
+
+      }
+
+      p = p->next;
+
+    }
+
+    prev = q;
+    q = q->next;
+
+  }
+
+  if (duplicates) {
+
+    // entries were decoupled from the list, so recompute max_depth and
+    // re-anchor queue_top at the actual tail of the list
+    afl->max_depth = 0;
+    q = afl->queue;
+    while (q) {
+
+      if (q->depth > afl->max_depth) afl->max_depth = q->depth;
+      if (!q->next) afl->queue_top = q;
+      q = q->next;
+
+    }
+
+  }
+
   OKF("All test cases processed.");
 
 }
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index bf568c38..5737c1f5 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -1707,20 +1707,8 @@ custom_mutator_stage:
 
           } while (tid == afl->current_entry && afl->queued_paths > 1);
 
-          target = afl->queue;
-
-          while (tid >= 100) {
-
-            target = target->next_100;
-            tid -= 100;
-
-          }
-
-          while (tid--) {
-
-            target = target->next;
-
-          }
+          afl->splicing_with = tid;
+          target = afl->queue_buf[tid];
 
           /* Make sure that the target has a reasonable length. */
 
@@ -4518,20 +4506,7 @@ pacemaker_fuzzing:
         } while (tid == afl->current_entry);
 
         afl->splicing_with = tid;
-        target = afl->queue;
-
-        while (tid >= 100) {
-
-          target = target->next_100;
-          tid -= 100;
-
-        }
-
-        while (tid--) {
-
-          target = target->next;
-
-        }
+        target = afl->queue_buf[tid];
 
         /* Make sure that the target has a reasonable length. */
 
diff --git a/src/afl-fuzz-queue.c b/src/afl-fuzz-queue.c
index 0c472845..8c7bfc55 100644
--- a/src/afl-fuzz-queue.c
+++ b/src/afl-fuzz-queue.c
@@ -138,9 +138,9 @@ static u8 check_if_text(struct queue_entry *q) {
     }
 
     // non-overlong 2-byte
-    if (((0xC2 <= buf[offset + 0] && buf[offset + 0] <= 0xDF) &&
-         (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF)) &&
-        len - offset > 1) {
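+    // length check first so buf[offset + 1] is never read out of bounds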
+    if (len - offset > 1 &&
+        ((0xC2 <= buf[offset + 0] && buf[offset + 0] <= 0xDF) &&
+         (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF))) {
 
       offset += 2;
       utf8++;
@@ -230,7 +230,7 @@ void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
 
   } else {
 
-    afl->q_prev100 = afl->queue = afl->queue_top = q;
+    afl->queue = afl->queue_top = q;
 
   }
 
@@ -239,13 +239,6 @@ void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
 
   afl->cycles_wo_finds = 0;
 
-  if (!(afl->queued_paths % 100)) {
-
-    afl->q_prev100->next_100 = q;
-    afl->q_prev100 = q;
-
-  }
-
   struct queue_entry **queue_buf = afl_realloc(
       AFL_BUF_PARAM(queue), afl->queued_paths * sizeof(struct queue_entry *));
   if (unlikely(!queue_buf)) { PFATAL("alloc"); }
@@ -281,15 +274,15 @@ void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
 
 void destroy_queue(afl_state_t *afl) {
 
-  struct queue_entry *q = afl->queue, *n;
+  struct queue_entry *q;
+  u32                 i;
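+  /* Walk the flat queue_buf array rather than the linked list, so that
+     entries unlinked during culling are freed as well. */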
 
-  while (q) {
+  for (i = 0; i < afl->queued_paths; i++) {
 
-    n = q->next;
+    q = afl->queue_buf[i];
     ck_free(q->fname);
     ck_free(q->trace_mini);
     ck_free(q);
-    q = n;
 
   }
 
diff --git a/src/afl-fuzz-redqueen.c b/src/afl-fuzz-redqueen.c
index 73d00f9a..9a9ac33f 100644
--- a/src/afl-fuzz-redqueen.c
+++ b/src/afl-fuzz-redqueen.c
@@ -265,7 +265,7 @@ static u8 its_fuzz(afl_state_t *afl, u8 *buf, u32 len, u8 *status) {
 }
 
 static int strntoll(const char *str, size_t sz, char **end, int base,
-                    long long* out) {
+                    long long *out) {
 
   char        buf[64];
   long long   ret;
@@ -273,16 +273,13 @@ static int strntoll(const char *str, size_t sz, char **end, int base,
 
   for (; beg && sz && *beg == ' '; beg++, sz--) {};
 
-  if (!sz)
-    return 1;
-  if (sz >= sizeof(buf))
-    sz = sizeof(buf) -1;
+  if (!sz) return 1;
+  if (sz >= sizeof(buf)) sz = sizeof(buf) - 1;
 
   memcpy(buf, beg, sz);
   buf[sz] = '\0';
   ret = strtoll(buf, end, base);
-  if ((ret == LLONG_MIN || ret == LLONG_MAX) && errno == ERANGE)
-    return 1;
+  if ((ret == LLONG_MIN || ret == LLONG_MAX) && errno == ERANGE) return 1;
   if (end) *end = (char *)beg + (*end - buf);
   *out = ret;
 
@@ -291,7 +288,7 @@ static int strntoll(const char *str, size_t sz, char **end, int base,
 }
 
 static int strntoull(const char *str, size_t sz, char **end, int base,
-                     unsigned long long* out) {
+                     unsigned long long *out) {
 
   char               buf[64];
   unsigned long long ret;
@@ -300,16 +297,13 @@ static int strntoull(const char *str, size_t sz, char **end, int base,
   for (; beg && sz && *beg == ' '; beg++, sz--)
     ;
 
-  if (!sz)
-    return 1;
-  if (sz >= sizeof(buf))
-    sz = sizeof(buf) -1;
+  if (!sz) return 1;
+  if (sz >= sizeof(buf)) sz = sizeof(buf) - 1;
 
   memcpy(buf, beg, sz);
   buf[sz] = '\0';
   ret = strtoull(buf, end, base);
-  if (ret == ULLONG_MAX && errno == ERANGE)
-    return 1;
+  if (ret == ULLONG_MAX && errno == ERANGE) return 1;
   if (end) *end = (char *)beg + (*end - buf);
   *out = ret;
 
@@ -350,6 +344,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         use_unum = 1;
 
     } else
+
       use_num = 1;
 
   }
diff --git a/src/afl-fuzz-stats.c b/src/afl-fuzz-stats.c
index 0ce35cb7..dfc0cd97 100644
--- a/src/afl-fuzz-stats.c
+++ b/src/afl-fuzz-stats.c
@@ -24,8 +24,65 @@
  */
 
 #include "afl-fuzz.h"
+#include "envs.h"
 #include <limits.h>
 
+/* Write the fuzzer_setup file, recording the environment variables and
+   the exact command line in use, so a run can be reproduced. */
+
+void write_setup_file(afl_state_t *afl, u32 argc, char **argv) {
+
+  char *val;
+  u8    fn[PATH_MAX];
+  snprintf(fn, PATH_MAX, "%s/fuzzer_setup", afl->out_dir);
+  FILE *f = create_ffile(fn);
+  u32   i;
+
+  fprintf(f, "# environment variables:\n");
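+  /* afl_environment_variables is a NULL-terminated list, hence the -1
+     when computing its length. */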
+  u32 s_afl_env = (u32)sizeof(afl_environment_variables) /
+                      sizeof(afl_environment_variables[0]) -
+                  1U;
+
+  for (i = 0; i < s_afl_env; ++i) {
+
+    if ((val = getenv(afl_environment_variables[i])) != NULL) {
+
+      fprintf(f, "%s=%s\n", afl_environment_variables[i], val);
+
+    }
+
+  }
+
+  fprintf(f, "# command line:\n");
+
+  size_t j;
+  for (i = 0; i < argc; ++i) {
+
+    if (i) fprintf(f, " ");
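+    /* Single-quote each argument; an embedded single quote is emitted
+       as the POSIX shell idiom '"'"' so the line replays verbatim. */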
+    if (strchr(argv[i], '\'')) {
+
+      fprintf(f, "'");
+      for (j = 0; j < strlen(argv[i]); j++)
+        if (argv[i][j] == '\'')
+          fprintf(f, "'\"'\"'");
+        else
+          fprintf(f, "%c", argv[i][j]);
+      fprintf(f, "'");
+
+    } else {
+
+      fprintf(f, "'%s'", argv[i]);
+
+    }
+
+  }
+
+  fprintf(f, "\n");
+
+  fclose(f);
+  (void)(afl_environment_deprecated);          /* silence unused warning */
+
+}
+
 /* Update stats file for unattended monitoring. */
 
 void write_stats_file(afl_state_t *afl, double bitmap_cvg, double stability,
@@ -35,21 +92,13 @@ void write_stats_file(afl_state_t *afl, double bitmap_cvg, double stability,
   struct rusage rus;
 #endif
 
-  unsigned long long int cur_time = get_cur_time();
-  u8                     fn[PATH_MAX];
-  s32                    fd;
-  FILE *                 f;
-  u32                    t_bytes = count_non_255_bytes(afl, afl->virgin_bits);
+  u64   cur_time = get_cur_time();
+  u32   t_bytes = count_non_255_bytes(afl, afl->virgin_bits);
+  u8    fn[PATH_MAX];
+  FILE *f;
 
   snprintf(fn, PATH_MAX, "%s/fuzzer_stats", afl->out_dir);
-
-  fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0600);
-
-  if (fd < 0) { PFATAL("Unable to create '%s'", fn); }
-
-  f = fdopen(fd, "w");
-
-  if (!f) { PFATAL("fdopen() failed"); }
+  f = create_ffile(fn);
 
   /* Keep last values in case we're called from another context
      where exec/sec stats and such are not readily available. */
@@ -163,11 +212,12 @@ void write_stats_file(afl_state_t *afl, double bitmap_cvg, double stability,
               ? ""
               : "default",
           afl->orig_cmdline);
+
   /* ignore errors */
 
   if (afl->debug) {
 
-    uint32_t i = 0;
+    u32 i = 0;
     fprintf(f, "virgin_bytes     :");
     for (i = 0; i < afl->fsrv.map_size; i++) {
 
@@ -933,10 +983,9 @@ void show_stats(afl_state_t *afl) {
 void show_init_stats(afl_state_t *afl) {
 
   struct queue_entry *q = afl->queue;
-  u32                 min_bits = 0, max_bits = 0;
+  u32                 min_bits = 0, max_bits = 0, max_len = 0, count = 0;
   u64                 min_us = 0, max_us = 0;
   u64                 avg_us = 0;
-  u32                 max_len = 0;
 
   u8 val_bufs[4][STRINGIFY_VAL_SIZE_MAX];
 #define IB(i) val_bufs[(i)], sizeof(val_bufs[(i)])
@@ -957,6 +1006,7 @@ void show_init_stats(afl_state_t *afl) {
 
     if (q->len > max_len) { max_len = q->len; }
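+    /* Count entries still on the list; duplicates culled during the
+       dry run are reported as "ignored" below. */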
 
+    ++count;
     q = q->next;
 
   }
@@ -1023,11 +1073,12 @@ void show_init_stats(afl_state_t *afl) {
   OKF("Here are some useful stats:\n\n"
 
       cGRA "    Test case count : " cRST
-      "%u favored, %u variable, %u total\n" cGRA "       Bitmap range : " cRST
+      "%u favored, %u variable, %u ignored, %u total\n" cGRA
+      "       Bitmap range : " cRST
       "%u to %u bits (average: %0.02f bits)\n" cGRA
       "        Exec timing : " cRST "%s to %s us (average: %s us)\n",
-      afl->queued_favored, afl->queued_variable, afl->queued_paths, min_bits,
-      max_bits,
+      afl->queued_favored, afl->queued_variable, afl->queued_paths - count,
+      afl->queued_paths, min_bits, max_bits,
       ((double)afl->total_bitmap_size) /
           (afl->total_bitmap_entries ? afl->total_bitmap_entries : 1),
       stringify_int(IB(0), min_us), stringify_int(IB(1), max_us),
diff --git a/src/afl-fuzz.c b/src/afl-fuzz.c
index 0df6c15c..73ca6aaa 100644
--- a/src/afl-fuzz.c
+++ b/src/afl-fuzz.c
@@ -119,8 +119,8 @@ static void usage(u8 *argv0, int more_help) {
       "etc.)\n"
       "  -d            - quick & dirty mode (skips deterministic steps)\n"
       "  -n            - fuzz without instrumentation (non-instrumented mode)\n"
-      "  -x dict_file  - optional fuzzer dictionary (see README.md, its really "
-      "good!)\n\n"
+      "  -x dict_file  - fuzzer dictionary (see README.md, specify up to 4 "
+      "times)\n\n"
 
       "Testing settings:\n"
       "  -s seed       - use a fixed seed for the RNG\n"
@@ -243,11 +243,11 @@ static int stricmp(char const *a, char const *b) {
 
 int main(int argc, char **argv_orig, char **envp) {
 
-  s32    opt;
+  s32    opt, i;
   u64    prev_queued = 0;
   u32    sync_interval_cnt = 0, seek_to, show_help = 0, map_size = MAP_SIZE;
-  u8 *   extras_dir = 0;
-  u8     mem_limit_given = 0, exit_1 = 0, debug = 0;
+  u8 *   extras_dir[4];
+  u8     mem_limit_given = 0, exit_1 = 0, debug = 0, extras_dir_cnt = 0;
   char **use_argv;
 
   struct timeval  tv;
@@ -450,8 +450,13 @@ int main(int argc, char **argv_orig, char **envp) {
 
       case 'x':                                               /* dictionary */
 
-        if (extras_dir) { FATAL("Multiple -x options not supported"); }
-        extras_dir = optarg;
+        if (extras_dir_cnt >= 4) {
+
+          FATAL("More than four -x options are not supported");
+
+        }
+
+        extras_dir[extras_dir_cnt++] = optarg;
         break;
 
       case 't': {                                                /* timeout */
@@ -828,10 +833,6 @@ int main(int argc, char **argv_orig, char **envp) {
       "Eißfeldt, Andrea Fioraldi and Dominik Maier");
   OKF("afl++ is open source, get it at "
       "https://github.com/AFLplusplus/AFLplusplus");
-  OKF("Power schedules from github.com/mboehme/aflfast");
-  OKF("Python Mutator and llvm_mode instrument file list from "
-      "github.com/choller/afl");
-  OKF("MOpt Mutator from github.com/puppet-meteor/MOpt-AFL");
 
   if (afl->sync_id && afl->is_main_node &&
       afl->afl_env.afl_custom_mutator_only) {
@@ -1128,16 +1129,27 @@ int main(int argc, char **argv_orig, char **envp) {
 
   setup_custom_mutators(afl);
 
+  write_setup_file(afl, argc, argv);
+
   setup_cmdline_file(afl, argv + optind);
 
-  read_testcases(afl);
+  read_testcases(afl, NULL);
   // read_foreign_testcases(afl, 1); for the moment dont do this
+  OKF("Loaded a total of %u seeds.", afl->queued_paths);
 
   load_auto(afl);
 
   pivot_inputs(afl);
 
-  if (extras_dir) { load_extras(afl, extras_dir); }
+  if (extras_dir_cnt) {
+
+    for (i = 0; i < extras_dir_cnt; i++)
+      load_extras(afl, extras_dir[i]);
+
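+    /* Several -x dictionaries may contain identical tokens, so
+       deduplicate the merged set. */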
+    dedup_extras(afl);
+    OKF("Loaded a total of %u extras.", afl->extras_cnt);
+
+  }
 
   if (!afl->timeout_given) { find_timeout(afl); }
 
diff --git a/src/afl-gcc.c b/src/afl-gcc.c
deleted file mode 100644
index 97564aea..00000000
--- a/src/afl-gcc.c
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
-   american fuzzy lop++ - wrapper for GCC and clang
-   ------------------------------------------------
-
-   Originally written by Michal Zalewski
-
-   Now maintained by Marc Heuse <mh@mh-sec.de>,
-                        Heiko Eißfeldt <heiko.eissfeldt@hexco.de> and
-                        Andrea Fioraldi <andreafioraldi@gmail.com>
-
-   Copyright 2016, 2017 Google Inc. All rights reserved.
-   Copyright 2019-2020 AFLplusplus Project. All rights reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at:
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   This program is a drop-in replacement for GCC or clang. The most common way
-   of using it is to pass the path to afl-gcc or afl-clang via CC when invoking
-   ./configure.
-
-   (Of course, use CXX and point it to afl-g++ / afl-clang++ for C++ code.)
-
-   The wrapper needs to know the path to afl-as (renamed to 'as'). The default
-   is /usr/local/lib/afl/. A convenient way to specify alternative directories
-   would be to set AFL_PATH.
-
-   If AFL_HARDEN is set, the wrapper will compile the target app with various
-   hardening options that may help detect memory management issues more
-   reliably. You can also specify AFL_USE_ASAN to enable ASAN.
-
-   If you want to call a non-default compiler as a next step of the chain,
-   specify its location via AFL_CC or AFL_CXX.
-
- */
-
-#define AFL_MAIN
-
-#include "config.h"
-#include "types.h"
-#include "debug.h"
-#include "alloc-inl.h"
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-
-static u8 * as_path;                   /* Path to the AFL 'as' wrapper      */
-static u8 **cc_params;                 /* Parameters passed to the real CC  */
-static u32  cc_par_cnt = 1;            /* Param count, including argv0      */
-static u8   be_quiet,                  /* Quiet mode                        */
-    clang_mode;                        /* Invoked as afl-clang*?            */
-
-/* Try to find our "fake" GNU assembler in AFL_PATH or at the location derived
-   from argv[0]. If that fails, abort. */
-
-static void find_as(u8 *argv0) {
-
-  u8 *afl_path = getenv("AFL_PATH");
-  u8 *slash, *tmp;
-
-  if (afl_path) {
-
-    tmp = alloc_printf("%s/as", afl_path);
-
-    if (!access(tmp, X_OK)) {
-
-      as_path = afl_path;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-
-  }
-
-  slash = strrchr(argv0, '/');
-
-  if (slash) {
-
-    u8 *dir;
-
-    *slash = 0;
-    dir = ck_strdup(argv0);
-    *slash = '/';
-
-    tmp = alloc_printf("%s/afl-as", dir);
-
-    if (!access(tmp, X_OK)) {
-
-      as_path = dir;
-      ck_free(tmp);
-      return;
-
-    }
-
-    ck_free(tmp);
-    ck_free(dir);
-
-  }
-
-  if (!access(AFL_PATH "/as", X_OK)) {
-
-    as_path = AFL_PATH;
-    return;
-
-  }
-
-  FATAL("Unable to find AFL wrapper binary for 'as'. Please set AFL_PATH");
-
-}
-
-/* Copy argv to cc_params, making the necessary edits. */
-
-static void edit_params(u32 argc, char **argv) {
-
-  u8  fortify_set = 0, asan_set = 0;
-  u8 *name;
-
-#if defined(__FreeBSD__) && defined(WORD_SIZE_64)
-  u8 m32_set = 0;
-#endif
-
-  cc_params = ck_alloc((argc + 128) * sizeof(u8 *));
-
-  name = strrchr(argv[0], '/');
-  if (!name) {
-
-    name = argv[0];
-
-    /* This should never happen but fixes a scan-build warning */
-    if (!name) { FATAL("Empty argv set"); }
-
-  } else {
-
-    ++name;
-
-  }
-
-  if (!strncmp(name, "afl-clang", 9)) {
-
-    clang_mode = 1;
-
-    setenv(CLANG_ENV_VAR, "1", 1);
-
-    if (!strcmp(name, "afl-clang++")) {
-
-      u8 *alt_cxx = getenv("AFL_CXX");
-      cc_params[0] = alt_cxx && *alt_cxx ? alt_cxx : (u8 *)"clang++";
-
-    } else if (!strcmp(name, "afl-clang")) {
-
-      u8 *alt_cc = getenv("AFL_CC");
-      cc_params[0] = alt_cc && *alt_cc ? alt_cc : (u8 *)"clang";
-
-    } else {
-
-      fprintf(stderr, "Name of the binary: %s\n", argv[0]);
-      FATAL("Name of the binary is not a known name, expected afl-clang(++)");
-
-    }
-
-  } else {
-
-    /* With GCJ and Eclipse installed, you can actually compile Java! The
-       instrumentation will work (amazingly). Alas, unhandled exceptions do
-       not call abort(), so afl-fuzz would need to be modified to equate
-       non-zero exit codes with crash conditions when working with Java
-       binaries. Meh. */
-
-#ifdef __APPLE__
-
-    if (!strcmp(name, "afl-g++")) {
-
-      cc_params[0] = getenv("AFL_CXX");
-
-    } else if (!strcmp(name, "afl-gcj")) {
-
-      cc_params[0] = getenv("AFL_GCJ");
-
-    } else if (!strcmp(name, "afl-gcc")) {
-
-      cc_params[0] = getenv("AFL_CC");
-
-    } else {
-
-      fprintf(stderr, "Name of the binary: %s\n", argv[0]);
-      FATAL("Name of the binary is not a known name, expected afl-gcc/g++/gcj");
-
-    }
-
-    if (!cc_params[0]) {
-
-      SAYF("\n" cLRD "[-] " cRST
-           "On Apple systems, 'gcc' is usually just a wrapper for clang. "
-           "Please use the\n"
-           "    'afl-clang' utility instead of 'afl-gcc'. If you really have "
-           "GCC installed,\n"
-           "    set AFL_CC or AFL_CXX to specify the correct path to that "
-           "compiler.\n");
-
-      FATAL("AFL_CC or AFL_CXX required on MacOS X");
-
-    }
-
-#else
-
-    if (!strcmp(name, "afl-g++")) {
-
-      u8 *alt_cxx = getenv("AFL_CXX");
-      cc_params[0] = alt_cxx && *alt_cxx ? alt_cxx : (u8 *)"g++";
-
-    } else if (!strcmp(name, "afl-gcj")) {
-
-      u8 *alt_cc = getenv("AFL_GCJ");
-      cc_params[0] = alt_cc && *alt_cc ? alt_cc : (u8 *)"gcj";
-
-    } else if (!strcmp(name, "afl-gcc")) {
-
-      u8 *alt_cc = getenv("AFL_CC");
-      cc_params[0] = alt_cc && *alt_cc ? alt_cc : (u8 *)"gcc";
-
-    } else {
-
-      fprintf(stderr, "Name of the binary: %s\n", argv[0]);
-      FATAL("Name of the binary is not a known name, expected afl-gcc/g++/gcj");
-
-    }
-
-#endif                                                         /* __APPLE__ */
-
-  }
-
-  while (--argc) {
-
-    u8 *cur = *(++argv);
-
-    if (!strncmp(cur, "-B", 2)) {
-
-      if (!be_quiet) { WARNF("-B is already set, overriding"); }
-
-      if (!cur[2] && argc > 1) {
-
-        argc--;
-        argv++;
-
-      }
-
-      continue;
-
-    }
-
-    if (!strcmp(cur, "-integrated-as")) { continue; }
-
-    if (!strcmp(cur, "-pipe")) { continue; }
-
-#if defined(__FreeBSD__) && defined(WORD_SIZE_64)
-    if (!strcmp(cur, "-m32")) m32_set = 1;
-#endif
-
-    if (!strcmp(cur, "-fsanitize=address") ||
-        !strcmp(cur, "-fsanitize=memory")) {
-
-      asan_set = 1;
-
-    }
-
-    if (strstr(cur, "FORTIFY_SOURCE")) { fortify_set = 1; }
-
-    cc_params[cc_par_cnt++] = cur;
-
-  }
-
-  cc_params[cc_par_cnt++] = "-B";
-  cc_params[cc_par_cnt++] = as_path;
-
-  if (clang_mode) { cc_params[cc_par_cnt++] = "-no-integrated-as"; }
-
-  if (getenv("AFL_HARDEN")) {
-
-    cc_params[cc_par_cnt++] = "-fstack-protector-all";
-
-    if (!fortify_set) { cc_params[cc_par_cnt++] = "-D_FORTIFY_SOURCE=2"; }
-
-  }
-
-  if (asan_set) {
-
-    /* Pass this on to afl-as to adjust map density. */
-
-    setenv("AFL_USE_ASAN", "1", 1);
-
-  } else if (getenv("AFL_USE_ASAN")) {
-
-    if (getenv("AFL_USE_MSAN")) {
-
-      FATAL("ASAN and MSAN are mutually exclusive");
-
-    }
-
-    if (getenv("AFL_HARDEN")) {
-
-      FATAL("ASAN and AFL_HARDEN are mutually exclusive");
-
-    }
-
-    cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-    cc_params[cc_par_cnt++] = "-fsanitize=address";
-
-  } else if (getenv("AFL_USE_MSAN")) {
-
-    if (getenv("AFL_USE_ASAN")) {
-
-      FATAL("ASAN and MSAN are mutually exclusive");
-
-    }
-
-    if (getenv("AFL_HARDEN")) {
-
-      FATAL("MSAN and AFL_HARDEN are mutually exclusive");
-
-    }
-
-    cc_params[cc_par_cnt++] = "-U_FORTIFY_SOURCE";
-    cc_params[cc_par_cnt++] = "-fsanitize=memory";
-
-  }
-
-  if (getenv("AFL_USE_UBSAN")) {
-
-    cc_params[cc_par_cnt++] = "-fsanitize=undefined";
-    cc_params[cc_par_cnt++] = "-fsanitize-undefined-trap-on-error";
-    cc_params[cc_par_cnt++] = "-fno-sanitize-recover=all";
-
-  }
-
-#if defined(USEMMAP) && !defined(__HAIKU__)
-  cc_params[cc_par_cnt++] = "-lrt";
-#endif
-
-  if (!getenv("AFL_DONT_OPTIMIZE")) {
-
-#if defined(__FreeBSD__) && defined(WORD_SIZE_64)
-
-    /* On 64-bit FreeBSD systems, clang -g -m32 is broken, but -m32 itself
-       works OK. This has nothing to do with us, but let's avoid triggering
-       that bug. */
-
-    if (!clang_mode || !m32_set) cc_params[cc_par_cnt++] = "-g";
-
-#else
-
-    cc_params[cc_par_cnt++] = "-g";
-
-#endif
-
-    cc_params[cc_par_cnt++] = "-O3";
-    cc_params[cc_par_cnt++] = "-funroll-loops";
-
-    /* Two indicators that you're building for fuzzing; one of them is
-       AFL-specific, the other is shared with libfuzzer. */
-
-    cc_params[cc_par_cnt++] = "-D__AFL_COMPILER=1";
-    cc_params[cc_par_cnt++] = "-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1";
-
-  }
-
-  if (getenv("AFL_NO_BUILTIN")) {
-
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strncasecmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-memcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-bcmp";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strstr";
-    cc_params[cc_par_cnt++] = "-fno-builtin-strcasestr";
-
-  }
-
-  cc_params[cc_par_cnt] = NULL;
-
-}
-
-/* Main entry point */
-
-int main(int argc, char **argv) {
-
-  char *env_info =
-      "Environment variables used by afl-gcc:\n"
-      "AFL_CC: path to the C compiler to use\n"
-      "AFL_CXX: path to the C++ compiler to use\n"
-      "AFL_GCJ: path to the java compiler to use\n"
-      "AFL_PATH: path to the instrumenting assembler\n"
-      "AFL_DONT_OPTIMIZE: disable optimization instead of -O3\n"
-      "AFL_NO_BUILTIN: compile for use with libtokencap.so\n"
-      "AFL_QUIET: suppress verbose output\n"
-      "AFL_CAL_FAST: speed up the initial calibration\n"
-      "AFL_HARDEN: adds code hardening to catch memory bugs\n"
-      "AFL_USE_ASAN: activate address sanitizer\n"
-      "AFL_USE_MSAN: activate memory sanitizer\n"
-      "AFL_USE_UBSAN: activate undefined behaviour sanitizer\n"
-
-      "\nEnvironment variables used by afl-as (called by afl-gcc):\n"
-      "AFL_AS: path to the assembler to use\n"
-      "TMPDIR: set the directory for temporary files of afl-as\n"
-      "TEMP: fall back path to directory for temporary files\n"
-      "TMP: fall back path to directory for temporary files\n"
-      "AFL_INST_RATIO: percentage of branches to instrument\n"
-      "AFL_QUIET: suppress verbose output\n"
-      "AFL_KEEP_ASSEMBLY: leave instrumented assembly files\n"
-      "AFL_AS_FORCE_INSTRUMENT: force instrumentation for asm sources\n";
-
-  if (argc == 2 && strncmp(argv[1], "-h", 2) == 0) {
-
-    printf("afl-cc" VERSION " by Michal Zalewski\n\n");
-    printf("%s \n\n", argv[0]);
-    printf("afl-gcc has no command line options\n\n%s\n", env_info);
-    printf(
-        "NOTE: afl-gcc is deprecated, llvm_mode is much faster and has more "
-        "options\n");
-    return -1;
-
-  }
-
-  if ((isatty(2) && !getenv("AFL_QUIET")) || getenv("AFL_DEBUG") != NULL) {
-
-    SAYF(cCYA "afl-cc" VERSION cRST " by Michal Zalewski\n");
-    SAYF(cYEL "[!] " cBRI "NOTE: " cRST
-              "afl-gcc is deprecated, llvm_mode is much faster and has more "
-              "options\n");
-
-  } else {
-
-    be_quiet = 1;
-
-  }
-
-  if (argc < 2) {
-
-    SAYF(
-        "\n"
-        "This is a helper application for afl-fuzz. It serves as a drop-in "
-        "replacement\n"
-        "for gcc or clang, letting you recompile third-party code with the "
-        "required\n"
-        "runtime instrumentation. A common use pattern would be one of the "
-        "following:\n\n"
-
-        "  CC=%s/afl-gcc ./configure\n"
-        "  CXX=%s/afl-g++ ./configure\n\n%s"
-
-        ,
-        BIN_PATH, BIN_PATH, env_info);
-
-    exit(1);
-
-  }
-
-  u8 *ptr;
-  if (!be_quiet &&
-      ((ptr = getenv("AFL_MAP_SIZE")) || (ptr = getenv("AFL_MAPSIZE")))) {
-
-    u32 map_size = atoi(ptr);
-    if (map_size != MAP_SIZE) {
-
-      WARNF("AFL_MAP_SIZE is not supported by afl-gcc");
-
-    }
-
-  }
-
-  find_as(argv[0]);
-
-  edit_params(argc, argv);
-
-  execvp(cc_params[0], (char **)cc_params);
-
-  FATAL("Oops, failed to execute '%s' - check your PATH", cc_params[0]);
-
-  return 0;
-
-}
-
diff --git a/llvm_mode/afl-ld-lto.c b/src/afl-ld-lto.c
index 771e2d0d..771e2d0d 100644
--- a/llvm_mode/afl-ld-lto.c
+++ b/src/afl-ld-lto.c
diff --git a/src/afl-performance.c b/src/afl-performance.c
index a9d7cefa..7a80ac4b 100644
--- a/src/afl-performance.c
+++ b/src/afl-performance.c
@@ -22,7 +22,10 @@
 #include <stdint.h>
 #include "afl-fuzz.h"
 #include "types.h"
-#include "xxh3.h"
+
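+/* XXH_INLINE_ALL makes xxhash.h emit self-contained static (inlinable)
+   definitions, replacing the previously bundled xxh3.h wrapper. */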
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+#undef XXH_INLINE_ALL
 
 /* we use xoshiro256** instead of rand/random because it is 10x faster and has
    better randomness properties. */
diff --git a/test/test-gcc-plugin.sh b/test/test-gcc-plugin.sh
index 2ed10a72..8b8cbd8e 100755
--- a/test/test-gcc-plugin.sh
+++ b/test/test-gcc-plugin.sh
@@ -3,7 +3,7 @@
 . ./test-pre.sh
 
 $ECHO "$BLUE[*] Testing: gcc_plugin"
-test -e ../afl-gcc-fast -a -e ../afl-gcc-rt.o && {
+test -e ../afl-gcc-fast -a -e ../afl-compiler-rt.o && {
   SAVE_AFL_CC=${AFL_CC}
   export AFL_CC=`command -v gcc`
   ../afl-gcc-fast -o test-instr.plain.gccpi ../test-instr.c > /dev/null 2>&1
diff --git a/test/test-unittests.sh b/test/test-unittests.sh
index f540b5f8..58c2eea9 100755
--- a/test/test-unittests.sh
+++ b/test/test-unittests.sh
@@ -7,3 +7,5 @@ unset AFL_CC
 make -C .. unit || CODE=1 INCOMPLETE=1 :
 
 . ./test-post.sh
+
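+# clean up the scratch binaries the unit tests leave behind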
+rm -rf unittests/unit_hash unittests/unit_rand