author    van Hauser <vh@thc.org>  2021-11-07 14:09:09 +0100
committer GitHub <noreply@github.com>  2021-11-07 14:09:09 +0100
commit    fb443eaf2372ccd1825699c978fd0d662155fb9e (patch)
tree      ff019fc0b0704c16d68655d0f3864ec4cda49d30
parent    5b06413a5f109f310a62e36111a18d7325b246c3 (diff)
parent    2ddbaa439ca78b0ae8cc6691d9657f5783b2d5e8 (diff)
download  afl++-fb443eaf2372ccd1825699c978fd0d662155fb9e.tar.gz
Merge pull request #1141 from AFLplusplus/afl4
cmplog enhancement variant
-rw-r--r--  GNUmakefile                                 2
-rw-r--r--  docs/Changelog.md                           2
-rw-r--r--  frida_mode/src/cmplog/cmplog_arm64.c       51
-rw-r--r--  frida_mode/src/cmplog/cmplog_x64.c         50
-rw-r--r--  frida_mode/src/cmplog/cmplog_x86.c         51
-rw-r--r--  include/afl-fuzz.h                          1
-rw-r--r--  include/cmplog.h                           13
-rw-r--r--  include/config.h                            4
-rw-r--r--  include/types.h                             3
-rw-r--r--  include/xxhash.h                         3084
-rw-r--r--  instrumentation/afl-compiler-rt.o.c       205
-rw-r--r--  instrumentation/cmplog-routines-pass.cc   244
-rw-r--r--  qemu_mode/QEMUAFL_VERSION                   2
m---------  qemu_mode/qemuafl                           0
-rw-r--r--  src/afl-forkserver.c                       24
-rw-r--r--  src/afl-fuzz-one.c                         25
-rw-r--r--  src/afl-fuzz-queue.c                       91
-rw-r--r--  src/afl-fuzz-redqueen.c                   363
-rw-r--r--  src/afl-fuzz-stats.c                        3
-rw-r--r--  src/afl-fuzz.c                             19
-rw-r--r--  src/afl-performance.c                       6
-rw-r--r--  test/test-cmplog.c                         21
22 files changed, 2844 insertions, 1420 deletions
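
Context for the diff below: the core of this change is the CmpLog RTN (routine-call) operand record, which shrinks each captured operand from 32 to 31 bytes so that an explicit length byte fits alongside it; the header's shape is set to 30 instead of 31 to match. A minimal sketch of the new layout and a hypothetical helper that fills one entry, using the struct names from include/cmplog.h in this PR (the helper itself is illustrative, not code from the PR):

    #include <stdint.h>
    #include <string.h>

    /* New RTN operand record from include/cmplog.h in this PR:
       31 data bytes plus an explicit length byte per operand. */
    struct cmpfn_operands {
      uint8_t v0[31];
      uint8_t v0_len;
      uint8_t v1[31];
      uint8_t v1_len;
    } __attribute__((packed));

    /* Hypothetical helper mirroring what the frida_mode hooks do inline:
       copy up to 31 bytes of each routine argument and record the length. */
    static void record_rtn(struct cmpfn_operands *slot,
                           const void *a, const void *b, uint8_t len) {
      if (len > 31) len = 31;
      slot->v0_len = len;
      slot->v1_len = len;
      memcpy(slot->v0, a, len);
      memcpy(slot->v1, b, len);
    }
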
diff --git a/GNUmakefile b/GNUmakefile
index ad2642f3..06840786 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -32,7 +32,7 @@ VERSION     = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f
 # PROGS intentionally omit afl-as, which gets installed elsewhere.
 
 PROGS       = afl-fuzz afl-showmap afl-tmin afl-gotcpu afl-analyze
-SH_PROGS    = afl-plot afl-cmin afl-cmin.bash afl-whatsup afl-system-config afl-persistent-config
+SH_PROGS    = afl-plot afl-cmin afl-cmin.bash afl-whatsup afl-system-config afl-persistent-config afl-cc
 MANPAGES=$(foreach p, $(PROGS) $(SH_PROGS), $(p).8) afl-as.8
 ASAN_OPTIONS=detect_leaks=0
 
diff --git a/docs/Changelog.md b/docs/Changelog.md
index 7c77a6bf..2c72b5f2 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -18,6 +18,8 @@ sending a mail to <afl-users+subscribe@googlegroups.com>.
     - fix -n dumb mode (nobody should use this)
     - fix stability issue with LTO and cmplog
     - better banner
+    - more effective cmplog mode
+    - update the UI more often when in input2stage mode
   - frida_mode: David Carlier added Android support :)
   - afl-showmap, afl-tmin and afl-analyze:
     - honor persistent mode for more speed. thanks to dloffre-snl for
diff --git a/frida_mode/src/cmplog/cmplog_arm64.c b/frida_mode/src/cmplog/cmplog_arm64.c
index dd97f38d..ccc8e89e 100644
--- a/frida_mode/src/cmplog/cmplog_arm64.c
+++ b/frida_mode/src/cmplog/cmplog_arm64.c
@@ -104,9 +104,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   gsize x0 = ctx_read_reg(context, ARM64_REG_X0);
   gsize x1 = ctx_read_reg(context, ARM64_REG_X1);
 
-  if (((G_MAXULONG - x0) < 32) || ((G_MAXULONG - x1) < 32)) return;
+  if (((G_MAXULONG - x0) < 31) || ((G_MAXULONG - x1) < 31)) return;
 
-  if (!cmplog_is_readable(x0, 32) || !cmplog_is_readable(x1, 32)) return;
+  if (!cmplog_is_readable(x0, 31) || !cmplog_is_readable(x1, 31)) return;
 
   void *ptr1 = GSIZE_TO_POINTER(x0);
   void *ptr2 = GSIZE_TO_POINTER(x1);
@@ -116,18 +116,36 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   k = (k >> 4) ^ (k << 8);
   k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 0;
+
+  }
+
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
+
+    __afl_cmp_map->headers[k].shape = 30;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
   __afl_cmp_map->headers[k].hits = hits + 1;
 
-  __afl_cmp_map->headers[k].shape = 31;
+  __afl_cmp_map->headers[k].shape = 30;
 
   hits &= CMP_MAP_RTN_H - 1;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31;
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1,
-             32);
+             31);
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2,
-             32);
+             31);
 
 }
 
@@ -193,12 +211,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1,
   k = (k >> 4) ^ (k << 8);
   k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS)
+    __afl_cmp_map->headers[k].hits = 0;
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
-  __afl_cmp_map->headers[k].hits = hits + 1;
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+    __afl_cmp_map->headers[k].shape = (size - 1);
+
+  } else {
 
-  __afl_cmp_map->headers[k].shape = (size - 1);
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
+
+  __afl_cmp_map->headers[k].hits = hits + 1;
 
   hits &= CMP_MAP_H - 1;
   __afl_cmp_map->log[k][hits].v0 = operand1;
diff --git a/frida_mode/src/cmplog/cmplog_x64.c b/frida_mode/src/cmplog/cmplog_x64.c
index 0d18767a..5319f727 100644
--- a/frida_mode/src/cmplog/cmplog_x64.c
+++ b/frida_mode/src/cmplog/cmplog_x64.c
@@ -99,9 +99,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   gsize rdi = ctx_read_reg(context, X86_REG_RDI);
   gsize rsi = ctx_read_reg(context, X86_REG_RSI);
 
-  if (((G_MAXULONG - rdi) < 32) || ((G_MAXULONG - rsi) < 32)) return;
+  if (((G_MAXULONG - rdi) < 31) || ((G_MAXULONG - rsi) < 31)) return;
 
-  if (!cmplog_is_readable(rdi, 32) || !cmplog_is_readable(rsi, 32)) return;
+  if (!cmplog_is_readable(rdi, 31) || !cmplog_is_readable(rsi, 31)) return;
 
   void *ptr1 = GSIZE_TO_POINTER(rdi);
   void *ptr2 = GSIZE_TO_POINTER(rsi);
@@ -111,18 +111,34 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   k = (k >> 4) ^ (k << 8);
   k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
-  __afl_cmp_map->headers[k].hits = hits + 1;
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 0;
+
+  }
+
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
+
+    __afl_cmp_map->headers[k].shape = 30;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
 
-  __afl_cmp_map->headers[k].shape = 31;
+  __afl_cmp_map->headers[k].hits = hits + 1;
 
   hits &= CMP_MAP_RTN_H - 1;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31;
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1,
-             32);
+             31);
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2,
-             32);
+             31);
 
 }
 
@@ -179,13 +195,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1,
   k = (k >> 4) ^ (k << 8);
  k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS)
+    __afl_cmp_map->headers[k].hits = 0;
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
-  __afl_cmp_map->headers[k].hits = hits + 1;
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+    __afl_cmp_map->headers[k].shape = (size - 1);
 
-  __afl_cmp_map->headers[k].shape = (size - 1);
+  } else {
 
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
+
+  __afl_cmp_map->headers[k].hits = hits + 1;
   hits &= CMP_MAP_H - 1;
   __afl_cmp_map->log[k][hits].v0 = operand1;
   __afl_cmp_map->log[k][hits].v1 = operand2;
diff --git a/frida_mode/src/cmplog/cmplog_x86.c b/frida_mode/src/cmplog/cmplog_x86.c
index dd666c34..27d06720 100644
--- a/frida_mode/src/cmplog/cmplog_x86.c
+++ b/frida_mode/src/cmplog/cmplog_x86.c
@@ -104,9 +104,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   gsize arg1 = esp[0];
   gsize arg2 = esp[1];
 
-  if (((G_MAXULONG - arg1) < 32) || ((G_MAXULONG - arg2) < 32)) return;
+  if (((G_MAXULONG - arg1) < 31) || ((G_MAXULONG - arg2) < 31)) return;
 
-  if (!cmplog_is_readable(arg1, 32) || !cmplog_is_readable(arg2, 32)) return;
+  if (!cmplog_is_readable(arg1, 31) || !cmplog_is_readable(arg2, 31)) return;
 
   void *ptr1 = GSIZE_TO_POINTER(arg1);
   void *ptr2 = GSIZE_TO_POINTER(arg2);
@@ -116,18 +116,34 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) {
   k = (k >> 4) ^ (k << 8);
   k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
-  __afl_cmp_map->headers[k].hits = hits + 1;
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 0;
+
+  }
+
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
 
-  __afl_cmp_map->headers[k].shape = 31;
+    __afl_cmp_map->headers[k].shape = 30;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
+
+  __afl_cmp_map->headers[k].hits = hits + 1;
 
   hits &= CMP_MAP_RTN_H - 1;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31;
+  ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31;
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1,
-             32);
+             31);
   gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2,
-             32);
+             31);
 
 }
 
@@ -184,12 +200,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1,
   k = (k >> 4) ^ (k << 8);
   k &= CMP_MAP_W - 1;
 
-  __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS)
+    __afl_cmp_map->headers[k].hits = 0;
 
-  u32 hits = __afl_cmp_map->headers[k].hits;
-  __afl_cmp_map->headers[k].hits = hits + 1;
+  u32 hits = 0;
+
+  if (__afl_cmp_map->headers[k].hits == 0) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_INS;
+    __afl_cmp_map->headers[k].shape = (size - 1);
+
+  } else {
 
-  __afl_cmp_map->headers[k].shape = (size - 1);
+    hits = __afl_cmp_map->headers[k].hits;
+
+  }
+
+  __afl_cmp_map->headers[k].hits = hits + 1;
 
   hits &= CMP_MAP_H - 1;
   __afl_cmp_map->log[k][hits].v0 = operand1;
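
All three frida_mode backends above (arm64, x64, x86) now apply the same bookkeeping: a map slot's hit counter is reset when its recorded comparison type changes, and the shape is only written on the first hit. A condensed sketch of that pattern, with a simplified header struct (the real cmp_header in include/cmplog.h also carries id, attribute, overflow and reserved bits) and an assumed CMP_TYPE_INS value:

    #include <stdint.h>

    #define CMP_TYPE_INS 1     /* illustrative value; the real one is in cmplog.h */

    /* Simplified header; only the fields used by the pattern are kept. */
    struct cmp_header_sketch {
      unsigned hits  : 24;
      unsigned shape : 5;
      unsigned type  : 2;
    };

    /* The bookkeeping each hook performs before logging an operand pair. */
    static uint32_t bump_ins(struct cmp_header_sketch *h, uint32_t size) {
      if (h->type != CMP_TYPE_INS) h->hits = 0;  /* slot held another type */

      uint32_t hits = 0;
      if (h->hits == 0) {                        /* first hit: (re)label the slot */
        h->type  = CMP_TYPE_INS;
        h->shape = size - 1;
      } else {
        hits = h->hits;
      }
      h->hits = hits + 1;
      return hits;                               /* caller masks with CMP_MAP_H - 1 */
    }
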
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index e73ea1a4..f3d6d99d 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -1135,6 +1135,7 @@ void   setup_signal_handlers(void);
 void   save_cmdline(afl_state_t *, u32, char **);
 void   read_foreign_testcases(afl_state_t *, int);
 void   write_crash_readme(afl_state_t *afl);
+u8     check_if_text_buf(u8 *buf, u32 len);
 
 /* CmpLog */
 
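
The new check_if_text_buf() prototype above is only declared here; its implementation lives elsewhere in the tree and is not part of this excerpt. Purely as a hypothetical illustration of what such a printable-text heuristic could look like (names, threshold and logic are assumptions, not the actual AFL++ code):

    #include <stdint.h>
    #include <ctype.h>

    typedef uint8_t  u8;
    typedef uint32_t u32;

    /* Hypothetical sketch: count printable ASCII and common whitespace and
       treat the buffer as text if nearly all bytes qualify. */
    u8 check_if_text_buf_sketch(u8 *buf, u32 len) {
      u32 printable = 0;
      for (u32 i = 0; i < len; i++)
        if (isprint(buf[i]) || buf[i] == '\n' || buf[i] == '\r' || buf[i] == '\t')
          printable++;
      return len && (printable * 100 >= len * 99);
    }
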
diff --git a/include/cmplog.h b/include/cmplog.h
index 1c15d2b8..8778a4b6 100644
--- a/include/cmplog.h
+++ b/include/cmplog.h
@@ -48,7 +48,8 @@ struct cmp_header {
   unsigned shape : 5;
   unsigned type : 2;
   unsigned attribute : 4;
-  unsigned reserved : 5;
+  unsigned overflow : 1;
+  unsigned reserved : 4;
 
 } __attribute__((packed));
 
@@ -59,14 +60,16 @@ struct cmp_operands {
   u64 v0_128;
   u64 v1_128;
 
-};
+} __attribute__((packed));
 
 struct cmpfn_operands {
 
-  u8 v0[32];
-  u8 v1[32];
+  u8 v0[31];
+  u8 v0_len;
+  u8 v1[31];
+  u8 v1_len;
 
-};
+} __attribute__((packed));
 
 typedef struct cmp_operands cmp_map_list[CMP_MAP_H];
 
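
Worth noting about the cmplog.h change above: 31 + 1 + 31 + 1 = 64 bytes, the same as the old 32 + 32 layout, so the per-entry size of the RTN log in the shared map is unchanged while gaining explicit length fields. A small compile-time sketch of that equivalence:

    #include <stdint.h>

    struct cmpfn_operands_old { uint8_t v0[32]; uint8_t v1[32]; };

    struct cmpfn_operands_new {
      uint8_t v0[31];
      uint8_t v0_len;
      uint8_t v1[31];
      uint8_t v1_len;
    } __attribute__((packed));

    /* 31 + 1 + 31 + 1 == 32 + 32 == 64: only the layout changes. */
    _Static_assert(sizeof(struct cmpfn_operands_old) ==
                   sizeof(struct cmpfn_operands_new),
                   "RTN operand entry must stay 64 bytes");
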
diff --git a/include/config.h b/include/config.h
index 3aee9b00..b787152f 100644
--- a/include/config.h
+++ b/include/config.h
@@ -267,8 +267,8 @@
    (first value), and to keep in memory as candidates. The latter should be much
    higher than the former. */
 
-#define USE_AUTO_EXTRAS 128
-#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 64)
+#define USE_AUTO_EXTRAS 4096
+#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8)
 
 /* Scaling factor for the effector map used to skip some of the more
    expensive deterministic steps. The actual divisor is set to
diff --git a/include/types.h b/include/types.h
index e945f0f5..bbcc2f81 100644
--- a/include/types.h
+++ b/include/types.h
@@ -46,6 +46,8 @@ typedef uint128_t         u128;
 #define FS_ERROR_SHM_OPEN 4
 #define FS_ERROR_SHMAT 8
 #define FS_ERROR_MMAP 16
+#define FS_ERROR_OLD_CMPLOG 32
+#define FS_ERROR_OLD_CMPLOG_QEMU 64
 
 /* Reporting options */
 #define FS_OPT_ENABLED 0x80000001
@@ -53,6 +55,7 @@ typedef uint128_t         u128;
 #define FS_OPT_SNAPSHOT 0x20000000
 #define FS_OPT_AUTODICT 0x10000000
 #define FS_OPT_SHDMEM_FUZZ 0x01000000
+#define FS_OPT_NEWCMPLOG 0x02000000
 #define FS_OPT_OLD_AFLPP_WORKAROUND 0x0f000000
 // FS_OPT_MAX_MAPSIZE is 8388608 = 0x800000 = 2^23 = 1 << 23
 #define FS_OPT_MAX_MAPSIZE ((0x00fffffeU >> 1) + 1)
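
The new FS_OPT_NEWCMPLOG bit lets afl-fuzz detect whether a target was built with the new cmplog format, with FS_ERROR_OLD_CMPLOG / FS_ERROR_OLD_CMPLOG_QEMU reporting mismatches. A minimal sketch of testing the status word, using the flag values defined above; the actual handshake presumably lives in src/afl-forkserver.c (also touched by this PR):

    #include <stdint.h>
    #include <stdio.h>

    #define FS_OPT_ENABLED   0x80000001U
    #define FS_OPT_NEWCMPLOG 0x02000000U

    /* Illustrative check of the 32-bit status word reported by the target. */
    static int target_has_new_cmplog(uint32_t status) {
      return (status & FS_OPT_ENABLED) == FS_OPT_ENABLED &&
             (status & FS_OPT_NEWCMPLOG) != 0;
    }

    int main(void) {
      uint32_t status = FS_OPT_ENABLED | FS_OPT_NEWCMPLOG;
      printf("new cmplog format: %s\n",
             target_has_new_cmplog(status) ? "yes" : "no");
      return 0;
    }
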
diff --git a/include/xxhash.h b/include/xxhash.h
index 006d3f3d..0ca2b852 100644
--- a/include/xxhash.h
+++ b/include/xxhash.h
@@ -32,7 +32,12 @@
  *   - xxHash homepage: https://www.xxhash.com
  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
  */
-
+/*!
+ * @mainpage xxHash
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
 /* TODO: update */
 /* Notice extracted from xxHash homepage:
 
@@ -45,7 +50,7 @@ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo
 Name            Speed       Q.Score   Author
 xxHash          5.4 GB/s     10
 CrapWow         3.2 GB/s      2       Andrew
-MumurHash 3a    2.7 GB/s     10       Austin Appleby
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
 SpookyHash      2.0 GB/s     10       Bob Jenkins
 SBox            1.4 GB/s      9       Bret Mulvey
 Lookup3         1.2 GB/s      9       Bob Jenkins
@@ -119,29 +124,78 @@ extern "C" {
 
 /*
  * This part deals with the special case where a unit wants to inline xxHash,
- * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
- * as part of some previously included *.h header file.
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+ * such as part of some previously included *.h header file.
  * Without further action, the new include would just be ignored,
  * and functions would effectively _not_ be inlined (silent failure).
  * The following macros solve this situation by prefixing all inlined names,
  * avoiding naming collision with previous inclusions.
  */
-  #ifdef XXH_NAMESPACE
-    #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
-  /*
-   * Note: Alternative: #undef all symbols (it's a pretty large list).
-   * Without #error: it compiles, but functions are actually not inlined.
-   */
-  #endif
+/* Before that, we unconditionally #undef all symbols,
+ * in case they were already defined with XXH_NAMESPACE.
+ * They will then be redefined for XXH_INLINE_ALL
+ */
+  #undef XXH_versionNumber
+/* XXH32 */
+  #undef XXH32
+  #undef XXH32_createState
+  #undef XXH32_freeState
+  #undef XXH32_reset
+  #undef XXH32_update
+  #undef XXH32_digest
+  #undef XXH32_copyState
+  #undef XXH32_canonicalFromHash
+  #undef XXH32_hashFromCanonical
+/* XXH64 */
+  #undef XXH64
+  #undef XXH64_createState
+  #undef XXH64_freeState
+  #undef XXH64_reset
+  #undef XXH64_update
+  #undef XXH64_digest
+  #undef XXH64_copyState
+  #undef XXH64_canonicalFromHash
+  #undef XXH64_hashFromCanonical
+/* XXH3_64bits */
+  #undef XXH3_64bits
+  #undef XXH3_64bits_withSecret
+  #undef XXH3_64bits_withSeed
+  #undef XXH3_createState
+  #undef XXH3_freeState
+  #undef XXH3_copyState
+  #undef XXH3_64bits_reset
+  #undef XXH3_64bits_reset_withSeed
+  #undef XXH3_64bits_reset_withSecret
+  #undef XXH3_64bits_update
+  #undef XXH3_64bits_digest
+  #undef XXH3_generateSecret
+/* XXH3_128bits */
+  #undef XXH128
+  #undef XXH3_128bits
+  #undef XXH3_128bits_withSeed
+  #undef XXH3_128bits_withSecret
+  #undef XXH3_128bits_reset
+  #undef XXH3_128bits_reset_withSeed
+  #undef XXH3_128bits_reset_withSecret
+  #undef XXH3_128bits_update
+  #undef XXH3_128bits_digest
+  #undef XXH128_isEqual
+  #undef XXH128_cmp
+  #undef XXH128_canonicalFromHash
+  #undef XXH128_hashFromCanonical
+/* Finally, free the namespace itself */
+  #undef XXH_NAMESPACE
+
+/* employ the namespace for XXH_INLINE_ALL */
   #define XXH_NAMESPACE XXH_INLINE_
 /*
- * Some identifiers (enums, type names) are not symbols, but they must
- * still be renamed to avoid redeclaration.
+ * Some identifiers (enums, type names) are not symbols,
+ * but they must nonetheless be renamed to avoid redeclaration.
  * Alternative solution: do not redeclare them.
- * However, this requires some #ifdefs, and is a more dispersed action.
- * Meanwhile, renaming can be achieved in a single block
+ * However, this requires some #ifdefs, and has a more dispersed impact.
+ * Meanwhile, renaming can be achieved in a single place.
  */
-  #define XXH_IPREF(Id) XXH_INLINE_##Id
+  #define XXH_IPREF(Id) XXH_NAMESPACE##Id
   #define XXH_OK XXH_IPREF(XXH_OK)
   #define XXH_ERROR XXH_IPREF(XXH_ERROR)
   #define XXH_errorcode XXH_IPREF(XXH_errorcode)
@@ -166,6 +220,12 @@ extern "C" {
 #ifndef XXHASH_H_5627135585666179
   #define XXHASH_H_5627135585666179 1
 
+  /*!
+   * @defgroup public Public API
+   * Contains details on the public xxHash functions.
+   * @{
+
+   */
   /* specific declaration modes for Windows */
   #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
     #if defined(WIN32) && defined(_MSC_VER) && \
@@ -180,19 +240,24 @@ extern "C" {
     #endif
   #endif
 
-  /*!
-   * XXH_NAMESPACE, aka Namespace Emulation:
-   *
-   * If you want to include _and expose_ xxHash functions from within your own
-   * library, but also want to avoid symbol collisions with other libraries
-   * which may also include xxHash, you can use XXH_NAMESPACE to automatically
-   * prefix any public symbol from xxhash library with the value of
-   * XXH_NAMESPACE (therefore, avoid empty or numeric values).
-   *
-   * Note that no change is required within the calling program as long as it
-   * includes `xxhash.h`: Regular symbol names will be automatically translated
-   * by this header.
-   */
+  #ifdef XXH_DOXYGEN
+    /*!
+     * @brief Emulate a namespace by transparently prefixing all symbols.
+     *
+     * If you want to include _and expose_ xxHash functions from within your own
+     * library, but also want to avoid symbol collisions with other libraries
+     * which may also include xxHash, you can use XXH_NAMESPACE to automatically
+     * prefix any public symbol from xxhash library with the value of
+     * XXH_NAMESPACE (therefore, avoid empty or numeric values).
+     *
+     * Note that no change is required within the calling program as long as it
+     * includes `xxhash.h`: Regular symbol names will be automatically
+     * translated by this header.
+     */
+    #define XXH_NAMESPACE                                 /* YOUR NAME HERE */
+    #undef XXH_NAMESPACE
+  #endif
+
   #ifdef XXH_NAMESPACE
     #define XXH_CAT(A, B) A##B
     #define XXH_NAME2(A, B) XXH_CAT(A, B)
@@ -264,10 +329,19 @@ extern "C" {
    ***************************************/
   #define XXH_VERSION_MAJOR 0
   #define XXH_VERSION_MINOR 8
-  #define XXH_VERSION_RELEASE 0
+  #define XXH_VERSION_RELEASE 1
   #define XXH_VERSION_NUMBER                                   \
     (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
      XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is only useful when xxHash is compiled as a shared library, as it is
+ * independent of the version defined in the header.
+ *
+ * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
+ */
 XXH_PUBLIC_API unsigned XXH_versionNumber(void);
 
   /* ****************************
@@ -279,15 +353,24 @@ typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
   /*-**********************************************************************
    *  32-bit hash
    ************************************************************************/
-  #if !defined(__VMS) &&       \
+  #if defined(XXH_DOXYGEN)                 /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+  #elif !defined(__VMS) &&     \
       (defined(__cplusplus) || \
        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
     #include <stdint.h>
-typedef uint32_t XXH32_hash_t;
+typedef uint32_t      XXH32_hash_t;
+
   #else
     #include <limits.h>
     #if UINT_MAX == 0xFFFFFFFFUL
-typedef unsigned int  XXH32_hash_t;
+typedef unsigned int XXH32_hash_t;
     #else
       #if ULONG_MAX == 0xFFFFFFFFUL
 typedef unsigned long XXH32_hash_t;
@@ -298,24 +381,52 @@ typedef unsigned long XXH32_hash_t;
   #endif
 
 /*!
- * XXH32():
- *  Calculate the 32-bit hash of sequence "length" bytes stored at memory
- * address "input". The memory between input & input+length must be valid
- * (allocated and read-accessible). "seed" can be used to alter the result
- * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher
- * benchmark): 5.4 GB/s
- *
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is considered rather weak by today's standards.
+ *   The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
+ *   systems, and offers true 64/128 bit hash results. It provides a superior
+ *   level of dispersion, and greatly reduces the risks of collisions.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
  */
 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
                                   XXH32_hash_t seed);
 
-/*******   Streaming   *******/
-
-/*
- * Streaming functions generate the xxHash value from an incrememtal input.
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
  * This method is slower than single-call functions, due to state management.
  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
  *
@@ -336,19 +447,125 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
  *
  * When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <xxhash.h>
+ *    #define BUFFER_SIZE 256
+ *
+ *    // Note: XXH64 and XXH3 use the same interface.
+ *    XXH32_hash_t
+ *    hashFile(FILE* stream)
+ *    {
+
+ *        XXH32_state_t* state;
+ *        unsigned char buf[BUFFER_SIZE];
+ *        size_t amt;
+ *        XXH32_hash_t hash;
+ *
+ *        state = XXH32_createState();       // Create a state
+ *        assert(state != NULL);             // Error check here
+ *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
+ *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+
+ *            XXH32_update(state, buf, amt); // Hash the file in chunks
+ *        }
+ *        hash = XXH32_digest(state);        // Finalize the hash
+ *        XXH32_freeState(state);            // Clean up
+ *        return hash;
+ *    }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
  */
+typedef struct XXH32_state_s XXH32_state_t;
 
-typedef struct XXH32_state_s XXH32_state_t;              /* incomplete type */
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
-XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t *statePtr);
-XXH_PUBLIC_API void           XXH32_copyState(XXH32_state_t *      dst_state,
-                                              const XXH32_state_t *src_state);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref
+ * XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dst_state,
+                                    const XXH32_state_t *src_state);
 
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
                                          XXH32_hash_t   seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
                                           const void *input, size_t length);
-XXH_PUBLIC_API XXH32_hash_t  XXH32_digest(const XXH32_state_t *statePtr);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
 
 /*******   Canonical representation   *******/
 
@@ -373,48 +590,158 @@ XXH_PUBLIC_API XXH32_hash_t  XXH32_digest(const XXH32_state_t *statePtr);
  * canonical format.
  */
 
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
 typedef struct {
 
-  unsigned char digest[4];
+  unsigned char digest[4];                      /*!< Hash bytes, big endian */
 
 } XXH32_canonical_t;
 
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
                                             XXH32_hash_t       hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
 XXH_PUBLIC_API XXH32_hash_t
 XXH32_hashFromCanonical(const XXH32_canonical_t *src);
 
+  #ifdef __has_attribute
+    #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+  #else
+    #define XXH_HAS_ATTRIBUTE(x) 0
+  #endif
+
+  /* C-language Attributes are added in C23. */
+  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \
+      defined(__has_c_attribute)
+    #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+  #else
+    #define XXH_HAS_C_ATTRIBUTE(x) 0
+  #endif
+
+  #if defined(__cplusplus) && defined(__has_cpp_attribute)
+    #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+  #else
+    #define XXH_HAS_CPP_ATTRIBUTE(x) 0
+  #endif
+
+  /*
+  Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough'
+  attribute introduced in CPP17 and C23. CPP17 :
+  https://en.cppreference.com/w/cpp/language/attributes/fallthrough C23   :
+  https://en.cppreference.com/w/c/language/attributes/fallthrough
+  */
+  #if XXH_HAS_C_ATTRIBUTE(x)
+    #define XXH_FALLTHROUGH [[fallthrough]]
+  #elif XXH_HAS_CPP_ATTRIBUTE(x)
+    #define XXH_FALLTHROUGH [[fallthrough]]
+  #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+    #define XXH_FALLTHROUGH __attribute__((fallthrough))
+  #else
+    #define XXH_FALLTHROUGH
+  #endif
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+
+ */
+
   #ifndef XXH_NO_LONG_LONG
     /*-**********************************************************************
      *  64-bit hash
      ************************************************************************/
-    #if !defined(__VMS) &&                                     \
+    #if defined(XXH_DOXYGEN)                    /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+    #elif !defined(__VMS) &&                                   \
         (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
                                   (__STDC_VERSION__ >= 199901L) /* C99 */))
       #include <stdint.h>
 typedef uint64_t XXH64_hash_t;
     #else
+      #include <limits.h>
+      #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+/* LP64 ABI says uint64_t is unsigned long */
+typedef unsigned long XXH64_hash_t;
+      #else
 /* the following type must have a width of 64-bit */
 typedef unsigned long long XXH64_hash_t;
+      #endif
     #endif
 
 /*!
- * XXH64():
- * Returns the 64-bit hash of sequence of length @length stored at memory
- * address @input.
- * @seed can be used to alter the result predictably.
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results. It provides a superior level of
+ *   dispersion, and greatly reduces the risks of collisions.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
  *
  * This function usually runs faster on 64-bit systems, but slower on 32-bit
  * systems (see benchmark).
  *
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
  */
 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
                                   XXH64_hash_t seed);
 
 /*******   Streaming   *******/
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
 typedef struct XXH64_state_s XXH64_state_t;              /* incomplete type */
 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
 XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t *statePtr);
@@ -439,12 +766,15 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
 XXH_PUBLIC_API XXH64_hash_t
 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
 
-/*-**********************************************************************
- *  XXH3 64-bit variant
- ************************************************************************/
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup xxh3_family XXH3 family
+ * @ingroup public
+ * @{
 
-/* ************************************************************************
- * XXH3 is a new hash algorithm featuring:
+ *
+ * XXH3 is a more recent hash algorithm featuring:
  *  - Improved speed for both small and large inputs
  *  - True 64-bit and 128-bit outputs
  *  - SIMD acceleration
@@ -454,41 +784,38 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src);
  *
  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
  *
- * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
- * faster on small ones compared to XXH64, though exact differences depend on
- * the platform.
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
  *
- * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
- * on all platforms.
- *
- * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
- *
- * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
- * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
- * explained in the implementation.
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+ * can run XXH3 at competitive speeds, even without vector support.
+ * Further details are explained in the implementation.
  *
  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
- * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
  *
  * XXH3 offers 2 variants, _64bits and _128bits.
- * When only 64 bits are needed, prefer calling the _64bits variant, as it
- * reduces the amount of mixing, resulting in faster speed on small inputs.
  *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
  * It's also generally simpler to manipulate a scalar return type than a struct.
  *
- * The 128-bit version adds additional strength, but it is slightly slower.
- *
- * Return values of XXH3 and XXH128 are officially finalized starting
- * with v0.8.0 and will no longer change in future versions.
- * Avoid storing values from before that release in long-term storage.
- *
- * Results produced by v0.7.x are not comparable with results from v0.7.y.
- * However, the API is completely stable, and it can safely be used for
- * ephemeral data (local sessions).
- *
  * The API supports one-shot hashing, streaming mode, and custom secrets.
  */
 
+/*-**********************************************************************
+ *  XXH3 64-bit variant
+ ************************************************************************/
+
 /* XXH3_64bits():
  * default 64-bit variant, using default secret and default seed of 0.
  * It's the fastest variant. */
@@ -504,20 +831,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
                                                  XXH64_hash_t seed);
 
-    /*
-     * XXH3_64bits_withSecret():
-     * It's possible to provide any blob of bytes as a "secret" to generate the
-     * hash. This makes it more difficult for an external actor to prepare an
-     * intentional collision. The main condition is that secretSize *must* be
-     * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced
-     * hash values depends on secret's entropy. Technically, the secret must
-     * look like a bunch of random bytes. Avoid "trivial" or structured data
-     * such as repeated sequences or a text document. Whenever unsure about the
-     * "randomness" of the blob of bytes, consider relabelling it as a "custom
-     * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate
-     * a high entropy secret derived from the custom seed.
+    /*!
+     * The bare minimum size for a custom secret.
+     *
+     * @see
+     *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+     *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
      */
     #define XXH3_SECRET_SIZE_MIN 136
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the
+ * hash. This makes it more difficult for an external actor to prepare an
+ * intentional collision. The main condition is that secretSize *must* be large
+ * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash
+ * values depends on secret's entropy. Technically, the secret must look like a
+ * bunch of random bytes. Avoid "trivial" or structured data such as repeated
+ * sequences or a text document. Whenever unsure about the "randomness" of the
+ * blob of bytes, consider relabelling it as a "custom seed" instead, and employ
+ * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived
+ * from the custom seed.
+ */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
                                                    const void *secret,
                                                    size_t      secretSize);
@@ -529,6 +864,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
  * As a consequence, streaming is slower than one-shot hashing.
  * For better performance, prefer one-shot functions whenever applicable.
  */
+
+/*!
+ * @brief The state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
 typedef struct XXH3_state_s XXH3_state_t;
 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
@@ -572,10 +913,16 @@ XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest(const XXH3_state_t *statePtr);
  *  XXH3 128-bit variant
  ************************************************************************/
 
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
 typedef struct {
 
-  XXH64_hash_t low64;
-  XXH64_hash_t high64;
+  XXH64_hash_t low64;                     /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+  XXH64_hash_t high64;                                   /*!< `value >> 64` */
 
 } XXH128_hash_t;
 
@@ -649,6 +996,9 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
 
   #endif                                                /* XXH_NO_LONG_LONG */
 
+/*!
+ * @}
+ */
 #endif                                         /* XXHASH_H_5627135585666179 */
 
 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
@@ -660,7 +1010,7 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
  * These declarations should only be used with static linking.
  * Never use them in association with dynamic linking!
  *****************************************************************************
- */
+*/
 
 /*
  * These definitions are only present to allow static allocation
@@ -668,41 +1018,72 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
  * Never **ever** access their members directly.
  */
 
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
 struct XXH32_state_s {
 
-  XXH32_hash_t total_len_32;
-  XXH32_hash_t large_len;
-  XXH32_hash_t v1;
-  XXH32_hash_t v2;
-  XXH32_hash_t v3;
-  XXH32_hash_t v4;
-  XXH32_hash_t mem32[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t
-      reserved; /* never read nor write, might be removed in a future version */
+  XXH32_hash_t total_len_32;          /*!< Total length hashed, modulo 2^32 */
+  XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref
+                                total_len_32 overflow) */
+  XXH32_hash_t v1;                              /*!< First accumulator lane */
+  XXH32_hash_t v2;                             /*!< Second accumulator lane */
+  XXH32_hash_t v3;                              /*!< Third accumulator lane */
+  XXH32_hash_t v4;                             /*!< Fourth accumulator lane */
+  XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as
+                                unsigned char[16]. */
+  XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem32 */
+  XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may
+                            be removed. */
 
 };                                            /* typedef'd to XXH32_state_t */
 
   #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
 
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
 struct XXH64_state_s {
 
-  XXH64_hash_t total_len;
-  XXH64_hash_t v1;
-  XXH64_hash_t v2;
-  XXH64_hash_t v3;
-  XXH64_hash_t v4;
-  XXH64_hash_t mem64[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t reserved32;                   /* required for padding anyway */
-  XXH64_hash_t reserved64; /* never read nor write, might be removed in a future
-                              version */
+  XXH64_hash_t total_len;  /*!< Total length hashed. This is always 64-bit. */
+  XXH64_hash_t v1;                              /*!< First accumulator lane */
+  XXH64_hash_t v2;                             /*!< Second accumulator lane */
+  XXH64_hash_t v3;                              /*!< Third accumulator lane */
+  XXH64_hash_t v4;                             /*!< Fourth accumulator lane */
+  XXH64_hash_t mem64[4];   /*!< Internal buffer for partial reads. Treated as
+                              unsigned char[32]. */
+  XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem64 */
+  XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+  XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it
+                              may be removed. */
 
 };                                            /* typedef'd to XXH64_state_t */
 
-    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)  /* C11+ */
+    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \
+                                                                    */
       #include <stdalign.h>
       #define XXH_ALIGN(n) alignas(n)
+    #elif defined(__cplusplus) && (__cplusplus >= 201103L)      /* >= C++11 */
+      /* In C++ alignas() is a keyword */
+      #define XXH_ALIGN(n) alignas(n)
     #elif defined(__GNUC__)
       #define XXH_ALIGN(n) __attribute__((aligned(n)))
     #elif defined(_MSC_VER)
@@ -713,39 +1094,94 @@ struct XXH64_state_s {
 
     /* Old GCC versions only accept the attribute after the type in structures.
      */
-    #if !(defined(__STDC_VERSION__) &&              \
-          (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+    #if !(defined(__STDC_VERSION__) &&                                        \
+          (__STDC_VERSION__ >= 201112L))                       /* C11+ */     \
+        && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
         && defined(__GNUC__)
       #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
     #else
       #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
     #endif
 
+    /*!
+     * @brief The size of the internal XXH3 buffer.
+     *
+     * This is the optimal update size for incremental hashing.
+     *
+     * @see XXH3_64b_update(), XXH3_128b_update().
+     */
     #define XXH3_INTERNALBUFFER_SIZE 256
+
+    /*!
+     * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+     *
+     * This is the size used in @ref XXH3_kSecret and the seeded functions.
+     *
+     * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+     */
     #define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Do never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
 struct XXH3_state_s {
 
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-  /* used to store a custom secret generated from a seed */
+  /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref
+   * XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+  /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
-  XXH32_hash_t         bufferedSize;
-  XXH32_hash_t         reserved32;
-  size_t               nbStripesSoFar;
-  XXH64_hash_t         totalLen;
-  size_t               nbStripesPerBlock;
-  size_t               secretLimit;
-  XXH64_hash_t         seed;
-  XXH64_hash_t         reserved64;
-  const unsigned char *extSecret; /* reference to external secret;
-                                   * if == NULL, use .customSecret instead */
+  /*!< The internal buffer. @see XXH32_state_s::mem32 */
+  XXH32_hash_t bufferedSize;
+  /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+  XXH32_hash_t reserved32;
+  /*!< Reserved field. Needed for padding on 64-bit. */
+  size_t nbStripesSoFar;
+  /*!< Number of stripes processed. */
+  XXH64_hash_t totalLen;
+  /*!< Total length hashed. 64-bit even on 32-bit targets. */
+  size_t nbStripesPerBlock;
+  /*!< Number of stripes per block. */
+  size_t secretLimit;
+  /*!< Size of @ref customSecret or @ref extSecret */
+  XXH64_hash_t seed;
+  /*!< Seed for _withSeed variants. Must be zero otherwise, @see
+   * XXH3_INITSTATE() */
+  XXH64_hash_t reserved64;
+  /*!< Reserved field. */
+  const unsigned char *extSecret;
+  /*!< Reference to an external secret for the _withSecret variants, NULL
+   *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
 
 };                                             /* typedef'd to XXH3_state_t */
 
     #undef XXH_ALIGN_MEMBER
 
-    /* When the XXH3_state_t structure is merely emplaced on stack,
+    /*!
+     * @brief Initializes a stack-allocated `XXH3_state_s`.
+     *
+     * When the @ref XXH3_state_t structure is merely emplaced on stack,
      * it should be initialized with XXH3_INITSTATE() or a memset()
      * in case its first reset uses XXH3_NNbits_reset_withSeed().
      * This init can be omitted if the first reset uses default or _withSecret
@@ -802,7 +1238,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
                                     XXH64_hash_t seed);
 
   #endif                                                /* XXH_NO_LONG_LONG */
-
   #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
     #define XXH_IMPLEMENTATION
   #endif
@@ -844,81 +1279,183 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
   /* *************************************
    *  Tuning parameters
    ***************************************/
+
   /*!
-   * XXH_FORCE_MEMORY_ACCESS:
-   * By default, access to unaligned memory is controlled by `memcpy()`, which
-   * is safe and portable.
-   *
-   * Unfortunately, on some target/compiler combinations, the generated assembly
-   * is sub-optimal.
+   * @defgroup tuning Tuning parameters
+   * @{
+
    *
-   * The below switch allow selection of a different access method
-   * in the search for improved performance.
-   * Method 0 (default):
-   *     Use `memcpy()`. Safe and portable. Default.
-   * Method 1:
-   *     `__attribute__((packed))` statement. It depends on compiler extensions
-   *     and is therefore not portable.
-   *     This method is safe if your compiler supports it, and *generally* as
-   *     fast or faster than `memcpy`.
-   * Method 2:
-   *     Direct access via cast. This method doesn't depend on the compiler but
-   *     violates the C standard.
-   *     It can generate buggy code on targets which do not support unaligned
-   *     memory accesses.
-   *     But in some circumstances, it's the only known way to get the most
-   *     performance (example: GCC + ARMv6)
-   * Method 3:
-   *     Byteshift. This can generate the best code on old compilers which don't
-   *     inline small `memcpy()` calls, and it might also be faster on
-   * big-endian systems which lack a native byteswap instruction. See
-   * https://stackoverflow.com/a/32095106/646947 for details. Prefer these
-   * methods in priority order (0 > 1 > 2 > 3)
+   * Various macros to control xxHash's behavior.
    */
+  #ifdef XXH_DOXYGEN
+    /*!
+     * @brief Define this to disable 64-bit code.
+     *
+     * Useful if only using the @ref xxh32_family and you have a strict C90
+     * compiler.
+     */
+    #define XXH_NO_LONG_LONG
+    #undef XXH_NO_LONG_LONG                               /* don't actually */
+    /*!
+     * @brief Controls how unaligned memory is accessed.
+     *
+     * By default, access to unaligned memory is controlled by `memcpy()`, which
+     * is safe and portable.
+     *
+     * Unfortunately, on some target/compiler combinations, the generated
+     * assembly is sub-optimal.
+     *
+     * The below switch allow selection of a different access method
+     * in the search for improved performance.
+     *
+     * @par Possible options:
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+     *   @par
+     *     Use `memcpy()`. Safe and portable. Note that most modern compilers
+     * will eliminate the function call and treat it as an unaligned access.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
+     *   @par
+     *     Depends on compiler extensions and is therefore not portable.
+     *     This method is safe _if_ your compiler supports it,
+     *     and *generally* as fast or faster than `memcpy`.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+     *  @par
+     *     Casts directly and dereferences. This method doesn't depend on the
+     *     compiler, but it violates the C standard as it directly dereferences
+     * an unaligned pointer. It can generate buggy code on targets which do not
+     *     support unaligned memory accesses, but in some circumstances, it's
+     * the only known way to get the most performance.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+     *  @par
+     *     Also portable. This can generate the best code on old compilers which
+     * don't inline small `memcpy()` calls, and it might also be faster on
+     * big-endian systems which lack a native byteswap instruction. However,
+     * some compilers will emit literal byteshifts even if the target supports
+     * unaligned access.
+     *  .
+     *
+     * @warning
+     *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+     *   care, as what works on one compiler/platform/optimization level may
+     * cause another to read garbage data or even crash.
+     *
+     * See https://stackoverflow.com/a/32095106/646947 for details.
+     *
+     * Prefer these methods in priority order (0 > 3 > 1 > 2)
+     */
+    #define XXH_FORCE_MEMORY_ACCESS 0
+    /*!
+     * @def XXH_ACCEPT_NULL_INPUT_POINTER
+     * @brief Whether to add explicit `NULL` checks.
+     *
+     * If the input pointer is `NULL` and the length is non-zero, xxHash's
+     * default behavior is to dereference it, triggering a segfault.
+     *
+     * When this macro is enabled, xxHash actively checks the input for a null
+     * pointer. If it is, the result for null input pointers is the same as a
+     * zero-length input.
+     */
+    #define XXH_ACCEPT_NULL_INPUT_POINTER 0
+    /*!
+     * @def XXH_FORCE_ALIGN_CHECK
+     * @brief If defined to non-zero, adds a special path for aligned inputs
+     * (XXH32() and XXH64() only).
+     *
+     * This is an important performance trick for architectures without decent
+     * unaligned memory access performance.
+     *
+     * It checks for input alignment, and when conditions are met, uses a "fast
+     * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+     * faster_ read speed.
+     *
+     * The check costs one initial branch per hash, which is generally
+     * negligible, but not zero.
+     *
+     * Moreover, it's not useful to generate an additional code path if memory
+     * access uses the same instruction for both aligned and unaligned
+     * addresses (e.g. x86 and aarch64).
+     *
+     * In these cases, the alignment check can be removed by setting this macro
+     * to 0. Then the code will always use unaligned memory access. Align check
+     * is automatically disabled on x86, x64 & arm64, which are platforms known
+     * to offer good unaligned memory accesses performance.
+     *
+     * This option does not affect XXH3 (only XXH32 and XXH64).
+     */
+    #define XXH_FORCE_ALIGN_CHECK 0
+
+    /*!
+     * @def XXH_NO_INLINE_HINTS
+     * @brief When non-zero, sets all functions to `static`.
+     *
+     * By default, xxHash tries to force the compiler to inline almost all
+     * internal functions.
+     *
+     * This can usually improve performance due to reduced jumping and improved
+     * constant folding, but significantly increases the size of the binary
+     * which might not be favorable.
+     *
+     * Additionally, sometimes the forced inlining can be detrimental to
+     * performance, depending on the architecture.
+     *
+     * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+     * compiler full control on whether to inline or not.
+     *
+     * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+     * -fno-inline with GCC or Clang, this will automatically be defined.
+     */
+    #define XXH_NO_INLINE_HINTS 0
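+    /*
+     * For example, a size-sensitive build might pass -DXXH_NO_INLINE_HINTS=1
+     * on the compiler command line so that the compiler alone decides what
+     * to inline.
+     */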
+
+    /*!
+     * @def XXH_REROLL
+     * @brief Whether to reroll `XXH32_finalize`.
+     *
+     * For performance, `XXH32_finalize` uses an unrolled loop
+     * in the form of a switch statement.
+     *
+     * This is not always desirable, as it generates larger code,
+     * and depending on the architecture, may even be slower.
+     *
+     * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
+     */
+    #define XXH_REROLL 0
+
+    /*!
+     * @internal
+     * @brief Redefines old internal names.
+     *
+     * For compatibility with code that uses xxHash's internals before the names
+     * were changed to improve namespacing. There is no other reason to use
+     * this.
+     */
+    #define XXH_OLD_NAMES
+    #undef XXH_OLD_NAMES                 /* don't actually use, it is ugly. */
+  #endif                                                     /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
   #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
                                      line for example */
-    #if !defined(__clang__) && defined(__GNUC__) &&                \
-        defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \
-        (__ARM_ARCH == 6)
-      #define XXH_FORCE_MEMORY_ACCESS 2
-    #elif !defined(__clang__) &&                            \
-        ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
-         (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+  /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
+    #if !defined(__clang__) &&                                          \
+        ((defined(__INTEL_COMPILER) && !defined(_WIN32)) ||             \
+         (defined(__GNUC__) &&                                          \
+          ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) ||                  \
+           (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \
+            (!defined(__mips16) || defined(__mips_mips16e2))))))
       #define XXH_FORCE_MEMORY_ACCESS 1
     #endif
   #endif
 
-  /*!
-   * XXH_ACCEPT_NULL_INPUT_POINTER:
-   * If the input pointer is NULL, xxHash's default behavior is to dereference
-   * it, triggering a segfault. When this macro is enabled, xxHash actively
-   * checks the input for a null pointer. If it is, the result for null input
-   * pointers is the same as a zero-length input.
-   */
   #ifndef XXH_ACCEPT_NULL_INPUT_POINTER        /* can be defined externally */
     #define XXH_ACCEPT_NULL_INPUT_POINTER 0
   #endif
 
-  /*!
-   * XXH_FORCE_ALIGN_CHECK:
-   * This is an important performance trick
-   * for architectures without decent unaligned memory access performance.
-   * It checks for input alignment, and when conditions are met,
-   * uses a "fast path" employing direct 32-bit/64-bit read,
-   * resulting in _dramatically faster_ read speed.
-   *
-   * The check costs one initial branch per hash, which is generally negligible,
-   * but not zero. Moreover, it's not useful to generate binary for an
-   * additional code path if memory access uses same instruction for both
-   * aligned and unaligned adresses.
-   *
-   * In these cases, the alignment check can be removed by setting this macro to
-   * 0. Then the code will always use unaligned memory access. Align check is
-   * automatically disabled on x86, x64 & arm64, which are platforms known to
-   * offer good unaligned memory accesses performance.
-   *
-   * This option does not affect XXH3 (only XXH32 and XXH64).
-   */
   #ifndef XXH_FORCE_ALIGN_CHECK                /* can be defined externally */
     #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
         defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)  /* visual */
@@ -928,25 +1465,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
     #endif
   #endif
 
-  /*!
-   * XXH_NO_INLINE_HINTS:
-   *
-   * By default, xxHash tries to force the compiler to inline almost all
-   * internal functions.
-   *
-   * This can usually improve performance due to reduced jumping and improved
-   * constant folding, but significantly increases the size of the binary which
-   * might not be favorable.
-   *
-   * Additionally, sometimes the forced inlining can be detrimental to
-   * performance, depending on the architecture.
-   *
-   * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
-   * compiler full control on whether to inline or not.
-   *
-   * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
-   * -fno-inline with GCC or Clang, this will automatically be defined.
-   */
   #ifndef XXH_NO_INLINE_HINTS
     #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
         || defined(__NO_INLINE__)                       /* -O0, -fno-inline */
@@ -956,44 +1474,57 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
     #endif
   #endif
 
-  /*!
-   * XXH_REROLL:
-   * Whether to reroll XXH32_finalize, and XXH64_finalize,
-   * instead of using an unrolled jump table/if statement loop.
-   *
-   * This is automatically defined on -Os/-Oz on GCC and Clang.
-   */
   #ifndef XXH_REROLL
-    #if defined(__OPTIMIZE_SIZE__)
+    #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
+        (defined(__GNUC__) && !defined(__clang__))
+    /* The if/then loop is preferable to switch/case on gcc (on x64) */
       #define XXH_REROLL 1
     #else
       #define XXH_REROLL 0
     #endif
   #endif
 
+  /*!
+   * @defgroup impl Implementation
+   * @{
+
+   */
+
   /* *************************************
    *  Includes & Memory related functions
    ***************************************/
-  /*!
+  /*
    * Modify the local functions below should you wish to use
    * different memory routines for malloc() and free()
    */
   #include <stdlib.h>
 
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
 static void *XXH_malloc(size_t s) {
 
   return malloc(s);
 
 }
 
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
 static void XXH_free(void *p) {
 
   free(p);
 
 }
 
-  /*! and for memcpy() */
   #include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
 static void *XXH_memcpy(void *dest, const void *src, size_t size) {
 
   return memcpy(dest, src, size);
@@ -1037,7 +1568,11 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
   /* *************************************
    *  Debug
    ***************************************/
-  /*
+  /*!
+   * @ingroup tuning
+   * @def XXH_DEBUGLEVEL
+   * @brief Sets the debugging level.
+   *
    * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
    * compiler's command line options. The value must be a number.
    */
@@ -1057,12 +1592,58 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
   #endif
 
   /* note: use after variable declarations */
-  #define XXH_STATIC_ASSERT(c)            \
-    do {                                  \
-                                          \
-      enum { XXH_sa = 1 / (int)(!!(c)) }; \
-                                          \
-    } while (0)
+  #ifndef XXH_STATIC_ASSERT
+    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11 */
+      #include <assert.h>
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          static_assert((c), m);                   \
+                                                   \
+        } while (0)
+    #elif defined(__cplusplus) && (__cplusplus >= 201103L)         /* C++11 */
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          static_assert((c), m);                   \
+                                                   \
+        } while (0)
+    #else
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          struct xxh_sa {                          \
+                                                   \
+            char x[(c) ? 1 : -1];                  \
+                                                   \
+          };                                       \
+                                                   \
+        } while (0)
+    #endif
+    #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
+  #endif
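+  /*
+   * Usage sketch: the macro is intended to be placed after variable
+   * declarations inside a function body, e.g.
+   *
+   *   XXH_STATIC_ASSERT(sizeof(xxh_u32) == 4);  // fails to compile if false
+   *
+   * On C11/C++11 it expands to static_assert(); otherwise the negative array
+   * size in struct xxh_sa produces the compile-time error.
+   */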
+
+  /*!
+   * @internal
+   * @def XXH_COMPILER_GUARD(var)
+   * @brief Used to prevent unwanted optimizations for @p var.
+   *
+   * It uses an empty GCC inline assembly statement with a register constraint
+   * which forces @p var into a general purpose register (eg eax, ebx, ecx
+   * on x86) and marks it as modified.
+   *
+   * This is used in a few places to avoid unwanted autovectorization (e.g.
+   * XXH32_round()). All vectorization we want is explicit via intrinsics,
+   * and _usually_ isn't wanted elsewhere.
+   *
+   * We also use it to prevent unwanted constant folding for AArch64 in
+   * XXH3_initCustomSecret_scalar().
+   */
+  #ifdef __GNUC__
+    #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var))
+  #else
+    #define XXH_COMPILER_GUARD(var) ((void)0)
+  #endif
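+  /*
+   * In practice the guard is applied to hot scalar values, e.g. in
+   * XXH32_round() below:
+   *
+   *   acc *= XXH_PRIME32_1;
+   *   XXH_COMPILER_GUARD(acc);   // pins acc to a general purpose register,
+   *                              // blocking autovectorization of the loop
+   *
+   * On compilers without GNU inline assembly it expands to ((void)0), so it
+   * never changes results, only (possibly) the generated code.
+   */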
 
   /* *************************************
    *  Basic Types
@@ -1085,6 +1666,56 @@ typedef XXH32_hash_t xxh_u32;
 
 /* ***   Memory access   *** */
 
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
   /*
    * Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1146,16 +1777,23 @@ static xxh_u32 XXH_read32(const void *memPtr) {
 
   #endif                                  /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
 
-/* ***   Endianess   *** */
-typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
+  /* ***   Endianness   *** */
 
   /*!
-   * XXH_CPU_LITTLE_ENDIAN:
+   * @ingroup tuning
+   * @def XXH_CPU_LITTLE_ENDIAN
+   * @brief Whether the target is little endian.
+   *
    * Defined to 1 if the target is little endian, or 0 if it is big endian.
    * It can be defined externally, for example on the compiler command line.
    *
-   * If it is not defined, a runtime check (which is usually constant folded)
-   * is used instead.
+   * If it is not defined,
+   * a runtime check (which is usually constant folded) is used instead.
+   *
+   * @note
+   *   This is not necessarily defined to an integer constant.
+   *
+   * @see XXH_isLittleEndian() for the runtime check.
    */
   #ifndef XXH_CPU_LITTLE_ENDIAN
     /*
@@ -1170,8 +1808,11 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       #define XXH_CPU_LITTLE_ENDIAN 0
     #else
-/*
- * runtime test, presumed to simplify to a constant by compiler
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
  */
 static int XXH_isLittleEndian(void) {
 
@@ -1189,7 +1830,7 @@ static int XXH_isLittleEndian(void) {
   return one.c[0];
 
 }
-
+\
       #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
     #endif
   #endif
@@ -1205,6 +1846,19 @@ static int XXH_isLittleEndian(void) {
     #define XXH_HAS_BUILTIN(x) 0
   #endif
 
+  /*!
+   * @internal
+   * @def XXH_rotl32(x,r)
+   * @brief 32-bit rotate left.
+   *
+   * @param x The 32-bit integer to be rotated.
+   * @param r The number of bits to rotate.
+   * @pre
+   *   @p r > 0 && @p r < 32
+   * @note
+   *   @p x and @p r may be evaluated multiple times.
+   * @return The rotated result.
+   */
   #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
       XXH_HAS_BUILTIN(__builtin_rotateleft64)
     #define XXH_rotl32 __builtin_rotateleft32
@@ -1219,6 +1873,14 @@ static int XXH_isLittleEndian(void) {
     #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
   #endif
 
+  /*!
+   * @internal
+   * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+   * @brief A 32-bit byteswap.
+   *
+   * @param x The 32-bit integer to byteswap.
+   * @return @p x, byteswapped.
+   */
   #if defined(_MSC_VER)                                    /* Visual Studio */
     #define XXH_swap32 _byteswap_ulong
   #elif XXH_GCC_VERSION >= 403
@@ -1236,7 +1898,17 @@ static xxh_u32 XXH_swap32(xxh_u32 x) {
 /* ***************************
  *  Memory reads
  *****************************/
-typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+
+  XXH_aligned,                                                 /*!< Aligned */
+  XXH_unaligned                                     /*!< Possibly unaligned */
+
+} XXH_alignment;
 
   /*
    * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
@@ -1295,6 +1967,7 @@ XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void *  ptr,
 /* *************************************
  *  Misc
  ***************************************/
+/*! @ingroup public */
 XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
 
   return XXH_VERSION_NUMBER;
@@ -1304,16 +1977,19 @@ XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
 /* *******************************************************************
  *  32-bit hash functions
  *********************************************************************/
-static const xxh_u32 XXH_PRIME32_1 =
-    0x9E3779B1U;                      /* 0b10011110001101110111100110110001 */
-static const xxh_u32 XXH_PRIME32_2 =
-    0x85EBCA77U;                      /* 0b10000101111010111100101001110111 */
-static const xxh_u32 XXH_PRIME32_3 =
-    0xC2B2AE3DU;                      /* 0b11000010101100101010111000111101 */
-static const xxh_u32 XXH_PRIME32_4 =
-    0x27D4EB2FU;                      /* 0b00100111110101001110101100101111 */
-static const xxh_u32 XXH_PRIME32_5 =
-    0x165667B1U;                      /* 0b00010110010101100110011110110001 */
+/*!
+ * @}
+ * @defgroup xxh32_impl XXH32 implementation
+ * @ingroup impl
+ * @{
+
+ */
+/* #define instead of static const, to be used as initializers */
+  #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
+  #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
+  #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
+  #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
+  #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
 
   #ifdef XXH_OLD_NAMES
     #define PRIME32_1 XXH_PRIME32_1
@@ -1323,19 +1999,29 @@ static const xxh_u32 XXH_PRIME32_5 =
     #define PRIME32_5 XXH_PRIME32_5
   #endif
 
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
 
   acc += input * XXH_PRIME32_2;
   acc = XXH_rotl32(acc, 13);
   acc *= XXH_PRIME32_1;
-  #if defined(__GNUC__) && defined(__SSE4_1__) && \
+  #if (defined(__SSE4_1__) || defined(__aarch64__)) && \
       !defined(XXH_ENABLE_AUTOVECTORIZE)
   /*
    * UGLY HACK:
-   * This inline assembly hack forces acc into a normal register. This is the
-   * only thing that prevents GCC and Clang from autovectorizing the XXH32
-   * loop (pragmas and attributes don't work for some resason) without globally
-   * disabling SSE4.1.
+   * A compiler fence is the only thing that prevents GCC and Clang from
+   * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+   * reason) without globally disabling SSE4.1.
    *
    * The reason we want to avoid vectorization is because despite working on
    * 4 integers at a time, there are multiple factors slowing XXH32 down on
@@ -1360,28 +2046,26 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
    *   can load data, while v3 can multiply. SSE forces them to operate
    *   together.
    *
-   * How this hack works:
-   * __asm__(""       // Declare an assembly block but don't declare any
-   * instructions :       // However, as an Input/Output Operand,
-   *          "+r"    // constrain a read/write operand (+) as a general purpose
-   * register (r). (acc)   // and set acc as the operand
-   * );
-   *
-   * Because of the 'r', the compiler has promised that seed will be in a
-   * general purpose register and the '+' says that it will be 'read/write',
-   * so it has to assume it has changed. It is like volatile without all the
-   * loads and stores.
-   *
-   * Since the argument has to be in a normal register (not an SSE register),
-   * each time XXH32_round is called, it is impossible to vectorize.
+   * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+   * and it is pointless writing a NEON implementation that is basically the
+   * same speed as scalar for XXH32.
    */
-  __asm__("" : "+r"(acc));
+  XXH_COMPILER_GUARD(acc);
   #endif
   return acc;
 
 }
 
-/* mix all bits */
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param h32 The hash to avalanche.
+ * @return The avalanched hash.
+ */
 static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
 
   h32 ^= h32 >> 15;
@@ -1395,11 +2079,23 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
 
   #define XXH_get32bits(p) XXH_readLE32_align(p, align)
 
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param h32 The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ */
 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
-
-  /* dummy comment */
-
+\
   #define XXH_PROCESS1                           \
     do {                                         \
                                                  \
@@ -1443,20 +2139,20 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 12:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 8:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 4:
         XXH_PROCESS4;
         return XXH32_avalanche(h32);
 
       case 13:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 9:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 5:
         XXH_PROCESS4;
         XXH_PROCESS1;
@@ -1464,10 +2160,10 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 14:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 10:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 6:
         XXH_PROCESS4;
         XXH_PROCESS1;
@@ -1476,22 +2172,22 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 15:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 11:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 7:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 3:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 2:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 1:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 0:
         return XXH32_avalanche(h32);
 
@@ -1512,10 +2208,18 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
     #undef XXH_PROCESS4
   #endif
 
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input, len, seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
                                             xxh_u32 seed, XXH_alignment align) {
 
-  const xxh_u8 *bEnd = input + len;
+  const xxh_u8 *bEnd = input ? input + len : NULL;
   xxh_u32       h32;
 
   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -1565,6 +2269,7 @@ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
                                   XXH32_hash_t seed) {
 
@@ -1574,9 +2279,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
     XXH32_reset(&state, seed);
     XXH32_update(&state, (const xxh_u8*)input, len);
     return XXH32_digest(&state);
-
   #else
-
   if (XXH_FORCE_ALIGN_CHECK) {
 
     if ((((size_t)input) & 3) ==
@@ -1593,13 +2296,16 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
 }
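+/*
+ * Usage sketch (names as declared in this header; error checking omitted):
+ * the one-shot call
+ *
+ *   XXH32_hash_t h = XXH32(buf, buf_len, 0);            // 0 = seed
+ *
+ * yields the same value as the streaming API defined below:
+ *
+ *   XXH32_state_t *st = XXH32_createState();
+ *   XXH32_reset(st, 0);
+ *   XXH32_update(st, buf, buf_len);
+ *   XXH32_hash_t h2 = XXH32_digest(st);                 // h2 == h
+ *   XXH32_freeState(st);
+ */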
 
 /*******   Hash streaming   *******/
-
+/*!
+ * @ingroup xxh32_family
+ */
 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
 
   return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
 
   XXH_free(statePtr);
@@ -1607,6 +2313,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dstState,
                                     const XXH32_state_t *srcState) {
 
@@ -1614,6 +2321,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dstState,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
                                          XXH32_hash_t   seed) {
 
@@ -1630,6 +2338,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
                                           const void *input, size_t len) {
 
@@ -1719,6 +2428,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
 
   xxh_u32 h32;
@@ -1743,7 +2453,8 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
 
 /*******   Canonical representation   *******/
 
-/*
+/*!
+ * @ingroup xxh32_family
  * The default return values from XXH functions are unsigned 32 and 64 bit
  * integers.
  *
@@ -1765,6 +2476,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t
 XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
 
@@ -1777,7 +2489,12 @@ XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
 /* *******************************************************************
  *  64-bit hash functions
  *********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
 
+ */
 /*******   Memory access   *******/
 
 typedef XXH64_hash_t xxh_u64;
@@ -1786,40 +2503,6 @@ typedef XXH64_hash_t xxh_u64;
       #define U64 xxh_u64
     #endif
 
-    /*!
-     * XXH_REROLL_XXH64:
-     * Whether to reroll the XXH64_finalize() loop.
-     *
-     * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
-     * performance gain on 64-bit hosts, as only one jump is required.
-     *
-     * However, on 32-bit hosts, because arithmetic needs to be done with two
-     * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't
-     * beneficial to unroll. The code becomes ridiculously large (the largest
-     * function in the binary on i386!), and rerolling it saves anywhere from
-     * 3kB to 20kB. It is also slightly faster because it fits into cache better
-     * and is more likely to be inlined by the compiler.
-     *
-     * If XXH_REROLL is defined, this is ignored and the loop is always
-     * rerolled.
-     */
-    #ifndef XXH_REROLL_XXH64
-      #if (defined(__ILP32__) ||                                              \
-           defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
-          || !(defined(__x86_64__) || defined(_M_X64) ||                      \
-               defined(_M_AMD64) /* x86-64 */                                 \
-               || defined(_M_ARM64) || defined(__aarch64__) ||                \
-               defined(__arm64__) /* aarch64 */                               \
-               || defined(__PPC64__) || defined(__PPC64LE__) ||               \
-               defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */       \
-               || defined(__mips64__) || defined(__mips64)) /* mips64 */      \
-          || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX)  /* check limits */
-        #define XXH_REROLL_XXH64 1
-      #else
-        #define XXH_REROLL_XXH64 0
-      #endif
-    #endif                                    /* !defined(XXH_REROLL_XXH64) */
-
     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
     /*
      * Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1950,23 +2633,35 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void *  ptr,
 
 }
 
-/*******   xxh64   *******/
+    /*******   xxh64   *******/
+    /*!
+     * @}
+     * @defgroup xxh64_impl XXH64 implementation
+     * @ingroup impl
+     * @{
 
-static const xxh_u64 XXH_PRIME64_1 =
-    0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
-                            */
-static const xxh_u64 XXH_PRIME64_2 =
-    0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
-                            */
-static const xxh_u64 XXH_PRIME64_3 =
-    0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
-                            */
-static const xxh_u64 XXH_PRIME64_4 =
-    0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
-                            */
-static const xxh_u64 XXH_PRIME64_5 =
-    0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
-                            */
+     */
+    /* #define rather than static const, to be used as initializers */
+    #define XXH_PRIME64_1                                                                         \
+      0x9E3779B185EBCA87ULL /*!<                                                                  \
+                               0b1001111000110111011110011011000110000101111010111100101010000111 \
+                             */
+    #define XXH_PRIME64_2                                                                         \
+      0xC2B2AE3D27D4EB4FULL /*!<                                                                  \
+                               0b1100001010110010101011100011110100100111110101001110101101001111 \
+                             */
+    #define XXH_PRIME64_3                                                                         \
+      0x165667B19E3779F9ULL /*!<                                                                  \
+                               0b0001011001010110011001111011000110011110001101110111100111111001 \
+                             */
+    #define XXH_PRIME64_4                                                                         \
+      0x85EBCA77C2B2AE63ULL /*!<                                                                  \
+                               0b1000010111101011110010100111011111000010101100101010111001100011 \
+                             */
+    #define XXH_PRIME64_5                                                                         \
+      0x27D4EB2F165667C5ULL /*!<                                                                  \
+                               0b0010011111010100111010110010111100010110010101100110011111000101 \
+                             */
 
     #ifdef XXH_OLD_NAMES
       #define PRIME64_1 XXH_PRIME64_1
@@ -2010,185 +2705,35 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
 
-    /* dummy comment */
-
-    #define XXH_PROCESS1_64                        \
-      do {                                         \
-                                                   \
-        h64 ^= (*ptr++) * XXH_PRIME64_5;           \
-        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
-                                                   \
-      } while (0)
-
-    #define XXH_PROCESS4_64                                        \
-      do {                                                         \
-                                                                   \
-        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;      \
-        ptr += 4;                                                  \
-        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
-                                                                   \
-      } while (0)
-
-    #define XXH_PROCESS8_64                                        \
-      do {                                                         \
-                                                                   \
-        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));     \
-        ptr += 8;                                                  \
-        h64 ^= k1;                                                 \
-        h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
-                                                                   \
-      } while (0)
-
-  /* Rerolled version for 32-bit targets is faster and much smaller. */
-  if (XXH_REROLL || XXH_REROLL_XXH64) {
-
-    len &= 31;
-    while (len >= 8) {
-
-      XXH_PROCESS8_64;
-      len -= 8;
-
-    }
-
-    if (len >= 4) {
-
-      XXH_PROCESS4_64;
-      len -= 4;
-
-    }
-
-    while (len > 0) {
+  len &= 31;
+  while (len >= 8) {
 
-      XXH_PROCESS1_64;
-      --len;
+    xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+    ptr += 8;
+    h64 ^= k1;
+    h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+    len -= 8;
 
-    }
+  }
 
-    return XXH64_avalanche(h64);
+  if (len >= 4) {
 
-  } else {
+    h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+    ptr += 4;
+    h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+    len -= 4;
 
-    switch (len & 31) {
+  }
 
-      case 24:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 16:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 8:
-        XXH_PROCESS8_64;
-        return XXH64_avalanche(h64);
-
-      case 28:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 20:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 12:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 4:
-        XXH_PROCESS4_64;
-        return XXH64_avalanche(h64);
-
-      case 25:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 17:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 9:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 29:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 21:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 13:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 5:
-        XXH_PROCESS4_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 26:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 18:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 10:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 30:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 22:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 14:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 6:
-        XXH_PROCESS4_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 27:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 19:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 11:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 31:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 23:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 15:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 7:
-        XXH_PROCESS4_64;
-        /* fallthrough */
-      case 3:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 2:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 1:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 0:
-        return XXH64_avalanche(h64);
+  while (len > 0) {
 
-    }
+    h64 ^= (*ptr++) * XXH_PRIME64_5;
+    h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
+    --len;
 
   }
 
-  /* impossible to reach */
-  XXH_ASSERT(0);
-  return 0;          /* unreachable, but some compilers complain without it */
+  return XXH64_avalanche(h64);
 
 }
 
@@ -2205,7 +2750,7 @@ static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
                                             xxh_u64 seed, XXH_alignment align) {
 
-  const xxh_u8 *bEnd = input + len;
+  const xxh_u8 *bEnd = input ? input + len : NULL;
   xxh_u64       h64;
 
     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -2259,6 +2804,7 @@ XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
                                   XXH64_hash_t seed) {
 
@@ -2268,9 +2814,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
     XXH64_reset(&state, seed);
     XXH64_update(&state, (const xxh_u8*)input, len);
     return XXH64_digest(&state);
-
     #else
-
   if (XXH_FORCE_ALIGN_CHECK) {
 
     if ((((size_t)input) & 7) ==
@@ -2289,12 +2833,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
 
 /*******   Hash Streaming   *******/
 
+/*! @ingroup xxh64_family*/
 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
 
   return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
 
   XXH_free(statePtr);
@@ -2302,6 +2848,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *      dstState,
                                     const XXH64_state_t *srcState) {
 
@@ -2309,6 +2856,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *      dstState,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
                                          XXH64_hash_t   seed) {
 
@@ -2325,6 +2873,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
                                           const void *input, size_t len) {
 
@@ -2403,6 +2952,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
 
   xxh_u64 h64;
@@ -2436,6 +2986,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
 
 /******* Canonical representation   *******/
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
                                             XXH64_hash_t       hash) {
 
@@ -2445,6 +2996,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t
 XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
 
@@ -2452,380 +3004,452 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
 
 }
 
-  /* *********************************************************************
-   *  XXH3
-   *  New generation hash designed for speed on small keys and vectorization
-   ************************************************************************ */
+    #ifndef XXH_NO_XXH3
 
-  /* ===   Compiler specifics   === */
+    /* *********************************************************************
+     *  XXH3
+     *  New generation hash designed for speed on small keys and vectorization
+     ************************************************************************ */
+    /*!
+     * @}
+     * @defgroup xxh3_impl XXH3 implementation
+     * @ingroup impl
+     * @{
 
-    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  /* >= C99 */
-      #define XXH_RESTRICT restrict
-    #else
-      /* Note: it might be useful to define __restrict or __restrict__ for some
-       * C++ compilers */
-      #define XXH_RESTRICT                                       /* disable */
-    #endif
+     */
 
-    #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
-        (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
-        defined(__clang__)
-      #define XXH_likely(x) __builtin_expect(x, 1)
-      #define XXH_unlikely(x) __builtin_expect(x, 0)
-    #else
-      #define XXH_likely(x) (x)
-      #define XXH_unlikely(x) (x)
-    #endif
+    /* ===   Compiler specifics   === */
 
-    #if defined(__GNUC__)
-      #if defined(__AVX2__)
-        #include <immintrin.h>
-      #elif defined(__SSE2__)
-        #include <emmintrin.h>
-      #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-        #define inline __inline__                 /* circumvent a clang bug */
-        #include <arm_neon.h>
-        #undef inline
+      #if ((defined(sun) || defined(__sun)) &&                                \
+           __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \
+                           with GCC 5.5 */
+        #define XXH_RESTRICT                                     /* disable */
+      #elif defined(__STDC_VERSION__) && \
+          __STDC_VERSION__ >= 199901L                             /* >= C99 */
+        #define XXH_RESTRICT restrict
+      #else
+        /* Note: it might be useful to define __restrict or __restrict__ for
+         * some C++ compilers */
+        #define XXH_RESTRICT                                     /* disable */
       #endif
-    #elif defined(_MSC_VER)
-      #include <intrin.h>
-    #endif
-
-    /*
-     * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
-     * remaining a true 64-bit/128-bit hash function.
-     *
-     * This is done by prioritizing a subset of 64-bit operations that can be
-     * emulated without too many steps on the average 32-bit machine.
-     *
-     * For example, these two lines seem similar, and run equally fast on
-     * 64-bit:
-     *
-     *   xxh_u64 x;
-     *   x ^= (x >> 47); // good
-     *   x ^= (x >> 13); // bad
-     *
-     * However, to a 32-bit machine, there is a major difference.
-     *
-     * x ^= (x >> 47) looks like this:
-     *
-     *   x.lo ^= (x.hi >> (47 - 32));
-     *
-     * while x ^= (x >> 13) looks like this:
-     *
-     *   // note: funnel shifts are not usually cheap.
-     *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
-     *   x.hi ^= (x.hi >> 13);
-     *
-     * The first one is significantly faster than the second, simply because the
-     * shift is larger than 32. This means:
-     *  - All the bits we need are in the upper 32 bits, so we can ignore the
-     * lower 32 bits in the shift.
-     *  - The shift result will always fit in the lower 32 bits, and therefore,
-     *    we can ignore the upper 32 bits in the xor.
-     *
-     * Thanks to this optimization, XXH3 only requires these features to be
-     * efficient:
-     *
-     *  - Usable unaligned access
-     *  - A 32-bit or 64-bit ALU
-     *      - If 32-bit, a decent ADC instruction
-     *  - A 32 or 64-bit multiply with a 64-bit result
-     *  - For the 128-bit variant, a decent byteswap helps short inputs.
-     *
-     * The first two are already required by XXH32, and almost all 32-bit and
-     * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
-     *
-     * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
-     * notable exception.
-     *
-     * First of all, Thumb-1 lacks support for the UMULL instruction which
-     * performs the important long multiply. This means numerous __aeabi_lmul
-     * calls.
-     *
-     * Second of all, the 8 functional registers are just not enough.
-     * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
-     * need Lo registers, and this shuffling results in thousands more MOVs than
-     * A32.
-     *
-     * A32 and T32 don't have this limitation. They can access all 14 registers,
-     * do a 32->64 multiply with UMULL, and the flexible operand allowing free
-     * shifts is helpful, too.
-     *
-     * Therefore, we do a quick sanity check.
-     *
-     * If compiling Thumb-1 for a target which supports ARM instructions, we
-     * will emit a warning, as it is not a "sane" platform to compile for.
-     *
-     * Usually, if this happens, it is because of an accident and you probably
-     * need to specify -march, as you likely meant to compile for a newer
-     * architecture.
-     *
-     * Credit: large sections of the vectorial and asm source code paths
-     *         have been contributed by @easyaspi314
-     */
-    #if defined(__thumb__) && !defined(__thumb2__) && \
-        defined(__ARM_ARCH_ISA_ARM)
-      #warning "XXH3 is highly inefficient without ARM or Thumb-2."
-    #endif
 
-    /* ==========================================
-     * Vectorization detection
-     * ========================================== */
-    #define XXH_SCALAR 0                         /* Portable scalar version */
-    #define XXH_SSE2 1                 /* SSE2 for Pentium 4 and all x86_64 */
-    #define XXH_AVX2 2                    /* AVX2 for Haswell and Bulldozer */
-    #define XXH_AVX512 3                  /* AVX512 for Skylake and Icelake */
-    #define XXH_NEON 4             /* NEON for most ARMv7-A and all AArch64 */
-    #define XXH_VSX 5                     /* VSX and ZVector for POWER8/z13 */
-
-    #ifndef XXH_VECTOR                    /* can be defined on command line */
-      #if defined(__AVX512F__)
-        #define XXH_VECTOR XXH_AVX512
-      #elif defined(__AVX2__)
-        #define XXH_VECTOR XXH_AVX2
-      #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
-          (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-        #define XXH_VECTOR XXH_SSE2
-      #elif defined(__GNUC__) /* msvc support maybe later */                   \
-          && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&                 \
-          (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
-           || (defined(__BYTE_ORDER__) &&                                      \
-               __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
-        #define XXH_VECTOR XXH_NEON
-      #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
-          (defined(__s390x__) && defined(__VEC__)) &&             \
-              defined(__GNUC__)                             /* TODO: IBM XL */
-        #define XXH_VECTOR XXH_VSX
+      #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
+          (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
+          defined(__clang__)
+        #define XXH_likely(x) __builtin_expect(x, 1)
+        #define XXH_unlikely(x) __builtin_expect(x, 0)
       #else
-        #define XXH_VECTOR XXH_SCALAR
+        #define XXH_likely(x) (x)
+        #define XXH_unlikely(x) (x)
       #endif
-    #endif
 
-    /*
-     * Controls the alignment of the accumulator,
-     * for compatibility with aligned vector loads, which are usually faster.
-     */
-    #ifndef XXH_ACC_ALIGN
-      #if defined(XXH_X86DISPATCH)
-        #define XXH_ACC_ALIGN 64           /* for compatibility with avx512 */
-      #elif XXH_VECTOR == XXH_SCALAR                              /* scalar */
-        #define XXH_ACC_ALIGN 8
-      #elif XXH_VECTOR == XXH_SSE2                                  /* sse2 */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_AVX2                                  /* avx2 */
-        #define XXH_ACC_ALIGN 32
-      #elif XXH_VECTOR == XXH_NEON                                  /* neon */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_VSX                                    /* vsx */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_AVX512                              /* avx512 */
-        #define XXH_ACC_ALIGN 64
+      #if defined(__GNUC__)
+        #if defined(__AVX2__)
+          #include <immintrin.h>
+        #elif defined(__SSE2__)
+          #include <emmintrin.h>
+        #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+          #define inline __inline__               /* circumvent a clang bug */
+          #include <arm_neon.h>
+          #undef inline
+        #endif
+      #elif defined(_MSC_VER)
+        #include <intrin.h>
       #endif
-    #endif
 
-    #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
-        XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
-      #define XXH_SEC_ALIGN XXH_ACC_ALIGN
-    #else
-      #define XXH_SEC_ALIGN 8
-    #endif
-
-    /*
-     * UGLY HACK:
-     * GCC usually generates the best code with -O3 for xxHash.
-     *
-     * However, when targeting AVX2, it is overzealous in its unrolling
-     * resulting in code roughly 3/4 the speed of Clang.
-     *
-     * There are other issues, such as GCC splitting _mm256_loadu_si256 into
-     * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
-     * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
-     *
-     * That is why when compiling the AVX2 version, it is recommended to use
-     * either -O2 -mavx2 -march=haswell or -O2 -mavx2
-     * -mno-avx256-split-unaligned-load for decent performance, or to use Clang
-     * instead.
-     *
-     * Fortunately, we can control the first one with a pragma that forces GCC
-     * into -O2, but the other one we can't control without "failed to inline
-     * always inline function due to target mismatch" warnings.
-     */
-    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-        && defined(__OPTIMIZE__) &&                                      \
-        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
-      #pragma GCC push_options
-      #pragma GCC optimize("-O2")
-    #endif
-
-    #if XXH_VECTOR == XXH_NEON
       /*
-       * NEON's setup for vmlal_u32 is a little more complicated than it is on
-       * SSE2, AVX2, and VSX.
+       * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+       * remaining a true 64-bit/128-bit hash function.
        *
-       * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
-       * upcast.
+       * This is done by prioritizing a subset of 64-bit operations that can be
+       * emulated without too many steps on the average 32-bit machine.
        *
-       * To do the same operation, the 128-bit 'Q' register needs to be split
-       * into two 64-bit 'D' registers, performing this operation::
+       * For example, these two lines seem similar, and run equally fast on
+       * 64-bit:
        *
-       *   [                a                 |                 b ] |
-       * '---------. .--------'                | |                         x |
-       *            |              .---------' '--------.                |
-       *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32 ]
+       *   xxh_u64 x;
+       *   x ^= (x >> 47); // good
+       *   x ^= (x >> 13); // bad
        *
-       * Due to significant changes in aarch64, the fastest method for aarch64
-       * is completely different than the fastest method for ARMv7-A.
+       * However, to a 32-bit machine, there is a major difference.
        *
-       * ARMv7-A treats D registers as unions overlaying Q registers, so
-       * modifying D11 will modify the high half of Q5. This is similar to how
-       * modifying AH will only affect bits 8-15 of AX on x86.
+       * x ^= (x >> 47) looks like this:
        *
-       * VZIP takes two registers, and puts even lanes in one register and odd
-       * lanes in the other.
+       *   x.lo ^= (x.hi >> (47 - 32));
        *
-       * On ARMv7-A, this strangely modifies both parameters in place instead of
-       * taking the usual 3-operand form.
+       * while x ^= (x >> 13) looks like this:
        *
-       * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on
-       * the lower and upper halves of the Q register to end up with the high
-       * and low halves where we want - all in one instruction.
+       *   // note: funnel shifts are not usually cheap.
+       *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+       *   x.hi ^= (x.hi >> 13);
        *
-       *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1],
-       * d11[1] }
+       * The first one is significantly faster than the second, simply because
+       * the shift is larger than 32. This means:
+       *  - All the bits we need are in the upper 32 bits, so we can ignore the
+       * lower 32 bits in the shift.
+       *  - The shift result will always fit in the lower 32 bits, and
+       * therefore, we can ignore the upper 32 bits in the xor.
        *
-       * Unfortunately we need inline assembly for this: Instructions modifying
-       * two registers at once is not possible in GCC or Clang's IR, and they
-       * have to create a copy.
+       * Thanks to this optimization, XXH3 only requires these features to be
+       * efficient:
        *
-       * aarch64 requires a different approach.
+       *  - Usable unaligned access
+       *  - A 32-bit or 64-bit ALU
+       *      - If 32-bit, a decent ADC instruction
+       *  - A 32 or 64-bit multiply with a 64-bit result
+       *  - For the 128-bit variant, a decent byteswap helps short inputs.
        *
-       * In order to make it easier to write a decent compiler for aarch64, many
-       * quirks were removed, such as conditional execution.
+       * The first two are already required by XXH32, and almost all 32-bit and
+       * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
        *
-       * NEON was also affected by this.
+       * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is
+       * one notable exception.
        *
-       * aarch64 cannot access the high bits of a Q-form register, and writes to
-       * a D-form register zero the high bits, similar to how writes to W-form
-       * scalar registers (or DWORD registers on x86_64) work.
+       * First of all, Thumb-1 lacks support for the UMULL instruction which
+       * performs the important long multiply. This means numerous __aeabi_lmul
+       * calls.
        *
-       * The formerly free vget_high intrinsics now require a vext (with a few
-       * exceptions)
+       * Second of all, the 8 functional registers are just not enough.
+       * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
+       * need Lo registers, and this shuffling results in thousands more MOVs
+       * than A32.
        *
-       * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
-       * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
-       * only modify one operand.
+       * A32 and T32 don't have this limitation. They can access all 14
+       * registers, do a 32->64 multiply with UMULL, and the flexible operand
+       * allowing free shifts is helpful, too.
        *
-       * The equivalent of the VZIP.32 on the lower and upper halves would be
-       * this mess:
+       * Therefore, we do a quick sanity check.
        *
-       *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1]
-       * } zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] } zip2 v0.2s,
-       * v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+       * If compiling Thumb-1 for a target which supports ARM instructions, we
+       * will emit a warning, as it is not a "sane" platform to compile for.
        *
-       * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
-       * (SHRN):
+       * Usually, if this happens, it is because of an accident and you probably
+       * need to specify -march, as you likely meant to compile for a newer
+       * architecture.
        *
-       *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
-       *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
-       *
-       * This is available on ARMv7-A, but is less efficient than a single
-       * VZIP.32.
+       * Credit: large sections of the vectorial and asm source code paths
+       *         have been contributed by @easyaspi314
        */
+      #if defined(__thumb__) && !defined(__thumb2__) && \
+          defined(__ARM_ARCH_ISA_ARM)
+        #warning "XXH3 is highly inefficient without ARM or Thumb-2."
+      #endif
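
(Editorial aside, not part of the diff: the "long multiply" discussed above is
the plain C 32x32->64 product sketched below; on A32/T32 it lowers to a single
UMULL, whereas Thumb-1 has to fall back to a runtime helper such as
__aeabi_lmul. The helper name below is hypothetical.)

#include <stdint.h>

/* 32x32 -> 64 multiply: one UMULL on A32/T32, a libcall on Thumb-1. */
static uint64_t mul32x32_64(uint32_t x, uint32_t y) {

  return (uint64_t)x * (uint64_t)y;

}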
+
+    /* ==========================================
+     * Vectorization detection
+     * ========================================== */
+
+      #ifdef XXH_DOXYGEN
+        /*!
+         * @ingroup tuning
+         * @brief Overrides the vectorization implementation chosen for XXH3.
+         *
+         * Can be defined to 0 to disable SIMD, or to any of the values
+         * mentioned in @ref XXH_VECTOR_TYPE.
+         *
+         * If this is not defined, it uses predefined macros to determine the
+         * best implementation.
+         */
+        #define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * @ref XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+
+  XXH_SCALAR = 0,                              /*!< Portable scalar version */
+  XXH_SSE2 = 1,   /*!<
+                   * SSE2 for Pentium 4, Opteron, all x86_64.
+                   *
+                   * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                   * Android x86.
+                   */
+  XXH_AVX2 = 2,                         /*!< AVX2 for Haswell and Bulldozer */
+  XXH_AVX512 = 3,                       /*!< AVX512 for Skylake and Icelake */
+  XXH_NEON = 4,                  /*!< NEON for most ARMv7-A and all AArch64 */
+  XXH_VSX = 5,                 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+
+};
+
+        /*!
+         * @ingroup tuning
+         * @brief Selects the minimum alignment for XXH3's accumulators.
+         *
+         * When using SIMD, this should match the alignment required for said
+         * vector type, so, for example, 32 for AVX2.
+         *
+         * Default: Auto detected.
+         */
+        #define XXH_ACC_ALIGN 8
+      #endif
+
+      /* Actual definition */
+      #ifndef XXH_DOXYGEN
+        #define XXH_SCALAR 0
+        #define XXH_SSE2 1
+        #define XXH_AVX2 2
+        #define XXH_AVX512 3
+        #define XXH_NEON 4
+        #define XXH_VSX 5
+      #endif
+
+      #ifndef XXH_VECTOR                  /* can be defined on command line */
+        #if defined(__AVX512F__)
+          #define XXH_VECTOR XXH_AVX512
+        #elif defined(__AVX2__)
+          #define XXH_VECTOR XXH_AVX2
+        #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
+            (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+          #define XXH_VECTOR XXH_SSE2
+        #elif defined(__GNUC__) /* msvc support maybe later */               \
+            && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&             \
+            (defined(                                                        \
+                 __LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+             || (defined(__BYTE_ORDER__) &&                                  \
+                 __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+          #define XXH_VECTOR XXH_NEON
+        #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
+            (defined(__s390x__) && defined(__VEC__)) &&             \
+                defined(__GNUC__)                           /* TODO: IBM XL */
+          #define XXH_VECTOR XXH_VSX
+        #else
+          #define XXH_VECTOR XXH_SCALAR
+        #endif
+      #endif
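
(Editorial aside, not part of the diff: XXH_VECTOR can be pinned from the
build line instead of relying on the detection above. A minimal standalone
sketch, assuming a plain xxhash.h consumer rather than AFL++ itself; the
numeric value mirrors the "actual definition" block in this hunk.)

#define XXH_INLINE_ALL
#define XXH_VECTOR 0                                /* 0 == XXH_SCALAR */
#include "xxhash.h"
#include <stdio.h>

int main(void) {

  const char   msg[] = "hello";
  XXH64_hash_t h = XXH3_64bits(msg, sizeof(msg) - 1);
  printf("XXH3_64bits(\"hello\") = %016llx\n", (unsigned long long)h);
  return 0;

}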
 
       /*
-       * Function-like macro:
-       * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
-       * &outHi)
-       * {
-
-       *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
-       *     outHi = (uint32x2_t)(in >> 32);
-       *     in = UNDEFINED;
-       * }
+       * Controls the alignment of the accumulator,
+       * for compatibility with aligned vector loads, which are usually faster.
        */
-      #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-          && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
-        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
-          do {                                                                                         \
-                                                                                                       \
-            /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
-             * %f0 = upper D half */                                                                   \
-            /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
-             */                                                                                        \
-            /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
-             */                                                                                        \
-            __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
-            (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
-            (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
-                                                                                                       \
-          } while (0)
+      #ifndef XXH_ACC_ALIGN
+        #if defined(XXH_X86DISPATCH)
+          #define XXH_ACC_ALIGN 64         /* for compatibility with avx512 */
+        #elif XXH_VECTOR == XXH_SCALAR                            /* scalar */
+          #define XXH_ACC_ALIGN 8
+        #elif XXH_VECTOR == XXH_SSE2                                /* sse2 */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_AVX2                                /* avx2 */
+          #define XXH_ACC_ALIGN 32
+        #elif XXH_VECTOR == XXH_NEON                                /* neon */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_VSX                                  /* vsx */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_AVX512                            /* avx512 */
+          #define XXH_ACC_ALIGN 64
+        #endif
+      #endif
 
+      #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
+          XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+        #define XXH_SEC_ALIGN XXH_ACC_ALIGN
       #else
-        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-          do {                                       \
-                                                     \
-            (outLo) = vmovn_u64(in);                 \
-            (outHi) = vshrn_n_u64((in), 32);         \
-                                                     \
-          } while (0)
+        #define XXH_SEC_ALIGN 8
+      #endif
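
(Editorial aside, not part of the diff: the alignment selected above matters
because the SIMD paths perform aligned loads/stores on the accumulator. A
hedged C11 sketch of a caller-side scratch buffer; the variable name is
hypothetical, and 64 is simply the largest value XXH_ACC_ALIGN can take.)

#include <stdalign.h>
#include <stdint.h>

/* One 512-bit stripe worth of accumulator lanes, aligned for any SIMD path. */
static alignas(64) uint64_t scratch_acc[8];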
 
+      /*
+       * UGLY HACK:
+       * GCC usually generates the best code with -O3 for xxHash.
+       *
+       * However, when targeting AVX2, it is overzealous in its unrolling
+       * resulting in code roughly 3/4 the speed of Clang.
+       *
+       * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+       * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization
+       * which only applies to Sandy and Ivy Bridge... which don't even support
+       * AVX2.
+       *
+       * That is why when compiling the AVX2 version, it is recommended to use
+       * either -O2 -mavx2 -march=haswell or -O2 -mavx2
+       * -mno-avx256-split-unaligned-load for decent performance, or to use
+       * Clang instead.
+       *
+       * Fortunately, we can control the first one with a pragma that forces GCC
+       * into -O2, but the other one we can't control without "failed to inline
+       * always inline function due to target mismatch" warnings.
+       */
+      #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+          && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+          && defined(__OPTIMIZE__) &&                                      \
+          !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
+        #pragma GCC push_options
+        #pragma GCC optimize("-O2")
       #endif
-    #endif                                        /* XXH_VECTOR == XXH_NEON */
 
-    /*
-     * VSX and Z Vector helpers.
-     *
-     * This is very messy, and any pull requests to clean this up are welcome.
-     *
-     * There are a lot of problems with supporting VSX and s390x, due to
-     * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
-     */
-    #if XXH_VECTOR == XXH_VSX
-      #if defined(__s390x__)
-        #include <s390intrin.h>
-      #else
-        /* gcc's altivec.h can have the unwanted consequence to unconditionally
-         * #define bool, vector, and pixel keywords,
-         * with bad consequences for programs already using these keywords for
-         * other purposes. The paragraph defining these macros is skipped when
-         * __APPLE_ALTIVEC__ is defined.
-         * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
-         * compiler, but it seems that, in some cases, it isn't. Force the build
-         * macro to be defined, so that keywords are not altered.
+      #if XXH_VECTOR == XXH_NEON
+        /*
+         * NEON's setup for vmlal_u32 is a little more complicated than it is on
+         * SSE2, AVX2, and VSX.
+         *
+         * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
+         * upcast.
+         *
+         * To do the same operation, the 128-bit 'Q' register needs to be split
+         * into two 64-bit 'D' registers, performing this operation::
+         *
+         *   [                a                 |                 b            ]
+         *        |             '---------. .--------'             |
+         *        |                        x                       |
+         *        |             .---------' '--------.             |
+         *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32   |   b >> 32    ]
+         *
+         * Due to significant changes in aarch64, the fastest method for aarch64
+         * is completely different than the fastest method for ARMv7-A.
+         *
+         * ARMv7-A treats D registers as unions overlaying Q registers, so
+         * modifying D11 will modify the high half of Q5. This is similar to how
+         * modifying AH will only affect bits 8-15 of AX on x86.
+         *
+         * VZIP takes two registers, and puts even lanes in one register and odd
+         * lanes in the other.
+         *
+         * On ARMv7-A, this strangely modifies both parameters in place instead
+         * of taking the usual 3-operand form.
+         *
+         * Therefore, if we want to do this, we can simply use a D-form VZIP.32
+         * on the lower and upper halves of the Q register to end up with the
+         * high and low halves where we want - all in one instruction.
+         *
+         *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] };
+         *                            @ d11 = { d10[1], d11[1] }
+         *
+         * Unfortunately we need inline assembly for this: Instructions
+         * modifying two registers at once is not possible in GCC or Clang's IR,
+         * and they have to create a copy.
+         *
+         * aarch64 requires a different approach.
+         *
+         * In order to make it easier to write a decent compiler for aarch64,
+         * many quirks were removed, such as conditional execution.
+         *
+         * NEON was also affected by this.
+         *
+         * aarch64 cannot access the high bits of a Q-form register, and writes
+         * to a D-form register zero the high bits, similar to how writes to
+         * W-form scalar registers (or DWORD registers on x86_64) work.
+         *
+         * The formerly free vget_high intrinsics now require a vext (with a few
+         * exceptions)
+         *
+         * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
+         * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
+         * only modify one operand.
+         *
+         * The equivalent of the VZIP.32 on the lower and upper halves would be
+         * this mess:
+         *
+         *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+         *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+         *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+         *
+         * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
+         * (SHRN):
+         *
+         *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+         *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+         *
+         * This is available on ARMv7-A, but is less efficient than a single
+         * VZIP.32.
          */
-        #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
-          #define __APPLE_ALTIVEC__
-        #endif
-        #include <altivec.h>
-      #endif
 
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char      xxh_u8x16;
-typedef __vector unsigned           xxh_u32x4;
+        /*!
+         * Function-like macro:
+         * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
+         * &outHi)
+         * {
 
-      #ifndef XXH_VSX_BE
-        #if defined(__BIG_ENDIAN__) ||  \
-            (defined(__BYTE_ORDER__) && \
-             __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-          #define XXH_VSX_BE 1
-        #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
-            __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
-          #warning \
-              "-maltivec=be is not recommended. Please use native endianness."
-          #define XXH_VSX_BE 1
+         *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+         *     outHi = (uint32x2_t)(in >> 32);
+         *     in = UNDEFINED;
+         * }
+         */
+        #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+            && defined(__GNUC__) && !defined(__aarch64__) &&   \
+            !defined(__arm64__)
+          #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
+            do {                                                                                         \
+                                                                                                         \
+              /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
+               * %f0 = upper D half */                                                                   \
+              /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
+               */                                                                                        \
+              /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
+               */                                                                                        \
+              __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
+              (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
+              (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
+                                                                                                         \
+            } while (0)
         #else
-          #define XXH_VSX_BE 0
+          #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+            do {                                       \
+                                                       \
+              (outLo) = vmovn_u64(in);                 \
+              (outHi) = vshrn_n_u64((in), 32);         \
+                                                       \
+            } while (0)
         #endif
-      #endif                                        /* !defined(XXH_VSX_BE) */
+      #endif                                      /* XXH_VECTOR == XXH_NEON */
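
(Editorial aside, not part of the diff: whichever NEON variant is selected
above, the effect of XXH_SPLIT_IN_PLACE on one 64-bit lane is just a low/high
32-bit split. A scalar illustration with a hypothetical helper name.)

#include <stdint.h>

static void split_lane(uint64_t in, uint32_t *outLo, uint32_t *outHi) {

  *outLo = (uint32_t)(in & 0xFFFFFFFF);           /* vmovn_u64   / XTN      */
  *outHi = (uint32_t)(in >> 32);                  /* vshrn_n_u64 / SHRN #32 */

}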
 
-      #if XXH_VSX_BE
-        /* A wrapper for POWER9's vec_revb. */
-        #if defined(__POWER9_VECTOR__) || \
-            (defined(__clang__) && defined(__s390x__))
-          #define XXH_vec_revb vec_revb
+      /*
+       * VSX and Z Vector helpers.
+       *
+       * This is very messy, and any pull requests to clean this up are welcome.
+       *
+       * There are a lot of problems with supporting VSX and s390x, due to
+       * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+       */
+      #if XXH_VECTOR == XXH_VSX
+        #if defined(__s390x__)
+          #include <s390intrin.h>
         #else
+          /* gcc's altivec.h can have the unwanted consequence of
+           * unconditionally #defining the bool, vector, and pixel keywords,
+           * breaking programs that already use these keywords for other
+           * purposes. The paragraph defining these macros is skipped when
+           * __APPLE_ALTIVEC__ is defined.
+           * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
+           * compiler, but it seems that, in some cases, it isn't. Force the
+           * build macro to be defined, so that keywords are not altered.
+           */
+          #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+            #define __APPLE_ALTIVEC__
+          #endif
+          #include <altivec.h>
+        #endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char      xxh_u8x16;
+typedef __vector unsigned           xxh_u32x4;
+
+        #ifndef XXH_VSX_BE
+          #if defined(__BIG_ENDIAN__) ||  \
+              (defined(__BYTE_ORDER__) && \
+               __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            #define XXH_VSX_BE 1
+          #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
+              __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+            #warning \
+                "-maltivec=be is not recommended. Please use native endianness."
+            #define XXH_VSX_BE 1
+          #else
+            #define XXH_VSX_BE 0
+          #endif
+        #endif                                      /* !defined(XXH_VSX_BE) */
+
+        #if XXH_VSX_BE
+          #if defined(__POWER9_VECTOR__) || \
+              (defined(__clang__) && defined(__s390x__))
+            #define XXH_vec_revb vec_revb
+          #else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
 
   xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
@@ -2834,40 +3458,40 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
 
 }
 
-        #endif
-      #endif                                                  /* XXH_VSX_BE */
+          #endif
+        #endif                                                /* XXH_VSX_BE */
 
-/*
- * Performs an unaligned load and byte swaps it on big endian.
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
  */
 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
 
   xxh_u64x2 ret;
   memcpy(&ret, ptr, sizeof(xxh_u64x2));
-      #if XXH_VSX_BE
+        #if XXH_VSX_BE
   ret = XXH_vec_revb(ret);
-      #endif
+        #endif
   return ret;
 
 }
 
-      /*
-       * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
-       *
-       * These intrinsics weren't added until GCC 8, despite existing for a
-       * while, and they are endian dependent. Also, their meaning swap
-       * depending on version.
-       * */
-      #if defined(__s390x__)
-      /* s390x is always big endian, no issue on this platform */
-        #define XXH_vec_mulo vec_mulo
-        #define XXH_vec_mule vec_mule
-      #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
-        /* Clang has a better way to control this, we can just use the builtin
-         * which doesn't swap. */
-        #define XXH_vec_mulo __builtin_altivec_vmulouw
-        #define XXH_vec_mule __builtin_altivec_vmuleuw
-      #else
+        /*
+         * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+         *
+         * These intrinsics weren't added until GCC 8, despite existing for a
+         * while, and they are endian dependent. Also, their meanings swap
+         * depending on the version.
+         */
+        #if defined(__s390x__)
+        /* s390x is always big endian, no issue on this platform */
+          #define XXH_vec_mulo vec_mulo
+          #define XXH_vec_mule vec_mule
+        #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+          /* Clang has a better way to control this, we can just use the builtin
+           * which doesn't swap. */
+          #define XXH_vec_mulo __builtin_altivec_vmulouw
+          #define XXH_vec_mule __builtin_altivec_vmuleuw
+        #else
 /* gcc needs inline assembly */
 /* Adapted from
  * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
@@ -2887,40 +3511,41 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
 
 }
 
-      #endif                                  /* XXH_vec_mulo, XXH_vec_mule */
-    #endif                                         /* XXH_VECTOR == XXH_VSX */
+        #endif                                /* XXH_vec_mulo, XXH_vec_mule */
+      #endif                                       /* XXH_VECTOR == XXH_VSX */
 
-    /* prefetch
-     * can be disabled, by declaring XXH_NO_PREFETCH build macro */
-    #if defined(XXH_NO_PREFETCH)
-      #define XXH_PREFETCH(ptr) (void)(ptr)                     /* disabled */
-    #else
-      #if defined(_MSC_VER) && \
-          (defined(_M_X64) ||  \
-           defined(            \
-               _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
-        #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-        #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
-      #elif defined(__GNUC__) && \
-          ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
-        #define XXH_PREFETCH(ptr) \
-          __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-      #else
+      /* prefetch
+       * can be disabled by defining the XXH_NO_PREFETCH build macro */
+      #if defined(XXH_NO_PREFETCH)
         #define XXH_PREFETCH(ptr) (void)(ptr)                   /* disabled */
-      #endif
-    #endif                                               /* XXH_NO_PREFETCH */
+      #else
+        #if defined(_MSC_VER) && \
+            (defined(_M_X64) ||  \
+             defined(            \
+                 _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
+          #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+          #define XXH_PREFETCH(ptr) \
+            _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
+        #elif defined(__GNUC__) && \
+            ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+          #define XXH_PREFETCH(ptr) \
+            __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+        #else
+          #define XXH_PREFETCH(ptr) (void)(ptr)                 /* disabled */
+        #endif
+      #endif                                             /* XXH_NO_PREFETCH */
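
(Editorial aside, not part of the diff: the GCC/Clang branch above is purely a
hint. A standalone sketch of what it expands to, assuming a GNU-compatible
compiler; the wrapper name is hypothetical.)

static void prefetch_read(const void *ptr) {

  __builtin_prefetch(ptr, 0 /* read */, 3 /* high temporal locality */);

}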
 
-  /* ==========================================
-   * XXH3 default settings
-   * ========================================== */
+    /* ==========================================
+     * XXH3 default settings
+     * ========================================== */
 
-    #define XXH_SECRET_DEFAULT_SIZE 192     /* minimum XXH3_SECRET_SIZE_MIN */
+      #define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
 
-    #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
-      #error "default keyset is not large enough"
-    #endif
+      #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+        #error "default keyset is not large enough"
+      #endif
 
-/* Pseudorandom secret taken directly from FARSH */
+/*! Pseudorandom secret taken directly from FARSH. */
 XXH_ALIGN(64)
 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
 
@@ -2943,69 +3568,79 @@ static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
 
 };
 
-    #ifdef XXH_OLD_NAMES
-      #define kSecret XXH3_kSecret
-    #endif
+      #ifdef XXH_OLD_NAMES
+        #define kSecret XXH3_kSecret
+      #endif
 
-    /*
-     * Calculates a 32-bit to 64-bit long multiply.
-     *
-     * Wraps __emulu on MSVC x86 because it tends to call __allmul when it
-     * doesn't need to (but it shouldn't need to anyways, it is about 7
-     * instructions to do a 64x64 multiply...). Since we know that this will
-     * _always_ emit MULL, we use that instead of the normal method.
-     *
-     * If you are compiling for platforms like Thumb-1 and don't have a better
-     * option, you may also want to write your own long multiply routine here.
-     *
-     * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
-     * {
+      #ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it
+ * doesn't need to (but it shouldn't need to anyways, it is about 7 instructions
+ * to do a 64x64 multiply...). Since we know that this will _always_ emit
+ * `MULL`, we use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better
+ * option, you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
 
-     *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
-     * }
-     */
-    #if defined(_MSC_VER) && defined(_M_IX86)
-      #include <intrin.h>
-      #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
-    #else
-      /*
-       * Downcast + upcast is usually better than masking on older compilers
-       * like GCC 4.2 (especially 32-bit ones), all without affecting newer
-       * compilers.
-       *
-       * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
-       * operands and perform a full 64x64 multiply -- entirely redundant on
-       * 32-bit.
-       */
-      #define XXH_mult32to64(x, y) \
-        ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
-    #endif
+  return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
 
-/*
- * Calculates a 64->128-bit long multiply.
+}
+
+      #elif defined(_MSC_VER) && defined(_M_IX86)
+        #include <intrin.h>
+        #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+      #else
+        /*
+         * Downcast + upcast is usually better than masking on older compilers
+         * like GCC 4.2 (especially 32-bit ones), all without affecting newer
+         * compilers.
+         *
+         * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
+         * operands and perform a full 64x64 multiply -- entirely redundant on
+         * 32-bit.
+         */
+        #define XXH_mult32to64(x, y) \
+          ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+      #endif
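
(Editorial aside, not part of the diff: a quick numeric sanity check of the
downcast + upcast form defined above; only the low 32 bits of each operand
contribute to the product.)

#include <assert.h>
#include <stdint.h>

int main(void) {

  uint64_t p = (uint64_t)(uint32_t)0x1FFFFFFFFULL * (uint64_t)(uint32_t)3u;
  assert(p == 0xFFFFFFFFULL * 3);                 /* high bits are discarded */
  assert((uint64_t)(uint32_t)0xFFFFFFFFu * (uint64_t)(uint32_t)0xFFFFFFFFu ==
         0xFFFFFFFE00000001ULL);                  /* (2^32-1)^2 */
  return 0;

}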
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
  *
- * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ * @param lhs, rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
  */
 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
 
-    /*
-     * GCC/Clang __uint128_t method.
-     *
-     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
-     * This is usually the best way as it usually uses a native long 64-bit
-     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
-     *
-     * Usually.
-     *
-     * Despite being a 32-bit platform, Clang (and emscripten) define this type
-     * despite not having the arithmetic for it. This results in a laggy
-     * compiler builtin call which calculates a full 128-bit multiply.
-     * In that case it is best to use the portable one.
-     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
-     */
-    #if defined(__GNUC__) && !defined(__wasm__) && \
-            defined(__SIZEOF_INT128__) ||          \
-        (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+      /*
+       * GCC/Clang __uint128_t method.
+       *
+       * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+       * This is usually the best way as it usually uses a native long 64-bit
+       * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+       *
+       * Usually.
+       *
+       * Despite wasm being a 32-bit platform, Clang (and Emscripten) define
+       * this type without having the arithmetic for it. This results in a laggy
+       * compiler builtin call which calculates a full 128-bit multiply.
+       * In that case it is best to use the portable one.
+       * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+       */
+      #if defined(__GNUC__) && !defined(__wasm__) && \
+              defined(__SIZEOF_INT128__) ||          \
+          (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
 
   __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
   XXH128_hash_t     r128;
@@ -3013,19 +3648,19 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.high64 = (xxh_u64)(product >> 64);
   return r128;
 
-      /*
-       * MSVC for x64's _umul128 method.
-       *
-       * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
-       * *HighProduct);
-       *
-       * This compiles to single operand MUL on x64.
-       */
-    #elif defined(_M_X64) || defined(_M_IA64)
+        /*
+         * MSVC for x64's _umul128 method.
+         *
+         * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
+         * *HighProduct);
+         *
+         * This compiles to single operand MUL on x64.
+         */
+      #elif defined(_M_X64) || defined(_M_IA64)
 
-      #ifndef _MSC_VER
-        #pragma intrinsic(_umul128)
-      #endif
+        #ifndef _MSC_VER
+          #pragma intrinsic(_umul128)
+        #endif
   xxh_u64       product_high;
   xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
   XXH128_hash_t r128;
@@ -3033,7 +3668,7 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.high64 = product_high;
   return r128;
 
-    #else
+      #else
   /*
    * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
    *
@@ -3093,16 +3728,20 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.low64 = lower;
   r128.high64 = upper;
   return r128;
-    #endif
+      #endif
 
 }
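
(Editorial aside, not part of the diff: the portable scalar branch whose body
falls between the hunks above uses the usual four-partial-product schoolbook
scheme. A self-contained sketch of that idea, not the exact code elided here;
the helper name is hypothetical.)

#include <stdint.h>

static void mul64to128_portable(uint64_t a, uint64_t b,
                                uint64_t *lo, uint64_t *hi) {

  uint64_t const lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
  uint64_t const hi_lo = (a >> 32) * (b & 0xFFFFFFFF);
  uint64_t const lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
  uint64_t const hi_hi = (a >> 32) * (b >> 32);

  /* The middle sum cannot overflow 64 bits, so no explicit carry handling. */
  uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;

  *hi = hi_hi + (hi_lo >> 32) + (cross >> 32);
  *lo = (cross << 32) | (lo_lo & 0xFFFFFFFF);

}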
 
-/*
- * Does a 64-bit to 128-bit multiply, then XOR folds it.
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
  *
  * The reason for the separate function is to prevent passing too many structs
  * around by value. This will hopefully inline the multiply, but we don't force
  * it.
+ *
+ * @param lhs, rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
  */
 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
 
@@ -3111,7 +3750,7 @@ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
 
 }
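
(Editorial aside, not part of the diff: the XOR fold described above reduces
the 128-bit product to 64 bits. Under the GCC/Clang __uint128_t assumption it
is a one-liner; the function name is hypothetical.)

#include <stdint.h>

static uint64_t mul128_fold64(uint64_t lhs, uint64_t rhs) {

  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
  return (uint64_t)product ^ (uint64_t)(product >> 64);

}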
 
-/* Seems to produce slightly better code on GCC for some reason. */
+/*! Seems to produce slightly better code on GCC for some reason. */
 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
 
   XXH_ASSERT(0 <= shift && shift < 64);
@@ -3216,7 +3855,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
 
   XXH_ASSERT(input != NULL);
   XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(4 <= len && len < 8);
+  XXH_ASSERT(4 <= len && len <= 8);
   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
   {
 
@@ -3239,7 +3878,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
 
   XXH_ASSERT(input != NULL);
   XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(8 <= len && len <= 16);
+  XXH_ASSERT(9 <= len && len <= 16);
   {
 
     xxh_u64 const bitflip1 =
@@ -3306,11 +3945,10 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
                                      const xxh_u8 *XXH_RESTRICT secret,
                                      xxh_u64                    seed64) {
 
-    #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
-        && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
-        &&                                                             \
-        !defined(                                                      \
-            XXH_ENABLE_AUTOVECTORIZE)  /* Define to disable like XXH32 hack */
+      #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */      \
+          && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */          \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)     /* Define to disable like \
+                                                       XXH32 hack */
   /*
    * UGLY HACK:
    * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
@@ -3326,8 +3964,8 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
    * GCC generates much better scalar code than Clang for the rest of XXH3,
    * which is why finding a more optimal codepath is an interest.
    */
-  __asm__("" : "+r"(seed64));
-    #endif
+  XXH_COMPILER_GUARD(seed64);
+      #endif
   {
 
     xxh_u64 const input_lo = XXH_readLE64(input);
@@ -3381,7 +4019,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
 
 }
 
-    #define XXH3_MIDSIZE_MAX 240
+      #define XXH3_MIDSIZE_MAX 240
 
 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
     const xxh_u8 *XXH_RESTRICT input, size_t len,
@@ -3391,8 +4029,8 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
   (void)secretSize;
   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
 
-    #define XXH3_MIDSIZE_STARTOFFSET 3
-    #define XXH3_MIDSIZE_LASTOFFSET 17
+      #define XXH3_MIDSIZE_STARTOFFSET 3
+      #define XXH3_MIDSIZE_LASTOFFSET 17
 
   {
 
@@ -3407,31 +4045,31 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
 
     acc = XXH3_avalanche(acc);
     XXH_ASSERT(nbRounds >= 8);
-    #if defined(__clang__)                                /* Clang */ \
-        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
-      /*
-       * UGLY HACK:
-       * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
-       * In everywhere else, it uses scalar code.
-       *
-       * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
-       * would still be slower than UMAAL (see XXH_mult64to128).
-       *
-       * Unfortunately, Clang doesn't handle the long multiplies properly and
-       * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
-       * scalarized into an ugly mess of VMOV.32 instructions.
-       *
-       * This mess is difficult to avoid without turning autovectorization
-       * off completely, but they are usually relatively minor and/or not
-       * worth it to fix.
-       *
-       * This loop is the easiest to fix, as unlike XXH32, this pragma
-       * _actually works_ because it is a loop vectorization instead of an
-       * SLP vectorization.
-       */
-      #pragma clang loop vectorize(disable)
-    #endif
+      #if defined(__clang__)                                /* Clang */ \
+          && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+      #endif
     for (i = 8; i < nbRounds; i++) {
 
       acc +=
@@ -3450,17 +4088,17 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
 
 }
 
-  /* =======     Long Keys     ======= */
+    /* =======     Long Keys     ======= */
 
-    #define XXH_STRIPE_LEN 64
-    #define XXH_SECRET_CONSUME_RATE \
-      8                 /* nb of secret bytes consumed at each accumulation */
-    #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+      #define XXH_STRIPE_LEN 64
+      #define XXH_SECRET_CONSUME_RATE \
+        8               /* nb of secret bytes consumed at each accumulation */
+      #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
 
-    #ifdef XXH_OLD_NAMES
-      #define STRIPE_LEN XXH_STRIPE_LEN
-      #define ACC_NB XXH_ACC_NB
-    #endif
+      #ifdef XXH_OLD_NAMES
+        #define STRIPE_LEN XXH_STRIPE_LEN
+        #define ACC_NB XXH_ACC_NB
+      #endif
 
 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
 
@@ -3469,56 +4107,58 @@ XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
 
 }
 
-    /* Several intrinsic functions below are supposed to accept __int64 as
-     * argument, as documented in
-     * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
-     * several environments do not define __int64 type, requiring a workaround.
-     */
-    #if !defined(__VMS) &&                                     \
-        (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
-                                  (__STDC_VERSION__ >= 199901L) /* C99 */))
+      /* Several intrinsic functions below are supposed to accept __int64 as
+       * argument, as documented in
+       * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+       * However, several environments do not define __int64 type,
+       * requiring a workaround.
+       */
+      #if !defined(__VMS) &&                                     \
+          (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
+                                    (__STDC_VERSION__ >= 199901L) /* C99 */))
 typedef int64_t xxh_i64;
-    #else
+      #else
 /* the following type must have a width of 64-bit */
 typedef long long xxh_i64;
-    #endif
+      #endif
 
-  /*
-   * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
-   * most optimized.
-   *
-   * It is a hardened version of UMAC, based off of FARSH's implementation.
-   *
-   * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
-   * implementations, and it is ridiculously fast.
-   *
-   * We harden it by mixing the original input to the accumulators as well as
-   * the product.
-   *
-   * This means that in the (relatively likely) case of a multiply by zero, the
-   * original input is preserved.
-   *
-   * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
-   * cross-pollination, as otherwise the upper and lower halves would be
-   * essentially independent.
-   *
-   * This doesn't matter on 64-bit hashes since they all get merged together in
-   * the end, so we skip the extra step.
-   *
-   * Both XXH3_64bits and XXH3_128bits use this subroutine.
-   */
+    /*
+     * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
+     * most optimized.
+     *
+     * It is a hardened version of UMAC, based off of FARSH's implementation.
+     *
+     * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+     * implementations, and it is ridiculously fast.
+     *
+     * We harden it by mixing the original input to the accumulators as well as
+     * the product.
+     *
+     * This means that in the (relatively likely) case of a multiply by zero,
+     * the original input is preserved.
+     *
+     * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+     * cross-pollination, as otherwise the upper and lower halves would be
+     * essentially independent.
+     *
+     * This doesn't matter on 64-bit hashes since they all get merged together
+     * in the end, so we skip the extra step.
+     *
+     * Both XXH3_64bits and XXH3_128bits use this subroutine.
+     */
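
(Editorial aside, not part of the diff: the scalar flavour of the accumulator
step described above boils down to the per-lane update below. Sketch only,
assuming a little-endian host and 8 lanes; the names are hypothetical and
mirror, rather than reproduce, the elided scalar implementation.)

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void accumulate_lane(uint64_t xacc[8], const uint8_t *input,
                            const uint8_t *secret, size_t i) {

  uint64_t data_val, key64;
  memcpy(&data_val, input + 8 * i, 8);           /* unaligned 64-bit load */
  memcpy(&key64, secret + 8 * i, 8);
  {

    uint64_t const data_key = data_val ^ key64;
    xacc[i ^ 1] += data_val;                     /* keep the raw input around */
    xacc[i] += (data_key & 0xFFFFFFFF) * (data_key >> 32);     /* 32x32->64 */

  }

}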
 
-    #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
+      #if (XXH_VECTOR == XXH_AVX512) || \
+          (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
 
-      #ifndef XXH_TARGET_AVX512
-        #define XXH_TARGET_AVX512               /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_AVX512
+          #define XXH_TARGET_AVX512             /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
     const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
+  __m512i *const xacc = (__m512i *)acc;
   XXH_ASSERT((((size_t)acc) & 63) == 0);
   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
 
@@ -3576,8 +4216,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
   {
 
-    XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
-    const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+    __m512i *const xacc = (__m512i *)acc;
+    const __m512i  prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
 
     /* xacc[0] ^= (xacc[0] >> 47) */
     __m512i const acc_vec = *xacc;
@@ -3609,19 +4249,21 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
 
     int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
     __m512i const seed = _mm512_mask_set1_epi64(
-        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
+        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
 
-    XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m512i *const       dest = (__m512i *)customSecret;
-    int                                i;
+    const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret);
+    __m512i *const       dest = (__m512i *)customSecret;
+    int                  i;
+    XXH_ASSERT(((size_t)src & 63) == 0);               /* control alignment */
+    XXH_ASSERT(((size_t)dest & 63) == 0);
     for (i = 0; i < nbRounds; ++i) {
 
       /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void
-       * const*', this will warn "discards ‘const’ qualifier". */
+       * const*', this will warn "discards 'const' qualifier". */
       union {
 
-        XXH_ALIGN(64) const __m512i *cp;
-        XXH_ALIGN(64) void *p;
+        const __m512i *cp;
+        void *         p;
 
       } remote_const_void;
 
@@ -3635,13 +4277,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
+      #if (XXH_VECTOR == XXH_AVX2) || \
+          (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
 
-      #ifndef XXH_TARGET_AVX2
-        #define XXH_TARGET_AVX2                 /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_AVX2
+          #define XXH_TARGET_AVX2               /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3650,7 +4293,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
   XXH_ASSERT((((size_t)acc) & 31) == 0);
   {
 
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    __m256i *const xacc = (__m256i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason.
      */
@@ -3692,7 +4335,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
   XXH_ASSERT((((size_t)acc) & 31) == 0);
   {
 
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    __m256i *const xacc = (__m256i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
     const __m256i *const xsecret = (const __m256i *)secret;
@@ -3732,24 +4375,23 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
   XXH_PREFETCH(customSecret);
   {
 
-    __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
-                                           -(xxh_i64)seed64, (xxh_i64)seed64);
+    __m256i const seed =
+        _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64,
+                          (xxh_i64)(0U - seed64), (xxh_i64)seed64);
 
-    XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m256i *            dest = (__m256i *)customSecret;
+    const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret);
+    __m256i *            dest = (__m256i *)customSecret;
 
-      #if defined(__GNUC__) || defined(__clang__)
+        #if defined(__GNUC__) || defined(__clang__)
     /*
      * On GCC & Clang, marking 'dest' as modified will cause the compiler:
      *   - do not extract the secret from sse registers in the internal loop
      *   - use less common registers, and avoid pushing these reg into stack
-     * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
-     * customSecret, and on aarch64, this prevented LDP from merging two
-     * loads together for free. Putting the loads together before the stores
-     * properly generates LDP.
      */
-    __asm__("" : "+r"(dest));
-      #endif
+    XXH_COMPILER_GUARD(dest);
+        #endif
+    XXH_ASSERT(((size_t)src & 31) == 0);               /* control alignment */
+    XXH_ASSERT(((size_t)dest & 31) == 0);
 
     /* GCC -O2 need unroll loop manually */
     dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
@@ -3763,13 +4405,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+      /* x86dispatch always generates SSE2 */
+      #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
 
-      #ifndef XXH_TARGET_SSE2
-        #define XXH_TARGET_SSE2                 /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_SSE2
+          #define XXH_TARGET_SSE2               /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3779,7 +4422,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    __m128i *const xacc = (__m128i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
     const __m128i *const xinput = (const __m128i *)input;
@@ -3820,7 +4463,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    __m128i *const xacc = (__m128i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
     const __m128i *const xsecret = (const __m128i *)secret;
@@ -3859,30 +4502,34 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
 
     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
 
-      #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-    // MSVC 32bit mode does not support _mm_set_epi64x before 2015
+        #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+    /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
     XXH_ALIGN(16)
-    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
+    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
     __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
-      #else
-    __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
-      #endif
+        #else
+    __m128i const seed =
+        _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+        #endif
     int i;
 
-    XXH_ALIGN(64) const float *const  src = (float const *)XXH3_kSecret;
-    XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
-      #if defined(__GNUC__) || defined(__clang__)
+    const void *const src16 = XXH3_kSecret;
+    __m128i *         dst16 = (__m128i *)customSecret;
+        #if defined(__GNUC__) || defined(__clang__)
     /*
      * On GCC & Clang, marking 'dest' as modified will cause the compiler:
      *   - do not extract the secret from sse registers in the internal loop
      *   - use less common registers, and avoid pushing these reg into stack
      */
-    __asm__("" : "+r"(dest));
-      #endif
+    XXH_COMPILER_GUARD(dst16);
+        #endif
+    XXH_ASSERT(((size_t)src16 & 15) == 0);             /* control alignment */
+    XXH_ASSERT(((size_t)dst16 & 15) == 0);
 
     for (i = 0; i < nbRounds; ++i) {
 
-      dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
+      dst16[i] =
+          _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed);
 
     }
 
@@ -3890,9 +4537,9 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_NEON)
+      #if (XXH_VECTOR == XXH_NEON)
 
 XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3901,7 +4548,7 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
+    uint64x2_t *const xacc = (uint64x2_t *)acc;
     /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
      */
     uint8_t const *const xinput = (const uint8_t *)input;
@@ -3996,9 +4643,9 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_VSX)
+      #if (XXH_VECTOR == XXH_VSX)
 
 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
                                               const void *XXH_RESTRICT input,
@@ -4025,12 +4672,12 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
     xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
     xacc[i] += product;
 
-        /* swap high and low halves */
-      #ifdef __s390x__
+          /* swap high and low halves */
+        #ifdef __s390x__
     xacc[i] += vec_permi(data_vec, data_vec, 2);
-      #else
+        #else
     xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
-      #endif
+        #endif
 
   }
 
@@ -4075,7 +4722,7 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
 
 }
 
-    #endif
+      #endif
 
 /* scalar variants - universal */
 
@@ -4083,7 +4730,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
     const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(XXH_ACC_ALIGN)
   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
   const xxh_u8 *const xinput =
       (const xxh_u8 *)input;                    /* no alignment restriction */
@@ -4105,7 +4751,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
                                               const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(XXH_ACC_ALIGN)
   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
   const xxh_u8 *const xsecret =
       (const xxh_u8 *)secret;                   /* no alignment restriction */
@@ -4135,7 +4780,7 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
   const xxh_u8 *kSecretPtr = XXH3_kSecret;
   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
 
-    #if defined(__clang__) && defined(__aarch64__)
+      #if defined(__clang__) && defined(__aarch64__)
   /*
    * UGLY HACK:
    * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
@@ -4164,8 +4809,8 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
    *   without hack: 2654.4 MB/s
    *   with hack:    3202.9 MB/s
    */
-  __asm__("" : "+r"(kSecretPtr));
-    #endif
+  XXH_COMPILER_GUARD(kSecretPtr);
+      #endif
   /*
    * Note: in debug mode, this overrides the asm optimization
    * and Clang will emit MOVK chains again.
@@ -4200,55 +4845,55 @@ typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
 
-    #if (XXH_VECTOR == XXH_AVX512)
+      #if (XXH_VECTOR == XXH_AVX512)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+        #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
 
-    #elif (XXH_VECTOR == XXH_AVX2)
+      #elif (XXH_VECTOR == XXH_AVX2)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+        #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
 
-    #elif (XXH_VECTOR == XXH_SSE2)
+      #elif (XXH_VECTOR == XXH_SSE2)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+        #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
 
-    #elif (XXH_VECTOR == XXH_NEON)
+      #elif (XXH_VECTOR == XXH_NEON)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_neon
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #elif (XXH_VECTOR == XXH_VSX)
+      #elif (XXH_VECTOR == XXH_VSX)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #else                                                         /* scalar */
+      #else                                                       /* scalar */
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #endif
+      #endif
 
-    #ifndef XXH_PREFETCH_DIST
-      #ifdef __clang__
-        #define XXH_PREFETCH_DIST 320
-      #else
-        #if (XXH_VECTOR == XXH_AVX512)
-          #define XXH_PREFETCH_DIST 512
+      #ifndef XXH_PREFETCH_DIST
+        #ifdef __clang__
+          #define XXH_PREFETCH_DIST 320
         #else
-          #define XXH_PREFETCH_DIST 384
-        #endif
-      #endif                                                   /* __clang__ */
-    #endif                                             /* XXH_PREFETCH_DIST */
+          #if (XXH_VECTOR == XXH_AVX512)
+            #define XXH_PREFETCH_DIST 512
+          #else
+            #define XXH_PREFETCH_DIST 384
+          #endif
+        #endif                                                 /* __clang__ */
+      #endif                                           /* XXH_PREFETCH_DIST */
 
 /*
  * XXH3_accumulate()
@@ -4308,8 +4953,9 @@ XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
     {
 
       const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
-    #define XXH_SECRET_LASTACC_START \
-      7  /* not aligned on 8, last secret is different from acc & scrambler */
+      #define XXH_SECRET_LASTACC_START                                       \
+        7 /* not aligned on 8, last secret is different from acc & scrambler \
+           */
       f_acc512(acc, p,
                secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
 
@@ -4337,10 +4983,10 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
   for (i = 0; i < 4; i++) {
 
     result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
-    #if defined(__clang__)                                /* Clang */ \
-        && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
-        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
+      #if defined(__clang__)                                /* Clang */ \
+          && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+          && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
     /*
      * UGLY HACK:
      * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
@@ -4349,8 +4995,8 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
      *   without hack: 2063.7 MB/s
      *   with hack:    2560.7 MB/s
      */
-    __asm__("" : "+r"(result64));
-    #endif
+    XXH_COMPILER_GUARD(result64);
+      #endif
 
   }
 
@@ -4358,13 +5004,13 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
 
 }
 
-    #define XXH3_INIT_ACC                                              \
-      {                                                                \
-                                                                       \
-        XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
-            XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
-                                                                       \
-      }
+      #define XXH3_INIT_ACC                                              \
+        {                                                                \
+                                                                         \
+          XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
+              XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
+                                                                         \
+        }
 
 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
     const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
@@ -4379,9 +5025,9 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
 
   /* converge into final hash */
   XXH_STATIC_ASSERT(sizeof(acc) == 64);
-    /* do not align on 8, so that the secret is different from the accumulator
-     */
-    #define XXH_SECRET_MERGEACCS_START 11
+      /* do not align on 8, so that the secret is different from the accumulator
+       */
+      #define XXH_SECRET_MERGEACCS_START 11
   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
   return XXH3_mergeAccs(acc,
                         (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
@@ -4501,6 +5147,7 @@ XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
 
 /* ===   Public entry point   === */
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
 
   return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -4508,6 +5155,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
                                                    size_t      len,
                                                    const void *secret,
@@ -4518,6 +5166,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
                                                  XXH64_hash_t seed) {
 
@@ -4603,6 +5252,7 @@ static void XXH_alignedFree(void *p) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
 
   XXH3_state_t *const state =
@@ -4613,6 +5263,7 @@ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
 
   XXH_alignedFree(statePtr);
@@ -4620,6 +5271,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
                                    const XXH3_state_t *src_state) {
 
@@ -4627,9 +5279,8 @@ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
 
 }
 
-static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
-                                       XXH64_hash_t seed, const void *secret,
-                                       size_t secretSize) {
+static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed,
+                                const void *secret, size_t secretSize) {
 
   size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
   size_t const initLength =
@@ -4654,26 +5305,28 @@ static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                             XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
+  XXH3_reset_internal(statePtr, 0, secret, secretSize);
   if (secret == NULL) return XXH_ERROR;
   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
                                                         XXH64_hash_t  seed) {
 
@@ -4681,7 +5334,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
   if (seed == 0) return XXH3_64bits_reset(statePtr);
   if (seed != statePtr->seed)
     XXH3_initCustomSecret(statePtr->customSecret, seed);
-  XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
@@ -4733,12 +5386,12 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
                                            XXH3_f_scrambleAcc    f_scramble) {
 
   if (input == NULL)
-    #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
-        (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
+      #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
+          (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
     return XXH_OK;
-    #else
+      #else
     return XXH_ERROR;
-    #endif
+      #endif
 
   {
 
@@ -4747,6 +5400,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
         (state->extSecret == NULL) ? state->customSecret : state->extSecret;
 
     state->totalLen += len;
+    XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
 
     if (state->bufferedSize + len <=
         XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
@@ -4756,10 +5410,10 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
 
     }
 
-      /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
 
-    #define XXH3_INTERNALBUFFER_STRIPES \
-      (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+      #define XXH3_INTERNALBUFFER_STRIPES \
+        (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
     XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
                       0);                                 /* clean multiple */
 
@@ -4783,7 +5437,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
     XXH_ASSERT(input < bEnd);
 
     /* Consume input by a multiple of internal buffer size */
-    if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) {
+    if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
 
       const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
       do {
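Not part of the patch: the guard above switches from pointer addition to pointer subtraction because forming input + XXH3_INTERNALBUFFER_SIZE can create a pointer well past the end of the caller's buffer, which is undefined behaviour even if it is never dereferenced, while the difference bEnd - input cannot overflow here. A minimal sketch of the safe form:

    /* strictly more than one internal buffer left to consume? */
    if ((size_t)(bEnd - input) > XXH3_INTERNALBUFFER_SIZE) {
      /* bulk-consume whole XXH3_INTERNALBUFFER_SIZE chunks */
    }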
@@ -4814,6 +5468,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
                                                 const void *input, size_t len) {
 
@@ -4859,6 +5514,7 @@ XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *       acc,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
 
   const unsigned char *const secret =
@@ -4881,8 +5537,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
 
 }
 
-    #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+      #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
                                         const void *customSeed,
                                         size_t      customSeedSize) {
@@ -5398,6 +6055,7 @@ XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
 
 /* ===   Public XXH128 API   === */
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
 
   return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
@@ -5406,6 +6064,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
                                                      size_t      len,
                                                      const void *secret,
@@ -5416,6 +6075,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
                                                    size_t       len,
                                                    XXH64_hash_t seed) {
@@ -5426,6 +6086,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
                                     XXH64_hash_t seed) {
 
@@ -5437,37 +6098,31 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
 
 /*
  * All the functions are actually the same as for 64-bit streaming variant.
- * The only difference is the finalizatiom routine.
+ * The only difference is the finalization routine.
  */
 
-static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
-                                        XXH64_hash_t seed, const void *secret,
-                                        size_t secretSize) {
-
-  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
-
-}
-
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                              XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
+  XXH3_reset_internal(statePtr, 0, secret, secretSize);
   if (secret == NULL) return XXH_ERROR;
   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
                                                          XXH64_hash_t  seed) {
 
@@ -5475,11 +6130,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
   if (seed == 0) return XXH3_128bits_reset(statePtr);
   if (seed != statePtr->seed)
     XXH3_initCustomSecret(statePtr->customSecret, seed);
-  XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
                                                  const void *  input,
                                                  size_t        len) {
@@ -5489,6 +6145,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
 
   const unsigned char *const secret =
@@ -5524,11 +6181,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
 
 }
 
-  /* 128-bit utility functions */
+    /* 128-bit utility functions */
 
-    #include <string.h>                                   /* memcmp, memcpy */
+      #include <string.h>                                 /* memcmp, memcpy */
 
 /* return : 1 is equal, 0 if different */
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
 
   /* note : XXH128_hash_t is compact, it has no padding byte */
@@ -5540,6 +6198,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
  * return : >0 if *h128_1  > *h128_2
  *          <0 if *h128_1  < *h128_2
  *          =0 if *h128_1 == *h128_2  */
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
 
   XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
@@ -5552,6 +6211,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
 }
 
 /*======   Canonical representation   ======*/
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
                                              XXH128_hash_t       hash) {
 
@@ -5568,6 +6228,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t
 XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
 
@@ -5578,16 +6239,21 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
 
 }
 
-    /* Pop our optimization override from above */
-    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-        && defined(__OPTIMIZE__) &&                                      \
-        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
-      #pragma GCC pop_options
-    #endif
+      /* Pop our optimization override from above */
+      #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+          && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+          && defined(__OPTIMIZE__) &&                                      \
+          !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
+        #pragma GCC pop_options
+      #endif
 
-  #endif                                                /* XXH_NO_LONG_LONG */
+    #endif                                              /* XXH_NO_LONG_LONG */
+
+  #endif                                                     /* XXH_NO_XXH3 */
 
+/*!
+ * @}
+ */
 #endif                                                /* XXH_IMPLEMENTATION */
 
 #if defined(__cplusplus)
diff --git a/instrumentation/afl-compiler-rt.o.c b/instrumentation/afl-compiler-rt.o.c
index b2802a29..91c690c0 100644
--- a/instrumentation/afl-compiler-rt.o.c
+++ b/instrumentation/afl-compiler-rt.o.c
@@ -22,6 +22,10 @@
 #include "cmplog.h"
 #include "llvm-alternative-coverage.h"
 
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+#undef XXH_INLINE_ALL
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <signal.h>
@@ -154,6 +158,8 @@ static void at_exit(int signal) {
 
 }
 
+#define default_hash(a, b) XXH3_64bits(a, b)
+
 /* Uninspired gcc plugin instrumentation */
 
 void __afl_trace(const u32 x) {
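The cmplog hooks below replace the previous shift/xor mix of the call-site return address with an XXH3 hash when selecting a cmplog map slot. A side-by-side sketch, not part of the patch (function names are placeholders; CMP_MAP_W and XXH3_64bits come from cmplog.h and xxhash.h included above):

    #include <stdint.h>

    static uintptr_t cmplog_key_old(uintptr_t ret_addr) {
      uintptr_t k = ret_addr;
      k = (k >> 4) ^ (k << 8);              /* cheap shift/xor mix */
      return k & (CMP_MAP_W - 1);
    }

    static uintptr_t cmplog_key_new(uintptr_t ret_addr) {
      /* default_hash() expands to XXH3_64bits(), which spreads nearby
         call-site addresses more evenly across the map slots. */
      return (uintptr_t)(XXH3_64bits(&ret_addr, sizeof ret_addr) &
                         (CMP_MAP_W - 1));
    }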
@@ -669,7 +675,7 @@ static void __afl_start_snapshots(void) {
   /* Phone home and tell the parent that we're OK. If parent isn't there,
      assume we're not running in forkserver mode and just execute program. */
 
-  status |= (FS_OPT_ENABLED | FS_OPT_SNAPSHOT);
+  status |= (FS_OPT_ENABLED | FS_OPT_SNAPSHOT | FS_OPT_NEWCMPLOG);
   if (__afl_sharedmem_fuzzing != 0) status |= FS_OPT_SHDMEM_FUZZ;
   if (__afl_map_size <= FS_OPT_MAX_MAPSIZE)
     status |= (FS_OPT_SET_MAPSIZE(__afl_map_size) | FS_OPT_MAPSIZE);
@@ -935,7 +941,12 @@ static void __afl_start_forkserver(void) {
   }
 
   if (__afl_sharedmem_fuzzing != 0) { status_for_fsrv |= FS_OPT_SHDMEM_FUZZ; }
-  if (status_for_fsrv) { status_for_fsrv |= (FS_OPT_ENABLED); }
+  if (status_for_fsrv) {
+
+    status_for_fsrv |= (FS_OPT_ENABLED | FS_OPT_NEWCMPLOG);
+
+  }
+
   memcpy(tmp, &status_for_fsrv, 4);
 
   /* Phone home and tell the parent that we're OK. If parent isn't there,
@@ -1499,8 +1510,7 @@ void __cmplog_ins_hook1(uint8_t arg1, uint8_t arg2, uint8_t attr) {
   if (unlikely(!__afl_cmp_map || arg1 == arg2)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1530,8 +1540,7 @@ void __cmplog_ins_hook2(uint16_t arg1, uint16_t arg2, uint8_t attr) {
   if (unlikely(!__afl_cmp_map || arg1 == arg2)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1569,8 +1578,7 @@ void __cmplog_ins_hook4(uint32_t arg1, uint32_t arg2, uint8_t attr) {
   if (unlikely(!__afl_cmp_map || arg1 == arg2)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1608,8 +1616,7 @@ void __cmplog_ins_hook8(uint64_t arg1, uint64_t arg2, uint8_t attr) {
   if (unlikely(!__afl_cmp_map || arg1 == arg2)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1652,8 +1659,7 @@ void __cmplog_ins_hookN(uint128_t arg1, uint128_t arg2, uint8_t attr,
   if (unlikely(!__afl_cmp_map || arg1 == arg2)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1696,8 +1702,7 @@ void __cmplog_ins_hook16(uint128_t arg1, uint128_t arg2, uint8_t attr) {
   if (likely(!__afl_cmp_map)) return;
 
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1802,8 +1807,8 @@ void __sanitizer_cov_trace_switch(uint64_t val, uint64_t *cases) {
   for (uint64_t i = 0; i < cases[0]; i++) {
 
     uintptr_t k = (uintptr_t)__builtin_return_address(0) + i;
-    k = (k >> 4) ^ (k << 8);
-    k &= CMP_MAP_W - 1;
+    k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) &
+                    (CMP_MAP_W - 1));
 
     u32 hits;
 
@@ -1880,6 +1885,159 @@ static int area_is_valid(void *ptr, size_t len) {
 
 }
 
+void __cmplog_rtn_hook_n(u8 *ptr1, u8 *ptr2, u64 len) {
+
+  /*
+    u32 i;
+    if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return;
+    fprintf(stderr, "rtn_n len=%u arg0=", len);
+    for (i = 0; i < len; i++)
+      fprintf(stderr, "%02x", ptr1[i]);
+    fprintf(stderr, " arg1=");
+    for (i = 0; i < len; i++)
+      fprintf(stderr, "%02x", ptr2[i]);
+    fprintf(stderr, "\n");
+  */
+
+  if (likely(!__afl_cmp_map)) return;
+  // fprintf(stderr, "RTN1 %p %p %u\n", ptr1, ptr2, len);
+  if (unlikely(!len)) return;
+  int l = MIN(31, len);
+
+  // fprintf(stderr, "RTN2 %u\n", l);
+  uintptr_t k = (uintptr_t)__builtin_return_address(0);
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
+
+  u32 hits;
+
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 1;
+    __afl_cmp_map->headers[k].shape = l - 1;
+    hits = 0;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits++;
+
+    if (__afl_cmp_map->headers[k].shape < l) {
+
+      __afl_cmp_map->headers[k].shape = l - 1;
+
+    }
+
+  }
+
+  struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k];
+  hits &= CMP_MAP_RTN_H - 1;
+
+  cmpfn[hits].v0_len = l;
+  cmpfn[hits].v1_len = l;
+  __builtin_memcpy(cmpfn[hits].v0, ptr1, l);
+  __builtin_memcpy(cmpfn[hits].v1, ptr2, l);
+  // fprintf(stderr, "RTN3\n");
+
+}
+
+void __cmplog_rtn_hook_strn(u8 *ptr1, u8 *ptr2, u64 len) {
+
+  /*
+    if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return;
+    fprintf(stderr, "rtn_strn len=%u arg0=%s arg1=%s\n", len, ptr1, ptr2);
+  */
+
+  if (likely(!__afl_cmp_map)) return;
+  // fprintf(stderr, "RTN1 %p %p %u\n", ptr1, ptr2, len);
+  if (unlikely(!len)) return;
+  int l = MIN(31, len + 1);
+
+  // fprintf(stderr, "RTN2 %u\n", l);
+  uintptr_t k = (uintptr_t)__builtin_return_address(0);
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
+
+  u32 hits;
+
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 1;
+    __afl_cmp_map->headers[k].shape = l - 1;
+    hits = 0;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits++;
+
+    if (__afl_cmp_map->headers[k].shape < l) {
+
+      __afl_cmp_map->headers[k].shape = l - 1;
+
+    }
+
+  }
+
+  struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k];
+  hits &= CMP_MAP_RTN_H - 1;
+
+  cmpfn[hits].v0_len = 0x80 + l;
+  cmpfn[hits].v1_len = 0x80 + l;
+  __builtin_memcpy(cmpfn[hits].v0, ptr1, l);
+  __builtin_memcpy(cmpfn[hits].v1, ptr2, l);
+  // fprintf(stderr, "RTN3\n");
+
+}
+
+void __cmplog_rtn_hook_str(u8 *ptr1, u8 *ptr2) {
+
+  /*
+    if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return;
+    fprintf(stderr, "rtn_str arg0=%s arg1=%s\n", ptr1, ptr2);
+  */
+
+  if (likely(!__afl_cmp_map)) return;
+  // fprintf(stderr, "RTN1 %p %p\n", ptr1, ptr2);
+  if (unlikely(!ptr1 || !ptr2)) return;
+  int len1 = MIN(31, strlen(ptr1) + 1);
+  int len2 = MIN(31, strlen(ptr2) + 1);
+  int l = MIN(MAX(len1, len2), 31);
+
+  // fprintf(stderr, "RTN2 %u\n", l);
+  uintptr_t k = (uintptr_t)__builtin_return_address(0);
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
+
+  u32 hits;
+
+  if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) {
+
+    __afl_cmp_map->headers[k].type = CMP_TYPE_RTN;
+    __afl_cmp_map->headers[k].hits = 1;
+    __afl_cmp_map->headers[k].shape = l - 1;
+    hits = 0;
+
+  } else {
+
+    hits = __afl_cmp_map->headers[k].hits++;
+
+    if (__afl_cmp_map->headers[k].shape < l) {
+
+      __afl_cmp_map->headers[k].shape = l - 1;
+
+    }
+
+  }
+
+  struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k];
+  hits &= CMP_MAP_RTN_H - 1;
+
+  cmpfn[hits].v0_len = 0x80 + len1;
+  cmpfn[hits].v1_len = 0x80 + len2;
+  __builtin_memcpy(cmpfn[hits].v0, ptr1, len1);
+  __builtin_memcpy(cmpfn[hits].v1, ptr2, len2);
+  // fprintf(stderr, "RTN3\n");
+
+}
+
 void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) {
 
   /*
@@ -1900,12 +2058,11 @@ void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) {
   if ((l1 = area_is_valid(ptr1, 32)) <= 0 ||
       (l2 = area_is_valid(ptr2, 32)) <= 0)
     return;
-  int len = MIN(l1, l2);
+  int len = MIN(31, MIN(l1, l2));
 
   // fprintf(stderr, "RTN2 %u\n", len);
   uintptr_t k = (uintptr_t)__builtin_return_address(0);
-  k = (k >> 4) ^ (k << 8);
-  k &= CMP_MAP_W - 1;
+  k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1));
 
   u32 hits;
 
@@ -1928,11 +2085,13 @@ void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) {
 
   }
 
+  struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k];
   hits &= CMP_MAP_RTN_H - 1;
-  __builtin_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0,
-                   ptr1, len);
-  __builtin_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1,
-                   ptr2, len);
+
+  cmpfn[hits].v0_len = len;
+  cmpfn[hits].v1_len = len;
+  __builtin_memcpy(cmpfn[hits].v0, ptr1, len);
+  __builtin_memcpy(cmpfn[hits].v1, ptr2, len);
   // fprintf(stderr, "RTN3\n");
 
 }
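The struct cmpfn_operands entries now carry explicit per-operand lengths: the plain memory hook above stores the raw captured length (at most 31), while the *_str/*_strn hooks add 0x80 to mark NUL-terminated string operands. A minimal decoding sketch (assumed helper, not part of the patch):

    #include <stdint.h>

    /* Split a v0_len/v1_len byte into the string flag and the length. */
    static inline void decode_rtn_len(uint8_t enc, int *is_str, int *len) {

      *is_str = (enc & 0x80) != 0;
      *len = enc & 0x7f;

    }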
diff --git a/instrumentation/cmplog-routines-pass.cc b/instrumentation/cmplog-routines-pass.cc
index 01b7a373..fb514edc 100644
--- a/instrumentation/cmplog-routines-pass.cc
+++ b/instrumentation/cmplog-routines-pass.cc
@@ -87,12 +87,14 @@ char CmpLogRoutines::ID = 0;
 
 bool CmpLogRoutines::hookRtns(Module &M) {
 
-  std::vector<CallInst *> calls, llvmStdStd, llvmStdC, gccStdStd, gccStdC;
-  LLVMContext &           C = M.getContext();
+  std::vector<CallInst *> calls, llvmStdStd, llvmStdC, gccStdStd, gccStdC,
+      Memcmp, Strcmp, Strncmp;
+  LLVMContext &C = M.getContext();
 
   Type *VoidTy = Type::getVoidTy(C);
   // PointerType *VoidPtrTy = PointerType::get(VoidTy, 0);
   IntegerType *Int8Ty = IntegerType::getInt8Ty(C);
+  IntegerType *Int64Ty = IntegerType::getInt64Ty(C);
   PointerType *i8PtrTy = PointerType::get(Int8Ty, 0);
 
 #if LLVM_VERSION_MAJOR < 9
@@ -184,6 +186,60 @@ bool CmpLogRoutines::hookRtns(Module &M) {
   FunctionCallee cmplogGccStdC = c4;
 #endif
 
+#if LLVM_VERSION_MAJOR < 9
+  Constant *
+#else
+  FunctionCallee
+#endif
+      c5 = M.getOrInsertFunction("__cmplog_rtn_hook_n", VoidTy, i8PtrTy,
+                                 i8PtrTy, Int64Ty
+#if LLVM_VERSION_MAJOR < 5
+                                 ,
+                                 NULL
+#endif
+      );
+#if LLVM_VERSION_MAJOR < 9
+  Function *cmplogHookFnN = cast<Function>(c5);
+#else
+  FunctionCallee cmplogHookFnN = c5;
+#endif
+
+#if LLVM_VERSION_MAJOR < 9
+  Constant *
+#else
+  FunctionCallee
+#endif
+      c6 = M.getOrInsertFunction("__cmplog_rtn_hook_strn", VoidTy, i8PtrTy,
+                                 i8PtrTy, Int64Ty
+#if LLVM_VERSION_MAJOR < 5
+                                 ,
+                                 NULL
+#endif
+      );
+#if LLVM_VERSION_MAJOR < 9
+  Function *cmplogHookFnStrN = cast<Function>(c6);
+#else
+  FunctionCallee cmplogHookFnStrN = c6;
+#endif
+
+#if LLVM_VERSION_MAJOR < 9
+  Constant *
+#else
+  FunctionCallee
+#endif
+      c7 = M.getOrInsertFunction("__cmplog_rtn_hook_str", VoidTy, i8PtrTy,
+                                 i8PtrTy
+#if LLVM_VERSION_MAJOR < 5
+                                 ,
+                                 NULL
+#endif
+      );
+#if LLVM_VERSION_MAJOR < 9
+  Function *cmplogHookFnStr = cast<Function>(c7);
+#else
+  FunctionCallee cmplogHookFnStr = c7;
+#endif
+
   GlobalVariable *AFLCmplogPtr = M.getNamedGlobal("__afl_cmp_map");
 
   if (!AFLCmplogPtr) {
@@ -214,12 +270,93 @@ bool CmpLogRoutines::hookRtns(Module &M) {
           if (callInst->getCallingConv() != llvm::CallingConv::C) continue;
 
           FunctionType *FT = Callee->getFunctionType();
+          std::string   FuncName = Callee->getName().str();
 
           bool isPtrRtn = FT->getNumParams() >= 2 &&
                           !FT->getReturnType()->isVoidTy() &&
                           FT->getParamType(0) == FT->getParamType(1) &&
                           FT->getParamType(0)->isPointerTy();
 
+          bool isPtrRtnN = FT->getNumParams() >= 3 &&
+                           !FT->getReturnType()->isVoidTy() &&
+                           FT->getParamType(0) == FT->getParamType(1) &&
+                           FT->getParamType(0)->isPointerTy() &&
+                           FT->getParamType(2)->isIntegerTy();
+          if (isPtrRtnN) {
+
+            auto intTyOp =
+                dyn_cast<IntegerType>(callInst->getArgOperand(2)->getType());
+            if (intTyOp) {
+
+              if (intTyOp->getBitWidth() != 32 &&
+                  intTyOp->getBitWidth() != 64) {
+
+                isPtrRtnN = false;
+
+              }
+
+            }
+
+          }
+
+          bool isMemcmp =
+              (!FuncName.compare("memcmp") || !FuncName.compare("bcmp") ||
+               !FuncName.compare("CRYPTO_memcmp") ||
+               !FuncName.compare("OPENSSL_memcmp") ||
+               !FuncName.compare("memcmp_const_time") ||
+               !FuncName.compare("memcmpct"));
+          isMemcmp &= FT->getNumParams() == 3 &&
+                      FT->getReturnType()->isIntegerTy(32) &&
+                      FT->getParamType(0)->isPointerTy() &&
+                      FT->getParamType(1)->isPointerTy() &&
+                      FT->getParamType(2)->isIntegerTy();
+
+          bool isStrcmp =
+              (!FuncName.compare("strcmp") || !FuncName.compare("xmlStrcmp") ||
+               !FuncName.compare("xmlStrEqual") ||
+               !FuncName.compare("g_strcmp0") ||
+               !FuncName.compare("curl_strequal") ||
+               !FuncName.compare("strcsequal") ||
+               !FuncName.compare("strcasecmp") ||
+               !FuncName.compare("stricmp") ||
+               !FuncName.compare("ap_cstr_casecmp") ||
+               !FuncName.compare("OPENSSL_strcasecmp") ||
+               !FuncName.compare("xmlStrcasecmp") ||
+               !FuncName.compare("g_strcasecmp") ||
+               !FuncName.compare("g_ascii_strcasecmp") ||
+               !FuncName.compare("Curl_strcasecompare") ||
+               !FuncName.compare("Curl_safe_strcasecompare") ||
+               !FuncName.compare("cmsstrcasecmp") ||
+               !FuncName.compare("strstr") ||
+               !FuncName.compare("g_strstr_len") ||
+               !FuncName.compare("ap_strcasestr") ||
+               !FuncName.compare("xmlStrstr") ||
+               !FuncName.compare("xmlStrcasestr") ||
+               !FuncName.compare("g_str_has_prefix") ||
+               !FuncName.compare("g_str_has_suffix"));
+          isStrcmp &=
+              FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) &&
+              FT->getParamType(0) == FT->getParamType(1) &&
+              FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext());
+
+          bool isStrncmp = (!FuncName.compare("strncmp") ||
+                            !FuncName.compare("xmlStrncmp") ||
+                            !FuncName.compare("curl_strnequal") ||
+                            !FuncName.compare("strncasecmp") ||
+                            !FuncName.compare("strnicmp") ||
+                            !FuncName.compare("ap_cstr_casecmpn") ||
+                            !FuncName.compare("OPENSSL_strncasecmp") ||
+                            !FuncName.compare("xmlStrncasecmp") ||
+                            !FuncName.compare("g_ascii_strncasecmp") ||
+                            !FuncName.compare("Curl_strncasecompare") ||
+                            !FuncName.compare("g_strncasecmp"));
+          isStrncmp &= FT->getNumParams() == 3 &&
+                       FT->getReturnType()->isIntegerTy(32) &&
+                       FT->getParamType(0) == FT->getParamType(1) &&
+                       FT->getParamType(0) ==
+                           IntegerType::getInt8PtrTy(M.getContext()) &&
+                       FT->getParamType(2)->isIntegerTy();
+
           bool isGccStdStringStdString =
               Callee->getName().find("__is_charIT_EE7__value") !=
                   std::string::npos &&
@@ -267,13 +404,19 @@ bool CmpLogRoutines::hookRtns(Module &M) {
           */
 
           if (isGccStdStringCString || isGccStdStringStdString ||
-              isLlvmStdStringStdString || isLlvmStdStringCString) {
+              isLlvmStdStringStdString || isLlvmStdStringCString || isMemcmp ||
+              isStrcmp || isStrncmp) {
 
-            isPtrRtn = false;
+            isPtrRtnN = isPtrRtn = false;
 
           }
 
+          if (isPtrRtnN) { isPtrRtn = false; }
+
           if (isPtrRtn) { calls.push_back(callInst); }
+          if (isMemcmp || isPtrRtnN) { Memcmp.push_back(callInst); }
+          if (isStrcmp) { Strcmp.push_back(callInst); }
+          if (isStrncmp) { Strncmp.push_back(callInst); }
           if (isGccStdStringStdString) { gccStdStd.push_back(callInst); }
           if (isGccStdStringCString) { gccStdC.push_back(callInst); }
           if (isLlvmStdStringStdString) { llvmStdStd.push_back(callInst); }
@@ -288,7 +431,8 @@ bool CmpLogRoutines::hookRtns(Module &M) {
   }
 
   if (!calls.size() && !gccStdStd.size() && !gccStdC.size() &&
-      !llvmStdStd.size() && !llvmStdC.size())
+      !llvmStdStd.size() && !llvmStdC.size() && !Memcmp.size() &&
+      !Strcmp.size() && !Strncmp.size())
     return false;
 
   /*
@@ -323,6 +467,96 @@ bool CmpLogRoutines::hookRtns(Module &M) {
 
   }
 
+  for (auto &callInst : Memcmp) {
+
+    Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1),
+          *v3P = callInst->getArgOperand(2);
+
+    IRBuilder<> IRB2(callInst->getParent());
+    IRB2.SetInsertPoint(callInst);
+
+    LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr);
+    CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None));
+    auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null);
+    auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false);
+
+    IRBuilder<> IRB(ThenTerm);
+
+    std::vector<Value *> args;
+    Value *              v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy);
+    Value *              v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy);
+    Value *              v3Pbitcast = IRB.CreateBitCast(
+        v3P, IntegerType::get(C, v3P->getType()->getPrimitiveSizeInBits()));
+    Value *v3Pcasted =
+        IRB.CreateIntCast(v3Pbitcast, IntegerType::get(C, 64), false);
+    args.push_back(v1Pcasted);
+    args.push_back(v2Pcasted);
+    args.push_back(v3Pcasted);
+
+    IRB.CreateCall(cmplogHookFnN, args);
+
+    // errs() << callInst->getCalledFunction()->getName() << "\n";
+
+  }
+
+  for (auto &callInst : Strcmp) {
+
+    Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1);
+
+    IRBuilder<> IRB2(callInst->getParent());
+    IRB2.SetInsertPoint(callInst);
+
+    LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr);
+    CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None));
+    auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null);
+    auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false);
+
+    IRBuilder<> IRB(ThenTerm);
+
+    std::vector<Value *> args;
+    Value *              v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy);
+    Value *              v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy);
+    args.push_back(v1Pcasted);
+    args.push_back(v2Pcasted);
+
+    IRB.CreateCall(cmplogHookFnStr, args);
+
+    // errs() << callInst->getCalledFunction()->getName() << "\n";
+
+  }
+
+  for (auto &callInst : Strncmp) {
+
+    Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1),
+          *v3P = callInst->getArgOperand(2);
+
+    IRBuilder<> IRB2(callInst->getParent());
+    IRB2.SetInsertPoint(callInst);
+
+    LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr);
+    CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None));
+    auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null);
+    auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false);
+
+    IRBuilder<> IRB(ThenTerm);
+
+    std::vector<Value *> args;
+    Value *              v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy);
+    Value *              v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy);
+    Value *              v3Pbitcast = IRB.CreateBitCast(
+        v3P, IntegerType::get(C, v3P->getType()->getPrimitiveSizeInBits()));
+    Value *v3Pcasted =
+        IRB.CreateIntCast(v3Pbitcast, IntegerType::get(C, 64), false);
+    args.push_back(v1Pcasted);
+    args.push_back(v2Pcasted);
+    args.push_back(v3Pcasted);
+
+    IRB.CreateCall(cmplogHookFnStrN, args);
+
+    // errs() << callInst->getCalledFunction()->getName() << "\n";
+
+  }
+
   for (auto &callInst : gccStdStd) {
 
     Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1);
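The three new loops above guard every inserted hook behind a runtime check that the shared cmplog map is mapped. A hypothetical C rendering of what the Memcmp loop emits at a call site (the real transformation happens on LLVM IR; u8/u64 are AFL++'s types.h typedefs):

    #include <string.h>

    extern struct cmp_map *__afl_cmp_map;
    void __cmplog_rtn_hook_n(u8 *ptr1, u8 *ptr2, u64 len);

    int instrumented_memcmp(const void *s1, const void *s2, size_t n) {
      if (__afl_cmp_map != NULL)
        __cmplog_rtn_hook_n((u8 *)s1, (u8 *)s2, (u64)n);
      return memcmp(s1, s2, n);
    }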
diff --git a/qemu_mode/QEMUAFL_VERSION b/qemu_mode/QEMUAFL_VERSION
index 7bdedf7b..0ffae35c 100644
--- a/qemu_mode/QEMUAFL_VERSION
+++ b/qemu_mode/QEMUAFL_VERSION
@@ -1 +1 @@
-71ed0d206f
+8809a2b2ebf089d3427dd8f6a0044bcc2e13b389
diff --git a/qemu_mode/qemuafl b/qemu_mode/qemuafl
-Subproject 71ed0d206fd3d877420dceb4993a1011a4637ae
+Subproject 8809a2b2ebf089d3427dd8f6a0044bcc2e13b38
diff --git a/src/afl-forkserver.c b/src/afl-forkserver.c
index 44b6c6f9..6320a26b 100644
--- a/src/afl-forkserver.c
+++ b/src/afl-forkserver.c
@@ -342,6 +342,16 @@ static void report_error_and_exit(int error) {
           "the fuzzing target reports that the mmap() call to the shared "
           "memory failed.");
       break;
+    case FS_ERROR_OLD_CMPLOG:
+      FATAL(
+          "the -c cmplog target was instrumented with a too old afl++ "
+          "version, you need to recompile it.");
+      break;
+    case FS_ERROR_OLD_CMPLOG_QEMU:
+      FATAL(
+          "The AFL++ QEMU/FRIDA loaders are from an older version; for -c "
+          "you need to recompile them.\n");
+      break;
     default:
       FATAL("unknown error code %d from fuzzing target!", error);
 
@@ -663,6 +673,20 @@ void afl_fsrv_start(afl_forkserver_t *fsrv, char **argv,
       if ((status & FS_OPT_OLD_AFLPP_WORKAROUND) == FS_OPT_OLD_AFLPP_WORKAROUND)
         status = (status & 0xf0ffffff);
 
+      if ((status & FS_OPT_NEWCMPLOG) == 0 && fsrv->cmplog_binary) {
+
+        if (fsrv->qemu_mode || fsrv->frida_mode) {
+
+          report_error_and_exit(FS_ERROR_OLD_CMPLOG_QEMU);
+
+        } else {
+
+          report_error_and_exit(FS_ERROR_OLD_CMPLOG);
+
+        }
+
+      }
+
       if ((status & FS_OPT_SNAPSHOT) == FS_OPT_SNAPSHOT) {
 
         fsrv->snapshot = 1;
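A condensed sketch of the handshake check added above: a cmplog target built with an older runtime never reports FS_OPT_NEWCMPLOG, so the fuzzer aborts with a specific error rather than trying to use an incompatible cmplog map layout.

    if (fsrv->cmplog_binary && !(status & FS_OPT_NEWCMPLOG)) {
      report_error_and_exit(fsrv->qemu_mode || fsrv->frida_mode
                                ? FS_ERROR_OLD_CMPLOG_QEMU
                                : FS_ERROR_OLD_CMPLOG);
    }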
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index a1134a22..f4d3b77f 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -448,11 +448,11 @@ u8 fuzz_one_original(afl_state_t *afl) {
 
     ACTF(
         "Fuzzing test case #%u (%u total, %llu uniq crashes found, "
-        "perf_score=%0.0f, exec_us=%llu, hits=%u, map=%u)...",
+        "perf_score=%0.0f, exec_us=%llu, hits=%u, map=%u, ascii=%u)...",
         afl->current_entry, afl->queued_paths, afl->unique_crashes,
         afl->queue_cur->perf_score, afl->queue_cur->exec_us,
         likely(afl->n_fuzz) ? afl->n_fuzz[afl->queue_cur->n_fuzz_entry] : 0,
-        afl->queue_cur->bitmap_size);
+        afl->queue_cur->bitmap_size, afl->queue_cur->is_ascii);
     fflush(stdout);
 
   }
@@ -2003,11 +2003,16 @@ havoc_stage:
      where we take the input file and make random stacked tweaks. */
 
 #define MAX_HAVOC_ENTRY 59                                      /* 55 to 60 */
+#define MUTATE_ASCII_DICT 64
 
   u32 r_max, r;
 
   r_max = (MAX_HAVOC_ENTRY + 1) + (afl->extras_cnt ? 4 : 0) +
-          (afl->a_extras_cnt ? 4 : 0);
+          (afl->a_extras_cnt
+               ? (unlikely(afl->cmplog_binary && afl->queue_cur->is_ascii)
+                      ? MUTATE_ASCII_DICT
+                      : 4)
+               : 0);
 
   if (unlikely(afl->expand_havoc && afl->ready_for_splicing_count > 1)) {
 
@@ -2592,7 +2597,15 @@ havoc_stage:
 
           if (afl->a_extras_cnt) {
 
-            if (r < 2) {
+            u32 r_cmp = 2;
+
+            if (unlikely(afl->cmplog_binary && afl->queue_cur->is_ascii)) {
+
+              r_cmp = MUTATE_ASCII_DICT >> 1;
+
+            }
+
+            if (r < r_cmp) {
 
               /* Use the dictionary. */
 
@@ -2612,7 +2625,7 @@ havoc_stage:
 
               break;
 
-            } else if (r < 4) {
+            } else if (r < (r_cmp << 1)) {
 
               u32 use_extra = rand_below(afl, afl->a_extras_cnt);
               u32 extra_len = afl->a_extras[use_extra].len;
@@ -2641,7 +2654,7 @@ havoc_stage:
 
             } else {
 
-              r -= 4;
+              r -= (r_cmp << 1);
 
             }
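A sketch of the weighting introduced above, not part of the patch: for a cmplog target whose current queue entry is ASCII, auto-dictionary mutations get MUTATE_ASCII_DICT (64) of the r_max havoc slots instead of the usual 4, split evenly between overwriting with and inserting an auto-extra (r_cmp and r_cmp << 1 in the dispatch below).

    u32 dict_slots = (afl->cmplog_binary && afl->queue_cur->is_ascii)
                         ? MUTATE_ASCII_DICT
                         : 4;
    u32 r_max = (MAX_HAVOC_ENTRY + 1) + (afl->extras_cnt ? 4 : 0) +
                (afl->a_extras_cnt ? dict_slots : 0);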
 
diff --git a/src/afl-fuzz-queue.c b/src/afl-fuzz-queue.c
index 33c2b561..1523d556 100644
--- a/src/afl-fuzz-queue.c
+++ b/src/afl-fuzz-queue.c
@@ -315,7 +315,96 @@ void mark_as_redundant(afl_state_t *afl, struct queue_entry *q, u8 state) {
 
 }
 
-/* check if ascii or UTF-8 */
+/* check if a buffer is ascii or UTF-8 */
+
+u8 check_if_text_buf(u8 *buf, u32 len) {
+
+  u32 offset = 0, ascii = 0, utf8 = 0;
+
+  while (offset < len) {
+
+    // printable ASCII plus the common control characters TAB, LF and CR
+    if ((buf[offset + 0] == 0x09 || buf[offset + 0] == 0x0A ||
+         buf[offset + 0] == 0x0D ||
+         (0x20 <= buf[offset + 0] && buf[offset + 0] <= 0x7E))) {
+
+      offset++;
+      utf8++;
+      ascii++;
+      continue;
+
+    }
+
+    if (isascii((int)buf[offset]) || isprint((int)buf[offset])) {
+
+      ascii++;
+      // we continue though as it can also be a valid utf8
+
+    }
+
+    // non-overlong 2-byte
+    if (len - offset > 1 &&
+        ((0xC2 <= buf[offset + 0] && buf[offset + 0] <= 0xDF) &&
+         (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF))) {
+
+      offset += 2;
+      utf8++;
+      continue;
+
+    }
+
+    // excluding overlongs
+    if ((len - offset > 2) &&
+        ((buf[offset + 0] == 0xE0 &&
+          (0xA0 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) &&
+          (0x80 <= buf[offset + 2] &&
+           buf[offset + 2] <= 0xBF)) ||  // straight 3-byte
+         (((0xE1 <= buf[offset + 0] && buf[offset + 0] <= 0xEC) ||
+           buf[offset + 0] == 0xEE || buf[offset + 0] == 0xEF) &&
+          (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) &&
+          (0x80 <= buf[offset + 2] &&
+           buf[offset + 2] <= 0xBF)) ||  // excluding surrogates
+         (buf[offset + 0] == 0xED &&
+          (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0x9F) &&
+          (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF)))) {
+
+      offset += 3;
+      utf8++;
+      continue;
+
+    }
+
+    // planes 1-3
+    if ((len - offset > 3) &&
+        ((buf[offset + 0] == 0xF0 &&
+          (0x90 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) &&
+          (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) &&
+          (0x80 <= buf[offset + 3] &&
+           buf[offset + 3] <= 0xBF)) ||  // planes 4-15
+         ((0xF1 <= buf[offset + 0] && buf[offset + 0] <= 0xF3) &&
+          (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) &&
+          (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) &&
+          (0x80 <= buf[offset + 3] && buf[offset + 3] <= 0xBF)) ||  // plane 16
+         (buf[offset + 0] == 0xF4 &&
+          (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0x8F) &&
+          (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) &&
+          (0x80 <= buf[offset + 3] && buf[offset + 3] <= 0xBF)))) {
+
+      offset += 4;
+      utf8++;
+      continue;
+
+    }
+
+    offset++;
+
+  }
+
+  return (utf8 > ascii ? utf8 : ascii);
+
+}
+
+/* check if queue entry is ascii or UTF-8 */
 
 static u8 check_if_text(afl_state_t *afl, struct queue_entry *q) {
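check_if_text_buf() returns the higher of the ASCII and UTF-8 byte counts for the buffer; the redqueen changes later in this patch treat a value as text only when that count equals its length. A minimal usage sketch mirroring that call (val and h are placeholders for the compared operand and its cmp_header):

    /* treat the operand as text only if every byte decoded as ASCII/UTF-8 */
    if (check_if_text_buf((u8 *)&val, SHAPE_BYTES(h->shape)) ==
        SHAPE_BYTES(h->shape)) {
      /* ... add it to the auto-dictionary ... */
    }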
 
diff --git a/src/afl-fuzz-redqueen.c b/src/afl-fuzz-redqueen.c
index a1d6e021..0a6e5eee 100644
--- a/src/afl-fuzz-redqueen.c
+++ b/src/afl-fuzz-redqueen.c
@@ -45,6 +45,23 @@ enum {
 
 };
 
+// add to dictionary enum
+// DEFAULT = 1, notTXT = 2, FOUND = 4, notSAME = 8
+enum {
+
+  DICT_ADD_NEVER = 0,
+  DICT_ADD_NOTFOUND_SAME_TXT = 1,
+  DICT_ADD_NOTFOUND_SAME = 3,
+  DICT_ADD_FOUND_SAME_TXT = 5,
+  DICT_ADD_FOUND_SAME = 7,
+  DICT_ADD_NOTFOUND_TXT = 9,
+  DICT_ADD_NOTFOUND = 11,
+  DICT_ADD_FOUND_TXT = 13,
+  DICT_ADD_FOUND = 15,
+  DICT_ADD_ANY = DICT_ADD_FOUND
+
+};
+
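The strategy values above are bit combinations of the four flags named in the comment; a small sketch of how they compose (the D_ identifiers are placeholders, the flags exist only as documented numeric values):

    enum { D_DEFAULT = 1, D_notTXT = 2, D_FOUND = 4, D_notSAME = 8 };
    /* DICT_ADD_FOUND_SAME   == D_DEFAULT | D_notTXT | D_FOUND == 7 */
    /* DICT_ADD_NOTFOUND_TXT == D_DEFAULT | D_notSAME          == 9 */
    /* DICT_ADD_STRATEGY (defined in the next hunk) selects DICT_ADD_FOUND_SAME. */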
 // CMPLOG LVL
 enum {
 
@@ -54,6 +71,8 @@ enum {
 
 };
 
+#define DICT_ADD_STRATEGY DICT_ADD_FOUND_SAME
+
 struct range {
 
   u32           start;
@@ -64,6 +83,10 @@ struct range {
 
 };
 
+static u32 hshape;
+static u64 screen_update;
+static u64 last_update;
+
 static struct range *add_range(struct range *ranges, u32 start, u32 end) {
 
   struct range *r = ck_alloc_nozero(sizeof(struct range));
@@ -252,7 +275,6 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len,
   u64 start_time = get_cur_time();
 #endif
 
-  u32 screen_update;
   u64 orig_hit_cnt, new_hit_cnt, exec_cksum;
   orig_hit_cnt = afl->queued_paths + afl->unique_crashes;
 
@@ -261,24 +283,6 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len,
   afl->stage_max = (len << 1);
   afl->stage_cur = 0;
 
-  if (likely(afl->queue_cur->exec_us)) {
-
-    if (likely((100000 / 2) >= afl->queue_cur->exec_us)) {
-
-      screen_update = 100000 / afl->queue_cur->exec_us;
-
-    } else {
-
-      screen_update = 1;
-
-    }
-
-  } else {
-
-    screen_update = 100000;
-
-  }
-
   // in colorization we do not classify counts, hence we have to calculate
   // the original checksum.
   if (unlikely(get_exec_checksum(afl, buf, len, &exec_cksum))) {
@@ -348,7 +352,7 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len,
 
     }
 
-    if (++afl->stage_cur % screen_update) { show_stats(afl); };
+    if (++afl->stage_cur % screen_update == 0) { show_stats(afl); };
 
   }
 
@@ -440,10 +444,10 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len,
     fprintf(
         f,
         "Colorization: fname=%s len=%u ms=%llu result=%u execs=%u found=%llu "
-        "taint=%u\n",
+        "taint=%u ascii=%u auto_extra_before=%u\n",
         afl->queue_cur->fname, len, get_cur_time() - start_time,
         afl->queue_cur->colorized, afl->stage_cur, new_hit_cnt - orig_hit_cnt,
-        positions);
+        positions, afl->queue_cur->is_ascii ? 1 : 0, afl->a_extras_cnt);
 
   #ifndef _DEBUG
     if (afl->not_on_tty) { fclose(f); }
@@ -759,11 +763,18 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
   u32 its_len = MIN(len - idx, taint_len);
 
+  if (afl->fsrv.total_execs - last_update > screen_update) {
+
+    show_stats(afl);
+    last_update = afl->fsrv.total_execs;
+
+  }
+
   // fprintf(stderr,
   //         "Encode: %llx->%llx into %llx(<-%llx) at idx=%u "
   //         "taint_len=%u shape=%u attr=%u\n",
   //         o_pattern, pattern, repl, changed_val, idx, taint_len,
-  //         h->shape + 1, attr);
+  //         hshape, attr);
 
   //#ifdef CMPLOG_SOLVE_TRANSFORM
   // reverse atoi()/strnu?toll() is expensive, so we only to it in lvl 3
@@ -845,7 +856,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
       u64 b_val, o_b_val, mask;
       u8  bytes;
 
-      switch (SHAPE_BYTES(h->shape)) {
+      switch (hshape) {
 
         case 0:
         case 1:
@@ -924,7 +935,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
       s64 diff = pattern - b_val;
       s64 o_diff = o_pattern - o_b_val;
       /* fprintf(stderr, "DIFF1 idx=%03u shape=%02u %llx-%llx=%lx\n", idx,
-                 h->shape + 1, o_pattern, o_b_val, o_diff);
+                 hshape, o_pattern, o_b_val, o_diff);
          fprintf(stderr, "DIFF1 %016llx %llx-%llx=%lx\n", repl, pattern,
                  b_val, diff); */
       if (diff == o_diff && diff) {
@@ -953,7 +964,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         s64 o_diff = o_pattern ^ o_b_val;
 
         /* fprintf(stderr, "DIFF2 idx=%03u shape=%02u %llx-%llx=%lx\n",
-                   idx, h->shape + 1, o_pattern, o_b_val, o_diff);
+                   idx, hshape, o_pattern, o_b_val, o_diff);
            fprintf(stderr,
                    "DIFF2 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff);
         */
@@ -1002,7 +1013,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         }
 
         /* fprintf(stderr, "DIFF3 idx=%03u shape=%02u %llx-%llx=%lx\n",
-                   idx, h->shape + 1, o_pattern, o_b_val, o_diff);
+                   idx, hshape, o_pattern, o_b_val, o_diff);
            fprintf(stderr,
                    "DIFF3 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff);
         */
@@ -1051,7 +1062,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         }
 
         /* fprintf(stderr, "DIFF4 idx=%03u shape=%02u %llx-%llx=%lx\n",
-                   idx, h->shape + 1, o_pattern, o_b_val, o_diff);
+                   idx, hshape, o_pattern, o_b_val, o_diff);
            fprintf(stderr,
                    "DIFF4 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff);
         */
@@ -1089,7 +1100,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
   if ((lvl & LVL1) || attr >= IS_FP_MOD) {
 
-    if (SHAPE_BYTES(h->shape) >= 8 && *status != 1) {
+    if (hshape >= 8 && *status != 1) {
 
       // if (its_len >= 8)
       //   fprintf(stderr,
@@ -1132,7 +1143,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
     }
 
-    if (SHAPE_BYTES(h->shape) >= 4 && *status != 1) {
+    if (hshape >= 4 && *status != 1) {
 
       // if (its_len >= 4 && (attr <= 1 || attr >= 8))
       //   fprintf(stderr,
@@ -1173,7 +1184,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
     }
 
-    if (SHAPE_BYTES(h->shape) >= 2 && *status != 1) {
+    if (hshape >= 2 && *status != 1) {
 
       if (its_len >= 2 &&
           ((*buf_16 == (u16)pattern && *o_buf_16 == (u16)o_pattern) ||
@@ -1244,11 +1255,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
   }
 
-  if (!(attr & (IS_GREATER | IS_LESSER)) || SHAPE_BYTES(h->shape) < 4) {
-
-    return 0;
-
-  }
+  if (!(attr & (IS_GREATER | IS_LESSER)) || hshape < 4) { return 0; }
 
   // transform >= to < and <= to >
   if ((attr & IS_EQUAL) && (attr & (IS_GREATER | IS_LESSER))) {
@@ -1272,7 +1279,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
     if (attr & IS_GREATER) {
 
-      if (SHAPE_BYTES(h->shape) == 4 && its_len >= 4) {
+      if (hshape == 4 && its_len >= 4) {
 
         float *f = (float *)&repl;
         float  g = *f;
@@ -1280,7 +1287,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         u32 *r = (u32 *)&g;
         repl_new = (u32)*r;
 
-      } else if (SHAPE_BYTES(h->shape) == 8 && its_len >= 8) {
+      } else if (hshape == 8 && its_len >= 8) {
 
         double *f = (double *)&repl;
         double  g = *f;
@@ -1307,7 +1314,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
 
     } else {
 
-      if (SHAPE_BYTES(h->shape) == 4) {
+      if (hshape == 4) {
 
         float *f = (float *)&repl;
         float  g = *f;
@@ -1315,7 +1322,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
         u32 *r = (u32 *)&g;
         repl_new = (u32)*r;
 
-      } else if (SHAPE_BYTES(h->shape) == 8) {
+      } else if (hshape == 8) {
 
         double *f = (double *)&repl;
         double  g = *f;
@@ -1342,7 +1349,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
     }
 
     // transform double to float, llvm likes to do that internally ...
-    if (SHAPE_BYTES(h->shape) == 8 && its_len >= 4) {
+    if (hshape == 8 && its_len >= 4) {
 
       double *f = (double *)&repl;
       float   g = (float)*f;
@@ -1353,7 +1360,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
       memcpy(((char *)&repl_new) + 4, (char *)&g, 4);
 #endif
       changed_val = repl_new;
-      h->shape = 3;  // modify shape
+      hshape = 4;  // modify shape
 
       // fprintf(stderr, "DOUBLE2FLOAT %llx\n", repl_new);
 
@@ -1361,12 +1368,12 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h,
               afl, h, pattern, repl_new, o_pattern, changed_val, 16, idx,
               taint_len, orig_buf, buf, cbuf, len, 1, lvl, status))) {
 
-        h->shape = 7;  // recover shape
+        hshape = 8;  // recover shape
         return 1;
 
       }
 
-      h->shape = 7;  // recover shape
+      hshape = 8;  // recover shape
 
     }
 
@@ -1421,6 +1428,13 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h,
                                u32 taint_len, u8 *orig_buf, u8 *buf, u8 *cbuf,
                                u32 len, u8 do_reverse, u8 lvl, u8 *status) {
 
+  if (afl->fsrv.total_execs - last_update > screen_update) {
+
+    show_stats(afl);
+    last_update = afl->fsrv.total_execs;
+
+  }
+
   u8 *ptr = (u8 *)&buf[idx];
   u8 *o_ptr = (u8 *)&orig_buf[idx];
   u8 *p = (u8 *)&pattern;
@@ -1428,52 +1442,51 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h,
   u8 *r = (u8 *)&repl;
   u8  backup[16];
   u32 its_len = MIN(len - idx, taint_len);
-  u32 shape = h->shape + 1;
   #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
   size_t off = 0;
   #else
-  size_t off = 16 - shape;
+  size_t off = 16 - hshape;
   #endif
 
-  if (its_len >= shape) {
+  if (its_len >= hshape) {
 
   #ifdef _DEBUG
     fprintf(stderr, "TestUN: %u>=%u (len=%u idx=%u attr=%u off=%lu) (%u) ",
-            its_len, shape, len, idx, attr, off, do_reverse);
+            its_len, hshape, len, idx, attr, off, do_reverse);
     u32 i;
     u8 *o_r = (u8 *)&changed_val;
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", ptr[i]);
     fprintf(stderr, "==");
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", p[off + i]);
     fprintf(stderr, " ");
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", o_ptr[i]);
     fprintf(stderr, "==");
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", o_p[off + i]);
     fprintf(stderr, " <= ");
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", r[off + i]);
     fprintf(stderr, "<-");
-    for (i = 0; i < shape; i++)
+    for (i = 0; i < hshape; i++)
       fprintf(stderr, "%02x", o_r[off + i]);
     fprintf(stderr, "\n");
   #endif
 
-    if (!memcmp(ptr, p + off, shape) && !memcmp(o_ptr, o_p + off, shape)) {
+    if (!memcmp(ptr, p + off, hshape) && !memcmp(o_ptr, o_p + off, hshape)) {
 
-      memcpy(backup, ptr, shape);
-      memcpy(ptr, r + off, shape);
+      memcpy(backup, ptr, hshape);
+      memcpy(ptr, r + off, hshape);
 
       if (unlikely(its_fuzz(afl, buf, len, status))) { return 1; }
 
   #ifdef CMPLOG_COMBINE
-      if (*status == 1) { memcpy(cbuf + idx, r, shape); }
+      if (*status == 1) { memcpy(cbuf + idx, r, hshape); }
   #endif
 
-      memcpy(ptr, backup, shape);
+      memcpy(ptr, backup, hshape);
 
   #ifdef _DEBUG
       fprintf(stderr, "Status=%u\n", *status);
@@ -1485,10 +1498,10 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h,
     if (do_reverse && *status != 1) {
 
       if (unlikely(cmp_extend_encodingN(
-              afl, h, SWAPN(pattern, (shape << 3)), SWAPN(repl, (shape << 3)),
-              SWAPN(o_pattern, (shape << 3)), SWAPN(changed_val, (shape << 3)),
-              attr, idx, taint_len, orig_buf, buf, cbuf, len, 0, lvl,
-              status))) {
+              afl, h, SWAPN(pattern, (hshape << 3)), SWAPN(repl, (hshape << 3)),
+              SWAPN(o_pattern, (hshape << 3)),
+              SWAPN(changed_val, (hshape << 3)), attr, idx, taint_len, orig_buf,
+              buf, cbuf, len, 0, lvl, status))) {
 
         return 1;
 
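The do_reverse retry above re-encodes all four values with SWAPN so that a comparison recorded in the opposite byte order can still be matched. A rough functional equivalent, assuming GCC/Clang's 128-bit integer type stands in for the fuzzer's u128 (the real SWAPN macro lives in include/types.h and may differ in detail):

/* Reverse the byte order of the lowest `bits` bits of a 128-bit value, e.g.
   swap_n(0x11223344, 32) == 0x44332211; used to retry a pattern in the
   opposite endianness before giving up on this position.                    */
static unsigned __int128 swap_n(unsigned __int128 v, unsigned bits) {

  unsigned __int128 r = 0;
  for (unsigned i = 0; i < bits; i += 8)
    r = (r << 8) | ((v >> i) & 0xff);
  return r;

}
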
@@ -1615,6 +1628,8 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
   u8  s_v0_inc = 1, s_v1_inc = 1;
   u8  s_v0_dec = 1, s_v1_dec = 1;
 
+  hshape = SHAPE_BYTES(h->shape);
+
   if (h->hits > CMP_MAP_H) {
 
     loggeds = CMP_MAP_H;
@@ -1626,7 +1641,7 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
   }
 
 #ifdef WORD_SIZE_64
-  switch (SHAPE_BYTES(h->shape)) {
+  switch (hshape) {
 
     case 1:
     case 2:
@@ -1679,8 +1694,7 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
 
 #ifdef _DEBUG
     fprintf(stderr, "Handling: %llx->%llx vs %llx->%llx attr=%u shape=%u\n",
-            orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute,
-            SHAPE_BYTES(h->shape));
+            orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute, hshape);
 #endif
 
     t = taint;
@@ -1830,27 +1844,41 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
             "END: %llx->%llx vs %llx->%llx attr=%u i=%u found=%u "
             "isN=%u size=%u\n",
             orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute, i, found_one,
-            is_n, SHAPE_BYTES(h->shape));
+            is_n, hshape);
 #endif
 
-    // If failed, add to dictionary
-    if (!found_one) {
+    // we only learn operands that are 16 bit and larger
+    if (hshape > 1) {
 
-      if (afl->pass_stats[key].total == 0) {
+      if (!found_one || afl->queue_cur->is_ascii) {
 
 #ifdef WORD_SIZE_64
         if (unlikely(is_n)) {
 
-          try_to_add_to_dictN(afl, s128_v0, SHAPE_BYTES(h->shape));
-          try_to_add_to_dictN(afl, s128_v1, SHAPE_BYTES(h->shape));
+          if (!found_one ||
+              check_if_text_buf((u8 *)&s128_v0, SHAPE_BYTES(h->shape)) ==
+                  SHAPE_BYTES(h->shape))
+            try_to_add_to_dictN(afl, s128_v0, SHAPE_BYTES(h->shape));
+          if (!found_one ||
+              check_if_text_buf((u8 *)&s128_v1, SHAPE_BYTES(h->shape)) ==
+                  SHAPE_BYTES(h->shape))
+            try_to_add_to_dictN(afl, s128_v1, SHAPE_BYTES(h->shape));
 
         } else
 
 #endif
         {
 
-          try_to_add_to_dict(afl, o->v0, SHAPE_BYTES(h->shape));
-          try_to_add_to_dict(afl, o->v1, SHAPE_BYTES(h->shape));
+          if (!memcmp((u8 *)&o->v0, (u8 *)&orig_o->v0, SHAPE_BYTES(h->shape)) &&
+              (!found_one ||
+               check_if_text_buf((u8 *)&o->v0, SHAPE_BYTES(h->shape)) ==
+                   SHAPE_BYTES(h->shape)))
+            try_to_add_to_dict(afl, o->v0, SHAPE_BYTES(h->shape));
+          if (!memcmp((u8 *)&o->v1, (u8 *)&orig_o->v1, SHAPE_BYTES(h->shape)) &&
+              (!found_one ||
+               check_if_text_buf((u8 *)&o->v1, SHAPE_BYTES(h->shape)) ==
+                   SHAPE_BYTES(h->shape)))
+            try_to_add_to_dict(afl, o->v1, SHAPE_BYTES(h->shape));
 
         }
 
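The reworked dictionary logic above runs for operands wider than one byte whenever this compare stayed unsolved or the current queue entry is ASCII; an individual operand is then learned only if no solve was found or it is entirely printable text, and in the non-128-bit branch it additionally has to be identical between the colorized and original runs. A condensed sketch of the per-operand predicate, reusing check_if_text_buf() and the project's u8/u32 typedefs (the helper name is illustrative):

/* Learn an operand into the auto dictionary if this compare produced no new
   find, or if the operand is entirely printable text of the given length.   */
static int worth_learning(int found_one, u8 *op, u32 len) {

  return !found_one || check_if_text_buf(op, len) == len;

}
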
@@ -1882,8 +1910,9 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
 
 }
 
-static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl,
-                              u8 *o_pattern, u8 *changed_val, u8 plen, u32 idx,
+static u8 rtn_extend_encoding(afl_state_t *afl, u8 entry,
+                              struct cmpfn_operands *o,
+                              struct cmpfn_operands *orig_o, u32 idx,
                               u32 taint_len, u8 *orig_buf, u8 *buf, u8 *cbuf,
                               u32 len, u8 lvl, u8 *status) {
 
@@ -1894,9 +1923,60 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl,
   //  (void)(changed_val);
   //#endif
 
+  if (afl->fsrv.total_execs - last_update > screen_update) {
+
+    show_stats(afl);
+    last_update = afl->fsrv.total_execs;
+
+  }
+
+  u8 *pattern, *repl, *o_pattern, *changed_val;
+  u8  l0, l1, ol0, ol1;
+
+  if (entry == 0) {
+
+    pattern = o->v0;
+    repl = o->v1;
+    o_pattern = orig_o->v0;
+    changed_val = orig_o->v1;
+    l0 = o->v0_len;
+    ol0 = orig_o->v0_len;
+    l1 = o->v1_len;
+    ol1 = orig_o->v1_len;
+
+  } else {
+
+    pattern = o->v1;
+    repl = o->v0;
+    o_pattern = orig_o->v1;
+    changed_val = orig_o->v0;
+    l0 = o->v1_len;
+    ol0 = orig_o->v1_len;
+    l1 = o->v0_len;
+    ol1 = orig_o->v0_len;
+
+  }
+
+  if (l0 >= 0x80 || ol0 >= 0x80) {
+
+    l0 -= 0x80;
+    l1 -= 0x80;
+    ol0 -= 0x80;
+    ol1 -= 0x80;
+
+  }
+
+  if (l0 == 0 || l1 == 0 || ol0 == 0 || ol1 == 0 || l0 > 31 || l1 > 31 ||
+      ol0 > 31 || ol1 > 31) {
+
+    l0 = l1 = ol0 = ol1 = hshape;
+
+  }
+
+  u8  lmax = MAX(l0, ol0);
   u8  save[40];
   u32 saved_idx = idx, pre, from = 0, to = 0, i, j;
-  u32 its_len = MIN((u32)plen, len - idx);
+  u32 its_len = MIN(MIN(lmax, hshape), len - idx);
   its_len = MIN(its_len, taint_len);
   u32 saved_its_len = its_len;
 
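rtn_extend_encoding() now receives both operand structs plus an entry index instead of raw byte pointers; entry selects which operand acts as the pattern and which as the replacement. The length bytes it decodes come from the instrumentation: values with the 0x80 bit set mark operands that were captured as text, and lengths of 0 or above 31 are treated as unreliable and replaced by the compare shape. A hedged sketch of that decoding for a single length byte (illustrative helper, the code above handles all four lengths together):

/* Decode one recorded rtn operand length: strip the "captured as text" flag
   bit (0x80) and fall back to the compare shape when the value is implausible
   (0, or larger than the 31-byte operand buffer).                            */
static u8 decode_rtn_len(u8 recorded, u8 hshape) {

  u8 len = recorded & 0x7f;
  if (len == 0 || len > 31) len = hshape;
  return len;

}
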
@@ -1912,7 +1992,8 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl,
   (void)(j);
 
 #ifdef _DEBUG
-  fprintf(stderr, "RTN T idx=%u lvl=%02x ", idx, lvl);
+  fprintf(stderr, "RTN T idx=%u lvl=%02x is_txt=%u shape=%u/%u ", idx, lvl,
+          o->v0_len >= 0x80 ? 1 : 0, hshape, l0);
   for (j = 0; j < 8; j++)
     fprintf(stderr, "%02x", orig_buf[idx + j]);
   fprintf(stderr, " -> ");
@@ -1972,10 +2053,10 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl,
 
   }
 
-  //#ifdef CMPLOG_SOLVE_TRANSFORM
-
   if (*status == 1) return 0;
 
+  // transform solving
+
   if (afl->cmplog_enable_transform && (lvl & LVL3)) {
 
     u32 toupper = 0, tolower = 0, xor = 0, arith = 0, tohex = 0, fromhex = 0;
@@ -2322,6 +2403,8 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
   u32                i, j, idx, have_taint = 1, taint_len, loggeds;
   u8                 status = 0, found_one = 0;
 
+  hshape = SHAPE_BYTES(h->shape);
+
   if (h->hits > CMP_MAP_RTN_H) {
 
     loggeds = CMP_MAP_RTN_H;
@@ -2353,18 +2436,22 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
     }
 
     /*
-      struct cmp_header *hh = &afl->orig_cmp_map->headers[key];
-      fprintf(stderr, "RTN N hits=%u id=%u shape=%u attr=%u v0=", h->hits,
-              h->id, h->shape, h->attribute);
-      for (j = 0; j < 8; j++) fprintf(stderr, "%02x", o->v0[j]);
-      fprintf(stderr, " v1=");
-      for (j = 0; j < 8; j++) fprintf(stderr, "%02x", o->v1[j]);
-      fprintf(stderr, "\nRTN O hits=%u id=%u shape=%u attr=%u o0=",
-              hh->hits, hh->id, hh->shape, hh->attribute);
-      for (j = 0; j < 8; j++) fprintf(stderr, "%02x", orig_o->v0[j]);
-      fprintf(stderr, " o1=");
-      for (j = 0; j < 8; j++) fprintf(stderr, "%02x", orig_o->v1[j]);
-      fprintf(stderr, "\n");
+    struct cmp_header *hh = &afl->orig_cmp_map->headers[key];
+    fprintf(stderr, "RTN N hits=%u id=%u shape=%u attr=%u v0=", h->hits, h->id,
+            hshape, h->attribute);
+    for (j = 0; j < 8; j++)
+      fprintf(stderr, "%02x", o->v0[j]);
+    fprintf(stderr, " v1=");
+    for (j = 0; j < 8; j++)
+      fprintf(stderr, "%02x", o->v1[j]);
+    fprintf(stderr, "\nRTN O hits=%u id=%u shape=%u attr=%u o0=", hh->hits,
+            hh->id, hshape, hh->attribute);
+    for (j = 0; j < 8; j++)
+      fprintf(stderr, "%02x", orig_o->v0[j]);
+    fprintf(stderr, " o1=");
+    for (j = 0; j < 8; j++)
+      fprintf(stderr, "%02x", orig_o->v1[j]);
+    fprintf(stderr, "\n");
     */
 
     t = taint;
@@ -2400,25 +2487,24 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
 
 #ifdef _DEBUG
       int w;
-      fprintf(stderr, "key=%u idx=%u len=%u o0=", key, idx,
-              SHAPE_BYTES(h->shape));
-      for (w = 0; w < SHAPE_BYTES(h->shape); ++w)
+      fprintf(stderr, "key=%u idx=%u len=%u o0=", key, idx, hshape);
+      for (w = 0; w < hshape; ++w)
         fprintf(stderr, "%02x", orig_o->v0[w]);
       fprintf(stderr, " v0=");
-      for (w = 0; w < SHAPE_BYTES(h->shape); ++w)
+      for (w = 0; w < hshape; ++w)
         fprintf(stderr, "%02x", o->v0[w]);
       fprintf(stderr, " o1=");
-      for (w = 0; w < SHAPE_BYTES(h->shape); ++w)
+      for (w = 0; w < hshape; ++w)
         fprintf(stderr, "%02x", orig_o->v1[w]);
       fprintf(stderr, " v1=");
-      for (w = 0; w < SHAPE_BYTES(h->shape); ++w)
+      for (w = 0; w < hshape; ++w)
         fprintf(stderr, "%02x", o->v1[w]);
       fprintf(stderr, "\n");
 #endif
 
-      if (unlikely(rtn_extend_encoding(
-              afl, o->v0, o->v1, orig_o->v0, orig_o->v1, SHAPE_BYTES(h->shape),
-              idx, taint_len, orig_buf, buf, cbuf, len, lvl, &status))) {
+      if (unlikely(rtn_extend_encoding(afl, 0, o, orig_o, idx, taint_len,
+                                       orig_buf, buf, cbuf, len, lvl,
+                                       &status))) {
 
         return 1;
 
@@ -2433,9 +2519,9 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
 
       status = 0;
 
-      if (unlikely(rtn_extend_encoding(
-              afl, o->v1, o->v0, orig_o->v1, orig_o->v0, SHAPE_BYTES(h->shape),
-              idx, taint_len, orig_buf, buf, cbuf, len, lvl, &status))) {
+      if (unlikely(rtn_extend_encoding(afl, 1, o, orig_o, idx, taint_len,
+                                       orig_buf, buf, cbuf, len, lvl,
+                                       &status))) {
 
         return 1;
 
@@ -2450,16 +2536,42 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf,
 
     }
 
-    // If failed, add to dictionary
-    if (!found_one && (lvl & LVL1)) {
+    //  if (unlikely(!afl->pass_stats[key].total)) {
+
+    if ((!found_one && (lvl & LVL1)) || afl->queue_cur->is_ascii) {
+
+      // if (unlikely(!afl->pass_stats[key].total)) {
+
+      u32 shape_len = SHAPE_BYTES(h->shape);
+      u32 v0_len = shape_len, v1_len = shape_len;
+      if (afl->queue_cur->is_ascii ||
+          check_if_text_buf((u8 *)&o->v0, shape_len) == shape_len) {
+
+        if (strlen((char *)o->v0)) v0_len = strlen((char *)o->v0);
+
+      }
 
-      if (unlikely(!afl->pass_stats[key].total)) {
+      if (afl->queue_cur->is_ascii ||
+          check_if_text_buf((u8 *)&o->v1, shape_len) == shape_len) {
 
-        maybe_add_auto(afl, o->v0, SHAPE_BYTES(h->shape));
-        maybe_add_auto(afl, o->v1, SHAPE_BYTES(h->shape));
+        if (strlen((char *)o->v1)) v1_len = strlen((char *)o->v1);
 
       }
 
+      // fprintf(stderr, "SHOULD: found:%u ascii:%u text?%u:%u %u:%s %u:%s \n",
+      // found_one, afl->queue_cur->is_ascii, check_if_text_buf((u8 *)&o->v0,
+      // shape_len), check_if_text_buf((u8 *)&o->v1, shape_len), v0_len,
+      // o->v0, v1_len, o->v1);
+
+      if (!memcmp(o->v0, orig_o->v0, v0_len) ||
+          (!found_one || check_if_text_buf((u8 *)&o->v0, v0_len) == v0_len))
+        maybe_add_auto(afl, o->v0, v0_len);
+      if (!memcmp(o->v1, orig_o->v1, v1_len) ||
+          (!found_one || check_if_text_buf((u8 *)&o->v1, v1_len) == v1_len))
+        maybe_add_auto(afl, o->v1, v1_len);
+
+      //}
+
     }
 
   rtn_fuzz_next_iter:
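
When the current queue entry is ASCII, or the operand itself is printable, the token handed to maybe_add_auto() above is trimmed to its string length so the dictionary gets the actual string rather than the full shape padded with trailing bytes. Roughly (an illustrative helper; the code above does this inline, and strnlen is used here only to bound the scan to the operand buffer):

#include <string.h>

/* Pick the dictionary length for an rtn operand: the string length when the
   operand is printable text, otherwise the full compare shape.              */
static u32 auto_token_len(u8 *op, u32 shape_len) {

  if (check_if_text_buf(op, shape_len) == shape_len) {

    u32 l = (u32)strnlen((char *)op, shape_len);
    if (l) return l;

  }

  return shape_len;

}
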
@@ -2492,6 +2604,23 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) {
   }
 
   struct tainted *taint = NULL;
+  if (likely(afl->queue_cur->exec_us)) {
+
+    if (likely((100000 / 2) >= afl->queue_cur->exec_us)) {
+
+      screen_update = 100000 / afl->queue_cur->exec_us;
+
+    } else {
+
+      screen_update = 1;
+
+    }
+
+  } else {
+
+    screen_update = 100000;
+
+  }
 
   if (!afl->queue_cur->taint || !afl->queue_cur->cmplog_colorinput) {
 
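The block above replaces the fixed per-loop refresh that a later hunk removes: the stage now aims for roughly one show_stats() call per 100 ms of target execution time, so the execution budget between refreshes is 100000 µs divided by the average per-exec time, clamped to at least 1 and defaulting to 100000 when no timing is known. A sketch of the same arithmetic:

/* How many executions to run between UI refreshes so that stats are redrawn
   about every 100 ms of target execution time.                              */
static u64 execs_between_updates(u64 exec_us) {

  if (!exec_us) return 100000;        /* no timing yet: refresh rarely       */
  if (exec_us > 100000 / 2) return 1; /* very slow target: refresh each exec */
  return 100000 / exec_us;

}

cmp_extend_encoding(), cmp_extend_encodingN() and rtn_extend_encoding() then call show_stats() whenever afl->fsrv.total_execs has advanced by more than this amount since last_update.
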
@@ -2592,8 +2721,6 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) {
   u64 orig_hit_cnt, new_hit_cnt;
   u64 orig_execs = afl->fsrv.total_execs;
   orig_hit_cnt = afl->queued_paths + afl->unique_crashes;
-  u64 screen_update = 100000 / afl->queue_cur->exec_us,
-      execs = afl->fsrv.total_execs;
 
   afl->stage_name = "input-to-state";
   afl->stage_short = "its";
@@ -2630,11 +2757,13 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) {
 
     if (afl->shm.cmp_map->headers[k].type == CMP_TYPE_INS) {
 
+      // fprintf(stderr, "INS %u\n", k);
       afl->stage_max +=
           MIN((u32)(afl->shm.cmp_map->headers[k].hits), (u32)CMP_MAP_H);
 
     } else {
 
+      // fprintf(stderr, "RTN %u\n", k);
       afl->stage_max +=
           MIN((u32)(afl->shm.cmp_map->headers[k].hits), (u32)CMP_MAP_RTN_H);
 
@@ -2673,13 +2802,6 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) {
 
     }
 
-    if (afl->fsrv.total_execs - execs > screen_update) {
-
-      execs = afl->fsrv.total_execs;
-      show_stats(afl);
-
-    }
-
   }
 
   r = 0;
@@ -2795,9 +2917,10 @@ exit_its:
   if (f) {
 
     fprintf(f,
-            "Cmplog: fname=%s len=%u ms=%llu result=%u finds=%llu entries=%u\n",
+            "Cmplog: fname=%s len=%u ms=%llu result=%u finds=%llu entries=%u "
+            "auto_extra_after=%u\n",
             afl->queue_cur->fname, len, get_cur_time() - start_time, r,
-            new_hit_cnt - orig_hit_cnt, cmp_locations);
+            new_hit_cnt - orig_hit_cnt, cmp_locations, afl->a_extras_cnt);
 
   #ifndef _DEBUG
     if (afl->not_on_tty) { fclose(f); }
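
With the extra field appended above, the per-input line written to the cmplog debug log looks roughly like this (path and numbers are made up for illustration):

Cmplog: fname=out/default/queue/id:000042,src:000007 len=512 ms=1734 result=0 finds=3 entries=87 auto_extra_after=12
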
diff --git a/src/afl-fuzz-stats.c b/src/afl-fuzz-stats.c
index 1d48a76c..808bf258 100644
--- a/src/afl-fuzz-stats.c
+++ b/src/afl-fuzz-stats.c
@@ -278,6 +278,7 @@ void write_stats_file(afl_state_t *afl, u32 t_bytes, double bitmap_cvg,
           "total_edges       : %u\n"
           "var_byte_count    : %u\n"
           "havoc_expansion   : %u\n"
+          "auto_dict_entries : %u\n"
           "testcache_size    : %llu\n"
           "testcache_count   : %u\n"
           "testcache_evict   : %u\n"
@@ -316,7 +317,7 @@ void write_stats_file(afl_state_t *afl, u32 t_bytes, double bitmap_cvg,
           -1,
 #endif
           t_bytes, afl->fsrv.real_map_size, afl->var_byte_count,
-          afl->expand_havoc, afl->q_testcase_cache_size,
+          afl->expand_havoc, afl->a_extras_cnt, afl->q_testcase_cache_size,
           afl->q_testcase_cache_count, afl->q_testcase_evictions,
           afl->use_banner, afl->unicorn_mode ? "unicorn" : "",
           afl->fsrv.qemu_mode ? "qemu " : "",
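
The new format line and its matching a_extras_cnt argument slot in between havoc_expansion and testcache_size, so fuzzer_stats gains one entry; an illustrative excerpt (values are made up):

havoc_expansion   : 0
auto_dict_entries : 23
testcache_size    : 46137344
testcache_count   : 112
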
diff --git a/src/afl-fuzz.c b/src/afl-fuzz.c
index a4093693..c08b8fbb 100644
--- a/src/afl-fuzz.c
+++ b/src/afl-fuzz.c
@@ -2235,13 +2235,12 @@ int main(int argc, char **argv_orig, char **envp) {
 
   }
 
-  write_bitmap(afl);
-  save_auto(afl);
-
 stop_fuzzing:
 
   afl->force_ui_update = 1;  // ensure the screen is reprinted
   show_stats(afl);           // print the screen one last time
+  write_bitmap(afl);
+  save_auto(afl);
 
   SAYF(CURSOR_SHOW cLRD "\n\n+++ Testing aborted %s +++\n" cRST,
        afl->stop_soon == 2 ? "programmatically" : "by user");
@@ -2270,6 +2269,20 @@ stop_fuzzing:
 
   }
 
+  if (afl->not_on_tty) {
+
+    u32 t_bytes = count_non_255_bytes(afl, afl->virgin_bits);
+    u8  time_tmp[64];
+    u_stringify_time_diff(time_tmp, get_cur_time(), afl->start_time);
+    ACTF(
+        "Statistics: %u new paths found, %.02f%% coverage achieved, %llu "
+        "crashes found, %llu timeouts found, total runtime %s",
+        afl->queued_discovered,
+        ((double)t_bytes * 100) / afl->fsrv.real_map_size, afl->unique_crashes,
+        afl->unique_hangs, time_tmp);
+
+  }
+
   #ifdef PROFILING
   SAYF(cYEL "[!] " cRST
             "Profiling information: %llu ms total work, %llu ns/run\n",
diff --git a/src/afl-performance.c b/src/afl-performance.c
index c6fa554b..04507410 100644
--- a/src/afl-performance.c
+++ b/src/afl-performance.c
@@ -90,7 +90,8 @@ inline u32 hash32(u8 *key, u32 len, u32 seed) {
 
 #endif
 
-  return (u32)XXH64(key, len, seed);
+  (void)seed;
+  return (u32)XXH3_64bits(key, len);
 
 }
 
@@ -102,7 +103,8 @@ inline u64 hash64(u8 *key, u32 len, u64 seed) {
 
 #endif
 
-  return XXH64(key, len, seed);
+  (void)seed;
+  return XXH3_64bits(key, len);
 
 }
 
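hash32()/hash64() now route through XXH3, which is generally faster than classic XXH64 on short keys; the seed parameter is kept for API compatibility but ignored, hence the (void)seed casts. A minimal stand-alone sketch, assuming the vendored include/xxhash.h is pulled in the same way afl-performance.c does:

#define XXH_INLINE_ALL
#include "xxhash.h"

#include <stdint.h>

/* Unseeded 64-bit digest; truncating the result to 32 bits gives the
   hash32() behaviour. Identical inputs always map to identical digests.     */
static uint64_t digest64(const void *key, size_t len) {

  return XXH3_64bits(key, len);

}
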
diff --git a/test/test-cmplog.c b/test/test-cmplog.c
index b077e3ab..262df6bd 100644
--- a/test/test-cmplog.c
+++ b/test/test-cmplog.c
@@ -1,15 +1,13 @@
 #include <stdio.h>
 #include <string.h>
+#include <stdint.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <unistd.h>
-int main(int argc, char *argv[]) {
 
-  char    buf[1024];
-  ssize_t i;
-  if ((i = read(0, buf, sizeof(buf) - 1)) < 24) return 0;
-  buf[i] = 0;
+int LLVMFuzzerTestOneInput(const uint8_t *buf, size_t i) {
+  if (i < 24) return 0;
   if (buf[0] != 'A') return 0;
   if (buf[1] != 'B') return 0;
   if (buf[2] != 'C') return 0;
@@ -18,6 +16,17 @@ int main(int argc, char *argv[]) {
   if (strncmp(buf + 12, "IJKL", 4) == 0 && strcmp(buf + 16, "DEADBEEF") == 0)
     abort();
   return 0;
-
 }
 
+#ifdef __AFL_COMPILER
+int main(int argc, char *argv[]) {
+  unsigned char buf[1024];
+  ssize_t       i;
+  while (__AFL_LOOP(1000)) {
+    i = read(0, (char *)buf, sizeof(buf) - 1);
+    if (i > 0) buf[i] = 0;
+    if (i >= 0) LLVMFuzzerTestOneInput(buf, i);
+  }
+  return 0;
+}
+#endif
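
Without afl-cc, __AFL_COMPILER is undefined and the file above builds with no main(); a hypothetical stand-alone driver like the following (not part of this commit) can feed stdin to LLVMFuzzerTestOneInput for manual testing, with a guard so a failed read never turns into a huge size_t:

#include <stdint.h>
#include <unistd.h>

int LLVMFuzzerTestOneInput(const uint8_t *buf, size_t len);

int main(void) {

  uint8_t buf[1024];
  ssize_t n = read(0, buf, sizeof(buf) - 1);
  if (n <= 0) return 0;             /* nothing read or read error: skip      */
  buf[n] = 0;                       /* the target uses strcmp(), so NUL-end  */
  return LLVMFuzzerTestOneInput(buf, (size_t)n);

}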