Diffstat (limited to 'include')

-rw-r--r--  include/afl-as.h        |    4
-rw-r--r--  include/afl-fuzz.h      |    4
-rw-r--r--  include/afl-prealloc.h  |    2
-rw-r--r--  include/alloc-inl.h     |    2
-rw-r--r--  include/cmplog.h        |   15
-rw-r--r--  include/common.h        |    4
-rw-r--r--  include/config.h        |   10
-rw-r--r--  include/debug.h         |    2
-rw-r--r--  include/envs.h          |    6
-rw-r--r--  include/forkserver.h    |    4
-rw-r--r--  include/hash.h          |    2
-rw-r--r--  include/list.h          |    2
-rw-r--r--  include/sharedmem.h     |    2
-rw-r--r--  include/snapshot-inl.h  |    2
-rw-r--r--  include/types.h         |    5
-rw-r--r--  include/xxhash.h        | 3084

16 files changed, 1916 insertions(+), 1234 deletions(-)
diff --git a/include/afl-as.h b/include/afl-as.h
index 3c12c68f..2a2e8ad7 100644
--- a/include/afl-as.h
+++ b/include/afl-as.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    This file houses the assembly-level instrumentation injected into fuzzed
    programs. The instrumentation stores XORed pairs of data: identifiers of the
@@ -396,7 +396,7 @@ static const u8 *main_payload_32 =
   "\n";
 
 /* The OpenBSD hack is due to lahf and sahf not being recognized by some
-   versions of binutils: http://marc.info/?l=openbsd-cvs&m=141636589924400
+   versions of binutils: https://marc.info/?l=openbsd-cvs&m=141636589924400
 
    The Apple code is a bit different when calling libc functions because
    they are doing relocations differently from everybody else. We also need
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index 4b19e698..f3d6d99d 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    This is the real deal: the program takes an instrumented binary and
    attempts a variety of basic fuzzing tricks, paying close attention to
@@ -1130,12 +1130,12 @@ void   get_core_count(afl_state_t *);
 void   fix_up_sync(afl_state_t *);
 void   check_asan_opts(afl_state_t *);
 void   check_binary(afl_state_t *, u8 *);
-void   fix_up_banner(afl_state_t *, u8 *);
 void   check_if_tty(afl_state_t *);
 void   setup_signal_handlers(void);
 void   save_cmdline(afl_state_t *, u32, char **);
 void   read_foreign_testcases(afl_state_t *, int);
 void   write_crash_readme(afl_state_t *afl);
+u8     check_if_text_buf(u8 *buf, u32 len);
 
 /* CmpLog */
 
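
The new check_if_text_buf() prototype suggests a printable-text heuristic
over a raw buffer. Since the diff shows only the declaration, the heuristic
and threshold below are assumptions, not AFL++'s actual code; a minimal
sketch using the project's u8/u32 typedefs:

    /* sketch: returns 1 if buf looks like plain text, 0 otherwise */
    u8 check_if_text_buf_sketch(u8 *buf, u32 len) {

      u32 i, printable = 0;

      for (i = 0; i < len; i++)
        if ((buf[i] >= 0x20 && buf[i] < 0x7f) || buf[i] == '\n' ||
            buf[i] == '\r' || buf[i] == '\t')
          printable++;

      /* assumed threshold: call it text if ~99% of bytes are printable */
      return len && (printable * 100 / len >= 99);

    }
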
diff --git a/include/afl-prealloc.h b/include/afl-prealloc.h
index fa6c9b70..87bbb1cc 100644
--- a/include/afl-prealloc.h
+++ b/include/afl-prealloc.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
diff --git a/include/alloc-inl.h b/include/alloc-inl.h
index c914da5f..0c540330 100644
--- a/include/alloc-inl.h
+++ b/include/alloc-inl.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    This allocator is not designed to resist malicious attackers (the canaries
    are small and predictable), but provides a robust and portable way to detect
diff --git a/include/cmplog.h b/include/cmplog.h
index 878ed60c..8778a4b6 100644
--- a/include/cmplog.h
+++ b/include/cmplog.h
@@ -18,7 +18,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    Shared code to handle the shared memory. This is used by the fuzzer
    as well the other components like afl-tmin, afl-showmap, etc...
@@ -48,7 +48,8 @@ struct cmp_header {
   unsigned shape : 5;
   unsigned type : 2;
   unsigned attribute : 4;
-  unsigned reserved : 5;
+  unsigned overflow : 1;
+  unsigned reserved : 4;
 
 } __attribute__((packed));
 
@@ -59,14 +60,16 @@ struct cmp_operands {
   u64 v0_128;
   u64 v1_128;
 
-};
+} __attribute__((packed));
 
 struct cmpfn_operands {
 
-  u8 v0[32];
-  u8 v1[32];
+  u8 v0[31];
+  u8 v0_len;
+  u8 v1[31];
+  u8 v1_len;
 
-};
+} __attribute__((packed));
 
 typedef struct cmp_operands cmp_map_list[CMP_MAP_H];
 
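
With both operand structs now packed, their sizes are part of the shared
memory ABI between the instrumented target and the fuzzer. A compile-time
check (a sketch, assuming a C11 compiler; not part of the actual header)
documents the intended layout:

    #include <assert.h>                       /* C11 static_assert */
    #include "cmplog.h"                       /* struct definitions */

    static_assert(sizeof(struct cmp_operands) == 32,
                  "four packed u64 fields: 32 bytes");
    static_assert(sizeof(struct cmpfn_operands) == 64,
                  "31+1+31+1 packed bytes: 64 bytes");
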
diff --git a/include/common.h b/include/common.h
index 7bba9e91..6c8e3b3a 100644
--- a/include/common.h
+++ b/include/common.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    Gather some functions common to multiple executables
 
@@ -38,6 +38,7 @@
 
 #define STRINGIFY_VAL_SIZE_MAX (16)
 
+u32  check_binary_signatures(u8 *fn);
 void detect_file_args(char **argv, u8 *prog_in, bool *use_stdin);
 void print_suggested_envs(char *mispelled_env);
 void check_environment_vars(char **env);
@@ -45,6 +46,7 @@ void check_environment_vars(char **env);
 char **argv_cpy_dup(int argc, char **argv);
 void   argv_cpy_free(char **argv);
 
+char **get_cs_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv);
 char **get_qemu_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv);
 char **get_wine_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv);
 char * get_afl_env(char *env);
diff --git a/include/config.h b/include/config.h
index da74989e..b787152f 100644
--- a/include/config.h
+++ b/include/config.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
@@ -237,11 +237,11 @@
    (note that if this value is changed, several areas in afl-cc.c, afl-fuzz.c
    and afl-fuzz-state.c have to be changed as well! */
 
-#define MAX_FILE (1 * 1024 * 1024U)
+#define MAX_FILE (1 * 1024 * 1024L)
 
 /* The same, for the test case minimizer: */
 
-#define TMIN_MAX_FILE (10 * 1024 * 1024)
+#define TMIN_MAX_FILE (10 * 1024 * 1024L)
 
 /* Block normalization steps for afl-tmin: */
 
@@ -267,8 +267,8 @@
    (first value), and to keep in memory as candidates. The latter should be much
    higher than the former. */
 
-#define USE_AUTO_EXTRAS 128
-#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 64)
+#define USE_AUTO_EXTRAS 4096
+#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8)
 
 /* Scaling factor for the effector map used to skip some of the more
    expensive deterministic steps. The actual divisor is set to
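
The switch from the U to the L suffix changes the signedness of these
constants, and with it the outcome of comparisons against signed lengths.
A minimal demonstration (illustrative only, not AFL++ code):

    #include <stdio.h>

    int main(void) {

      int len = -1;                    /* e.g. a failed read() result */

      if (len > (1 * 1024 * 1024U))    /* -1 converts to 0xFFFFFFFF: taken */
        printf("U suffix: a bogus length compares as huge\n");

      if (len > (1 * 1024 * 1024L))    /* stays signed: not taken */
        printf("L suffix: never printed\n");

      return 0;

    }
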
diff --git a/include/debug.h b/include/debug.h
index f8df5711..feb7f52d 100644
--- a/include/debug.h
+++ b/include/debug.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
diff --git a/include/envs.h b/include/envs.h
index e3957147..a3ba5e88 100644
--- a/include/envs.h
+++ b/include/envs.h
@@ -54,10 +54,12 @@ static char *afl_environment_variables[] = {
     "AFL_FAST_CAL",
     "AFL_FORCE_UI",
     "AFL_FRIDA_DEBUG_MAPS",
+    "AFL_FRIDA_DRIVER_NO_HOOK",
     "AFL_FRIDA_EXCLUDE_RANGES",
     "AFL_FRIDA_INST_COVERAGE_FILE",
     "AFL_FRIDA_INST_DEBUG_FILE",
     "AFL_FRIDA_INST_JIT",
+    "AFL_FRIDA_INST_NO_BACKPATCH",
     "AFL_FRIDA_INST_NO_OPTIMIZE",
     "AFL_FRIDA_INST_NO_PREFETCH",
     "AFL_FRIDA_INST_NO_PREFETCH_BACKPATCH",
@@ -74,8 +76,11 @@ static char *afl_environment_variables[] = {
     "AFL_FRIDA_PERSISTENT_DEBUG",
     "AFL_FRIDA_PERSISTENT_HOOK",
     "AFL_FRIDA_PERSISTENT_RET",
+    "AFL_FRIDA_STALKER_IC_ENTRIES",
+    "AFL_FRIDA_STALKER_ADJACENT_BLOCKS",
     "AFL_FRIDA_STATS_FILE",
     "AFL_FRIDA_STATS_INTERVAL",
+    "AFL_FRIDA_TRACEABLE",
     "AFL_FUZZER_ARGS",  // oss-fuzz
     "AFL_GDB",
     "AFL_GCC_ALLOWLIST",
@@ -202,6 +207,7 @@ static char *afl_environment_variables[] = {
     "AFL_USE_MSAN",
     "AFL_USE_TRACE_PC",
     "AFL_USE_UBSAN",
+    "AFL_USE_TSAN",
     "AFL_USE_CFISAN",
     "AFL_USE_LSAN",
     "AFL_WINE_PATH",
diff --git a/include/forkserver.h b/include/forkserver.h
index c6f7de00..464f208d 100644
--- a/include/forkserver.h
+++ b/include/forkserver.h
@@ -18,7 +18,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    Shared code that implements a forkserver. This is used by the fuzzer
    as well the other components like afl-tmin.
@@ -82,6 +82,8 @@ typedef struct afl_forkserver {
 
   bool frida_asan;                    /* if running with asan in frida mode */
 
+  bool cs_mode;                      /* if running in CoreSight mode or not */
+
   bool use_stdin;                       /* use stdin for sending data       */
 
   bool no_unlink;                       /* do not unlink cur_input          */
diff --git a/include/hash.h b/include/hash.h
index 9319ab95..9bb34ff8 100644
--- a/include/hash.h
+++ b/include/hash.h
@@ -21,7 +21,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
diff --git a/include/list.h b/include/list.h
index 7ec81cbe..d49e56da 100644
--- a/include/list.h
+++ b/include/list.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    This allocator is not designed to resist malicious attackers (the canaries
    are small and predictable), but provides a robust and portable way to detect
diff --git a/include/sharedmem.h b/include/sharedmem.h
index fdc947f9..93080d0f 100644
--- a/include/sharedmem.h
+++ b/include/sharedmem.h
@@ -18,7 +18,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
    Shared code to handle the shared memory. This is used by the fuzzer
    as well the other components like afl-tmin, afl-showmap, etc...
diff --git a/include/snapshot-inl.h b/include/snapshot-inl.h
index a18187ef..7234bbaa 100644
--- a/include/snapshot-inl.h
+++ b/include/snapshot-inl.h
@@ -18,7 +18,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
diff --git a/include/types.h b/include/types.h
index 7b94fb83..bbcc2f81 100644
--- a/include/types.h
+++ b/include/types.h
@@ -16,7 +16,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at:
 
-     http://www.apache.org/licenses/LICENSE-2.0
+     https://www.apache.org/licenses/LICENSE-2.0
 
  */
 
@@ -46,6 +46,8 @@ typedef uint128_t         u128;
 #define FS_ERROR_SHM_OPEN 4
 #define FS_ERROR_SHMAT 8
 #define FS_ERROR_MMAP 16
+#define FS_ERROR_OLD_CMPLOG 32
+#define FS_ERROR_OLD_CMPLOG_QEMU 64
 
 /* Reporting options */
 #define FS_OPT_ENABLED 0x80000001
@@ -53,6 +55,7 @@ typedef uint128_t         u128;
 #define FS_OPT_SNAPSHOT 0x20000000
 #define FS_OPT_AUTODICT 0x10000000
 #define FS_OPT_SHDMEM_FUZZ 0x01000000
+#define FS_OPT_NEWCMPLOG 0x02000000
 #define FS_OPT_OLD_AFLPP_WORKAROUND 0x0f000000
 // FS_OPT_MAX_MAPSIZE is 8388608 = 0x800000 = 2^23 = 1 << 23
 #define FS_OPT_MAX_MAPSIZE ((0x00fffffeU >> 1) + 1)
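
A sketch of how a forkserver client might decode a status word carrying
these option bits (illustrative; the real handshake logic lives in the
forkserver sources, not in this header):

    #include "types.h"                 /* FS_OPT_* macros and u32 */

    void decode_status(u32 status) {

      /* FS_OPT_ENABLED is 0x80000001, so both bits must match */
      if ((status & FS_OPT_ENABLED) != FS_OPT_ENABLED) return;

      if (status & FS_OPT_SHDMEM_FUZZ) { /* shared-memory test cases */ }
      if (status & FS_OPT_NEWCMPLOG)   { /* new cmplog struct layout */ }

    }
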
diff --git a/include/xxhash.h b/include/xxhash.h
index 006d3f3d..0ca2b852 100644
--- a/include/xxhash.h
+++ b/include/xxhash.h
@@ -32,7 +32,12 @@
  *   - xxHash homepage: https://www.xxhash.com
  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
  */
-
+/*!
+ * @mainpage xxHash
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
 /* TODO: update */
 /* Notice extracted from xxHash homepage:
 
@@ -45,7 +50,7 @@ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo
 Name            Speed       Q.Score   Author
 xxHash          5.4 GB/s     10
 CrapWow         3.2 GB/s      2       Andrew
-MumurHash 3a    2.7 GB/s     10       Austin Appleby
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
 SpookyHash      2.0 GB/s     10       Bob Jenkins
 SBox            1.4 GB/s      9       Bret Mulvey
 Lookup3         1.2 GB/s      9       Bob Jenkins
@@ -119,29 +124,78 @@ extern "C" {
 
 /*
  * This part deals with the special case where a unit wants to inline xxHash,
- * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
- * as part of some previously included *.h header file.
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+ * such as part of some previously included *.h header file.
  * Without further action, the new include would just be ignored,
  * and functions would effectively _not_ be inlined (silent failure).
  * The following macros solve this situation by prefixing all inlined names,
  * avoiding naming collision with previous inclusions.
  */
-  #ifdef XXH_NAMESPACE
-    #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
-  /*
-   * Note: Alternative: #undef all symbols (it's a pretty large list).
-   * Without #error: it compiles, but functions are actually not inlined.
-   */
-  #endif
+/* Before that, we unconditionally #undef all symbols,
+ * in case they were already defined with XXH_NAMESPACE.
+ * They will then be redefined for XXH_INLINE_ALL
+ */
+  #undef XXH_versionNumber
+/* XXH32 */
+  #undef XXH32
+  #undef XXH32_createState
+  #undef XXH32_freeState
+  #undef XXH32_reset
+  #undef XXH32_update
+  #undef XXH32_digest
+  #undef XXH32_copyState
+  #undef XXH32_canonicalFromHash
+  #undef XXH32_hashFromCanonical
+/* XXH64 */
+  #undef XXH64
+  #undef XXH64_createState
+  #undef XXH64_freeState
+  #undef XXH64_reset
+  #undef XXH64_update
+  #undef XXH64_digest
+  #undef XXH64_copyState
+  #undef XXH64_canonicalFromHash
+  #undef XXH64_hashFromCanonical
+/* XXH3_64bits */
+  #undef XXH3_64bits
+  #undef XXH3_64bits_withSecret
+  #undef XXH3_64bits_withSeed
+  #undef XXH3_createState
+  #undef XXH3_freeState
+  #undef XXH3_copyState
+  #undef XXH3_64bits_reset
+  #undef XXH3_64bits_reset_withSeed
+  #undef XXH3_64bits_reset_withSecret
+  #undef XXH3_64bits_update
+  #undef XXH3_64bits_digest
+  #undef XXH3_generateSecret
+/* XXH3_128bits */
+  #undef XXH128
+  #undef XXH3_128bits
+  #undef XXH3_128bits_withSeed
+  #undef XXH3_128bits_withSecret
+  #undef XXH3_128bits_reset
+  #undef XXH3_128bits_reset_withSeed
+  #undef XXH3_128bits_reset_withSecret
+  #undef XXH3_128bits_update
+  #undef XXH3_128bits_digest
+  #undef XXH128_isEqual
+  #undef XXH128_cmp
+  #undef XXH128_canonicalFromHash
+  #undef XXH128_hashFromCanonical
+/* Finally, free the namespace itself */
+  #undef XXH_NAMESPACE
+
+/* employ the namespace for XXH_INLINE_ALL */
   #define XXH_NAMESPACE XXH_INLINE_
 /*
- * Some identifiers (enums, type names) are not symbols, but they must
- * still be renamed to avoid redeclaration.
+ * Some identifiers (enums, type names) are not symbols,
+ * but they must nonetheless be renamed to avoid redeclaration.
  * Alternative solution: do not redeclare them.
- * However, this requires some #ifdefs, and is a more dispersed action.
- * Meanwhile, renaming can be achieved in a single block
+ * However, this requires some #ifdefs, and has a more dispersed impact.
+ * Meanwhile, renaming can be achieved in a single place.
  */
-  #define XXH_IPREF(Id) XXH_INLINE_##Id
+  #define XXH_IPREF(Id) XXH_NAMESPACE##Id
   #define XXH_OK XXH_IPREF(XXH_OK)
   #define XXH_ERROR XXH_IPREF(XXH_ERROR)
   #define XXH_errorcode XXH_IPREF(XXH_errorcode)
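
For context, XXH_INLINE_ALL is opted into per translation unit before the
header is included; the renaming machinery above then keeps the inlined
symbols from colliding with an earlier plain inclusion:

    /* one .c file opting into a fully inlined xxHash */
    #define XXH_INLINE_ALL
    #include "xxhash.h"
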
@@ -166,6 +220,12 @@ extern "C" {
 #ifndef XXHASH_H_5627135585666179
   #define XXHASH_H_5627135585666179 1
 
+  /*!
+   * @defgroup public Public API
+   * Contains details on the public xxHash functions.
+   * @{
+
+   */
   /* specific declaration modes for Windows */
   #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
     #if defined(WIN32) && defined(_MSC_VER) && \
@@ -180,19 +240,24 @@ extern "C" {
     #endif
   #endif
 
-  /*!
-   * XXH_NAMESPACE, aka Namespace Emulation:
-   *
-   * If you want to include _and expose_ xxHash functions from within your own
-   * library, but also want to avoid symbol collisions with other libraries
-   * which may also include xxHash, you can use XXH_NAMESPACE to automatically
-   * prefix any public symbol from xxhash library with the value of
-   * XXH_NAMESPACE (therefore, avoid empty or numeric values).
-   *
-   * Note that no change is required within the calling program as long as it
-   * includes `xxhash.h`: Regular symbol names will be automatically translated
-   * by this header.
-   */
+  #ifdef XXH_DOXYGEN
+    /*!
+     * @brief Emulate a namespace by transparently prefixing all symbols.
+     *
+     * If you want to include _and expose_ xxHash functions from within your own
+     * library, but also want to avoid symbol collisions with other libraries
+     * which may also include xxHash, you can use XXH_NAMESPACE to automatically
+     * prefix any public symbol from xxhash library with the value of
+     * XXH_NAMESPACE (therefore, avoid empty or numeric values).
+     *
+     * Note that no change is required within the calling program as long as it
+     * includes `xxhash.h`: Regular symbol names will be automatically
+     * translated by this header.
+     */
+    #define XXH_NAMESPACE                                 /* YOUR NAME HERE */
+    #undef XXH_NAMESPACE
+  #endif
+
   #ifdef XXH_NAMESPACE
     #define XXH_CAT(A, B) A##B
     #define XXH_NAME2(A, B) XXH_CAT(A, B)
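
As the doc comment above describes, an embedding library picks a prefix
(MYLIB_ below is a hypothetical example) and its sources keep using the
plain names:

    /* in the embedding library's build */
    #define XXH_NAMESPACE MYLIB_
    #include "xxhash.h"

    /* callers still write XXH32(); the linker sees MYLIB_XXH32, so two
       copies of xxHash in one binary no longer collide */
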
@@ -264,10 +329,19 @@ extern "C" {
    ***************************************/
   #define XXH_VERSION_MAJOR 0
   #define XXH_VERSION_MINOR 8
-  #define XXH_VERSION_RELEASE 0
+  #define XXH_VERSION_RELEASE 1
   #define XXH_VERSION_NUMBER                                   \
     (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
      XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is only useful when xxHash is compiled as a shared library, as it is
+ * independent of the version defined in the header.
+ *
+ * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
+ */
 XXH_PUBLIC_API unsigned XXH_versionNumber(void);
 
   /* ****************************
@@ -279,15 +353,24 @@ typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
   /*-**********************************************************************
    *  32-bit hash
    ************************************************************************/
-  #if !defined(__VMS) &&       \
+  #if defined(XXH_DOXYGEN)                 /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+  #elif !defined(__VMS) &&     \
       (defined(__cplusplus) || \
        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
     #include <stdint.h>
-typedef uint32_t XXH32_hash_t;
+typedef uint32_t      XXH32_hash_t;
+
   #else
     #include <limits.h>
     #if UINT_MAX == 0xFFFFFFFFUL
-typedef unsigned int  XXH32_hash_t;
+typedef unsigned int XXH32_hash_t;
     #else
       #if ULONG_MAX == 0xFFFFFFFFUL
 typedef unsigned long XXH32_hash_t;
@@ -298,24 +381,52 @@ typedef unsigned long XXH32_hash_t;
   #endif
 
 /*!
- * XXH32():
- *  Calculate the 32-bit hash of sequence "length" bytes stored at memory
- * address "input". The memory between input & input+length must be valid
- * (allocated and read-accessible). "seed" can be used to alter the result
- * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher
- * benchmark): 5.4 GB/s
- *
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is considered rather weak by today's standards.
+ *   The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
+ *   systems, and offers true 64/128 bit hash results. It provides a superior
+ *   level of dispersion, and greatly reduces the risks of collisions.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
  */
 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
                                   XXH32_hash_t seed);
 
-/*******   Streaming   *******/
-
-/*
- * Streaming functions generate the xxHash value from an incrememtal input.
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
  * This method is slower than single-call functions, due to state management.
  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
  *
@@ -336,19 +447,125 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
  *
  * When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <xxhash.h>
+ *    #define BUFFER_SIZE 256
+ *
+ *    // Note: XXH64 and XXH3 use the same interface.
+ *    XXH32_hash_t
+ *    hashFile(FILE* stream)
+ *    {
+
+ *        XXH32_state_t* state;
+ *        unsigned char buf[BUFFER_SIZE];
+ *        size_t amt;
+ *        XXH32_hash_t hash;
+ *
+ *        state = XXH32_createState();       // Create a state
+ *        assert(state != NULL);             // Error check here
+ *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
+ *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+
+ *            XXH32_update(state, buf, amt); // Hash the file in chunks
+ *        }
+ *        hash = XXH32_digest(state);        // Finalize the hash
+ *        XXH32_freeState(state);            // Clean up
+ *        return hash;
+ *    }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
  */
+typedef struct XXH32_state_s XXH32_state_t;
 
-typedef struct XXH32_state_s XXH32_state_t;              /* incomplete type */
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
-XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t *statePtr);
-XXH_PUBLIC_API void           XXH32_copyState(XXH32_state_t *      dst_state,
-                                              const XXH32_state_t *src_state);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref
+ * XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dst_state,
+                                    const XXH32_state_t *src_state);
 
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
                                          XXH32_hash_t   seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
                                           const void *input, size_t length);
-XXH_PUBLIC_API XXH32_hash_t  XXH32_digest(const XXH32_state_t *statePtr);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
 
 /*******   Canonical representation   *******/
 
@@ -373,48 +590,158 @@ XXH_PUBLIC_API XXH32_hash_t  XXH32_digest(const XXH32_state_t *statePtr);
  * canonical format.
  */
 
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
 typedef struct {
 
-  unsigned char digest[4];
+  unsigned char digest[4];                      /*!< Hash bytes, big endian */
 
 } XXH32_canonical_t;
 
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
                                             XXH32_hash_t       hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
 XXH_PUBLIC_API XXH32_hash_t
 XXH32_hashFromCanonical(const XXH32_canonical_t *src);
 
+  #ifdef __has_attribute
+    #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+  #else
+    #define XXH_HAS_ATTRIBUTE(x) 0
+  #endif
+
+  /* C-language Attributes are added in C23. */
+  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \
+      defined(__has_c_attribute)
+    #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+  #else
+    #define XXH_HAS_C_ATTRIBUTE(x) 0
+  #endif
+
+  #if defined(__cplusplus) && defined(__has_cpp_attribute)
+    #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+  #else
+    #define XXH_HAS_CPP_ATTRIBUTE(x) 0
+  #endif
+
+  /*
+   * Define the XXH_FALLTHROUGH macro for annotating switch cases with the
+   * 'fallthrough' attribute introduced in C++17 and C23.
+   * C++17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+   * C23:   https://en.cppreference.com/w/c/language/attributes/fallthrough
+   */
+  #if XXH_HAS_C_ATTRIBUTE(fallthrough)
+    #define XXH_FALLTHROUGH [[fallthrough]]
+  #elif XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+    #define XXH_FALLTHROUGH [[fallthrough]]
+  #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+    #define XXH_FALLTHROUGH __attribute__((fallthrough))
+  #else
+    #define XXH_FALLTHROUGH
+  #endif
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+
+ */
+
   #ifndef XXH_NO_LONG_LONG
     /*-**********************************************************************
      *  64-bit hash
      ************************************************************************/
-    #if !defined(__VMS) &&                                     \
+    #if defined(XXH_DOXYGEN)                    /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+    #elif !defined(__VMS) &&                                   \
         (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
                                   (__STDC_VERSION__ >= 199901L) /* C99 */))
       #include <stdint.h>
 typedef uint64_t XXH64_hash_t;
     #else
+      #include <limits.h>
+      #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+/* LP64 ABI says uint64_t is unsigned long */
+typedef unsigned long XXH64_hash_t;
+      #else
 /* the following type must have a width of 64-bit */
 typedef unsigned long long XXH64_hash_t;
+      #endif
     #endif
 
 /*!
- * XXH64():
- * Returns the 64-bit hash of sequence of length @length stored at memory
- * address @input.
- * @seed can be used to alter the result predictably.
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results. It provides a superior level of
+ *   dispersion, and greatly reduces the risks of collisions.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
  *
  * This function usually runs faster on 64-bit systems, but slower on 32-bit
  * systems (see benchmark).
  *
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
  */
 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
                                   XXH64_hash_t seed);
 
 /*******   Streaming   *******/
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
 typedef struct XXH64_state_s XXH64_state_t;              /* incomplete type */
 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
 XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t *statePtr);
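
The canonical helpers make hash values portable across endianness; a short
sketch of the round trip with the XXH32 functions declared above:

    #include "xxhash.h"

    XXH32_hash_t roundtrip(const void *buf, size_t len) {

      XXH32_canonical_t c;

      XXH32_canonicalFromHash(&c, XXH32(buf, len, 0));
      /* c.digest[] now holds the 4 hash bytes in big-endian order,
         safe to store on disk or send over the network */
      return XXH32_hashFromCanonical(&c);

    }
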
@@ -439,12 +766,15 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
 XXH_PUBLIC_API XXH64_hash_t
 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
 
-/*-**********************************************************************
- *  XXH3 64-bit variant
- ************************************************************************/
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup xxh3_family XXH3 family
+ * @ingroup public
+ * @{
 
-/* ************************************************************************
- * XXH3 is a new hash algorithm featuring:
+ *
+ * XXH3 is a more recent hash algorithm featuring:
  *  - Improved speed for both small and large inputs
  *  - True 64-bit and 128-bit outputs
  *  - SIMD acceleration
@@ -454,41 +784,38 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src);
  *
  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
  *
- * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
- * faster on small ones compared to XXH64, though exact differences depend on
- * the platform.
+ * Compared to XXH64, expect XXH3 to run approximately
+ * 2x faster on large inputs and >3x faster on small ones,
+ * though exact differences depend on the platform.
  *
- * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
- * on all platforms.
- *
- * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
- *
- * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
- * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
- * explained in the implementation.
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+ * can run XXH3 at competitive speeds, even without vector support.
+ * Further details are explained in the implementation.
  *
  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
- * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
  *
  * XXH3 offers 2 variants, _64bits and _128bits.
- * When only 64 bits are needed, prefer calling the _64bits variant, as it
- * reduces the amount of mixing, resulting in faster speed on small inputs.
  *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
  * It's also generally simpler to manipulate a scalar return type than a struct.
  *
- * The 128-bit version adds additional strength, but it is slightly slower.
- *
- * Return values of XXH3 and XXH128 are officially finalized starting
- * with v0.8.0 and will no longer change in future versions.
- * Avoid storing values from before that release in long-term storage.
- *
- * Results produced by v0.7.x are not comparable with results from v0.7.y.
- * However, the API is completely stable, and it can safely be used for
- * ephemeral data (local sessions).
- *
  * The API supports one-shot hashing, streaming mode, and custom secrets.
  */
 
+/*-**********************************************************************
+ *  XXH3 64-bit variant
+ ************************************************************************/
+
 /* XXH3_64bits():
  * default 64-bit variant, using default secret and default seed of 0.
  * It's the fastest variant. */
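
A short sketch of the one-shot 64-bit calls (the seed value is an
arbitrary example):

    #include "xxhash.h"

    XXH64_hash_t oneshot_example(const void *data, size_t len) {

      XXH64_hash_t h  = XXH3_64bits(data, len);        /* default: seed 0 */
      XXH64_hash_t hs = XXH3_64bits_withSeed(data, len, 0x12345678ULL);

      return h ^ hs;

    }
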
@@ -504,20 +831,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
                                                  XXH64_hash_t seed);
 
-    /*
-     * XXH3_64bits_withSecret():
-     * It's possible to provide any blob of bytes as a "secret" to generate the
-     * hash. This makes it more difficult for an external actor to prepare an
-     * intentional collision. The main condition is that secretSize *must* be
-     * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced
-     * hash values depends on secret's entropy. Technically, the secret must
-     * look like a bunch of random bytes. Avoid "trivial" or structured data
-     * such as repeated sequences or a text document. Whenever unsure about the
-     * "randomness" of the blob of bytes, consider relabelling it as a "custom
-     * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate
-     * a high entropy secret derived from the custom seed.
+    /*!
+     * The bare minimum size for a custom secret.
+     *
+     * @see
+     *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+     *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
      */
     #define XXH3_SECRET_SIZE_MIN 136
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the
+ * hash. This makes it more difficult for an external actor to prepare an
+ * intentional collision. The main condition is that secretSize *must* be large
+ * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash
+ * values depends on secret's entropy. Technically, the secret must look like a
+ * bunch of random bytes. Avoid "trivial" or structured data such as repeated
+ * sequences or a text document. Whenever unsure about the "randomness" of the
+ * blob of bytes, consider relabelling it as a "custom seed" instead, and employ
+ * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived
+ * from the custom seed.
+ */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
                                                    const void *secret,
                                                    size_t      secretSize);
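
When unsure about a secret's entropy, the comment above recommends
deriving one with XXH3_generateSecret(). A sketch, assuming the v0.8.1
three-argument signature and the 192-byte default secret size:

    #include "xxhash.h"

    XXH64_hash_t secret_example(const void *data, size_t len) {

      unsigned char secret[192];         /* XXH3_SECRET_DEFAULT_SIZE */

      XXH3_generateSecret(secret, "my application seed", 19);
      return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));

    }
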
@@ -529,6 +864,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
  * As a consequence, streaming is slower than one-shot hashing.
  * For better performance, prefer one-shot functions whenever applicable.
  */
+
+/*!
+ * @brief The state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
 typedef struct XXH3_state_s XXH3_state_t;
 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
@@ -572,10 +913,16 @@ XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest(const XXH3_state_t *statePtr);
  *  XXH3 128-bit variant
  ************************************************************************/
 
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
 typedef struct {
 
-  XXH64_hash_t low64;
-  XXH64_hash_t high64;
+  XXH64_hash_t low64;                     /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+  XXH64_hash_t high64;                                   /*!< `value >> 64` */
 
 } XXH128_hash_t;
 
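
Since XXH128_hash_t is returned by value, the two 64-bit halves are read
directly from the struct; a minimal sketch:

    #include <stdio.h>
    #include "xxhash.h"

    void print_hash128(const void *data, size_t len) {

      XXH128_hash_t h = XXH128(data, len, 0);

      printf("%016llx%016llx\n", (unsigned long long)h.high64,
                                 (unsigned long long)h.low64);

    }
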
@@ -649,6 +996,9 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
 
   #endif                                                /* XXH_NO_LONG_LONG */
 
+/*!
+ * @}
+ */
 #endif                                         /* XXHASH_H_5627135585666179 */
 
 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
@@ -660,7 +1010,7 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
  * These declarations should only be used with static linking.
  * Never use them in association with dynamic linking!
  *****************************************************************************
- */
+*/
 
 /*
  * These definitions are only present to allow static allocation
@@ -668,41 +1018,72 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
  * Never **ever** access their members directly.
  */
 
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
 struct XXH32_state_s {
 
-  XXH32_hash_t total_len_32;
-  XXH32_hash_t large_len;
-  XXH32_hash_t v1;
-  XXH32_hash_t v2;
-  XXH32_hash_t v3;
-  XXH32_hash_t v4;
-  XXH32_hash_t mem32[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t
-      reserved; /* never read nor write, might be removed in a future version */
+  XXH32_hash_t total_len_32;          /*!< Total length hashed, modulo 2^32 */
+  XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref
+                                total_len_32 overflow) */
+  XXH32_hash_t v1;                              /*!< First accumulator lane */
+  XXH32_hash_t v2;                             /*!< Second accumulator lane */
+  XXH32_hash_t v3;                              /*!< Third accumulator lane */
+  XXH32_hash_t v4;                             /*!< Fourth accumulator lane */
+  XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as
+                                unsigned char[16]. */
+  XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem32 */
+  XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may
+                            be removed. */
 
 };                                            /* typedef'd to XXH32_state_t */
 
   #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
 
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
 struct XXH64_state_s {
 
-  XXH64_hash_t total_len;
-  XXH64_hash_t v1;
-  XXH64_hash_t v2;
-  XXH64_hash_t v3;
-  XXH64_hash_t v4;
-  XXH64_hash_t mem64[4];
-  XXH32_hash_t memsize;
-  XXH32_hash_t reserved32;                   /* required for padding anyway */
-  XXH64_hash_t reserved64; /* never read nor write, might be removed in a future
-                              version */
+  XXH64_hash_t total_len;  /*!< Total length hashed. This is always 64-bit. */
+  XXH64_hash_t v1;                              /*!< First accumulator lane */
+  XXH64_hash_t v2;                             /*!< Second accumulator lane */
+  XXH64_hash_t v3;                              /*!< Third accumulator lane */
+  XXH64_hash_t v4;                             /*!< Fourth accumulator lane */
+  XXH64_hash_t mem64[4];   /*!< Internal buffer for partial reads. Treated as
+                              unsigned char[32]. */
+  XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem64 */
+  XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway */
+  XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it
+                              may be removed. */
 
 };                                            /* typedef'd to XXH64_state_t */
 
-    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)  /* C11+ */
+    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \
+                                                                    */
       #include <stdalign.h>
       #define XXH_ALIGN(n) alignas(n)
+    #elif defined(__cplusplus) && (__cplusplus >= 201103L)      /* >= C++11 */
+      /* In C++ alignas() is a keyword */
+      #define XXH_ALIGN(n) alignas(n)
     #elif defined(__GNUC__)
       #define XXH_ALIGN(n) __attribute__((aligned(n)))
     #elif defined(_MSC_VER)
@@ -713,39 +1094,94 @@ struct XXH64_state_s {
 
     /* Old GCC versions only accept the attribute after the type in structures.
      */
-    #if !(defined(__STDC_VERSION__) &&              \
-          (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+    #if !(defined(__STDC_VERSION__) &&                                        \
+          (__STDC_VERSION__ >= 201112L))                       /* C11+ */     \
+        && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
         && defined(__GNUC__)
       #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
     #else
       #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
     #endif
 
+    /*!
+     * @brief The size of the internal XXH3 buffer.
+     *
+     * This is the optimal update size for incremental hashing.
+     *
+     * @see XXH3_64b_update(), XXH3_128b_update().
+     */
     #define XXH3_INTERNALBUFFER_SIZE 256
+
+    /*!
+     * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+     *
+     * This is the size used in @ref XXH3_kSecret and the seeded functions.
+     *
+     * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+     */
     #define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
 struct XXH3_state_s {
 
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-  /* used to store a custom secret generated from a seed */
+  /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref
+   * XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+  /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
-  XXH32_hash_t         bufferedSize;
-  XXH32_hash_t         reserved32;
-  size_t               nbStripesSoFar;
-  XXH64_hash_t         totalLen;
-  size_t               nbStripesPerBlock;
-  size_t               secretLimit;
-  XXH64_hash_t         seed;
-  XXH64_hash_t         reserved64;
-  const unsigned char *extSecret; /* reference to external secret;
-                                   * if == NULL, use .customSecret instead */
+  /*!< The internal buffer. @see XXH32_state_s::mem32 */
+  XXH32_hash_t bufferedSize;
+  /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+  XXH32_hash_t reserved32;
+  /*!< Reserved field. Needed for padding on 64-bit. */
+  size_t nbStripesSoFar;
+  /*!< Number of stripes processed. */
+  XXH64_hash_t totalLen;
+  /*!< Total length hashed. 64-bit even on 32-bit targets. */
+  size_t nbStripesPerBlock;
+  /*!< Number of stripes per block. */
+  size_t secretLimit;
+  /*!< Size of @ref customSecret or @ref extSecret */
+  XXH64_hash_t seed;
+  /*!< Seed for _withSeed variants. Must be zero otherwise, @see
+   * XXH3_INITSTATE() */
+  XXH64_hash_t reserved64;
+  /*!< Reserved field. */
+  const unsigned char *extSecret;
+  /*!< Reference to an external secret for the _withSecret variants, NULL
+   *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
 
 };                                             /* typedef'd to XXH3_state_t */
 
     #undef XXH_ALIGN_MEMBER
 
-    /* When the XXH3_state_t structure is merely emplaced on stack,
+    /*!
+     * @brief Initializes a stack-allocated `XXH3_state_s`.
+     *
+     * When the @ref XXH3_state_t structure is merely emplaced on stack,
      * it should be initialized with XXH3_INITSTATE() or a memset()
      * in case its first reset uses XXH3_NNbits_reset_withSeed().
      * This init can be omitted if the first reset uses default or _withSecret
@@ -802,7 +1238,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
                                     XXH64_hash_t seed);
 
   #endif                                                /* XXH_NO_LONG_LONG */
-
   #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
     #define XXH_IMPLEMENTATION
   #endif
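
The guard above is what makes XXH_INLINE_ALL imply the implementation.
Without inlining, the usual header-only pattern is to define
XXH_IMPLEMENTATION in exactly one translation unit (a sketch):

    /* xxhash_impl.c - the one file that instantiates the implementation */
    #define XXH_IMPLEMENTATION
    #include "xxhash.h"
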
@@ -844,81 +1279,183 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
   /* *************************************
    *  Tuning parameters
    ***************************************/
+
   /*!
-   * XXH_FORCE_MEMORY_ACCESS:
-   * By default, access to unaligned memory is controlled by `memcpy()`, which
-   * is safe and portable.
-   *
-   * Unfortunately, on some target/compiler combinations, the generated assembly
-   * is sub-optimal.
+   * @defgroup tuning Tuning parameters
+   * @{
+
    *
-   * The below switch allow selection of a different access method
-   * in the search for improved performance.
-   * Method 0 (default):
-   *     Use `memcpy()`. Safe and portable. Default.
-   * Method 1:
-   *     `__attribute__((packed))` statement. It depends on compiler extensions
-   *     and is therefore not portable.
-   *     This method is safe if your compiler supports it, and *generally* as
-   *     fast or faster than `memcpy`.
-   * Method 2:
-   *     Direct access via cast. This method doesn't depend on the compiler but
-   *     violates the C standard.
-   *     It can generate buggy code on targets which do not support unaligned
-   *     memory accesses.
-   *     But in some circumstances, it's the only known way to get the most
-   *     performance (example: GCC + ARMv6)
-   * Method 3:
-   *     Byteshift. This can generate the best code on old compilers which don't
-   *     inline small `memcpy()` calls, and it might also be faster on
-   * big-endian systems which lack a native byteswap instruction. See
-   * https://stackoverflow.com/a/32095106/646947 for details. Prefer these
-   * methods in priority order (0 > 1 > 2 > 3)
+   * Various macros to control xxHash's behavior.
    */
+  #ifdef XXH_DOXYGEN
+    /*!
+     * @brief Define this to disable 64-bit code.
+     *
+     * Useful if only using the @ref xxh32_family and you have a strict C90
+     * compiler.
+     */
+    #define XXH_NO_LONG_LONG
+    #undef XXH_NO_LONG_LONG                               /* don't actually */
+    /*!
+     * @brief Controls how unaligned memory is accessed.
+     *
+     * By default, access to unaligned memory is controlled by `memcpy()`, which
+     * is safe and portable.
+     *
+     * Unfortunately, on some target/compiler combinations, the generated
+     * assembly is sub-optimal.
+     *
+     * The switch below allows selection of a different access method
+     * in the search for improved performance.
+     *
+     * @par Possible options:
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+     *   @par
+     *     Use `memcpy()`. Safe and portable. Note that most modern compilers
+     * will eliminate the function call and treat it as an unaligned access.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
+     *   @par
+     *     Depends on compiler extensions and is therefore not portable.
+     *     This method is safe _if_ your compiler supports it,
+     *     and *generally* as fast or faster than `memcpy`.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+     *  @par
+     *     Casts directly and dereferences. This method doesn't depend on the
+     *     compiler, but it violates the C standard as it directly dereferences
+     * an unaligned pointer. It can generate buggy code on targets which do not
+     *     support unaligned memory accesses, but in some circumstances, it's
+     * the only known way to get the most performance.
+     *
+     *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+     *  @par
+     *     Also portable. This can generate the best code on old compilers which
+     * don't inline small `memcpy()` calls, and it might also be faster on
+     * big-endian systems which lack a native byteswap instruction. However,
+     * some compilers will emit literal byteshifts even if the target supports
+     * unaligned access.
+     *  .
+     *
+     * @warning
+     *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+     *   care, as what works on one compiler/platform/optimization level may
+     * cause another to read garbage data or even crash.
+     *
+     * See https://stackoverflow.com/a/32095106/646947 for details.
+     *
+     * Prefer these methods in priority order (0 > 3 > 1 > 2)
+     */
+    #define XXH_FORCE_MEMORY_ACCESS 0
+    /*!
+     * @def XXH_ACCEPT_NULL_INPUT_POINTER
+     * @brief Whether to add explicit `NULL` checks.
+     *
+     * If the input pointer is `NULL` and the length is non-zero, xxHash's
+     * default behavior is to dereference it, triggering a segfault.
+     *
+     * When this macro is enabled, xxHash actively checks the input for a null
+     * pointer. If it is, the result for null input pointers is the same as a
+     * zero-length input.
+     */
+    #define XXH_ACCEPT_NULL_INPUT_POINTER 0
+    /*!
+     * @def XXH_FORCE_ALIGN_CHECK
+     * @brief If defined to non-zero, adds a special path for aligned inputs
+     * (XXH32() and XXH64() only).
+     *
+     * This is an important performance trick for architectures without decent
+     * unaligned memory access performance.
+     *
+     * It checks for input alignment, and when conditions are met, uses a "fast
+     * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+     * faster_ read speed.
+     *
+     * The check costs one initial branch per hash, which is generally
+     * negligible, but not zero.
+     *
+     * Moreover, it's not useful to generate an additional code path if memory
+     * access uses the same instruction for both aligned and unaligned
+     * addresses (e.g. x86 and aarch64).
+     *
+     * In these cases, the alignment check can be removed by setting this macro
+     * to 0. Then the code will always use unaligned memory access. Align check
+     * is automatically disabled on x86, x64 & arm64, which are platforms known
+     * to offer good unaligned memory access performance.
+     *
+     * This option does not affect XXH3 (only XXH32 and XXH64).
+     */
+    #define XXH_FORCE_ALIGN_CHECK 0
+
+    /*!
+     * @def XXH_NO_INLINE_HINTS
+     * @brief When non-zero, sets all functions to `static`.
+     *
+     * By default, xxHash tries to force the compiler to inline almost all
+     * internal functions.
+     *
+     * This can usually improve performance due to reduced jumping and improved
+     * constant folding, but significantly increases the size of the binary
+     * which might not be favorable.
+     *
+     * Additionally, sometimes the forced inlining can be detrimental to
+     * performance, depending on the architecture.
+     *
+     * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+     * compiler full control on whether to inline or not.
+     *
+     * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+     * -fno-inline with GCC or Clang, this will automatically be defined.
+     */
+    #define XXH_NO_INLINE_HINTS 0
+
+    /*!
+     * @def XXH_REROLL
+     * @brief Whether to reroll `XXH32_finalize`.
+     *
+     * For performance, `XXH32_finalize` uses an unrolled loop
+     * in the form of a switch statement.
+     *
+     * This is not always desirable, as it generates larger code,
+     * and depending on the architecture, may even be slower.
+     *
+     * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
+     */
+    #define XXH_REROLL 0
+
+    /*!
+     * @internal
+     * @brief Redefines old internal names.
+     *
+     * For compatibility with code that uses xxHash's internals before the names
+     * were changed to improve namespacing. There is no other reason to use
+     * this.
+     */
+    #define XXH_OLD_NAMES
+    #undef XXH_OLD_NAMES                 /* don't actually use, it is ugly. */
+  #endif                                                     /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
   #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
                                      line for example */
-    #if !defined(__clang__) && defined(__GNUC__) &&                \
-        defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \
-        (__ARM_ARCH == 6)
-      #define XXH_FORCE_MEMORY_ACCESS 2
-    #elif !defined(__clang__) &&                            \
-        ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
-         (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+  /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
+    #if !defined(__clang__) &&                                          \
+        ((defined(__INTEL_COMPILER) && !defined(_WIN32)) ||             \
+         (defined(__GNUC__) &&                                          \
+          ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) ||                  \
+           (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \
+            (!defined(__mips16) || defined(__mips_mips16e2))))))
       #define XXH_FORCE_MEMORY_ACCESS 1
     #endif
   #endif
 
-  /*!
-   * XXH_ACCEPT_NULL_INPUT_POINTER:
-   * If the input pointer is NULL, xxHash's default behavior is to dereference
-   * it, triggering a segfault. When this macro is enabled, xxHash actively
-   * checks the input for a null pointer. If it is, the result for null input
-   * pointers is the same as a zero-length input.
-   */
   #ifndef XXH_ACCEPT_NULL_INPUT_POINTER        /* can be defined externally */
     #define XXH_ACCEPT_NULL_INPUT_POINTER 0
   #endif
 
-  /*!
-   * XXH_FORCE_ALIGN_CHECK:
-   * This is an important performance trick
-   * for architectures without decent unaligned memory access performance.
-   * It checks for input alignment, and when conditions are met,
-   * uses a "fast path" employing direct 32-bit/64-bit read,
-   * resulting in _dramatically faster_ read speed.
-   *
-   * The check costs one initial branch per hash, which is generally negligible,
-   * but not zero. Moreover, it's not useful to generate binary for an
-   * additional code path if memory access uses same instruction for both
-   * aligned and unaligned adresses.
-   *
-   * In these cases, the alignment check can be removed by setting this macro to
-   * 0. Then the code will always use unaligned memory access. Align check is
-   * automatically disabled on x86, x64 & arm64, which are platforms known to
-   * offer good unaligned memory accesses performance.
-   *
-   * This option does not affect XXH3 (only XXH32 and XXH64).
-   */
   #ifndef XXH_FORCE_ALIGN_CHECK                /* can be defined externally */
     #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
         defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)  /* visual */
@@ -928,25 +1465,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
     #endif
   #endif
 
-  /*!
-   * XXH_NO_INLINE_HINTS:
-   *
-   * By default, xxHash tries to force the compiler to inline almost all
-   * internal functions.
-   *
-   * This can usually improve performance due to reduced jumping and improved
-   * constant folding, but significantly increases the size of the binary which
-   * might not be favorable.
-   *
-   * Additionally, sometimes the forced inlining can be detrimental to
-   * performance, depending on the architecture.
-   *
-   * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
-   * compiler full control on whether to inline or not.
-   *
-   * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
-   * -fno-inline with GCC or Clang, this will automatically be defined.
-   */
   #ifndef XXH_NO_INLINE_HINTS
     #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
         || defined(__NO_INLINE__)                       /* -O0, -fno-inline */
@@ -956,44 +1474,57 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
     #endif
   #endif
 
-  /*!
-   * XXH_REROLL:
-   * Whether to reroll XXH32_finalize, and XXH64_finalize,
-   * instead of using an unrolled jump table/if statement loop.
-   *
-   * This is automatically defined on -Os/-Oz on GCC and Clang.
-   */
   #ifndef XXH_REROLL
-    #if defined(__OPTIMIZE_SIZE__)
+    #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
+        (defined(__GNUC__) && !defined(__clang__))
+    /* The if/then loop is preferable to switch/case on gcc (on x64) */
       #define XXH_REROLL 1
     #else
       #define XXH_REROLL 0
     #endif
   #endif
 
+  /*!
+   * @defgroup impl Implementation
+   * @{
+
+   */
+
   /* *************************************
    *  Includes & Memory related functions
    ***************************************/
-  /*!
+  /*
    * Modify the local functions below should you wish to use
    * different memory routines for malloc() and free()
    */
   #include <stdlib.h>
 
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
 static void *XXH_malloc(size_t s) {
 
   return malloc(s);
 
 }
 
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
 static void XXH_free(void *p) {
 
   free(p);
 
 }
 
-  /*! and for memcpy() */
   #include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
 static void *XXH_memcpy(void *dest, const void *src, size_t size) {
 
   return memcpy(dest, src, size);
@@ -1037,7 +1568,11 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
   /* *************************************
    *  Debug
    ***************************************/
-  /*
+  /*!
+   * @ingroup tuning
+   * @def XXH_DEBUGLEVEL
+   * @brief Sets the debugging level.
+   *
    * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
    * compiler's command line options. The value must be a number.
    */
@@ -1057,12 +1592,58 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
   #endif
 
   /* note: use after variable declarations */
-  #define XXH_STATIC_ASSERT(c)            \
-    do {                                  \
-                                          \
-      enum { XXH_sa = 1 / (int)(!!(c)) }; \
-                                          \
-    } while (0)
+  #ifndef XXH_STATIC_ASSERT
+    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11 */
+      #include <assert.h>
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          static_assert((c), m);                   \
+                                                   \
+        } while (0)
+    #elif defined(__cplusplus) && (__cplusplus >= 201103L)         /* C++11 */
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          static_assert((c), m);                   \
+                                                   \
+        } while (0)
+    #else
+      #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+        do {                                       \
+                                                   \
+          struct xxh_sa {                          \
+                                                   \
+            char x[(c) ? 1 : -1];                  \
+                                                   \
+          };                                       \
+                                                   \
+        } while (0)
+    #endif
+    #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
+  #endif
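
Since every branch expands to a statement, usage must follow the note above
and sit after the declarations in a block; a hedged sketch (function name is
illustrative):

  /* a compile-time check used as a statement */
  static void xxh_sanity_check(void) {
    int placeholder = 0;                      /* declarations first (C89) */
    XXH_STATIC_ASSERT(sizeof(unsigned) >= 4); /* compile error if false */
    (void)placeholder;
  }
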
+
+  /*!
+   * @internal
+   * @def XXH_COMPILER_GUARD(var)
+   * @brief Used to prevent unwanted optimizations for @p var.
+   *
+   * It uses an empty GCC inline assembly statement with a register constraint
+   * which forces @p var into a general purpose register (eg eax, ebx, ecx
+   * on x86) and marks it as modified.
+   *
+   * This is used in a few places to avoid unwanted autovectorization (e.g.
+   * XXH32_round()). All vectorization we want is explicit via intrinsics,
+   * and _usually_ isn't wanted elsewhere.
+   *
+   * We also use it to prevent unwanted constant folding for AArch64 in
+   * XXH3_initCustomSecret_scalar().
+   */
+  #ifdef __GNUC__
+    #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var))
+  #else
+    #define XXH_COMPILER_GUARD(var) ((void)0)
+  #endif
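
A minimal sketch of how the guard is applied, mirroring its use in
XXH32_round() further down (the helper itself is hypothetical):

  /* the "+r" constraint pins acc in a general purpose register and marks
   * it modified, blocking autovectorization across this point */
  static unsigned guarded_mix(unsigned acc, unsigned input) {
    acc += input * 0x85EBCA77U;          /* XXH_PRIME32_2, written out */
    XXH_COMPILER_GUARD(acc);
    return acc;
  }
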
 
   /* *************************************
    *  Basic Types
@@ -1085,6 +1666,56 @@ typedef XXH32_hash_t xxh_u32;
 
 /* ***   Memory access   *** */
 
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
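The declared variants differ only in how the bytes are fetched. A hedged
sketch of the two portable extremes, method 0 (memcpy) and method 3
(byteshift, little endian shown); types and names are stand-ins:

  #include <string.h>

  /* method 0: memcpy read; modern compilers lower this to a single load */
  static unsigned sketch_read32(const void *ptr) {
    unsigned v;
    memcpy(&v, ptr, sizeof(v));
    return v;
  }

  /* method 3: endian-independent byteshift load, little endian order */
  static unsigned sketch_readLE32(const void *ptr) {
    const unsigned char *b = (const unsigned char *)ptr;
    return (unsigned)b[0] | ((unsigned)b[1] << 8) |
           ((unsigned)b[2] << 16) | ((unsigned)b[3] << 24);
  }
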
   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
   /*
    * Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1146,16 +1777,23 @@ static xxh_u32 XXH_read32(const void *memPtr) {
 
   #endif                                  /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
 
-/* ***   Endianess   *** */
-typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
+  /* ***   Endianness   *** */
 
   /*!
-   * XXH_CPU_LITTLE_ENDIAN:
+   * @ingroup tuning
+   * @def XXH_CPU_LITTLE_ENDIAN
+   * @brief Whether the target is little endian.
+   *
    * Defined to 1 if the target is little endian, or 0 if it is big endian.
    * It can be defined externally, for example on the compiler command line.
    *
-   * If it is not defined, a runtime check (which is usually constant folded)
-   * is used instead.
+   * If it is not defined,
+   * a runtime check (which is usually constant folded) is used instead.
+   *
+   * @note
+   *   This is not necessarily defined to an integer constant.
+   *
+   * @see XXH_isLittleEndian() for the runtime check.
    */
   #ifndef XXH_CPU_LITTLE_ENDIAN
     /*
@@ -1170,8 +1808,11 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       #define XXH_CPU_LITTLE_ENDIAN 0
     #else
-/*
- * runtime test, presumed to simplify to a constant by compiler
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
  */
 static int XXH_isLittleEndian(void) {
 
@@ -1189,7 +1830,7 @@ static int XXH_isLittleEndian(void) {
   return one.c[0];
 
 }
-
+\
       #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
     #endif
   #endif
@@ -1205,6 +1846,19 @@ static int XXH_isLittleEndian(void) {
     #define XXH_HAS_BUILTIN(x) 0
   #endif
 
+  /*!
+   * @internal
+   * @def XXH_rotl32(x,r)
+   * @brief 32-bit rotate left.
+   *
+   * @param x The 32-bit integer to be rotated.
+   * @param r The number of bits to rotate.
+   * @pre
+   *   @p r > 0 && @p r < 32
+   * @note
+   *   @p x and @p r may be evaluated multiple times.
+   * @return The rotated result.
+   */
   #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
       XXH_HAS_BUILTIN(__builtin_rotateleft64)
     #define XXH_rotl32 __builtin_rotateleft32
@@ -1219,6 +1873,14 @@ static int XXH_isLittleEndian(void) {
     #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
   #endif
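
A worked sketch of the fallback form: the bit shifted out on the left
re-enters on the right (valid only for 0 < r < 32):

  /* e.g. rotl32_sketch(0x80000000U, 1) == 0x00000001U */
  static unsigned rotl32_sketch(unsigned x, unsigned r) {
    return (x << r) | (x >> (32 - r));
  }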
 
+  /*!
+   * @internal
+   * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+   * @brief A 32-bit byteswap.
+   *
+   * @param x The 32-bit integer to byteswap.
+   * @return @p x, byteswapped.
+   */
   #if defined(_MSC_VER)                                    /* Visual Studio */
     #define XXH_swap32 _byteswap_ulong
   #elif XXH_GCC_VERSION >= 403
@@ -1236,7 +1898,17 @@ static xxh_u32 XXH_swap32(xxh_u32 x) {
 /* ***************************
  *  Memory reads
  *****************************/
-typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+
+  XXH_aligned,                                                 /*!< Aligned */
+  XXH_unaligned                                     /*!< Possibly unaligned */
+
+} XXH_alignment;
 
   /*
    * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
@@ -1295,6 +1967,7 @@ XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void *  ptr,
 /* *************************************
  *  Misc
  ***************************************/
+/*! @ingroup public */
 XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
 
   return XXH_VERSION_NUMBER;
@@ -1304,16 +1977,19 @@ XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
 /* *******************************************************************
  *  32-bit hash functions
  *********************************************************************/
-static const xxh_u32 XXH_PRIME32_1 =
-    0x9E3779B1U;                      /* 0b10011110001101110111100110110001 */
-static const xxh_u32 XXH_PRIME32_2 =
-    0x85EBCA77U;                      /* 0b10000101111010111100101001110111 */
-static const xxh_u32 XXH_PRIME32_3 =
-    0xC2B2AE3DU;                      /* 0b11000010101100101010111000111101 */
-static const xxh_u32 XXH_PRIME32_4 =
-    0x27D4EB2FU;                      /* 0b00100111110101001110101100101111 */
-static const xxh_u32 XXH_PRIME32_5 =
-    0x165667B1U;                      /* 0b00010110010101100110011110110001 */
+/*!
+ * @}
+ * @defgroup xxh32_impl XXH32 implementation
+ * @ingroup impl
+ * @{
+
+ */
+/* #define instead of static const, to be used as initializers */
+  #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
+  #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
+  #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
+  #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
+  #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
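
This matters because a const-qualified variable is not an integer constant
expression in C; a sketch of an initializer that is only legal with the macro
form (the variable name is hypothetical):

  /* legal: the macros fold to a constant expression at compile time */
  static const unsigned xxh32_v1_seed0 = XXH_PRIME32_1 + XXH_PRIME32_2;
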
 
   #ifdef XXH_OLD_NAMES
     #define PRIME32_1 XXH_PRIME32_1
@@ -1323,19 +1999,29 @@ static const xxh_u32 XXH_PRIME32_5 =
     #define PRIME32_5 XXH_PRIME32_5
   #endif
 
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
 
   acc += input * XXH_PRIME32_2;
   acc = XXH_rotl32(acc, 13);
   acc *= XXH_PRIME32_1;
-  #if defined(__GNUC__) && defined(__SSE4_1__) && \
+  #if (defined(__SSE4_1__) || defined(__aarch64__)) && \
       !defined(XXH_ENABLE_AUTOVECTORIZE)
   /*
    * UGLY HACK:
-   * This inline assembly hack forces acc into a normal register. This is the
-   * only thing that prevents GCC and Clang from autovectorizing the XXH32
-   * loop (pragmas and attributes don't work for some resason) without globally
-   * disabling SSE4.1.
+   * A compiler fence is the only thing that prevents GCC and Clang from
+   * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+   * reason) without globally disabling SSE4.1.
    *
    * The reason we want to avoid vectorization is because despite working on
    * 4 integers at a time, there are multiple factors slowing XXH32 down on
@@ -1360,28 +2046,26 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
    *   can load data, while v3 can multiply. SSE forces them to operate
    *   together.
    *
-   * How this hack works:
-   * __asm__(""       // Declare an assembly block but don't declare any
-   * instructions :       // However, as an Input/Output Operand,
-   *          "+r"    // constrain a read/write operand (+) as a general purpose
-   * register (r). (acc)   // and set acc as the operand
-   * );
-   *
-   * Because of the 'r', the compiler has promised that seed will be in a
-   * general purpose register and the '+' says that it will be 'read/write',
-   * so it has to assume it has changed. It is like volatile without all the
-   * loads and stores.
-   *
-   * Since the argument has to be in a normal register (not an SSE register),
-   * each time XXH32_round is called, it is impossible to vectorize.
+   * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+   * and it is pointless writing a NEON implementation that is basically the
+   * same speed as scalar for XXH32.
    */
-  __asm__("" : "+r"(acc));
+  XXH_COMPILER_GUARD(acc);
   #endif
   return acc;
 
 }
 
-/* mix all bits */
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param h32 The hash to avalanche.
+ * @return The avalanched hash.
+ */
 static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
 
   h32 ^= h32 >> 15;
@@ -1395,11 +2079,23 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
 
   #define XXH_get32bits(p) XXH_readLE32_align(p, align)
 
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param h32 The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ */
 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
-
-  /* dummy comment */
-
+\
   #define XXH_PROCESS1                           \
     do {                                         \
                                                  \
@@ -1443,20 +2139,20 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 12:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 8:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 4:
         XXH_PROCESS4;
         return XXH32_avalanche(h32);
 
       case 13:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 9:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 5:
         XXH_PROCESS4;
         XXH_PROCESS1;
@@ -1464,10 +2160,10 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 14:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 10:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 6:
         XXH_PROCESS4;
         XXH_PROCESS1;
@@ -1476,22 +2172,22 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
 
       case 15:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 11:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 7:
         XXH_PROCESS4;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 3:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 2:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 1:
         XXH_PROCESS1;
-        /* fallthrough */
+        XXH_FALLTHROUGH;
       case 0:
         return XXH32_avalanche(h32);
 
@@ -1512,10 +2208,18 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
     #undef XXH_PROCESS4
   #endif
 
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input, len, seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
                                             xxh_u32 seed, XXH_alignment align) {
 
-  const xxh_u8 *bEnd = input + len;
+  const xxh_u8 *bEnd = input ? input + len : NULL;
   xxh_u32       h32;
 
   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -1565,6 +2269,7 @@ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
                                   XXH32_hash_t seed) {
 
@@ -1574,9 +2279,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
     XXH32_reset(&state, seed);
     XXH32_update(&state, (const xxh_u8*)input, len);
     return XXH32_digest(&state);
-
   #else
-
   if (XXH_FORCE_ALIGN_CHECK) {
 
     if ((((size_t)input) & 3) ==
@@ -1593,13 +2296,16 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
 }
 
 /*******   Hash streaming   *******/
-
+/*!
+ * @ingroup xxh32_family
+ */
 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
 
   return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
 
   XXH_free(statePtr);
@@ -1607,6 +2313,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dstState,
                                     const XXH32_state_t *srcState) {
 
@@ -1614,6 +2321,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dstState,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
                                          XXH32_hash_t   seed) {
 
@@ -1630,6 +2338,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
                                           const void *input, size_t len) {
 
@@ -1719,6 +2428,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
 
   xxh_u32 h32;
@@ -1743,7 +2453,8 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
 
 /*******   Canonical representation   *******/
 
-/*
+/*!
+ * @ingroup xxh32_family
  * The default return values from XXH functions are unsigned 32 and 64 bit
  * integers.
  *
@@ -1765,6 +2476,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh32_family */
 XXH_PUBLIC_API XXH32_hash_t
 XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
 
@@ -1777,7 +2489,12 @@ XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
 /* *******************************************************************
  *  64-bit hash functions
  *********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
 
+ */
 /*******   Memory access   *******/
 
 typedef XXH64_hash_t xxh_u64;
@@ -1786,40 +2503,6 @@ typedef XXH64_hash_t xxh_u64;
       #define U64 xxh_u64
     #endif
 
-    /*!
-     * XXH_REROLL_XXH64:
-     * Whether to reroll the XXH64_finalize() loop.
-     *
-     * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
-     * performance gain on 64-bit hosts, as only one jump is required.
-     *
-     * However, on 32-bit hosts, because arithmetic needs to be done with two
-     * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't
-     * beneficial to unroll. The code becomes ridiculously large (the largest
-     * function in the binary on i386!), and rerolling it saves anywhere from
-     * 3kB to 20kB. It is also slightly faster because it fits into cache better
-     * and is more likely to be inlined by the compiler.
-     *
-     * If XXH_REROLL is defined, this is ignored and the loop is always
-     * rerolled.
-     */
-    #ifndef XXH_REROLL_XXH64
-      #if (defined(__ILP32__) ||                                              \
-           defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
-          || !(defined(__x86_64__) || defined(_M_X64) ||                      \
-               defined(_M_AMD64) /* x86-64 */                                 \
-               || defined(_M_ARM64) || defined(__aarch64__) ||                \
-               defined(__arm64__) /* aarch64 */                               \
-               || defined(__PPC64__) || defined(__PPC64LE__) ||               \
-               defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */       \
-               || defined(__mips64__) || defined(__mips64)) /* mips64 */      \
-          || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX)  /* check limits */
-        #define XXH_REROLL_XXH64 1
-      #else
-        #define XXH_REROLL_XXH64 0
-      #endif
-    #endif                                    /* !defined(XXH_REROLL_XXH64) */
-
     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
     /*
      * Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1950,23 +2633,35 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void *  ptr,
 
 }
 
-/*******   xxh64   *******/
+    /*******   xxh64   *******/
+    /*!
+     * @}
+     * @defgroup xxh64_impl XXH64 implementation
+     * @ingroup impl
+     * @{
 
-static const xxh_u64 XXH_PRIME64_1 =
-    0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
-                            */
-static const xxh_u64 XXH_PRIME64_2 =
-    0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
-                            */
-static const xxh_u64 XXH_PRIME64_3 =
-    0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
-                            */
-static const xxh_u64 XXH_PRIME64_4 =
-    0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
-                            */
-static const xxh_u64 XXH_PRIME64_5 =
-    0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
-                            */
+     */
+    /* #define rather than static const, to be used as initializers */
+    #define XXH_PRIME64_1                                                                         \
+      0x9E3779B185EBCA87ULL /*!<                                                                  \
+                               0b1001111000110111011110011011000110000101111010111100101010000111 \
+                             */
+    #define XXH_PRIME64_2                                                                         \
+      0xC2B2AE3D27D4EB4FULL /*!<                                                                  \
+                               0b1100001010110010101011100011110100100111110101001110101101001111 \
+                             */
+    #define XXH_PRIME64_3                                                                         \
+      0x165667B19E3779F9ULL /*!<                                                                  \
+                               0b0001011001010110011001111011000110011110001101110111100111111001 \
+                             */
+    #define XXH_PRIME64_4                                                                         \
+      0x85EBCA77C2B2AE63ULL /*!<                                                                  \
+                               0b1000010111101011110010100111011111000010101100101010111001100011 \
+                             */
+    #define XXH_PRIME64_5                                                                         \
+      0x27D4EB2F165667C5ULL /*!<                                                                  \
+                               0b0010011111010100111010110010111100010110010101100110011111000101 \
+                             */
 
     #ifdef XXH_OLD_NAMES
       #define PRIME64_1 XXH_PRIME64_1
@@ -2010,185 +2705,35 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
                               XXH_alignment align) {
 
-    /* dummy comment */
-
-    #define XXH_PROCESS1_64                        \
-      do {                                         \
-                                                   \
-        h64 ^= (*ptr++) * XXH_PRIME64_5;           \
-        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
-                                                   \
-      } while (0)
-
-    #define XXH_PROCESS4_64                                        \
-      do {                                                         \
-                                                                   \
-        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;      \
-        ptr += 4;                                                  \
-        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
-                                                                   \
-      } while (0)
-
-    #define XXH_PROCESS8_64                                        \
-      do {                                                         \
-                                                                   \
-        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));     \
-        ptr += 8;                                                  \
-        h64 ^= k1;                                                 \
-        h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
-                                                                   \
-      } while (0)
-
-  /* Rerolled version for 32-bit targets is faster and much smaller. */
-  if (XXH_REROLL || XXH_REROLL_XXH64) {
-
-    len &= 31;
-    while (len >= 8) {
-
-      XXH_PROCESS8_64;
-      len -= 8;
-
-    }
-
-    if (len >= 4) {
-
-      XXH_PROCESS4_64;
-      len -= 4;
-
-    }
-
-    while (len > 0) {
+  len &= 31;
+  while (len >= 8) {
 
-      XXH_PROCESS1_64;
-      --len;
+    xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+    ptr += 8;
+    h64 ^= k1;
+    h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+    len -= 8;
 
-    }
+  }
 
-    return XXH64_avalanche(h64);
+  if (len >= 4) {
 
-  } else {
+    h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+    ptr += 4;
+    h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+    len -= 4;
 
-    switch (len & 31) {
+  }
 
-      case 24:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 16:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 8:
-        XXH_PROCESS8_64;
-        return XXH64_avalanche(h64);
-
-      case 28:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 20:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 12:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 4:
-        XXH_PROCESS4_64;
-        return XXH64_avalanche(h64);
-
-      case 25:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 17:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 9:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 29:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 21:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 13:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 5:
-        XXH_PROCESS4_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 26:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 18:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 10:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 30:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 22:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 14:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 6:
-        XXH_PROCESS4_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 27:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 19:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 11:
-        XXH_PROCESS8_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        XXH_PROCESS1_64;
-        return XXH64_avalanche(h64);
-
-      case 31:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 23:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 15:
-        XXH_PROCESS8_64;
-        /* fallthrough */
-      case 7:
-        XXH_PROCESS4_64;
-        /* fallthrough */
-      case 3:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 2:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 1:
-        XXH_PROCESS1_64;
-        /* fallthrough */
-      case 0:
-        return XXH64_avalanche(h64);
+  while (len > 0) {
 
-    }
+    h64 ^= (*ptr++) * XXH_PRIME64_5;
+    h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
+    --len;
 
   }
 
-  /* impossible to reach */
-  XXH_ASSERT(0);
-  return 0;          /* unreachable, but some compilers complain without it */
+  return XXH64_avalanche(h64);
 
 }
 
@@ -2205,7 +2750,7 @@ static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
                                             xxh_u64 seed, XXH_alignment align) {
 
-  const xxh_u8 *bEnd = input + len;
+  const xxh_u8 *bEnd = input ? input + len : NULL;
   xxh_u64       h64;
 
     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -2259,6 +2804,7 @@ XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
                                   XXH64_hash_t seed) {
 
@@ -2268,9 +2814,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
     XXH64_reset(&state, seed);
     XXH64_update(&state, (const xxh_u8*)input, len);
     return XXH64_digest(&state);
-
     #else
-
   if (XXH_FORCE_ALIGN_CHECK) {
 
     if ((((size_t)input) & 7) ==
@@ -2289,12 +2833,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
 
 /*******   Hash Streaming   *******/
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
 
   return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
 
   XXH_free(statePtr);
@@ -2302,6 +2848,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *      dstState,
                                     const XXH64_state_t *srcState) {
 
@@ -2309,6 +2856,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *      dstState,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
                                          XXH64_hash_t   seed) {
 
@@ -2325,6 +2873,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
                                           const void *input, size_t len) {
 
@@ -2403,6 +2952,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
 
   xxh_u64 h64;
@@ -2436,6 +2986,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
 
 /******* Canonical representation   *******/
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
                                             XXH64_hash_t       hash) {
 
@@ -2445,6 +2996,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh64_family */
 XXH_PUBLIC_API XXH64_hash_t
 XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
 
@@ -2452,380 +3004,452 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
 
 }
 
-  /* *********************************************************************
-   *  XXH3
-   *  New generation hash designed for speed on small keys and vectorization
-   ************************************************************************ */
+    #ifndef XXH_NO_XXH3
 
-  /* ===   Compiler specifics   === */
+    /* *********************************************************************
+     *  XXH3
+     *  New generation hash designed for speed on small keys and vectorization
+     ************************************************************************ */
+    /*!
+     * @}
+     * @defgroup xxh3_impl XXH3 implementation
+     * @ingroup impl
+     * @{
 
-    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  /* >= C99 */
-      #define XXH_RESTRICT restrict
-    #else
-      /* Note: it might be useful to define __restrict or __restrict__ for some
-       * C++ compilers */
-      #define XXH_RESTRICT                                       /* disable */
-    #endif
+     */
 
-    #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
-        (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
-        defined(__clang__)
-      #define XXH_likely(x) __builtin_expect(x, 1)
-      #define XXH_unlikely(x) __builtin_expect(x, 0)
-    #else
-      #define XXH_likely(x) (x)
-      #define XXH_unlikely(x) (x)
-    #endif
+    /* ===   Compiler specifics   === */
 
-    #if defined(__GNUC__)
-      #if defined(__AVX2__)
-        #include <immintrin.h>
-      #elif defined(__SSE2__)
-        #include <emmintrin.h>
-      #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-        #define inline __inline__                 /* circumvent a clang bug */
-        #include <arm_neon.h>
-        #undef inline
+      #if ((defined(sun) || defined(__sun)) &&                                \
+           __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \
+                           with GCC 5.5 */
+        #define XXH_RESTRICT                                     /* disable */
+      #elif defined(__STDC_VERSION__) && \
+          __STDC_VERSION__ >= 199901L                             /* >= C99 */
+        #define XXH_RESTRICT restrict
+      #else
+        /* Note: it might be useful to define __restrict or __restrict__ for
+         * some C++ compilers */
+        #define XXH_RESTRICT                                     /* disable */
       #endif
-    #elif defined(_MSC_VER)
-      #include <intrin.h>
-    #endif
-
-    /*
-     * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
-     * remaining a true 64-bit/128-bit hash function.
-     *
-     * This is done by prioritizing a subset of 64-bit operations that can be
-     * emulated without too many steps on the average 32-bit machine.
-     *
-     * For example, these two lines seem similar, and run equally fast on
-     * 64-bit:
-     *
-     *   xxh_u64 x;
-     *   x ^= (x >> 47); // good
-     *   x ^= (x >> 13); // bad
-     *
-     * However, to a 32-bit machine, there is a major difference.
-     *
-     * x ^= (x >> 47) looks like this:
-     *
-     *   x.lo ^= (x.hi >> (47 - 32));
-     *
-     * while x ^= (x >> 13) looks like this:
-     *
-     *   // note: funnel shifts are not usually cheap.
-     *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
-     *   x.hi ^= (x.hi >> 13);
-     *
-     * The first one is significantly faster than the second, simply because the
-     * shift is larger than 32. This means:
-     *  - All the bits we need are in the upper 32 bits, so we can ignore the
-     * lower 32 bits in the shift.
-     *  - The shift result will always fit in the lower 32 bits, and therefore,
-     *    we can ignore the upper 32 bits in the xor.
-     *
-     * Thanks to this optimization, XXH3 only requires these features to be
-     * efficient:
-     *
-     *  - Usable unaligned access
-     *  - A 32-bit or 64-bit ALU
-     *      - If 32-bit, a decent ADC instruction
-     *  - A 32 or 64-bit multiply with a 64-bit result
-     *  - For the 128-bit variant, a decent byteswap helps short inputs.
-     *
-     * The first two are already required by XXH32, and almost all 32-bit and
-     * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
-     *
-     * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
-     * notable exception.
-     *
-     * First of all, Thumb-1 lacks support for the UMULL instruction which
-     * performs the important long multiply. This means numerous __aeabi_lmul
-     * calls.
-     *
-     * Second of all, the 8 functional registers are just not enough.
-     * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
-     * need Lo registers, and this shuffling results in thousands more MOVs than
-     * A32.
-     *
-     * A32 and T32 don't have this limitation. They can access all 14 registers,
-     * do a 32->64 multiply with UMULL, and the flexible operand allowing free
-     * shifts is helpful, too.
-     *
-     * Therefore, we do a quick sanity check.
-     *
-     * If compiling Thumb-1 for a target which supports ARM instructions, we
-     * will emit a warning, as it is not a "sane" platform to compile for.
-     *
-     * Usually, if this happens, it is because of an accident and you probably
-     * need to specify -march, as you likely meant to compile for a newer
-     * architecture.
-     *
-     * Credit: large sections of the vectorial and asm source code paths
-     *         have been contributed by @easyaspi314
-     */
-    #if defined(__thumb__) && !defined(__thumb2__) && \
-        defined(__ARM_ARCH_ISA_ARM)
-      #warning "XXH3 is highly inefficient without ARM or Thumb-2."
-    #endif
 
-    /* ==========================================
-     * Vectorization detection
-     * ========================================== */
-    #define XXH_SCALAR 0                         /* Portable scalar version */
-    #define XXH_SSE2 1                 /* SSE2 for Pentium 4 and all x86_64 */
-    #define XXH_AVX2 2                    /* AVX2 for Haswell and Bulldozer */
-    #define XXH_AVX512 3                  /* AVX512 for Skylake and Icelake */
-    #define XXH_NEON 4             /* NEON for most ARMv7-A and all AArch64 */
-    #define XXH_VSX 5                     /* VSX and ZVector for POWER8/z13 */
-
-    #ifndef XXH_VECTOR                    /* can be defined on command line */
-      #if defined(__AVX512F__)
-        #define XXH_VECTOR XXH_AVX512
-      #elif defined(__AVX2__)
-        #define XXH_VECTOR XXH_AVX2
-      #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
-          (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-        #define XXH_VECTOR XXH_SSE2
-      #elif defined(__GNUC__) /* msvc support maybe later */                   \
-          && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&                 \
-          (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
-           || (defined(__BYTE_ORDER__) &&                                      \
-               __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
-        #define XXH_VECTOR XXH_NEON
-      #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
-          (defined(__s390x__) && defined(__VEC__)) &&             \
-              defined(__GNUC__)                             /* TODO: IBM XL */
-        #define XXH_VECTOR XXH_VSX
+      #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
+          (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
+          defined(__clang__)
+        #define XXH_likely(x) __builtin_expect(x, 1)
+        #define XXH_unlikely(x) __builtin_expect(x, 0)
       #else
-        #define XXH_VECTOR XXH_SCALAR
+        #define XXH_likely(x) (x)
+        #define XXH_unlikely(x) (x)
       #endif
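
A minimal usage sketch (hypothetical helper): the hint only steers block
layout for branch prediction, and without builtin support the macros decay to
the bare condition:

  static int nonnegative_len(int len) {
    if (XXH_unlikely(len < 0)) return 0;                    /* cold path */
    return len;
  }
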
-    #endif
 
-    /*
-     * Controls the alignment of the accumulator,
-     * for compatibility with aligned vector loads, which are usually faster.
-     */
-    #ifndef XXH_ACC_ALIGN
-      #if defined(XXH_X86DISPATCH)
-        #define XXH_ACC_ALIGN 64           /* for compatibility with avx512 */
-      #elif XXH_VECTOR == XXH_SCALAR                              /* scalar */
-        #define XXH_ACC_ALIGN 8
-      #elif XXH_VECTOR == XXH_SSE2                                  /* sse2 */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_AVX2                                  /* avx2 */
-        #define XXH_ACC_ALIGN 32
-      #elif XXH_VECTOR == XXH_NEON                                  /* neon */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_VSX                                    /* vsx */
-        #define XXH_ACC_ALIGN 16
-      #elif XXH_VECTOR == XXH_AVX512                              /* avx512 */
-        #define XXH_ACC_ALIGN 64
+      #if defined(__GNUC__)
+        #if defined(__AVX2__)
+          #include <immintrin.h>
+        #elif defined(__SSE2__)
+          #include <emmintrin.h>
+        #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+          #define inline __inline__               /* circumvent a clang bug */
+          #include <arm_neon.h>
+          #undef inline
+        #endif
+      #elif defined(_MSC_VER)
+        #include <intrin.h>
       #endif
-    #endif
 
-    #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
-        XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
-      #define XXH_SEC_ALIGN XXH_ACC_ALIGN
-    #else
-      #define XXH_SEC_ALIGN 8
-    #endif
-
-    /*
-     * UGLY HACK:
-     * GCC usually generates the best code with -O3 for xxHash.
-     *
-     * However, when targeting AVX2, it is overzealous in its unrolling
-     * resulting in code roughly 3/4 the speed of Clang.
-     *
-     * There are other issues, such as GCC splitting _mm256_loadu_si256 into
-     * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
-     * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
-     *
-     * That is why when compiling the AVX2 version, it is recommended to use
-     * either -O2 -mavx2 -march=haswell or -O2 -mavx2
-     * -mno-avx256-split-unaligned-load for decent performance, or to use Clang
-     * instead.
-     *
-     * Fortunately, we can control the first one with a pragma that forces GCC
-     * into -O2, but the other one we can't control without "failed to inline
-     * always inline function due to target mismatch" warnings.
-     */
-    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-        && defined(__OPTIMIZE__) &&                                      \
-        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
-      #pragma GCC push_options
-      #pragma GCC optimize("-O2")
-    #endif
-
-    #if XXH_VECTOR == XXH_NEON
       /*
-       * NEON's setup for vmlal_u32 is a little more complicated than it is on
-       * SSE2, AVX2, and VSX.
+       * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+       * remaining a true 64-bit/128-bit hash function.
        *
-       * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
-       * upcast.
+       * This is done by prioritizing a subset of 64-bit operations that can be
+       * emulated without too many steps on the average 32-bit machine.
        *
-       * To do the same operation, the 128-bit 'Q' register needs to be split
-       * into two 64-bit 'D' registers, performing this operation::
+       * For example, these two lines seem similar, and run equally fast on
+       * 64-bit:
        *
-       *   [                a                 |                 b ] |
-       * '---------. .--------'                | |                         x |
-       *            |              .---------' '--------.                |
-       *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32 ]
+       *   xxh_u64 x;
+       *   x ^= (x >> 47); // good
+       *   x ^= (x >> 13); // bad
        *
-       * Due to significant changes in aarch64, the fastest method for aarch64
-       * is completely different than the fastest method for ARMv7-A.
+       * However, to a 32-bit machine, there is a major difference.
        *
-       * ARMv7-A treats D registers as unions overlaying Q registers, so
-       * modifying D11 will modify the high half of Q5. This is similar to how
-       * modifying AH will only affect bits 8-15 of AX on x86.
+       * x ^= (x >> 47) looks like this:
        *
-       * VZIP takes two registers, and puts even lanes in one register and odd
-       * lanes in the other.
+       *   x.lo ^= (x.hi >> (47 - 32));
        *
-       * On ARMv7-A, this strangely modifies both parameters in place instead of
-       * taking the usual 3-operand form.
+       * while x ^= (x >> 13) looks like this:
        *
-       * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on
-       * the lower and upper halves of the Q register to end up with the high
-       * and low halves where we want - all in one instruction.
+       *   // note: funnel shifts are not usually cheap.
+       *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+       *   x.hi ^= (x.hi >> 13);
        *
+       *   vzip.32   d10, d11   @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
+       * The first one is significantly faster than the second, simply because
+       * the shift is larger than 32. This means:
+       *  - All the bits we need are in the upper 32 bits, so we can ignore the
+       * lower 32 bits in the shift.
+       *  - The shift result will always fit in the lower 32 bits, and
+       * therefore, we can ignore the upper 32 bits in the xor.
        *
-       * Unfortunately we need inline assembly for this: Instructions modifying
-       * two registers at once is not possible in GCC or Clang's IR, and they
-       * have to create a copy.
+       * Thanks to this optimization, XXH3 only requires these features to be
+       * efficient:
        *
-       * aarch64 requires a different approach.
+       *  - Usable unaligned access
+       *  - A 32-bit or 64-bit ALU
+       *      - If 32-bit, a decent ADC instruction
+       *  - A 32 or 64-bit multiply with a 64-bit result
+       *  - For the 128-bit variant, a decent byteswap helps short inputs.
        *
-       * In order to make it easier to write a decent compiler for aarch64, many
-       * quirks were removed, such as conditional execution.
+       * The first two are already required by XXH32, and almost all 32-bit and
+       * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
        *
-       * NEON was also affected by this.
+       * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is
+       * one notable exception.
        *
-       * aarch64 cannot access the high bits of a Q-form register, and writes to
-       * a D-form register zero the high bits, similar to how writes to W-form
-       * scalar registers (or DWORD registers on x86_64) work.
+       * First of all, Thumb-1 lacks support for the UMULL instruction which
+       * performs the important long multiply. This means numerous __aeabi_lmul
+       * calls.
        *
-       * The formerly free vget_high intrinsics now require a vext (with a few
-       * exceptions)
+       * Second of all, the 8 functional registers are just not enough.
+       * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
+       * need Lo registers, and this shuffling results in thousands more MOVs
+       * than A32.
        *
-       * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
-       * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
-       * only modify one operand.
+       * A32 and T32 don't have this limitation. They can access all 14
+       * registers, do a 32->64 multiply with UMULL, and the flexible operand
+       * allowing free shifts is helpful, too.
        *
-       * The equivalent of the VZIP.32 on the lower and upper halves would be
-       * this mess:
+       * Therefore, we do a quick sanity check.
        *
+       *   ext     v2.4s, v0.4s, v0.4s, #2  // v2 = { v0[2], v0[3], v0[0], v0[1] }
+       *   zip1    v1.2s, v0.2s, v2.2s      // v1 = { v0[0], v2[0] }
+       *   zip2    v0.2s, v0.2s, v1.2s      // v0 = { v0[1], v2[1] }
+       * If compiling Thumb-1 for a target which supports ARM instructions, we
+       * will emit a warning, as it is not a "sane" platform to compile for.
        *
-       * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
-       * (SHRN):
+       * Usually, if this happens, it is because of an accident and you probably
+       * need to specify -march, as you likely meant to compile for a newer
+       * architecture.
        *
-       *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
-       *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
-       *
-       * This is available on ARMv7-A, but is less efficient than a single
-       * VZIP.32.
+       * Credit: large sections of the vectorial and asm source code paths
+       *         have been contributed by @easyaspi314
        */
+      #if defined(__thumb__) && !defined(__thumb2__) && \
+          defined(__ARM_ARCH_ISA_ARM)
+        #warning "XXH3 is highly inefficient without ARM or Thumb-2."
+      #endif
+
+    /* ==========================================
+     * Vectorization detection
+     * ========================================== */
+
+      #ifdef XXH_DOXYGEN
+        /*!
+         * @ingroup tuning
+         * @brief Overrides the vectorization implementation chosen for XXH3.
+         *
+         * Can be defined to 0 to disable SIMD or any of the values mentioned in
+         * @ref XXH_VECTOR_TYPE.
+         *
+         * If this is not defined, it uses predefined macros to determine the
+         * best implementation.
+         */
+        #define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * @ref XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+
+  XXH_SCALAR = 0,                              /*!< Portable scalar version */
+  XXH_SSE2 = 1,   /*!<
+                   * SSE2 for Pentium 4, Opteron, all x86_64.
+                   *
+                   * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                   * Android x86.
+                   */
+  XXH_AVX2 = 2,                         /*!< AVX2 for Haswell and Bulldozer */
+  XXH_AVX512 = 3,                       /*!< AVX512 for Skylake and Icelake */
+  XXH_NEON = 4,                  /*!< NEON for most ARMv7-A and all AArch64 */
+  XXH_VSX = 5,                 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+
+};
+
+        /*!
+         * @ingroup tuning
+         * @brief Selects the minimum alignment for XXH3's accumulators.
+         *
+         * When using SIMD, this should match the alignment required for said
+         * vector type, so, for example, 32 for AVX2.
+         *
+         * Default: Auto detected.
+         */
+        #define XXH_ACC_ALIGN 8
+      #endif
+
+      /* Actual definition */
+      #ifndef XXH_DOXYGEN
+        #define XXH_SCALAR 0
+        #define XXH_SSE2 1
+        #define XXH_AVX2 2
+        #define XXH_AVX512 3
+        #define XXH_NEON 4
+        #define XXH_VSX 5
+      #endif
+
+      #ifndef XXH_VECTOR                  /* can be defined on command line */
+        #if defined(__AVX512F__)
+          #define XXH_VECTOR XXH_AVX512
+        #elif defined(__AVX2__)
+          #define XXH_VECTOR XXH_AVX2
+        #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
+            (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+          #define XXH_VECTOR XXH_SSE2
+        #elif defined(__GNUC__) /* msvc support maybe later */               \
+            && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&             \
+            (defined(                                                        \
+                 __LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+             || (defined(__BYTE_ORDER__) &&                                  \
+                 __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+          #define XXH_VECTOR XXH_NEON
+        #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
+            (defined(__s390x__) && defined(__VEC__)) &&             \
+                defined(__GNUC__)                           /* TODO: IBM XL */
+          #define XXH_VECTOR XXH_VSX
+        #else
+          #define XXH_VECTOR XXH_SCALAR
+        #endif
+      #endif
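+      /*
+       * A minimal usage sketch: XXH_VECTOR can be pinned ahead of the
+       * detection above, either on the command line or in code, e.g. to
+       * force the portable scalar path:
+       *
+       *   cc -DXXH_VECTOR=0 ...                   (0 == XXH_SCALAR)
+       *
+       * or, before this header is processed:
+       *
+       *   #define XXH_VECTOR XXH_SCALAR
+       */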
 
       /*
-       * Function-like macro:
-       * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
-       * &outHi)
-       * {
-
-       *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
-       *     outHi = (uint32x2_t)(in >> 32);
-       *     in = UNDEFINED;
-       * }
+       * Controls the alignment of the accumulator,
+       * for compatibility with aligned vector loads, which are usually faster.
        */
-      #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-          && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
-        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
-          do {                                                                                         \
-                                                                                                       \
-            /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
-             * %f0 = upper D half */                                                                   \
-            /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
-             */                                                                                        \
-            /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
-             */                                                                                        \
-            __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
-            (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
-            (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
-                                                                                                       \
-          } while (0)
+      #ifndef XXH_ACC_ALIGN
+        #if defined(XXH_X86DISPATCH)
+          #define XXH_ACC_ALIGN 64         /* for compatibility with avx512 */
+        #elif XXH_VECTOR == XXH_SCALAR                            /* scalar */
+          #define XXH_ACC_ALIGN 8
+        #elif XXH_VECTOR == XXH_SSE2                                /* sse2 */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_AVX2                                /* avx2 */
+          #define XXH_ACC_ALIGN 32
+        #elif XXH_VECTOR == XXH_NEON                                /* neon */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_VSX                                  /* vsx */
+          #define XXH_ACC_ALIGN 16
+        #elif XXH_VECTOR == XXH_AVX512                            /* avx512 */
+          #define XXH_ACC_ALIGN 64
+        #endif
+      #endif
 
+      #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
+          XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+        #define XXH_SEC_ALIGN XXH_ACC_ALIGN
       #else
-        #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-          do {                                       \
-                                                     \
-            (outLo) = vmovn_u64(in);                 \
-            (outHi) = vshrn_n_u64((in), 32);         \
-                                                     \
-          } while (0)
+        #define XXH_SEC_ALIGN 8
+      #endif
 
+      /*
+       * UGLY HACK:
+       * GCC usually generates the best code with -O3 for xxHash.
+       *
+       * However, when targeting AVX2, it is overzealous in its unrolling,
+       * resulting in code roughly 3/4 the speed of Clang's output.
+       *
+       * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+       * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization
+       * which only applies to Sandy and Ivy Bridge... which don't even support
+       * AVX2.
+       *
+       * That is why when compiling the AVX2 version, it is recommended to use
+       * either -O2 -mavx2 -march=haswell or -O2 -mavx2
+       * -mno-avx256-split-unaligned-load for decent performance, or to use
+       * Clang instead.
+       *
+       * Fortunately, we can control the first one with a pragma that forces GCC
+       * into -O2, but the other one we can't control without "failed to inline
+       * always inline function due to target mismatch" warnings.
+       */
+      #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+          && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+          && defined(__OPTIMIZE__) &&                                      \
+          !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
+        #pragma GCC push_options
+        #pragma GCC optimize("-O2")
       #endif
-    #endif                                        /* XXH_VECTOR == XXH_NEON */
 
-    /*
-     * VSX and Z Vector helpers.
-     *
-     * This is very messy, and any pull requests to clean this up are welcome.
-     *
-     * There are a lot of problems with supporting VSX and s390x, due to
-     * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
-     */
-    #if XXH_VECTOR == XXH_VSX
-      #if defined(__s390x__)
-        #include <s390intrin.h>
-      #else
-        /* gcc's altivec.h can have the unwanted consequence to unconditionally
-         * #define bool, vector, and pixel keywords,
-         * with bad consequences for programs already using these keywords for
-         * other purposes. The paragraph defining these macros is skipped when
-         * __APPLE_ALTIVEC__ is defined.
-         * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
-         * compiler, but it seems that, in some cases, it isn't. Force the build
-         * macro to be defined, so that keywords are not altered.
+      #if XXH_VECTOR == XXH_NEON
+        /*
+         * NEON's setup for vmlal_u32 is a little more complicated than it is on
+         * SSE2, AVX2, and VSX.
+         *
+         * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
+         * upcast.
+         *
+         * To do the same operation, the 128-bit 'Q' register needs to be split
+         * into two 64-bit 'D' registers, performing this operation::
+         *
+         *   [                a                 |                 b              ]
+         *            |              '---------. .--------'                |
+         *            |                         x                          |
+         *            |              .---------' '--------.                |
+         *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32 ]
+         *
+         * Due to significant changes in aarch64, the fastest method for aarch64
+         * is completely different than the fastest method for ARMv7-A.
+         *
+         * ARMv7-A treats D registers as unions overlaying Q registers, so
+         * modifying D11 will modify the high half of Q5. This is similar to how
+         * modifying AH will only affect bits 8-15 of AX on x86.
+         *
+         * VZIP takes two registers, and puts even lanes in one register and odd
+         * lanes in the other.
+         *
+         * On ARMv7-A, this strangely modifies both parameters in place instead
+         * of taking the usual 3-operand form.
+         *
+         * Therefore, if we want to do this, we can simply use a D-form VZIP.32
+         * on the lower and upper halves of the Q register to end up with the
+         * high and low halves where we want - all in one instruction.
+         *
+         *   vzip.32   d10, d11    @ d10 = { d10[0], d11[0] };
+         *                         @ d11 = { d10[1], d11[1] }
+         *
+         * Unfortunately, we need inline assembly for this: an instruction
+         * modifying two registers at once is not expressible in GCC or
+         * Clang's IR, so they have to create a copy.
+         *
+         * aarch64 requires a different approach.
+         *
+         * In order to make it easier to write a decent compiler for aarch64,
+         * many quirks were removed, such as conditional execution.
+         *
+         * NEON was also affected by this.
+         *
+         * aarch64 cannot access the high bits of a Q-form register, and writes
+         * to a D-form register zero the high bits, similar to how writes to
+         * W-form scalar registers (or DWORD registers on x86_64) work.
+         *
+         * The formerly free vget_high intrinsics now require a vext (with a few
+         * exceptions)
+         *
+         * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
+         * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
+         * only modify one operand.
+         *
+         * The equivalent of the VZIP.32 on the lower and upper halves would be
+         * this mess:
+         *
+         *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+         *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+         *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+         *
+         * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
+         * (SHRN):
+         *
+         *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+         *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+         *
+         * This is available on ARMv7-A, but is less efficient than a single
+         * VZIP.32.
          */
-        #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
-          #define __APPLE_ALTIVEC__
-        #endif
-        #include <altivec.h>
-      #endif
 
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char      xxh_u8x16;
-typedef __vector unsigned           xxh_u32x4;
+        /*!
+         * Function-like macro:
+         * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
+         * &outHi)
+         * {
 
-      #ifndef XXH_VSX_BE
-        #if defined(__BIG_ENDIAN__) ||  \
-            (defined(__BYTE_ORDER__) && \
-             __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-          #define XXH_VSX_BE 1
-        #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
-            __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
-          #warning \
-              "-maltivec=be is not recommended. Please use native endianness."
-          #define XXH_VSX_BE 1
+         *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+         *     outHi = (uint32x2_t)(in >> 32);
+         *     in = UNDEFINED;
+         * }
+         */
+        #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+            && defined(__GNUC__) && !defined(__aarch64__) &&   \
+            !defined(__arm64__)
+          #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
+            do {                                                                                         \
+                                                                                                         \
+              /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
+               * %f0 = upper D half */                                                                   \
+              /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
+               */                                                                                        \
+              /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
+               */                                                                                        \
+              __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
+              (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
+              (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
+                                                                                                         \
+            } while (0)
         #else
-          #define XXH_VSX_BE 0
+          #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+            do {                                       \
+                                                       \
+              (outLo) = vmovn_u64(in);                 \
+              (outHi) = vshrn_n_u64((in), 32);         \
+                                                       \
+            } while (0)
         #endif
-      #endif                                        /* !defined(XXH_VSX_BE) */
+      #endif                                      /* XXH_VECTOR == XXH_NEON */
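+
+      /*
+       * A usage sketch for the macro above (variable names are
+       * illustrative):
+       *
+       *   uint64x2_t acc = ...;
+       *   uint32x2_t accLo, accHi;
+       *   XXH_SPLIT_IN_PLACE(acc, accLo, accHi);
+       *
+       * afterwards accLo holds the low 32 bits of each lane, accHi the high
+       * 32 bits, and acc must be treated as clobbered.
+       */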
 
-      #if XXH_VSX_BE
-        /* A wrapper for POWER9's vec_revb. */
-        #if defined(__POWER9_VECTOR__) || \
-            (defined(__clang__) && defined(__s390x__))
-          #define XXH_vec_revb vec_revb
+      /*
+       * VSX and Z Vector helpers.
+       *
+       * This is very messy, and any pull requests to clean this up are welcome.
+       *
+       * There are a lot of problems with supporting VSX and s390x, due to
+       * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+       */
+      #if XXH_VECTOR == XXH_VSX
+        #if defined(__s390x__)
+          #include <s390intrin.h>
         #else
+          /* gcc's altivec.h can have the unwanted consequence of
+           * unconditionally #defining the bool, vector, and pixel keywords,
+           * which breaks programs already using these identifiers for other
+           * purposes. The paragraph defining these macros is skipped when
+           * __APPLE_ALTIVEC__ is defined.
+           * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
+           * compiler, but it seems that, in some cases, it isn't. Force the
+           * build macro to be defined, so that keywords are not altered.
+           */
+          #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+            #define __APPLE_ALTIVEC__
+          #endif
+          #include <altivec.h>
+        #endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char      xxh_u8x16;
+typedef __vector unsigned           xxh_u32x4;
+
+        #ifndef XXH_VSX_BE
+          #if defined(__BIG_ENDIAN__) ||  \
+              (defined(__BYTE_ORDER__) && \
+               __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            #define XXH_VSX_BE 1
+          #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
+              __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+            #warning \
+                "-maltivec=be is not recommended. Please use native endianness."
+            #define XXH_VSX_BE 1
+          #else
+            #define XXH_VSX_BE 0
+          #endif
+        #endif                                      /* !defined(XXH_VSX_BE) */
+
+        #if XXH_VSX_BE
+          #if defined(__POWER9_VECTOR__) || \
+              (defined(__clang__) && defined(__s390x__))
+            #define XXH_vec_revb vec_revb
+          #else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
 
   xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
@@ -2834,40 +3458,40 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
 
 }
 
-        #endif
-      #endif                                                  /* XXH_VSX_BE */
+          #endif
+        #endif                                                /* XXH_VSX_BE */
 
-/*
- * Performs an unaligned load and byte swaps it on big endian.
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
  */
 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
 
   xxh_u64x2 ret;
   memcpy(&ret, ptr, sizeof(xxh_u64x2));
-      #if XXH_VSX_BE
+        #if XXH_VSX_BE
   ret = XXH_vec_revb(ret);
-      #endif
+        #endif
   return ret;
 
 }
 
-      /*
-       * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
-       *
-       * These intrinsics weren't added until GCC 8, despite existing for a
-       * while, and they are endian dependent. Also, their meaning swap
-       * depending on version.
-       * */
-      #if defined(__s390x__)
-      /* s390x is always big endian, no issue on this platform */
-        #define XXH_vec_mulo vec_mulo
-        #define XXH_vec_mule vec_mule
-      #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
-        /* Clang has a better way to control this, we can just use the builtin
-         * which doesn't swap. */
-        #define XXH_vec_mulo __builtin_altivec_vmulouw
-        #define XXH_vec_mule __builtin_altivec_vmuleuw
-      #else
+        /*
+         * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+         *
+         * These intrinsics weren't added until GCC 8, despite existing for a
+         * while, and they are endian dependent. Also, their meanings swap
+         * depending on the compiler version.
+         */
+        #if defined(__s390x__)
+        /* s390x is always big endian, no issue on this platform */
+          #define XXH_vec_mulo vec_mulo
+          #define XXH_vec_mule vec_mule
+        #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+          /* Clang has a better way to control this, we can just use the builtin
+           * which doesn't swap. */
+          #define XXH_vec_mulo __builtin_altivec_vmulouw
+          #define XXH_vec_mule __builtin_altivec_vmuleuw
+        #else
 /* gcc needs inline assembly */
 /* Adapted from
  * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
@@ -2887,40 +3511,41 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
 
 }
 
-      #endif                                  /* XXH_vec_mulo, XXH_vec_mule */
-    #endif                                         /* XXH_VECTOR == XXH_VSX */
+        #endif                                /* XXH_vec_mulo, XXH_vec_mule */
+      #endif                                       /* XXH_VECTOR == XXH_VSX */
 
-    /* prefetch
-     * can be disabled, by declaring XXH_NO_PREFETCH build macro */
-    #if defined(XXH_NO_PREFETCH)
-      #define XXH_PREFETCH(ptr) (void)(ptr)                     /* disabled */
-    #else
-      #if defined(_MSC_VER) && \
-          (defined(_M_X64) ||  \
-           defined(            \
-               _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
-        #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-        #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
-      #elif defined(__GNUC__) && \
-          ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
-        #define XXH_PREFETCH(ptr) \
-          __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-      #else
+      /* prefetch
+       * can be disabled by declaring the XXH_NO_PREFETCH build macro */
+      #if defined(XXH_NO_PREFETCH)
         #define XXH_PREFETCH(ptr) (void)(ptr)                   /* disabled */
-      #endif
-    #endif                                               /* XXH_NO_PREFETCH */
+      #else
+        #if defined(_MSC_VER) && \
+            (defined(_M_X64) ||  \
+             defined(            \
+                 _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
+          #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+          #define XXH_PREFETCH(ptr) \
+            _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
+        #elif defined(__GNUC__) && \
+            ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+          #define XXH_PREFETCH(ptr) \
+            __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+        #else
+          #define XXH_PREFETCH(ptr) (void)(ptr)                 /* disabled */
+        #endif
+      #endif                                             /* XXH_NO_PREFETCH */
 
-  /* ==========================================
-   * XXH3 default settings
-   * ========================================== */
+    /* ==========================================
+     * XXH3 default settings
+     * ========================================== */
 
-    #define XXH_SECRET_DEFAULT_SIZE 192     /* minimum XXH3_SECRET_SIZE_MIN */
+      #define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
 
-    #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
-      #error "default keyset is not large enough"
-    #endif
+      #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+        #error "default keyset is not large enough"
+      #endif
 
-/* Pseudorandom secret taken directly from FARSH */
+/*! Pseudorandom secret taken directly from FARSH. */
 XXH_ALIGN(64)
 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
 
@@ -2943,69 +3568,79 @@ static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
 
 };
 
-    #ifdef XXH_OLD_NAMES
-      #define kSecret XXH3_kSecret
-    #endif
+      #ifdef XXH_OLD_NAMES
+        #define kSecret XXH3_kSecret
+      #endif
 
-    /*
-     * Calculates a 32-bit to 64-bit long multiply.
-     *
-     * Wraps __emulu on MSVC x86 because it tends to call __allmul when it
-     * doesn't need to (but it shouldn't need to anyways, it is about 7
-     * instructions to do a 64x64 multiply...). Since we know that this will
-     * _always_ emit MULL, we use that instead of the normal method.
-     *
-     * If you are compiling for platforms like Thumb-1 and don't have a better
-     * option, you may also want to write your own long multiply routine here.
-     *
-     * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
-     * {
+      #ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it
+ * doesn't need to (but it shouldn't need to anyway; it is about 7 instructions
+ * to do a 64x64 multiply...). Since we know that this will _always_ emit
+ * `MULL`, we use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better
+ * option, you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
 
-     *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
-     * }
-     */
-    #if defined(_MSC_VER) && defined(_M_IX86)
-      #include <intrin.h>
-      #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
-    #else
-      /*
-       * Downcast + upcast is usually better than masking on older compilers
-       * like GCC 4.2 (especially 32-bit ones), all without affecting newer
-       * compilers.
-       *
-       * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
-       * operands and perform a full 64x64 multiply -- entirely redundant on
-       * 32-bit.
-       */
-      #define XXH_mult32to64(x, y) \
-        ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
-    #endif
+  return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
 
-/*
- * Calculates a 64->128-bit long multiply.
+}
+
+      #elif defined(_MSC_VER) && defined(_M_IX86)
+        #include <intrin.h>
+        #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+      #else
+        /*
+         * Downcast + upcast is usually better than masking on older compilers
+         * like GCC 4.2 (especially 32-bit ones), all without affecting newer
+         * compilers.
+         *
+         * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
+         * operands and perform a full 64x64 multiply -- entirely redundant on
+         * 32-bit.
+         */
+        #define XXH_mult32to64(x, y) \
+          ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+      #endif
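+
+      /*
+       * A quick arithmetic check of the contract above: only the low 32 bits
+       * of each operand participate, so, for instance,
+       *
+       *   XXH_mult32to64(0x100000002ULL, 5) == 2 * 5 == 10
+       *   XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
+       */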
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
  *
- * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ * @param lhs, rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
  */
 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
 
-    /*
-     * GCC/Clang __uint128_t method.
-     *
-     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
-     * This is usually the best way as it usually uses a native long 64-bit
-     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
-     *
-     * Usually.
-     *
-     * Despite being a 32-bit platform, Clang (and emscripten) define this type
-     * despite not having the arithmetic for it. This results in a laggy
-     * compiler builtin call which calculates a full 128-bit multiply.
-     * In that case it is best to use the portable one.
-     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
-     */
-    #if defined(__GNUC__) && !defined(__wasm__) && \
-            defined(__SIZEOF_INT128__) ||          \
-        (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+      /*
+       * GCC/Clang __uint128_t method.
+       *
+       * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+       * This is usually the best way as it usually uses a native long 64-bit
+       * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+       *
+       * Usually.
+       *
+       * Despite being a 32-bit platform, Clang (and Emscripten) define this
+       * type even though they lack the native arithmetic for it. This results
+       * in a slow call to a compiler builtin which calculates a full 128-bit
+       * multiply.
+       * In that case it is best to use the portable one.
+       * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+       */
+      #if defined(__GNUC__) && !defined(__wasm__) && \
+              defined(__SIZEOF_INT128__) ||          \
+          (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
 
   __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
   XXH128_hash_t     r128;
@@ -3013,19 +3648,19 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.high64 = (xxh_u64)(product >> 64);
   return r128;
 
-      /*
-       * MSVC for x64's _umul128 method.
-       *
-       * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
-       * *HighProduct);
-       *
-       * This compiles to single operand MUL on x64.
-       */
-    #elif defined(_M_X64) || defined(_M_IA64)
+        /*
+         * MSVC for x64's _umul128 method.
+         *
+         * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
+         * *HighProduct);
+         *
+         * This compiles to single operand MUL on x64.
+         */
+      #elif defined(_M_X64) || defined(_M_IA64)
 
-      #ifndef _MSC_VER
-        #pragma intrinsic(_umul128)
-      #endif
+        #ifndef _MSC_VER
+          #pragma intrinsic(_umul128)
+        #endif
   xxh_u64       product_high;
   xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
   XXH128_hash_t r128;
@@ -3033,7 +3668,7 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.high64 = product_high;
   return r128;
 
-    #else
+      #else
   /*
    * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
    *
@@ -3093,16 +3728,20 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
   r128.low64 = lower;
   r128.high64 = upper;
   return r128;
-    #endif
+      #endif
 
 }
 
-/*
- * Does a 64-bit to 128-bit multiply, then XOR folds it.
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
  *
  * The reason for the separate function is to prevent passing too many structs
  * around by value. This will hopefully inline the multiply, but we don't force
  * it.
+ *
+ * @param lhs, rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
  */
 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
 
@@ -3111,7 +3750,7 @@ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
 
 }
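+
+/*
+ * A sketch of the fold above: with product = XXH_mult64to128(lhs, rhs), the
+ * result is product.low64 ^ product.high64. For example,
+ * XXH3_mul128_fold64(1ULL << 32, 1ULL << 32) == 1, since the 128-bit product
+ * is 1 << 64 (low64 == 0, high64 == 1).
+ */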
 
-/* Seems to produce slightly better code on GCC for some reason. */
+/*! Seems to produce slightly better code on GCC for some reason. */
 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
 
   XXH_ASSERT(0 <= shift && shift < 64);
@@ -3216,7 +3855,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
 
   XXH_ASSERT(input != NULL);
   XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(4 <= len && len < 8);
+  XXH_ASSERT(4 <= len && len <= 8);
   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
   {
 
@@ -3239,7 +3878,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
 
   XXH_ASSERT(input != NULL);
   XXH_ASSERT(secret != NULL);
-  XXH_ASSERT(8 <= len && len <= 16);
+  XXH_ASSERT(9 <= len && len <= 16);
   {
 
     xxh_u64 const bitflip1 =
@@ -3306,11 +3945,10 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
                                      const xxh_u8 *XXH_RESTRICT secret,
                                      xxh_u64                    seed64) {
 
-    #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
-        && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
-        &&                                                             \
-        !defined(                                                      \
-            XXH_ENABLE_AUTOVECTORIZE)  /* Define to disable like XXH32 hack */
+      #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */      \
+          && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */          \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)     /* Define to disable like \
+                                                       XXH32 hack */
   /*
    * UGLY HACK:
    * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
@@ -3326,8 +3964,8 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
    * GCC generates much better scalar code than Clang for the rest of XXH3,
    * which is why finding a more optimal codepath is of interest.
    */
-  __asm__("" : "+r"(seed64));
-    #endif
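+  /* XXH_COMPILER_GUARD is presumed to expand to an empty inline-asm
+   * statement with a "+r" constraint (like the removed __asm__ above),
+   * pinning seed64 to a register so the autovectorizer cannot look through
+   * it. */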
+  XXH_COMPILER_GUARD(seed64);
+      #endif
   {
 
     xxh_u64 const input_lo = XXH_readLE64(input);
@@ -3381,7 +4019,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
 
 }
 
-    #define XXH3_MIDSIZE_MAX 240
+      #define XXH3_MIDSIZE_MAX 240
 
 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
     const xxh_u8 *XXH_RESTRICT input, size_t len,
@@ -3391,8 +4029,8 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
   (void)secretSize;
   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
 
-    #define XXH3_MIDSIZE_STARTOFFSET 3
-    #define XXH3_MIDSIZE_LASTOFFSET 17
+      #define XXH3_MIDSIZE_STARTOFFSET 3
+      #define XXH3_MIDSIZE_LASTOFFSET 17
 
   {
 
@@ -3407,31 +4045,31 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
 
     acc = XXH3_avalanche(acc);
     XXH_ASSERT(nbRounds >= 8);
-    #if defined(__clang__)                                /* Clang */ \
-        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
-      /*
-       * UGLY HACK:
-       * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
-       * In everywhere else, it uses scalar code.
-       *
-       * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
-       * would still be slower than UMAAL (see XXH_mult64to128).
-       *
-       * Unfortunately, Clang doesn't handle the long multiplies properly and
-       * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
-       * scalarized into an ugly mess of VMOV.32 instructions.
-       *
-       * This mess is difficult to avoid without turning autovectorization
-       * off completely, but they are usually relatively minor and/or not
-       * worth it to fix.
-       *
-       * This loop is the easiest to fix, as unlike XXH32, this pragma
-       * _actually works_ because it is a loop vectorization instead of an
-       * SLP vectorization.
-       */
-      #pragma clang loop vectorize(disable)
-    #endif
+      #if defined(__clang__)                                /* Clang */ \
+          && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but such cases are usually relatively minor and/or
+         * not worth fixing.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+      #endif
     for (i = 8; i < nbRounds; i++) {
 
       acc +=
@@ -3450,17 +4088,17 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
 
 }
 
-  /* =======     Long Keys     ======= */
+    /* =======     Long Keys     ======= */
 
-    #define XXH_STRIPE_LEN 64
-    #define XXH_SECRET_CONSUME_RATE \
-      8                 /* nb of secret bytes consumed at each accumulation */
-    #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+      #define XXH_STRIPE_LEN 64
+      #define XXH_SECRET_CONSUME_RATE \
+        8               /* nb of secret bytes consumed at each accumulation */
+      #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
 
-    #ifdef XXH_OLD_NAMES
-      #define STRIPE_LEN XXH_STRIPE_LEN
-      #define ACC_NB XXH_ACC_NB
-    #endif
+      #ifdef XXH_OLD_NAMES
+        #define STRIPE_LEN XXH_STRIPE_LEN
+        #define ACC_NB XXH_ACC_NB
+      #endif
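+
+      /*
+       * Quick arithmetic for the constants above: one stripe is 64 bytes,
+       * i.e. XXH_ACC_NB == 64 / sizeof(xxh_u64) == 8 accumulator lanes, and
+       * the secret pointer advances 8 bytes for every stripe consumed.
+       */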
 
 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
 
@@ -3469,56 +4107,58 @@ XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
 
 }
 
-    /* Several intrinsic functions below are supposed to accept __int64 as
-     * argument, as documented in
-     * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
-     * several environments do not define __int64 type, requiring a workaround.
-     */
-    #if !defined(__VMS) &&                                     \
-        (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
-                                  (__STDC_VERSION__ >= 199901L) /* C99 */))
+      /* Several intrinsic functions below are supposed to accept __int64 as
+       * argument, as documented in
+       * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+       * However, several environments do not define __int64 type,
+       * requiring a workaround.
+       */
+      #if !defined(__VMS) &&                                     \
+          (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
+                                    (__STDC_VERSION__ >= 199901L) /* C99 */))
 typedef int64_t xxh_i64;
-    #else
+      #else
 /* the following type must have a width of 64 bits */
 typedef long long xxh_i64;
-    #endif
+      #endif
 
-  /*
-   * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
-   * most optimized.
-   *
-   * It is a hardened version of UMAC, based off of FARSH's implementation.
-   *
-   * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
-   * implementations, and it is ridiculously fast.
-   *
-   * We harden it by mixing the original input to the accumulators as well as
-   * the product.
-   *
-   * This means that in the (relatively likely) case of a multiply by zero, the
-   * original input is preserved.
-   *
-   * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
-   * cross-pollination, as otherwise the upper and lower halves would be
-   * essentially independent.
-   *
-   * This doesn't matter on 64-bit hashes since they all get merged together in
-   * the end, so we skip the extra step.
-   *
-   * Both XXH3_64bits and XXH3_128bits use this subroutine.
-   */
+    /*
+     * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
+     * most optimized.
+     *
+     * It is a hardened version of UMAC, based off of FARSH's implementation.
+     *
+     * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+     * implementations, and it is ridiculously fast.
+     *
+     * We harden it by mixing the original input to the accumulators as well as
+     * the product.
+     *
+     * This means that in the (relatively likely) case of a multiply by zero,
+     * the original input is preserved.
+     *
+     * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+     * cross-pollination, as otherwise the upper and lower halves would be
+     * essentially independent.
+     *
+     * This doesn't matter on 64-bit hashes since they all get merged together
+     * in the end, so we skip the extra step.
+     *
+     * Both XXH3_64bits and XXH3_128bits use this subroutine.
+     */
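+
+    /*
+     * As a concrete sketch of one lane (mirroring the portable scalar
+     * implementation further below; i indexes one of the 8 lanes):
+     *
+     *   data_val   = XXH_readLE64(input  + 8 * i);
+     *   data_key   = data_val ^ XXH_readLE64(secret + 8 * i);
+     *   acc[i ^ 1] += data_val;                      // swap adjacent lanes
+     *   acc[i]     += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+     */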
 
-    #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
+      #if (XXH_VECTOR == XXH_AVX512) || \
+          (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
 
-      #ifndef XXH_TARGET_AVX512
-        #define XXH_TARGET_AVX512               /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_AVX512
+          #define XXH_TARGET_AVX512             /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
     const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
+  __m512i *const xacc = (__m512i *)acc;
   XXH_ASSERT((((size_t)acc) & 63) == 0);
   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
 
@@ -3576,8 +4216,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
   {
 
-    XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
-    const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+    __m512i *const xacc = (__m512i *)acc;
+    const __m512i  prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
 
     /* xacc[0] ^= (xacc[0] >> 47) */
     __m512i const acc_vec = *xacc;
@@ -3609,19 +4249,21 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
 
     int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
     __m512i const seed = _mm512_mask_set1_epi64(
-        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
+        _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
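+    /* (0U - seed64) computes the negation in well-defined unsigned
+     * arithmetic, avoiding the signed-overflow UB that -(xxh_i64)seed64
+     * would have when seed64's top bit is set. */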
 
-    XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m512i *const       dest = (__m512i *)customSecret;
-    int                                i;
+    const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret);
+    __m512i *const       dest = (__m512i *)customSecret;
+    int                  i;
+    XXH_ASSERT(((size_t)src & 63) == 0);               /* control alignment */
+    XXH_ASSERT(((size_t)dest & 63) == 0);
     for (i = 0; i < nbRounds; ++i) {
 
       /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void
-       * const*', this will warn "discards ‘const’ qualifier". */
+       * const*', this will warn "discards 'const' qualifier". */
       union {
 
-        XXH_ALIGN(64) const __m512i *cp;
-        XXH_ALIGN(64) void *p;
+        const __m512i *cp;
+        void *         p;
 
       } remote_const_void;
 
@@ -3635,13 +4277,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
+      #if (XXH_VECTOR == XXH_AVX2) || \
+          (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
 
-      #ifndef XXH_TARGET_AVX2
-        #define XXH_TARGET_AVX2                 /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_AVX2
+          #define XXH_TARGET_AVX2               /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3650,7 +4293,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
   XXH_ASSERT((((size_t)acc) & 31) == 0);
   {
 
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    __m256i *const xacc = (__m256i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason.
      */
@@ -3692,7 +4335,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
   XXH_ASSERT((((size_t)acc) & 31) == 0);
   {
 
-    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+    __m256i *const xacc = (__m256i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
     const __m256i *const xsecret = (const __m256i *)secret;
@@ -3732,24 +4375,23 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
   XXH_PREFETCH(customSecret);
   {
 
-    __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
-                                           -(xxh_i64)seed64, (xxh_i64)seed64);
+    __m256i const seed =
+        _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64,
+                          (xxh_i64)(0U - seed64), (xxh_i64)seed64);
 
-    XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
-    XXH_ALIGN(64) __m256i *            dest = (__m256i *)customSecret;
+    const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret);
+    __m256i *            dest = (__m256i *)customSecret;
 
-      #if defined(__GNUC__) || defined(__clang__)
+        #if defined(__GNUC__) || defined(__clang__)
     /*
      * On GCC & Clang, marking 'dest' as modified will cause the compiler:
      *   - do not extract the secret from sse registers in the internal loop
      *   - use less common registers, and avoid pushing these reg into stack
-     * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
-     * customSecret, and on aarch64, this prevented LDP from merging two
-     * loads together for free. Putting the loads together before the stores
-     * properly generates LDP.
      */
-    __asm__("" : "+r"(dest));
-      #endif
+    XXH_COMPILER_GUARD(dest);
+        #endif
+    XXH_ASSERT(((size_t)src & 31) == 0);               /* control alignment */
+    XXH_ASSERT(((size_t)dest & 31) == 0);
 
     /* GCC -O2 need unroll loop manually */
     dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
@@ -3763,13 +4405,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+      /* x86dispatch always generates SSE2 */
+      #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
 
-      #ifndef XXH_TARGET_SSE2
-        #define XXH_TARGET_SSE2                 /* disable attribute target */
-      #endif
+        #ifndef XXH_TARGET_SSE2
+          #define XXH_TARGET_SSE2               /* disable attribute target */
+        #endif
 
 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3779,7 +4422,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    __m128i *const xacc = (__m128i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
     const __m128i *const xinput = (const __m128i *)input;
@@ -3820,7 +4463,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+    __m128i *const xacc = (__m128i *)acc;
     /* Unaligned. This is mainly for pointer arithmetic, and because
      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
     const __m128i *const xsecret = (const __m128i *)secret;
@@ -3859,30 +4502,34 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
 
     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
 
-      #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-    // MSVC 32bit mode does not support _mm_set_epi64x before 2015
+        #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+    /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
     XXH_ALIGN(16)
-    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
+    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
     __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
-      #else
-    __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
-      #endif
+        #else
+    __m128i const seed =
+        _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+        #endif
     int i;
 
-    XXH_ALIGN(64) const float *const  src = (float const *)XXH3_kSecret;
-    XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
-      #if defined(__GNUC__) || defined(__clang__)
+    const void *const src16 = XXH3_kSecret;
+    __m128i *         dst16 = (__m128i *)customSecret;
+        #if defined(__GNUC__) || defined(__clang__)
     /*
      * On GCC & Clang, marking 'dest' as modified will cause the compiler:
      *   - do not extract the secret from sse registers in the internal loop
      *   - use less common registers, and avoid pushing these reg into stack
      */
-    __asm__("" : "+r"(dest));
-      #endif
+    XXH_COMPILER_GUARD(dst16);
+        #endif
+    XXH_ASSERT(((size_t)src16 & 15) == 0);             /* control alignment */
+    XXH_ASSERT(((size_t)dst16 & 15) == 0);
 
     for (i = 0; i < nbRounds; ++i) {
 
-      dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
+      dst16[i] =
+          _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed);
 
     }
 
@@ -3890,9 +4537,9 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_NEON)
+      #if (XXH_VECTOR == XXH_NEON)
 
 XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3901,7 +4548,7 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
   XXH_ASSERT((((size_t)acc) & 15) == 0);
   {
 
-    XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
+    uint64x2_t *const xacc = (uint64x2_t *)acc;
     /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
      */
     uint8_t const *const xinput = (const uint8_t *)input;
@@ -3996,9 +4643,9 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
 
 }
 
-    #endif
+      #endif
 
-    #if (XXH_VECTOR == XXH_VSX)
+      #if (XXH_VECTOR == XXH_VSX)
 
 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
                                               const void *XXH_RESTRICT input,
@@ -4025,12 +4672,12 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
     xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
     xacc[i] += product;
 
-        /* swap high and low halves */
-      #ifdef __s390x__
+          /* swap high and low halves */
+        #ifdef __s390x__
     xacc[i] += vec_permi(data_vec, data_vec, 2);
-      #else
+        #else
     xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
-      #endif
+        #endif
 
   }
 
@@ -4075,7 +4722,7 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
 
 }
 
-    #endif
+      #endif
 
 /* scalar variants - universal */
 
@@ -4083,7 +4730,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
     const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(XXH_ACC_ALIGN)
   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
   const xxh_u8 *const xinput =
       (const xxh_u8 *)input;                    /* no alignment restriction */
@@ -4105,7 +4751,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
                                               const void *XXH_RESTRICT secret) {
 
-  XXH_ALIGN(XXH_ACC_ALIGN)
   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
   const xxh_u8 *const xsecret =
       (const xxh_u8 *)secret;                   /* no alignment restriction */
@@ -4135,7 +4780,7 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
   const xxh_u8 *kSecretPtr = XXH3_kSecret;
   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
 
-    #if defined(__clang__) && defined(__aarch64__)
+      #if defined(__clang__) && defined(__aarch64__)
   /*
    * UGLY HACK:
    * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
@@ -4164,8 +4809,8 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
    *   without hack: 2654.4 MB/s
    *   with hack:    3202.9 MB/s
    */
-  __asm__("" : "+r"(kSecretPtr));
-    #endif
+  XXH_COMPILER_GUARD(kSecretPtr);
+      #endif
   /*
    * Note: in debug mode, this overrides the asm optimization
    * and Clang will emit MOVK chains again.
@@ -4200,55 +4845,55 @@ typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
 
-    #if (XXH_VECTOR == XXH_AVX512)
+      #if (XXH_VECTOR == XXH_AVX512)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+        #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
 
-    #elif (XXH_VECTOR == XXH_AVX2)
+      #elif (XXH_VECTOR == XXH_AVX2)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+        #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
 
-    #elif (XXH_VECTOR == XXH_SSE2)
+      #elif (XXH_VECTOR == XXH_SSE2)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+        #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
 
-    #elif (XXH_VECTOR == XXH_NEON)
+      #elif (XXH_VECTOR == XXH_NEON)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_neon
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #elif (XXH_VECTOR == XXH_VSX)
+      #elif (XXH_VECTOR == XXH_VSX)
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #else                                                         /* scalar */
+      #else                                                       /* scalar */
 
-      #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
-      #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
-      #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+        #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+        #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+        #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-    #endif
+      #endif
 
-    #ifndef XXH_PREFETCH_DIST
-      #ifdef __clang__
-        #define XXH_PREFETCH_DIST 320
-      #else
-        #if (XXH_VECTOR == XXH_AVX512)
-          #define XXH_PREFETCH_DIST 512
+      #ifndef XXH_PREFETCH_DIST
+        #ifdef __clang__
+          #define XXH_PREFETCH_DIST 320
         #else
-          #define XXH_PREFETCH_DIST 384
-        #endif
-      #endif                                                   /* __clang__ */
-    #endif                                             /* XXH_PREFETCH_DIST */
+          #if (XXH_VECTOR == XXH_AVX512)
+            #define XXH_PREFETCH_DIST 512
+          #else
+            #define XXH_PREFETCH_DIST 384
+          #endif
+        #endif                                                 /* __clang__ */
+      #endif                                           /* XXH_PREFETCH_DIST */
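
The block above is xxhash's compile-time backend dispatch: whichever value XXH_VECTOR resolves to, the generic names XXH3_accumulate_512, XXH3_scrambleAcc and XXH3_initCustomSecret are bound to that implementation (note that NEON and VSX fall back to the scalar secret initializer). XXH_VECTOR is a documented user override, so a specific path can be forced at build time; a minimal sketch, assuming single-header usage via XXH_INLINE_ALL:

    /* force the SSE2 path instead of autodetection; XXH_VECTOR is only
       compared after the header defines XXH_SSE2, so the symbolic name
       expands correctly here */
    #define XXH_VECTOR XXH_SSE2
    #define XXH_INLINE_ALL
    #include "xxhash.h"

With XXH_INLINE_ALL the whole implementation is compiled into the including translation unit, so the selected path is fully visible to the optimizer.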
 
 /*
  * XXH3_accumulate()
@@ -4308,8 +4953,9 @@ XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
     {
 
       const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
-    #define XXH_SECRET_LASTACC_START \
-      7  /* not aligned on 8, last secret is different from acc & scrambler */
+      #define XXH_SECRET_LASTACC_START                                       \
+        7 /* not aligned on 8, last secret is different from acc & scrambler \
+           */
       f_acc512(acc, p,
                secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
 
@@ -4337,10 +4983,10 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
   for (i = 0; i < 4; i++) {
 
     result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
-    #if defined(__clang__)                                /* Clang */ \
-        && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
-        && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-        && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
+      #if defined(__clang__)                                /* Clang */ \
+          && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+          && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+          && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
     /*
      * UGLY HACK:
      * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
@@ -4349,8 +4995,8 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
      *   without hack: 2063.7 MB/s
      *   with hack:    2560.7 MB/s
      */
-    __asm__("" : "+r"(result64));
-    #endif
+    XXH_COMPILER_GUARD(result64);
+      #endif
 
   }
 
@@ -4358,13 +5004,13 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
 
 }
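
XXH_COMPILER_GUARD, which replaces the raw __asm__ statement in this hunk, is the classic empty-asm optimization barrier: the "+r" constraint makes the value an opaque input and output of a no-op asm, so the compiler can no longer track it well enough to autovectorize the loop. A standalone GCC/Clang-only sketch of the idiom (the names below are illustrative, not xxhash API):

    #include <stddef.h>
    #include <stdint.h>

    /* no-op asm that reads and writes v in a register; defeats value
       tracking and, with it, autovectorization of the enclosing loop */
    #define COMPILER_GUARD(v) __asm__("" : "+r"(v))

    uint64_t sum_scalar(const uint64_t *p, size_t n) {
      uint64_t acc = 0;
      for (size_t i = 0; i < n; i++) {
        acc += p[i];
        COMPILER_GUARD(acc);                 /* keep this loop scalar */
      }
      return acc;
    }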
 
-    #define XXH3_INIT_ACC                                              \
-      {                                                                \
-                                                                       \
-        XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
-            XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
-                                                                       \
-      }
+      #define XXH3_INIT_ACC                                              \
+        {                                                                \
+                                                                         \
+          XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
+              XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
+                                                                         \
+        }
 
 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
     const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
@@ -4379,9 +5025,9 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
 
   /* converge into final hash */
   XXH_STATIC_ASSERT(sizeof(acc) == 64);
-    /* do not align on 8, so that the secret is different from the accumulator
-     */
-    #define XXH_SECRET_MERGEACCS_START 11
+      /* do not align on 8, so that the secret is different from the accumulator
+       */
+      #define XXH_SECRET_MERGEACCS_START 11
   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
   return XXH3_mergeAccs(acc,
                         (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
@@ -4501,6 +5147,7 @@ XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
 
 /* ===   Public entry point   === */
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
 
   return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -4508,6 +5155,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
 
 }
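
A short usage sketch of the one-shot entry point, assuming the single-header XXH_INLINE_ALL mode this file supports:

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      const char msg[] = "hello xxh3";
      XXH64_hash_t h = XXH3_64bits(msg, strlen(msg));
      printf("%016llx\n", (unsigned long long)h);
      return 0;
    }

The default secret and a zero seed are implied; the _withSecret and _withSeed variants that follow take the obvious extra arguments.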
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
                                                    size_t      len,
                                                    const void *secret,
@@ -4518,6 +5166,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
                                                  XXH64_hash_t seed) {
 
@@ -4603,6 +5252,7 @@ static void XXH_alignedFree(void *p) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
 
   XXH3_state_t *const state =
@@ -4613,6 +5263,7 @@ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
 
   XXH_alignedFree(statePtr);
@@ -4620,6 +5271,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
                                    const XXH3_state_t *src_state) {
 
@@ -4627,9 +5279,8 @@ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
 
 }
 
-static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
-                                       XXH64_hash_t seed, const void *secret,
-                                       size_t secretSize) {
+static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed,
+                                const void *secret, size_t secretSize) {
 
   size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
   size_t const initLength =
@@ -4654,26 +5305,28 @@ static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                             XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
+  XXH3_reset_internal(statePtr, 0, secret, secretSize);
   if (secret == NULL) return XXH_ERROR;
   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
                                                         XXH64_hash_t  seed) {
 
@@ -4681,7 +5334,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
   if (seed == 0) return XXH3_64bits_reset(statePtr);
   if (seed != statePtr->seed)
     XXH3_initCustomSecret(statePtr->customSecret, seed);
-  XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
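
Note the `seed != statePtr->seed` check above: the custom secret is only re-derived when the seed changes, so reusing one state to hash many messages under the same seed is cheap. A sketch of that pattern (hash_many is a hypothetical helper, not part of the API):

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    void hash_many(const void *const *msgs, const size_t *lens, size_t n,
                   XXH64_hash_t seed, XXH64_hash_t *out) {
      XXH3_state_t *st = XXH3_createState();
      if (st == NULL) return;
      for (size_t i = 0; i < n; i++) {
        /* the custom-secret derivation runs on the first pass only */
        if (XXH3_64bits_reset_withSeed(st, seed) != XXH_OK) break;
        XXH3_64bits_update(st, msgs[i], lens[i]);
        out[i] = XXH3_64bits_digest(st);
      }
      XXH3_freeState(st);
    }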
@@ -4733,12 +5386,12 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
                                            XXH3_f_scrambleAcc    f_scramble) {
 
   if (input == NULL)
-    #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
-        (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
+      #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
+          (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
     return XXH_OK;
-    #else
+      #else
     return XXH_ERROR;
-    #endif
+      #endif
 
   {
 
@@ -4747,6 +5400,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
         (state->extSecret == NULL) ? state->customSecret : state->extSecret;
 
     state->totalLen += len;
+    XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
 
     if (state->bufferedSize + len <=
         XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
@@ -4756,10 +5410,10 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
 
     }
 
-      /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
 
-    #define XXH3_INTERNALBUFFER_STRIPES \
-      (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+      #define XXH3_INTERNALBUFFER_STRIPES \
+        (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
     XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
                       0);                                 /* clean multiple */
 
@@ -4783,7 +5437,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
     XXH_ASSERT(input < bEnd);
 
     /* Consume input by a multiple of internal buffer size */
-    if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) {
+    if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
 
       const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
       do {
@@ -4814,6 +5468,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
 
 }
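
The rewritten bounds check above (`bEnd - input > XXH3_INTERNALBUFFER_SIZE` instead of `input + XXH3_INTERNALBUFFER_SIZE < bEnd`) compares remaining distance rather than forming a pointer that may land past the end of the input, which is undefined behavior in C; presumably that is the motivation, since the two forms are arithmetically equivalent here. The pattern in isolation:

    #include <stddef.h>

    /* comparing the remaining distance never materializes an
       out-of-bounds pointer, unlike `p + chunk < end` */
    static int more_than_chunk_left(const unsigned char *p,
                                    const unsigned char *end,
                                    size_t chunk) {
      return p <= end && (size_t)(end - p) > chunk;
    }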
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
                                                 const void *input, size_t len) {
 
@@ -4859,6 +5514,7 @@ XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *       acc,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
 
   const unsigned char *const secret =
@@ -4881,8 +5537,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
 
 }
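
Together, createState/reset/update/digest/freeState form the streaming lifecycle, and the digest of a chunked feed matches the one-shot result. A minimal sketch (stream_hash and its chunk parameter are illustrative; chunk must be nonzero):

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    XXH64_hash_t stream_hash(const void *data, size_t len, size_t chunk) {
      const unsigned char *p = (const unsigned char *)data;
      XXH3_state_t *st = XXH3_createState();
      XXH64_hash_t h = 0;
      if (st == NULL) return 0;
      if (XXH3_64bits_reset(st) == XXH_OK) {
        while (len > 0) {                        /* feed in pieces */
          size_t n = len < chunk ? len : chunk;
          XXH3_64bits_update(st, p, n);
          p += n;
          len -= n;
        }
        h = XXH3_64bits_digest(st);
      }
      XXH3_freeState(st);
      return h;
    }

digest does not consume the state, so further updates may follow it; only freeState ends the lifecycle.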
 
-    #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+      #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
                                         const void *customSeed,
                                         size_t      customSeedSize) {
@@ -5398,6 +6055,7 @@ XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
 
 /* ===   Public XXH128 API   === */
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
 
   return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
@@ -5406,6 +6064,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
                                                      size_t      len,
                                                      const void *secret,
@@ -5416,6 +6075,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
                                                    size_t       len,
                                                    XXH64_hash_t seed) {
@@ -5426,6 +6086,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
 
 }
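
The 128-bit one-shot calls mirror the 64-bit family; the result struct carries both halves explicitly as .low64 and .high64. A brief sketch (hash128_cstr is a hypothetical wrapper):

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    #include <string.h>

    XXH128_hash_t hash128_cstr(const char *s, XXH64_hash_t seed) {
      /* read the result via .low64 / .high64 */
      return XXH3_128bits_withSeed(s, strlen(s), seed);
    }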
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
                                     XXH64_hash_t seed) {
 
@@ -5437,37 +6098,31 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
 
 /*
  * All the functions are actually the same as for 64-bit streaming variant.
- * The only difference is the finalizatiom routine.
+ * The only difference is the finalization routine.
  */
 
-static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
-                                        XXH64_hash_t seed, const void *secret,
-                                        size_t secretSize) {
-
-  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
-
-}
-
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
-                              XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
 
   if (statePtr == NULL) return XXH_ERROR;
-  XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
+  XXH3_reset_internal(statePtr, 0, secret, secretSize);
   if (secret == NULL) return XXH_ERROR;
   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
                                                          XXH64_hash_t  seed) {
 
@@ -5475,11 +6130,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
   if (seed == 0) return XXH3_128bits_reset(statePtr);
   if (seed != statePtr->seed)
     XXH3_initCustomSecret(statePtr->customSecret, seed);
-  XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
   return XXH_OK;
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
                                                  const void *  input,
                                                  size_t        len) {
@@ -5489,6 +6145,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
 
   const unsigned char *const secret =
@@ -5524,11 +6181,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
 
 }
 
-  /* 128-bit utility functions */
+    /* 128-bit utility functions */
 
-    #include <string.h>                                   /* memcmp, memcpy */
+      #include <string.h>                                 /* memcmp, memcpy */
 
 /* return : 1 if equal, 0 if different */
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
 
   /* note : XXH128_hash_t is compact, it has no padding byte */
@@ -5540,6 +6198,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
  * return : >0 if *h128_1  > *h128_2
  *          <0 if *h128_1  < *h128_2
  *          =0 if *h128_1 == *h128_2  */
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
 
   XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
@@ -5552,6 +6211,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
 }
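
XXH128_cmp takes const void * precisely so it matches the qsort/bsearch comparator signature; a short sketch (sort_hashes is illustrative):

    #include <stdlib.h>

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    void sort_hashes(XXH128_hash_t *v, size_t n) {
      /* yields a deterministic total order over the 128-bit hashes */
      qsort(v, n, sizeof v[0], XXH128_cmp);
    }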
 
 /*======   Canonical representation   ======*/
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
                                              XXH128_hash_t       hash) {
 
@@ -5568,6 +6228,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
 
 }
 
+/*! @ingroup xxh3_family */
 XXH_PUBLIC_API XXH128_hash_t
 XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
 
@@ -5578,16 +6239,21 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
 
 }
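
The canonical form is the endian-stable byte layout intended for storage or transmission, and writing then re-reading it is lossless. A small round-trip sketch (the function name is illustrative):

    #define XXH_INLINE_ALL
    #include "xxhash.h"

    int canonical_roundtrip_ok(XXH128_hash_t h) {
      XXH128_canonical_t canon;
      XXH128_hash_t      back;
      XXH128_canonicalFromHash(&canon, h);      /* serialize, big-endian */
      back = XXH128_hashFromCanonical(&canon);  /* deserialize */
      return XXH128_isEqual(h, back);           /* expect 1 */
    }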
 
-    /* Pop our optimization override from above */
-    #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
-        && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
-        && defined(__OPTIMIZE__) &&                                      \
-        !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
-      #pragma GCC pop_options
-    #endif
+      /* Pop our optimization override from above */
+      #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
+          && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+          && defined(__OPTIMIZE__) &&                                      \
+          !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
+        #pragma GCC pop_options
+      #endif
 
-  #endif                                                /* XXH_NO_LONG_LONG */
+    #endif                                              /* XXH_NO_LONG_LONG */
+
+  #endif                                                     /* XXH_NO_XXH3 */
 
+/*!
+ * @}
+ */
 #endif                                                /* XXH_IMPLEMENTATION */
 
 #if defined(__cplusplus)