diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/afl-as.h | 4 | ||||
-rw-r--r-- | include/afl-fuzz.h | 4 | ||||
-rw-r--r-- | include/afl-prealloc.h | 2 | ||||
-rw-r--r-- | include/alloc-inl.h | 2 | ||||
-rw-r--r-- | include/cmplog.h | 15 | ||||
-rw-r--r-- | include/common.h | 4 | ||||
-rw-r--r-- | include/config.h | 10 | ||||
-rw-r--r-- | include/debug.h | 2 | ||||
-rw-r--r-- | include/envs.h | 6 | ||||
-rw-r--r-- | include/forkserver.h | 4 | ||||
-rw-r--r-- | include/hash.h | 2 | ||||
-rw-r--r-- | include/list.h | 2 | ||||
-rw-r--r-- | include/sharedmem.h | 2 | ||||
-rw-r--r-- | include/snapshot-inl.h | 2 | ||||
-rw-r--r-- | include/types.h | 5 | ||||
-rw-r--r-- | include/xxhash.h | 3084 |
16 files changed, 1916 insertions, 1234 deletions
diff --git a/include/afl-as.h b/include/afl-as.h index 3c12c68f..2a2e8ad7 100644 --- a/include/afl-as.h +++ b/include/afl-as.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 This file houses the assembly-level instrumentation injected into fuzzed programs. The instrumentation stores XORed pairs of data: identifiers of the @@ -396,7 +396,7 @@ static const u8 *main_payload_32 = "\n"; /* The OpenBSD hack is due to lahf and sahf not being recognized by some - versions of binutils: http://marc.info/?l=openbsd-cvs&m=141636589924400 + versions of binutils: https://marc.info/?l=openbsd-cvs&m=141636589924400 The Apple code is a bit different when calling libc functions because they are doing relocations differently from everybody else. We also need diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h index 4b19e698..f3d6d99d 100644 --- a/include/afl-fuzz.h +++ b/include/afl-fuzz.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 This is the real deal: the program takes an instrumented binary and attempts a variety of basic fuzzing tricks, paying close attention to @@ -1130,12 +1130,12 @@ void get_core_count(afl_state_t *); void fix_up_sync(afl_state_t *); void check_asan_opts(afl_state_t *); void check_binary(afl_state_t *, u8 *); -void fix_up_banner(afl_state_t *, u8 *); void check_if_tty(afl_state_t *); void setup_signal_handlers(void); void save_cmdline(afl_state_t *, u32, char **); void read_foreign_testcases(afl_state_t *, int); void write_crash_readme(afl_state_t *afl); +u8 check_if_text_buf(u8 *buf, u32 len); /* CmpLog */ diff --git a/include/afl-prealloc.h b/include/afl-prealloc.h index fa6c9b70..87bbb1cc 100644 --- a/include/afl-prealloc.h +++ b/include/afl-prealloc.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ diff --git a/include/alloc-inl.h b/include/alloc-inl.h index c914da5f..0c540330 100644 --- a/include/alloc-inl.h +++ b/include/alloc-inl.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 This allocator is not designed to resist malicious attackers (the canaries are small and predictable), but provides a robust and portable way to detect diff --git a/include/cmplog.h b/include/cmplog.h index 878ed60c..8778a4b6 100644 --- a/include/cmplog.h +++ b/include/cmplog.h @@ -18,7 +18,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Shared code to handle the shared memory. This is used by the fuzzer as well the other components like afl-tmin, afl-showmap, etc... @@ -48,7 +48,8 @@ struct cmp_header { unsigned shape : 5; unsigned type : 2; unsigned attribute : 4; - unsigned reserved : 5; + unsigned overflow : 1; + unsigned reserved : 4; } __attribute__((packed)); @@ -59,14 +60,16 @@ struct cmp_operands { u64 v0_128; u64 v1_128; -}; +} __attribute__((packed)); struct cmpfn_operands { - u8 v0[32]; - u8 v1[32]; + u8 v0[31]; + u8 v0_len; + u8 v1[31]; + u8 v1_len; -}; +} __attribute__((packed)); typedef struct cmp_operands cmp_map_list[CMP_MAP_H]; diff --git a/include/common.h b/include/common.h index 7bba9e91..6c8e3b3a 100644 --- a/include/common.h +++ b/include/common.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Gather some functions common to multiple executables @@ -38,6 +38,7 @@ #define STRINGIFY_VAL_SIZE_MAX (16) +u32 check_binary_signatures(u8 *fn); void detect_file_args(char **argv, u8 *prog_in, bool *use_stdin); void print_suggested_envs(char *mispelled_env); void check_environment_vars(char **env); @@ -45,6 +46,7 @@ void check_environment_vars(char **env); char **argv_cpy_dup(int argc, char **argv); void argv_cpy_free(char **argv); +char **get_cs_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv); char **get_qemu_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv); char **get_wine_argv(u8 *own_loc, u8 **target_path_p, int argc, char **argv); char * get_afl_env(char *env); diff --git a/include/config.h b/include/config.h index da74989e..b787152f 100644 --- a/include/config.h +++ b/include/config.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ @@ -237,11 +237,11 @@ (note that if this value is changed, several areas in afl-cc.c, afl-fuzz.c and afl-fuzz-state.c have to be changed as well! */ -#define MAX_FILE (1 * 1024 * 1024U) +#define MAX_FILE (1 * 1024 * 1024L) /* The same, for the test case minimizer: */ -#define TMIN_MAX_FILE (10 * 1024 * 1024) +#define TMIN_MAX_FILE (10 * 1024 * 1024L) /* Block normalization steps for afl-tmin: */ @@ -267,8 +267,8 @@ (first value), and to keep in memory as candidates. The latter should be much higher than the former. */ -#define USE_AUTO_EXTRAS 128 -#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 64) +#define USE_AUTO_EXTRAS 4096 +#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8) /* Scaling factor for the effector map used to skip some of the more expensive deterministic steps. The actual divisor is set to diff --git a/include/debug.h b/include/debug.h index f8df5711..feb7f52d 100644 --- a/include/debug.h +++ b/include/debug.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ diff --git a/include/envs.h b/include/envs.h index e3957147..a3ba5e88 100644 --- a/include/envs.h +++ b/include/envs.h @@ -54,10 +54,12 @@ static char *afl_environment_variables[] = { "AFL_FAST_CAL", "AFL_FORCE_UI", "AFL_FRIDA_DEBUG_MAPS", + "AFL_FRIDA_DRIVER_NO_HOOK", "AFL_FRIDA_EXCLUDE_RANGES", "AFL_FRIDA_INST_COVERAGE_FILE", "AFL_FRIDA_INST_DEBUG_FILE", "AFL_FRIDA_INST_JIT", + "AFL_FRIDA_INST_NO_BACKPATCH", "AFL_FRIDA_INST_NO_OPTIMIZE", "AFL_FRIDA_INST_NO_PREFETCH", "AFL_FRIDA_INST_NO_PREFETCH_BACKPATCH", @@ -74,8 +76,11 @@ static char *afl_environment_variables[] = { "AFL_FRIDA_PERSISTENT_DEBUG", "AFL_FRIDA_PERSISTENT_HOOK", "AFL_FRIDA_PERSISTENT_RET", + "AFL_FRIDA_STALKER_IC_ENTRIES", + "AFL_FRIDA_STALKER_ADJACENT_BLOCKS", "AFL_FRIDA_STATS_FILE", "AFL_FRIDA_STATS_INTERVAL", + "AFL_FRIDA_TRACEABLE", "AFL_FUZZER_ARGS", // oss-fuzz "AFL_GDB", "AFL_GCC_ALLOWLIST", @@ -202,6 +207,7 @@ static char *afl_environment_variables[] = { "AFL_USE_MSAN", "AFL_USE_TRACE_PC", "AFL_USE_UBSAN", + "AFL_USE_TSAN", "AFL_USE_CFISAN", "AFL_USE_LSAN", "AFL_WINE_PATH", diff --git a/include/forkserver.h b/include/forkserver.h index c6f7de00..464f208d 100644 --- a/include/forkserver.h +++ b/include/forkserver.h @@ -18,7 +18,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Shared code that implements a forkserver. This is used by the fuzzer as well the other components like afl-tmin. @@ -82,6 +82,8 @@ typedef struct afl_forkserver { bool frida_asan; /* if running with asan in frida mode */ + bool cs_mode; /* if running in CoreSight mode or not */ + bool use_stdin; /* use stdin for sending data */ bool no_unlink; /* do not unlink cur_input */ diff --git a/include/hash.h b/include/hash.h index 9319ab95..9bb34ff8 100644 --- a/include/hash.h +++ b/include/hash.h @@ -21,7 +21,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ diff --git a/include/list.h b/include/list.h index 7ec81cbe..d49e56da 100644 --- a/include/list.h +++ b/include/list.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 This allocator is not designed to resist malicious attackers (the canaries are small and predictable), but provides a robust and portable way to detect diff --git a/include/sharedmem.h b/include/sharedmem.h index fdc947f9..93080d0f 100644 --- a/include/sharedmem.h +++ b/include/sharedmem.h @@ -18,7 +18,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Shared code to handle the shared memory. This is used by the fuzzer as well the other components like afl-tmin, afl-showmap, etc... diff --git a/include/snapshot-inl.h b/include/snapshot-inl.h index a18187ef..7234bbaa 100644 --- a/include/snapshot-inl.h +++ b/include/snapshot-inl.h @@ -18,7 +18,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ diff --git a/include/types.h b/include/types.h index 7b94fb83..bbcc2f81 100644 --- a/include/types.h +++ b/include/types.h @@ -16,7 +16,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at: - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 */ @@ -46,6 +46,8 @@ typedef uint128_t u128; #define FS_ERROR_SHM_OPEN 4 #define FS_ERROR_SHMAT 8 #define FS_ERROR_MMAP 16 +#define FS_ERROR_OLD_CMPLOG 32 +#define FS_ERROR_OLD_CMPLOG_QEMU 64 /* Reporting options */ #define FS_OPT_ENABLED 0x80000001 @@ -53,6 +55,7 @@ typedef uint128_t u128; #define FS_OPT_SNAPSHOT 0x20000000 #define FS_OPT_AUTODICT 0x10000000 #define FS_OPT_SHDMEM_FUZZ 0x01000000 +#define FS_OPT_NEWCMPLOG 0x02000000 #define FS_OPT_OLD_AFLPP_WORKAROUND 0x0f000000 // FS_OPT_MAX_MAPSIZE is 8388608 = 0x800000 = 2^23 = 1 << 22 #define FS_OPT_MAX_MAPSIZE ((0x00fffffeU >> 1) + 1) diff --git a/include/xxhash.h b/include/xxhash.h index 006d3f3d..0ca2b852 100644 --- a/include/xxhash.h +++ b/include/xxhash.h @@ -32,7 +32,12 @@ * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ - +/*! + * @mainpage xxHash + * + * @file xxhash.h + * xxHash prototypes and implementation + */ /* TODO: update */ /* Notice extracted from xxHash homepage: @@ -45,7 +50,7 @@ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo Name Speed Q.Score Author xxHash 5.4 GB/s 10 CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby +MurmurHash 3a 2.7 GB/s 10 Austin Appleby SpookyHash 2.0 GB/s 10 Bob Jenkins SBox 1.4 GB/s 9 Bret Mulvey Lookup3 1.2 GB/s 9 Bob Jenkins @@ -119,29 +124,78 @@ extern "C" { /* * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such - * as part of some previously included *.h header file. + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. * Without further action, the new include would just be ignored, * and functions would effectively _not_ be inlined (silent failure). * The following macros solve this situation by prefixing all inlined names, * avoiding naming collision with previous inclusions. */ - #ifdef XXH_NAMESPACE - #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" - /* - * Note: Alternative: #undef all symbols (it's a pretty large list). - * Without #error: it compiles, but functions are actually not inlined. - */ - #endif +/* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ + #undef XXH_versionNumber +/* XXH32 */ + #undef XXH32 + #undef XXH32_createState + #undef XXH32_freeState + #undef XXH32_reset + #undef XXH32_update + #undef XXH32_digest + #undef XXH32_copyState + #undef XXH32_canonicalFromHash + #undef XXH32_hashFromCanonical +/* XXH64 */ + #undef XXH64 + #undef XXH64_createState + #undef XXH64_freeState + #undef XXH64_reset + #undef XXH64_update + #undef XXH64_digest + #undef XXH64_copyState + #undef XXH64_canonicalFromHash + #undef XXH64_hashFromCanonical +/* XXH3_64bits */ + #undef XXH3_64bits + #undef XXH3_64bits_withSecret + #undef XXH3_64bits_withSeed + #undef XXH3_createState + #undef XXH3_freeState + #undef XXH3_copyState + #undef XXH3_64bits_reset + #undef XXH3_64bits_reset_withSeed + #undef XXH3_64bits_reset_withSecret + #undef XXH3_64bits_update + #undef XXH3_64bits_digest + #undef XXH3_generateSecret +/* XXH3_128bits */ + #undef XXH128 + #undef XXH3_128bits + #undef XXH3_128bits_withSeed + #undef XXH3_128bits_withSecret + #undef XXH3_128bits_reset + #undef XXH3_128bits_reset_withSeed + #undef XXH3_128bits_reset_withSecret + #undef XXH3_128bits_update + #undef XXH3_128bits_digest + #undef XXH128_isEqual + #undef XXH128_cmp + #undef XXH128_canonicalFromHash + #undef XXH128_hashFromCanonical +/* Finally, free the namespace itself */ + #undef XXH_NAMESPACE + +/* employ the namespace for XXH_INLINE_ALL */ #define XXH_NAMESPACE XXH_INLINE_ /* - * Some identifiers (enums, type names) are not symbols, but they must - * still be renamed to avoid redeclaration. + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and is a more dispersed action. - * Meanwhile, renaming can be achieved in a single block + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. */ - #define XXH_IPREF(Id) XXH_INLINE_##Id + #define XXH_IPREF(Id) XXH_NAMESPACE##Id #define XXH_OK XXH_IPREF(XXH_OK) #define XXH_ERROR XXH_IPREF(XXH_ERROR) #define XXH_errorcode XXH_IPREF(XXH_errorcode) @@ -166,6 +220,12 @@ extern "C" { #ifndef XXHASH_H_5627135585666179 #define XXHASH_H_5627135585666179 1 + /*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + + */ /* specific declaration modes for Windows */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) #if defined(WIN32) && defined(_MSC_VER) && \ @@ -180,19 +240,24 @@ extern "C" { #endif #endif - /*! - * XXH_NAMESPACE, aka Namespace Emulation: - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries - * which may also include xxHash, you can use XXH_NAMESPACE to automatically - * prefix any public symbol from xxhash library with the value of - * XXH_NAMESPACE (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. - */ + #ifdef XXH_DOXYGEN + /*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries + * which may also include xxHash, you can use XXH_NAMESPACE to automatically + * prefix any public symbol from xxhash library with the value of + * XXH_NAMESPACE (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically + * translated by this header. + */ + #define XXH_NAMESPACE /* YOUR NAME HERE */ + #undef XXH_NAMESPACE + #endif + #ifdef XXH_NAMESPACE #define XXH_CAT(A, B) A##B #define XXH_NAME2(A, B) XXH_CAT(A, B) @@ -264,10 +329,19 @@ extern "C" { ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 - #define XXH_VERSION_RELEASE 0 + #define XXH_VERSION_RELEASE 1 #define XXH_VERSION_NUMBER \ (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \ XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is only useful when xxHash is compiled as a shared library, as it is + * independent of the version defined in the header. + * + * @return `XXH_VERSION_NUMBER` as of when the libray was compiled. + */ XXH_PUBLIC_API unsigned XXH_versionNumber(void); /* **************************** @@ -279,15 +353,24 @@ typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; /*-********************************************************************** * 32-bit hash ************************************************************************/ - #if !defined(__VMS) && \ + #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + + #elif !defined(__VMS) && \ (defined(__cplusplus) || \ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) #include <stdint.h> -typedef uint32_t XXH32_hash_t; +typedef uint32_t XXH32_hash_t; + #else #include <limits.h> #if UINT_MAX == 0xFFFFFFFFUL -typedef unsigned int XXH32_hash_t; +typedef unsigned int XXH32_hash_t; #else #if ULONG_MAX == 0xFFFFFFFFUL typedef unsigned long XXH32_hash_t; @@ -298,24 +381,52 @@ typedef unsigned long XXH32_hash_t; #endif /*! - * XXH32(): - * Calculate the 32-bit hash of sequence "length" bytes stored at memory - * address "input". The memory between input & input+length must be valid - * (allocated and read-accessible). "seed" can be used to alter the result - * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher - * benchmark): 5.4 GB/s - * - * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. It provides a superior level of - * dispersion, and greatly reduces the risks of collisions. + * @} + * + * @defgroup xxh32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is considered rather weak by today's standards. + * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit + * systems, and offers true 64/128 bit hash results. It provides a superior + * level of dispersion, and greatly reduces the risks of collisions. + * + * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families + * @see @ref xxh32_impl for implementation details + * @{ + + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. */ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length, XXH32_hash_t seed); -/******* Streaming *******/ - -/* - * Streaming functions generate the xxHash value from an incrememtal input. +/*! + * Streaming functions generate the xxHash value from an incremental input. * This method is slower than single-call functions, due to state management. * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. * @@ -336,19 +447,125 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length, * digest, and generate new hash values later on by invoking `XXH*_digest()`. * * When done, release the state using `XXH*_freeState()`. + * + * Example code for incrementally hashing a file: + * @code{.c} + * #include <stdio.h> + * #include <xxhash.h> + * #define BUFFER_SIZE 256 + * + * // Note: XXH64 and XXH3 use the same interface. + * XXH32_hash_t + * hashFile(FILE* stream) + * { + + * XXH32_state_t* state; + * unsigned char buf[BUFFER_SIZE]; + * size_t amt; + * XXH32_hash_t hash; + * + * state = XXH32_createState(); // Create a state + * assert(state != NULL); // Error check here + * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed + * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) { + + * XXH32_update(state, buf, amt); // Hash the file in chunks + * } + * hash = XXH32_digest(state); // Finalize the hash + * XXH32_freeState(state); // Clean up + * return hash; + * } + * @endcode + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. */ +typedef struct XXH32_state_s XXH32_state_t; -typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. + */ XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr); -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state, - const XXH32_state_t *src_state); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref + * XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state, + const XXH32_state_t *src_state); +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr, const void *input, size_t length); -XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); /******* Canonical representation *******/ @@ -373,48 +590,158 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); * canonical format. */ +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ typedef struct { - unsigned char digest[4]; + unsigned char digest[4]; /*!< Hash bytes, big endian */ } XXH32_canonical_t; +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t *src); + #ifdef __has_attribute + #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) + #else + #define XXH_HAS_ATTRIBUTE(x) 0 + #endif + + /* C-language Attributes are added in C23. */ + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \ + defined(__has_c_attribute) + #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) + #else + #define XXH_HAS_C_ATTRIBUTE(x) 0 + #endif + + #if defined(__cplusplus) && defined(__has_cpp_attribute) + #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) + #else + #define XXH_HAS_CPP_ATTRIBUTE(x) 0 + #endif + + /* + Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' + attribute introduced in CPP17 and C23. CPP17 : + https://en.cppreference.com/w/cpp/language/attributes/fallthrough C23 : + https://en.cppreference.com/w/c/language/attributes/fallthrough + */ + #if XXH_HAS_C_ATTRIBUTE(x) + #define XXH_FALLTHROUGH [[fallthrough]] + #elif XXH_HAS_CPP_ATTRIBUTE(x) + #define XXH_FALLTHROUGH [[fallthrough]] + #elif XXH_HAS_ATTRIBUTE(__fallthrough__) + #define XXH_FALLTHROUGH __attribute__((fallthrough)) + #else + #define XXH_FALLTHROUGH + #endif + +/*! + * @} + * @ingroup public + * @{ + + */ + #ifndef XXH_NO_LONG_LONG /*-********************************************************************** * 64-bit hash ************************************************************************/ - #if !defined(__VMS) && \ + #if defined(XXH_DOXYGEN) /* don't include <stdint.h> */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; + #elif !defined(__VMS) && \ (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ (__STDC_VERSION__ >= 199901L) /* C99 */)) #include <stdint.h> typedef uint64_t XXH64_hash_t; #else + #include <limits.h> + #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL +/* LP64 ABI says uint64_t is unsigned long */ +typedef unsigned long XXH64_hash_t; + #else /* the following type must have a width of 64-bit */ typedef unsigned long long XXH64_hash_t; + #endif #endif /*! - * XXH64(): - * Returns the 64-bit hash of sequence of length @length stored at memory - * address @input. - * @seed can be used to alter the result predictably. + * @} + * + * @defgroup xxh64_family XXH64 family + * @ingroup public + * @{ + + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. * * This function usually runs faster on 64-bit systems, but slower on 32-bit * systems (see benchmark). * - * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. It provides a superior level of - * dispersion, and greatly reduces the risks of collisions. + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. */ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length, XXH64_hash_t seed); /******* Streaming *******/ +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void); XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr); @@ -439,12 +766,15 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t *src); -/*-********************************************************************** - * XXH3 64-bit variant - ************************************************************************/ +/*! + * @} + * ************************************************************************ + * @defgroup xxh3_family XXH3 family + * @ingroup public + * @{ -/* ************************************************************************ - * XXH3 is a new hash algorithm featuring: + * + * XXH3 is a more recent hash algorithm featuring: * - Improved speed for both small and large inputs * - True 64-bit and 128-bit outputs * - SIMD acceleration @@ -454,41 +784,38 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src); * * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html * - * In general, expect XXH3 to run about ~2x faster on large inputs and >3x - * faster on small ones compared to XXH64, though exact differences depend on - * the platform. + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. * - * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash - * on all platforms. - * - * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. - * - * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run - * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are - * explained in the implementation. + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Any 32-bit and 64-bit targets that can run XXH32 smoothly + * can run XXH3 at competitive speeds, even without vector support. + * Further details are explained in the implementation. * * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, - * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generage exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. * * XXH3 offers 2 variants, _64bits and _128bits. - * When only 64 bits are needed, prefer calling the _64bits variant, as it - * reduces the amount of mixing, resulting in faster speed on small inputs. * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. * It's also generally simpler to manipulate a scalar return type than a struct. * - * The 128-bit version adds additional strength, but it is slightly slower. - * - * Return values of XXH3 and XXH128 are officially finalized starting - * with v0.8.0 and will no longer change in future versions. - * Avoid storing values from before that release in long-term storage. - * - * Results produced by v0.7.x are not comparable with results from v0.7.y. - * However, the API is completely stable, and it can safely be used for - * ephemeral data (local sessions). - * * The API supports one-shot hashing, streaming mode, and custom secrets. */ +/*-********************************************************************** + * XXH3 64-bit variant + ************************************************************************/ + /* XXH3_64bits(): * default 64-bit variant, using default secret and default seed of 0. * It's the fastest variant. */ @@ -504,20 +831,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len); XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len, XXH64_hash_t seed); - /* - * XXH3_64bits_withSecret(): - * It's possible to provide any blob of bytes as a "secret" to generate the - * hash. This makes it more difficult for an external actor to prepare an - * intentional collision. The main condition is that secretSize *must* be - * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced - * hash values depends on secret's entropy. Technically, the secret must - * look like a bunch of random bytes. Avoid "trivial" or structured data - * such as repeated sequences or a text document. Whenever unsure about the - * "randomness" of the blob of bytes, consider relabelling it as a "custom - * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate - * a high entropy secret derived from the custom seed. + /*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). */ #define XXH3_SECRET_SIZE_MIN 136 + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the + * hash. This makes it more difficult for an external actor to prepare an + * intentional collision. The main condition is that secretSize *must* be large + * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash + * values depends on secret's entropy. Technically, the secret must look like a + * bunch of random bytes. Avoid "trivial" or structured data such as repeated + * sequences or a text document. Whenever unsure about the "randomness" of the + * blob of bytes, consider relabelling it as a "custom seed" instead, and employ + * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived + * from the custom seed. + */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len, const void *secret, size_t secretSize); @@ -529,6 +864,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len, * As a consequence, streaming is slower than one-shot hashing. * For better performance, prefer one-shot functions whenever applicable. */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ typedef struct XXH3_state_s XXH3_state_t; XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void); XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr); @@ -572,10 +913,16 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *statePtr); * XXH3 128-bit variant ************************************************************************/ +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ typedef struct { - XXH64_hash_t low64; - XXH64_hash_t high64; + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ } XXH128_hash_t; @@ -649,6 +996,9 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); #endif /* XXH_NO_LONG_LONG */ +/*! + * @} + */ #endif /* XXHASH_H_5627135585666179 */ #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) @@ -660,7 +1010,7 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); * These declarations should only be used with static linking. * Never use them in association with dynamic linking! ***************************************************************************** - */ +*/ /* * These definitions are only present to allow static allocation @@ -668,41 +1018,72 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); * Never **ever** access their members directly. */ +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ struct XXH32_state_s { - XXH32_hash_t total_len_32; - XXH32_hash_t large_len; - XXH32_hash_t v1; - XXH32_hash_t v2; - XXH32_hash_t v3; - XXH32_hash_t v4; - XXH32_hash_t mem32[4]; - XXH32_hash_t memsize; - XXH32_hash_t - reserved; /* never read nor write, might be removed in a future version */ + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref + total_len_32 overflow) */ + XXH32_hash_t v1; /*!< First accumulator lane */ + XXH32_hash_t v2; /*!< Second accumulator lane */ + XXH32_hash_t v3; /*!< Third accumulator lane */ + XXH32_hash_t v4; /*!< Fourth accumulator lane */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as + unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may + be removed. */ }; /* typedef'd to XXH32_state_t */ #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ struct XXH64_state_s { - XXH64_hash_t total_len; - XXH64_hash_t v1; - XXH64_hash_t v2; - XXH64_hash_t v3; - XXH64_hash_t v4; - XXH64_hash_t mem64[4]; - XXH32_hash_t memsize; - XXH32_hash_t reserved32; /* required for padding anyway */ - XXH64_hash_t reserved64; /* never read nor write, might be removed in a future - version */ + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v1; /*!< First accumulator lane */ + XXH64_hash_t v2; /*!< Second accumulator lane */ + XXH64_hash_t v3; /*!< Third accumulator lane */ + XXH64_hash_t v4; /*!< Fourth accumulator lane */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as + unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it + may be removed. */ }; /* typedef'd to XXH64_state_t */ - #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \ + */ #include <stdalign.h> #define XXH_ALIGN(n) alignas(n) + #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ + /* In C++ alignas() is a keyword */ + #define XXH_ALIGN(n) alignas(n) #elif defined(__GNUC__) #define XXH_ALIGN(n) __attribute__((aligned(n))) #elif defined(_MSC_VER) @@ -713,39 +1094,94 @@ struct XXH64_state_s { /* Old GCC versions only accept the attribute after the type in structures. */ - #if !(defined(__STDC_VERSION__) && \ - (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + #if !(defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ && defined(__GNUC__) #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) #else #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type #endif + /*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ #define XXH3_INTERNALBUFFER_SIZE 256 + + /*! + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ #define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Do never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ struct XXH3_state_s { XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); - /* used to store a custom secret generated from a seed */ + /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref + * XXH64_state_s */ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); - XXH32_hash_t bufferedSize; - XXH32_hash_t reserved32; - size_t nbStripesSoFar; - XXH64_hash_t totalLen; - size_t nbStripesPerBlock; - size_t secretLimit; - XXH64_hash_t seed; - XXH64_hash_t reserved64; - const unsigned char *extSecret; /* reference to external secret; - * if == NULL, use .customSecret instead */ + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t reserved32; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number or stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see + * XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char *extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ /* note: there may be some padding at the end due to alignment on 64 bytes */ }; /* typedef'd to XXH3_state_t */ #undef XXH_ALIGN_MEMBER - /* When the XXH3_state_t structure is merely emplaced on stack, + /*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, * it should be initialized with XXH3_INITSTATE() or a memset() * in case its first reset uses XXH3_NNbits_reset_withSeed(). * This init can be omitted if the first reset uses default or _withSecret @@ -802,7 +1238,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, XXH64_hash_t seed); #endif /* XXH_NO_LONG_LONG */ - #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) #define XXH_IMPLEMENTATION #endif @@ -844,81 +1279,183 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, /* ************************************* * Tuning parameters ***************************************/ + /*! - * XXH_FORCE_MEMORY_ACCESS: - * By default, access to unaligned memory is controlled by `memcpy()`, which - * is safe and portable. - * - * Unfortunately, on some target/compiler combinations, the generated assembly - * is sub-optimal. + * @defgroup tuning Tuning parameters + * @{ + * - * The below switch allow selection of a different access method - * in the search for improved performance. - * Method 0 (default): - * Use `memcpy()`. Safe and portable. Default. - * Method 1: - * `__attribute__((packed))` statement. It depends on compiler extensions - * and is therefore not portable. - * This method is safe if your compiler supports it, and *generally* as - * fast or faster than `memcpy`. - * Method 2: - * Direct access via cast. This method doesn't depend on the compiler but - * violates the C standard. - * It can generate buggy code on targets which do not support unaligned - * memory accesses. - * But in some circumstances, it's the only known way to get the most - * performance (example: GCC + ARMv6) - * Method 3: - * Byteshift. This can generate the best code on old compilers which don't - * inline small `memcpy()` calls, and it might also be faster on - * big-endian systems which lack a native byteswap instruction. See - * https://stackoverflow.com/a/32095106/646947 for details. Prefer these - * methods in priority order (0 > 1 > 2 > 3) + * Various macros to control xxHash's behavior. */ + #ifdef XXH_DOXYGEN + /*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref xxh32_family and you have a strict C90 + * compiler. + */ + #define XXH_NO_LONG_LONG + #undef XXH_NO_LONG_LONG /* don't actually */ + /*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which + * is safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated + * assembly is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers + * will eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences + * an unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's + * the only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which + * don't inline small `memcpy()` calls, and it might also be faster on + * big-endian systems which lack a native byteswap instruction. However, + * some compilers will emit literal byteshifts even if the target supports + * unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may + * cause another to read garbage data or even crash. + * + * See https://stackoverflow.com/a/32095106/646947 for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ + #define XXH_FORCE_MEMORY_ACCESS 0 + /*! + * @def XXH_ACCEPT_NULL_INPUT_POINTER + * @brief Whether to add explicit `NULL` checks. + * + * If the input pointer is `NULL` and the length is non-zero, xxHash's + * default behavior is to dereference it, triggering a segfault. + * + * When this macro is enabled, xxHash actively checks the input for a null + * pointer. If it is, the result for null input pointers is the same as a + * zero-length input. + */ + #define XXH_ACCEPT_NULL_INPUT_POINTER 0 + /*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs + * (XXH32() and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally + * negligible, but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro + * to 0. Then the code will always use unaligned memory access. Align check + * is automatically disabled on x86, x64 & arm64, which are platforms known + * to offer good unaligned memory accesses performance. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ + #define XXH_FORCE_ALIGN_CHECK 0 + + /*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all + * internal functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary + * which might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to + * performance, depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ + #define XXH_NO_INLINE_HINTS 0 + + /*! + * @def XXH_REROLL + * @brief Whether to reroll `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses an unrolled loop + * in the form of a switch statement. + * + * This is not always desirable, as it generates larger code, + * and depending on the architecture, may even be slower + * + * This is automatically defined with `-Os`/`-Oz` on GCC and Clang. + */ + #define XXH_REROLL 0 + + /*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use + * this. + */ + #define XXH_OLD_NAMES + #undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + #endif /* XXH_DOXYGEN */ +/*! + * @} + */ + #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \ line for example */ - #if !defined(__clang__) && defined(__GNUC__) && \ - defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \ - (__ARM_ARCH == 6) - #define XXH_FORCE_MEMORY_ACCESS 2 - #elif !defined(__clang__) && \ - ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ - (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) + /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */ + #if !defined(__clang__) && \ + ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \ + (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \ + (!defined(__mips16) || defined(__mips_mips16e2)))))) #define XXH_FORCE_MEMORY_ACCESS 1 #endif #endif - /*! - * XXH_ACCEPT_NULL_INPUT_POINTER: - * If the input pointer is NULL, xxHash's default behavior is to dereference - * it, triggering a segfault. When this macro is enabled, xxHash actively - * checks the input for a null pointer. If it is, the result for null input - * pointers is the same as a zero-length input. - */ #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ #define XXH_ACCEPT_NULL_INPUT_POINTER 0 #endif - /*! - * XXH_FORCE_ALIGN_CHECK: - * This is an important performance trick - * for architectures without decent unaligned memory access performance. - * It checks for input alignment, and when conditions are met, - * uses a "fast path" employing direct 32-bit/64-bit read, - * resulting in _dramatically faster_ read speed. - * - * The check costs one initial branch per hash, which is generally negligible, - * but not zero. Moreover, it's not useful to generate binary for an - * additional code path if memory access uses same instruction for both - * aligned and unaligned adresses. - * - * In these cases, the alignment check can be removed by setting this macro to - * 0. Then the code will always use unaligned memory access. Align check is - * automatically disabled on x86, x64 & arm64, which are platforms known to - * offer good unaligned memory accesses performance. - * - * This option does not affect XXH3 (only XXH32 and XXH64). - */ #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \ defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ @@ -928,25 +1465,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, #endif #endif - /*! - * XXH_NO_INLINE_HINTS: - * - * By default, xxHash tries to force the compiler to inline almost all - * internal functions. - * - * This can usually improve performance due to reduced jumping and improved - * constant folding, but significantly increases the size of the binary which - * might not be favorable. - * - * Additionally, sometimes the forced inlining can be detrimental to - * performance, depending on the architecture. - * - * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the - * compiler full control on whether to inline or not. - * - * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using - * -fno-inline with GCC or Clang, this will automatically be defined. - */ #ifndef XXH_NO_INLINE_HINTS #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ || defined(__NO_INLINE__) /* -O0, -fno-inline */ @@ -956,44 +1474,57 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, #endif #endif - /*! - * XXH_REROLL: - * Whether to reroll XXH32_finalize, and XXH64_finalize, - * instead of using an unrolled jump table/if statement loop. - * - * This is automatically defined on -Os/-Oz on GCC and Clang. - */ #ifndef XXH_REROLL - #if defined(__OPTIMIZE_SIZE__) + #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \ + (defined(__GNUC__) && !defined(__clang__)) + /* The if/then loop is preferable to switch/case on gcc (on x64) */ #define XXH_REROLL 1 #else #define XXH_REROLL 0 #endif #endif + /*! + * @defgroup impl Implementation + * @{ + + */ + /* ************************************* * Includes & Memory related functions ***************************************/ - /*! + /* * Modify the local functions below should you wish to use * different memory routines for malloc() and free() */ #include <stdlib.h> +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ static void *XXH_malloc(size_t s) { return malloc(s); } +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ static void XXH_free(void *p) { free(p); } - /*! and for memcpy() */ #include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ static void *XXH_memcpy(void *dest, const void *src, size_t size) { return memcpy(dest, src, size); @@ -1037,7 +1568,11 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) { /* ************************************* * Debug ***************************************/ - /* + /*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * * XXH_DEBUGLEVEL is expected to be defined externally, typically via the * compiler's command line options. The value must be a number. */ @@ -1057,12 +1592,58 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) { #endif /* note: use after variable declarations */ - #define XXH_STATIC_ASSERT(c) \ - do { \ - \ - enum { XXH_sa = 1 / (int)(!!(c)) }; \ - \ - } while (0) + #ifndef XXH_STATIC_ASSERT + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ + #include <assert.h> + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + static_assert((c), m); \ + \ + } while (0) + #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + static_assert((c), m); \ + \ + } while (0) + #else + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + struct xxh_sa { \ + \ + char x[(c) ? 1 : -1]; \ + \ + }; \ + \ + } while (0) + #endif + #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c) + #endif + + /*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ + #ifdef __GNUC__ + #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var)) + #else + #define XXH_COMPILER_GUARD(var) ((void)0) + #endif /* ************************************* * Basic Types @@ -1085,6 +1666,56 @@ typedef XXH32_hash_t xxh_u32; /* *** Memory access *** */ +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) /* * Manual byteshift. Best for old compilers which don't inline memcpy. @@ -1146,16 +1777,23 @@ static xxh_u32 XXH_read32(const void *memPtr) { #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ -/* *** Endianess *** */ -typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; + /* *** Endianness *** */ /*! - * XXH_CPU_LITTLE_ENDIAN: + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * * Defined to 1 if the target is little endian, or 0 if it is big endian. * It can be defined externally, for example on the compiler command line. * - * If it is not defined, a runtime check (which is usually constant folded) - * is used instead. + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. */ #ifndef XXH_CPU_LITTLE_ENDIAN /* @@ -1170,8 +1808,11 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define XXH_CPU_LITTLE_ENDIAN 0 #else -/* - * runtime test, presumed to simplify to a constant by compiler +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. */ static int XXH_isLittleEndian(void) { @@ -1189,7 +1830,7 @@ static int XXH_isLittleEndian(void) { return one.c[0]; } - +\ #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() #endif #endif @@ -1205,6 +1846,19 @@ static int XXH_isLittleEndian(void) { #define XXH_HAS_BUILTIN(x) 0 #endif + /*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \ XXH_HAS_BUILTIN(__builtin_rotateleft64) #define XXH_rotl32 __builtin_rotateleft32 @@ -1219,6 +1873,14 @@ static int XXH_isLittleEndian(void) { #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) #endif + /*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ #if defined(_MSC_VER) /* Visual Studio */ #define XXH_swap32 _byteswap_ulong #elif XXH_GCC_VERSION >= 403 @@ -1236,7 +1898,17 @@ static xxh_u32 XXH_swap32(xxh_u32 x) { /* *************************** * Memory reads *****************************/ -typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ + +} XXH_alignment; /* * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. @@ -1295,6 +1967,7 @@ XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void * ptr, /* ************************************* * Misc ***************************************/ +/*! @ingroup public */ XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; @@ -1304,16 +1977,19 @@ XXH_PUBLIC_API unsigned XXH_versionNumber(void) { /* ******************************************************************* * 32-bit hash functions *********************************************************************/ -static const xxh_u32 XXH_PRIME32_1 = - 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ -static const xxh_u32 XXH_PRIME32_2 = - 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ -static const xxh_u32 XXH_PRIME32_3 = - 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ -static const xxh_u32 XXH_PRIME32_4 = - 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ -static const xxh_u32 XXH_PRIME32_5 = - 0x165667B1U; /* 0b00010110010101100110011110110001 */ +/*! + * @} + * @defgroup xxh32_impl XXH32 implementation + * @ingroup impl + * @{ + + */ +/* #define instead of static const, to be used as initializers */ + #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ + #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ + #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ + #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ + #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ #ifdef XXH_OLD_NAMES #define PRIME32_1 XXH_PRIME32_1 @@ -1323,19 +1999,29 @@ static const xxh_u32 XXH_PRIME32_5 = #define PRIME32_5 XXH_PRIME32_5 #endif +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { acc += input * XXH_PRIME32_2; acc = XXH_rotl32(acc, 13); acc *= XXH_PRIME32_1; - #if defined(__GNUC__) && defined(__SSE4_1__) && \ + #if (defined(__SSE4_1__) || defined(__aarch64__)) && \ !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: - * This inline assembly hack forces acc into a normal register. This is the - * only thing that prevents GCC and Clang from autovectorizing the XXH32 - * loop (pragmas and attributes don't work for some resason) without globally - * disabling SSE4.1. + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. * * The reason we want to avoid vectorization is because despite working on * 4 integers at a time, there are multiple factors slowing XXH32 down on @@ -1360,28 +2046,26 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { * can load data, while v3 can multiply. SSE forces them to operate * together. * - * How this hack works: - * __asm__("" // Declare an assembly block but don't declare any - * instructions : // However, as an Input/Output Operand, - * "+r" // constrain a read/write operand (+) as a general purpose - * register (r). (acc) // and set acc as the operand - * ); - * - * Because of the 'r', the compiler has promised that seed will be in a - * general purpose register and the '+' says that it will be 'read/write', - * so it has to assume it has changed. It is like volatile without all the - * loads and stores. - * - * Since the argument has to be in a normal register (not an SSE register), - * each time XXH32_round is called, it is impossible to vectorize. + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. */ - __asm__("" : "+r"(acc)); + XXH_COMPILER_GUARD(acc); #endif return acc; } -/* mix all bits */ +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param h32 The hash to avalanche. + * @return The avalanched hash. + */ static xxh_u32 XXH32_avalanche(xxh_u32 h32) { h32 ^= h32 >> 15; @@ -1395,11 +2079,23 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) { #define XXH_get32bits(p) XXH_readLE32_align(p, align) +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param h32 The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + */ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, XXH_alignment align) { - - /* dummy comment */ - +\ #define XXH_PROCESS1 \ do { \ \ @@ -1443,20 +2139,20 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 12: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 8: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 4: XXH_PROCESS4; return XXH32_avalanche(h32); case 13: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 9: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 5: XXH_PROCESS4; XXH_PROCESS1; @@ -1464,10 +2160,10 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 14: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 10: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 6: XXH_PROCESS4; XXH_PROCESS1; @@ -1476,22 +2172,22 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 15: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 11: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 7: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 3: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 2: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 1: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 0: return XXH32_avalanche(h32); @@ -1512,10 +2208,18 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, #undef XXH_PROCESS4 #endif +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input, len, seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len, xxh_u32 seed, XXH_alignment align) { - const xxh_u8 *bEnd = input + len; + const xxh_u8 *bEnd = input ? input + len : NULL; xxh_u32 h32; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ @@ -1565,6 +2269,7 @@ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, XXH32_hash_t seed) { @@ -1574,9 +2279,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, XXH32_reset(&state, seed); XXH32_update(&state, (const xxh_u8*)input, len); return XXH32_digest(&state); - #else - if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 3) == @@ -1593,13 +2296,16 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, } /******* Hash streaming *******/ - +/*! + * @ingroup xxh32_family + */ XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) { return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t)); } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { XXH_free(statePtr); @@ -1607,6 +2313,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState, const XXH32_state_t *srcState) { @@ -1614,6 +2321,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, XXH32_hash_t seed) { @@ -1630,6 +2338,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state, const void *input, size_t len) { @@ -1719,6 +2428,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) { xxh_u32 h32; @@ -1743,7 +2453,8 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) { /******* Canonical representation *******/ -/* +/*! + * @ingroup xxh32_family * The default return values from XXH functions are unsigned 32 and 64 bit * integers. * @@ -1765,6 +2476,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t *src) { @@ -1777,7 +2489,12 @@ XXH32_hashFromCanonical(const XXH32_canonical_t *src) { /* ******************************************************************* * 64-bit hash functions *********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ /******* Memory access *******/ typedef XXH64_hash_t xxh_u64; @@ -1786,40 +2503,6 @@ typedef XXH64_hash_t xxh_u64; #define U64 xxh_u64 #endif - /*! - * XXH_REROLL_XXH64: - * Whether to reroll the XXH64_finalize() loop. - * - * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a - * performance gain on 64-bit hosts, as only one jump is required. - * - * However, on 32-bit hosts, because arithmetic needs to be done with two - * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't - * beneficial to unroll. The code becomes ridiculously large (the largest - * function in the binary on i386!), and rerolling it saves anywhere from - * 3kB to 20kB. It is also slightly faster because it fits into cache better - * and is more likely to be inlined by the compiler. - * - * If XXH_REROLL is defined, this is ignored and the loop is always - * rerolled. - */ - #ifndef XXH_REROLL_XXH64 - #if (defined(__ILP32__) || \ - defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ - || !(defined(__x86_64__) || defined(_M_X64) || \ - defined(_M_AMD64) /* x86-64 */ \ - || defined(_M_ARM64) || defined(__aarch64__) || \ - defined(__arm64__) /* aarch64 */ \ - || defined(__PPC64__) || defined(__PPC64LE__) || \ - defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ - || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ - || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ - #define XXH_REROLL_XXH64 1 - #else - #define XXH_REROLL_XXH64 0 - #endif - #endif /* !defined(XXH_REROLL_XXH64) */ - #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) /* * Manual byteshift. Best for old compilers which don't inline memcpy. @@ -1950,23 +2633,35 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void * ptr, } -/******* xxh64 *******/ + /******* xxh64 *******/ + /*! + * @} + * @defgroup xxh64_impl XXH64 implementation + * @ingroup impl + * @{ -static const xxh_u64 XXH_PRIME64_1 = - 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 - */ -static const xxh_u64 XXH_PRIME64_2 = - 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 - */ -static const xxh_u64 XXH_PRIME64_3 = - 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 - */ -static const xxh_u64 XXH_PRIME64_4 = - 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 - */ -static const xxh_u64 XXH_PRIME64_5 = - 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 - */ + */ + /* #define rather that static const, to be used as initializers */ + #define XXH_PRIME64_1 \ + 0x9E3779B185EBCA87ULL /*!< \ + 0b1001111000110111011110011011000110000101111010111100101010000111 \ + */ + #define XXH_PRIME64_2 \ + 0xC2B2AE3D27D4EB4FULL /*!< \ + 0b1100001010110010101011100011110100100111110101001110101101001111 \ + */ + #define XXH_PRIME64_3 \ + 0x165667B19E3779F9ULL /*!< \ + 0b0001011001010110011001111011000110011110001101110111100111111001 \ + */ + #define XXH_PRIME64_4 \ + 0x85EBCA77C2B2AE63ULL /*!< \ + 0b1000010111101011110010100111011111000010101100101010111001100011 \ + */ + #define XXH_PRIME64_5 \ + 0x27D4EB2F165667C5ULL /*!< \ + 0b0010011111010100111010110010111100010110010101100110011111000101 \ + */ #ifdef XXH_OLD_NAMES #define PRIME64_1 XXH_PRIME64_1 @@ -2010,185 +2705,35 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) { static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len, XXH_alignment align) { - /* dummy comment */ - - #define XXH_PROCESS1_64 \ - do { \ - \ - h64 ^= (*ptr++) * XXH_PRIME64_5; \ - h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \ - \ - } while (0) - - #define XXH_PROCESS4_64 \ - do { \ - \ - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \ - ptr += 4; \ - h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \ - \ - } while (0) - - #define XXH_PROCESS8_64 \ - do { \ - \ - xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ - ptr += 8; \ - h64 ^= k1; \ - h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \ - \ - } while (0) - - /* Rerolled version for 32-bit targets is faster and much smaller. */ - if (XXH_REROLL || XXH_REROLL_XXH64) { - - len &= 31; - while (len >= 8) { - - XXH_PROCESS8_64; - len -= 8; - - } - - if (len >= 4) { - - XXH_PROCESS4_64; - len -= 4; - - } - - while (len > 0) { + len &= 31; + while (len >= 8) { - XXH_PROCESS1_64; - --len; + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + h64 ^= k1; + h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; - } + } - return XXH64_avalanche(h64); + if (len >= 4) { - } else { + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; - switch (len & 31) { + } - case 24: - XXH_PROCESS8_64; - /* fallthrough */ - case 16: - XXH_PROCESS8_64; - /* fallthrough */ - case 8: - XXH_PROCESS8_64; - return XXH64_avalanche(h64); - - case 28: - XXH_PROCESS8_64; - /* fallthrough */ - case 20: - XXH_PROCESS8_64; - /* fallthrough */ - case 12: - XXH_PROCESS8_64; - /* fallthrough */ - case 4: - XXH_PROCESS4_64; - return XXH64_avalanche(h64); - - case 25: - XXH_PROCESS8_64; - /* fallthrough */ - case 17: - XXH_PROCESS8_64; - /* fallthrough */ - case 9: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 29: - XXH_PROCESS8_64; - /* fallthrough */ - case 21: - XXH_PROCESS8_64; - /* fallthrough */ - case 13: - XXH_PROCESS8_64; - /* fallthrough */ - case 5: - XXH_PROCESS4_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 26: - XXH_PROCESS8_64; - /* fallthrough */ - case 18: - XXH_PROCESS8_64; - /* fallthrough */ - case 10: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 30: - XXH_PROCESS8_64; - /* fallthrough */ - case 22: - XXH_PROCESS8_64; - /* fallthrough */ - case 14: - XXH_PROCESS8_64; - /* fallthrough */ - case 6: - XXH_PROCESS4_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 27: - XXH_PROCESS8_64; - /* fallthrough */ - case 19: - XXH_PROCESS8_64; - /* fallthrough */ - case 11: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 31: - XXH_PROCESS8_64; - /* fallthrough */ - case 23: - XXH_PROCESS8_64; - /* fallthrough */ - case 15: - XXH_PROCESS8_64; - /* fallthrough */ - case 7: - XXH_PROCESS4_64; - /* fallthrough */ - case 3: - XXH_PROCESS1_64; - /* fallthrough */ - case 2: - XXH_PROCESS1_64; - /* fallthrough */ - case 1: - XXH_PROCESS1_64; - /* fallthrough */ - case 0: - return XXH64_avalanche(h64); + while (len > 0) { - } + h64 ^= (*ptr++) * XXH_PRIME64_5; + h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; + --len; } - /* impossible to reach */ - XXH_ASSERT(0); - return 0; /* unreachable, but some compilers complain without it */ + return XXH64_avalanche(h64); } @@ -2205,7 +2750,7 @@ static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len, XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len, xxh_u64 seed, XXH_alignment align) { - const xxh_u8 *bEnd = input + len; + const xxh_u8 *bEnd = input ? input + len : NULL; xxh_u64 h64; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ @@ -2259,6 +2804,7 @@ XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, XXH64_hash_t seed) { @@ -2268,9 +2814,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, XXH64_reset(&state, seed); XXH64_update(&state, (const xxh_u8*)input, len); return XXH64_digest(&state); - #else - if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 7) == @@ -2289,12 +2833,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, /******* Hash Streaming *******/ +/*! @ingroup xxh64_family*/ XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) { return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t)); } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { XXH_free(statePtr); @@ -2302,6 +2848,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState, const XXH64_state_t *srcState) { @@ -2309,6 +2856,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, XXH64_hash_t seed) { @@ -2325,6 +2873,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state, const void *input, size_t len) { @@ -2403,6 +2952,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) { xxh_u64 h64; @@ -2436,6 +2986,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) { /******* Canonical representation *******/ +/*! @ingroup xxh64_family */ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, XXH64_hash_t hash) { @@ -2445,6 +2996,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t *src) { @@ -2452,380 +3004,452 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) { } - /* ********************************************************************* - * XXH3 - * New generation hash designed for speed on small keys and vectorization - ************************************************************************ */ + #ifndef XXH_NO_XXH3 - /* === Compiler specifics === */ + /* ********************************************************************* + * XXH3 + * New generation hash designed for speed on small keys and vectorization + ************************************************************************ */ + /*! + * @} + * @defgroup xxh3_impl XXH3 implementation + * @ingroup impl + * @{ - #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ - #define XXH_RESTRICT restrict - #else - /* Note: it might be useful to define __restrict or __restrict__ for some - * C++ compilers */ - #define XXH_RESTRICT /* disable */ - #endif + */ - #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ - (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ - defined(__clang__) - #define XXH_likely(x) __builtin_expect(x, 1) - #define XXH_unlikely(x) __builtin_expect(x, 0) - #else - #define XXH_likely(x) (x) - #define XXH_unlikely(x) (x) - #endif + /* === Compiler specifics === */ - #if defined(__GNUC__) - #if defined(__AVX2__) - #include <immintrin.h> - #elif defined(__SSE2__) - #include <emmintrin.h> - #elif defined(__ARM_NEON__) || defined(__ARM_NEON) - #define inline __inline__ /* circumvent a clang bug */ - #include <arm_neon.h> - #undef inline + #if ((defined(sun) || defined(__sun)) && \ + __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \ + with GCC 5.5 */ + #define XXH_RESTRICT /* disable */ + #elif defined(__STDC_VERSION__) && \ + __STDC_VERSION__ >= 199901L /* >= C99 */ + #define XXH_RESTRICT restrict + #else + /* Note: it might be useful to define __restrict or __restrict__ for + * some C++ compilers */ + #define XXH_RESTRICT /* disable */ #endif - #elif defined(_MSC_VER) - #include <intrin.h> - #endif - - /* - * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while - * remaining a true 64-bit/128-bit hash function. - * - * This is done by prioritizing a subset of 64-bit operations that can be - * emulated without too many steps on the average 32-bit machine. - * - * For example, these two lines seem similar, and run equally fast on - * 64-bit: - * - * xxh_u64 x; - * x ^= (x >> 47); // good - * x ^= (x >> 13); // bad - * - * However, to a 32-bit machine, there is a major difference. - * - * x ^= (x >> 47) looks like this: - * - * x.lo ^= (x.hi >> (47 - 32)); - * - * while x ^= (x >> 13) looks like this: - * - * // note: funnel shifts are not usually cheap. - * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); - * x.hi ^= (x.hi >> 13); - * - * The first one is significantly faster than the second, simply because the - * shift is larger than 32. This means: - * - All the bits we need are in the upper 32 bits, so we can ignore the - * lower 32 bits in the shift. - * - The shift result will always fit in the lower 32 bits, and therefore, - * we can ignore the upper 32 bits in the xor. - * - * Thanks to this optimization, XXH3 only requires these features to be - * efficient: - * - * - Usable unaligned access - * - A 32-bit or 64-bit ALU - * - If 32-bit, a decent ADC instruction - * - A 32 or 64-bit multiply with a 64-bit result - * - For the 128-bit variant, a decent byteswap helps short inputs. - * - * The first two are already required by XXH32, and almost all 32-bit and - * 64-bit platforms which can run XXH32 can run XXH3 efficiently. - * - * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one - * notable exception. - * - * First of all, Thumb-1 lacks support for the UMULL instruction which - * performs the important long multiply. This means numerous __aeabi_lmul - * calls. - * - * Second of all, the 8 functional registers are just not enough. - * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic - * need Lo registers, and this shuffling results in thousands more MOVs than - * A32. - * - * A32 and T32 don't have this limitation. They can access all 14 registers, - * do a 32->64 multiply with UMULL, and the flexible operand allowing free - * shifts is helpful, too. - * - * Therefore, we do a quick sanity check. - * - * If compiling Thumb-1 for a target which supports ARM instructions, we - * will emit a warning, as it is not a "sane" platform to compile for. - * - * Usually, if this happens, it is because of an accident and you probably - * need to specify -march, as you likely meant to compile for a newer - * architecture. - * - * Credit: large sections of the vectorial and asm source code paths - * have been contributed by @easyaspi314 - */ - #if defined(__thumb__) && !defined(__thumb2__) && \ - defined(__ARM_ARCH_ISA_ARM) - #warning "XXH3 is highly inefficient without ARM or Thumb-2." - #endif - /* ========================================== - * Vectorization detection - * ========================================== */ - #define XXH_SCALAR 0 /* Portable scalar version */ - #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ - #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ - #define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */ - #define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */ - #define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */ - - #ifndef XXH_VECTOR /* can be defined on command line */ - #if defined(__AVX512F__) - #define XXH_VECTOR XXH_AVX512 - #elif defined(__AVX2__) - #define XXH_VECTOR XXH_AVX2 - #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ - (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) - #define XXH_VECTOR XXH_SSE2 - #elif defined(__GNUC__) /* msvc support maybe later */ \ - && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ - (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ - || (defined(__BYTE_ORDER__) && \ - __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) - #define XXH_VECTOR XXH_NEON - #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \ - (defined(__s390x__) && defined(__VEC__)) && \ - defined(__GNUC__) /* TODO: IBM XL */ - #define XXH_VECTOR XXH_VSX + #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ + (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ + defined(__clang__) + #define XXH_likely(x) __builtin_expect(x, 1) + #define XXH_unlikely(x) __builtin_expect(x, 0) #else - #define XXH_VECTOR XXH_SCALAR + #define XXH_likely(x) (x) + #define XXH_unlikely(x) (x) #endif - #endif - /* - * Controls the alignment of the accumulator, - * for compatibility with aligned vector loads, which are usually faster. - */ - #ifndef XXH_ACC_ALIGN - #if defined(XXH_X86DISPATCH) - #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ - #elif XXH_VECTOR == XXH_SCALAR /* scalar */ - #define XXH_ACC_ALIGN 8 - #elif XXH_VECTOR == XXH_SSE2 /* sse2 */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_AVX2 /* avx2 */ - #define XXH_ACC_ALIGN 32 - #elif XXH_VECTOR == XXH_NEON /* neon */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_VSX /* vsx */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_AVX512 /* avx512 */ - #define XXH_ACC_ALIGN 64 + #if defined(__GNUC__) + #if defined(__AVX2__) + #include <immintrin.h> + #elif defined(__SSE2__) + #include <emmintrin.h> + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + #define inline __inline__ /* circumvent a clang bug */ + #include <arm_neon.h> + #undef inline + #endif + #elif defined(_MSC_VER) + #include <intrin.h> #endif - #endif - #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \ - XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 - #define XXH_SEC_ALIGN XXH_ACC_ALIGN - #else - #define XXH_SEC_ALIGN 8 - #endif - - /* - * UGLY HACK: - * GCC usually generates the best code with -O3 for xxHash. - * - * However, when targeting AVX2, it is overzealous in its unrolling - * resulting in code roughly 3/4 the speed of Clang. - * - * There are other issues, such as GCC splitting _mm256_loadu_si256 into - * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which - * only applies to Sandy and Ivy Bridge... which don't even support AVX2. - * - * That is why when compiling the AVX2 version, it is recommended to use - * either -O2 -mavx2 -march=haswell or -O2 -mavx2 - * -mno-avx256-split-unaligned-load for decent performance, or to use Clang - * instead. - * - * Fortunately, we can control the first one with a pragma that forces GCC - * into -O2, but the other one we can't control without "failed to inline - * always inline function due to target mismatch" warnings. - */ - #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && \ - !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ - #pragma GCC push_options - #pragma GCC optimize("-O2") - #endif - - #if XXH_VECTOR == XXH_NEON /* - * NEON's setup for vmlal_u32 is a little more complicated than it is on - * SSE2, AVX2, and VSX. + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. * - * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an - * upcast. + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. * - * To do the same operation, the 128-bit 'Q' register needs to be split - * into two 64-bit 'D' registers, performing this operation:: + * For example, these two lines seem similar, and run equally fast on + * 64-bit: * - * [ a | b ] | - * '---------. .--------' | | x | - * | .---------' '--------. | - * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad * - * Due to significant changes in aarch64, the fastest method for aarch64 - * is completely different than the fastest method for ARMv7-A. + * However, to a 32-bit machine, there is a major difference. * - * ARMv7-A treats D registers as unions overlaying Q registers, so - * modifying D11 will modify the high half of Q5. This is similar to how - * modifying AH will only affect bits 8-15 of AX on x86. + * x ^= (x >> 47) looks like this: * - * VZIP takes two registers, and puts even lanes in one register and odd - * lanes in the other. + * x.lo ^= (x.hi >> (47 - 32)); * - * On ARMv7-A, this strangely modifies both parameters in place instead of - * taking the usual 3-operand form. + * while x ^= (x >> 13) looks like this: * - * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on - * the lower and upper halves of the Q register to end up with the high - * and low halves where we want - all in one instruction. + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); * - * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], - * d11[1] } + * The first one is significantly faster than the second, simply because + * the shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the + * lower 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and + * therefore, we can ignore the upper 32 bits in the xor. * - * Unfortunately we need inline assembly for this: Instructions modifying - * two registers at once is not possible in GCC or Clang's IR, and they - * have to create a copy. + * Thanks to this optimization, XXH3 only requires these features to be + * efficient: * - * aarch64 requires a different approach. + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. * - * In order to make it easier to write a decent compiler for aarch64, many - * quirks were removed, such as conditional execution. + * The first two are already required by XXH32, and almost all 32-bit and + * 64-bit platforms which can run XXH32 can run XXH3 efficiently. * - * NEON was also affected by this. + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is + * one notable exception. * - * aarch64 cannot access the high bits of a Q-form register, and writes to - * a D-form register zero the high bits, similar to how writes to W-form - * scalar registers (or DWORD registers on x86_64) work. + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. * - * The formerly free vget_high intrinsics now require a vext (with a few - * exceptions) + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic + * need Lo registers, and this shuffling results in thousands more MOVs + * than A32. * - * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the - * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to - * only modify one operand. + * A32 and T32 don't have this limitation. They can access all 14 + * registers, do a 32->64 multiply with UMULL, and the flexible operand + * allowing free shifts is helpful, too. * - * The equivalent of the VZIP.32 on the lower and upper halves would be - * this mess: + * Therefore, we do a quick sanity check. * - * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] - * } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2 v0.2s, - * v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * If compiling Thumb-1 for a target which supports ARM instructions, we + * will emit a warning, as it is not a "sane" platform to compile for. * - * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 - * (SHRN): + * Usually, if this happens, it is because of an accident and you probably + * need to specify -march, as you likely meant to compile for a newer + * architecture. * - * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); - * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); - * - * This is available on ARMv7-A, but is less efficient than a single - * VZIP.32. + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 */ + #if defined(__thumb__) && !defined(__thumb2__) && \ + defined(__ARM_ARCH_ISA_ARM) + #warning "XXH3 is highly inefficient without ARM or Thumb-2." + #endif + + /* ========================================== + * Vectorization detection + * ========================================== */ + + #ifdef XXH_DOXYGEN + /*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the + * best implementation. + */ + #define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + +}; + + /*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment reqired for said + * vector type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ + #define XXH_ACC_ALIGN 8 + #endif + + /* Actual definition */ + #ifndef XXH_DOXYGEN + #define XXH_SCALAR 0 + #define XXH_SSE2 1 + #define XXH_AVX2 2 + #define XXH_AVX512 3 + #define XXH_NEON 4 + #define XXH_VSX 5 + #endif + + #ifndef XXH_VECTOR /* can be defined on command line */ + #if defined(__AVX512F__) + #define XXH_VECTOR XXH_AVX512 + #elif defined(__AVX2__) + #define XXH_VECTOR XXH_AVX2 + #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) + #define XXH_VECTOR XXH_SSE2 + #elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ + (defined( \ + __LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) + #define XXH_VECTOR XXH_NEON + #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \ + (defined(__s390x__) && defined(__VEC__)) && \ + defined(__GNUC__) /* TODO: IBM XL */ + #define XXH_VECTOR XXH_VSX + #else + #define XXH_VECTOR XXH_SCALAR + #endif + #endif /* - * Function-like macro: - * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t - * &outHi) - * { - - * outLo = (uint32x2_t)(in & 0xFFFFFFFF); - * outHi = (uint32x2_t)(in >> 32); - * in = UNDEFINED; - * } + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. */ - #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ - && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__) - #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - \ - /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \ - * %f0 = upper D half */ \ - /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \ - */ \ - /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \ - */ \ - __asm__("vzip.32 %e0, %f0" : "+w"(in)); \ - (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ - (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ - \ - } while (0) + #ifndef XXH_ACC_ALIGN + #if defined(XXH_X86DISPATCH) + #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ + #elif XXH_VECTOR == XXH_SCALAR /* scalar */ + #define XXH_ACC_ALIGN 8 + #elif XXH_VECTOR == XXH_SSE2 /* sse2 */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_AVX2 /* avx2 */ + #define XXH_ACC_ALIGN 32 + #elif XXH_VECTOR == XXH_NEON /* neon */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_VSX /* vsx */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_AVX512 /* avx512 */ + #define XXH_ACC_ALIGN 64 + #endif + #endif + #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \ + XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 + #define XXH_SEC_ALIGN XXH_ACC_ALIGN #else - #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - \ - (outLo) = vmovn_u64(in); \ - (outHi) = vshrn_n_u64((in), 32); \ - \ - } while (0) + #define XXH_SEC_ALIGN 8 + #endif + /* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling + * resulting in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization + * which only applies to Sandy and Ivy Bridge... which don't even support + * AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use + * either -O2 -mavx2 -march=haswell or -O2 -mavx2 + * -mno-avx256-split-unaligned-load for decent performance, or to use + * Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC + * into -O2, but the other one we can't control without "failed to inline + * always inline function due to target mismatch" warnings. + */ + #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && \ + !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + #pragma GCC push_options + #pragma GCC optimize("-O2") #endif - #endif /* XXH_VECTOR == XXH_NEON */ - /* - * VSX and Z Vector helpers. - * - * This is very messy, and any pull requests to clean this up are welcome. - * - * There are a lot of problems with supporting VSX and s390x, due to - * inconsistent intrinsics, spotty coverage, and multiple endiannesses. - */ - #if XXH_VECTOR == XXH_VSX - #if defined(__s390x__) - #include <s390intrin.h> - #else - /* gcc's altivec.h can have the unwanted consequence to unconditionally - * #define bool, vector, and pixel keywords, - * with bad consequences for programs already using these keywords for - * other purposes. The paragraph defining these macros is skipped when - * __APPLE_ALTIVEC__ is defined. - * __APPLE_ALTIVEC__ is _generally_ defined automatically by the - * compiler, but it seems that, in some cases, it isn't. Force the build - * macro to be defined, so that keywords are not altered. + #if XXH_VECTOR == XXH_NEON + /* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an + * upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split + * into two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] | + * '---------. .--------' | | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 + * ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 + * is completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so + * modifying D11 will modify the high half of Q5. This is similar to how + * modifying AH will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd + * lanes in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead + * of taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 + * on the lower and upper halves of the Q register to end up with the + * high and low halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { + + * d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions + * modifying two registers at once is not possible in GCC or Clang's IR, + * and they have to create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, + * many quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes + * to a D-form register zero the high bits, similar to how writes to + * W-form scalar registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the + * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to + * only modify one operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be + * this mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], + * v0[1] } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2 + * v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 + * (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single + * VZIP.32. */ - #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) - #define __APPLE_ALTIVEC__ - #endif - #include <altivec.h> - #endif -typedef __vector unsigned long long xxh_u64x2; -typedef __vector unsigned char xxh_u8x16; -typedef __vector unsigned xxh_u32x4; + /*! + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t + * &outHi) + * { - #ifndef XXH_VSX_BE - #if defined(__BIG_ENDIAN__) || \ - (defined(__BYTE_ORDER__) && \ - __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - #define XXH_VSX_BE 1 - #elif defined(__VEC_ELEMENT_REG_ORDER__) && \ - __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ - #warning \ - "-maltivec=be is not recommended. Please use native endianness." - #define XXH_VSX_BE 1 + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ + #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) && !defined(__aarch64__) && \ + !defined(__arm64__) + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \ + * %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \ + */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \ + */ \ + __asm__("vzip.32 %e0, %f0" : "+w"(in)); \ + (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + \ + } while (0) #else - #define XXH_VSX_BE 0 + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + \ + (outLo) = vmovn_u64(in); \ + (outHi) = vshrn_n_u64((in), 32); \ + \ + } while (0) #endif - #endif /* !defined(XXH_VSX_BE) */ + #endif /* XXH_VECTOR == XXH_NEON */ - #if XXH_VSX_BE - /* A wrapper for POWER9's vec_revb. */ - #if defined(__POWER9_VECTOR__) || \ - (defined(__clang__) && defined(__s390x__)) - #define XXH_vec_revb vec_revb + /* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ + #if XXH_VECTOR == XXH_VSX + #if defined(__s390x__) + #include <s390intrin.h> #else + /* gcc's altivec.h can have the unwanted consequence to + * unconditionally #define bool, vector, and pixel keywords, with bad + * consequences for programs already using these keywords for other + * purposes. The paragraph defining these macros is skipped when + * __APPLE_ALTIVEC__ is defined. + * __APPLE_ALTIVEC__ is _generally_ defined automatically by the + * compiler, but it seems that, in some cases, it isn't. Force the + * build macro to be defined, so that keywords are not altered. + */ + #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) + #define __APPLE_ALTIVEC__ + #endif + #include <altivec.h> + #endif + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + + #ifndef XXH_VSX_BE + #if defined(__BIG_ENDIAN__) || \ + (defined(__BYTE_ORDER__) && \ + __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define XXH_VSX_BE 1 + #elif defined(__VEC_ELEMENT_REG_ORDER__) && \ + __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ + #warning \ + "-maltivec=be is not recommended. Please use native endianness." + #define XXH_VSX_BE 1 + #else + #define XXH_VSX_BE 0 + #endif + #endif /* !defined(XXH_VSX_BE) */ + + #if XXH_VSX_BE + #if defined(__POWER9_VECTOR__) || \ + (defined(__clang__) && defined(__s390x__)) + #define XXH_vec_revb vec_revb + #else +/*! + * A polyfill for POWER9's vec_revb(). + */ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, @@ -2834,40 +3458,40 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { } - #endif - #endif /* XXH_VSX_BE */ + #endif + #endif /* XXH_VSX_BE */ -/* - * Performs an unaligned load and byte swaps it on big endian. +/*! + * Performs an unaligned vector load and byte swaps it on big endian. */ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { xxh_u64x2 ret; memcpy(&ret, ptr, sizeof(xxh_u64x2)); - #if XXH_VSX_BE + #if XXH_VSX_BE ret = XXH_vec_revb(ret); - #endif + #endif return ret; } - /* - * vec_mulo and vec_mule are very problematic intrinsics on PowerPC - * - * These intrinsics weren't added until GCC 8, despite existing for a - * while, and they are endian dependent. Also, their meaning swap - * depending on version. - * */ - #if defined(__s390x__) - /* s390x is always big endian, no issue on this platform */ - #define XXH_vec_mulo vec_mulo - #define XXH_vec_mule vec_mule - #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) - /* Clang has a better way to control this, we can just use the builtin - * which doesn't swap. */ - #define XXH_vec_mulo __builtin_altivec_vmulouw - #define XXH_vec_mule __builtin_altivec_vmuleuw - #else + /* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a + * while, and they are endian dependent. Also, their meaning swap + * depending on version. + * */ + #if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ + #define XXH_vec_mulo vec_mulo + #define XXH_vec_mule vec_mule + #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) + /* Clang has a better way to control this, we can just use the builtin + * which doesn't swap. */ + #define XXH_vec_mulo __builtin_altivec_vmulouw + #define XXH_vec_mule __builtin_altivec_vmuleuw + #else /* gcc needs inline assembly */ /* Adapted from * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ @@ -2887,40 +3511,41 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) { } - #endif /* XXH_vec_mulo, XXH_vec_mule */ - #endif /* XXH_VECTOR == XXH_VSX */ + #endif /* XXH_vec_mulo, XXH_vec_mule */ + #endif /* XXH_VECTOR == XXH_VSX */ - /* prefetch - * can be disabled, by declaring XXH_NO_PREFETCH build macro */ - #if defined(XXH_NO_PREFETCH) - #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ - #else - #if defined(_MSC_VER) && \ - (defined(_M_X64) || \ - defined( \ - _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ - #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ - #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0) - #elif defined(__GNUC__) && \ - ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) - #define XXH_PREFETCH(ptr) \ - __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) - #else + /* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ + #if defined(XXH_NO_PREFETCH) #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ - #endif - #endif /* XXH_NO_PREFETCH */ + #else + #if defined(_MSC_VER) && \ + (defined(_M_X64) || \ + defined( \ + _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ + #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ + #define XXH_PREFETCH(ptr) \ + _mm_prefetch((const char *)(ptr), _MM_HINT_T0) + #elif defined(__GNUC__) && \ + ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) + #define XXH_PREFETCH(ptr) \ + __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + #else + #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ + #endif + #endif /* XXH_NO_PREFETCH */ - /* ========================================== - * XXH3 default settings - * ========================================== */ + /* ========================================== + * XXH3 default settings + * ========================================== */ - #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ - #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) - #error "default keyset is not large enough" - #endif + #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) + #error "default keyset is not large enough" + #endif -/* Pseudorandom secret taken directly from FARSH */ +/*! Pseudorandom secret taken directly from FARSH. */ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { @@ -2943,69 +3568,79 @@ static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { }; - #ifdef XXH_OLD_NAMES - #define kSecret XXH3_kSecret - #endif + #ifdef XXH_OLD_NAMES + #define kSecret XXH3_kSecret + #endif - /* - * Calculates a 32-bit to 64-bit long multiply. - * - * Wraps __emulu on MSVC x86 because it tends to call __allmul when it - * doesn't need to (but it shouldn't need to anyways, it is about 7 - * instructions to do a 64x64 multiply...). Since we know that this will - * _always_ emit MULL, we use that instead of the normal method. - * - * If you are compiling for platforms like Thumb-1 and don't have a better - * option, you may also want to write your own long multiply routine here. - * - * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) - * { + #ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it + * doesn't need to (but it shouldn't need to anyways, it is about 7 instructions + * to do a 64x64 multiply...). Since we know that this will _always_ emit + * `MULL`, we use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better + * option, you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) { - * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); - * } - */ - #if defined(_MSC_VER) && defined(_M_IX86) - #include <intrin.h> - #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) - #else - /* - * Downcast + upcast is usually better than masking on older compilers - * like GCC 4.2 (especially 32-bit ones), all without affecting newer - * compilers. - * - * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both - * operands and perform a full 64x64 multiply -- entirely redundant on - * 32-bit. - */ - #define XXH_mult32to64(x, y) \ - ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) - #endif + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); -/* - * Calculates a 64->128-bit long multiply. +} + + #elif defined(_MSC_VER) && defined(_M_IX86) + #include <intrin.h> + #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) + #else + /* + * Downcast + upcast is usually better than masking on older compilers + * like GCC 4.2 (especially 32-bit ones), all without affecting newer + * compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both + * operands and perform a full 64x64 multiply -- entirely redundant on + * 32-bit. + */ + #define XXH_mult32to64(x, y) \ + ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) + #endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. * - * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + * @param lhs, rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. */ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { - /* - * GCC/Clang __uint128_t method. - * - * On most 64-bit targets, GCC and Clang define a __uint128_t type. - * This is usually the best way as it usually uses a native long 64-bit - * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. - * - * Usually. - * - * Despite being a 32-bit platform, Clang (and emscripten) define this type - * despite not having the arithmetic for it. This results in a laggy - * compiler builtin call which calculates a full 128-bit multiply. - * In that case it is best to use the portable one. - * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 - */ - #if defined(__GNUC__) && !defined(__wasm__) && \ - defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this + * type despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ + #if defined(__GNUC__) && !defined(__wasm__) && \ + defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; XXH128_hash_t r128; @@ -3013,19 +3648,19 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.high64 = (xxh_u64)(product >> 64); return r128; - /* - * MSVC for x64's _umul128 method. - * - * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 - * *HighProduct); - * - * This compiles to single operand MUL on x64. - */ - #elif defined(_M_X64) || defined(_M_IA64) + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 + * *HighProduct); + * + * This compiles to single operand MUL on x64. + */ + #elif defined(_M_X64) || defined(_M_IA64) - #ifndef _MSC_VER - #pragma intrinsic(_umul128) - #endif + #ifndef _MSC_VER + #pragma intrinsic(_umul128) + #endif xxh_u64 product_high; xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); XXH128_hash_t r128; @@ -3033,7 +3668,7 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.high64 = product_high; return r128; - #else + #else /* * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. * @@ -3093,16 +3728,20 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.low64 = lower; r128.high64 = upper; return r128; - #endif + #endif } -/* - * Does a 64-bit to 128-bit multiply, then XOR folds it. +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. * * The reason for the separate function is to prevent passing too many structs * around by value. This will hopefully inline the multiply, but we don't force * it. + * + * @param lhs, rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() */ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { @@ -3111,7 +3750,7 @@ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { } -/* Seems to produce slightly better code on GCC for some reason. */ +/*! Seems to produce slightly better code on GCC for some reason. */ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { XXH_ASSERT(0 <= shift && shift < 64); @@ -3216,7 +3855,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len, XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len < 8); + XXH_ASSERT(4 <= len && len <= 8); seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { @@ -3239,7 +3878,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input, XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); - XXH_ASSERT(8 <= len && len <= 16); + XXH_ASSERT(9 <= len && len <= 16); { xxh_u64 const bitflip1 = @@ -3306,11 +3945,10 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input, const xxh_u8 *XXH_RESTRICT secret, xxh_u64 seed64) { - #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ - && \ - !defined( \ - XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like \ + XXH32 hack */ /* * UGLY HACK: * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in @@ -3326,8 +3964,8 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input, * GCC generates much better scalar code than Clang for the rest of XXH3, * which is why finding a more optimal codepath is an interest. */ - __asm__("" : "+r"(seed64)); - #endif + XXH_COMPILER_GUARD(seed64); + #endif { xxh_u64 const input_lo = XXH_readLE64(input); @@ -3381,7 +4019,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b( } - #define XXH3_MIDSIZE_MAX 240 + #define XXH3_MIDSIZE_MAX 240 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( const xxh_u8 *XXH_RESTRICT input, size_t len, @@ -3391,8 +4029,8 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( (void)secretSize; XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 { @@ -3407,31 +4045,31 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( acc = XXH3_avalanche(acc); XXH_ASSERT(nbRounds >= 8); - #if defined(__clang__) /* Clang */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. - * In everywhere else, it uses scalar code. - * - * For 64->128-bit multiplies, even if the NEON was 100% optimal, it - * would still be slower than UMAAL (see XXH_mult64to128). - * - * Unfortunately, Clang doesn't handle the long multiplies properly and - * converts them to the nonexistent "vmulq_u64" intrinsic, which is then - * scalarized into an ugly mess of VMOV.32 instructions. - * - * This mess is difficult to avoid without turning autovectorization - * off completely, but they are usually relatively minor and/or not - * worth it to fix. - * - * This loop is the easiest to fix, as unlike XXH32, this pragma - * _actually works_ because it is a loop vectorization instead of an - * SLP vectorization. - */ - #pragma clang loop vectorize(disable) - #endif + #if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) + #endif for (i = 8; i < nbRounds; i++) { acc += @@ -3450,17 +4088,17 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( } - /* ======= Long Keys ======= */ + /* ======= Long Keys ======= */ - #define XXH_STRIPE_LEN 64 - #define XXH_SECRET_CONSUME_RATE \ - 8 /* nb of secret bytes consumed at each accumulation */ - #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + #define XXH_STRIPE_LEN 64 + #define XXH_SECRET_CONSUME_RATE \ + 8 /* nb of secret bytes consumed at each accumulation */ + #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) - #ifdef XXH_OLD_NAMES - #define STRIPE_LEN XXH_STRIPE_LEN - #define ACC_NB XXH_ACC_NB - #endif + #ifdef XXH_OLD_NAMES + #define STRIPE_LEN XXH_STRIPE_LEN + #define ACC_NB XXH_ACC_NB + #endif XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) { @@ -3469,56 +4107,58 @@ XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) { } - /* Several intrinsic functions below are supposed to accept __int64 as - * argument, as documented in - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However, - * several environments do not define __int64 type, requiring a workaround. - */ - #if !defined(__VMS) && \ - (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ - (__STDC_VERSION__ >= 199901L) /* C99 */)) + /* Several intrinsic functions below are supposed to accept __int64 as + * argument, as documented in + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ + #if !defined(__VMS) && \ + (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 199901L) /* C99 */)) typedef int64_t xxh_i64; - #else + #else /* the following type must have a width of 64-bit */ typedef long long xxh_i64; - #endif + #endif - /* - * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the - * most optimized. - * - * It is a hardened version of UMAC, based off of FARSH's implementation. - * - * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD - * implementations, and it is ridiculously fast. - * - * We harden it by mixing the original input to the accumulators as well as - * the product. - * - * This means that in the (relatively likely) case of a multiply by zero, the - * original input is preserved. - * - * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve - * cross-pollination, as otherwise the upper and lower halves would be - * essentially independent. - * - * This doesn't matter on 64-bit hashes since they all get merged together in - * the end, so we skip the extra step. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ + /* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the + * most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as + * the product. + * + * This means that in the (relatively likely) case of a multiply by zero, + * the original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together + * in the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ - #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH) + #if (XXH_VECTOR == XXH_AVX512) || \ + (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) - #ifndef XXH_TARGET_AVX512 - #define XXH_TARGET_AVX512 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_AVX512 + #define XXH_TARGET_AVX512 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, const void *XXH_RESTRICT secret) { - XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; + __m512i *const xacc = (__m512i *)acc; XXH_ASSERT((((size_t)acc) & 63) == 0); XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); @@ -3576,8 +4216,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512( XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); { - XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; - const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + __m512i *const xacc = (__m512i *)acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); /* xacc[0] ^= (xacc[0] >> 47) */ __m512i const acc_vec = *xacc; @@ -3609,19 +4249,21 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512( int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); __m512i const seed = _mm512_mask_set1_epi64( - _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64); + _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); - XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret; - XXH_ALIGN(64) __m512i *const dest = (__m512i *)customSecret; - int i; + const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret); + __m512i *const dest = (__m512i *)customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); for (i = 0; i < nbRounds; ++i) { /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void - * const*', this will warn "discards ‘const’ qualifier". */ + * const*', this will warn "discards 'const' qualifier". */ union { - XXH_ALIGN(64) const __m512i *cp; - XXH_ALIGN(64) void *p; + const __m512i *cp; + void * p; } remote_const_void; @@ -3635,13 +4277,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512( } - #endif + #endif - #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH) + #if (XXH_VECTOR == XXH_AVX2) || \ + (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) - #ifndef XXH_TARGET_AVX2 - #define XXH_TARGET_AVX2 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_AVX2 + #define XXH_TARGET_AVX2 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3650,7 +4293,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( XXH_ASSERT((((size_t)acc) & 31) == 0); { - XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc; + __m256i *const xacc = (__m256i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ @@ -3692,7 +4335,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2( XXH_ASSERT((((size_t)acc) & 31) == 0); { - XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc; + __m256i *const xacc = (__m256i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ const __m256i *const xsecret = (const __m256i *)secret; @@ -3732,24 +4375,23 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2( XXH_PREFETCH(customSecret); { - __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, - -(xxh_i64)seed64, (xxh_i64)seed64); + __m256i const seed = + _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, + (xxh_i64)(0U - seed64), (xxh_i64)seed64); - XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret; - XXH_ALIGN(64) __m256i * dest = (__m256i *)customSecret; + const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret); + __m256i * dest = (__m256i *)customSecret; - #if defined(__GNUC__) || defined(__clang__) + #if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack - * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with - * customSecret, and on aarch64, this prevented LDP from merging two - * loads together for free. Putting the loads together before the stores - * properly generates LDP. */ - __asm__("" : "+r"(dest)); - #endif + XXH_COMPILER_GUARD(dest); + #endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); /* GCC -O2 need unroll loop manually */ dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed); @@ -3763,13 +4405,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2( } - #endif + #endif - #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + /* x86dispatch always generates SSE2 */ + #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) - #ifndef XXH_TARGET_SSE2 - #define XXH_TARGET_SSE2 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_SSE2 + #define XXH_TARGET_SSE2 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3779,7 +4422,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc; + __m128i *const xacc = (__m128i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i *const xinput = (const __m128i *)input; @@ -3820,7 +4463,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc; + __m128i *const xacc = (__m128i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i *const xsecret = (const __m128i *)secret; @@ -3859,30 +4502,34 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2( int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); - #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - // MSVC 32bit mode does not support _mm_set_epi64x before 2015 + #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ XXH_ALIGN(16) - const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64}; + const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)}; __m128i const seed = _mm_load_si128((__m128i const *)seed64x2); - #else - __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64); - #endif + #else + __m128i const seed = + _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); + #endif int i; - XXH_ALIGN(64) const float *const src = (float const *)XXH3_kSecret; - XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret; - #if defined(__GNUC__) || defined(__clang__) + const void *const src16 = XXH3_kSecret; + __m128i * dst16 = (__m128i *)customSecret; + #if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ - __asm__("" : "+r"(dest)); - #endif + XXH_COMPILER_GUARD(dst16); + #endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); for (i = 0; i < nbRounds; ++i) { - dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed); + dst16[i] = + _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed); } @@ -3890,9 +4537,9 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2( } - #endif + #endif - #if (XXH_VECTOR == XXH_NEON) + #if (XXH_VECTOR == XXH_NEON) XXH_FORCE_INLINE void XXH3_accumulate_512_neon( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3901,7 +4548,7 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_neon( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc; + uint64x2_t *const xacc = (uint64x2_t *)acc; /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ uint8_t const *const xinput = (const uint8_t *)input; @@ -3996,9 +4643,9 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT acc, } - #endif + #endif - #if (XXH_VECTOR == XXH_VSX) + #if (XXH_VECTOR == XXH_VSX) XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -4025,12 +4672,12 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc, xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); xacc[i] += product; - /* swap high and low halves */ - #ifdef __s390x__ + /* swap high and low halves */ + #ifdef __s390x__ xacc[i] += vec_permi(data_vec, data_vec, 2); - #else + #else xacc[i] += vec_xxpermdi(data_vec, data_vec, 2); - #endif + #endif } @@ -4075,7 +4722,7 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT acc, } - #endif + #endif /* scalar variants - universal */ @@ -4083,7 +4730,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, const void *XXH_RESTRICT secret) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ const xxh_u8 *const xinput = (const xxh_u8 *)input; /* no alignment restriction */ @@ -4105,7 +4751,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar( XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ const xxh_u8 *const xsecret = (const xxh_u8 *)secret; /* no alignment restriction */ @@ -4135,7 +4780,7 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar( const xxh_u8 *kSecretPtr = XXH3_kSecret; XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - #if defined(__clang__) && defined(__aarch64__) + #if defined(__clang__) && defined(__aarch64__) /* * UGLY HACK: * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are @@ -4164,8 +4809,8 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar( * without hack: 2654.4 MB/s * with hack: 3202.9 MB/s */ - __asm__("" : "+r"(kSecretPtr)); - #endif + XXH_COMPILER_GUARD(kSecretPtr); + #endif /* * Note: in debug mode, this overrides the asm optimization * and Clang will emit MOVK chains again. @@ -4200,55 +4845,55 @@ typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *, typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *); typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64); - #if (XXH_VECTOR == XXH_AVX512) + #if (XXH_VECTOR == XXH_AVX512) - #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 - #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 - #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 + #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 + #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 - #elif (XXH_VECTOR == XXH_AVX2) + #elif (XXH_VECTOR == XXH_AVX2) - #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 - #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 - #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 + #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 + #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 - #elif (XXH_VECTOR == XXH_SSE2) + #elif (XXH_VECTOR == XXH_SSE2) - #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 - #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 - #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 + #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 + #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 - #elif (XXH_VECTOR == XXH_NEON) + #elif (XXH_VECTOR == XXH_NEON) - #define XXH3_accumulate_512 XXH3_accumulate_512_neon - #define XXH3_scrambleAcc XXH3_scrambleAcc_neon - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_neon + #define XXH3_scrambleAcc XXH3_scrambleAcc_neon + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #elif (XXH_VECTOR == XXH_VSX) + #elif (XXH_VECTOR == XXH_VSX) - #define XXH3_accumulate_512 XXH3_accumulate_512_vsx - #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_vsx + #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #else /* scalar */ + #else /* scalar */ - #define XXH3_accumulate_512 XXH3_accumulate_512_scalar - #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_scalar + #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #endif + #endif - #ifndef XXH_PREFETCH_DIST - #ifdef __clang__ - #define XXH_PREFETCH_DIST 320 - #else - #if (XXH_VECTOR == XXH_AVX512) - #define XXH_PREFETCH_DIST 512 + #ifndef XXH_PREFETCH_DIST + #ifdef __clang__ + #define XXH_PREFETCH_DIST 320 #else - #define XXH_PREFETCH_DIST 384 - #endif - #endif /* __clang__ */ - #endif /* XXH_PREFETCH_DIST */ + #if (XXH_VECTOR == XXH_AVX512) + #define XXH_PREFETCH_DIST 512 + #else + #define XXH_PREFETCH_DIST 384 + #endif + #endif /* __clang__ */ + #endif /* XXH_PREFETCH_DIST */ /* * XXH3_accumulate() @@ -4308,8 +4953,9 @@ XXH_FORCE_INLINE void XXH3_hashLong_internal_loop( { const xxh_u8 *const p = input + len - XXH_STRIPE_LEN; - #define XXH_SECRET_LASTACC_START \ - 7 /* not aligned on 8, last secret is different from acc & scrambler */ + #define XXH_SECRET_LASTACC_START \ + 7 /* not aligned on 8, last secret is different from acc & scrambler \ + */ f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); @@ -4337,10 +4983,10 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, for (i = 0; i < 4; i++) { result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i); - #if defined(__clang__) /* Clang */ \ - && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + #if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ /* * UGLY HACK: * Prevent autovectorization on Clang ARMv7-a. Exact same problem as @@ -4349,8 +4995,8 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, * without hack: 2063.7 MB/s * with hack: 2560.7 MB/s */ - __asm__("" : "+r"(result64)); - #endif + XXH_COMPILER_GUARD(result64); + #endif } @@ -4358,13 +5004,13 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, } - #define XXH3_INIT_ACC \ - { \ - \ - XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ - XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \ - \ - } + #define XXH3_INIT_ACC \ + { \ + \ + XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \ + \ + } XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal( const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret, @@ -4379,9 +5025,9 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal( /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); - /* do not align on 8, so that the secret is different from the accumulator - */ - #define XXH_SECRET_MERGEACCS_START 11 + /* do not align on 8, so that the secret is different from the accumulator + */ + #define XXH_SECRET_MERGEACCS_START 11 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); return XXH3_mergeAccs(acc, (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START, @@ -4501,6 +5147,7 @@ XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len, /* === Public entry point === */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) { return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), @@ -4508,6 +5155,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input, size_t len, const void *secret, @@ -4518,6 +5166,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len, XXH64_hash_t seed) { @@ -4603,6 +5252,7 @@ static void XXH_alignedFree(void *p) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) { XXH3_state_t *const state = @@ -4613,6 +5263,7 @@ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) { XXH_alignedFree(statePtr); @@ -4620,6 +5271,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, const XXH3_state_t *src_state) { @@ -4627,9 +5279,8 @@ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, } -static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr, - XXH64_hash_t seed, const void *secret, - size_t secretSize) { +static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed, + const void *secret, size_t secretSize) { size_t const initStart = offsetof(XXH3_state_t, bufferedSize); size_t const initLength = @@ -4654,26 +5305,28 @@ static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, - XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret( XXH3_state_t *statePtr, const void *secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize); + XXH3_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, XXH64_hash_t seed) { @@ -4681,7 +5334,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, if (seed == 0) return XXH3_64bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } @@ -4733,12 +5386,12 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, XXH3_f_scrambleAcc f_scramble) { if (input == NULL) - #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ - (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) return XXH_OK; - #else + #else return XXH_ERROR; - #endif + #endif { @@ -4747,6 +5400,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, (state->extSecret == NULL) ? state->customSecret : state->extSecret; state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ @@ -4756,10 +5410,10 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, } - /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ - #define XXH3_INTERNALBUFFER_STRIPES \ - (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + #define XXH3_INTERNALBUFFER_STRIPES \ + (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ @@ -4783,7 +5437,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, XXH_ASSERT(input < bEnd); /* Consume input by a multiple of internal buffer size */ - if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) { + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; do { @@ -4814,6 +5468,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state, const void *input, size_t len) { @@ -4859,6 +5514,7 @@ XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t * acc, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) { const unsigned char *const secret = @@ -4881,8 +5537,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) { } - #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH3_generateSecret(void * secretBuffer, const void *customSeed, size_t customSeedSize) { @@ -5398,6 +6055,7 @@ XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64, /* === Public XXH128 API === */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) { return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, @@ -5406,6 +6064,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input, size_t len, const void *secret, @@ -5416,6 +6075,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input, size_t len, XXH64_hash_t seed) { @@ -5426,6 +6086,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len, XXH64_hash_t seed) { @@ -5437,37 +6098,31 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len, /* * All the functions are actually the same as for 64-bit streaming variant. - * The only difference is the finalizatiom routine. + * The only difference is the finalization routine. */ -static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr, - XXH64_hash_t seed, const void *secret, - size_t secretSize) { - - XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); - -} - +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, - XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret( XXH3_state_t *statePtr, const void *secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize); + XXH3_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr, XXH64_hash_t seed) { @@ -5475,11 +6130,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr, if (seed == 0) return XXH3_128bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state, const void * input, size_t len) { @@ -5489,6 +6145,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) { const unsigned char *const secret = @@ -5524,11 +6181,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) { } - /* 128-bit utility functions */ + /* 128-bit utility functions */ - #include <string.h> /* memcmp, memcpy */ + #include <string.h> /* memcmp, memcpy */ /* return : 1 is equal, 0 if different */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ @@ -5540,6 +6198,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { * return : >0 if *h128_1 > *h128_2 * <0 if *h128_1 < *h128_2 * =0 if *h128_1 == *h128_2 */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) { XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1; @@ -5552,6 +6211,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) { } /*====== Canonical representation ======*/ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst, XXH128_hash_t hash) { @@ -5568,6 +6228,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t *src) { @@ -5578,16 +6239,21 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src) { } - /* Pop our optimization override from above */ - #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && \ - !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ - #pragma GCC pop_options - #endif + /* Pop our optimization override from above */ + #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && \ + !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + #pragma GCC pop_options + #endif - #endif /* XXH_NO_LONG_LONG */ + #endif /* XXH_NO_LONG_LONG */ + + #endif /* XXH_NO_XXH3 */ +/*! + * @} + */ #endif /* XXH_IMPLEMENTATION */ #if defined(__cplusplus) |