aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorvan Hauser <vh@thc.org>2021-11-07 14:09:09 +0100
committerGitHub <noreply@github.com>2021-11-07 14:09:09 +0100
commitfb443eaf2372ccd1825699c978fd0d662155fb9e (patch)
treeff019fc0b0704c16d68655d0f3864ec4cda49d30 /include
parent5b06413a5f109f310a62e36111a18d7325b246c3 (diff)
parent2ddbaa439ca78b0ae8cc6691d9657f5783b2d5e8 (diff)
downloadafl++-fb443eaf2372ccd1825699c978fd0d662155fb9e.tar.gz
Merge pull request #1141 from AFLplusplus/afl4
cmplog enhancement variant
Diffstat (limited to 'include')
-rw-r--r--include/afl-fuzz.h1
-rw-r--r--include/cmplog.h13
-rw-r--r--include/config.h4
-rw-r--r--include/types.h3
-rw-r--r--include/xxhash.h3084
5 files changed, 1889 insertions, 1216 deletions
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index e73ea1a4..f3d6d99d 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -1135,6 +1135,7 @@ void setup_signal_handlers(void);
void save_cmdline(afl_state_t *, u32, char **);
void read_foreign_testcases(afl_state_t *, int);
void write_crash_readme(afl_state_t *afl);
+u8 check_if_text_buf(u8 *buf, u32 len);
/* CmpLog */
diff --git a/include/cmplog.h b/include/cmplog.h
index 1c15d2b8..8778a4b6 100644
--- a/include/cmplog.h
+++ b/include/cmplog.h
@@ -48,7 +48,8 @@ struct cmp_header {
unsigned shape : 5;
unsigned type : 2;
unsigned attribute : 4;
- unsigned reserved : 5;
+ unsigned overflow : 1;
+ unsigned reserved : 4;
} __attribute__((packed));
@@ -59,14 +60,16 @@ struct cmp_operands {
u64 v0_128;
u64 v1_128;
-};
+} __attribute__((packed));
struct cmpfn_operands {
- u8 v0[32];
- u8 v1[32];
+ u8 v0[31];
+ u8 v0_len;
+ u8 v1[31];
+ u8 v1_len;
-};
+} __attribute__((packed));
typedef struct cmp_operands cmp_map_list[CMP_MAP_H];
diff --git a/include/config.h b/include/config.h
index 3aee9b00..b787152f 100644
--- a/include/config.h
+++ b/include/config.h
@@ -267,8 +267,8 @@
(first value), and to keep in memory as candidates. The latter should be much
higher than the former. */
-#define USE_AUTO_EXTRAS 128
-#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 64)
+#define USE_AUTO_EXTRAS 4096
+#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8)
/* Scaling factor for the effector map used to skip some of the more
expensive deterministic steps. The actual divisor is set to
diff --git a/include/types.h b/include/types.h
index e945f0f5..bbcc2f81 100644
--- a/include/types.h
+++ b/include/types.h
@@ -46,6 +46,8 @@ typedef uint128_t u128;
#define FS_ERROR_SHM_OPEN 4
#define FS_ERROR_SHMAT 8
#define FS_ERROR_MMAP 16
+#define FS_ERROR_OLD_CMPLOG 32
+#define FS_ERROR_OLD_CMPLOG_QEMU 64
/* Reporting options */
#define FS_OPT_ENABLED 0x80000001
@@ -53,6 +55,7 @@ typedef uint128_t u128;
#define FS_OPT_SNAPSHOT 0x20000000
#define FS_OPT_AUTODICT 0x10000000
#define FS_OPT_SHDMEM_FUZZ 0x01000000
+#define FS_OPT_NEWCMPLOG 0x02000000
#define FS_OPT_OLD_AFLPP_WORKAROUND 0x0f000000
// FS_OPT_MAX_MAPSIZE is 8388608 = 0x800000 = 2^23 = 1 << 22
#define FS_OPT_MAX_MAPSIZE ((0x00fffffeU >> 1) + 1)
diff --git a/include/xxhash.h b/include/xxhash.h
index 006d3f3d..0ca2b852 100644
--- a/include/xxhash.h
+++ b/include/xxhash.h
@@ -32,7 +32,12 @@
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
-
+/*!
+ * @mainpage xxHash
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
/* TODO: update */
/* Notice extracted from xxHash homepage:
@@ -45,7 +50,7 @@ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo
Name Speed Q.Score Author
xxHash 5.4 GB/s 10
CrapWow 3.2 GB/s 2 Andrew
-MumurHash 3a 2.7 GB/s 10 Austin Appleby
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
SpookyHash 2.0 GB/s 10 Bob Jenkins
SBox 1.4 GB/s 9 Bret Mulvey
Lookup3 1.2 GB/s 9 Bob Jenkins
@@ -119,29 +124,78 @@ extern "C" {
/*
* This part deals with the special case where a unit wants to inline xxHash,
- * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
- * as part of some previously included *.h header file.
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+ * such as part of some previously included *.h header file.
* Without further action, the new include would just be ignored,
* and functions would effectively _not_ be inlined (silent failure).
* The following macros solve this situation by prefixing all inlined names,
* avoiding naming collision with previous inclusions.
*/
- #ifdef XXH_NAMESPACE
- #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
- /*
- * Note: Alternative: #undef all symbols (it's a pretty large list).
- * Without #error: it compiles, but functions are actually not inlined.
- */
- #endif
+/* Before that, we unconditionally #undef all symbols,
+ * in case they were already defined with XXH_NAMESPACE.
+ * They will then be redefined for XXH_INLINE_ALL
+ */
+ #undef XXH_versionNumber
+/* XXH32 */
+ #undef XXH32
+ #undef XXH32_createState
+ #undef XXH32_freeState
+ #undef XXH32_reset
+ #undef XXH32_update
+ #undef XXH32_digest
+ #undef XXH32_copyState
+ #undef XXH32_canonicalFromHash
+ #undef XXH32_hashFromCanonical
+/* XXH64 */
+ #undef XXH64
+ #undef XXH64_createState
+ #undef XXH64_freeState
+ #undef XXH64_reset
+ #undef XXH64_update
+ #undef XXH64_digest
+ #undef XXH64_copyState
+ #undef XXH64_canonicalFromHash
+ #undef XXH64_hashFromCanonical
+/* XXH3_64bits */
+ #undef XXH3_64bits
+ #undef XXH3_64bits_withSecret
+ #undef XXH3_64bits_withSeed
+ #undef XXH3_createState
+ #undef XXH3_freeState
+ #undef XXH3_copyState
+ #undef XXH3_64bits_reset
+ #undef XXH3_64bits_reset_withSeed
+ #undef XXH3_64bits_reset_withSecret
+ #undef XXH3_64bits_update
+ #undef XXH3_64bits_digest
+ #undef XXH3_generateSecret
+/* XXH3_128bits */
+ #undef XXH128
+ #undef XXH3_128bits
+ #undef XXH3_128bits_withSeed
+ #undef XXH3_128bits_withSecret
+ #undef XXH3_128bits_reset
+ #undef XXH3_128bits_reset_withSeed
+ #undef XXH3_128bits_reset_withSecret
+ #undef XXH3_128bits_update
+ #undef XXH3_128bits_digest
+ #undef XXH128_isEqual
+ #undef XXH128_cmp
+ #undef XXH128_canonicalFromHash
+ #undef XXH128_hashFromCanonical
+/* Finally, free the namespace itself */
+ #undef XXH_NAMESPACE
+
+/* employ the namespace for XXH_INLINE_ALL */
#define XXH_NAMESPACE XXH_INLINE_
/*
- * Some identifiers (enums, type names) are not symbols, but they must
- * still be renamed to avoid redeclaration.
+ * Some identifiers (enums, type names) are not symbols,
+ * but they must nonetheless be renamed to avoid redeclaration.
* Alternative solution: do not redeclare them.
- * However, this requires some #ifdefs, and is a more dispersed action.
- * Meanwhile, renaming can be achieved in a single block
+ * However, this requires some #ifdefs, and has a more dispersed impact.
+ * Meanwhile, renaming can be achieved in a single place.
*/
- #define XXH_IPREF(Id) XXH_INLINE_##Id
+ #define XXH_IPREF(Id) XXH_NAMESPACE##Id
#define XXH_OK XXH_IPREF(XXH_OK)
#define XXH_ERROR XXH_IPREF(XXH_ERROR)
#define XXH_errorcode XXH_IPREF(XXH_errorcode)
@@ -166,6 +220,12 @@ extern "C" {
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1
+ /*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+
+ */
/* specific declaration modes for Windows */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#if defined(WIN32) && defined(_MSC_VER) && \
@@ -180,19 +240,24 @@ extern "C" {
#endif
#endif
- /*!
- * XXH_NAMESPACE, aka Namespace Emulation:
- *
- * If you want to include _and expose_ xxHash functions from within your own
- * library, but also want to avoid symbol collisions with other libraries
- * which may also include xxHash, you can use XXH_NAMESPACE to automatically
- * prefix any public symbol from xxhash library with the value of
- * XXH_NAMESPACE (therefore, avoid empty or numeric values).
- *
- * Note that no change is required within the calling program as long as it
- * includes `xxhash.h`: Regular symbol names will be automatically translated
- * by this header.
- */
+ #ifdef XXH_DOXYGEN
+ /*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries
+ * which may also include xxHash, you can use XXH_NAMESPACE to automatically
+ * prefix any public symbol from xxhash library with the value of
+ * XXH_NAMESPACE (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically
+ * translated by this header.
+ */
+ #define XXH_NAMESPACE /* YOUR NAME HERE */
+ #undef XXH_NAMESPACE
+ #endif
+
#ifdef XXH_NAMESPACE
#define XXH_CAT(A, B) A##B
#define XXH_NAME2(A, B) XXH_CAT(A, B)
@@ -264,10 +329,19 @@ extern "C" {
***************************************/
#define XXH_VERSION_MAJOR 0
#define XXH_VERSION_MINOR 8
- #define XXH_VERSION_RELEASE 0
+ #define XXH_VERSION_RELEASE 1
#define XXH_VERSION_NUMBER \
(XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is only useful when xxHash is compiled as a shared library, as it is
+ * independent of the version defined in the header.
+ *
+ * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
+ */
XXH_PUBLIC_API unsigned XXH_versionNumber(void);
/* ****************************
@@ -279,15 +353,24 @@ typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
/*-**********************************************************************
* 32-bit hash
************************************************************************/
- #if !defined(__VMS) && \
+ #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+ #elif !defined(__VMS) && \
(defined(__cplusplus) || \
(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
#include <stdint.h>
-typedef uint32_t XXH32_hash_t;
+typedef uint32_t XXH32_hash_t;
+
#else
#include <limits.h>
#if UINT_MAX == 0xFFFFFFFFUL
-typedef unsigned int XXH32_hash_t;
+typedef unsigned int XXH32_hash_t;
#else
#if ULONG_MAX == 0xFFFFFFFFUL
typedef unsigned long XXH32_hash_t;
@@ -298,24 +381,52 @@ typedef unsigned long XXH32_hash_t;
#endif
/*!
- * XXH32():
- * Calculate the 32-bit hash of sequence "length" bytes stored at memory
- * address "input". The memory between input & input+length must be valid
- * (allocated and read-accessible). "seed" can be used to alter the result
- * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher
- * benchmark): 5.4 GB/s
- *
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ * XXH32 is considered rather weak by today's standards.
+ * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
+ * systems, and offers true 64/128 bit hash results. It provides a superior
+ * level of dispersion, and greatly reduces the risks of collisions.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ * Direct equivalents for the other variants of xxHash.
+ * @see
+ * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
*/
XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
XXH32_hash_t seed);
-/******* Streaming *******/
-
-/*
- * Streaming functions generate the xxHash value from an incrememtal input.
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
* This method is slower than single-call functions, due to state management.
* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
*
@@ -336,19 +447,125 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
* digest, and generate new hash values later on by invoking `XXH*_digest()`.
*
* When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ * #include <stdio.h>
+ * #include <xxhash.h>
+ * #define BUFFER_SIZE 256
+ *
+ * // Note: XXH64 and XXH3 use the same interface.
+ * XXH32_hash_t
+ * hashFile(FILE* stream)
+ * {
+
+ * XXH32_state_t* state;
+ * unsigned char buf[BUFFER_SIZE];
+ * size_t amt;
+ * XXH32_hash_t hash;
+ *
+ * state = XXH32_createState(); // Create a state
+ * assert(state != NULL); // Error check here
+ * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed
+ * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+
+ * XXH32_update(state, buf, amt); // Hash the file in chunks
+ * }
+ * hash = XXH32_digest(state); // Finalize the hash
+ * XXH32_freeState(state); // Clean up
+ * return hash;
+ * }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
*/
+typedef struct XXH32_state_s XXH32_state_t;
-typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
-XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
-XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state,
- const XXH32_state_t *src_state);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref
+ * XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ * @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state,
+ const XXH32_state_t *src_state);
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
const void *input, size_t length);
-XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ * Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
/******* Canonical representation *******/
@@ -373,48 +590,158 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
* canonical format.
*/
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
typedef struct {
- unsigned char digest[4];
+ unsigned char digest[4]; /*!< Hash bytes, big endian */
} XXH32_canonical_t;
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ * @p dst must not be `NULL`.
+ */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ * @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
XXH_PUBLIC_API XXH32_hash_t
XXH32_hashFromCanonical(const XXH32_canonical_t *src);
+ #ifdef __has_attribute
+ #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+ #else
+ #define XXH_HAS_ATTRIBUTE(x) 0
+ #endif
+
+ /* C-language Attributes are added in C23. */
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \
+ defined(__has_c_attribute)
+ #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+ #else
+ #define XXH_HAS_C_ATTRIBUTE(x) 0
+ #endif
+
+ #if defined(__cplusplus) && defined(__has_cpp_attribute)
+ #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+ #else
+ #define XXH_HAS_CPP_ATTRIBUTE(x) 0
+ #endif
+
+ /*
+ Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough'
+ attribute introduced in CPP17 and C23. CPP17 :
+ https://en.cppreference.com/w/cpp/language/attributes/fallthrough C23 :
+ https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+ #if XXH_HAS_C_ATTRIBUTE(x)
+ #define XXH_FALLTHROUGH [[fallthrough]]
+ #elif XXH_HAS_CPP_ATTRIBUTE(x)
+ #define XXH_FALLTHROUGH [[fallthrough]]
+ #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+ #define XXH_FALLTHROUGH __attribute__((fallthrough))
+ #else
+ #define XXH_FALLTHROUGH
+ #endif
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+
+ */
+
#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
* 64-bit hash
************************************************************************/
- #if !defined(__VMS) && \
+ #if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+ #elif !defined(__VMS) && \
(defined(__cplusplus) || (defined(__STDC_VERSION__) && \
(__STDC_VERSION__ >= 199901L) /* C99 */))
#include <stdint.h>
typedef uint64_t XXH64_hash_t;
#else
+ #include <limits.h>
+ #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+/* LP64 ABI says uint64_t is unsigned long */
+typedef unsigned long XXH64_hash_t;
+ #else
/* the following type must have a width of 64-bit */
typedef unsigned long long XXH64_hash_t;
+ #endif
#endif
/*!
- * XXH64():
- * Returns the 64-bit hash of sequence of length @length stored at memory
- * address @input.
- * @seed can be used to alter the result predictably.
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
*
* This function usually runs faster on 64-bit systems, but slower on 32-bit
* systems (see benchmark).
*
- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- * and offers true 64/128 bit hash results. It provides a superior level of
- * dispersion, and greatly reduces the risks of collisions.
+ * @param input The block of data to be hashed, at least @p length bytes in
+ * size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ * Direct equivalents for the other variants of xxHash.
+ * @see
+ * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
*/
XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
XXH64_hash_t seed);
/******* Streaming *******/
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr);
@@ -439,12 +766,15 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
XXH_PUBLIC_API XXH64_hash_t
XXH64_hashFromCanonical(const XXH64_canonical_t *src);
-/*-**********************************************************************
- * XXH3 64-bit variant
- ************************************************************************/
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup xxh3_family XXH3 family
+ * @ingroup public
+ * @{
-/* ************************************************************************
- * XXH3 is a new hash algorithm featuring:
+ *
+ * XXH3 is a more recent hash algorithm featuring:
* - Improved speed for both small and large inputs
* - True 64-bit and 128-bit outputs
* - SIMD acceleration
@@ -454,41 +784,38 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src);
*
* https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
*
- * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
- * faster on small ones compared to XXH64, though exact differences depend on
- * the platform.
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
*
- * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
- * on all platforms.
- *
- * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
- *
- * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
- * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
- * explained in the implementation.
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+ * can run XXH3 at competitive speeds, even without vector support.
+ * Further details are explained in the implementation.
*
* Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
- * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
*
* XXH3 offers 2 variants, _64bits and _128bits.
- * When only 64 bits are needed, prefer calling the _64bits variant, as it
- * reduces the amount of mixing, resulting in faster speed on small inputs.
*
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
* It's also generally simpler to manipulate a scalar return type than a struct.
*
- * The 128-bit version adds additional strength, but it is slightly slower.
- *
- * Return values of XXH3 and XXH128 are officially finalized starting
- * with v0.8.0 and will no longer change in future versions.
- * Avoid storing values from before that release in long-term storage.
- *
- * Results produced by v0.7.x are not comparable with results from v0.7.y.
- * However, the API is completely stable, and it can safely be used for
- * ephemeral data (local sessions).
- *
* The API supports one-shot hashing, streaming mode, and custom secrets.
*/
+/*-**********************************************************************
+ * XXH3 64-bit variant
+ ************************************************************************/
+
/* XXH3_64bits():
* default 64-bit variant, using default secret and default seed of 0.
* It's the fastest variant. */
@@ -504,20 +831,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
XXH64_hash_t seed);
- /*
- * XXH3_64bits_withSecret():
- * It's possible to provide any blob of bytes as a "secret" to generate the
- * hash. This makes it more difficult for an external actor to prepare an
- * intentional collision. The main condition is that secretSize *must* be
- * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced
- * hash values depends on secret's entropy. Technically, the secret must
- * look like a bunch of random bytes. Avoid "trivial" or structured data
- * such as repeated sequences or a text document. Whenever unsure about the
- * "randomness" of the blob of bytes, consider relabelling it as a "custom
- * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate
- * a high entropy secret derived from the custom seed.
+ /*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
*/
#define XXH3_SECRET_SIZE_MIN 136
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the
+ * hash. This makes it more difficult for an external actor to prepare an
+ * intentional collision. The main condition is that secretSize *must* be large
+ * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash
+ * values depends on secret's entropy. Technically, the secret must look like a
+ * bunch of random bytes. Avoid "trivial" or structured data such as repeated
+ * sequences or a text document. Whenever unsure about the "randomness" of the
+ * blob of bytes, consider relabelling it as a "custom seed" instead, and employ
+ * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived
+ * from the custom seed.
+ */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
const void *secret,
size_t secretSize);
@@ -529,6 +864,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
* As a consequence, streaming is slower than one-shot hashing.
* For better performance, prefer one-shot functions whenever applicable.
*/
+
+/*!
+ * @brief The state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
@@ -572,10 +913,16 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *statePtr);
* XXH3 128-bit variant
************************************************************************/
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
typedef struct {
- XXH64_hash_t low64;
- XXH64_hash_t high64;
+ XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+ XXH64_hash_t high64; /*!< `value >> 64` */
} XXH128_hash_t;
@@ -649,6 +996,9 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
#endif /* XXH_NO_LONG_LONG */
+/*!
+ * @}
+ */
#endif /* XXHASH_H_5627135585666179 */
#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
@@ -660,7 +1010,7 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
* These declarations should only be used with static linking.
* Never use them in association with dynamic linking!
*****************************************************************************
- */
+*/
/*
* These definitions are only present to allow static allocation
@@ -668,41 +1018,72 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src);
* Never **ever** access their members directly.
*/
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
struct XXH32_state_s {
- XXH32_hash_t total_len_32;
- XXH32_hash_t large_len;
- XXH32_hash_t v1;
- XXH32_hash_t v2;
- XXH32_hash_t v3;
- XXH32_hash_t v4;
- XXH32_hash_t mem32[4];
- XXH32_hash_t memsize;
- XXH32_hash_t
- reserved; /* never read nor write, might be removed in a future version */
+ XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+ XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref
+ total_len_32 overflow) */
+ XXH32_hash_t v1; /*!< First accumulator lane */
+ XXH32_hash_t v2; /*!< Second accumulator lane */
+ XXH32_hash_t v3; /*!< Third accumulator lane */
+ XXH32_hash_t v4; /*!< Fourth accumulator lane */
+ XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as
+ unsigned char[16]. */
+ XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
+ XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may
+ be removed. */
}; /* typedef'd to XXH32_state_t */
#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
struct XXH64_state_s {
- XXH64_hash_t total_len;
- XXH64_hash_t v1;
- XXH64_hash_t v2;
- XXH64_hash_t v3;
- XXH64_hash_t v4;
- XXH64_hash_t mem64[4];
- XXH32_hash_t memsize;
- XXH32_hash_t reserved32; /* required for padding anyway */
- XXH64_hash_t reserved64; /* never read nor write, might be removed in a future
- version */
+ XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
+ XXH64_hash_t v1; /*!< First accumulator lane */
+ XXH64_hash_t v2; /*!< Second accumulator lane */
+ XXH64_hash_t v3; /*!< Third accumulator lane */
+ XXH64_hash_t v4; /*!< Fourth accumulator lane */
+ XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as
+ unsigned char[32]. */
+ XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
+  XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyway. */
+ XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it
+ may be removed. */
}; /* typedef'd to XXH64_state_t */
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \
+ */
#include <stdalign.h>
#define XXH_ALIGN(n) alignas(n)
+ #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+ /* In C++ alignas() is a keyword */
+ #define XXH_ALIGN(n) alignas(n)
#elif defined(__GNUC__)
#define XXH_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
@@ -713,39 +1094,94 @@ struct XXH64_state_s {
/* Old GCC versions only accept the attribute after the type in structures.
*/
- #if !(defined(__STDC_VERSION__) && \
- (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ #if !(defined(__STDC_VERSION__) && \
+ (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
&& defined(__GNUC__)
#define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif
+ /*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
#define XXH3_INTERNALBUFFER_SIZE 256
+
+ /*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with a dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
struct XXH3_state_s {
XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
- /* used to store a custom secret generated from a seed */
+ /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref
+ * XXH64_state_s */
XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+ /*!< Used to store a custom secret generated from a seed. */
XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
- XXH32_hash_t bufferedSize;
- XXH32_hash_t reserved32;
- size_t nbStripesSoFar;
- XXH64_hash_t totalLen;
- size_t nbStripesPerBlock;
- size_t secretLimit;
- XXH64_hash_t seed;
- XXH64_hash_t reserved64;
- const unsigned char *extSecret; /* reference to external secret;
- * if == NULL, use .customSecret instead */
+ /*!< The internal buffer. @see XXH32_state_s::mem32 */
+ XXH32_hash_t bufferedSize;
+ /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+ XXH32_hash_t reserved32;
+ /*!< Reserved field. Needed for padding on 64-bit. */
+ size_t nbStripesSoFar;
+  /*!< Number of stripes processed. */
+ XXH64_hash_t totalLen;
+ /*!< Total length hashed. 64-bit even on 32-bit targets. */
+ size_t nbStripesPerBlock;
+ /*!< Number of stripes per block. */
+ size_t secretLimit;
+ /*!< Size of @ref customSecret or @ref extSecret */
+ XXH64_hash_t seed;
+ /*!< Seed for _withSeed variants. Must be zero otherwise, @see
+ * XXH3_INITSTATE() */
+ XXH64_hash_t reserved64;
+ /*!< Reserved field. */
+ const unsigned char *extSecret;
+ /*!< Reference to an external secret for the _withSecret variants, NULL
+ * for other variants. */
/* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */
#undef XXH_ALIGN_MEMBER
- /* When the XXH3_state_t structure is merely emplaced on stack,
+ /*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
* it should be initialized with XXH3_INITSTATE() or a memset()
* in case its first reset uses XXH3_NNbits_reset_withSeed().
* This init can be omitted if the first reset uses default or _withSecret
@@ -802,7 +1238,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
XXH64_hash_t seed);
#endif /* XXH_NO_LONG_LONG */
-
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#define XXH_IMPLEMENTATION
#endif
@@ -844,81 +1279,183 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
/* *************************************
* Tuning parameters
***************************************/
+
/*!
- * XXH_FORCE_MEMORY_ACCESS:
- * By default, access to unaligned memory is controlled by `memcpy()`, which
- * is safe and portable.
- *
- * Unfortunately, on some target/compiler combinations, the generated assembly
- * is sub-optimal.
+ * @defgroup tuning Tuning parameters
+ * @{
+
*
- * The below switch allow selection of a different access method
- * in the search for improved performance.
- * Method 0 (default):
- * Use `memcpy()`. Safe and portable. Default.
- * Method 1:
- * `__attribute__((packed))` statement. It depends on compiler extensions
- * and is therefore not portable.
- * This method is safe if your compiler supports it, and *generally* as
- * fast or faster than `memcpy`.
- * Method 2:
- * Direct access via cast. This method doesn't depend on the compiler but
- * violates the C standard.
- * It can generate buggy code on targets which do not support unaligned
- * memory accesses.
- * But in some circumstances, it's the only known way to get the most
- * performance (example: GCC + ARMv6)
- * Method 3:
- * Byteshift. This can generate the best code on old compilers which don't
- * inline small `memcpy()` calls, and it might also be faster on
- * big-endian systems which lack a native byteswap instruction. See
- * https://stackoverflow.com/a/32095106/646947 for details. Prefer these
- * methods in priority order (0 > 1 > 2 > 3)
+ * Various macros to control xxHash's behavior.
*/
+ #ifdef XXH_DOXYGEN
+ /*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref xxh32_family and you have a strict C90
+ * compiler.
+ */
+ #define XXH_NO_LONG_LONG
+ #undef XXH_NO_LONG_LONG /* don't actually */
+ /*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which
+ * is safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated
+ * assembly is sub-optimal.
+ *
+  * The below switch allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ * @par
+ * Use `memcpy()`. Safe and portable. Note that most modern compilers
+ * will eliminate the function call and treat it as an unaligned access.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
+ * @par
+ * Depends on compiler extensions and is therefore not portable.
+ * This method is safe _if_ your compiler supports it,
+ * and *generally* as fast or faster than `memcpy`.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ * @par
+ * Casts directly and dereferences. This method doesn't depend on the
+ * compiler, but it violates the C standard as it directly dereferences
+ * an unaligned pointer. It can generate buggy code on targets which do not
+ * support unaligned memory accesses, but in some circumstances, it's
+ * the only known way to get the most performance.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ * @par
+ * Also portable. This can generate the best code on old compilers which
+ * don't inline small `memcpy()` calls, and it might also be faster on
+ * big-endian systems which lack a native byteswap instruction. However,
+ * some compilers will emit literal byteshifts even if the target supports
+ * unaligned access.
+ * .
+ *
+ * @warning
+ * Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ * care, as what works on one compiler/platform/optimization level may
+ * cause another to read garbage data or even crash.
+ *
+ * See https://stackoverflow.com/a/32095106/646947 for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+ #define XXH_FORCE_MEMORY_ACCESS 0
+ /*!
+ * @def XXH_ACCEPT_NULL_INPUT_POINTER
+ * @brief Whether to add explicit `NULL` checks.
+ *
+ * If the input pointer is `NULL` and the length is non-zero, xxHash's
+ * default behavior is to dereference it, triggering a segfault.
+ *
+ * When this macro is enabled, xxHash actively checks the input for a null
+ * pointer. If it is, the result for null input pointers is the same as a
+ * zero-length input.
+ */
+ #define XXH_ACCEPT_NULL_INPUT_POINTER 0
+ /*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs
+ * (XXH32() and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally
+ * negligible, but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro
+ * to 0. Then the code will always use unaligned memory access. Align check
+ * is automatically disabled on x86, x64 & arm64, which are platforms known
+ * to offer good unaligned memory accesses performance.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+ #define XXH_FORCE_ALIGN_CHECK 0
+
+ /*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all
+ * internal functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary
+ * which might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to
+ * performance, depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+ #define XXH_NO_INLINE_HINTS 0
+
+ /*!
+ * @def XXH_REROLL
+ * @brief Whether to reroll `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses an unrolled loop
+ * in the form of a switch statement.
+ *
+ * This is not always desirable, as it generates larger code,
+ * and depending on the architecture, may even be slower
+ *
+ * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
+ */
+ #define XXH_REROLL 0
+
+ /*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use
+ * this.
+ */
+ #define XXH_OLD_NAMES
+ #undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+ #endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
line for example */
- #if !defined(__clang__) && defined(__GNUC__) && \
- defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \
- (__ARM_ARCH == 6)
- #define XXH_FORCE_MEMORY_ACCESS 2
- #elif !defined(__clang__) && \
- ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
- (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+ /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
+ #if !defined(__clang__) && \
+ ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+ (defined(__GNUC__) && \
+ ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
+ (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \
+ (!defined(__mips16) || defined(__mips_mips16e2))))))
#define XXH_FORCE_MEMORY_ACCESS 1
#endif
#endif
- /*!
- * XXH_ACCEPT_NULL_INPUT_POINTER:
- * If the input pointer is NULL, xxHash's default behavior is to dereference
- * it, triggering a segfault. When this macro is enabled, xxHash actively
- * checks the input for a null pointer. If it is, the result for null input
- * pointers is the same as a zero-length input.
- */
#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
#define XXH_ACCEPT_NULL_INPUT_POINTER 0
#endif
- /*!
- * XXH_FORCE_ALIGN_CHECK:
- * This is an important performance trick
- * for architectures without decent unaligned memory access performance.
- * It checks for input alignment, and when conditions are met,
- * uses a "fast path" employing direct 32-bit/64-bit read,
- * resulting in _dramatically faster_ read speed.
- *
- * The check costs one initial branch per hash, which is generally negligible,
- * but not zero. Moreover, it's not useful to generate binary for an
- * additional code path if memory access uses same instruction for both
- * aligned and unaligned adresses.
- *
- * In these cases, the alignment check can be removed by setting this macro to
- * 0. Then the code will always use unaligned memory access. Align check is
- * automatically disabled on x86, x64 & arm64, which are platforms known to
- * offer good unaligned memory accesses performance.
- *
- * This option does not affect XXH3 (only XXH32 and XXH64).
- */
#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
#if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
@@ -928,25 +1465,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
#endif
#endif
- /*!
- * XXH_NO_INLINE_HINTS:
- *
- * By default, xxHash tries to force the compiler to inline almost all
- * internal functions.
- *
- * This can usually improve performance due to reduced jumping and improved
- * constant folding, but significantly increases the size of the binary which
- * might not be favorable.
- *
- * Additionally, sometimes the forced inlining can be detrimental to
- * performance, depending on the architecture.
- *
- * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
- * compiler full control on whether to inline or not.
- *
- * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
- * -fno-inline with GCC or Clang, this will automatically be defined.
- */
#ifndef XXH_NO_INLINE_HINTS
#if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
|| defined(__NO_INLINE__) /* -O0, -fno-inline */
@@ -956,44 +1474,57 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
#endif
#endif
- /*!
- * XXH_REROLL:
- * Whether to reroll XXH32_finalize, and XXH64_finalize,
- * instead of using an unrolled jump table/if statement loop.
- *
- * This is automatically defined on -Os/-Oz on GCC and Clang.
- */
#ifndef XXH_REROLL
- #if defined(__OPTIMIZE_SIZE__)
+ #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
+ (defined(__GNUC__) && !defined(__clang__))
+ /* The if/then loop is preferable to switch/case on gcc (on x64) */
#define XXH_REROLL 1
#else
#define XXH_REROLL 0
#endif
#endif
+ /*!
+ * @defgroup impl Implementation
+ * @{
+
+ */
+
/* *************************************
* Includes & Memory related functions
***************************************/
- /*!
+ /*
* Modify the local functions below should you wish to use
* different memory routines for malloc() and free()
*/
#include <stdlib.h>
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
static void *XXH_malloc(size_t s) {
return malloc(s);
}
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
static void XXH_free(void *p) {
free(p);
}
- /*! and for memcpy() */
#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
static void *XXH_memcpy(void *dest, const void *src, size_t size) {
return memcpy(dest, src, size);
@@ -1037,7 +1568,11 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
/* *************************************
* Debug
***************************************/
- /*
+ /*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
* XXH_DEBUGLEVEL is expected to be defined externally, typically via the
* compiler's command line options. The value must be a number.
*/
@@ -1057,12 +1592,58 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) {
#endif
/* note: use after variable declarations */
- #define XXH_STATIC_ASSERT(c) \
- do { \
- \
- enum { XXH_sa = 1 / (int)(!!(c)) }; \
- \
- } while (0)
+ #ifndef XXH_STATIC_ASSERT
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
+ #include <assert.h>
+ #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+ do { \
+ \
+ static_assert((c), m); \
+ \
+ } while (0)
+ #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
+ #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+ do { \
+ \
+ static_assert((c), m); \
+ \
+ } while (0)
+ #else
+ #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
+ do { \
+ \
+ struct xxh_sa { \
+ \
+ char x[(c) ? 1 : -1]; \
+ \
+ }; \
+ \
+ } while (0)
+ #endif
+ #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
+ #endif
+
+ /*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+ #ifdef __GNUC__
+ #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var))
+ #else
+ #define XXH_COMPILER_GUARD(var) ((void)0)
+ #endif
/* *************************************
* Basic Types
@@ -1085,6 +1666,56 @@ typedef XXH32_hash_t xxh_u32;
/* *** Memory access *** */
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ * aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
/*
* Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1146,16 +1777,23 @@ static xxh_u32 XXH_read32(const void *memPtr) {
#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
-/* *** Endianess *** */
-typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
+ /* *** Endianness *** */
/*!
- * XXH_CPU_LITTLE_ENDIAN:
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
* Defined to 1 if the target is little endian, or 0 if it is big endian.
* It can be defined externally, for example on the compiler command line.
*
- * If it is not defined, a runtime check (which is usually constant folded)
- * is used instead.
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ * This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
*/
#ifndef XXH_CPU_LITTLE_ENDIAN
/*
@@ -1170,8 +1808,11 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define XXH_CPU_LITTLE_ENDIAN 0
#else
-/*
- * runtime test, presumed to simplify to a constant by compiler
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
*/
static int XXH_isLittleEndian(void) {
@@ -1189,7 +1830,7 @@ static int XXH_isLittleEndian(void) {
return one.c[0];
}
-
+\
#define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
#endif
#endif
@@ -1205,6 +1846,19 @@ static int XXH_isLittleEndian(void) {
#define XXH_HAS_BUILTIN(x) 0
#endif
+ /*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ * @p r > 0 && @p r < 32
+ * @note
+ * @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
XXH_HAS_BUILTIN(__builtin_rotateleft64)
#define XXH_rotl32 __builtin_rotateleft32
@@ -1219,6 +1873,14 @@ static int XXH_isLittleEndian(void) {
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif
+ /*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
#if defined(_MSC_VER) /* Visual Studio */
#define XXH_swap32 _byteswap_ulong
#elif XXH_GCC_VERSION >= 403
@@ -1236,7 +1898,17 @@ static xxh_u32 XXH_swap32(xxh_u32 x) {
/* ***************************
* Memory reads
*****************************/
-typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+
+ XXH_aligned, /*!< Aligned */
+ XXH_unaligned /*!< Possibly unaligned */
+
+} XXH_alignment;
/*
* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
@@ -1295,6 +1967,7 @@ XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void * ptr,
/* *************************************
* Misc
***************************************/
+/*! @ingroup public */
XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
return XXH_VERSION_NUMBER;
@@ -1304,16 +1977,19 @@ XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
/* *******************************************************************
* 32-bit hash functions
*********************************************************************/
-static const xxh_u32 XXH_PRIME32_1 =
- 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
-static const xxh_u32 XXH_PRIME32_2 =
- 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
-static const xxh_u32 XXH_PRIME32_3 =
- 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
-static const xxh_u32 XXH_PRIME32_4 =
- 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
-static const xxh_u32 XXH_PRIME32_5 =
- 0x165667B1U; /* 0b00010110010101100110011110110001 */
+/*!
+ * @}
+ * @defgroup xxh32_impl XXH32 implementation
+ * @ingroup impl
+ * @{
+
+ */
+/* #define instead of static const, to be used as initializers */
+ #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
+ #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
+ #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
+ #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
+ #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
#ifdef XXH_OLD_NAMES
#define PRIME32_1 XXH_PRIME32_1
@@ -1323,19 +1999,29 @@ static const xxh_u32 XXH_PRIME32_5 =
#define PRIME32_5 XXH_PRIME32_5
#endif
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
acc += input * XXH_PRIME32_2;
acc = XXH_rotl32(acc, 13);
acc *= XXH_PRIME32_1;
- #if defined(__GNUC__) && defined(__SSE4_1__) && \
+ #if (defined(__SSE4_1__) || defined(__aarch64__)) && \
!defined(XXH_ENABLE_AUTOVECTORIZE)
/*
* UGLY HACK:
- * This inline assembly hack forces acc into a normal register. This is the
- * only thing that prevents GCC and Clang from autovectorizing the XXH32
- * loop (pragmas and attributes don't work for some resason) without globally
- * disabling SSE4.1.
+ * A compiler fence is the only thing that prevents GCC and Clang from
+ * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+ * reason) without globally disabling SSE4.1.
*
* The reason we want to avoid vectorization is because despite working on
* 4 integers at a time, there are multiple factors slowing XXH32 down on
@@ -1360,28 +2046,26 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
* can load data, while v3 can multiply. SSE forces them to operate
* together.
*
- * How this hack works:
- * __asm__("" // Declare an assembly block but don't declare any
- * instructions : // However, as an Input/Output Operand,
- * "+r" // constrain a read/write operand (+) as a general purpose
- * register (r). (acc) // and set acc as the operand
- * );
- *
- * Because of the 'r', the compiler has promised that seed will be in a
- * general purpose register and the '+' says that it will be 'read/write',
- * so it has to assume it has changed. It is like volatile without all the
- * loads and stores.
- *
- * Since the argument has to be in a normal register (not an SSE register),
- * each time XXH32_round is called, it is impossible to vectorize.
+ * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+ * and it is pointless writing a NEON implementation that is basically the
+ * same speed as scalar for XXH32.
*/
- __asm__("" : "+r"(acc));
+ XXH_COMPILER_GUARD(acc);
#endif
return acc;
}
-/* mix all bits */
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param h32 The hash to avalanche.
+ * @return The avalanched hash.
+ */
static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
h32 ^= h32 >> 15;
@@ -1395,11 +2079,23 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param h32 The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ */
static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
XXH_alignment align) {
-
- /* dummy comment */
-
+\
#define XXH_PROCESS1 \
do { \
\
@@ -1443,20 +2139,20 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
case 12:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 8:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 4:
XXH_PROCESS4;
return XXH32_avalanche(h32);
case 13:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 9:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 5:
XXH_PROCESS4;
XXH_PROCESS1;
@@ -1464,10 +2160,10 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
case 14:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 10:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 6:
XXH_PROCESS4;
XXH_PROCESS1;
@@ -1476,22 +2172,22 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
case 15:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 11:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 7:
XXH_PROCESS4;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 3:
XXH_PROCESS1;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 2:
XXH_PROCESS1;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 1:
XXH_PROCESS1;
- /* fallthrough */
+ XXH_FALLTHROUGH;
case 0:
return XXH32_avalanche(h32);
@@ -1512,10 +2208,18 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
#undef XXH_PROCESS4
#endif
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input, len, seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
xxh_u32 seed, XXH_alignment align) {
- const xxh_u8 *bEnd = input + len;
+ const xxh_u8 *bEnd = input ? input + len : NULL;
xxh_u32 h32;
#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -1565,6 +2269,7 @@ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
XXH32_hash_t seed) {
@@ -1574,9 +2279,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
XXH32_reset(&state, seed);
XXH32_update(&state, (const xxh_u8*)input, len);
return XXH32_digest(&state);
-
#else
-
if (XXH_FORCE_ALIGN_CHECK) {
if ((((size_t)input) & 3) ==
@@ -1593,13 +2296,16 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
}
/******* Hash streaming *******/
-
+/*!
+ * @ingroup xxh32_family
+ */
XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
XXH_free(statePtr);
@@ -1607,6 +2313,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState,
const XXH32_state_t *srcState) {
@@ -1614,6 +2321,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState,
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
XXH32_hash_t seed) {
@@ -1630,6 +2338,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
const void *input, size_t len) {
@@ -1719,6 +2428,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
xxh_u32 h32;
@@ -1743,7 +2453,8 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
/******* Canonical representation *******/
-/*
+/*!
+ * @ingroup xxh32_family
* The default return values from XXH functions are unsigned 32 and 64 bit
* integers.
*
@@ -1765,6 +2476,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
}
+/*! @ingroup xxh32_family */
XXH_PUBLIC_API XXH32_hash_t
XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
@@ -1777,7 +2489,12 @@ XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
/* *******************************************************************
* 64-bit hash functions
*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
/******* Memory access *******/
typedef XXH64_hash_t xxh_u64;
@@ -1786,40 +2503,6 @@ typedef XXH64_hash_t xxh_u64;
#define U64 xxh_u64
#endif
- /*!
- * XXH_REROLL_XXH64:
- * Whether to reroll the XXH64_finalize() loop.
- *
- * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
- * performance gain on 64-bit hosts, as only one jump is required.
- *
- * However, on 32-bit hosts, because arithmetic needs to be done with two
- * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't
- * beneficial to unroll. The code becomes ridiculously large (the largest
- * function in the binary on i386!), and rerolling it saves anywhere from
- * 3kB to 20kB. It is also slightly faster because it fits into cache better
- * and is more likely to be inlined by the compiler.
- *
- * If XXH_REROLL is defined, this is ignored and the loop is always
- * rerolled.
- */
- #ifndef XXH_REROLL_XXH64
- #if (defined(__ILP32__) || \
- defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
- || !(defined(__x86_64__) || defined(_M_X64) || \
- defined(_M_AMD64) /* x86-64 */ \
- || defined(_M_ARM64) || defined(__aarch64__) || \
- defined(__arm64__) /* aarch64 */ \
- || defined(__PPC64__) || defined(__PPC64LE__) || \
- defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
- || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
- || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
- #define XXH_REROLL_XXH64 1
- #else
- #define XXH_REROLL_XXH64 0
- #endif
- #endif /* !defined(XXH_REROLL_XXH64) */
-
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
/*
* Manual byteshift. Best for old compilers which don't inline memcpy.
@@ -1950,23 +2633,35 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void * ptr,
}
-/******* xxh64 *******/
+ /******* xxh64 *******/
+ /*!
+ * @}
+ * @defgroup xxh64_impl XXH64 implementation
+ * @ingroup impl
+ * @{
-static const xxh_u64 XXH_PRIME64_1 =
- 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
- */
-static const xxh_u64 XXH_PRIME64_2 =
- 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
- */
-static const xxh_u64 XXH_PRIME64_3 =
- 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
- */
-static const xxh_u64 XXH_PRIME64_4 =
- 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
- */
-static const xxh_u64 XXH_PRIME64_5 =
- 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
- */
+ */
+ /* #define rather that static const, to be used as initializers */
+ #define XXH_PRIME64_1 \
+ 0x9E3779B185EBCA87ULL /*!< \
+ 0b1001111000110111011110011011000110000101111010111100101010000111 \
+ */
+ #define XXH_PRIME64_2 \
+ 0xC2B2AE3D27D4EB4FULL /*!< \
+ 0b1100001010110010101011100011110100100111110101001110101101001111 \
+ */
+ #define XXH_PRIME64_3 \
+ 0x165667B19E3779F9ULL /*!< \
+ 0b0001011001010110011001111011000110011110001101110111100111111001 \
+ */
+ #define XXH_PRIME64_4 \
+ 0x85EBCA77C2B2AE63ULL /*!< \
+ 0b1000010111101011110010100111011111000010101100101010111001100011 \
+ */
+ #define XXH_PRIME64_5 \
+ 0x27D4EB2F165667C5ULL /*!< \
+ 0b0010011111010100111010110010111100010110010101100110011111000101 \
+ */
#ifdef XXH_OLD_NAMES
#define PRIME64_1 XXH_PRIME64_1
@@ -2010,185 +2705,35 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
XXH_alignment align) {
- /* dummy comment */
-
- #define XXH_PROCESS1_64 \
- do { \
- \
- h64 ^= (*ptr++) * XXH_PRIME64_5; \
- h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
- \
- } while (0)
-
- #define XXH_PROCESS4_64 \
- do { \
- \
- h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \
- ptr += 4; \
- h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
- \
- } while (0)
-
- #define XXH_PROCESS8_64 \
- do { \
- \
- xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
- ptr += 8; \
- h64 ^= k1; \
- h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
- \
- } while (0)
-
- /* Rerolled version for 32-bit targets is faster and much smaller. */
- if (XXH_REROLL || XXH_REROLL_XXH64) {
-
- len &= 31;
- while (len >= 8) {
-
- XXH_PROCESS8_64;
- len -= 8;
-
- }
-
- if (len >= 4) {
-
- XXH_PROCESS4_64;
- len -= 4;
-
- }
-
- while (len > 0) {
+ len &= 31;
+ while (len >= 8) {
- XXH_PROCESS1_64;
- --len;
+ xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+ ptr += 8;
+ h64 ^= k1;
+ h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+ len -= 8;
- }
+ }
- return XXH64_avalanche(h64);
+ if (len >= 4) {
- } else {
+ h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+ ptr += 4;
+ h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+ len -= 4;
- switch (len & 31) {
+ }
- case 24:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 16:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 8:
- XXH_PROCESS8_64;
- return XXH64_avalanche(h64);
-
- case 28:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 20:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 12:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 4:
- XXH_PROCESS4_64;
- return XXH64_avalanche(h64);
-
- case 25:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 17:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 9:
- XXH_PROCESS8_64;
- XXH_PROCESS1_64;
- return XXH64_avalanche(h64);
-
- case 29:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 21:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 13:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 5:
- XXH_PROCESS4_64;
- XXH_PROCESS1_64;
- return XXH64_avalanche(h64);
-
- case 26:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 18:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 10:
- XXH_PROCESS8_64;
- XXH_PROCESS1_64;
- XXH_PROCESS1_64;
- return XXH64_avalanche(h64);
-
- case 30:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 22:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 14:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 6:
- XXH_PROCESS4_64;
- XXH_PROCESS1_64;
- XXH_PROCESS1_64;
- return XXH64_avalanche(h64);
-
- case 27:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 19:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 11:
- XXH_PROCESS8_64;
- XXH_PROCESS1_64;
- XXH_PROCESS1_64;
- XXH_PROCESS1_64;
- return XXH64_avalanche(h64);
-
- case 31:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 23:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 15:
- XXH_PROCESS8_64;
- /* fallthrough */
- case 7:
- XXH_PROCESS4_64;
- /* fallthrough */
- case 3:
- XXH_PROCESS1_64;
- /* fallthrough */
- case 2:
- XXH_PROCESS1_64;
- /* fallthrough */
- case 1:
- XXH_PROCESS1_64;
- /* fallthrough */
- case 0:
- return XXH64_avalanche(h64);
+ while (len > 0) {
- }
+ h64 ^= (*ptr++) * XXH_PRIME64_5;
+ h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
+ --len;
}
- /* impossible to reach */
- XXH_ASSERT(0);
- return 0; /* unreachable, but some compilers complain without it */
+ return XXH64_avalanche(h64);
}
@@ -2205,7 +2750,7 @@ static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
xxh_u64 seed, XXH_alignment align) {
- const xxh_u8 *bEnd = input + len;
+ const xxh_u8 *bEnd = input ? input + len : NULL;
xxh_u64 h64;
#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
@@ -2259,6 +2804,7 @@ XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
XXH64_hash_t seed) {
@@ -2268,9 +2814,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
XXH64_reset(&state, seed);
XXH64_update(&state, (const xxh_u8*)input, len);
return XXH64_digest(&state);
-
#else
-
if (XXH_FORCE_ALIGN_CHECK) {
if ((((size_t)input) & 7) ==
@@ -2289,12 +2833,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
/******* Hash Streaming *******/
+/*! @ingroup xxh64_family*/
XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
XXH_free(statePtr);
@@ -2302,6 +2848,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState,
const XXH64_state_t *srcState) {
@@ -2309,6 +2856,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState,
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
XXH64_hash_t seed) {
@@ -2325,6 +2873,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
const void *input, size_t len) {
@@ -2403,6 +2952,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
xxh_u64 h64;
@@ -2436,6 +2986,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
/******* Canonical representation *******/
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
XXH64_hash_t hash) {
@@ -2445,6 +2996,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
}
+/*! @ingroup xxh64_family */
XXH_PUBLIC_API XXH64_hash_t
XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
@@ -2452,380 +3004,452 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
}
- /* *********************************************************************
- * XXH3
- * New generation hash designed for speed on small keys and vectorization
- ************************************************************************ */
+ #ifndef XXH_NO_XXH3
- /* === Compiler specifics === */
+ /* *********************************************************************
+ * XXH3
+ * New generation hash designed for speed on small keys and vectorization
+ ************************************************************************ */
+ /*!
+ * @}
+ * @defgroup xxh3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
- #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
- #define XXH_RESTRICT restrict
- #else
- /* Note: it might be useful to define __restrict or __restrict__ for some
- * C++ compilers */
- #define XXH_RESTRICT /* disable */
- #endif
+ */
- #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \
- (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
- defined(__clang__)
- #define XXH_likely(x) __builtin_expect(x, 1)
- #define XXH_unlikely(x) __builtin_expect(x, 0)
- #else
- #define XXH_likely(x) (x)
- #define XXH_unlikely(x) (x)
- #endif
+ /* === Compiler specifics === */
- #if defined(__GNUC__)
- #if defined(__AVX2__)
- #include <immintrin.h>
- #elif defined(__SSE2__)
- #include <emmintrin.h>
- #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
- #define inline __inline__ /* circumvent a clang bug */
- #include <arm_neon.h>
- #undef inline
+ #if ((defined(sun) || defined(__sun)) && \
+ __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \
+ with GCC 5.5 */
+ #define XXH_RESTRICT /* disable */
+ #elif defined(__STDC_VERSION__) && \
+ __STDC_VERSION__ >= 199901L /* >= C99 */
+ #define XXH_RESTRICT restrict
+ #else
+ /* Note: it might be useful to define __restrict or __restrict__ for
+ * some C++ compilers */
+ #define XXH_RESTRICT /* disable */
#endif
- #elif defined(_MSC_VER)
- #include <intrin.h>
- #endif
-
- /*
- * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
- * remaining a true 64-bit/128-bit hash function.
- *
- * This is done by prioritizing a subset of 64-bit operations that can be
- * emulated without too many steps on the average 32-bit machine.
- *
- * For example, these two lines seem similar, and run equally fast on
- * 64-bit:
- *
- * xxh_u64 x;
- * x ^= (x >> 47); // good
- * x ^= (x >> 13); // bad
- *
- * However, to a 32-bit machine, there is a major difference.
- *
- * x ^= (x >> 47) looks like this:
- *
- * x.lo ^= (x.hi >> (47 - 32));
- *
- * while x ^= (x >> 13) looks like this:
- *
- * // note: funnel shifts are not usually cheap.
- * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
- * x.hi ^= (x.hi >> 13);
- *
- * The first one is significantly faster than the second, simply because the
- * shift is larger than 32. This means:
- * - All the bits we need are in the upper 32 bits, so we can ignore the
- * lower 32 bits in the shift.
- * - The shift result will always fit in the lower 32 bits, and therefore,
- * we can ignore the upper 32 bits in the xor.
- *
- * Thanks to this optimization, XXH3 only requires these features to be
- * efficient:
- *
- * - Usable unaligned access
- * - A 32-bit or 64-bit ALU
- * - If 32-bit, a decent ADC instruction
- * - A 32 or 64-bit multiply with a 64-bit result
- * - For the 128-bit variant, a decent byteswap helps short inputs.
- *
- * The first two are already required by XXH32, and almost all 32-bit and
- * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
- *
- * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
- * notable exception.
- *
- * First of all, Thumb-1 lacks support for the UMULL instruction which
- * performs the important long multiply. This means numerous __aeabi_lmul
- * calls.
- *
- * Second of all, the 8 functional registers are just not enough.
- * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
- * need Lo registers, and this shuffling results in thousands more MOVs than
- * A32.
- *
- * A32 and T32 don't have this limitation. They can access all 14 registers,
- * do a 32->64 multiply with UMULL, and the flexible operand allowing free
- * shifts is helpful, too.
- *
- * Therefore, we do a quick sanity check.
- *
- * If compiling Thumb-1 for a target which supports ARM instructions, we
- * will emit a warning, as it is not a "sane" platform to compile for.
- *
- * Usually, if this happens, it is because of an accident and you probably
- * need to specify -march, as you likely meant to compile for a newer
- * architecture.
- *
- * Credit: large sections of the vectorial and asm source code paths
- * have been contributed by @easyaspi314
- */
- #if defined(__thumb__) && !defined(__thumb2__) && \
- defined(__ARM_ARCH_ISA_ARM)
- #warning "XXH3 is highly inefficient without ARM or Thumb-2."
- #endif
- /* ==========================================
- * Vectorization detection
- * ========================================== */
- #define XXH_SCALAR 0 /* Portable scalar version */
- #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
- #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
- #define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */
- #define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */
- #define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */
-
- #ifndef XXH_VECTOR /* can be defined on command line */
- #if defined(__AVX512F__)
- #define XXH_VECTOR XXH_AVX512
- #elif defined(__AVX2__)
- #define XXH_VECTOR XXH_AVX2
- #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
- (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
- #define XXH_VECTOR XXH_SSE2
- #elif defined(__GNUC__) /* msvc support maybe later */ \
- && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \
- (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
- || (defined(__BYTE_ORDER__) && \
- __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
- #define XXH_VECTOR XXH_NEON
- #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
- (defined(__s390x__) && defined(__VEC__)) && \
- defined(__GNUC__) /* TODO: IBM XL */
- #define XXH_VECTOR XXH_VSX
+ #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \
+ (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
+ defined(__clang__)
+ #define XXH_likely(x) __builtin_expect(x, 1)
+ #define XXH_unlikely(x) __builtin_expect(x, 0)
#else
- #define XXH_VECTOR XXH_SCALAR
+ #define XXH_likely(x) (x)
+ #define XXH_unlikely(x) (x)
#endif
- #endif
- /*
- * Controls the alignment of the accumulator,
- * for compatibility with aligned vector loads, which are usually faster.
- */
- #ifndef XXH_ACC_ALIGN
- #if defined(XXH_X86DISPATCH)
- #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
- #elif XXH_VECTOR == XXH_SCALAR /* scalar */
- #define XXH_ACC_ALIGN 8
- #elif XXH_VECTOR == XXH_SSE2 /* sse2 */
- #define XXH_ACC_ALIGN 16
- #elif XXH_VECTOR == XXH_AVX2 /* avx2 */
- #define XXH_ACC_ALIGN 32
- #elif XXH_VECTOR == XXH_NEON /* neon */
- #define XXH_ACC_ALIGN 16
- #elif XXH_VECTOR == XXH_VSX /* vsx */
- #define XXH_ACC_ALIGN 16
- #elif XXH_VECTOR == XXH_AVX512 /* avx512 */
- #define XXH_ACC_ALIGN 64
+ #if defined(__GNUC__)
+ #if defined(__AVX2__)
+ #include <immintrin.h>
+ #elif defined(__SSE2__)
+ #include <emmintrin.h>
+ #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+ #define inline __inline__ /* circumvent a clang bug */
+ #include <arm_neon.h>
+ #undef inline
+ #endif
+ #elif defined(_MSC_VER)
+ #include <intrin.h>
#endif
- #endif
- #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
- XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
- #define XXH_SEC_ALIGN XXH_ACC_ALIGN
- #else
- #define XXH_SEC_ALIGN 8
- #endif
-
- /*
- * UGLY HACK:
- * GCC usually generates the best code with -O3 for xxHash.
- *
- * However, when targeting AVX2, it is overzealous in its unrolling
- * resulting in code roughly 3/4 the speed of Clang.
- *
- * There are other issues, such as GCC splitting _mm256_loadu_si256 into
- * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
- * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
- *
- * That is why when compiling the AVX2 version, it is recommended to use
- * either -O2 -mavx2 -march=haswell or -O2 -mavx2
- * -mno-avx256-split-unaligned-load for decent performance, or to use Clang
- * instead.
- *
- * Fortunately, we can control the first one with a pragma that forces GCC
- * into -O2, but the other one we can't control without "failed to inline
- * always inline function due to target mismatch" warnings.
- */
- #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
- && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
- && defined(__OPTIMIZE__) && \
- !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
- #pragma GCC push_options
- #pragma GCC optimize("-O2")
- #endif
-
- #if XXH_VECTOR == XXH_NEON
/*
- * NEON's setup for vmlal_u32 is a little more complicated than it is on
- * SSE2, AVX2, and VSX.
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
*
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
- * upcast.
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
*
- * To do the same operation, the 128-bit 'Q' register needs to be split
- * into two 64-bit 'D' registers, performing this operation::
+ * For example, these two lines seem similar, and run equally fast on
+ * 64-bit:
*
- * [ a | b ] |
- * '---------. .--------' | | x |
- * | .---------' '--------. |
- * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
+ * xxh_u64 x;
+ * x ^= (x >> 47); // good
+ * x ^= (x >> 13); // bad
*
- * Due to significant changes in aarch64, the fastest method for aarch64
- * is completely different than the fastest method for ARMv7-A.
+ * However, to a 32-bit machine, there is a major difference.
*
- * ARMv7-A treats D registers as unions overlaying Q registers, so
- * modifying D11 will modify the high half of Q5. This is similar to how
- * modifying AH will only affect bits 8-15 of AX on x86.
+ * x ^= (x >> 47) looks like this:
*
- * VZIP takes two registers, and puts even lanes in one register and odd
- * lanes in the other.
+ * x.lo ^= (x.hi >> (47 - 32));
*
- * On ARMv7-A, this strangely modifies both parameters in place instead of
- * taking the usual 3-operand form.
+ * while x ^= (x >> 13) looks like this:
*
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on
- * the lower and upper halves of the Q register to end up with the high
- * and low halves where we want - all in one instruction.
+ * // note: funnel shifts are not usually cheap.
+ * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ * x.hi ^= (x.hi >> 13);
*
- * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1],
- * d11[1] }
+ * The first one is significantly faster than the second, simply because
+ * the shift is larger than 32. This means:
+ * - All the bits we need are in the upper 32 bits, so we can ignore the
+ * lower 32 bits in the shift.
+ * - The shift result will always fit in the lower 32 bits, and
+ * therefore, we can ignore the upper 32 bits in the xor.
*
- * Unfortunately we need inline assembly for this: Instructions modifying
- * two registers at once is not possible in GCC or Clang's IR, and they
- * have to create a copy.
+ * Thanks to this optimization, XXH3 only requires these features to be
+ * efficient:
*
- * aarch64 requires a different approach.
+ * - Usable unaligned access
+ * - A 32-bit or 64-bit ALU
+ * - If 32-bit, a decent ADC instruction
+ * - A 32 or 64-bit multiply with a 64-bit result
+ * - For the 128-bit variant, a decent byteswap helps short inputs.
*
- * In order to make it easier to write a decent compiler for aarch64, many
- * quirks were removed, such as conditional execution.
+ * The first two are already required by XXH32, and almost all 32-bit and
+ * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
*
- * NEON was also affected by this.
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is
+ * one notable exception.
*
- * aarch64 cannot access the high bits of a Q-form register, and writes to
- * a D-form register zero the high bits, similar to how writes to W-form
- * scalar registers (or DWORD registers on x86_64) work.
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
*
- * The formerly free vget_high intrinsics now require a vext (with a few
- * exceptions)
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
+ * need Lo registers, and this shuffling results in thousands more MOVs
+ * than A32.
*
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
- * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
- * only modify one operand.
+ * A32 and T32 don't have this limitation. They can access all 14
+ * registers, do a 32->64 multiply with UMULL, and the flexible operand
+ * allowing free shifts is helpful, too.
*
- * The equivalent of the VZIP.32 on the lower and upper halves would be
- * this mess:
+ * Therefore, we do a quick sanity check.
*
- * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1]
- * } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2 v0.2s,
- * v0.2s, v1.2s // v0 = { v0[1], v2[1] }
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will emit a warning, as it is not a "sane" platform to compile for.
*
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
- * (SHRN):
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you likely meant to compile for a newer
+ * architecture.
*
- * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
- * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
- *
- * This is available on ARMv7-A, but is less efficient than a single
- * VZIP.32.
+ * Credit: large sections of the vectorial and asm source code paths
+ * have been contributed by @easyaspi314
*/
+ #if defined(__thumb__) && !defined(__thumb2__) && \
+ defined(__ARM_ARCH_ISA_ARM)
+ #warning "XXH3 is highly inefficient without ARM or Thumb-2."
+ #endif
+
+ /* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+ #ifdef XXH_DOXYGEN
+ /*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the
+ * best implementation.
+ */
+ #define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * @ref XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+
+ XXH_SCALAR = 0, /*!< Portable scalar version */
+ XXH_SSE2 = 1, /*!<
+ * SSE2 for Pentium 4, Opteron, all x86_64.
+ *
+ * @note SSE2 is also guaranteed on Windows 10, macOS, and
+ * Android x86.
+ */
+ XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */
+ XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
+ XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
+ XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+
+};
+
+ /*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment reqired for said
+ * vector type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+ #define XXH_ACC_ALIGN 8
+ #endif
+
+ /* Actual definition */
+ #ifndef XXH_DOXYGEN
+ #define XXH_SCALAR 0
+ #define XXH_SSE2 1
+ #define XXH_AVX2 2
+ #define XXH_AVX512 3
+ #define XXH_NEON 4
+ #define XXH_VSX 5
+ #endif
+
+ #ifndef XXH_VECTOR /* can be defined on command line */
+ #if defined(__AVX512F__)
+ #define XXH_VECTOR XXH_AVX512
+ #elif defined(__AVX2__)
+ #define XXH_VECTOR XXH_AVX2
+ #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
+ (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+ #define XXH_VECTOR XXH_SSE2
+ #elif defined(__GNUC__) /* msvc support maybe later */ \
+ && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \
+ (defined( \
+ __LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+ || (defined(__BYTE_ORDER__) && \
+ __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+ #define XXH_VECTOR XXH_NEON
+ #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
+ (defined(__s390x__) && defined(__VEC__)) && \
+ defined(__GNUC__) /* TODO: IBM XL */
+ #define XXH_VECTOR XXH_VSX
+ #else
+ #define XXH_VECTOR XXH_SCALAR
+ #endif
+ #endif
/*
- * Function-like macro:
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
- * &outHi)
- * {
-
- * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
- * outHi = (uint32x2_t)(in >> 32);
- * in = UNDEFINED;
- * }
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
*/
- #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
- && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
- #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
- do { \
- \
- /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \
- * %f0 = upper D half */ \
- /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \
- */ \
- /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
- */ \
- __asm__("vzip.32 %e0, %f0" : "+w"(in)); \
- (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \
- (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
- \
- } while (0)
+ #ifndef XXH_ACC_ALIGN
+ #if defined(XXH_X86DISPATCH)
+ #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
+ #elif XXH_VECTOR == XXH_SCALAR /* scalar */
+ #define XXH_ACC_ALIGN 8
+ #elif XXH_VECTOR == XXH_SSE2 /* sse2 */
+ #define XXH_ACC_ALIGN 16
+ #elif XXH_VECTOR == XXH_AVX2 /* avx2 */
+ #define XXH_ACC_ALIGN 32
+ #elif XXH_VECTOR == XXH_NEON /* neon */
+ #define XXH_ACC_ALIGN 16
+ #elif XXH_VECTOR == XXH_VSX /* vsx */
+ #define XXH_ACC_ALIGN 16
+ #elif XXH_VECTOR == XXH_AVX512 /* avx512 */
+ #define XXH_ACC_ALIGN 64
+ #endif
+ #endif
+ #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
+ XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+ #define XXH_SEC_ALIGN XXH_ACC_ALIGN
#else
- #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
- do { \
- \
- (outLo) = vmovn_u64(in); \
- (outHi) = vshrn_n_u64((in), 32); \
- \
- } while (0)
+ #define XXH_SEC_ALIGN 8
+ #endif
+ /*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling
+ * resulting in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization
+ * which only applies to Sandy and Ivy Bridge... which don't even support
+ * AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use
+ * either -O2 -mavx2 -march=haswell or -O2 -mavx2
+ * -mno-avx256-split-unaligned-load for decent performance, or to use
+ * Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC
+ * into -O2, but the other one we can't control without "failed to inline
+ * always inline function due to target mismatch" warnings.
+ */
+ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__OPTIMIZE__) && \
+ !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
+ #pragma GCC push_options
+ #pragma GCC optimize("-O2")
#endif
- #endif /* XXH_VECTOR == XXH_NEON */
- /*
- * VSX and Z Vector helpers.
- *
- * This is very messy, and any pull requests to clean this up are welcome.
- *
- * There are a lot of problems with supporting VSX and s390x, due to
- * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
- */
- #if XXH_VECTOR == XXH_VSX
- #if defined(__s390x__)
- #include <s390intrin.h>
- #else
- /* gcc's altivec.h can have the unwanted consequence to unconditionally
- * #define bool, vector, and pixel keywords,
- * with bad consequences for programs already using these keywords for
- * other purposes. The paragraph defining these macros is skipped when
- * __APPLE_ALTIVEC__ is defined.
- * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
- * compiler, but it seems that, in some cases, it isn't. Force the build
- * macro to be defined, so that keywords are not altered.
+ #if XXH_VECTOR == XXH_NEON
+ /*
+ * NEON's setup for vmlal_u32 is a little more complicated than it is on
+ * SSE2, AVX2, and VSX.
+ *
+ * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
+ * upcast.
+ *
+ * To do the same operation, the 128-bit 'Q' register needs to be split
+ * into two 64-bit 'D' registers, performing this operation::
+ *
+ * [ a | b ] |
+ * '---------. .--------' | | x |
+ * | .---------' '--------. |
+ * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32
+ * ]
+ *
+ * Due to significant changes in aarch64, the fastest method for aarch64
+ * is completely different than the fastest method for ARMv7-A.
+ *
+ * ARMv7-A treats D registers as unions overlaying Q registers, so
+ * modifying D11 will modify the high half of Q5. This is similar to how
+ * modifying AH will only affect bits 8-15 of AX on x86.
+ *
+ * VZIP takes two registers, and puts even lanes in one register and odd
+ * lanes in the other.
+ *
+ * On ARMv7-A, this strangely modifies both parameters in place instead
+ * of taking the usual 3-operand form.
+ *
+ * Therefore, if we want to do this, we can simply use a D-form VZIP.32
+ * on the lower and upper halves of the Q register to end up with the
+ * high and low halves where we want - all in one instruction.
+ *
+ * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = {
+
+ * d10[1], d11[1] }
+ *
+ * Unfortunately we need inline assembly for this: Instructions
+ * modifying two registers at once is not possible in GCC or Clang's IR,
+ * and they have to create a copy.
+ *
+ * aarch64 requires a different approach.
+ *
+ * In order to make it easier to write a decent compiler for aarch64,
+ * many quirks were removed, such as conditional execution.
+ *
+ * NEON was also affected by this.
+ *
+ * aarch64 cannot access the high bits of a Q-form register, and writes
+ * to a D-form register zero the high bits, similar to how writes to
+ * W-form scalar registers (or DWORD registers on x86_64) work.
+ *
+ * The formerly free vget_high intrinsics now require a vext (with a few
+ * exceptions)
+ *
+ * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
+ * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
+ * only modify one operand.
+ *
+ * The equivalent of the VZIP.32 on the lower and upper halves would be
+ * this mess:
+ *
+ * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0],
+ * v0[1] } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2
+ * v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
+ *
+ * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
+ * (SHRN):
+ *
+ * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
+ * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+ *
+ * This is available on ARMv7-A, but is less efficient than a single
+ * VZIP.32.
*/
- #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
- #define __APPLE_ALTIVEC__
- #endif
- #include <altivec.h>
- #endif
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char xxh_u8x16;
-typedef __vector unsigned xxh_u32x4;
+ /*!
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
+ * &outHi)
+ * {
- #ifndef XXH_VSX_BE
- #if defined(__BIG_ENDIAN__) || \
- (defined(__BYTE_ORDER__) && \
- __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
- #define XXH_VSX_BE 1
- #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
- __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
- #warning \
- "-maltivec=be is not recommended. Please use native endianness."
- #define XXH_VSX_BE 1
+ * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ * outHi = (uint32x2_t)(in >> 32);
+ * in = UNDEFINED;
+ * }
+ */
+ #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+ && defined(__GNUC__) && !defined(__aarch64__) && \
+ !defined(__arm64__)
+ #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+ do { \
+ \
+ /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \
+ * %f0 = upper D half */ \
+ /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \
+ */ \
+ /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
+ */ \
+ __asm__("vzip.32 %e0, %f0" : "+w"(in)); \
+ (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \
+ (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
+ \
+ } while (0)
#else
- #define XXH_VSX_BE 0
+ #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+ do { \
+ \
+ (outLo) = vmovn_u64(in); \
+ (outHi) = vshrn_n_u64((in), 32); \
+ \
+ } while (0)
#endif
- #endif /* !defined(XXH_VSX_BE) */
+ #endif /* XXH_VECTOR == XXH_NEON */
- #if XXH_VSX_BE
- /* A wrapper for POWER9's vec_revb. */
- #if defined(__POWER9_VECTOR__) || \
- (defined(__clang__) && defined(__s390x__))
- #define XXH_vec_revb vec_revb
+ /*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+ #if XXH_VECTOR == XXH_VSX
+ #if defined(__s390x__)
+ #include <s390intrin.h>
#else
+ /* gcc's altivec.h can have the unwanted consequence to
+ * unconditionally #define bool, vector, and pixel keywords, with bad
+ * consequences for programs already using these keywords for other
+ * purposes. The paragraph defining these macros is skipped when
+ * __APPLE_ALTIVEC__ is defined.
+ * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
+ * compiler, but it seems that, in some cases, it isn't. Force the
+ * build macro to be defined, so that keywords are not altered.
+ */
+ #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+ #define __APPLE_ALTIVEC__
+ #endif
+ #include <altivec.h>
+ #endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+ #ifndef XXH_VSX_BE
+ #if defined(__BIG_ENDIAN__) || \
+ (defined(__BYTE_ORDER__) && \
+ __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #define XXH_VSX_BE 1
+ #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
+ __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+ #warning \
+ "-maltivec=be is not recommended. Please use native endianness."
+ #define XXH_VSX_BE 1
+ #else
+ #define XXH_VSX_BE 0
+ #endif
+ #endif /* !defined(XXH_VSX_BE) */
+
+ #if XXH_VSX_BE
+ #if defined(__POWER9_VECTOR__) || \
+ (defined(__clang__) && defined(__s390x__))
+ #define XXH_vec_revb vec_revb
+ #else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
@@ -2834,40 +3458,40 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
}
- #endif
- #endif /* XXH_VSX_BE */
+ #endif
+ #endif /* XXH_VSX_BE */
-/*
- * Performs an unaligned load and byte swaps it on big endian.
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
*/
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
xxh_u64x2 ret;
memcpy(&ret, ptr, sizeof(xxh_u64x2));
- #if XXH_VSX_BE
+ #if XXH_VSX_BE
ret = XXH_vec_revb(ret);
- #endif
+ #endif
return ret;
}
- /*
- * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
- *
- * These intrinsics weren't added until GCC 8, despite existing for a
- * while, and they are endian dependent. Also, their meaning swap
- * depending on version.
- * */
- #if defined(__s390x__)
- /* s390x is always big endian, no issue on this platform */
- #define XXH_vec_mulo vec_mulo
- #define XXH_vec_mule vec_mule
- #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
- /* Clang has a better way to control this, we can just use the builtin
- * which doesn't swap. */
- #define XXH_vec_mulo __builtin_altivec_vmulouw
- #define XXH_vec_mule __builtin_altivec_vmuleuw
- #else
+ /*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a
+ * while, and they are endian dependent. Also, their meaning swap
+ * depending on version.
+ * */
+ #if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+ #define XXH_vec_mulo vec_mulo
+ #define XXH_vec_mule vec_mule
+ #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+ /* Clang has a better way to control this, we can just use the builtin
+ * which doesn't swap. */
+ #define XXH_vec_mulo __builtin_altivec_vmulouw
+ #define XXH_vec_mule __builtin_altivec_vmuleuw
+ #else
/* gcc needs inline assembly */
/* Adapted from
* https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
@@ -2887,40 +3511,41 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
}
- #endif /* XXH_vec_mulo, XXH_vec_mule */
- #endif /* XXH_VECTOR == XXH_VSX */
+ #endif /* XXH_vec_mulo, XXH_vec_mule */
+ #endif /* XXH_VECTOR == XXH_VSX */
- /* prefetch
- * can be disabled, by declaring XXH_NO_PREFETCH build macro */
- #if defined(XXH_NO_PREFETCH)
- #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
- #else
- #if defined(_MSC_VER) && \
- (defined(_M_X64) || \
- defined( \
- _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
- #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
- #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
- #elif defined(__GNUC__) && \
- ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
- #define XXH_PREFETCH(ptr) \
- __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
- #else
+ /* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+ #if defined(XXH_NO_PREFETCH)
#define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
- #endif
- #endif /* XXH_NO_PREFETCH */
+ #else
+ #if defined(_MSC_VER) && \
+ (defined(_M_X64) || \
+ defined( \
+ _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
+ #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+ #define XXH_PREFETCH(ptr) \
+ _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
+ #elif defined(__GNUC__) && \
+ ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+ #define XXH_PREFETCH(ptr) \
+ __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+ #else
+ #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+ #endif
+ #endif /* XXH_NO_PREFETCH */
- /* ==========================================
- * XXH3 default settings
- * ========================================== */
+ /* ==========================================
+ * XXH3 default settings
+ * ========================================== */
- #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
+ #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
- #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
- #error "default keyset is not large enough"
- #endif
+ #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+ #error "default keyset is not large enough"
+ #endif
-/* Pseudorandom secret taken directly from FARSH */
+/*! Pseudorandom secret taken directly from FARSH. */
XXH_ALIGN(64)
static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
@@ -2943,69 +3568,79 @@ static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
};
- #ifdef XXH_OLD_NAMES
- #define kSecret XXH3_kSecret
- #endif
+ #ifdef XXH_OLD_NAMES
+ #define kSecret XXH3_kSecret
+ #endif
- /*
- * Calculates a 32-bit to 64-bit long multiply.
- *
- * Wraps __emulu on MSVC x86 because it tends to call __allmul when it
- * doesn't need to (but it shouldn't need to anyways, it is about 7
- * instructions to do a 64x64 multiply...). Since we know that this will
- * _always_ emit MULL, we use that instead of the normal method.
- *
- * If you are compiling for platforms like Thumb-1 and don't have a better
- * option, you may also want to write your own long multiply routine here.
- *
- * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
- * {
+ #ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it
+ * doesn't need to (but it shouldn't need to anyways, it is about 7 instructions
+ * to do a 64x64 multiply...). Since we know that this will _always_ emit
+ * `MULL`, we use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better
+ * option, you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
- * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
- * }
- */
- #if defined(_MSC_VER) && defined(_M_IX86)
- #include <intrin.h>
- #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
- #else
- /*
- * Downcast + upcast is usually better than masking on older compilers
- * like GCC 4.2 (especially 32-bit ones), all without affecting newer
- * compilers.
- *
- * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
- * operands and perform a full 64x64 multiply -- entirely redundant on
- * 32-bit.
- */
- #define XXH_mult32to64(x, y) \
- ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
- #endif
+ return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
-/*
- * Calculates a 64->128-bit long multiply.
+}
+
+ #elif defined(_MSC_VER) && defined(_M_IX86)
+ #include <intrin.h>
+ #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+ #else
+ /*
+ * Downcast + upcast is usually better than masking on older compilers
+ * like GCC 4.2 (especially 32-bit ones), all without affecting newer
+ * compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
+ * operands and perform a full 64x64 multiply -- entirely redundant on
+ * 32-bit.
+ */
+ #define XXH_mult32to64(x, y) \
+ ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+ #endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
*
- * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ * @param lhs, rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
*/
static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
- /*
- * GCC/Clang __uint128_t method.
- *
- * On most 64-bit targets, GCC and Clang define a __uint128_t type.
- * This is usually the best way as it usually uses a native long 64-bit
- * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
- *
- * Usually.
- *
- * Despite being a 32-bit platform, Clang (and emscripten) define this type
- * despite not having the arithmetic for it. This results in a laggy
- * compiler builtin call which calculates a full 128-bit multiply.
- * In that case it is best to use the portable one.
- * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
- */
- #if defined(__GNUC__) && !defined(__wasm__) && \
- defined(__SIZEOF_INT128__) || \
- (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+ /*
+ * GCC/Clang __uint128_t method.
+ *
+ * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+ * This is usually the best way as it usually uses a native long 64-bit
+ * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+ *
+ * Usually.
+ *
+ * Despite being a 32-bit platform, Clang (and emscripten) define this
+ * type despite not having the arithmetic for it. This results in a laggy
+ * compiler builtin call which calculates a full 128-bit multiply.
+ * In that case it is best to use the portable one.
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+ */
+ #if defined(__GNUC__) && !defined(__wasm__) && \
+ defined(__SIZEOF_INT128__) || \
+ (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
XXH128_hash_t r128;
@@ -3013,19 +3648,19 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
r128.high64 = (xxh_u64)(product >> 64);
return r128;
- /*
- * MSVC for x64's _umul128 method.
- *
- * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
- * *HighProduct);
- *
- * This compiles to single operand MUL on x64.
- */
- #elif defined(_M_X64) || defined(_M_IA64)
+ /*
+ * MSVC for x64's _umul128 method.
+ *
+ * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
+ * *HighProduct);
+ *
+ * This compiles to single operand MUL on x64.
+ */
+ #elif defined(_M_X64) || defined(_M_IA64)
- #ifndef _MSC_VER
- #pragma intrinsic(_umul128)
- #endif
+ #ifndef _MSC_VER
+ #pragma intrinsic(_umul128)
+ #endif
xxh_u64 product_high;
xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
XXH128_hash_t r128;
@@ -3033,7 +3668,7 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
r128.high64 = product_high;
return r128;
- #else
+ #else
/*
* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
*
@@ -3093,16 +3728,20 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
r128.low64 = lower;
r128.high64 = upper;
return r128;
- #endif
+ #endif
}
-/*
- * Does a 64-bit to 128-bit multiply, then XOR folds it.
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
*
* The reason for the separate function is to prevent passing too many structs
* around by value. This will hopefully inline the multiply, but we don't force
* it.
+ *
+ * @param lhs, rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
*/
static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
@@ -3111,7 +3750,7 @@ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
}
-/* Seems to produce slightly better code on GCC for some reason. */
+/*! Seems to produce slightly better code on GCC for some reason. */
XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
XXH_ASSERT(0 <= shift && shift < 64);
@@ -3216,7 +3855,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
XXH_ASSERT(input != NULL);
XXH_ASSERT(secret != NULL);
- XXH_ASSERT(4 <= len && len < 8);
+ XXH_ASSERT(4 <= len && len <= 8);
seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
{
@@ -3239,7 +3878,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
XXH_ASSERT(input != NULL);
XXH_ASSERT(secret != NULL);
- XXH_ASSERT(8 <= len && len <= 16);
+ XXH_ASSERT(9 <= len && len <= 16);
{
xxh_u64 const bitflip1 =
@@ -3306,11 +3945,10 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
const xxh_u8 *XXH_RESTRICT secret,
xxh_u64 seed64) {
- #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
- && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
- && \
- !defined( \
- XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
+ #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like \
+ XXH32 hack */
/*
* UGLY HACK:
* GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
@@ -3326,8 +3964,8 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
* GCC generates much better scalar code than Clang for the rest of XXH3,
* which is why finding a more optimal codepath is an interest.
*/
- __asm__("" : "+r"(seed64));
- #endif
+ XXH_COMPILER_GUARD(seed64);
+ #endif
{
xxh_u64 const input_lo = XXH_readLE64(input);
@@ -3381,7 +4019,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
}
- #define XXH3_MIDSIZE_MAX 240
+ #define XXH3_MIDSIZE_MAX 240
XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
const xxh_u8 *XXH_RESTRICT input, size_t len,
@@ -3391,8 +4029,8 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
(void)secretSize;
XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
- #define XXH3_MIDSIZE_STARTOFFSET 3
- #define XXH3_MIDSIZE_LASTOFFSET 17
+ #define XXH3_MIDSIZE_STARTOFFSET 3
+ #define XXH3_MIDSIZE_LASTOFFSET 17
{
@@ -3407,31 +4045,31 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
acc = XXH3_avalanche(acc);
XXH_ASSERT(nbRounds >= 8);
- #if defined(__clang__) /* Clang */ \
- && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
- && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
- /*
- * UGLY HACK:
- * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
- * In everywhere else, it uses scalar code.
- *
- * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
- * would still be slower than UMAAL (see XXH_mult64to128).
- *
- * Unfortunately, Clang doesn't handle the long multiplies properly and
- * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
- * scalarized into an ugly mess of VMOV.32 instructions.
- *
- * This mess is difficult to avoid without turning autovectorization
- * off completely, but they are usually relatively minor and/or not
- * worth it to fix.
- *
- * This loop is the easiest to fix, as unlike XXH32, this pragma
- * _actually works_ because it is a loop vectorization instead of an
- * SLP vectorization.
- */
- #pragma clang loop vectorize(disable)
- #endif
+ #if defined(__clang__) /* Clang */ \
+ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
+ /*
+ * UGLY HACK:
+ * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+ * In everywhere else, it uses scalar code.
+ *
+ * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+ * would still be slower than UMAAL (see XXH_mult64to128).
+ *
+ * Unfortunately, Clang doesn't handle the long multiplies properly and
+ * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+ * scalarized into an ugly mess of VMOV.32 instructions.
+ *
+ * This mess is difficult to avoid without turning autovectorization
+ * off completely, but they are usually relatively minor and/or not
+ * worth it to fix.
+ *
+ * This loop is the easiest to fix, as unlike XXH32, this pragma
+ * _actually works_ because it is a loop vectorization instead of an
+ * SLP vectorization.
+ */
+ #pragma clang loop vectorize(disable)
+ #endif
for (i = 8; i < nbRounds; i++) {
acc +=
@@ -3450,17 +4088,17 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
}
- /* ======= Long Keys ======= */
+ /* ======= Long Keys ======= */
- #define XXH_STRIPE_LEN 64
- #define XXH_SECRET_CONSUME_RATE \
- 8 /* nb of secret bytes consumed at each accumulation */
- #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+ #define XXH_STRIPE_LEN 64
+ #define XXH_SECRET_CONSUME_RATE \
+ 8 /* nb of secret bytes consumed at each accumulation */
+ #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
- #ifdef XXH_OLD_NAMES
- #define STRIPE_LEN XXH_STRIPE_LEN
- #define ACC_NB XXH_ACC_NB
- #endif
+ #ifdef XXH_OLD_NAMES
+ #define STRIPE_LEN XXH_STRIPE_LEN
+ #define ACC_NB XXH_ACC_NB
+ #endif
XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
@@ -3469,56 +4107,58 @@ XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
}
- /* Several intrinsic functions below are supposed to accept __int64 as
- * argument, as documented in
- * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
- * several environments do not define __int64 type, requiring a workaround.
- */
- #if !defined(__VMS) && \
- (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
- (__STDC_VERSION__ >= 199901L) /* C99 */))
+ /* Several intrinsic functions below are supposed to accept __int64 as
+ * argument, as documented in
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+ #if !defined(__VMS) && \
+ (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
+ (__STDC_VERSION__ >= 199901L) /* C99 */))
typedef int64_t xxh_i64;
- #else
+ #else
/* the following type must have a width of 64-bit */
typedef long long xxh_i64;
- #endif
+ #endif
- /*
- * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
- * most optimized.
- *
- * It is a hardened version of UMAC, based off of FARSH's implementation.
- *
- * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
- * implementations, and it is ridiculously fast.
- *
- * We harden it by mixing the original input to the accumulators as well as
- * the product.
- *
- * This means that in the (relatively likely) case of a multiply by zero, the
- * original input is preserved.
- *
- * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
- * cross-pollination, as otherwise the upper and lower halves would be
- * essentially independent.
- *
- * This doesn't matter on 64-bit hashes since they all get merged together in
- * the end, so we skip the extra step.
- *
- * Both XXH3_64bits and XXH3_128bits use this subroutine.
- */
+ /*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
+ * most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as
+ * the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero,
+ * the original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together
+ * in the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
- #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
+ #if (XXH_VECTOR == XXH_AVX512) || \
+ (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
- #ifndef XXH_TARGET_AVX512
- #define XXH_TARGET_AVX512 /* disable attribute target */
- #endif
+ #ifndef XXH_TARGET_AVX512
+ #define XXH_TARGET_AVX512 /* disable attribute target */
+ #endif
XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
const void *XXH_RESTRICT secret) {
- XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
+ __m512i *const xacc = (__m512i *)acc;
XXH_ASSERT((((size_t)acc) & 63) == 0);
XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
@@ -3576,8 +4216,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
{
- XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
- const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+ __m512i *const xacc = (__m512i *)acc;
+ const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
/* xacc[0] ^= (xacc[0] >> 47) */
__m512i const acc_vec = *xacc;
@@ -3609,19 +4249,21 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
__m512i const seed = _mm512_mask_set1_epi64(
- _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
+ _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
- XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
- XXH_ALIGN(64) __m512i *const dest = (__m512i *)customSecret;
- int i;
+ const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret);
+ __m512i *const dest = (__m512i *)customSecret;
+ int i;
+ XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+ XXH_ASSERT(((size_t)dest & 63) == 0);
for (i = 0; i < nbRounds; ++i) {
/* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void
- * const*', this will warn "discards ‘const’ qualifier". */
+ * const*', this will warn "discards 'const' qualifier". */
union {
- XXH_ALIGN(64) const __m512i *cp;
- XXH_ALIGN(64) void *p;
+ const __m512i *cp;
+ void * p;
} remote_const_void;
@@ -3635,13 +4277,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
}
- #endif
+ #endif
- #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
+ #if (XXH_VECTOR == XXH_AVX2) || \
+ (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
- #ifndef XXH_TARGET_AVX2
- #define XXH_TARGET_AVX2 /* disable attribute target */
- #endif
+ #ifndef XXH_TARGET_AVX2
+ #define XXH_TARGET_AVX2 /* disable attribute target */
+ #endif
XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3650,7 +4293,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
XXH_ASSERT((((size_t)acc) & 31) == 0);
{
- XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+ __m256i *const xacc = (__m256i *)acc;
/* Unaligned. This is mainly for pointer arithmetic, and because
* _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
*/
@@ -3692,7 +4335,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
XXH_ASSERT((((size_t)acc) & 31) == 0);
{
- XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
+ __m256i *const xacc = (__m256i *)acc;
/* Unaligned. This is mainly for pointer arithmetic, and because
* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
const __m256i *const xsecret = (const __m256i *)secret;
@@ -3732,24 +4375,23 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
XXH_PREFETCH(customSecret);
{
- __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
- -(xxh_i64)seed64, (xxh_i64)seed64);
+ __m256i const seed =
+ _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64,
+ (xxh_i64)(0U - seed64), (xxh_i64)seed64);
- XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
- XXH_ALIGN(64) __m256i * dest = (__m256i *)customSecret;
+ const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret);
+ __m256i * dest = (__m256i *)customSecret;
- #if defined(__GNUC__) || defined(__clang__)
+ #if defined(__GNUC__) || defined(__clang__)
/*
* On GCC & Clang, marking 'dest' as modified will cause the compiler:
* - do not extract the secret from sse registers in the internal loop
* - use less common registers, and avoid pushing these reg into stack
- * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
- * customSecret, and on aarch64, this prevented LDP from merging two
- * loads together for free. Putting the loads together before the stores
- * properly generates LDP.
*/
- __asm__("" : "+r"(dest));
- #endif
+ XXH_COMPILER_GUARD(dest);
+ #endif
+ XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+ XXH_ASSERT(((size_t)dest & 31) == 0);
/* GCC -O2 need unroll loop manually */
dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
@@ -3763,13 +4405,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
}
- #endif
+ #endif
- #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+ /* x86dispatch always generates SSE2 */
+ #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
- #ifndef XXH_TARGET_SSE2
- #define XXH_TARGET_SSE2 /* disable attribute target */
- #endif
+ #ifndef XXH_TARGET_SSE2
+ #define XXH_TARGET_SSE2 /* disable attribute target */
+ #endif
XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3779,7 +4422,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
- XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+ __m128i *const xacc = (__m128i *)acc;
/* Unaligned. This is mainly for pointer arithmetic, and because
* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
const __m128i *const xinput = (const __m128i *)input;
@@ -3820,7 +4463,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
- XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
+ __m128i *const xacc = (__m128i *)acc;
/* Unaligned. This is mainly for pointer arithmetic, and because
* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
const __m128i *const xsecret = (const __m128i *)secret;
@@ -3859,30 +4502,34 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
- #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
- // MSVC 32bit mode does not support _mm_set_epi64x before 2015
+ #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+ /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
XXH_ALIGN(16)
- const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
+ const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
__m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
- #else
- __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
- #endif
+ #else
+ __m128i const seed =
+ _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+ #endif
int i;
- XXH_ALIGN(64) const float *const src = (float const *)XXH3_kSecret;
- XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
- #if defined(__GNUC__) || defined(__clang__)
+ const void *const src16 = XXH3_kSecret;
+ __m128i * dst16 = (__m128i *)customSecret;
+ #if defined(__GNUC__) || defined(__clang__)
/*
* On GCC & Clang, marking 'dest' as modified will cause the compiler:
* - do not extract the secret from sse registers in the internal loop
* - use less common registers, and avoid pushing these reg into stack
*/
- __asm__("" : "+r"(dest));
- #endif
+ XXH_COMPILER_GUARD(dst16);
+ #endif
+ XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
+ XXH_ASSERT(((size_t)dst16 & 15) == 0);
for (i = 0; i < nbRounds; ++i) {
- dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
+ dst16[i] =
+ _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed);
}
@@ -3890,9 +4537,9 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
}
- #endif
+ #endif
- #if (XXH_VECTOR == XXH_NEON)
+ #if (XXH_VECTOR == XXH_NEON)
XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
@@ -3901,7 +4548,7 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
- XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
+ uint64x2_t *const xacc = (uint64x2_t *)acc;
/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
*/
uint8_t const *const xinput = (const uint8_t *)input;
@@ -3996,9 +4643,9 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT acc,
}
- #endif
+ #endif
- #if (XXH_VECTOR == XXH_VSX)
+ #if (XXH_VECTOR == XXH_VSX)
XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc,
const void *XXH_RESTRICT input,
@@ -4025,12 +4672,12 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc,
xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
xacc[i] += product;
- /* swap high and low halves */
- #ifdef __s390x__
+ /* swap high and low halves */
+ #ifdef __s390x__
xacc[i] += vec_permi(data_vec, data_vec, 2);
- #else
+ #else
xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
- #endif
+ #endif
}
@@ -4075,7 +4722,7 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT acc,
}
- #endif
+ #endif
/* scalar variants - universal */
@@ -4083,7 +4730,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
const void *XXH_RESTRICT secret) {
- XXH_ALIGN(XXH_ACC_ALIGN)
xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */
const xxh_u8 *const xinput =
(const xxh_u8 *)input; /* no alignment restriction */
@@ -4105,7 +4751,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT acc,
const void *XXH_RESTRICT secret) {
- XXH_ALIGN(XXH_ACC_ALIGN)
xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */
const xxh_u8 *const xsecret =
(const xxh_u8 *)secret; /* no alignment restriction */
@@ -4135,7 +4780,7 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
const xxh_u8 *kSecretPtr = XXH3_kSecret;
XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
- #if defined(__clang__) && defined(__aarch64__)
+ #if defined(__clang__) && defined(__aarch64__)
/*
* UGLY HACK:
* Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
@@ -4164,8 +4809,8 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
* without hack: 2654.4 MB/s
* with hack: 3202.9 MB/s
*/
- __asm__("" : "+r"(kSecretPtr));
- #endif
+ XXH_COMPILER_GUARD(kSecretPtr);
+ #endif
/*
* Note: in debug mode, this overrides the asm optimization
* and Clang will emit MOVK chains again.
@@ -4200,55 +4845,55 @@ typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
- #if (XXH_VECTOR == XXH_AVX512)
+ #if (XXH_VECTOR == XXH_AVX512)
- #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
- #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
- #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+ #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
- #elif (XXH_VECTOR == XXH_AVX2)
+ #elif (XXH_VECTOR == XXH_AVX2)
- #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
- #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
- #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+ #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
- #elif (XXH_VECTOR == XXH_SSE2)
+ #elif (XXH_VECTOR == XXH_SSE2)
- #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
- #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
- #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+ #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
- #elif (XXH_VECTOR == XXH_NEON)
+ #elif (XXH_VECTOR == XXH_NEON)
- #define XXH3_accumulate_512 XXH3_accumulate_512_neon
- #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
- #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
- #elif (XXH_VECTOR == XXH_VSX)
+ #elif (XXH_VECTOR == XXH_VSX)
- #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
- #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
- #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
- #else /* scalar */
+ #else /* scalar */
- #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
- #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
- #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
- #endif
+ #endif
- #ifndef XXH_PREFETCH_DIST
- #ifdef __clang__
- #define XXH_PREFETCH_DIST 320
- #else
- #if (XXH_VECTOR == XXH_AVX512)
- #define XXH_PREFETCH_DIST 512
+ #ifndef XXH_PREFETCH_DIST
+ #ifdef __clang__
+ #define XXH_PREFETCH_DIST 320
#else
- #define XXH_PREFETCH_DIST 384
- #endif
- #endif /* __clang__ */
- #endif /* XXH_PREFETCH_DIST */
+ #if (XXH_VECTOR == XXH_AVX512)
+ #define XXH_PREFETCH_DIST 512
+ #else
+ #define XXH_PREFETCH_DIST 384
+ #endif
+ #endif /* __clang__ */
+ #endif /* XXH_PREFETCH_DIST */
/*
* XXH3_accumulate()
@@ -4308,8 +4953,9 @@ XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
{
const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
- #define XXH_SECRET_LASTACC_START \
- 7 /* not aligned on 8, last secret is different from acc & scrambler */
+ #define XXH_SECRET_LASTACC_START \
+ 7 /* not aligned on 8, last secret is different from acc & scrambler \
+ */
f_acc512(acc, p,
secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
@@ -4337,10 +4983,10 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
for (i = 0; i < 4; i++) {
result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
- #if defined(__clang__) /* Clang */ \
- && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
- && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
- && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
+ #if defined(__clang__) /* Clang */ \
+ && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
+ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
/*
* UGLY HACK:
* Prevent autovectorization on Clang ARMv7-a. Exact same problem as
@@ -4349,8 +4995,8 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
* without hack: 2063.7 MB/s
* with hack: 2560.7 MB/s
*/
- __asm__("" : "+r"(result64));
- #endif
+ XXH_COMPILER_GUARD(result64);
+ #endif
}
@@ -4358,13 +5004,13 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
}
- #define XXH3_INIT_ACC \
- { \
- \
- XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
- XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
- \
- }
+ #define XXH3_INIT_ACC \
+ { \
+ \
+ XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+ XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
+ \
+ }
XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
@@ -4379,9 +5025,9 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
/* converge into final hash */
XXH_STATIC_ASSERT(sizeof(acc) == 64);
- /* do not align on 8, so that the secret is different from the accumulator
- */
- #define XXH_SECRET_MERGEACCS_START 11
+ /* do not align on 8, so that the secret is different from the accumulator
+ */
+ #define XXH_SECRET_MERGEACCS_START 11
XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
return XXH3_mergeAccs(acc,
(const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
@@ -4501,6 +5147,7 @@ XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
/* === Public entry point === */
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -4508,6 +5155,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
size_t len,
const void *secret,
@@ -4518,6 +5166,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
XXH64_hash_t seed) {
@@ -4603,6 +5252,7 @@ static void XXH_alignedFree(void *p) {
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
XXH3_state_t *const state =
@@ -4613,6 +5263,7 @@ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
XXH_alignedFree(statePtr);
@@ -4620,6 +5271,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state,
const XXH3_state_t *src_state) {
@@ -4627,9 +5279,8 @@ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state,
}
-static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
- XXH64_hash_t seed, const void *secret,
- size_t secretSize) {
+static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed,
+ const void *secret, size_t secretSize) {
size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
size_t const initLength =
@@ -4654,26 +5305,28 @@ static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
if (statePtr == NULL) return XXH_ERROR;
- XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
- XXH_SECRET_DEFAULT_SIZE);
+ XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
return XXH_OK;
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
if (statePtr == NULL) return XXH_ERROR;
- XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
+ XXH3_reset_internal(statePtr, 0, secret, secretSize);
if (secret == NULL) return XXH_ERROR;
if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
return XXH_OK;
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
XXH64_hash_t seed) {
@@ -4681,7 +5334,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
if (seed == 0) return XXH3_64bits_reset(statePtr);
if (seed != statePtr->seed)
XXH3_initCustomSecret(statePtr->customSecret, seed);
- XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+ XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
return XXH_OK;
}
@@ -4733,12 +5386,12 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
XXH3_f_scrambleAcc f_scramble) {
if (input == NULL)
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
- (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
+ #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
+ (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
return XXH_OK;
- #else
+ #else
return XXH_ERROR;
- #endif
+ #endif
{
@@ -4747,6 +5400,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
(state->extSecret == NULL) ? state->customSecret : state->extSecret;
state->totalLen += len;
+ XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
if (state->bufferedSize + len <=
XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
@@ -4756,10 +5410,10 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
}
- /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+ /* total input is now > XXH3_INTERNALBUFFER_SIZE */
- #define XXH3_INTERNALBUFFER_STRIPES \
- (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+ #define XXH3_INTERNALBUFFER_STRIPES \
+ (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
0); /* clean multiple */
@@ -4783,7 +5437,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
XXH_ASSERT(input < bEnd);
/* Consume input by a multiple of internal buffer size */
- if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) {
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
do {
@@ -4814,6 +5468,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
const void *input, size_t len) {
@@ -4859,6 +5514,7 @@ XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t * acc,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
const unsigned char *const secret =
@@ -4881,8 +5537,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
}
- #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+ #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API void XXH3_generateSecret(void * secretBuffer,
const void *customSeed,
size_t customSeedSize) {
@@ -5398,6 +6055,7 @@ XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
/* === Public XXH128 API === */
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
@@ -5406,6 +6064,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
size_t len,
const void *secret,
@@ -5416,6 +6075,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
size_t len,
XXH64_hash_t seed) {
@@ -5426,6 +6086,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
XXH64_hash_t seed) {
@@ -5437,37 +6098,31 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
/*
* All the functions are actually the same as for 64-bit streaming variant.
- * The only difference is the finalizatiom routine.
+ * The only difference is the finalization routine.
*/
-static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
- XXH64_hash_t seed, const void *secret,
- size_t secretSize) {
-
- XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
-
-}
-
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
if (statePtr == NULL) return XXH_ERROR;
- XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
- XXH_SECRET_DEFAULT_SIZE);
+ XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
return XXH_OK;
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
if (statePtr == NULL) return XXH_ERROR;
- XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
+ XXH3_reset_internal(statePtr, 0, secret, secretSize);
if (secret == NULL) return XXH_ERROR;
if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
return XXH_OK;
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
XXH64_hash_t seed) {
@@ -5475,11 +6130,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
if (seed == 0) return XXH3_128bits_reset(statePtr);
if (seed != statePtr->seed)
XXH3_initCustomSecret(statePtr->customSecret, seed);
- XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+ XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
return XXH_OK;
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
const void * input,
size_t len) {
@@ -5489,6 +6145,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
const unsigned char *const secret =
@@ -5524,11 +6181,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
}
- /* 128-bit utility functions */
+ /* 128-bit utility functions */
- #include <string.h> /* memcmp, memcpy */
+ #include <string.h> /* memcmp, memcpy */
/* return : 1 is equal, 0 if different */
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
/* note : XXH128_hash_t is compact, it has no padding byte */
@@ -5540,6 +6198,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
* return : >0 if *h128_1 > *h128_2
* <0 if *h128_1 < *h128_2
* =0 if *h128_1 == *h128_2 */
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
@@ -5552,6 +6211,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
}
/*====== Canonical representation ======*/
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
XXH128_hash_t hash) {
@@ -5568,6 +6228,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
}
+/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
@@ -5578,16 +6239,21 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
}
- /* Pop our optimization override from above */
- #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
- && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
- && defined(__OPTIMIZE__) && \
- !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
- #pragma GCC pop_options
- #endif
+ /* Pop our optimization override from above */
+ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__OPTIMIZE__) && \
+ !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
+ #pragma GCC pop_options
+ #endif
- #endif /* XXH_NO_LONG_LONG */
+ #endif /* XXH_NO_LONG_LONG */
+
+ #endif /* XXH_NO_XXH3 */
+/*!
+ * @}
+ */
#endif /* XXH_IMPLEMENTATION */
#if defined(__cplusplus)