diff options
author | van Hauser <vh@thc.org> | 2021-11-07 14:09:09 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-07 14:09:09 +0100 |
commit | fb443eaf2372ccd1825699c978fd0d662155fb9e (patch) | |
tree | ff019fc0b0704c16d68655d0f3864ec4cda49d30 | |
parent | 5b06413a5f109f310a62e36111a18d7325b246c3 (diff) | |
parent | 2ddbaa439ca78b0ae8cc6691d9657f5783b2d5e8 (diff) | |
download | afl++-fb443eaf2372ccd1825699c978fd0d662155fb9e.tar.gz |
Merge pull request #1141 from AFLplusplus/afl4
cmplog enhancement variant
-rw-r--r-- | GNUmakefile | 2 | ||||
-rw-r--r-- | docs/Changelog.md | 2 | ||||
-rw-r--r-- | frida_mode/src/cmplog/cmplog_arm64.c | 51 | ||||
-rw-r--r-- | frida_mode/src/cmplog/cmplog_x64.c | 50 | ||||
-rw-r--r-- | frida_mode/src/cmplog/cmplog_x86.c | 51 | ||||
-rw-r--r-- | include/afl-fuzz.h | 1 | ||||
-rw-r--r-- | include/cmplog.h | 13 | ||||
-rw-r--r-- | include/config.h | 4 | ||||
-rw-r--r-- | include/types.h | 3 | ||||
-rw-r--r-- | include/xxhash.h | 3084 | ||||
-rw-r--r-- | instrumentation/afl-compiler-rt.o.c | 205 | ||||
-rw-r--r-- | instrumentation/cmplog-routines-pass.cc | 244 | ||||
-rw-r--r-- | qemu_mode/QEMUAFL_VERSION | 2 | ||||
m--------- | qemu_mode/qemuafl | 0 | ||||
-rw-r--r-- | src/afl-forkserver.c | 24 | ||||
-rw-r--r-- | src/afl-fuzz-one.c | 25 | ||||
-rw-r--r-- | src/afl-fuzz-queue.c | 91 | ||||
-rw-r--r-- | src/afl-fuzz-redqueen.c | 363 | ||||
-rw-r--r-- | src/afl-fuzz-stats.c | 3 | ||||
-rw-r--r-- | src/afl-fuzz.c | 19 | ||||
-rw-r--r-- | src/afl-performance.c | 6 | ||||
-rw-r--r-- | test/test-cmplog.c | 21 |
22 files changed, 2844 insertions, 1420 deletions
diff --git a/GNUmakefile b/GNUmakefile index ad2642f3..06840786 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -32,7 +32,7 @@ VERSION = $(shell grep '^$(HASH)define VERSION ' ../config.h | cut -d '"' -f # PROGS intentionally omit afl-as, which gets installed elsewhere. PROGS = afl-fuzz afl-showmap afl-tmin afl-gotcpu afl-analyze -SH_PROGS = afl-plot afl-cmin afl-cmin.bash afl-whatsup afl-system-config afl-persistent-config +SH_PROGS = afl-plot afl-cmin afl-cmin.bash afl-whatsup afl-system-config afl-persistent-config afl-cc MANPAGES=$(foreach p, $(PROGS) $(SH_PROGS), $(p).8) afl-as.8 ASAN_OPTIONS=detect_leaks=0 diff --git a/docs/Changelog.md b/docs/Changelog.md index 7c77a6bf..2c72b5f2 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -18,6 +18,8 @@ sending a mail to <afl-users+subscribe@googlegroups.com>. - fix -n dumb mode (nobody should use this) - fix stability issue with LTO and cmplog - better banner + - more effective cmplog mode + - more often update the UI when in input2stage mode - frida_mode: David Carlier added Android support :) - afl-showmap, afl-tmin and afl-analyze: - honor persistent mode for more speed. thanks to dloffre-snl for diff --git a/frida_mode/src/cmplog/cmplog_arm64.c b/frida_mode/src/cmplog/cmplog_arm64.c index dd97f38d..ccc8e89e 100644 --- a/frida_mode/src/cmplog/cmplog_arm64.c +++ b/frida_mode/src/cmplog/cmplog_arm64.c @@ -104,9 +104,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { gsize x0 = ctx_read_reg(context, ARM64_REG_X0); gsize x1 = ctx_read_reg(context, ARM64_REG_X1); - if (((G_MAXULONG - x0) < 32) || ((G_MAXULONG - x1) < 32)) return; + if (((G_MAXULONG - x0) < 31) || ((G_MAXULONG - x1) < 31)) return; - if (!cmplog_is_readable(x0, 32) || !cmplog_is_readable(x1, 32)) return; + if (!cmplog_is_readable(x0, 31) || !cmplog_is_readable(x1, 31)) return; void *ptr1 = GSIZE_TO_POINTER(x0); void *ptr2 = GSIZE_TO_POINTER(x1); @@ -116,18 +116,36 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 1; - __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 0; + + } + + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { + + __afl_cmp_map->headers[k].shape = 30; + + } else { + + hits = __afl_cmp_map->headers[k].hits; + + } - u32 hits = __afl_cmp_map->headers[k].hits; __afl_cmp_map->headers[k].hits = hits + 1; - __afl_cmp_map->headers[k].shape = 31; + __afl_cmp_map->headers[k].shape = 30; hits &= CMP_MAP_RTN_H - 1; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31; gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1, - 32); + 31); gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2, - 32); + 31); } @@ -193,12 +211,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1, k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 1; - __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS) + __afl_cmp_map->headers[k].hits = 0; - u32 hits = __afl_cmp_map->headers[k].hits; - __afl_cmp_map->headers[k].hits = hits + 1; + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + __afl_cmp_map->headers[k].shape = (size - 1); + + } else { - __afl_cmp_map->headers[k].shape = (size - 1); + hits = __afl_cmp_map->headers[k].hits; + + } + + __afl_cmp_map->headers[k].hits = hits + 1; hits &= CMP_MAP_H - 1; __afl_cmp_map->log[k][hits].v0 = operand1; diff --git a/frida_mode/src/cmplog/cmplog_x64.c b/frida_mode/src/cmplog/cmplog_x64.c index 0d18767a..5319f727 100644 --- a/frida_mode/src/cmplog/cmplog_x64.c +++ b/frida_mode/src/cmplog/cmplog_x64.c @@ -99,9 +99,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { gsize rdi = ctx_read_reg(context, X86_REG_RDI); gsize rsi = ctx_read_reg(context, X86_REG_RSI); - if (((G_MAXULONG - rdi) < 32) || ((G_MAXULONG - rsi) < 32)) return; + if (((G_MAXULONG - rdi) < 31) || ((G_MAXULONG - rsi) < 31)) return; - if (!cmplog_is_readable(rdi, 32) || !cmplog_is_readable(rsi, 32)) return; + if (!cmplog_is_readable(rdi, 31) || !cmplog_is_readable(rsi, 31)) return; void *ptr1 = GSIZE_TO_POINTER(rdi); void *ptr2 = GSIZE_TO_POINTER(rsi); @@ -111,18 +111,34 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 1; - __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { - u32 hits = __afl_cmp_map->headers[k].hits; - __afl_cmp_map->headers[k].hits = hits + 1; + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 0; + + } + + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { + + __afl_cmp_map->headers[k].shape = 30; + + } else { + + hits = __afl_cmp_map->headers[k].hits; + + } - __afl_cmp_map->headers[k].shape = 31; + __afl_cmp_map->headers[k].hits = hits + 1; hits &= CMP_MAP_RTN_H - 1; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31; gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1, - 32); + 31); gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2, - 32); + 31); } @@ -179,13 +195,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1, k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 7; - __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS) + __afl_cmp_map->headers[k].hits = 0; - u32 hits = __afl_cmp_map->headers[k].hits; - __afl_cmp_map->headers[k].hits = hits + 1; + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + __afl_cmp_map->headers[k].shape = (size - 1); - __afl_cmp_map->headers[k].shape = (size - 1); + } else { + hits = __afl_cmp_map->headers[k].hits; + + } + + __afl_cmp_map->headers[k].hits = hits + 1; hits &= CMP_MAP_H - 1; __afl_cmp_map->log[k][hits].v0 = operand1; __afl_cmp_map->log[k][hits].v1 = operand2; diff --git a/frida_mode/src/cmplog/cmplog_x86.c b/frida_mode/src/cmplog/cmplog_x86.c index dd666c34..27d06720 100644 --- a/frida_mode/src/cmplog/cmplog_x86.c +++ b/frida_mode/src/cmplog/cmplog_x86.c @@ -104,9 +104,9 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { gsize arg1 = esp[0]; gsize arg2 = esp[1]; - if (((G_MAXULONG - arg1) < 32) || ((G_MAXULONG - arg2) < 32)) return; + if (((G_MAXULONG - arg1) < 31) || ((G_MAXULONG - arg2) < 31)) return; - if (!cmplog_is_readable(arg1, 32) || !cmplog_is_readable(arg2, 32)) return; + if (!cmplog_is_readable(arg1, 31) || !cmplog_is_readable(arg2, 31)) return; void *ptr1 = GSIZE_TO_POINTER(arg1); void *ptr2 = GSIZE_TO_POINTER(arg2); @@ -116,18 +116,34 @@ static void cmplog_call_callout(GumCpuContext *context, gpointer user_data) { k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 1; - __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { - u32 hits = __afl_cmp_map->headers[k].hits; - __afl_cmp_map->headers[k].hits = hits + 1; + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 0; + + } + + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { - __afl_cmp_map->headers[k].shape = 31; + __afl_cmp_map->headers[k].shape = 30; + + } else { + + hits = __afl_cmp_map->headers[k].hits; + + } + + __afl_cmp_map->headers[k].hits = hits + 1; hits &= CMP_MAP_RTN_H - 1; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0_len = 31; + ((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1_len = 31; gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, ptr1, - 32); + 31); gum_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, ptr2, - 32); + 31); } @@ -184,12 +200,23 @@ static void cmplog_handle_cmp_sub(GumCpuContext *context, gsize operand1, k = (k >> 4) ^ (k << 8); k &= CMP_MAP_W - 1; - __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + if (__afl_cmp_map->headers[k].type != CMP_TYPE_INS) + __afl_cmp_map->headers[k].hits = 0; - u32 hits = __afl_cmp_map->headers[k].hits; - __afl_cmp_map->headers[k].hits = hits + 1; + u32 hits = 0; + + if (__afl_cmp_map->headers[k].hits == 0) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_INS; + __afl_cmp_map->headers[k].shape = (size - 1); + + } else { - __afl_cmp_map->headers[k].shape = (size - 1); + hits = __afl_cmp_map->headers[k].hits; + + } + + __afl_cmp_map->headers[k].hits = hits + 1; hits &= CMP_MAP_H - 1; __afl_cmp_map->log[k][hits].v0 = operand1; diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h index e73ea1a4..f3d6d99d 100644 --- a/include/afl-fuzz.h +++ b/include/afl-fuzz.h @@ -1135,6 +1135,7 @@ void setup_signal_handlers(void); void save_cmdline(afl_state_t *, u32, char **); void read_foreign_testcases(afl_state_t *, int); void write_crash_readme(afl_state_t *afl); +u8 check_if_text_buf(u8 *buf, u32 len); /* CmpLog */ diff --git a/include/cmplog.h b/include/cmplog.h index 1c15d2b8..8778a4b6 100644 --- a/include/cmplog.h +++ b/include/cmplog.h @@ -48,7 +48,8 @@ struct cmp_header { unsigned shape : 5; unsigned type : 2; unsigned attribute : 4; - unsigned reserved : 5; + unsigned overflow : 1; + unsigned reserved : 4; } __attribute__((packed)); @@ -59,14 +60,16 @@ struct cmp_operands { u64 v0_128; u64 v1_128; -}; +} __attribute__((packed)); struct cmpfn_operands { - u8 v0[32]; - u8 v1[32]; + u8 v0[31]; + u8 v0_len; + u8 v1[31]; + u8 v1_len; -}; +} __attribute__((packed)); typedef struct cmp_operands cmp_map_list[CMP_MAP_H]; diff --git a/include/config.h b/include/config.h index 3aee9b00..b787152f 100644 --- a/include/config.h +++ b/include/config.h @@ -267,8 +267,8 @@ (first value), and to keep in memory as candidates. The latter should be much higher than the former. */ -#define USE_AUTO_EXTRAS 128 -#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 64) +#define USE_AUTO_EXTRAS 4096 +#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8) /* Scaling factor for the effector map used to skip some of the more expensive deterministic steps. The actual divisor is set to diff --git a/include/types.h b/include/types.h index e945f0f5..bbcc2f81 100644 --- a/include/types.h +++ b/include/types.h @@ -46,6 +46,8 @@ typedef uint128_t u128; #define FS_ERROR_SHM_OPEN 4 #define FS_ERROR_SHMAT 8 #define FS_ERROR_MMAP 16 +#define FS_ERROR_OLD_CMPLOG 32 +#define FS_ERROR_OLD_CMPLOG_QEMU 64 /* Reporting options */ #define FS_OPT_ENABLED 0x80000001 @@ -53,6 +55,7 @@ typedef uint128_t u128; #define FS_OPT_SNAPSHOT 0x20000000 #define FS_OPT_AUTODICT 0x10000000 #define FS_OPT_SHDMEM_FUZZ 0x01000000 +#define FS_OPT_NEWCMPLOG 0x02000000 #define FS_OPT_OLD_AFLPP_WORKAROUND 0x0f000000 // FS_OPT_MAX_MAPSIZE is 8388608 = 0x800000 = 2^23 = 1 << 22 #define FS_OPT_MAX_MAPSIZE ((0x00fffffeU >> 1) + 1) diff --git a/include/xxhash.h b/include/xxhash.h index 006d3f3d..0ca2b852 100644 --- a/include/xxhash.h +++ b/include/xxhash.h @@ -32,7 +32,12 @@ * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ - +/*! + * @mainpage xxHash + * + * @file xxhash.h + * xxHash prototypes and implementation + */ /* TODO: update */ /* Notice extracted from xxHash homepage: @@ -45,7 +50,7 @@ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo Name Speed Q.Score Author xxHash 5.4 GB/s 10 CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby +MurmurHash 3a 2.7 GB/s 10 Austin Appleby SpookyHash 2.0 GB/s 10 Bob Jenkins SBox 1.4 GB/s 9 Bret Mulvey Lookup3 1.2 GB/s 9 Bob Jenkins @@ -119,29 +124,78 @@ extern "C" { /* * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such - * as part of some previously included *.h header file. + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. * Without further action, the new include would just be ignored, * and functions would effectively _not_ be inlined (silent failure). * The following macros solve this situation by prefixing all inlined names, * avoiding naming collision with previous inclusions. */ - #ifdef XXH_NAMESPACE - #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" - /* - * Note: Alternative: #undef all symbols (it's a pretty large list). - * Without #error: it compiles, but functions are actually not inlined. - */ - #endif +/* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ + #undef XXH_versionNumber +/* XXH32 */ + #undef XXH32 + #undef XXH32_createState + #undef XXH32_freeState + #undef XXH32_reset + #undef XXH32_update + #undef XXH32_digest + #undef XXH32_copyState + #undef XXH32_canonicalFromHash + #undef XXH32_hashFromCanonical +/* XXH64 */ + #undef XXH64 + #undef XXH64_createState + #undef XXH64_freeState + #undef XXH64_reset + #undef XXH64_update + #undef XXH64_digest + #undef XXH64_copyState + #undef XXH64_canonicalFromHash + #undef XXH64_hashFromCanonical +/* XXH3_64bits */ + #undef XXH3_64bits + #undef XXH3_64bits_withSecret + #undef XXH3_64bits_withSeed + #undef XXH3_createState + #undef XXH3_freeState + #undef XXH3_copyState + #undef XXH3_64bits_reset + #undef XXH3_64bits_reset_withSeed + #undef XXH3_64bits_reset_withSecret + #undef XXH3_64bits_update + #undef XXH3_64bits_digest + #undef XXH3_generateSecret +/* XXH3_128bits */ + #undef XXH128 + #undef XXH3_128bits + #undef XXH3_128bits_withSeed + #undef XXH3_128bits_withSecret + #undef XXH3_128bits_reset + #undef XXH3_128bits_reset_withSeed + #undef XXH3_128bits_reset_withSecret + #undef XXH3_128bits_update + #undef XXH3_128bits_digest + #undef XXH128_isEqual + #undef XXH128_cmp + #undef XXH128_canonicalFromHash + #undef XXH128_hashFromCanonical +/* Finally, free the namespace itself */ + #undef XXH_NAMESPACE + +/* employ the namespace for XXH_INLINE_ALL */ #define XXH_NAMESPACE XXH_INLINE_ /* - * Some identifiers (enums, type names) are not symbols, but they must - * still be renamed to avoid redeclaration. + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and is a more dispersed action. - * Meanwhile, renaming can be achieved in a single block + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. */ - #define XXH_IPREF(Id) XXH_INLINE_##Id + #define XXH_IPREF(Id) XXH_NAMESPACE##Id #define XXH_OK XXH_IPREF(XXH_OK) #define XXH_ERROR XXH_IPREF(XXH_ERROR) #define XXH_errorcode XXH_IPREF(XXH_errorcode) @@ -166,6 +220,12 @@ extern "C" { #ifndef XXHASH_H_5627135585666179 #define XXHASH_H_5627135585666179 1 + /*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + + */ /* specific declaration modes for Windows */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) #if defined(WIN32) && defined(_MSC_VER) && \ @@ -180,19 +240,24 @@ extern "C" { #endif #endif - /*! - * XXH_NAMESPACE, aka Namespace Emulation: - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries - * which may also include xxHash, you can use XXH_NAMESPACE to automatically - * prefix any public symbol from xxhash library with the value of - * XXH_NAMESPACE (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. - */ + #ifdef XXH_DOXYGEN + /*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries + * which may also include xxHash, you can use XXH_NAMESPACE to automatically + * prefix any public symbol from xxhash library with the value of + * XXH_NAMESPACE (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically + * translated by this header. + */ + #define XXH_NAMESPACE /* YOUR NAME HERE */ + #undef XXH_NAMESPACE + #endif + #ifdef XXH_NAMESPACE #define XXH_CAT(A, B) A##B #define XXH_NAME2(A, B) XXH_CAT(A, B) @@ -264,10 +329,19 @@ extern "C" { ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 - #define XXH_VERSION_RELEASE 0 + #define XXH_VERSION_RELEASE 1 #define XXH_VERSION_NUMBER \ (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \ XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is only useful when xxHash is compiled as a shared library, as it is + * independent of the version defined in the header. + * + * @return `XXH_VERSION_NUMBER` as of when the libray was compiled. + */ XXH_PUBLIC_API unsigned XXH_versionNumber(void); /* **************************** @@ -279,15 +353,24 @@ typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; /*-********************************************************************** * 32-bit hash ************************************************************************/ - #if !defined(__VMS) && \ + #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + + #elif !defined(__VMS) && \ (defined(__cplusplus) || \ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) #include <stdint.h> -typedef uint32_t XXH32_hash_t; +typedef uint32_t XXH32_hash_t; + #else #include <limits.h> #if UINT_MAX == 0xFFFFFFFFUL -typedef unsigned int XXH32_hash_t; +typedef unsigned int XXH32_hash_t; #else #if ULONG_MAX == 0xFFFFFFFFUL typedef unsigned long XXH32_hash_t; @@ -298,24 +381,52 @@ typedef unsigned long XXH32_hash_t; #endif /*! - * XXH32(): - * Calculate the 32-bit hash of sequence "length" bytes stored at memory - * address "input". The memory between input & input+length must be valid - * (allocated and read-accessible). "seed" can be used to alter the result - * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher - * benchmark): 5.4 GB/s - * - * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. It provides a superior level of - * dispersion, and greatly reduces the risks of collisions. + * @} + * + * @defgroup xxh32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is considered rather weak by today's standards. + * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit + * systems, and offers true 64/128 bit hash results. It provides a superior + * level of dispersion, and greatly reduces the risks of collisions. + * + * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families + * @see @ref xxh32_impl for implementation details + * @{ + + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. */ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length, XXH32_hash_t seed); -/******* Streaming *******/ - -/* - * Streaming functions generate the xxHash value from an incrememtal input. +/*! + * Streaming functions generate the xxHash value from an incremental input. * This method is slower than single-call functions, due to state management. * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. * @@ -336,19 +447,125 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length, * digest, and generate new hash values later on by invoking `XXH*_digest()`. * * When done, release the state using `XXH*_freeState()`. + * + * Example code for incrementally hashing a file: + * @code{.c} + * #include <stdio.h> + * #include <xxhash.h> + * #define BUFFER_SIZE 256 + * + * // Note: XXH64 and XXH3 use the same interface. + * XXH32_hash_t + * hashFile(FILE* stream) + * { + + * XXH32_state_t* state; + * unsigned char buf[BUFFER_SIZE]; + * size_t amt; + * XXH32_hash_t hash; + * + * state = XXH32_createState(); // Create a state + * assert(state != NULL); // Error check here + * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed + * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) { + + * XXH32_update(state, buf, amt); // Hash the file in chunks + * } + * hash = XXH32_digest(state); // Finalize the hash + * XXH32_freeState(state); // Clean up + * return hash; + * } + * @endcode + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. */ +typedef struct XXH32_state_s XXH32_state_t; -typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. + */ XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr); -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state, - const XXH32_state_t *src_state); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref + * XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state, + const XXH32_state_t *src_state); +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr, const void *input, size_t length); -XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); /******* Canonical representation *******/ @@ -373,48 +590,158 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); * canonical format. */ +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ typedef struct { - unsigned char digest[4]; + unsigned char digest[4]; /*!< Hash bytes, big endian */ } XXH32_canonical_t; +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t *src); + #ifdef __has_attribute + #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) + #else + #define XXH_HAS_ATTRIBUTE(x) 0 + #endif + + /* C-language Attributes are added in C23. */ + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \ + defined(__has_c_attribute) + #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) + #else + #define XXH_HAS_C_ATTRIBUTE(x) 0 + #endif + + #if defined(__cplusplus) && defined(__has_cpp_attribute) + #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) + #else + #define XXH_HAS_CPP_ATTRIBUTE(x) 0 + #endif + + /* + Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' + attribute introduced in CPP17 and C23. CPP17 : + https://en.cppreference.com/w/cpp/language/attributes/fallthrough C23 : + https://en.cppreference.com/w/c/language/attributes/fallthrough + */ + #if XXH_HAS_C_ATTRIBUTE(x) + #define XXH_FALLTHROUGH [[fallthrough]] + #elif XXH_HAS_CPP_ATTRIBUTE(x) + #define XXH_FALLTHROUGH [[fallthrough]] + #elif XXH_HAS_ATTRIBUTE(__fallthrough__) + #define XXH_FALLTHROUGH __attribute__((fallthrough)) + #else + #define XXH_FALLTHROUGH + #endif + +/*! + * @} + * @ingroup public + * @{ + + */ + #ifndef XXH_NO_LONG_LONG /*-********************************************************************** * 64-bit hash ************************************************************************/ - #if !defined(__VMS) && \ + #if defined(XXH_DOXYGEN) /* don't include <stdint.h> */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; + #elif !defined(__VMS) && \ (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ (__STDC_VERSION__ >= 199901L) /* C99 */)) #include <stdint.h> typedef uint64_t XXH64_hash_t; #else + #include <limits.h> + #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL +/* LP64 ABI says uint64_t is unsigned long */ +typedef unsigned long XXH64_hash_t; + #else /* the following type must have a width of 64-bit */ typedef unsigned long long XXH64_hash_t; + #endif #endif /*! - * XXH64(): - * Returns the 64-bit hash of sequence of length @length stored at memory - * address @input. - * @seed can be used to alter the result predictably. + * @} + * + * @defgroup xxh64_family XXH64 family + * @ingroup public + * @{ + + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. * * This function usually runs faster on 64-bit systems, but slower on 32-bit * systems (see benchmark). * - * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. It provides a superior level of - * dispersion, and greatly reduces the risks of collisions. + * @param input The block of data to be hashed, at least @p length bytes in + * size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. */ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length, XXH64_hash_t seed); /******* Streaming *******/ +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void); XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr); @@ -439,12 +766,15 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t *src); -/*-********************************************************************** - * XXH3 64-bit variant - ************************************************************************/ +/*! + * @} + * ************************************************************************ + * @defgroup xxh3_family XXH3 family + * @ingroup public + * @{ -/* ************************************************************************ - * XXH3 is a new hash algorithm featuring: + * + * XXH3 is a more recent hash algorithm featuring: * - Improved speed for both small and large inputs * - True 64-bit and 128-bit outputs * - SIMD acceleration @@ -454,41 +784,38 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src); * * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html * - * In general, expect XXH3 to run about ~2x faster on large inputs and >3x - * faster on small ones compared to XXH64, though exact differences depend on - * the platform. + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. * - * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash - * on all platforms. - * - * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. - * - * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run - * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are - * explained in the implementation. + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Any 32-bit and 64-bit targets that can run XXH32 smoothly + * can run XXH3 at competitive speeds, even without vector support. + * Further details are explained in the implementation. * * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, - * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generage exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. * * XXH3 offers 2 variants, _64bits and _128bits. - * When only 64 bits are needed, prefer calling the _64bits variant, as it - * reduces the amount of mixing, resulting in faster speed on small inputs. * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. * It's also generally simpler to manipulate a scalar return type than a struct. * - * The 128-bit version adds additional strength, but it is slightly slower. - * - * Return values of XXH3 and XXH128 are officially finalized starting - * with v0.8.0 and will no longer change in future versions. - * Avoid storing values from before that release in long-term storage. - * - * Results produced by v0.7.x are not comparable with results from v0.7.y. - * However, the API is completely stable, and it can safely be used for - * ephemeral data (local sessions). - * * The API supports one-shot hashing, streaming mode, and custom secrets. */ +/*-********************************************************************** + * XXH3 64-bit variant + ************************************************************************/ + /* XXH3_64bits(): * default 64-bit variant, using default secret and default seed of 0. * It's the fastest variant. */ @@ -504,20 +831,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len); XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len, XXH64_hash_t seed); - /* - * XXH3_64bits_withSecret(): - * It's possible to provide any blob of bytes as a "secret" to generate the - * hash. This makes it more difficult for an external actor to prepare an - * intentional collision. The main condition is that secretSize *must* be - * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced - * hash values depends on secret's entropy. Technically, the secret must - * look like a bunch of random bytes. Avoid "trivial" or structured data - * such as repeated sequences or a text document. Whenever unsure about the - * "randomness" of the blob of bytes, consider relabelling it as a "custom - * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate - * a high entropy secret derived from the custom seed. + /*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). */ #define XXH3_SECRET_SIZE_MIN 136 + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the + * hash. This makes it more difficult for an external actor to prepare an + * intentional collision. The main condition is that secretSize *must* be large + * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash + * values depends on secret's entropy. Technically, the secret must look like a + * bunch of random bytes. Avoid "trivial" or structured data such as repeated + * sequences or a text document. Whenever unsure about the "randomness" of the + * blob of bytes, consider relabelling it as a "custom seed" instead, and employ + * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived + * from the custom seed. + */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len, const void *secret, size_t secretSize); @@ -529,6 +864,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len, * As a consequence, streaming is slower than one-shot hashing. * For better performance, prefer one-shot functions whenever applicable. */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ typedef struct XXH3_state_s XXH3_state_t; XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void); XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr); @@ -572,10 +913,16 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *statePtr); * XXH3 128-bit variant ************************************************************************/ +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ typedef struct { - XXH64_hash_t low64; - XXH64_hash_t high64; + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ } XXH128_hash_t; @@ -649,6 +996,9 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); #endif /* XXH_NO_LONG_LONG */ +/*! + * @} + */ #endif /* XXHASH_H_5627135585666179 */ #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) @@ -660,7 +1010,7 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); * These declarations should only be used with static linking. * Never use them in association with dynamic linking! ***************************************************************************** - */ +*/ /* * These definitions are only present to allow static allocation @@ -668,41 +1018,72 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src); * Never **ever** access their members directly. */ +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ struct XXH32_state_s { - XXH32_hash_t total_len_32; - XXH32_hash_t large_len; - XXH32_hash_t v1; - XXH32_hash_t v2; - XXH32_hash_t v3; - XXH32_hash_t v4; - XXH32_hash_t mem32[4]; - XXH32_hash_t memsize; - XXH32_hash_t - reserved; /* never read nor write, might be removed in a future version */ + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref + total_len_32 overflow) */ + XXH32_hash_t v1; /*!< First accumulator lane */ + XXH32_hash_t v2; /*!< Second accumulator lane */ + XXH32_hash_t v3; /*!< Third accumulator lane */ + XXH32_hash_t v4; /*!< Fourth accumulator lane */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as + unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may + be removed. */ }; /* typedef'd to XXH32_state_t */ #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ struct XXH64_state_s { - XXH64_hash_t total_len; - XXH64_hash_t v1; - XXH64_hash_t v2; - XXH64_hash_t v3; - XXH64_hash_t v4; - XXH64_hash_t mem64[4]; - XXH32_hash_t memsize; - XXH32_hash_t reserved32; /* required for padding anyway */ - XXH64_hash_t reserved64; /* never read nor write, might be removed in a future - version */ + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v1; /*!< First accumulator lane */ + XXH64_hash_t v2; /*!< Second accumulator lane */ + XXH64_hash_t v3; /*!< Third accumulator lane */ + XXH64_hash_t v4; /*!< Fourth accumulator lane */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as + unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it + may be removed. */ }; /* typedef'd to XXH64_state_t */ - #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \ + */ #include <stdalign.h> #define XXH_ALIGN(n) alignas(n) + #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ + /* In C++ alignas() is a keyword */ + #define XXH_ALIGN(n) alignas(n) #elif defined(__GNUC__) #define XXH_ALIGN(n) __attribute__((aligned(n))) #elif defined(_MSC_VER) @@ -713,39 +1094,94 @@ struct XXH64_state_s { /* Old GCC versions only accept the attribute after the type in structures. */ - #if !(defined(__STDC_VERSION__) && \ - (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + #if !(defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ && defined(__GNUC__) #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) #else #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type #endif + /*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ #define XXH3_INTERNALBUFFER_SIZE 256 + + /*! + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ #define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Do never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ struct XXH3_state_s { XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); - /* used to store a custom secret generated from a seed */ + /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref + * XXH64_state_s */ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); - XXH32_hash_t bufferedSize; - XXH32_hash_t reserved32; - size_t nbStripesSoFar; - XXH64_hash_t totalLen; - size_t nbStripesPerBlock; - size_t secretLimit; - XXH64_hash_t seed; - XXH64_hash_t reserved64; - const unsigned char *extSecret; /* reference to external secret; - * if == NULL, use .customSecret instead */ + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t reserved32; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number or stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see + * XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char *extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ /* note: there may be some padding at the end due to alignment on 64 bytes */ }; /* typedef'd to XXH3_state_t */ #undef XXH_ALIGN_MEMBER - /* When the XXH3_state_t structure is merely emplaced on stack, + /*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, * it should be initialized with XXH3_INITSTATE() or a memset() * in case its first reset uses XXH3_NNbits_reset_withSeed(). * This init can be omitted if the first reset uses default or _withSecret @@ -802,7 +1238,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, XXH64_hash_t seed); #endif /* XXH_NO_LONG_LONG */ - #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) #define XXH_IMPLEMENTATION #endif @@ -844,81 +1279,183 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, /* ************************************* * Tuning parameters ***************************************/ + /*! - * XXH_FORCE_MEMORY_ACCESS: - * By default, access to unaligned memory is controlled by `memcpy()`, which - * is safe and portable. - * - * Unfortunately, on some target/compiler combinations, the generated assembly - * is sub-optimal. + * @defgroup tuning Tuning parameters + * @{ + * - * The below switch allow selection of a different access method - * in the search for improved performance. - * Method 0 (default): - * Use `memcpy()`. Safe and portable. Default. - * Method 1: - * `__attribute__((packed))` statement. It depends on compiler extensions - * and is therefore not portable. - * This method is safe if your compiler supports it, and *generally* as - * fast or faster than `memcpy`. - * Method 2: - * Direct access via cast. This method doesn't depend on the compiler but - * violates the C standard. - * It can generate buggy code on targets which do not support unaligned - * memory accesses. - * But in some circumstances, it's the only known way to get the most - * performance (example: GCC + ARMv6) - * Method 3: - * Byteshift. This can generate the best code on old compilers which don't - * inline small `memcpy()` calls, and it might also be faster on - * big-endian systems which lack a native byteswap instruction. See - * https://stackoverflow.com/a/32095106/646947 for details. Prefer these - * methods in priority order (0 > 1 > 2 > 3) + * Various macros to control xxHash's behavior. */ + #ifdef XXH_DOXYGEN + /*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref xxh32_family and you have a strict C90 + * compiler. + */ + #define XXH_NO_LONG_LONG + #undef XXH_NO_LONG_LONG /* don't actually */ + /*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which + * is safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated + * assembly is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers + * will eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences + * an unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's + * the only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which + * don't inline small `memcpy()` calls, and it might also be faster on + * big-endian systems which lack a native byteswap instruction. However, + * some compilers will emit literal byteshifts even if the target supports + * unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may + * cause another to read garbage data or even crash. + * + * See https://stackoverflow.com/a/32095106/646947 for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ + #define XXH_FORCE_MEMORY_ACCESS 0 + /*! + * @def XXH_ACCEPT_NULL_INPUT_POINTER + * @brief Whether to add explicit `NULL` checks. + * + * If the input pointer is `NULL` and the length is non-zero, xxHash's + * default behavior is to dereference it, triggering a segfault. + * + * When this macro is enabled, xxHash actively checks the input for a null + * pointer. If it is, the result for null input pointers is the same as a + * zero-length input. + */ + #define XXH_ACCEPT_NULL_INPUT_POINTER 0 + /*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs + * (XXH32() and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally + * negligible, but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro + * to 0. Then the code will always use unaligned memory access. Align check + * is automatically disabled on x86, x64 & arm64, which are platforms known + * to offer good unaligned memory accesses performance. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ + #define XXH_FORCE_ALIGN_CHECK 0 + + /*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all + * internal functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary + * which might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to + * performance, depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ + #define XXH_NO_INLINE_HINTS 0 + + /*! + * @def XXH_REROLL + * @brief Whether to reroll `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses an unrolled loop + * in the form of a switch statement. + * + * This is not always desirable, as it generates larger code, + * and depending on the architecture, may even be slower + * + * This is automatically defined with `-Os`/`-Oz` on GCC and Clang. + */ + #define XXH_REROLL 0 + + /*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use + * this. + */ + #define XXH_OLD_NAMES + #undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + #endif /* XXH_DOXYGEN */ +/*! + * @} + */ + #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \ line for example */ - #if !defined(__clang__) && defined(__GNUC__) && \ - defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \ - (__ARM_ARCH == 6) - #define XXH_FORCE_MEMORY_ACCESS 2 - #elif !defined(__clang__) && \ - ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ - (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) + /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */ + #if !defined(__clang__) && \ + ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \ + (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \ + (!defined(__mips16) || defined(__mips_mips16e2)))))) #define XXH_FORCE_MEMORY_ACCESS 1 #endif #endif - /*! - * XXH_ACCEPT_NULL_INPUT_POINTER: - * If the input pointer is NULL, xxHash's default behavior is to dereference - * it, triggering a segfault. When this macro is enabled, xxHash actively - * checks the input for a null pointer. If it is, the result for null input - * pointers is the same as a zero-length input. - */ #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ #define XXH_ACCEPT_NULL_INPUT_POINTER 0 #endif - /*! - * XXH_FORCE_ALIGN_CHECK: - * This is an important performance trick - * for architectures without decent unaligned memory access performance. - * It checks for input alignment, and when conditions are met, - * uses a "fast path" employing direct 32-bit/64-bit read, - * resulting in _dramatically faster_ read speed. - * - * The check costs one initial branch per hash, which is generally negligible, - * but not zero. Moreover, it's not useful to generate binary for an - * additional code path if memory access uses same instruction for both - * aligned and unaligned adresses. - * - * In these cases, the alignment check can be removed by setting this macro to - * 0. Then the code will always use unaligned memory access. Align check is - * automatically disabled on x86, x64 & arm64, which are platforms known to - * offer good unaligned memory accesses performance. - * - * This option does not affect XXH3 (only XXH32 and XXH64). - */ #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \ defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ @@ -928,25 +1465,6 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, #endif #endif - /*! - * XXH_NO_INLINE_HINTS: - * - * By default, xxHash tries to force the compiler to inline almost all - * internal functions. - * - * This can usually improve performance due to reduced jumping and improved - * constant folding, but significantly increases the size of the binary which - * might not be favorable. - * - * Additionally, sometimes the forced inlining can be detrimental to - * performance, depending on the architecture. - * - * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the - * compiler full control on whether to inline or not. - * - * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using - * -fno-inline with GCC or Clang, this will automatically be defined. - */ #ifndef XXH_NO_INLINE_HINTS #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ || defined(__NO_INLINE__) /* -O0, -fno-inline */ @@ -956,44 +1474,57 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, #endif #endif - /*! - * XXH_REROLL: - * Whether to reroll XXH32_finalize, and XXH64_finalize, - * instead of using an unrolled jump table/if statement loop. - * - * This is automatically defined on -Os/-Oz on GCC and Clang. - */ #ifndef XXH_REROLL - #if defined(__OPTIMIZE_SIZE__) + #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \ + (defined(__GNUC__) && !defined(__clang__)) + /* The if/then loop is preferable to switch/case on gcc (on x64) */ #define XXH_REROLL 1 #else #define XXH_REROLL 0 #endif #endif + /*! + * @defgroup impl Implementation + * @{ + + */ + /* ************************************* * Includes & Memory related functions ***************************************/ - /*! + /* * Modify the local functions below should you wish to use * different memory routines for malloc() and free() */ #include <stdlib.h> +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ static void *XXH_malloc(size_t s) { return malloc(s); } +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ static void XXH_free(void *p) { free(p); } - /*! and for memcpy() */ #include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ static void *XXH_memcpy(void *dest, const void *src, size_t size) { return memcpy(dest, src, size); @@ -1037,7 +1568,11 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) { /* ************************************* * Debug ***************************************/ - /* + /*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * * XXH_DEBUGLEVEL is expected to be defined externally, typically via the * compiler's command line options. The value must be a number. */ @@ -1057,12 +1592,58 @@ static void *XXH_memcpy(void *dest, const void *src, size_t size) { #endif /* note: use after variable declarations */ - #define XXH_STATIC_ASSERT(c) \ - do { \ - \ - enum { XXH_sa = 1 / (int)(!!(c)) }; \ - \ - } while (0) + #ifndef XXH_STATIC_ASSERT + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ + #include <assert.h> + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + static_assert((c), m); \ + \ + } while (0) + #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + static_assert((c), m); \ + \ + } while (0) + #else + #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \ + do { \ + \ + struct xxh_sa { \ + \ + char x[(c) ? 1 : -1]; \ + \ + }; \ + \ + } while (0) + #endif + #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c) + #endif + + /*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ + #ifdef __GNUC__ + #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var)) + #else + #define XXH_COMPILER_GUARD(var) ((void)0) + #endif /* ************************************* * Basic Types @@ -1085,6 +1666,56 @@ typedef XXH32_hash_t xxh_u32; /* *** Memory access *** */ +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) /* * Manual byteshift. Best for old compilers which don't inline memcpy. @@ -1146,16 +1777,23 @@ static xxh_u32 XXH_read32(const void *memPtr) { #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ -/* *** Endianess *** */ -typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; + /* *** Endianness *** */ /*! - * XXH_CPU_LITTLE_ENDIAN: + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * * Defined to 1 if the target is little endian, or 0 if it is big endian. * It can be defined externally, for example on the compiler command line. * - * If it is not defined, a runtime check (which is usually constant folded) - * is used instead. + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. */ #ifndef XXH_CPU_LITTLE_ENDIAN /* @@ -1170,8 +1808,11 @@ typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define XXH_CPU_LITTLE_ENDIAN 0 #else -/* - * runtime test, presumed to simplify to a constant by compiler +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. */ static int XXH_isLittleEndian(void) { @@ -1189,7 +1830,7 @@ static int XXH_isLittleEndian(void) { return one.c[0]; } - +\ #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() #endif #endif @@ -1205,6 +1846,19 @@ static int XXH_isLittleEndian(void) { #define XXH_HAS_BUILTIN(x) 0 #endif + /*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \ XXH_HAS_BUILTIN(__builtin_rotateleft64) #define XXH_rotl32 __builtin_rotateleft32 @@ -1219,6 +1873,14 @@ static int XXH_isLittleEndian(void) { #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) #endif + /*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ #if defined(_MSC_VER) /* Visual Studio */ #define XXH_swap32 _byteswap_ulong #elif XXH_GCC_VERSION >= 403 @@ -1236,7 +1898,17 @@ static xxh_u32 XXH_swap32(xxh_u32 x) { /* *************************** * Memory reads *****************************/ -typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ + +} XXH_alignment; /* * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. @@ -1295,6 +1967,7 @@ XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void * ptr, /* ************************************* * Misc ***************************************/ +/*! @ingroup public */ XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; @@ -1304,16 +1977,19 @@ XXH_PUBLIC_API unsigned XXH_versionNumber(void) { /* ******************************************************************* * 32-bit hash functions *********************************************************************/ -static const xxh_u32 XXH_PRIME32_1 = - 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ -static const xxh_u32 XXH_PRIME32_2 = - 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ -static const xxh_u32 XXH_PRIME32_3 = - 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ -static const xxh_u32 XXH_PRIME32_4 = - 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ -static const xxh_u32 XXH_PRIME32_5 = - 0x165667B1U; /* 0b00010110010101100110011110110001 */ +/*! + * @} + * @defgroup xxh32_impl XXH32 implementation + * @ingroup impl + * @{ + + */ +/* #define instead of static const, to be used as initializers */ + #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ + #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ + #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ + #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ + #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ #ifdef XXH_OLD_NAMES #define PRIME32_1 XXH_PRIME32_1 @@ -1323,19 +1999,29 @@ static const xxh_u32 XXH_PRIME32_5 = #define PRIME32_5 XXH_PRIME32_5 #endif +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { acc += input * XXH_PRIME32_2; acc = XXH_rotl32(acc, 13); acc *= XXH_PRIME32_1; - #if defined(__GNUC__) && defined(__SSE4_1__) && \ + #if (defined(__SSE4_1__) || defined(__aarch64__)) && \ !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: - * This inline assembly hack forces acc into a normal register. This is the - * only thing that prevents GCC and Clang from autovectorizing the XXH32 - * loop (pragmas and attributes don't work for some resason) without globally - * disabling SSE4.1. + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. * * The reason we want to avoid vectorization is because despite working on * 4 integers at a time, there are multiple factors slowing XXH32 down on @@ -1360,28 +2046,26 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { * can load data, while v3 can multiply. SSE forces them to operate * together. * - * How this hack works: - * __asm__("" // Declare an assembly block but don't declare any - * instructions : // However, as an Input/Output Operand, - * "+r" // constrain a read/write operand (+) as a general purpose - * register (r). (acc) // and set acc as the operand - * ); - * - * Because of the 'r', the compiler has promised that seed will be in a - * general purpose register and the '+' says that it will be 'read/write', - * so it has to assume it has changed. It is like volatile without all the - * loads and stores. - * - * Since the argument has to be in a normal register (not an SSE register), - * each time XXH32_round is called, it is impossible to vectorize. + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. */ - __asm__("" : "+r"(acc)); + XXH_COMPILER_GUARD(acc); #endif return acc; } -/* mix all bits */ +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param h32 The hash to avalanche. + * @return The avalanched hash. + */ static xxh_u32 XXH32_avalanche(xxh_u32 h32) { h32 ^= h32 >> 15; @@ -1395,11 +2079,23 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) { #define XXH_get32bits(p) XXH_readLE32_align(p, align) +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param h32 The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + */ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, XXH_alignment align) { - - /* dummy comment */ - +\ #define XXH_PROCESS1 \ do { \ \ @@ -1443,20 +2139,20 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 12: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 8: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 4: XXH_PROCESS4; return XXH32_avalanche(h32); case 13: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 9: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 5: XXH_PROCESS4; XXH_PROCESS1; @@ -1464,10 +2160,10 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 14: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 10: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 6: XXH_PROCESS4; XXH_PROCESS1; @@ -1476,22 +2172,22 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, case 15: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 11: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 7: XXH_PROCESS4; - /* fallthrough */ + XXH_FALLTHROUGH; case 3: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 2: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 1: XXH_PROCESS1; - /* fallthrough */ + XXH_FALLTHROUGH; case 0: return XXH32_avalanche(h32); @@ -1512,10 +2208,18 @@ static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, #undef XXH_PROCESS4 #endif +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input, len, seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len, xxh_u32 seed, XXH_alignment align) { - const xxh_u8 *bEnd = input + len; + const xxh_u8 *bEnd = input ? input + len : NULL; xxh_u32 h32; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ @@ -1565,6 +2269,7 @@ XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, XXH32_hash_t seed) { @@ -1574,9 +2279,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, XXH32_reset(&state, seed); XXH32_update(&state, (const xxh_u8*)input, len); return XXH32_digest(&state); - #else - if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 3) == @@ -1593,13 +2296,16 @@ XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, } /******* Hash streaming *******/ - +/*! + * @ingroup xxh32_family + */ XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) { return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t)); } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { XXH_free(statePtr); @@ -1607,6 +2313,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState, const XXH32_state_t *srcState) { @@ -1614,6 +2321,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, XXH32_hash_t seed) { @@ -1630,6 +2338,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state, const void *input, size_t len) { @@ -1719,6 +2428,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) { xxh_u32 h32; @@ -1743,7 +2453,8 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) { /******* Canonical representation *******/ -/* +/*! + * @ingroup xxh32_family * The default return values from XXH functions are unsigned 32 and 64 bit * integers. * @@ -1765,6 +2476,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, } +/*! @ingroup xxh32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t *src) { @@ -1777,7 +2489,12 @@ XXH32_hashFromCanonical(const XXH32_canonical_t *src) { /* ******************************************************************* * 64-bit hash functions *********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ /******* Memory access *******/ typedef XXH64_hash_t xxh_u64; @@ -1786,40 +2503,6 @@ typedef XXH64_hash_t xxh_u64; #define U64 xxh_u64 #endif - /*! - * XXH_REROLL_XXH64: - * Whether to reroll the XXH64_finalize() loop. - * - * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a - * performance gain on 64-bit hosts, as only one jump is required. - * - * However, on 32-bit hosts, because arithmetic needs to be done with two - * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't - * beneficial to unroll. The code becomes ridiculously large (the largest - * function in the binary on i386!), and rerolling it saves anywhere from - * 3kB to 20kB. It is also slightly faster because it fits into cache better - * and is more likely to be inlined by the compiler. - * - * If XXH_REROLL is defined, this is ignored and the loop is always - * rerolled. - */ - #ifndef XXH_REROLL_XXH64 - #if (defined(__ILP32__) || \ - defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ - || !(defined(__x86_64__) || defined(_M_X64) || \ - defined(_M_AMD64) /* x86-64 */ \ - || defined(_M_ARM64) || defined(__aarch64__) || \ - defined(__arm64__) /* aarch64 */ \ - || defined(__PPC64__) || defined(__PPC64LE__) || \ - defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ - || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ - || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ - #define XXH_REROLL_XXH64 1 - #else - #define XXH_REROLL_XXH64 0 - #endif - #endif /* !defined(XXH_REROLL_XXH64) */ - #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) /* * Manual byteshift. Best for old compilers which don't inline memcpy. @@ -1950,23 +2633,35 @@ XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void * ptr, } -/******* xxh64 *******/ + /******* xxh64 *******/ + /*! + * @} + * @defgroup xxh64_impl XXH64 implementation + * @ingroup impl + * @{ -static const xxh_u64 XXH_PRIME64_1 = - 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 - */ -static const xxh_u64 XXH_PRIME64_2 = - 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 - */ -static const xxh_u64 XXH_PRIME64_3 = - 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 - */ -static const xxh_u64 XXH_PRIME64_4 = - 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 - */ -static const xxh_u64 XXH_PRIME64_5 = - 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 - */ + */ + /* #define rather that static const, to be used as initializers */ + #define XXH_PRIME64_1 \ + 0x9E3779B185EBCA87ULL /*!< \ + 0b1001111000110111011110011011000110000101111010111100101010000111 \ + */ + #define XXH_PRIME64_2 \ + 0xC2B2AE3D27D4EB4FULL /*!< \ + 0b1100001010110010101011100011110100100111110101001110101101001111 \ + */ + #define XXH_PRIME64_3 \ + 0x165667B19E3779F9ULL /*!< \ + 0b0001011001010110011001111011000110011110001101110111100111111001 \ + */ + #define XXH_PRIME64_4 \ + 0x85EBCA77C2B2AE63ULL /*!< \ + 0b1000010111101011110010100111011111000010101100101010111001100011 \ + */ + #define XXH_PRIME64_5 \ + 0x27D4EB2F165667C5ULL /*!< \ + 0b0010011111010100111010110010111100010110010101100110011111000101 \ + */ #ifdef XXH_OLD_NAMES #define PRIME64_1 XXH_PRIME64_1 @@ -2010,185 +2705,35 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) { static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len, XXH_alignment align) { - /* dummy comment */ - - #define XXH_PROCESS1_64 \ - do { \ - \ - h64 ^= (*ptr++) * XXH_PRIME64_5; \ - h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \ - \ - } while (0) - - #define XXH_PROCESS4_64 \ - do { \ - \ - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \ - ptr += 4; \ - h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \ - \ - } while (0) - - #define XXH_PROCESS8_64 \ - do { \ - \ - xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ - ptr += 8; \ - h64 ^= k1; \ - h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \ - \ - } while (0) - - /* Rerolled version for 32-bit targets is faster and much smaller. */ - if (XXH_REROLL || XXH_REROLL_XXH64) { - - len &= 31; - while (len >= 8) { - - XXH_PROCESS8_64; - len -= 8; - - } - - if (len >= 4) { - - XXH_PROCESS4_64; - len -= 4; - - } - - while (len > 0) { + len &= 31; + while (len >= 8) { - XXH_PROCESS1_64; - --len; + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + h64 ^= k1; + h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; - } + } - return XXH64_avalanche(h64); + if (len >= 4) { - } else { + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; - switch (len & 31) { + } - case 24: - XXH_PROCESS8_64; - /* fallthrough */ - case 16: - XXH_PROCESS8_64; - /* fallthrough */ - case 8: - XXH_PROCESS8_64; - return XXH64_avalanche(h64); - - case 28: - XXH_PROCESS8_64; - /* fallthrough */ - case 20: - XXH_PROCESS8_64; - /* fallthrough */ - case 12: - XXH_PROCESS8_64; - /* fallthrough */ - case 4: - XXH_PROCESS4_64; - return XXH64_avalanche(h64); - - case 25: - XXH_PROCESS8_64; - /* fallthrough */ - case 17: - XXH_PROCESS8_64; - /* fallthrough */ - case 9: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 29: - XXH_PROCESS8_64; - /* fallthrough */ - case 21: - XXH_PROCESS8_64; - /* fallthrough */ - case 13: - XXH_PROCESS8_64; - /* fallthrough */ - case 5: - XXH_PROCESS4_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 26: - XXH_PROCESS8_64; - /* fallthrough */ - case 18: - XXH_PROCESS8_64; - /* fallthrough */ - case 10: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 30: - XXH_PROCESS8_64; - /* fallthrough */ - case 22: - XXH_PROCESS8_64; - /* fallthrough */ - case 14: - XXH_PROCESS8_64; - /* fallthrough */ - case 6: - XXH_PROCESS4_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 27: - XXH_PROCESS8_64; - /* fallthrough */ - case 19: - XXH_PROCESS8_64; - /* fallthrough */ - case 11: - XXH_PROCESS8_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - XXH_PROCESS1_64; - return XXH64_avalanche(h64); - - case 31: - XXH_PROCESS8_64; - /* fallthrough */ - case 23: - XXH_PROCESS8_64; - /* fallthrough */ - case 15: - XXH_PROCESS8_64; - /* fallthrough */ - case 7: - XXH_PROCESS4_64; - /* fallthrough */ - case 3: - XXH_PROCESS1_64; - /* fallthrough */ - case 2: - XXH_PROCESS1_64; - /* fallthrough */ - case 1: - XXH_PROCESS1_64; - /* fallthrough */ - case 0: - return XXH64_avalanche(h64); + while (len > 0) { - } + h64 ^= (*ptr++) * XXH_PRIME64_5; + h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; + --len; } - /* impossible to reach */ - XXH_ASSERT(0); - return 0; /* unreachable, but some compilers complain without it */ + return XXH64_avalanche(h64); } @@ -2205,7 +2750,7 @@ static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len, XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len, xxh_u64 seed, XXH_alignment align) { - const xxh_u8 *bEnd = input + len; + const xxh_u8 *bEnd = input ? input + len : NULL; xxh_u64 h64; #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ @@ -2259,6 +2804,7 @@ XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, XXH64_hash_t seed) { @@ -2268,9 +2814,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, XXH64_reset(&state, seed); XXH64_update(&state, (const xxh_u8*)input, len); return XXH64_digest(&state); - #else - if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 7) == @@ -2289,12 +2833,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, /******* Hash Streaming *******/ +/*! @ingroup xxh64_family*/ XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) { return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t)); } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { XXH_free(statePtr); @@ -2302,6 +2848,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState, const XXH64_state_t *srcState) { @@ -2309,6 +2856,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, XXH64_hash_t seed) { @@ -2325,6 +2873,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state, const void *input, size_t len) { @@ -2403,6 +2952,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) { xxh_u64 h64; @@ -2436,6 +2986,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) { /******* Canonical representation *******/ +/*! @ingroup xxh64_family */ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, XXH64_hash_t hash) { @@ -2445,6 +2996,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, } +/*! @ingroup xxh64_family */ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t *src) { @@ -2452,380 +3004,452 @@ XXH64_hashFromCanonical(const XXH64_canonical_t *src) { } - /* ********************************************************************* - * XXH3 - * New generation hash designed for speed on small keys and vectorization - ************************************************************************ */ + #ifndef XXH_NO_XXH3 - /* === Compiler specifics === */ + /* ********************************************************************* + * XXH3 + * New generation hash designed for speed on small keys and vectorization + ************************************************************************ */ + /*! + * @} + * @defgroup xxh3_impl XXH3 implementation + * @ingroup impl + * @{ - #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ - #define XXH_RESTRICT restrict - #else - /* Note: it might be useful to define __restrict or __restrict__ for some - * C++ compilers */ - #define XXH_RESTRICT /* disable */ - #endif + */ - #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ - (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ - defined(__clang__) - #define XXH_likely(x) __builtin_expect(x, 1) - #define XXH_unlikely(x) __builtin_expect(x, 0) - #else - #define XXH_likely(x) (x) - #define XXH_unlikely(x) (x) - #endif + /* === Compiler specifics === */ - #if defined(__GNUC__) - #if defined(__AVX2__) - #include <immintrin.h> - #elif defined(__SSE2__) - #include <emmintrin.h> - #elif defined(__ARM_NEON__) || defined(__ARM_NEON) - #define inline __inline__ /* circumvent a clang bug */ - #include <arm_neon.h> - #undef inline + #if ((defined(sun) || defined(__sun)) && \ + __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \ + with GCC 5.5 */ + #define XXH_RESTRICT /* disable */ + #elif defined(__STDC_VERSION__) && \ + __STDC_VERSION__ >= 199901L /* >= C99 */ + #define XXH_RESTRICT restrict + #else + /* Note: it might be useful to define __restrict or __restrict__ for + * some C++ compilers */ + #define XXH_RESTRICT /* disable */ #endif - #elif defined(_MSC_VER) - #include <intrin.h> - #endif - - /* - * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while - * remaining a true 64-bit/128-bit hash function. - * - * This is done by prioritizing a subset of 64-bit operations that can be - * emulated without too many steps on the average 32-bit machine. - * - * For example, these two lines seem similar, and run equally fast on - * 64-bit: - * - * xxh_u64 x; - * x ^= (x >> 47); // good - * x ^= (x >> 13); // bad - * - * However, to a 32-bit machine, there is a major difference. - * - * x ^= (x >> 47) looks like this: - * - * x.lo ^= (x.hi >> (47 - 32)); - * - * while x ^= (x >> 13) looks like this: - * - * // note: funnel shifts are not usually cheap. - * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); - * x.hi ^= (x.hi >> 13); - * - * The first one is significantly faster than the second, simply because the - * shift is larger than 32. This means: - * - All the bits we need are in the upper 32 bits, so we can ignore the - * lower 32 bits in the shift. - * - The shift result will always fit in the lower 32 bits, and therefore, - * we can ignore the upper 32 bits in the xor. - * - * Thanks to this optimization, XXH3 only requires these features to be - * efficient: - * - * - Usable unaligned access - * - A 32-bit or 64-bit ALU - * - If 32-bit, a decent ADC instruction - * - A 32 or 64-bit multiply with a 64-bit result - * - For the 128-bit variant, a decent byteswap helps short inputs. - * - * The first two are already required by XXH32, and almost all 32-bit and - * 64-bit platforms which can run XXH32 can run XXH3 efficiently. - * - * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one - * notable exception. - * - * First of all, Thumb-1 lacks support for the UMULL instruction which - * performs the important long multiply. This means numerous __aeabi_lmul - * calls. - * - * Second of all, the 8 functional registers are just not enough. - * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic - * need Lo registers, and this shuffling results in thousands more MOVs than - * A32. - * - * A32 and T32 don't have this limitation. They can access all 14 registers, - * do a 32->64 multiply with UMULL, and the flexible operand allowing free - * shifts is helpful, too. - * - * Therefore, we do a quick sanity check. - * - * If compiling Thumb-1 for a target which supports ARM instructions, we - * will emit a warning, as it is not a "sane" platform to compile for. - * - * Usually, if this happens, it is because of an accident and you probably - * need to specify -march, as you likely meant to compile for a newer - * architecture. - * - * Credit: large sections of the vectorial and asm source code paths - * have been contributed by @easyaspi314 - */ - #if defined(__thumb__) && !defined(__thumb2__) && \ - defined(__ARM_ARCH_ISA_ARM) - #warning "XXH3 is highly inefficient without ARM or Thumb-2." - #endif - /* ========================================== - * Vectorization detection - * ========================================== */ - #define XXH_SCALAR 0 /* Portable scalar version */ - #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ - #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ - #define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */ - #define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */ - #define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */ - - #ifndef XXH_VECTOR /* can be defined on command line */ - #if defined(__AVX512F__) - #define XXH_VECTOR XXH_AVX512 - #elif defined(__AVX2__) - #define XXH_VECTOR XXH_AVX2 - #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ - (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) - #define XXH_VECTOR XXH_SSE2 - #elif defined(__GNUC__) /* msvc support maybe later */ \ - && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ - (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ - || (defined(__BYTE_ORDER__) && \ - __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) - #define XXH_VECTOR XXH_NEON - #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \ - (defined(__s390x__) && defined(__VEC__)) && \ - defined(__GNUC__) /* TODO: IBM XL */ - #define XXH_VECTOR XXH_VSX + #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ + (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ + defined(__clang__) + #define XXH_likely(x) __builtin_expect(x, 1) + #define XXH_unlikely(x) __builtin_expect(x, 0) #else - #define XXH_VECTOR XXH_SCALAR + #define XXH_likely(x) (x) + #define XXH_unlikely(x) (x) #endif - #endif - /* - * Controls the alignment of the accumulator, - * for compatibility with aligned vector loads, which are usually faster. - */ - #ifndef XXH_ACC_ALIGN - #if defined(XXH_X86DISPATCH) - #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ - #elif XXH_VECTOR == XXH_SCALAR /* scalar */ - #define XXH_ACC_ALIGN 8 - #elif XXH_VECTOR == XXH_SSE2 /* sse2 */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_AVX2 /* avx2 */ - #define XXH_ACC_ALIGN 32 - #elif XXH_VECTOR == XXH_NEON /* neon */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_VSX /* vsx */ - #define XXH_ACC_ALIGN 16 - #elif XXH_VECTOR == XXH_AVX512 /* avx512 */ - #define XXH_ACC_ALIGN 64 + #if defined(__GNUC__) + #if defined(__AVX2__) + #include <immintrin.h> + #elif defined(__SSE2__) + #include <emmintrin.h> + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + #define inline __inline__ /* circumvent a clang bug */ + #include <arm_neon.h> + #undef inline + #endif + #elif defined(_MSC_VER) + #include <intrin.h> #endif - #endif - #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \ - XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 - #define XXH_SEC_ALIGN XXH_ACC_ALIGN - #else - #define XXH_SEC_ALIGN 8 - #endif - - /* - * UGLY HACK: - * GCC usually generates the best code with -O3 for xxHash. - * - * However, when targeting AVX2, it is overzealous in its unrolling - * resulting in code roughly 3/4 the speed of Clang. - * - * There are other issues, such as GCC splitting _mm256_loadu_si256 into - * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which - * only applies to Sandy and Ivy Bridge... which don't even support AVX2. - * - * That is why when compiling the AVX2 version, it is recommended to use - * either -O2 -mavx2 -march=haswell or -O2 -mavx2 - * -mno-avx256-split-unaligned-load for decent performance, or to use Clang - * instead. - * - * Fortunately, we can control the first one with a pragma that forces GCC - * into -O2, but the other one we can't control without "failed to inline - * always inline function due to target mismatch" warnings. - */ - #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && \ - !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ - #pragma GCC push_options - #pragma GCC optimize("-O2") - #endif - - #if XXH_VECTOR == XXH_NEON /* - * NEON's setup for vmlal_u32 is a little more complicated than it is on - * SSE2, AVX2, and VSX. + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. * - * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an - * upcast. + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. * - * To do the same operation, the 128-bit 'Q' register needs to be split - * into two 64-bit 'D' registers, performing this operation:: + * For example, these two lines seem similar, and run equally fast on + * 64-bit: * - * [ a | b ] | - * '---------. .--------' | | x | - * | .---------' '--------. | - * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad * - * Due to significant changes in aarch64, the fastest method for aarch64 - * is completely different than the fastest method for ARMv7-A. + * However, to a 32-bit machine, there is a major difference. * - * ARMv7-A treats D registers as unions overlaying Q registers, so - * modifying D11 will modify the high half of Q5. This is similar to how - * modifying AH will only affect bits 8-15 of AX on x86. + * x ^= (x >> 47) looks like this: * - * VZIP takes two registers, and puts even lanes in one register and odd - * lanes in the other. + * x.lo ^= (x.hi >> (47 - 32)); * - * On ARMv7-A, this strangely modifies both parameters in place instead of - * taking the usual 3-operand form. + * while x ^= (x >> 13) looks like this: * - * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on - * the lower and upper halves of the Q register to end up with the high - * and low halves where we want - all in one instruction. + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); * - * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], - * d11[1] } + * The first one is significantly faster than the second, simply because + * the shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the + * lower 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and + * therefore, we can ignore the upper 32 bits in the xor. * - * Unfortunately we need inline assembly for this: Instructions modifying - * two registers at once is not possible in GCC or Clang's IR, and they - * have to create a copy. + * Thanks to this optimization, XXH3 only requires these features to be + * efficient: * - * aarch64 requires a different approach. + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. * - * In order to make it easier to write a decent compiler for aarch64, many - * quirks were removed, such as conditional execution. + * The first two are already required by XXH32, and almost all 32-bit and + * 64-bit platforms which can run XXH32 can run XXH3 efficiently. * - * NEON was also affected by this. + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is + * one notable exception. * - * aarch64 cannot access the high bits of a Q-form register, and writes to - * a D-form register zero the high bits, similar to how writes to W-form - * scalar registers (or DWORD registers on x86_64) work. + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. * - * The formerly free vget_high intrinsics now require a vext (with a few - * exceptions) + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic + * need Lo registers, and this shuffling results in thousands more MOVs + * than A32. * - * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the - * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to - * only modify one operand. + * A32 and T32 don't have this limitation. They can access all 14 + * registers, do a 32->64 multiply with UMULL, and the flexible operand + * allowing free shifts is helpful, too. * - * The equivalent of the VZIP.32 on the lower and upper halves would be - * this mess: + * Therefore, we do a quick sanity check. * - * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] - * } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2 v0.2s, - * v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * If compiling Thumb-1 for a target which supports ARM instructions, we + * will emit a warning, as it is not a "sane" platform to compile for. * - * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 - * (SHRN): + * Usually, if this happens, it is because of an accident and you probably + * need to specify -march, as you likely meant to compile for a newer + * architecture. * - * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); - * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); - * - * This is available on ARMv7-A, but is less efficient than a single - * VZIP.32. + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 */ + #if defined(__thumb__) && !defined(__thumb2__) && \ + defined(__ARM_ARCH_ISA_ARM) + #warning "XXH3 is highly inefficient without ARM or Thumb-2." + #endif + + /* ========================================== + * Vectorization detection + * ========================================== */ + + #ifdef XXH_DOXYGEN + /*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the + * best implementation. + */ + #define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + +}; + + /*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment reqired for said + * vector type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ + #define XXH_ACC_ALIGN 8 + #endif + + /* Actual definition */ + #ifndef XXH_DOXYGEN + #define XXH_SCALAR 0 + #define XXH_SSE2 1 + #define XXH_AVX2 2 + #define XXH_AVX512 3 + #define XXH_NEON 4 + #define XXH_VSX 5 + #endif + + #ifndef XXH_VECTOR /* can be defined on command line */ + #if defined(__AVX512F__) + #define XXH_VECTOR XXH_AVX512 + #elif defined(__AVX2__) + #define XXH_VECTOR XXH_AVX2 + #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) + #define XXH_VECTOR XXH_SSE2 + #elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ + (defined( \ + __LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) + #define XXH_VECTOR XXH_NEON + #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \ + (defined(__s390x__) && defined(__VEC__)) && \ + defined(__GNUC__) /* TODO: IBM XL */ + #define XXH_VECTOR XXH_VSX + #else + #define XXH_VECTOR XXH_SCALAR + #endif + #endif /* - * Function-like macro: - * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t - * &outHi) - * { - - * outLo = (uint32x2_t)(in & 0xFFFFFFFF); - * outHi = (uint32x2_t)(in >> 32); - * in = UNDEFINED; - * } + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. */ - #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ - && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__) - #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - \ - /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \ - * %f0 = upper D half */ \ - /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \ - */ \ - /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \ - */ \ - __asm__("vzip.32 %e0, %f0" : "+w"(in)); \ - (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ - (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ - \ - } while (0) + #ifndef XXH_ACC_ALIGN + #if defined(XXH_X86DISPATCH) + #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ + #elif XXH_VECTOR == XXH_SCALAR /* scalar */ + #define XXH_ACC_ALIGN 8 + #elif XXH_VECTOR == XXH_SSE2 /* sse2 */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_AVX2 /* avx2 */ + #define XXH_ACC_ALIGN 32 + #elif XXH_VECTOR == XXH_NEON /* neon */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_VSX /* vsx */ + #define XXH_ACC_ALIGN 16 + #elif XXH_VECTOR == XXH_AVX512 /* avx512 */ + #define XXH_ACC_ALIGN 64 + #endif + #endif + #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \ + XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 + #define XXH_SEC_ALIGN XXH_ACC_ALIGN #else - #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - \ - (outLo) = vmovn_u64(in); \ - (outHi) = vshrn_n_u64((in), 32); \ - \ - } while (0) + #define XXH_SEC_ALIGN 8 + #endif + /* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling + * resulting in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization + * which only applies to Sandy and Ivy Bridge... which don't even support + * AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use + * either -O2 -mavx2 -march=haswell or -O2 -mavx2 + * -mno-avx256-split-unaligned-load for decent performance, or to use + * Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC + * into -O2, but the other one we can't control without "failed to inline + * always inline function due to target mismatch" warnings. + */ + #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && \ + !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + #pragma GCC push_options + #pragma GCC optimize("-O2") #endif - #endif /* XXH_VECTOR == XXH_NEON */ - /* - * VSX and Z Vector helpers. - * - * This is very messy, and any pull requests to clean this up are welcome. - * - * There are a lot of problems with supporting VSX and s390x, due to - * inconsistent intrinsics, spotty coverage, and multiple endiannesses. - */ - #if XXH_VECTOR == XXH_VSX - #if defined(__s390x__) - #include <s390intrin.h> - #else - /* gcc's altivec.h can have the unwanted consequence to unconditionally - * #define bool, vector, and pixel keywords, - * with bad consequences for programs already using these keywords for - * other purposes. The paragraph defining these macros is skipped when - * __APPLE_ALTIVEC__ is defined. - * __APPLE_ALTIVEC__ is _generally_ defined automatically by the - * compiler, but it seems that, in some cases, it isn't. Force the build - * macro to be defined, so that keywords are not altered. + #if XXH_VECTOR == XXH_NEON + /* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an + * upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split + * into two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] | + * '---------. .--------' | | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 + * ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 + * is completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so + * modifying D11 will modify the high half of Q5. This is similar to how + * modifying AH will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd + * lanes in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead + * of taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 + * on the lower and upper halves of the Q register to end up with the + * high and low halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { + + * d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions + * modifying two registers at once is not possible in GCC or Clang's IR, + * and they have to create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, + * many quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes + * to a D-form register zero the high bits, similar to how writes to + * W-form scalar registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the + * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to + * only modify one operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be + * this mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], + * v0[1] } zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } zip2 + * v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 + * (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single + * VZIP.32. */ - #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) - #define __APPLE_ALTIVEC__ - #endif - #include <altivec.h> - #endif -typedef __vector unsigned long long xxh_u64x2; -typedef __vector unsigned char xxh_u8x16; -typedef __vector unsigned xxh_u32x4; + /*! + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t + * &outHi) + * { - #ifndef XXH_VSX_BE - #if defined(__BIG_ENDIAN__) || \ - (defined(__BYTE_ORDER__) && \ - __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) - #define XXH_VSX_BE 1 - #elif defined(__VEC_ELEMENT_REG_ORDER__) && \ - __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ - #warning \ - "-maltivec=be is not recommended. Please use native endianness." - #define XXH_VSX_BE 1 + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ + #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) && !defined(__aarch64__) && \ + !defined(__arm64__) + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \ + * %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \ + */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \ + */ \ + __asm__("vzip.32 %e0, %f0" : "+w"(in)); \ + (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + \ + } while (0) #else - #define XXH_VSX_BE 0 + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + \ + (outLo) = vmovn_u64(in); \ + (outHi) = vshrn_n_u64((in), 32); \ + \ + } while (0) #endif - #endif /* !defined(XXH_VSX_BE) */ + #endif /* XXH_VECTOR == XXH_NEON */ - #if XXH_VSX_BE - /* A wrapper for POWER9's vec_revb. */ - #if defined(__POWER9_VECTOR__) || \ - (defined(__clang__) && defined(__s390x__)) - #define XXH_vec_revb vec_revb + /* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ + #if XXH_VECTOR == XXH_VSX + #if defined(__s390x__) + #include <s390intrin.h> #else + /* gcc's altivec.h can have the unwanted consequence to + * unconditionally #define bool, vector, and pixel keywords, with bad + * consequences for programs already using these keywords for other + * purposes. The paragraph defining these macros is skipped when + * __APPLE_ALTIVEC__ is defined. + * __APPLE_ALTIVEC__ is _generally_ defined automatically by the + * compiler, but it seems that, in some cases, it isn't. Force the + * build macro to be defined, so that keywords are not altered. + */ + #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) + #define __APPLE_ALTIVEC__ + #endif + #include <altivec.h> + #endif + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + + #ifndef XXH_VSX_BE + #if defined(__BIG_ENDIAN__) || \ + (defined(__BYTE_ORDER__) && \ + __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define XXH_VSX_BE 1 + #elif defined(__VEC_ELEMENT_REG_ORDER__) && \ + __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ + #warning \ + "-maltivec=be is not recommended. Please use native endianness." + #define XXH_VSX_BE 1 + #else + #define XXH_VSX_BE 0 + #endif + #endif /* !defined(XXH_VSX_BE) */ + + #if XXH_VSX_BE + #if defined(__POWER9_VECTOR__) || \ + (defined(__clang__) && defined(__s390x__)) + #define XXH_vec_revb vec_revb + #else +/*! + * A polyfill for POWER9's vec_revb(). + */ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, @@ -2834,40 +3458,40 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { } - #endif - #endif /* XXH_VSX_BE */ + #endif + #endif /* XXH_VSX_BE */ -/* - * Performs an unaligned load and byte swaps it on big endian. +/*! + * Performs an unaligned vector load and byte swaps it on big endian. */ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { xxh_u64x2 ret; memcpy(&ret, ptr, sizeof(xxh_u64x2)); - #if XXH_VSX_BE + #if XXH_VSX_BE ret = XXH_vec_revb(ret); - #endif + #endif return ret; } - /* - * vec_mulo and vec_mule are very problematic intrinsics on PowerPC - * - * These intrinsics weren't added until GCC 8, despite existing for a - * while, and they are endian dependent. Also, their meaning swap - * depending on version. - * */ - #if defined(__s390x__) - /* s390x is always big endian, no issue on this platform */ - #define XXH_vec_mulo vec_mulo - #define XXH_vec_mule vec_mule - #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) - /* Clang has a better way to control this, we can just use the builtin - * which doesn't swap. */ - #define XXH_vec_mulo __builtin_altivec_vmulouw - #define XXH_vec_mule __builtin_altivec_vmuleuw - #else + /* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a + * while, and they are endian dependent. Also, their meaning swap + * depending on version. + * */ + #if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ + #define XXH_vec_mulo vec_mulo + #define XXH_vec_mule vec_mule + #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) + /* Clang has a better way to control this, we can just use the builtin + * which doesn't swap. */ + #define XXH_vec_mulo __builtin_altivec_vmulouw + #define XXH_vec_mule __builtin_altivec_vmuleuw + #else /* gcc needs inline assembly */ /* Adapted from * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ @@ -2887,40 +3511,41 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) { } - #endif /* XXH_vec_mulo, XXH_vec_mule */ - #endif /* XXH_VECTOR == XXH_VSX */ + #endif /* XXH_vec_mulo, XXH_vec_mule */ + #endif /* XXH_VECTOR == XXH_VSX */ - /* prefetch - * can be disabled, by declaring XXH_NO_PREFETCH build macro */ - #if defined(XXH_NO_PREFETCH) - #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ - #else - #if defined(_MSC_VER) && \ - (defined(_M_X64) || \ - defined( \ - _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ - #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ - #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0) - #elif defined(__GNUC__) && \ - ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) - #define XXH_PREFETCH(ptr) \ - __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) - #else + /* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ + #if defined(XXH_NO_PREFETCH) #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ - #endif - #endif /* XXH_NO_PREFETCH */ + #else + #if defined(_MSC_VER) && \ + (defined(_M_X64) || \ + defined( \ + _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ + #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ + #define XXH_PREFETCH(ptr) \ + _mm_prefetch((const char *)(ptr), _MM_HINT_T0) + #elif defined(__GNUC__) && \ + ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) + #define XXH_PREFETCH(ptr) \ + __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + #else + #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ + #endif + #endif /* XXH_NO_PREFETCH */ - /* ========================================== - * XXH3 default settings - * ========================================== */ + /* ========================================== + * XXH3 default settings + * ========================================== */ - #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ - #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) - #error "default keyset is not large enough" - #endif + #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) + #error "default keyset is not large enough" + #endif -/* Pseudorandom secret taken directly from FARSH */ +/*! Pseudorandom secret taken directly from FARSH. */ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { @@ -2943,69 +3568,79 @@ static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { }; - #ifdef XXH_OLD_NAMES - #define kSecret XXH3_kSecret - #endif + #ifdef XXH_OLD_NAMES + #define kSecret XXH3_kSecret + #endif - /* - * Calculates a 32-bit to 64-bit long multiply. - * - * Wraps __emulu on MSVC x86 because it tends to call __allmul when it - * doesn't need to (but it shouldn't need to anyways, it is about 7 - * instructions to do a 64x64 multiply...). Since we know that this will - * _always_ emit MULL, we use that instead of the normal method. - * - * If you are compiling for platforms like Thumb-1 and don't have a better - * option, you may also want to write your own long multiply routine here. - * - * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) - * { + #ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it + * doesn't need to (but it shouldn't need to anyways, it is about 7 instructions + * to do a 64x64 multiply...). Since we know that this will _always_ emit + * `MULL`, we use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better + * option, you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) { - * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); - * } - */ - #if defined(_MSC_VER) && defined(_M_IX86) - #include <intrin.h> - #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) - #else - /* - * Downcast + upcast is usually better than masking on older compilers - * like GCC 4.2 (especially 32-bit ones), all without affecting newer - * compilers. - * - * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both - * operands and perform a full 64x64 multiply -- entirely redundant on - * 32-bit. - */ - #define XXH_mult32to64(x, y) \ - ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) - #endif + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); -/* - * Calculates a 64->128-bit long multiply. +} + + #elif defined(_MSC_VER) && defined(_M_IX86) + #include <intrin.h> + #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) + #else + /* + * Downcast + upcast is usually better than masking on older compilers + * like GCC 4.2 (especially 32-bit ones), all without affecting newer + * compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both + * operands and perform a full 64x64 multiply -- entirely redundant on + * 32-bit. + */ + #define XXH_mult32to64(x, y) \ + ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) + #endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. * - * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + * @param lhs, rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. */ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { - /* - * GCC/Clang __uint128_t method. - * - * On most 64-bit targets, GCC and Clang define a __uint128_t type. - * This is usually the best way as it usually uses a native long 64-bit - * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. - * - * Usually. - * - * Despite being a 32-bit platform, Clang (and emscripten) define this type - * despite not having the arithmetic for it. This results in a laggy - * compiler builtin call which calculates a full 128-bit multiply. - * In that case it is best to use the portable one. - * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 - */ - #if defined(__GNUC__) && !defined(__wasm__) && \ - defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this + * type despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ + #if defined(__GNUC__) && !defined(__wasm__) && \ + defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; XXH128_hash_t r128; @@ -3013,19 +3648,19 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.high64 = (xxh_u64)(product >> 64); return r128; - /* - * MSVC for x64's _umul128 method. - * - * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 - * *HighProduct); - * - * This compiles to single operand MUL on x64. - */ - #elif defined(_M_X64) || defined(_M_IA64) + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 + * *HighProduct); + * + * This compiles to single operand MUL on x64. + */ + #elif defined(_M_X64) || defined(_M_IA64) - #ifndef _MSC_VER - #pragma intrinsic(_umul128) - #endif + #ifndef _MSC_VER + #pragma intrinsic(_umul128) + #endif xxh_u64 product_high; xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); XXH128_hash_t r128; @@ -3033,7 +3668,7 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.high64 = product_high; return r128; - #else + #else /* * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. * @@ -3093,16 +3728,20 @@ static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { r128.low64 = lower; r128.high64 = upper; return r128; - #endif + #endif } -/* - * Does a 64-bit to 128-bit multiply, then XOR folds it. +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. * * The reason for the separate function is to prevent passing too many structs * around by value. This will hopefully inline the multiply, but we don't force * it. + * + * @param lhs, rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() */ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { @@ -3111,7 +3750,7 @@ static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { } -/* Seems to produce slightly better code on GCC for some reason. */ +/*! Seems to produce slightly better code on GCC for some reason. */ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { XXH_ASSERT(0 <= shift && shift < 64); @@ -3216,7 +3855,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len, XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len < 8); + XXH_ASSERT(4 <= len && len <= 8); seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { @@ -3239,7 +3878,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input, XXH_ASSERT(input != NULL); XXH_ASSERT(secret != NULL); - XXH_ASSERT(8 <= len && len <= 16); + XXH_ASSERT(9 <= len && len <= 16); { xxh_u64 const bitflip1 = @@ -3306,11 +3945,10 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input, const xxh_u8 *XXH_RESTRICT secret, xxh_u64 seed64) { - #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ - && \ - !defined( \ - XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like \ + XXH32 hack */ /* * UGLY HACK: * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in @@ -3326,8 +3964,8 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input, * GCC generates much better scalar code than Clang for the rest of XXH3, * which is why finding a more optimal codepath is an interest. */ - __asm__("" : "+r"(seed64)); - #endif + XXH_COMPILER_GUARD(seed64); + #endif { xxh_u64 const input_lo = XXH_readLE64(input); @@ -3381,7 +4019,7 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b( } - #define XXH3_MIDSIZE_MAX 240 + #define XXH3_MIDSIZE_MAX 240 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( const xxh_u8 *XXH_RESTRICT input, size_t len, @@ -3391,8 +4029,8 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( (void)secretSize; XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 { @@ -3407,31 +4045,31 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( acc = XXH3_avalanche(acc); XXH_ASSERT(nbRounds >= 8); - #if defined(__clang__) /* Clang */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. - * In everywhere else, it uses scalar code. - * - * For 64->128-bit multiplies, even if the NEON was 100% optimal, it - * would still be slower than UMAAL (see XXH_mult64to128). - * - * Unfortunately, Clang doesn't handle the long multiplies properly and - * converts them to the nonexistent "vmulq_u64" intrinsic, which is then - * scalarized into an ugly mess of VMOV.32 instructions. - * - * This mess is difficult to avoid without turning autovectorization - * off completely, but they are usually relatively minor and/or not - * worth it to fix. - * - * This loop is the easiest to fix, as unlike XXH32, this pragma - * _actually works_ because it is a loop vectorization instead of an - * SLP vectorization. - */ - #pragma clang loop vectorize(disable) - #endif + #if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) + #endif for (i = 8; i < nbRounds; i++) { acc += @@ -3450,17 +4088,17 @@ XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( } - /* ======= Long Keys ======= */ + /* ======= Long Keys ======= */ - #define XXH_STRIPE_LEN 64 - #define XXH_SECRET_CONSUME_RATE \ - 8 /* nb of secret bytes consumed at each accumulation */ - #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + #define XXH_STRIPE_LEN 64 + #define XXH_SECRET_CONSUME_RATE \ + 8 /* nb of secret bytes consumed at each accumulation */ + #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) - #ifdef XXH_OLD_NAMES - #define STRIPE_LEN XXH_STRIPE_LEN - #define ACC_NB XXH_ACC_NB - #endif + #ifdef XXH_OLD_NAMES + #define STRIPE_LEN XXH_STRIPE_LEN + #define ACC_NB XXH_ACC_NB + #endif XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) { @@ -3469,56 +4107,58 @@ XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) { } - /* Several intrinsic functions below are supposed to accept __int64 as - * argument, as documented in - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However, - * several environments do not define __int64 type, requiring a workaround. - */ - #if !defined(__VMS) && \ - (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ - (__STDC_VERSION__ >= 199901L) /* C99 */)) + /* Several intrinsic functions below are supposed to accept __int64 as + * argument, as documented in + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ + #if !defined(__VMS) && \ + (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 199901L) /* C99 */)) typedef int64_t xxh_i64; - #else + #else /* the following type must have a width of 64-bit */ typedef long long xxh_i64; - #endif + #endif - /* - * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the - * most optimized. - * - * It is a hardened version of UMAC, based off of FARSH's implementation. - * - * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD - * implementations, and it is ridiculously fast. - * - * We harden it by mixing the original input to the accumulators as well as - * the product. - * - * This means that in the (relatively likely) case of a multiply by zero, the - * original input is preserved. - * - * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve - * cross-pollination, as otherwise the upper and lower halves would be - * essentially independent. - * - * This doesn't matter on 64-bit hashes since they all get merged together in - * the end, so we skip the extra step. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ + /* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the + * most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as + * the product. + * + * This means that in the (relatively likely) case of a multiply by zero, + * the original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together + * in the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ - #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH) + #if (XXH_VECTOR == XXH_AVX512) || \ + (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) - #ifndef XXH_TARGET_AVX512 - #define XXH_TARGET_AVX512 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_AVX512 + #define XXH_TARGET_AVX512 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, const void *XXH_RESTRICT secret) { - XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; + __m512i *const xacc = (__m512i *)acc; XXH_ASSERT((((size_t)acc) & 63) == 0); XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); @@ -3576,8 +4216,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512( XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); { - XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; - const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + __m512i *const xacc = (__m512i *)acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); /* xacc[0] ^= (xacc[0] >> 47) */ __m512i const acc_vec = *xacc; @@ -3609,19 +4249,21 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512( int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); __m512i const seed = _mm512_mask_set1_epi64( - _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64); + _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); - XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret; - XXH_ALIGN(64) __m512i *const dest = (__m512i *)customSecret; - int i; + const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret); + __m512i *const dest = (__m512i *)customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); for (i = 0; i < nbRounds; ++i) { /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void - * const*', this will warn "discards ‘const’ qualifier". */ + * const*', this will warn "discards 'const' qualifier". */ union { - XXH_ALIGN(64) const __m512i *cp; - XXH_ALIGN(64) void *p; + const __m512i *cp; + void * p; } remote_const_void; @@ -3635,13 +4277,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512( } - #endif + #endif - #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH) + #if (XXH_VECTOR == XXH_AVX2) || \ + (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) - #ifndef XXH_TARGET_AVX2 - #define XXH_TARGET_AVX2 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_AVX2 + #define XXH_TARGET_AVX2 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3650,7 +4293,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( XXH_ASSERT((((size_t)acc) & 31) == 0); { - XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc; + __m256i *const xacc = (__m256i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ @@ -3692,7 +4335,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2( XXH_ASSERT((((size_t)acc) & 31) == 0); { - XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc; + __m256i *const xacc = (__m256i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ const __m256i *const xsecret = (const __m256i *)secret; @@ -3732,24 +4375,23 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2( XXH_PREFETCH(customSecret); { - __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, - -(xxh_i64)seed64, (xxh_i64)seed64); + __m256i const seed = + _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, + (xxh_i64)(0U - seed64), (xxh_i64)seed64); - XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret; - XXH_ALIGN(64) __m256i * dest = (__m256i *)customSecret; + const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret); + __m256i * dest = (__m256i *)customSecret; - #if defined(__GNUC__) || defined(__clang__) + #if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack - * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with - * customSecret, and on aarch64, this prevented LDP from merging two - * loads together for free. Putting the loads together before the stores - * properly generates LDP. */ - __asm__("" : "+r"(dest)); - #endif + XXH_COMPILER_GUARD(dest); + #endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); /* GCC -O2 need unroll loop manually */ dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed); @@ -3763,13 +4405,14 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2( } - #endif + #endif - #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + /* x86dispatch always generates SSE2 */ + #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) - #ifndef XXH_TARGET_SSE2 - #define XXH_TARGET_SSE2 /* disable attribute target */ - #endif + #ifndef XXH_TARGET_SSE2 + #define XXH_TARGET_SSE2 /* disable attribute target */ + #endif XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3779,7 +4422,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc; + __m128i *const xacc = (__m128i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i *const xinput = (const __m128i *)input; @@ -3820,7 +4463,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc; + __m128i *const xacc = (__m128i *)acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i *const xsecret = (const __m128i *)secret; @@ -3859,30 +4502,34 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2( int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); - #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - // MSVC 32bit mode does not support _mm_set_epi64x before 2015 + #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ XXH_ALIGN(16) - const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64}; + const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)}; __m128i const seed = _mm_load_si128((__m128i const *)seed64x2); - #else - __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64); - #endif + #else + __m128i const seed = + _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); + #endif int i; - XXH_ALIGN(64) const float *const src = (float const *)XXH3_kSecret; - XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret; - #if defined(__GNUC__) || defined(__clang__) + const void *const src16 = XXH3_kSecret; + __m128i * dst16 = (__m128i *)customSecret; + #if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ - __asm__("" : "+r"(dest)); - #endif + XXH_COMPILER_GUARD(dst16); + #endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); for (i = 0; i < nbRounds; ++i) { - dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed); + dst16[i] = + _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed); } @@ -3890,9 +4537,9 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2( } - #endif + #endif - #if (XXH_VECTOR == XXH_NEON) + #if (XXH_VECTOR == XXH_NEON) XXH_FORCE_INLINE void XXH3_accumulate_512_neon( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -3901,7 +4548,7 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_neon( XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc; + uint64x2_t *const xacc = (uint64x2_t *)acc; /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ uint8_t const *const xinput = (const uint8_t *)input; @@ -3996,9 +4643,9 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT acc, } - #endif + #endif - #if (XXH_VECTOR == XXH_VSX) + #if (XXH_VECTOR == XXH_VSX) XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, @@ -4025,12 +4672,12 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc, xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); xacc[i] += product; - /* swap high and low halves */ - #ifdef __s390x__ + /* swap high and low halves */ + #ifdef __s390x__ xacc[i] += vec_permi(data_vec, data_vec, 2); - #else + #else xacc[i] += vec_xxpermdi(data_vec, data_vec, 2); - #endif + #endif } @@ -4075,7 +4722,7 @@ XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT acc, } - #endif + #endif /* scalar variants - universal */ @@ -4083,7 +4730,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar( void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, const void *XXH_RESTRICT secret) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ const xxh_u8 *const xinput = (const xxh_u8 *)input; /* no alignment restriction */ @@ -4105,7 +4751,6 @@ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar( XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ const xxh_u8 *const xsecret = (const xxh_u8 *)secret; /* no alignment restriction */ @@ -4135,7 +4780,7 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar( const xxh_u8 *kSecretPtr = XXH3_kSecret; XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - #if defined(__clang__) && defined(__aarch64__) + #if defined(__clang__) && defined(__aarch64__) /* * UGLY HACK: * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are @@ -4164,8 +4809,8 @@ XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar( * without hack: 2654.4 MB/s * with hack: 3202.9 MB/s */ - __asm__("" : "+r"(kSecretPtr)); - #endif + XXH_COMPILER_GUARD(kSecretPtr); + #endif /* * Note: in debug mode, this overrides the asm optimization * and Clang will emit MOVK chains again. @@ -4200,55 +4845,55 @@ typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *, typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *); typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64); - #if (XXH_VECTOR == XXH_AVX512) + #if (XXH_VECTOR == XXH_AVX512) - #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 - #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 - #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 + #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 + #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 - #elif (XXH_VECTOR == XXH_AVX2) + #elif (XXH_VECTOR == XXH_AVX2) - #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 - #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 - #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 + #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 + #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 - #elif (XXH_VECTOR == XXH_SSE2) + #elif (XXH_VECTOR == XXH_SSE2) - #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 - #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 - #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 + #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 + #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 - #elif (XXH_VECTOR == XXH_NEON) + #elif (XXH_VECTOR == XXH_NEON) - #define XXH3_accumulate_512 XXH3_accumulate_512_neon - #define XXH3_scrambleAcc XXH3_scrambleAcc_neon - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_neon + #define XXH3_scrambleAcc XXH3_scrambleAcc_neon + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #elif (XXH_VECTOR == XXH_VSX) + #elif (XXH_VECTOR == XXH_VSX) - #define XXH3_accumulate_512 XXH3_accumulate_512_vsx - #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_vsx + #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #else /* scalar */ + #else /* scalar */ - #define XXH3_accumulate_512 XXH3_accumulate_512_scalar - #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar - #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #define XXH3_accumulate_512 XXH3_accumulate_512_scalar + #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar + #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - #endif + #endif - #ifndef XXH_PREFETCH_DIST - #ifdef __clang__ - #define XXH_PREFETCH_DIST 320 - #else - #if (XXH_VECTOR == XXH_AVX512) - #define XXH_PREFETCH_DIST 512 + #ifndef XXH_PREFETCH_DIST + #ifdef __clang__ + #define XXH_PREFETCH_DIST 320 #else - #define XXH_PREFETCH_DIST 384 - #endif - #endif /* __clang__ */ - #endif /* XXH_PREFETCH_DIST */ + #if (XXH_VECTOR == XXH_AVX512) + #define XXH_PREFETCH_DIST 512 + #else + #define XXH_PREFETCH_DIST 384 + #endif + #endif /* __clang__ */ + #endif /* XXH_PREFETCH_DIST */ /* * XXH3_accumulate() @@ -4308,8 +4953,9 @@ XXH_FORCE_INLINE void XXH3_hashLong_internal_loop( { const xxh_u8 *const p = input + len - XXH_STRIPE_LEN; - #define XXH_SECRET_LASTACC_START \ - 7 /* not aligned on 8, last secret is different from acc & scrambler */ + #define XXH_SECRET_LASTACC_START \ + 7 /* not aligned on 8, last secret is different from acc & scrambler \ + */ f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); @@ -4337,10 +4983,10 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, for (i = 0; i < 4; i++) { result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i); - #if defined(__clang__) /* Clang */ \ - && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + #if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ /* * UGLY HACK: * Prevent autovectorization on Clang ARMv7-a. Exact same problem as @@ -4349,8 +4995,8 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, * without hack: 2063.7 MB/s * with hack: 2560.7 MB/s */ - __asm__("" : "+r"(result64)); - #endif + XXH_COMPILER_GUARD(result64); + #endif } @@ -4358,13 +5004,13 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, } - #define XXH3_INIT_ACC \ - { \ - \ - XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ - XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \ - \ - } + #define XXH3_INIT_ACC \ + { \ + \ + XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \ + \ + } XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal( const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret, @@ -4379,9 +5025,9 @@ XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal( /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); - /* do not align on 8, so that the secret is different from the accumulator - */ - #define XXH_SECRET_MERGEACCS_START 11 + /* do not align on 8, so that the secret is different from the accumulator + */ + #define XXH_SECRET_MERGEACCS_START 11 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); return XXH3_mergeAccs(acc, (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START, @@ -4501,6 +5147,7 @@ XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len, /* === Public entry point === */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) { return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), @@ -4508,6 +5155,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input, size_t len, const void *secret, @@ -4518,6 +5166,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len, XXH64_hash_t seed) { @@ -4603,6 +5252,7 @@ static void XXH_alignedFree(void *p) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) { XXH3_state_t *const state = @@ -4613,6 +5263,7 @@ XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) { XXH_alignedFree(statePtr); @@ -4620,6 +5271,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, const XXH3_state_t *src_state) { @@ -4627,9 +5279,8 @@ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, } -static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr, - XXH64_hash_t seed, const void *secret, - size_t secretSize) { +static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed, + const void *secret, size_t secretSize) { size_t const initStart = offsetof(XXH3_state_t, bufferedSize); size_t const initLength = @@ -4654,26 +5305,28 @@ static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, - XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret( XXH3_state_t *statePtr, const void *secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize); + XXH3_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, XXH64_hash_t seed) { @@ -4681,7 +5334,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, if (seed == 0) return XXH3_64bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } @@ -4733,12 +5386,12 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, XXH3_f_scrambleAcc f_scramble) { if (input == NULL) - #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ - (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) return XXH_OK; - #else + #else return XXH_ERROR; - #endif + #endif { @@ -4747,6 +5400,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, (state->extSecret == NULL) ? state->customSecret : state->extSecret; state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ @@ -4756,10 +5410,10 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, } - /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ - #define XXH3_INTERNALBUFFER_STRIPES \ - (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + #define XXH3_INTERNALBUFFER_STRIPES \ + (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ @@ -4783,7 +5437,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, XXH_ASSERT(input < bEnd); /* Consume input by a multiple of internal buffer size */ - if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) { + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; do { @@ -4814,6 +5468,7 @@ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state, const void *input, size_t len) { @@ -4859,6 +5514,7 @@ XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t * acc, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) { const unsigned char *const secret = @@ -4881,8 +5537,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) { } - #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH3_generateSecret(void * secretBuffer, const void *customSeed, size_t customSeedSize) { @@ -5398,6 +6055,7 @@ XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64, /* === Public XXH128 API === */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) { return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, @@ -5406,6 +6064,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) { } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input, size_t len, const void *secret, @@ -5416,6 +6075,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input, size_t len, XXH64_hash_t seed) { @@ -5426,6 +6086,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len, XXH64_hash_t seed) { @@ -5437,37 +6098,31 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len, /* * All the functions are actually the same as for 64-bit streaming variant. - * The only difference is the finalizatiom routine. + * The only difference is the finalization routine. */ -static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr, - XXH64_hash_t seed, const void *secret, - size_t secretSize) { - - XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); - -} - +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, - XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret( XXH3_state_t *statePtr, const void *secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize); + XXH3_reset_internal(statePtr, 0, secret, secretSize); if (secret == NULL) return XXH_ERROR; if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr, XXH64_hash_t seed) { @@ -5475,11 +6130,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr, if (seed == 0) return XXH3_128bits_reset(statePtr); if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state, const void * input, size_t len) { @@ -5489,6 +6145,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) { const unsigned char *const secret = @@ -5524,11 +6181,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) { } - /* 128-bit utility functions */ + /* 128-bit utility functions */ - #include <string.h> /* memcmp, memcpy */ + #include <string.h> /* memcmp, memcpy */ /* return : 1 is equal, 0 if different */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ @@ -5540,6 +6198,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { * return : >0 if *h128_1 > *h128_2 * <0 if *h128_1 < *h128_2 * =0 if *h128_1 == *h128_2 */ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) { XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1; @@ -5552,6 +6211,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) { } /*====== Canonical representation ======*/ +/*! @ingroup xxh3_family */ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst, XXH128_hash_t hash) { @@ -5568,6 +6228,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst, } +/*! @ingroup xxh3_family */ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t *src) { @@ -5578,16 +6239,21 @@ XXH128_hashFromCanonical(const XXH128_canonical_t *src) { } - /* Pop our optimization override from above */ - #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && \ - !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ - #pragma GCC pop_options - #endif + /* Pop our optimization override from above */ + #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && \ + !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + #pragma GCC pop_options + #endif - #endif /* XXH_NO_LONG_LONG */ + #endif /* XXH_NO_LONG_LONG */ + + #endif /* XXH_NO_XXH3 */ +/*! + * @} + */ #endif /* XXH_IMPLEMENTATION */ #if defined(__cplusplus) diff --git a/instrumentation/afl-compiler-rt.o.c b/instrumentation/afl-compiler-rt.o.c index b2802a29..91c690c0 100644 --- a/instrumentation/afl-compiler-rt.o.c +++ b/instrumentation/afl-compiler-rt.o.c @@ -22,6 +22,10 @@ #include "cmplog.h" #include "llvm-alternative-coverage.h" +#define XXH_INLINE_ALL +#include "xxhash.h" +#undef XXH_INLINE_ALL + #include <stdio.h> #include <stdlib.h> #include <signal.h> @@ -154,6 +158,8 @@ static void at_exit(int signal) { } +#define default_hash(a, b) XXH3_64bits(a, b) + /* Uninspired gcc plugin instrumentation */ void __afl_trace(const u32 x) { @@ -669,7 +675,7 @@ static void __afl_start_snapshots(void) { /* Phone home and tell the parent that we're OK. If parent isn't there, assume we're not running in forkserver mode and just execute program. */ - status |= (FS_OPT_ENABLED | FS_OPT_SNAPSHOT); + status |= (FS_OPT_ENABLED | FS_OPT_SNAPSHOT | FS_OPT_NEWCMPLOG); if (__afl_sharedmem_fuzzing != 0) status |= FS_OPT_SHDMEM_FUZZ; if (__afl_map_size <= FS_OPT_MAX_MAPSIZE) status |= (FS_OPT_SET_MAPSIZE(__afl_map_size) | FS_OPT_MAPSIZE); @@ -935,7 +941,12 @@ static void __afl_start_forkserver(void) { } if (__afl_sharedmem_fuzzing != 0) { status_for_fsrv |= FS_OPT_SHDMEM_FUZZ; } - if (status_for_fsrv) { status_for_fsrv |= (FS_OPT_ENABLED); } + if (status_for_fsrv) { + + status_for_fsrv |= (FS_OPT_ENABLED | FS_OPT_NEWCMPLOG); + + } + memcpy(tmp, &status_for_fsrv, 4); /* Phone home and tell the parent that we're OK. If parent isn't there, @@ -1499,8 +1510,7 @@ void __cmplog_ins_hook1(uint8_t arg1, uint8_t arg2, uint8_t attr) { if (unlikely(!__afl_cmp_map || arg1 == arg2)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1530,8 +1540,7 @@ void __cmplog_ins_hook2(uint16_t arg1, uint16_t arg2, uint8_t attr) { if (unlikely(!__afl_cmp_map || arg1 == arg2)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1569,8 +1578,7 @@ void __cmplog_ins_hook4(uint32_t arg1, uint32_t arg2, uint8_t attr) { if (unlikely(!__afl_cmp_map || arg1 == arg2)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1608,8 +1616,7 @@ void __cmplog_ins_hook8(uint64_t arg1, uint64_t arg2, uint8_t attr) { if (unlikely(!__afl_cmp_map || arg1 == arg2)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1652,8 +1659,7 @@ void __cmplog_ins_hookN(uint128_t arg1, uint128_t arg2, uint8_t attr, if (unlikely(!__afl_cmp_map || arg1 == arg2)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1696,8 +1702,7 @@ void __cmplog_ins_hook16(uint128_t arg1, uint128_t arg2, uint8_t attr) { if (likely(!__afl_cmp_map)) return; uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1802,8 +1807,8 @@ void __sanitizer_cov_trace_switch(uint64_t val, uint64_t *cases) { for (uint64_t i = 0; i < cases[0]; i++) { uintptr_t k = (uintptr_t)__builtin_return_address(0) + i; - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & + (CMP_MAP_W - 1)); u32 hits; @@ -1880,6 +1885,159 @@ static int area_is_valid(void *ptr, size_t len) { } +void __cmplog_rtn_hook_n(u8 *ptr1, u8 *ptr2, u64 len) { + + /* + u32 i; + if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return; + fprintf(stderr, "rtn_n len=%u arg0=", len); + for (i = 0; i < len; i++) + fprintf(stderr, "%02x", ptr1[i]); + fprintf(stderr, " arg1="); + for (i = 0; i < len; i++) + fprintf(stderr, "%02x", ptr2[i]); + fprintf(stderr, "\n"); + */ + + if (likely(!__afl_cmp_map)) return; + // fprintf(stderr, "RTN1 %p %p %u\n", ptr1, ptr2, len); + if (unlikely(!len)) return; + int l = MIN(31, len); + + // fprintf(stderr, "RTN2 %u\n", l); + uintptr_t k = (uintptr_t)__builtin_return_address(0); + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); + + u32 hits; + + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 1; + __afl_cmp_map->headers[k].shape = l - 1; + hits = 0; + + } else { + + hits = __afl_cmp_map->headers[k].hits++; + + if (__afl_cmp_map->headers[k].shape < l) { + + __afl_cmp_map->headers[k].shape = l - 1; + + } + + } + + struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k]; + hits &= CMP_MAP_RTN_H - 1; + + cmpfn[hits].v0_len = l; + cmpfn[hits].v1_len = l; + __builtin_memcpy(cmpfn[hits].v0, ptr1, l); + __builtin_memcpy(cmpfn[hits].v1, ptr2, l); + // fprintf(stderr, "RTN3\n"); + +} + +void __cmplog_rtn_hook_strn(u8 *ptr1, u8 *ptr2, u64 len) { + + /* + if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return; + fprintf(stderr, "rtn_strn len=%u arg0=%s arg1=%s\n", len, ptr1, ptr2); + */ + + if (likely(!__afl_cmp_map)) return; + // fprintf(stderr, "RTN1 %p %p %u\n", ptr1, ptr2, len); + if (unlikely(!len)) return; + int l = MIN(31, len + 1); + + // fprintf(stderr, "RTN2 %u\n", l); + uintptr_t k = (uintptr_t)__builtin_return_address(0); + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); + + u32 hits; + + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 1; + __afl_cmp_map->headers[k].shape = l - 1; + hits = 0; + + } else { + + hits = __afl_cmp_map->headers[k].hits++; + + if (__afl_cmp_map->headers[k].shape < l) { + + __afl_cmp_map->headers[k].shape = l - 1; + + } + + } + + struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k]; + hits &= CMP_MAP_RTN_H - 1; + + cmpfn[hits].v0_len = 0x80 + l; + cmpfn[hits].v1_len = 0x80 + l; + __builtin_memcpy(cmpfn[hits].v0, ptr1, l); + __builtin_memcpy(cmpfn[hits].v1, ptr2, l); + // fprintf(stderr, "RTN3\n"); + +} + +void __cmplog_rtn_hook_str(u8 *ptr1, u8 *ptr2) { + + /* + if (area_is_valid(ptr1, 32) <= 0 || area_is_valid(ptr2, 32) <= 0) return; + fprintf(stderr, "rtn_str arg0=%s arg1=%s\n", ptr1, ptr2); + */ + + if (likely(!__afl_cmp_map)) return; + // fprintf(stderr, "RTN1 %p %p\n", ptr1, ptr2); + if (unlikely(!ptr1 || !ptr2)) return; + int len1 = MIN(31, strlen(ptr1) + 1); + int len2 = MIN(31, strlen(ptr2) + 1); + int l = MIN(MAX(len1, len2), 31); + + // fprintf(stderr, "RTN2 %u\n", l); + uintptr_t k = (uintptr_t)__builtin_return_address(0); + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); + + u32 hits; + + if (__afl_cmp_map->headers[k].type != CMP_TYPE_RTN) { + + __afl_cmp_map->headers[k].type = CMP_TYPE_RTN; + __afl_cmp_map->headers[k].hits = 1; + __afl_cmp_map->headers[k].shape = l - 1; + hits = 0; + + } else { + + hits = __afl_cmp_map->headers[k].hits++; + + if (__afl_cmp_map->headers[k].shape < l) { + + __afl_cmp_map->headers[k].shape = l - 1; + + } + + } + + struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k]; + hits &= CMP_MAP_RTN_H - 1; + + cmpfn[hits].v0_len = 0x80 + len1; + cmpfn[hits].v1_len = 0x80 + len2; + __builtin_memcpy(cmpfn[hits].v0, ptr1, len1); + __builtin_memcpy(cmpfn[hits].v1, ptr2, len2); + // fprintf(stderr, "RTN3\n"); + +} + void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) { /* @@ -1900,12 +2058,11 @@ void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) { if ((l1 = area_is_valid(ptr1, 32)) <= 0 || (l2 = area_is_valid(ptr2, 32)) <= 0) return; - int len = MIN(l1, l2); + int len = MIN(31, MIN(l1, l2)); // fprintf(stderr, "RTN2 %u\n", len); uintptr_t k = (uintptr_t)__builtin_return_address(0); - k = (k >> 4) ^ (k << 8); - k &= CMP_MAP_W - 1; + k = (uintptr_t)(default_hash((u8 *)&k, sizeof(uintptr_t)) & (CMP_MAP_W - 1)); u32 hits; @@ -1928,11 +2085,13 @@ void __cmplog_rtn_hook(u8 *ptr1, u8 *ptr2) { } + struct cmpfn_operands *cmpfn = (struct cmpfn_operands *)__afl_cmp_map->log[k]; hits &= CMP_MAP_RTN_H - 1; - __builtin_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v0, - ptr1, len); - __builtin_memcpy(((struct cmpfn_operands *)__afl_cmp_map->log[k])[hits].v1, - ptr2, len); + + cmpfn[hits].v0_len = len; + cmpfn[hits].v1_len = len; + __builtin_memcpy(cmpfn[hits].v0, ptr1, len); + __builtin_memcpy(cmpfn[hits].v1, ptr2, len); // fprintf(stderr, "RTN3\n"); } diff --git a/instrumentation/cmplog-routines-pass.cc b/instrumentation/cmplog-routines-pass.cc index 01b7a373..fb514edc 100644 --- a/instrumentation/cmplog-routines-pass.cc +++ b/instrumentation/cmplog-routines-pass.cc @@ -87,12 +87,14 @@ char CmpLogRoutines::ID = 0; bool CmpLogRoutines::hookRtns(Module &M) { - std::vector<CallInst *> calls, llvmStdStd, llvmStdC, gccStdStd, gccStdC; - LLVMContext & C = M.getContext(); + std::vector<CallInst *> calls, llvmStdStd, llvmStdC, gccStdStd, gccStdC, + Memcmp, Strcmp, Strncmp; + LLVMContext &C = M.getContext(); Type *VoidTy = Type::getVoidTy(C); // PointerType *VoidPtrTy = PointerType::get(VoidTy, 0); IntegerType *Int8Ty = IntegerType::getInt8Ty(C); + IntegerType *Int64Ty = IntegerType::getInt64Ty(C); PointerType *i8PtrTy = PointerType::get(Int8Ty, 0); #if LLVM_VERSION_MAJOR < 9 @@ -184,6 +186,60 @@ bool CmpLogRoutines::hookRtns(Module &M) { FunctionCallee cmplogGccStdC = c4; #endif +#if LLVM_VERSION_MAJOR < 9 + Constant * +#else + FunctionCallee +#endif + c5 = M.getOrInsertFunction("__cmplog_rtn_hook_n", VoidTy, i8PtrTy, + i8PtrTy, Int64Ty +#if LLVM_VERSION_MAJOR < 5 + , + NULL +#endif + ); +#if LLVM_VERSION_MAJOR < 9 + Function *cmplogHookFnN = cast<Function>(c5); +#else + FunctionCallee cmplogHookFnN = c5; +#endif + +#if LLVM_VERSION_MAJOR < 9 + Constant * +#else + FunctionCallee +#endif + c6 = M.getOrInsertFunction("__cmplog_rtn_hook_strn", VoidTy, i8PtrTy, + i8PtrTy, Int64Ty +#if LLVM_VERSION_MAJOR < 5 + , + NULL +#endif + ); +#if LLVM_VERSION_MAJOR < 9 + Function *cmplogHookFnStrN = cast<Function>(c6); +#else + FunctionCallee cmplogHookFnStrN = c6; +#endif + +#if LLVM_VERSION_MAJOR < 9 + Constant * +#else + FunctionCallee +#endif + c7 = M.getOrInsertFunction("__cmplog_rtn_hook_str", VoidTy, i8PtrTy, + i8PtrTy +#if LLVM_VERSION_MAJOR < 5 + , + NULL +#endif + ); +#if LLVM_VERSION_MAJOR < 9 + Function *cmplogHookFnStr = cast<Function>(c7); +#else + FunctionCallee cmplogHookFnStr = c7; +#endif + GlobalVariable *AFLCmplogPtr = M.getNamedGlobal("__afl_cmp_map"); if (!AFLCmplogPtr) { @@ -214,12 +270,93 @@ bool CmpLogRoutines::hookRtns(Module &M) { if (callInst->getCallingConv() != llvm::CallingConv::C) continue; FunctionType *FT = Callee->getFunctionType(); + std::string FuncName = Callee->getName().str(); bool isPtrRtn = FT->getNumParams() >= 2 && !FT->getReturnType()->isVoidTy() && FT->getParamType(0) == FT->getParamType(1) && FT->getParamType(0)->isPointerTy(); + bool isPtrRtnN = FT->getNumParams() >= 3 && + !FT->getReturnType()->isVoidTy() && + FT->getParamType(0) == FT->getParamType(1) && + FT->getParamType(0)->isPointerTy() && + FT->getParamType(2)->isIntegerTy(); + if (isPtrRtnN) { + + auto intTyOp = + dyn_cast<IntegerType>(callInst->getArgOperand(2)->getType()); + if (intTyOp) { + + if (intTyOp->getBitWidth() != 32 && + intTyOp->getBitWidth() != 64) { + + isPtrRtnN = false; + + } + + } + + } + + bool isMemcmp = + (!FuncName.compare("memcmp") || !FuncName.compare("bcmp") || + !FuncName.compare("CRYPTO_memcmp") || + !FuncName.compare("OPENSSL_memcmp") || + !FuncName.compare("memcmp_const_time") || + !FuncName.compare("memcmpct")); + isMemcmp &= FT->getNumParams() == 3 && + FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0)->isPointerTy() && + FT->getParamType(1)->isPointerTy() && + FT->getParamType(2)->isIntegerTy(); + + bool isStrcmp = + (!FuncName.compare("strcmp") || !FuncName.compare("xmlStrcmp") || + !FuncName.compare("xmlStrEqual") || + !FuncName.compare("g_strcmp0") || + !FuncName.compare("curl_strequal") || + !FuncName.compare("strcsequal") || + !FuncName.compare("strcasecmp") || + !FuncName.compare("stricmp") || + !FuncName.compare("ap_cstr_casecmp") || + !FuncName.compare("OPENSSL_strcasecmp") || + !FuncName.compare("xmlStrcasecmp") || + !FuncName.compare("g_strcasecmp") || + !FuncName.compare("g_ascii_strcasecmp") || + !FuncName.compare("Curl_strcasecompare") || + !FuncName.compare("Curl_safe_strcasecompare") || + !FuncName.compare("cmsstrcasecmp") || + !FuncName.compare("strstr") || + !FuncName.compare("g_strstr_len") || + !FuncName.compare("ap_strcasestr") || + !FuncName.compare("xmlStrstr") || + !FuncName.compare("xmlStrcasestr") || + !FuncName.compare("g_str_has_prefix") || + !FuncName.compare("g_str_has_suffix")); + isStrcmp &= + FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0) == FT->getParamType(1) && + FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext()); + + bool isStrncmp = (!FuncName.compare("strncmp") || + !FuncName.compare("xmlStrncmp") || + !FuncName.compare("curl_strnequal") || + !FuncName.compare("strncasecmp") || + !FuncName.compare("strnicmp") || + !FuncName.compare("ap_cstr_casecmpn") || + !FuncName.compare("OPENSSL_strncasecmp") || + !FuncName.compare("xmlStrncasecmp") || + !FuncName.compare("g_ascii_strncasecmp") || + !FuncName.compare("Curl_strncasecompare") || + !FuncName.compare("g_strncasecmp")); + isStrncmp &= FT->getNumParams() == 3 && + FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0) == FT->getParamType(1) && + FT->getParamType(0) == + IntegerType::getInt8PtrTy(M.getContext()) && + FT->getParamType(2)->isIntegerTy(); + bool isGccStdStringStdString = Callee->getName().find("__is_charIT_EE7__value") != std::string::npos && @@ -267,13 +404,19 @@ bool CmpLogRoutines::hookRtns(Module &M) { */ if (isGccStdStringCString || isGccStdStringStdString || - isLlvmStdStringStdString || isLlvmStdStringCString) { + isLlvmStdStringStdString || isLlvmStdStringCString || isMemcmp || + isStrcmp || isStrncmp) { - isPtrRtn = false; + isPtrRtnN = isPtrRtn = false; } + if (isPtrRtnN) { isPtrRtn = false; } + if (isPtrRtn) { calls.push_back(callInst); } + if (isMemcmp || isPtrRtnN) { Memcmp.push_back(callInst); } + if (isStrcmp) { Strcmp.push_back(callInst); } + if (isStrncmp) { Strncmp.push_back(callInst); } if (isGccStdStringStdString) { gccStdStd.push_back(callInst); } if (isGccStdStringCString) { gccStdC.push_back(callInst); } if (isLlvmStdStringStdString) { llvmStdStd.push_back(callInst); } @@ -288,7 +431,8 @@ bool CmpLogRoutines::hookRtns(Module &M) { } if (!calls.size() && !gccStdStd.size() && !gccStdC.size() && - !llvmStdStd.size() && !llvmStdC.size()) + !llvmStdStd.size() && !llvmStdC.size() && !Memcmp.size() && + Strcmp.size() && Strncmp.size()) return false; /* @@ -323,6 +467,96 @@ bool CmpLogRoutines::hookRtns(Module &M) { } + for (auto &callInst : Memcmp) { + + Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1), + *v3P = callInst->getArgOperand(2); + + IRBuilder<> IRB2(callInst->getParent()); + IRB2.SetInsertPoint(callInst); + + LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr); + CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None)); + auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null); + auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false); + + IRBuilder<> IRB(ThenTerm); + + std::vector<Value *> args; + Value * v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy); + Value * v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy); + Value * v3Pbitcast = IRB.CreateBitCast( + v3P, IntegerType::get(C, v3P->getType()->getPrimitiveSizeInBits())); + Value *v3Pcasted = + IRB.CreateIntCast(v3Pbitcast, IntegerType::get(C, 64), false); + args.push_back(v1Pcasted); + args.push_back(v2Pcasted); + args.push_back(v3Pcasted); + + IRB.CreateCall(cmplogHookFnN, args); + + // errs() << callInst->getCalledFunction()->getName() << "\n"; + + } + + for (auto &callInst : Strcmp) { + + Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1); + + IRBuilder<> IRB2(callInst->getParent()); + IRB2.SetInsertPoint(callInst); + + LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr); + CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None)); + auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null); + auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false); + + IRBuilder<> IRB(ThenTerm); + + std::vector<Value *> args; + Value * v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy); + Value * v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy); + args.push_back(v1Pcasted); + args.push_back(v2Pcasted); + + IRB.CreateCall(cmplogHookFnStr, args); + + // errs() << callInst->getCalledFunction()->getName() << "\n"; + + } + + for (auto &callInst : Strncmp) { + + Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1), + *v3P = callInst->getArgOperand(2); + + IRBuilder<> IRB2(callInst->getParent()); + IRB2.SetInsertPoint(callInst); + + LoadInst *CmpPtr = IRB2.CreateLoad(AFLCmplogPtr); + CmpPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None)); + auto is_not_null = IRB2.CreateICmpNE(CmpPtr, Null); + auto ThenTerm = SplitBlockAndInsertIfThen(is_not_null, callInst, false); + + IRBuilder<> IRB(ThenTerm); + + std::vector<Value *> args; + Value * v1Pcasted = IRB.CreatePointerCast(v1P, i8PtrTy); + Value * v2Pcasted = IRB.CreatePointerCast(v2P, i8PtrTy); + Value * v3Pbitcast = IRB.CreateBitCast( + v3P, IntegerType::get(C, v3P->getType()->getPrimitiveSizeInBits())); + Value *v3Pcasted = + IRB.CreateIntCast(v3Pbitcast, IntegerType::get(C, 64), false); + args.push_back(v1Pcasted); + args.push_back(v2Pcasted); + args.push_back(v3Pcasted); + + IRB.CreateCall(cmplogHookFnStrN, args); + + // errs() << callInst->getCalledFunction()->getName() << "\n"; + + } + for (auto &callInst : gccStdStd) { Value *v1P = callInst->getArgOperand(0), *v2P = callInst->getArgOperand(1); diff --git a/qemu_mode/QEMUAFL_VERSION b/qemu_mode/QEMUAFL_VERSION index 7bdedf7b..0ffae35c 100644 --- a/qemu_mode/QEMUAFL_VERSION +++ b/qemu_mode/QEMUAFL_VERSION @@ -1 +1 @@ -71ed0d206f +8809a2b2ebf089d3427dd8f6a0044bcc2e13b389 diff --git a/qemu_mode/qemuafl b/qemu_mode/qemuafl -Subproject 71ed0d206fd3d877420dceb4993a1011a4637ae +Subproject 8809a2b2ebf089d3427dd8f6a0044bcc2e13b38 diff --git a/src/afl-forkserver.c b/src/afl-forkserver.c index 44b6c6f9..6320a26b 100644 --- a/src/afl-forkserver.c +++ b/src/afl-forkserver.c @@ -342,6 +342,16 @@ static void report_error_and_exit(int error) { "the fuzzing target reports that the mmap() call to the shared " "memory failed."); break; + case FS_ERROR_OLD_CMPLOG: + FATAL( + "the -c cmplog target was instrumented with an too old afl++ " + "version, you need to recompile it."); + break; + case FS_ERROR_OLD_CMPLOG_QEMU: + FATAL( + "The AFL++ QEMU/FRIDA loaders are from an older version, for -c you " + "need to recompile it.\n"); + break; default: FATAL("unknown error code %d from fuzzing target!", error); @@ -663,6 +673,20 @@ void afl_fsrv_start(afl_forkserver_t *fsrv, char **argv, if ((status & FS_OPT_OLD_AFLPP_WORKAROUND) == FS_OPT_OLD_AFLPP_WORKAROUND) status = (status & 0xf0ffffff); + if ((status & FS_OPT_NEWCMPLOG) == 0 && fsrv->cmplog_binary) { + + if (fsrv->qemu_mode || fsrv->frida_mode) { + + report_error_and_exit(FS_ERROR_OLD_CMPLOG_QEMU); + + } else { + + report_error_and_exit(FS_ERROR_OLD_CMPLOG); + + } + + } + if ((status & FS_OPT_SNAPSHOT) == FS_OPT_SNAPSHOT) { fsrv->snapshot = 1; diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c index a1134a22..f4d3b77f 100644 --- a/src/afl-fuzz-one.c +++ b/src/afl-fuzz-one.c @@ -448,11 +448,11 @@ u8 fuzz_one_original(afl_state_t *afl) { ACTF( "Fuzzing test case #%u (%u total, %llu uniq crashes found, " - "perf_score=%0.0f, exec_us=%llu, hits=%u, map=%u)...", + "perf_score=%0.0f, exec_us=%llu, hits=%u, map=%u, ascii=%u)...", afl->current_entry, afl->queued_paths, afl->unique_crashes, afl->queue_cur->perf_score, afl->queue_cur->exec_us, likely(afl->n_fuzz) ? afl->n_fuzz[afl->queue_cur->n_fuzz_entry] : 0, - afl->queue_cur->bitmap_size); + afl->queue_cur->bitmap_size, afl->queue_cur->is_ascii); fflush(stdout); } @@ -2003,11 +2003,16 @@ havoc_stage: where we take the input file and make random stacked tweaks. */ #define MAX_HAVOC_ENTRY 59 /* 55 to 60 */ +#define MUTATE_ASCII_DICT 64 u32 r_max, r; r_max = (MAX_HAVOC_ENTRY + 1) + (afl->extras_cnt ? 4 : 0) + - (afl->a_extras_cnt ? 4 : 0); + (afl->a_extras_cnt + ? (unlikely(afl->cmplog_binary && afl->queue_cur->is_ascii) + ? MUTATE_ASCII_DICT + : 4) + : 0); if (unlikely(afl->expand_havoc && afl->ready_for_splicing_count > 1)) { @@ -2592,7 +2597,15 @@ havoc_stage: if (afl->a_extras_cnt) { - if (r < 2) { + u32 r_cmp = 2; + + if (unlikely(afl->cmplog_binary && afl->queue_cur->is_ascii)) { + + r_cmp = MUTATE_ASCII_DICT >> 1; + + } + + if (r < r_cmp) { /* Use the dictionary. */ @@ -2612,7 +2625,7 @@ havoc_stage: break; - } else if (r < 4) { + } else if (r < (r_cmp << 1)) { u32 use_extra = rand_below(afl, afl->a_extras_cnt); u32 extra_len = afl->a_extras[use_extra].len; @@ -2641,7 +2654,7 @@ havoc_stage: } else { - r -= 4; + r -= (r_cmp << 1); } diff --git a/src/afl-fuzz-queue.c b/src/afl-fuzz-queue.c index 33c2b561..1523d556 100644 --- a/src/afl-fuzz-queue.c +++ b/src/afl-fuzz-queue.c @@ -315,7 +315,96 @@ void mark_as_redundant(afl_state_t *afl, struct queue_entry *q, u8 state) { } -/* check if ascii or UTF-8 */ +/* check if pointer is ascii or UTF-8 */ + +u8 check_if_text_buf(u8 *buf, u32 len) { + + u32 offset = 0, ascii = 0, utf8 = 0; + + while (offset < len) { + + // ASCII: <= 0x7F to allow ASCII control characters + if ((buf[offset + 0] == 0x09 || buf[offset + 0] == 0x0A || + buf[offset + 0] == 0x0D || + (0x20 <= buf[offset + 0] && buf[offset + 0] <= 0x7E))) { + + offset++; + utf8++; + ascii++; + continue; + + } + + if (isascii((int)buf[offset]) || isprint((int)buf[offset])) { + + ascii++; + // we continue though as it can also be a valid utf8 + + } + + // non-overlong 2-byte + if (len - offset > 1 && + ((0xC2 <= buf[offset + 0] && buf[offset + 0] <= 0xDF) && + (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF))) { + + offset += 2; + utf8++; + continue; + + } + + // excluding overlongs + if ((len - offset > 2) && + ((buf[offset + 0] == 0xE0 && + (0xA0 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) && + (0x80 <= buf[offset + 2] && + buf[offset + 2] <= 0xBF)) || // straight 3-byte + (((0xE1 <= buf[offset + 0] && buf[offset + 0] <= 0xEC) || + buf[offset + 0] == 0xEE || buf[offset + 0] == 0xEF) && + (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) && + (0x80 <= buf[offset + 2] && + buf[offset + 2] <= 0xBF)) || // excluding surrogates + (buf[offset + 0] == 0xED && + (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0x9F) && + (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF)))) { + + offset += 3; + utf8++; + continue; + + } + + // planes 1-3 + if ((len - offset > 3) && + ((buf[offset + 0] == 0xF0 && + (0x90 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) && + (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) && + (0x80 <= buf[offset + 3] && + buf[offset + 3] <= 0xBF)) || // planes 4-15 + ((0xF1 <= buf[offset + 0] && buf[offset + 0] <= 0xF3) && + (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0xBF) && + (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) && + (0x80 <= buf[offset + 3] && buf[offset + 3] <= 0xBF)) || // plane 16 + (buf[offset + 0] == 0xF4 && + (0x80 <= buf[offset + 1] && buf[offset + 1] <= 0x8F) && + (0x80 <= buf[offset + 2] && buf[offset + 2] <= 0xBF) && + (0x80 <= buf[offset + 3] && buf[offset + 3] <= 0xBF)))) { + + offset += 4; + utf8++; + continue; + + } + + offset++; + + } + + return (utf8 > ascii ? utf8 : ascii); + +} + +/* check if queue entry is ascii or UTF-8 */ static u8 check_if_text(afl_state_t *afl, struct queue_entry *q) { diff --git a/src/afl-fuzz-redqueen.c b/src/afl-fuzz-redqueen.c index a1d6e021..0a6e5eee 100644 --- a/src/afl-fuzz-redqueen.c +++ b/src/afl-fuzz-redqueen.c @@ -45,6 +45,23 @@ enum { }; +// add to dictionary enum +// DEFAULT = 1, notTXT = 2, FOUND = 4, notSAME = 8 +enum { + + DICT_ADD_NEVER = 0, + DICT_ADD_NOTFOUND_SAME_TXT = 1, + DICT_ADD_NOTFOUND_SAME = 3, + DICT_ADD_FOUND_SAME_TXT = 5, + DICT_ADD_FOUND_SAME = 7, + DICT_ADD_NOTFOUND_TXT = 9, + DICT_ADD_NOTFOUND = 11, + DICT_ADD_FOUND_TXT = 13, + DICT_ADD_FOUND = 15, + DICT_ADD_ANY = DICT_ADD_FOUND + +}; + // CMPLOG LVL enum { @@ -54,6 +71,8 @@ enum { }; +#define DICT_ADD_STRATEGY DICT_ADD_FOUND_SAME + struct range { u32 start; @@ -64,6 +83,10 @@ struct range { }; +static u32 hshape; +static u64 screen_update; +static u64 last_update; + static struct range *add_range(struct range *ranges, u32 start, u32 end) { struct range *r = ck_alloc_nozero(sizeof(struct range)); @@ -252,7 +275,6 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len, u64 start_time = get_cur_time(); #endif - u32 screen_update; u64 orig_hit_cnt, new_hit_cnt, exec_cksum; orig_hit_cnt = afl->queued_paths + afl->unique_crashes; @@ -261,24 +283,6 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len, afl->stage_max = (len << 1); afl->stage_cur = 0; - if (likely(afl->queue_cur->exec_us)) { - - if (likely((100000 / 2) >= afl->queue_cur->exec_us)) { - - screen_update = 100000 / afl->queue_cur->exec_us; - - } else { - - screen_update = 1; - - } - - } else { - - screen_update = 100000; - - } - // in colorization we do not classify counts, hence we have to calculate // the original checksum. if (unlikely(get_exec_checksum(afl, buf, len, &exec_cksum))) { @@ -348,7 +352,7 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len, } - if (++afl->stage_cur % screen_update) { show_stats(afl); }; + if (++afl->stage_cur % screen_update == 0) { show_stats(afl); }; } @@ -440,10 +444,10 @@ static u8 colorization(afl_state_t *afl, u8 *buf, u32 len, fprintf( f, "Colorization: fname=%s len=%u ms=%llu result=%u execs=%u found=%llu " - "taint=%u\n", + "taint=%u ascii=%u auto_extra_before=%u\n", afl->queue_cur->fname, len, get_cur_time() - start_time, afl->queue_cur->colorized, afl->stage_cur, new_hit_cnt - orig_hit_cnt, - positions); + positions, afl->queue_cur->is_ascii ? 1 : 0, afl->a_extras_cnt); #ifndef _DEBUG if (afl->not_on_tty) { fclose(f); } @@ -759,11 +763,18 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, u32 its_len = MIN(len - idx, taint_len); + if (afl->fsrv.total_execs - last_update > screen_update) { + + show_stats(afl); + last_update = afl->fsrv.total_execs; + + } + // fprintf(stderr, // "Encode: %llx->%llx into %llx(<-%llx) at idx=%u " // "taint_len=%u shape=%u attr=%u\n", // o_pattern, pattern, repl, changed_val, idx, taint_len, - // h->shape + 1, attr); + // hshape, attr); //#ifdef CMPLOG_SOLVE_TRANSFORM // reverse atoi()/strnu?toll() is expensive, so we only to it in lvl 3 @@ -845,7 +856,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, u64 b_val, o_b_val, mask; u8 bytes; - switch (SHAPE_BYTES(h->shape)) { + switch (hshape) { case 0: case 1: @@ -924,7 +935,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, s64 diff = pattern - b_val; s64 o_diff = o_pattern - o_b_val; /* fprintf(stderr, "DIFF1 idx=%03u shape=%02u %llx-%llx=%lx\n", idx, - h->shape + 1, o_pattern, o_b_val, o_diff); + hshape, o_pattern, o_b_val, o_diff); fprintf(stderr, "DIFF1 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff); */ if (diff == o_diff && diff) { @@ -953,7 +964,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, s64 o_diff = o_pattern ^ o_b_val; /* fprintf(stderr, "DIFF2 idx=%03u shape=%02u %llx-%llx=%lx\n", - idx, h->shape + 1, o_pattern, o_b_val, o_diff); + idx, hshape, o_pattern, o_b_val, o_diff); fprintf(stderr, "DIFF2 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff); */ @@ -1002,7 +1013,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } /* fprintf(stderr, "DIFF3 idx=%03u shape=%02u %llx-%llx=%lx\n", - idx, h->shape + 1, o_pattern, o_b_val, o_diff); + idx, hshape, o_pattern, o_b_val, o_diff); fprintf(stderr, "DIFF3 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff); */ @@ -1051,7 +1062,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } /* fprintf(stderr, "DIFF4 idx=%03u shape=%02u %llx-%llx=%lx\n", - idx, h->shape + 1, o_pattern, o_b_val, o_diff); + idx, hshape, o_pattern, o_b_val, o_diff); fprintf(stderr, "DIFF4 %016llx %llx-%llx=%lx\n", repl, pattern, b_val, diff); */ @@ -1089,7 +1100,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, if ((lvl & LVL1) || attr >= IS_FP_MOD) { - if (SHAPE_BYTES(h->shape) >= 8 && *status != 1) { + if (hshape >= 8 && *status != 1) { // if (its_len >= 8) // fprintf(stderr, @@ -1132,7 +1143,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } - if (SHAPE_BYTES(h->shape) >= 4 && *status != 1) { + if (hshape >= 4 && *status != 1) { // if (its_len >= 4 && (attr <= 1 || attr >= 8)) // fprintf(stderr, @@ -1173,7 +1184,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } - if (SHAPE_BYTES(h->shape) >= 2 && *status != 1) { + if (hshape >= 2 && *status != 1) { if (its_len >= 2 && ((*buf_16 == (u16)pattern && *o_buf_16 == (u16)o_pattern) || @@ -1244,11 +1255,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } - if (!(attr & (IS_GREATER | IS_LESSER)) || SHAPE_BYTES(h->shape) < 4) { - - return 0; - - } + if (!(attr & (IS_GREATER | IS_LESSER)) || hshape < 4) { return 0; } // transform >= to < and <= to > if ((attr & IS_EQUAL) && (attr & (IS_GREATER | IS_LESSER))) { @@ -1272,7 +1279,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, if (attr & IS_GREATER) { - if (SHAPE_BYTES(h->shape) == 4 && its_len >= 4) { + if (hshape == 4 && its_len >= 4) { float *f = (float *)&repl; float g = *f; @@ -1280,7 +1287,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, u32 *r = (u32 *)&g; repl_new = (u32)*r; - } else if (SHAPE_BYTES(h->shape) == 8 && its_len >= 8) { + } else if (hshape == 8 && its_len >= 8) { double *f = (double *)&repl; double g = *f; @@ -1307,7 +1314,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } else { - if (SHAPE_BYTES(h->shape) == 4) { + if (hshape == 4) { float *f = (float *)&repl; float g = *f; @@ -1315,7 +1322,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, u32 *r = (u32 *)&g; repl_new = (u32)*r; - } else if (SHAPE_BYTES(h->shape) == 8) { + } else if (hshape == 8) { double *f = (double *)&repl; double g = *f; @@ -1342,7 +1349,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, } // transform double to float, llvm likes to do that internally ... - if (SHAPE_BYTES(h->shape) == 8 && its_len >= 4) { + if (hshape == 8 && its_len >= 4) { double *f = (double *)&repl; float g = (float)*f; @@ -1353,7 +1360,7 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, memcpy(((char *)&repl_new) + 4, (char *)&g, 4); #endif changed_val = repl_new; - h->shape = 3; // modify shape + hshape = 4; // modify shape // fprintf(stderr, "DOUBLE2FLOAT %llx\n", repl_new); @@ -1361,12 +1368,12 @@ static u8 cmp_extend_encoding(afl_state_t *afl, struct cmp_header *h, afl, h, pattern, repl_new, o_pattern, changed_val, 16, idx, taint_len, orig_buf, buf, cbuf, len, 1, lvl, status))) { - h->shape = 7; // recover shape + hshape = 8; // recover shape return 1; } - h->shape = 7; // recover shape + hshape = 8; // recover shape } @@ -1421,6 +1428,13 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h, u32 taint_len, u8 *orig_buf, u8 *buf, u8 *cbuf, u32 len, u8 do_reverse, u8 lvl, u8 *status) { + if (afl->fsrv.total_execs - last_update > screen_update) { + + show_stats(afl); + last_update = afl->fsrv.total_execs; + + } + u8 *ptr = (u8 *)&buf[idx]; u8 *o_ptr = (u8 *)&orig_buf[idx]; u8 *p = (u8 *)&pattern; @@ -1428,52 +1442,51 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h, u8 *r = (u8 *)&repl; u8 backup[16]; u32 its_len = MIN(len - idx, taint_len); - u32 shape = h->shape + 1; #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) size_t off = 0; #else - size_t off = 16 - shape; + size_t off = 16 - hshape; #endif - if (its_len >= shape) { + if (its_len >= hshape) { #ifdef _DEBUG fprintf(stderr, "TestUN: %u>=%u (len=%u idx=%u attr=%u off=%lu) (%u) ", - its_len, shape, len, idx, attr, off, do_reverse); + its_len, hshape, len, idx, attr, off, do_reverse); u32 i; u8 *o_r = (u8 *)&changed_val; - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", ptr[i]); fprintf(stderr, "=="); - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", p[off + i]); fprintf(stderr, " "); - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", o_ptr[i]); fprintf(stderr, "=="); - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", o_p[off + i]); fprintf(stderr, " <= "); - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", r[off + i]); fprintf(stderr, "<-"); - for (i = 0; i < shape; i++) + for (i = 0; i < hshape; i++) fprintf(stderr, "%02x", o_r[off + i]); fprintf(stderr, "\n"); #endif - if (!memcmp(ptr, p + off, shape) && !memcmp(o_ptr, o_p + off, shape)) { + if (!memcmp(ptr, p + off, hshape) && !memcmp(o_ptr, o_p + off, hshape)) { - memcpy(backup, ptr, shape); - memcpy(ptr, r + off, shape); + memcpy(backup, ptr, hshape); + memcpy(ptr, r + off, hshape); if (unlikely(its_fuzz(afl, buf, len, status))) { return 1; } #ifdef CMPLOG_COMBINE - if (*status == 1) { memcpy(cbuf + idx, r, shape); } + if (*status == 1) { memcpy(cbuf + idx, r, hshape); } #endif - memcpy(ptr, backup, shape); + memcpy(ptr, backup, hshape); #ifdef _DEBUG fprintf(stderr, "Status=%u\n", *status); @@ -1485,10 +1498,10 @@ static u8 cmp_extend_encodingN(afl_state_t *afl, struct cmp_header *h, if (do_reverse && *status != 1) { if (unlikely(cmp_extend_encodingN( - afl, h, SWAPN(pattern, (shape << 3)), SWAPN(repl, (shape << 3)), - SWAPN(o_pattern, (shape << 3)), SWAPN(changed_val, (shape << 3)), - attr, idx, taint_len, orig_buf, buf, cbuf, len, 0, lvl, - status))) { + afl, h, SWAPN(pattern, (hshape << 3)), SWAPN(repl, (hshape << 3)), + SWAPN(o_pattern, (hshape << 3)), + SWAPN(changed_val, (hshape << 3)), attr, idx, taint_len, orig_buf, + buf, cbuf, len, 0, lvl, status))) { return 1; @@ -1615,6 +1628,8 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, u8 s_v0_inc = 1, s_v1_inc = 1; u8 s_v0_dec = 1, s_v1_dec = 1; + hshape = SHAPE_BYTES(h->shape); + if (h->hits > CMP_MAP_H) { loggeds = CMP_MAP_H; @@ -1626,7 +1641,7 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, } #ifdef WORD_SIZE_64 - switch (SHAPE_BYTES(h->shape)) { + switch (hshape) { case 1: case 2: @@ -1679,8 +1694,7 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, #ifdef _DEBUG fprintf(stderr, "Handling: %llx->%llx vs %llx->%llx attr=%u shape=%u\n", - orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute, - SHAPE_BYTES(h->shape)); + orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute, hshape); #endif t = taint; @@ -1830,27 +1844,41 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, "END: %llx->%llx vs %llx->%llx attr=%u i=%u found=%u " "isN=%u size=%u\n", orig_o->v0, o->v0, orig_o->v1, o->v1, h->attribute, i, found_one, - is_n, SHAPE_BYTES(h->shape)); + is_n, hshape); #endif - // If failed, add to dictionary - if (!found_one) { + // we only learn 16 bit + + if (hshape > 1) { - if (afl->pass_stats[key].total == 0) { + if (!found_one || afl->queue_cur->is_ascii) { #ifdef WORD_SIZE_64 if (unlikely(is_n)) { - try_to_add_to_dictN(afl, s128_v0, SHAPE_BYTES(h->shape)); - try_to_add_to_dictN(afl, s128_v1, SHAPE_BYTES(h->shape)); + if (!found_one || + check_if_text_buf((u8 *)&s128_v0, SHAPE_BYTES(h->shape)) == + SHAPE_BYTES(h->shape)) + try_to_add_to_dictN(afl, s128_v0, SHAPE_BYTES(h->shape)); + if (!found_one || + check_if_text_buf((u8 *)&s128_v1, SHAPE_BYTES(h->shape)) == + SHAPE_BYTES(h->shape)) + try_to_add_to_dictN(afl, s128_v1, SHAPE_BYTES(h->shape)); } else #endif { - try_to_add_to_dict(afl, o->v0, SHAPE_BYTES(h->shape)); - try_to_add_to_dict(afl, o->v1, SHAPE_BYTES(h->shape)); + if (!memcmp((u8 *)&o->v0, (u8 *)&orig_o->v0, SHAPE_BYTES(h->shape)) && + (!found_one || + check_if_text_buf((u8 *)&o->v0, SHAPE_BYTES(h->shape)) == + SHAPE_BYTES(h->shape))) + try_to_add_to_dict(afl, o->v0, SHAPE_BYTES(h->shape)); + if (!memcmp((u8 *)&o->v1, (u8 *)&orig_o->v1, SHAPE_BYTES(h->shape)) && + (!found_one || + check_if_text_buf((u8 *)&o->v1, SHAPE_BYTES(h->shape)) == + SHAPE_BYTES(h->shape))) + try_to_add_to_dict(afl, o->v1, SHAPE_BYTES(h->shape)); } @@ -1882,8 +1910,9 @@ static u8 cmp_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, } -static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl, - u8 *o_pattern, u8 *changed_val, u8 plen, u32 idx, +static u8 rtn_extend_encoding(afl_state_t *afl, u8 entry, + struct cmpfn_operands *o, + struct cmpfn_operands *orig_o, u32 idx, u32 taint_len, u8 *orig_buf, u8 *buf, u8 *cbuf, u32 len, u8 lvl, u8 *status) { @@ -1894,9 +1923,60 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl, // (void)(changed_val); //#endif + if (afl->fsrv.total_execs - last_update > screen_update) { + + show_stats(afl); + last_update = afl->fsrv.total_execs; + + } + + u8 *pattern, *repl, *o_pattern, *changed_val; + u8 l0, l1, ol0, ol1; + + if (entry == 0) { + + pattern = o->v0; + repl = o->v1; + o_pattern = orig_o->v0; + changed_val = orig_o->v1; + l0 = o->v0_len; + ol0 = orig_o->v0_len; + l1 = o->v1_len; + ol1 = orig_o->v1_len; + + } else { + + pattern = o->v1; + repl = o->v0; + o_pattern = orig_o->v1; + changed_val = orig_o->v0; + l0 = o->v1_len; + ol0 = orig_o->v1_len; + l1 = o->v0_len; + ol1 = orig_o->v0_len; + + } + + if (l0 >= 0x80 || ol0 >= 0x80) { + + l0 -= 0x80; + l1 -= 0x80; + ol0 -= 0x80; + ol1 -= 0x80; + + } + + if (l0 == 0 || l1 == 0 || ol0 == 0 || ol1 == 0 || l0 > 31 || l1 > 31 || + ol0 > 31 || ol1 > 31) { + + l0 = l1 = ol0 = ol1 = hshape; + + } + + u8 lmax = MAX(l0, ol0); u8 save[40]; u32 saved_idx = idx, pre, from = 0, to = 0, i, j; - u32 its_len = MIN((u32)plen, len - idx); + u32 its_len = MIN(MIN(lmax, hshape), len - idx); its_len = MIN(its_len, taint_len); u32 saved_its_len = its_len; @@ -1912,7 +1992,8 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl, (void)(j); #ifdef _DEBUG - fprintf(stderr, "RTN T idx=%u lvl=%02x ", idx, lvl); + fprintf(stderr, "RTN T idx=%u lvl=%02x is_txt=%u shape=%u/%u ", idx, lvl, + o->v0_len >= 0x80 ? 1 : 0, hshape, l0); for (j = 0; j < 8; j++) fprintf(stderr, "%02x", orig_buf[idx + j]); fprintf(stderr, " -> "); @@ -1972,10 +2053,10 @@ static u8 rtn_extend_encoding(afl_state_t *afl, u8 *pattern, u8 *repl, } - //#ifdef CMPLOG_SOLVE_TRANSFORM - if (*status == 1) return 0; + // transform solving + if (afl->cmplog_enable_transform && (lvl & LVL3)) { u32 toupper = 0, tolower = 0, xor = 0, arith = 0, tohex = 0, fromhex = 0; @@ -2322,6 +2403,8 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, u32 i, j, idx, have_taint = 1, taint_len, loggeds; u8 status = 0, found_one = 0; + hshape = SHAPE_BYTES(h->shape); + if (h->hits > CMP_MAP_RTN_H) { loggeds = CMP_MAP_RTN_H; @@ -2353,18 +2436,22 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, } /* - struct cmp_header *hh = &afl->orig_cmp_map->headers[key]; - fprintf(stderr, "RTN N hits=%u id=%u shape=%u attr=%u v0=", h->hits, - h->id, h->shape, h->attribute); - for (j = 0; j < 8; j++) fprintf(stderr, "%02x", o->v0[j]); - fprintf(stderr, " v1="); - for (j = 0; j < 8; j++) fprintf(stderr, "%02x", o->v1[j]); - fprintf(stderr, "\nRTN O hits=%u id=%u shape=%u attr=%u o0=", - hh->hits, hh->id, hh->shape, hh->attribute); - for (j = 0; j < 8; j++) fprintf(stderr, "%02x", orig_o->v0[j]); - fprintf(stderr, " o1="); - for (j = 0; j < 8; j++) fprintf(stderr, "%02x", orig_o->v1[j]); - fprintf(stderr, "\n"); + struct cmp_header *hh = &afl->orig_cmp_map->headers[key]; + fprintf(stderr, "RTN N hits=%u id=%u shape=%u attr=%u v0=", h->hits, h->id, + hshape, h->attribute); + for (j = 0; j < 8; j++) + fprintf(stderr, "%02x", o->v0[j]); + fprintf(stderr, " v1="); + for (j = 0; j < 8; j++) + fprintf(stderr, "%02x", o->v1[j]); + fprintf(stderr, "\nRTN O hits=%u id=%u shape=%u attr=%u o0=", hh->hits, + hh->id, hshape, hh->attribute); + for (j = 0; j < 8; j++) + fprintf(stderr, "%02x", orig_o->v0[j]); + fprintf(stderr, " o1="); + for (j = 0; j < 8; j++) + fprintf(stderr, "%02x", orig_o->v1[j]); + fprintf(stderr, "\n"); */ t = taint; @@ -2400,25 +2487,24 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, #ifdef _DEBUG int w; - fprintf(stderr, "key=%u idx=%u len=%u o0=", key, idx, - SHAPE_BYTES(h->shape)); - for (w = 0; w < SHAPE_BYTES(h->shape); ++w) + fprintf(stderr, "key=%u idx=%u len=%u o0=", key, idx, hshape); + for (w = 0; w < hshape; ++w) fprintf(stderr, "%02x", orig_o->v0[w]); fprintf(stderr, " v0="); - for (w = 0; w < SHAPE_BYTES(h->shape); ++w) + for (w = 0; w < hshape; ++w) fprintf(stderr, "%02x", o->v0[w]); fprintf(stderr, " o1="); - for (w = 0; w < SHAPE_BYTES(h->shape); ++w) + for (w = 0; w < hshape; ++w) fprintf(stderr, "%02x", orig_o->v1[w]); fprintf(stderr, " v1="); - for (w = 0; w < SHAPE_BYTES(h->shape); ++w) + for (w = 0; w < hshape; ++w) fprintf(stderr, "%02x", o->v1[w]); fprintf(stderr, "\n"); #endif - if (unlikely(rtn_extend_encoding( - afl, o->v0, o->v1, orig_o->v0, orig_o->v1, SHAPE_BYTES(h->shape), - idx, taint_len, orig_buf, buf, cbuf, len, lvl, &status))) { + if (unlikely(rtn_extend_encoding(afl, 0, o, orig_o, idx, taint_len, + orig_buf, buf, cbuf, len, lvl, + &status))) { return 1; @@ -2433,9 +2519,9 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, status = 0; - if (unlikely(rtn_extend_encoding( - afl, o->v1, o->v0, orig_o->v1, orig_o->v0, SHAPE_BYTES(h->shape), - idx, taint_len, orig_buf, buf, cbuf, len, lvl, &status))) { + if (unlikely(rtn_extend_encoding(afl, 1, o, orig_o, idx, taint_len, + orig_buf, buf, cbuf, len, lvl, + &status))) { return 1; @@ -2450,16 +2536,42 @@ static u8 rtn_fuzz(afl_state_t *afl, u32 key, u8 *orig_buf, u8 *buf, u8 *cbuf, } - // If failed, add to dictionary - if (!found_one && (lvl & LVL1)) { + // if (unlikely(!afl->pass_stats[key].total)) { + + if ((!found_one && (lvl & LVL1)) || afl->queue_cur->is_ascii) { + + // if (unlikely(!afl->pass_stats[key].total)) { + + u32 shape_len = SHAPE_BYTES(h->shape); + u32 v0_len = shape_len, v1_len = shape_len; + if (afl->queue_cur->is_ascii || + check_if_text_buf((u8 *)&o->v0, shape_len) == shape_len) { + + if (strlen(o->v0)) v0_len = strlen(o->v0); + + } - if (unlikely(!afl->pass_stats[key].total)) { + if (afl->queue_cur->is_ascii || + check_if_text_buf((u8 *)&o->v1, shape_len) == shape_len) { - maybe_add_auto(afl, o->v0, SHAPE_BYTES(h->shape)); - maybe_add_auto(afl, o->v1, SHAPE_BYTES(h->shape)); + if (strlen(o->v1)) v1_len = strlen(o->v1); } + // fprintf(stderr, "SHOULD: found:%u ascii:%u text?%u:%u %u:%s %u:%s \n", + // found_one, afl->queue_cur->is_ascii, check_if_text_buf((u8 *)&o->v0, + // shape_len), check_if_text_buf((u8 *)&o->v1, shape_len), v0_len, + // o->v0, v1_len, o->v1); + + if (!memcmp(o->v0, orig_o->v0, v0_len) || + (!found_one || check_if_text_buf((u8 *)&o->v0, v0_len) == v0_len)) + maybe_add_auto(afl, o->v0, v0_len); + if (!memcmp(o->v1, orig_o->v1, v1_len) || + (!found_one || check_if_text_buf((u8 *)&o->v1, v1_len) == v1_len)) + maybe_add_auto(afl, o->v1, v1_len); + + //} + } rtn_fuzz_next_iter: @@ -2492,6 +2604,23 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) { } struct tainted *taint = NULL; + if (likely(afl->queue_cur->exec_us)) { + + if (likely((100000 / 2) >= afl->queue_cur->exec_us)) { + + screen_update = 100000 / afl->queue_cur->exec_us; + + } else { + + screen_update = 1; + + } + + } else { + + screen_update = 100000; + + } if (!afl->queue_cur->taint || !afl->queue_cur->cmplog_colorinput) { @@ -2592,8 +2721,6 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) { u64 orig_hit_cnt, new_hit_cnt; u64 orig_execs = afl->fsrv.total_execs; orig_hit_cnt = afl->queued_paths + afl->unique_crashes; - u64 screen_update = 100000 / afl->queue_cur->exec_us, - execs = afl->fsrv.total_execs; afl->stage_name = "input-to-state"; afl->stage_short = "its"; @@ -2630,11 +2757,13 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) { if (afl->shm.cmp_map->headers[k].type == CMP_TYPE_INS) { + // fprintf(stderr, "INS %u\n", k); afl->stage_max += MIN((u32)(afl->shm.cmp_map->headers[k].hits), (u32)CMP_MAP_H); } else { + // fprintf(stderr, "RTN %u\n", k); afl->stage_max += MIN((u32)(afl->shm.cmp_map->headers[k].hits), (u32)CMP_MAP_RTN_H); @@ -2673,13 +2802,6 @@ u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len) { } - if (afl->fsrv.total_execs - execs > screen_update) { - - execs = afl->fsrv.total_execs; - show_stats(afl); - - } - } r = 0; @@ -2795,9 +2917,10 @@ exit_its: if (f) { fprintf(f, - "Cmplog: fname=%s len=%u ms=%llu result=%u finds=%llu entries=%u\n", + "Cmplog: fname=%s len=%u ms=%llu result=%u finds=%llu entries=%u " + "auto_extra_after=%u\n", afl->queue_cur->fname, len, get_cur_time() - start_time, r, - new_hit_cnt - orig_hit_cnt, cmp_locations); + new_hit_cnt - orig_hit_cnt, cmp_locations, afl->a_extras_cnt); #ifndef _DEBUG if (afl->not_on_tty) { fclose(f); } diff --git a/src/afl-fuzz-stats.c b/src/afl-fuzz-stats.c index 1d48a76c..808bf258 100644 --- a/src/afl-fuzz-stats.c +++ b/src/afl-fuzz-stats.c @@ -278,6 +278,7 @@ void write_stats_file(afl_state_t *afl, u32 t_bytes, double bitmap_cvg, "total_edges : %u\n" "var_byte_count : %u\n" "havoc_expansion : %u\n" + "auto_dict_entries : %u\n" "testcache_size : %llu\n" "testcache_count : %u\n" "testcache_evict : %u\n" @@ -316,7 +317,7 @@ void write_stats_file(afl_state_t *afl, u32 t_bytes, double bitmap_cvg, -1, #endif t_bytes, afl->fsrv.real_map_size, afl->var_byte_count, - afl->expand_havoc, afl->q_testcase_cache_size, + afl->expand_havoc, afl->a_extras_cnt, afl->q_testcase_cache_size, afl->q_testcase_cache_count, afl->q_testcase_evictions, afl->use_banner, afl->unicorn_mode ? "unicorn" : "", afl->fsrv.qemu_mode ? "qemu " : "", diff --git a/src/afl-fuzz.c b/src/afl-fuzz.c index a4093693..c08b8fbb 100644 --- a/src/afl-fuzz.c +++ b/src/afl-fuzz.c @@ -2235,13 +2235,12 @@ int main(int argc, char **argv_orig, char **envp) { } - write_bitmap(afl); - save_auto(afl); - stop_fuzzing: afl->force_ui_update = 1; // ensure the screen is reprinted show_stats(afl); // print the screen one last time + write_bitmap(afl); + save_auto(afl); SAYF(CURSOR_SHOW cLRD "\n\n+++ Testing aborted %s +++\n" cRST, afl->stop_soon == 2 ? "programmatically" : "by user"); @@ -2270,6 +2269,20 @@ stop_fuzzing: } + if (afl->not_on_tty) { + + u32 t_bytes = count_non_255_bytes(afl, afl->virgin_bits); + u8 time_tmp[64]; + u_stringify_time_diff(time_tmp, get_cur_time(), afl->start_time); + ACTF( + "Statistics: %u new paths found, %.02f%% coverage achieved, %llu " + "crashes found, %llu timeouts found, total runtime %s", + afl->queued_discovered, + ((double)t_bytes * 100) / afl->fsrv.real_map_size, afl->unique_crashes, + afl->unique_hangs, time_tmp); + + } + #ifdef PROFILING SAYF(cYEL "[!] " cRST "Profiling information: %llu ms total work, %llu ns/run\n", diff --git a/src/afl-performance.c b/src/afl-performance.c index c6fa554b..04507410 100644 --- a/src/afl-performance.c +++ b/src/afl-performance.c @@ -90,7 +90,8 @@ inline u32 hash32(u8 *key, u32 len, u32 seed) { #endif - return (u32)XXH64(key, len, seed); + (void)seed; + return (u32)XXH3_64bits(key, len); } @@ -102,7 +103,8 @@ inline u64 hash64(u8 *key, u32 len, u64 seed) { #endif - return XXH64(key, len, seed); + (void)seed; + return XXH3_64bits(key, len); } diff --git a/test/test-cmplog.c b/test/test-cmplog.c index b077e3ab..262df6bd 100644 --- a/test/test-cmplog.c +++ b/test/test-cmplog.c @@ -1,15 +1,13 @@ #include <stdio.h> #include <string.h> +#include <stdint.h> #include <stdarg.h> #include <stdlib.h> #include <stdint.h> #include <unistd.h> -int main(int argc, char *argv[]) { - char buf[1024]; - ssize_t i; - if ((i = read(0, buf, sizeof(buf) - 1)) < 24) return 0; - buf[i] = 0; +int LLVMFuzzerTestOneInput(const uint8_t *buf, size_t i) { + if (i < 24) return 0; if (buf[0] != 'A') return 0; if (buf[1] != 'B') return 0; if (buf[2] != 'C') return 0; @@ -18,6 +16,17 @@ int main(int argc, char *argv[]) { if (strncmp(buf + 12, "IJKL", 4) == 0 && strcmp(buf + 16, "DEADBEEF") == 0) abort(); return 0; - } +#ifdef __AFL_COMPILER +int main(int argc, char *argv[]) { + unsigned char buf[1024]; + ssize_t i; + while(__AFL_LOOP(1000)) { + i = read(0, (char*)buf, sizeof(buf) - 1); + if (i > 0) buf[i] = 0; + LLVMFuzzerTestOneInput(buf, i); + } + return 0; +} +#endif |