 include/afl-fuzz.h    |  13
 include/coverage-32.h | 109
 include/coverage-64.h | 186
 src/afl-performance.c | 124
 4 files changed, 331 insertions(+), 101 deletions(-)
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index 6e695a97..31c19287 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -134,6 +134,12 @@
 // Little helper to access the ptr to afl->##name_buf - for use in afl_realloc.
 #define AFL_BUF_PARAM(name) ((void **)&afl->name##_buf)
 
+#ifdef WORD_SIZE_64
+  #define AFL_RAND_RETURN u64
+#else
+  #define AFL_RAND_RETURN u32
+#endif
+
 extern s8  interesting_8[INTERESTING_8_LEN];
 extern s16 interesting_16[INTERESTING_8_LEN + INTERESTING_16_LEN];
 extern s32
@@ -580,7 +586,7 @@ typedef struct afl_state {
 
   u32 rand_cnt;                         /* Random number counter            */
 
-  u64 rand_seed[4];
+  u64 rand_seed[3];
   s64 init_seed;
 
   u64 total_cal_us,                     /* Total calibration time (us)      */
@@ -1015,8 +1021,8 @@ u32  count_bits(afl_state_t *, u8 *);
 u32  count_bytes(afl_state_t *, u8 *);
 u32  count_non_255_bytes(afl_state_t *, u8 *);
 void simplify_trace(afl_state_t *, u8 *);
+void classify_counts(afl_forkserver_t *);
 void init_count_class16(void);
-void classify_counts(afl_forkserver_t *fsrv);
 void minimize_bits(afl_state_t *, u8 *, u8 *);
 #ifndef SIMPLE_FILES
 u8 *describe_op(afl_state_t *, u8, size_t);
@@ -1106,8 +1112,7 @@ u8 common_fuzz_cmplog_stuff(afl_state_t *afl, u8 *out_buf, u32 len);
 u8 input_to_state_stage(afl_state_t *afl, u8 *orig_buf, u8 *buf, u32 len,
                         u64 exec_cksum);
 
-/* xoshiro256** */
-uint64_t rand_next(afl_state_t *afl);
+AFL_RAND_RETURN rand_next(afl_state_t *afl);
 
 /* probability between 0.0 and 1.0 */
 double rand_next_percent(afl_state_t *afl);
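
rand_next() now returns AFL_RAND_RETURN (u64 on 64-bit builds, u32 otherwise), so callers needing a bounded value reduce it themselves. A minimal sketch of such a wrapper, in the spirit of the rand_below() helper AFL++ keeps in this header; the _sketch name and the plain modulo reduction are illustrative, not part of this commit:

static inline u32 rand_below_sketch(afl_state_t *afl, u32 limit) {

  /* limit of 0 or 1 admits only one value; skip the PRNG call. */
  if (limit <= 1) return 0;

  /* Plain modulo: slightly biased, but fine for fuzzing decisions. */
  return (u32)(rand_next(afl) % limit);

}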
diff --git a/include/coverage-32.h b/include/coverage-32.h
new file mode 100644
index 00000000..710ff0cf
--- /dev/null
+++ b/include/coverage-32.h
@@ -0,0 +1,109 @@
+#include "config.h"
+#include "types.h"
+
+inline u32 classify_word(u32 word) {
+
+  u16 mem16[2];
+  memcpy(mem16, &word, sizeof(mem16));
+
+  mem16[0] = count_class_lookup16[mem16[0]];
+  mem16[1] = count_class_lookup16[mem16[1]];
+
+  memcpy(&word, mem16, sizeof(mem16));
+  return word;
+
+}
+
+void simplify_trace(afl_state_t *afl, u8 *bytes) {
+
+  u32 *mem = (u32 *)bytes;
+  u32  i = (afl->fsrv.map_size >> 2);
+
+  while (i--) {
+
+    /* Optimize for sparse bitmaps. */
+
+    if (unlikely(*mem)) {
+
+      u8 *mem8 = (u8 *)mem;
+
+      mem8[0] = simplify_lookup[mem8[0]];
+      mem8[1] = simplify_lookup[mem8[1]];
+      mem8[2] = simplify_lookup[mem8[2]];
+      mem8[3] = simplify_lookup[mem8[3]];
+
+    } else
+
+      *mem = 0x01010101;
+
+    mem++;
+
+  }
+
+}
+
+inline void classify_counts(afl_forkserver_t *fsrv) {
+
+  u32 *mem = (u32 *)fsrv->trace_bits;
+  u32  i = (fsrv->map_size >> 2);
+
+  while (i--) {
+
+    /* Optimize for sparse bitmaps. */
+
+    if (unlikely(*mem)) { *mem = classify_word(*mem); }
+
+    mem++;
+
+  }
+
+}
+
+/* Updates the virgin bits, then reflects whether a new count or a new tuple is
+ * seen in ret. */
+inline void discover_word(u8 *ret, u32 *current, u32 *virgin) {
+
+  /* Optimize for (*current & *virgin) == 0 - i.e., no bits in current bitmap
+     that have not been already cleared from the virgin map - since this will
+     almost always be the case. */
+
+  if (*current & *virgin) {
+
+    if (likely(*ret < 2)) {
+
+      u8 *cur = (u8 *)current;
+      u8 *vir = (u8 *)virgin;
+
+      /* Looks like we have not found any new bytes yet; see if any non-zero
+         bytes in current[] are pristine in virgin[]. */
+
+      if ((cur[0] && vir[0] == 0xff) || (cur[1] && vir[1] == 0xff) ||
+          (cur[2] && vir[2] == 0xff) || (cur[3] && vir[3] == 0xff))
+        *ret = 2;
+      else
+        *ret = 1;
+
+    }
+
+    *virgin &= ~*current;
+
+  }
+
+}
+
+#define PACK_SIZE 16
+inline u32 skim(const u32 *virgin, const u32 *current, const u32 *current_end) {
+
+  for (; current != current_end; virgin += 4, current += 4) {
+
+    if (current[0] && classify_word(current[0]) & virgin[0]) return 1;
+    if (current[1] && classify_word(current[1]) & virgin[1]) return 1;
+    if (current[2] && classify_word(current[2]) & virgin[2]) return 1;
+    if (current[3] && classify_word(current[3]) & virgin[3]) return 1;
+
+  }
+
+  return 0;
+
+}
+
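Both coverage headers index count_class_lookup16[], which init_count_class16() (declared in afl-fuzz.h above) fills once at startup from the classic 8-bit bucket table. A sketch of that construction, reproduced here for context since this diff does not show it:

/* Hit counts collapse into power-of-two buckets so that, e.g., a loop
   taken 5 or 7 times still yields the same classified tuple. */
static const u8 count_class_lookup8[256] = {

  [0] = 0,
  [1] = 1,
  [2] = 2,
  [3] = 4,
  [4 ... 7] = 8,
  [8 ... 15] = 16,
  [16 ... 31] = 32,
  [32 ... 127] = 64,
  [128 ... 255] = 128

};

u16 count_class_lookup16[65536];

void init_count_class16(void) {

  u32 b1, b2;

  /* Expand the byte table to one u16 entry per pair of adjacent bytes,
     so classify_word() can bucket two counters per lookup. */
  for (b1 = 0; b1 < 256; b1++)
    for (b2 = 0; b2 < 256; b2++)
      count_class_lookup16[(b1 << 8) + b2] =
          (count_class_lookup8[b1] << 8) | count_class_lookup8[b2];

}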
diff --git a/include/coverage-64.h b/include/coverage-64.h
new file mode 100644
index 00000000..54cf0073
--- /dev/null
+++ b/include/coverage-64.h
@@ -0,0 +1,186 @@
+#include "config.h"
+#include "types.h"
+
+#if (defined(__AVX512F__) && defined(__AVX512DQ__)) || defined(__AVX2__)
+  #include <immintrin.h>
+#endif
+
+inline u64 classify_word(u64 word) {
+
+  u16 mem16[4];
+  memcpy(mem16, &word, sizeof(mem16));
+
+  mem16[0] = count_class_lookup16[mem16[0]];
+  mem16[1] = count_class_lookup16[mem16[1]];
+  mem16[2] = count_class_lookup16[mem16[2]];
+  mem16[3] = count_class_lookup16[mem16[3]];
+
+  memcpy(&word, mem16, sizeof(mem16));
+  return word;
+
+}
+
+void simplify_trace(afl_state_t *afl, u8 *bytes) {
+
+  u64 *mem = (u64 *)bytes;
+  u32  i = (afl->fsrv.map_size >> 3);
+
+  while (i--) {
+
+    /* Optimize for sparse bitmaps. */
+
+    if (unlikely(*mem)) {
+
+      u8 *mem8 = (u8 *)mem;
+
+      mem8[0] = simplify_lookup[mem8[0]];
+      mem8[1] = simplify_lookup[mem8[1]];
+      mem8[2] = simplify_lookup[mem8[2]];
+      mem8[3] = simplify_lookup[mem8[3]];
+      mem8[4] = simplify_lookup[mem8[4]];
+      mem8[5] = simplify_lookup[mem8[5]];
+      mem8[6] = simplify_lookup[mem8[6]];
+      mem8[7] = simplify_lookup[mem8[7]];
+
+    } else
+
+      *mem = 0x0101010101010101ULL;
+
+    mem++;
+
+  }
+
+}
+
+inline void classify_counts(afl_forkserver_t *fsrv) {
+
+  u64 *mem = (u64 *)fsrv->trace_bits;
+  u32  i = (fsrv->map_size >> 3);
+
+  while (i--) {
+
+    /* Optimize for sparse bitmaps. */
+
+    if (unlikely(*mem)) { *mem = classify_word(*mem); }
+
+    mem++;
+
+  }
+
+}
+
+/* Updates the virgin bits, then reflects whether a new count or a new tuple is
+ * seen in ret. */
+inline void discover_word(u8 *ret, u64 *current, u64 *virgin) {
+
+  /* Optimize for (*current & *virgin) == 0 - i.e., no bits in current bitmap
+     that have not been already cleared from the virgin map - since this will
+     almost always be the case. */
+
+  if (*current & *virgin) {
+
+    if (likely(*ret < 2)) {
+
+      u8 *cur = (u8 *)current;
+      u8 *vir = (u8 *)virgin;
+
+      /* Looks like we have not found any new bytes yet; see if any non-zero
+         bytes in current[] are pristine in virgin[]. */
+
+      if ((cur[0] && vir[0] == 0xff) || (cur[1] && vir[1] == 0xff) ||
+          (cur[2] && vir[2] == 0xff) || (cur[3] && vir[3] == 0xff) ||
+          (cur[4] && vir[4] == 0xff) || (cur[5] && vir[5] == 0xff) ||
+          (cur[6] && vir[6] == 0xff) || (cur[7] && vir[7] == 0xff))
+        *ret = 2;
+      else
+        *ret = 1;
+
+    }
+
+    *virgin &= ~*current;
+
+  }
+
+}
+
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+  #define PACK_SIZE 64
+inline u32 skim(const u64 *virgin, const u64 *current, const u64 *current_end) {
+
+  for (; current != current_end; virgin += 8, current += 8) {
+
+    __m512i  value = *(__m512i *)current;
+    __mmask8 mask = _mm512_testn_epi64_mask(value, value);
+
+    /* All bytes are zero. */
+    if (mask == 0xff) continue;
+
+      /* Look for nonzero bytes and check for new bits. */
+  #define UNROLL(x) \
+    if (!(mask & (1 << x)) && classify_word(current[x]) & virgin[x]) return 1
+    UNROLL(0);
+    UNROLL(1);
+    UNROLL(2);
+    UNROLL(3);
+    UNROLL(4);
+    UNROLL(5);
+    UNROLL(6);
+    UNROLL(7);
+  #undef UNROLL
+
+  }
+
+  return 0;
+
+}
+
+#endif
+
+#if !defined(PACK_SIZE) && defined(__AVX2__)
+  #define PACK_SIZE 32
+inline u32 skim(const u64 *virgin, const u64 *current, const u64 *current_end) {
+
+  __m256i zeroes = _mm256_setzero_si256();
+
+  for (; current != current_end; virgin += 4, current += 4) {
+
+    __m256i value = *(__m256i *)current;
+    __m256i cmp = _mm256_cmpeq_epi64(value, zeroes);
+    u32     mask = _mm256_movemask_epi8(cmp);
+
+    /* All bytes are zero. */
+    if (mask == (u32)-1) continue;
+
+    /* Look for nonzero bytes and check for new bits. */
+    if (!(mask & 0xff) && classify_word(current[0]) & virgin[0]) return 1;
+    if (!(mask & 0xff00) && classify_word(current[1]) & virgin[1]) return 1;
+    if (!(mask & 0xff0000) && classify_word(current[2]) & virgin[2]) return 1;
+    if (!(mask & 0xff000000) && classify_word(current[3]) & virgin[3]) return 1;
+
+  }
+
+  return 0;
+
+}
+
+#endif
+
+#if !defined(PACK_SIZE)
+  #define PACK_SIZE 32
+inline u32 skim(const u64 *virgin, const u64 *current, const u64 *current_end) {
+
+  for (; current != current_end; virgin += 4, current += 4) {
+
+    if (current[0] && classify_word(current[0]) & virgin[0]) return 1;
+    if (current[1] && classify_word(current[1]) & virgin[1]) return 1;
+    if (current[2] && classify_word(current[2]) & virgin[2]) return 1;
+    if (current[3] && classify_word(current[3]) & virgin[3]) return 1;
+
+  }
+
+  return 0;
+
+}
+
+#endif
+
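All three skim() variants share one contract: given trace and virgin buffers whose size is a multiple of PACK_SIZE bytes, return 1 as soon as any classified word would clear a virgin bit, and 0 otherwise. A hedged sketch of the intended call site; the name follows AFL++'s has_new_bits() convention, but this exact wrapper is an assumption, not part of the diff:

static u8 has_new_bits_unclassified_sketch(afl_state_t *afl, u8 *virgin_map) {

  u8 *end = afl->fsrv.trace_bits + afl->fsrv.map_size;

  /* Hot path: bail out cheaply when the trace cannot add coverage,
     leaving the full classify_counts()/has_new_bits() pass for hits. */
#ifdef WORD_SIZE_64
  return skim((u64 *)virgin_map, (u64 *)afl->fsrv.trace_bits, (u64 *)end);
#else
  return skim((u32 *)virgin_map, (u32 *)afl->fsrv.trace_bits, (u32 *)end);
#endif

}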
diff --git a/src/afl-performance.c b/src/afl-performance.c
index e070a05e..89b170eb 100644
--- a/src/afl-performance.c
+++ b/src/afl-performance.c
@@ -27,45 +27,49 @@
 #include "xxhash.h"
 #undef XXH_INLINE_ALL
 
-/* we use xoshiro256** instead of rand/random because it is 10x faster and has
-   better randomness properties. */
-
-static inline uint64_t rotl(const uint64_t x, int k) {
-
-  return (x << k) | (x >> (64 - k));
-
-}
-
 void rand_set_seed(afl_state_t *afl, s64 init_seed) {
 
   afl->init_seed = init_seed;
   afl->rand_seed[0] =
       hash64((u8 *)&afl->init_seed, sizeof(afl->init_seed), HASH_CONST);
   afl->rand_seed[1] = afl->rand_seed[0] ^ 0x1234567890abcdef;
-  afl->rand_seed[2] = afl->rand_seed[0] & 0x0123456789abcdef;
-  afl->rand_seed[3] = afl->rand_seed[0] | 0x01abcde43f567908;
+  afl->rand_seed[2] = (afl->rand_seed[0] & 0x1234567890abcdef) ^
+                      (afl->rand_seed[1] | 0xfedcba9876543210);
 
 }
 
-inline uint64_t rand_next(afl_state_t *afl) {
+#define ROTL(d, lrot) ((d << (lrot)) | (d >> (8 * sizeof(d) - (lrot))))
 
-  const uint64_t result =
-      rotl(afl->rand_seed[0] + afl->rand_seed[3], 23) + afl->rand_seed[0];
+#ifdef WORD_SIZE_64
+// romuDuoJr
+inline AFL_RAND_RETURN rand_next(afl_state_t *afl) {
 
-  const uint64_t t = afl->rand_seed[1] << 17;
+  AFL_RAND_RETURN xp = afl->rand_seed[0];
+  afl->rand_seed[0] = 15241094284759029579u * afl->rand_seed[1];
+  afl->rand_seed[1] = afl->rand_seed[1] - xp;
+  afl->rand_seed[1] = ROTL(afl->rand_seed[1], 27);
+  return xp;
 
-  afl->rand_seed[2] ^= afl->rand_seed[0];
-  afl->rand_seed[3] ^= afl->rand_seed[1];
-  afl->rand_seed[1] ^= afl->rand_seed[2];
-  afl->rand_seed[0] ^= afl->rand_seed[3];
+}
 
-  afl->rand_seed[2] ^= t;
+#else
+// RomuTrio32
+inline AFL_RAND_RETURN rand_next(afl_state_t *afl) {
+
+  AFL_RAND_RETURN xp = afl->rand_seed[0], yp = afl->rand_seed[1],
+                  zp = afl->rand_seed[2];
+  afl->rand_seed[0] = 3323815723u * zp;
+  afl->rand_seed[1] = yp - xp;
+  afl->rand_seed[1] = ROTL(afl->rand_seed[1], 6);
+  afl->rand_seed[2] = zp - yp;
+  afl->rand_seed[2] = ROTL(afl->rand_seed[2], 22);
+  return xp;
 
-  afl->rand_seed[3] = rotl(afl->rand_seed[3], 45);
+}
 
-  return result;
+#endif
 
-}
+#undef ROTL
 
 /* returns a double between 0.000000000 and 1.000000000 */
 
@@ -75,80 +79,6 @@ inline double rand_next_percent(afl_state_t *afl) {
 
 }
 
-/* This is the jump function for the generator. It is equivalent
-   to 2^128 calls to rand_next(); it can be used to generate 2^128
-   non-overlapping subsequences for parallel computations. */
-
-void jump(afl_state_t *afl) {
-
-  static const uint64_t JUMP[] = {0x180ec6d33cfd0aba, 0xd5a61266f0c9392c,
-                                  0xa9582618e03fc9aa, 0x39abdc4529b1661c};
-  size_t                i, b;
-  uint64_t              s0 = 0;
-  uint64_t              s1 = 0;
-  uint64_t              s2 = 0;
-  uint64_t              s3 = 0;
-  for (i = 0; i < (sizeof(JUMP) / sizeof(*JUMP)); i++)
-    for (b = 0; b < 64; b++) {
-
-      if (JUMP[i] & UINT64_C(1) << b) {
-
-        s0 ^= afl->rand_seed[0];
-        s1 ^= afl->rand_seed[1];
-        s2 ^= afl->rand_seed[2];
-        s3 ^= afl->rand_seed[3];
-
-      }
-
-      rand_next(afl);
-
-    }
-
-  afl->rand_seed[0] = s0;
-  afl->rand_seed[1] = s1;
-  afl->rand_seed[2] = s2;
-  afl->rand_seed[3] = s3;
-
-}
-
-/* This is the long-jump function for the generator. It is equivalent to
-   2^192 calls to rand_next(); it can be used to generate 2^64 starting points,
-   from each of which jump() will generate 2^64 non-overlapping
-   subsequences for parallel distributed computations. */
-
-void long_jump(afl_state_t *afl) {
-
-  static const uint64_t LONG_JUMP[] = {0x76e15d3efefdcbbf, 0xc5004e441c522fb3,
-                                       0x77710069854ee241, 0x39109bb02acbe635};
-
-  size_t   i, b;
-  uint64_t s0 = 0;
-  uint64_t s1 = 0;
-  uint64_t s2 = 0;
-  uint64_t s3 = 0;
-  for (i = 0; i < (sizeof(LONG_JUMP) / sizeof(*LONG_JUMP)); i++)
-    for (b = 0; b < 64; b++) {
-
-      if (LONG_JUMP[i] & UINT64_C(1) << b) {
-
-        s0 ^= afl->rand_seed[0];
-        s1 ^= afl->rand_seed[1];
-        s2 ^= afl->rand_seed[2];
-        s3 ^= afl->rand_seed[3];
-
-      }
-
-      rand_next(afl);
-
-    }
-
-  afl->rand_seed[0] = s0;
-  afl->rand_seed[1] = s1;
-  afl->rand_seed[2] = s2;
-  afl->rand_seed[3] = s3;
-
-}
-
 /* we switch from afl's murmur implementation to xxh3 as it is 30% faster -
    and get 64 bit hashes instead of just 32 bit. Less collisions! :-) */
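
The wrappers that comment refers to sit just below this hunk and are untouched by the commit. One plausible shape, sketched from the xxhash single-header API (XXH3_64bits_withSeed() is the documented one-shot seeded hash; the exact body in the tree may differ):

u64 hash64(u8 *key, u32 len, u64 seed) {

  /* xxh3: one-shot 64-bit seeded hash over the key buffer. */
  return (u64)XXH3_64bits_withSeed(key, len, seed);

}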