From e3dadbfe0f9fad435a6fa201131315500f1a348a Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Fri, 13 Jan 2023 18:27:22 +0100
Subject: autotokens

---
 custom_mutators/autotokens/autotokens.cpp | 391 ++++++++++++++++++++++++++++++
 1 file changed, 391 insertions(+)
 create mode 100644 custom_mutators/autotokens/autotokens.cpp

(limited to 'custom_mutators/autotokens/autotokens.cpp')
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
new file mode 100644
index 00000000..afde8c26
--- /dev/null
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -0,0 +1,391 @@
+extern "C" {
+#include "afl-fuzz.h"
+}
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <iostream>
+#include <fstream>
+#include <unordered_map>
+#include <vector>
+#include <regex>
+
+#define AUTOTOKENS_DEBUG 1
+#define AUTOTOKENS_LEN_MIN 12
+#define AUTOTOKENS_CHANGE_MIN_PERCENT 5
+#define AUTOTOKENS_CHANGE_MAX_PERCENT 10
+
+using namespace std;
+
+/* State object returned to AFL++ by afl_custom_init() and passed back into
+   the other afl_custom_* callbacks. */
+struct my_mutator {
+
+  afl_state *afl;  // fuzzer state this mutator instance was created for
+
+};
+
+using my_mutator_t = my_mutator;
+
+#define DEBUG \
+  if (unlikely(debug)) fprintf  // use as DEBUG(stderr, fmt, ...); prints only while debug is non-zero
+
+static afl_state                           *afl_ptr;  // fuzzer state, stored by afl_custom_init()
+static int                                  debug = AUTOTOKENS_DEBUG;  // gates all DEBUG() output
+static u32                                  current_id = 0;  // last token id handed out; ids start at 1
+static unordered_map<string, vector<u32> *> file_mapping;  // queue filename -> token id list, NULL = skip this file
+static unordered_map<string, u32>           token_to_id;  // token text -> id (0 means not seen yet)
+static unordered_map<u32, string>           id_to_token;  // id -> token text, used to render mutated output
+static regex regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);  // double-slash comment; group 2 is the line break that follows
+static regex regex_comment_star("/\\*(.|\n)*?\\*/",
+                                regex::multiline | regex::optimize);  // shortest slash-star ... star-slash comment
+static regex regex_string("\"(.*?)\"|'(.*?')", regex::optimize);  // double- or single-quoted literal
+static regex regex_word("[A-Za-z0-9_$]+", regex::optimize);  // run of letters, digits, underscore or dollar
+static regex regex_whitespace(R"([ \t]+)", regex::optimize);  // run of spaces and tabs, used as token separator
+static vector<u32> *s;  // structure chosen by the last afl_custom_queue_get(); NULL = nothing to mutate
+
+/* Custom mutator entry point.
+
+   Takes a private copy of the token-id structure selected by the last
+   afl_custom_queue_get() call, applies a number of random CHANGE / INSERT /
+   ERASE operations to it and renders the result as the known token strings
+   joined by single spaces.
+
+   out_buf   receives the buffer holding the mutated text (NULL on failure)
+   max_size  upper bound for the returned length; longer output is truncated
+
+   data, buf, buf_size, add_buf and add_buf_size are not used.
+
+   Returns the number of bytes in *out_buf, or 0 when there is nothing to
+   mutate or the output buffer could not be obtained. */
+extern "C" size_t afl_custom_fuzz(my_mutator_t *data, uint8_t *buf,
+                                  size_t buf_size, u8 **out_buf,
+                                  uint8_t *add_buf, size_t add_buf_size,
+                                  size_t max_size) {
+
+  DEBUG(stderr, "MUT!\n");
+
+  // Without a non-empty structure and at least one known token every
+  // operation below would index an empty container.
+  if (s == NULL || s->empty() || current_id == 0) {
+
+    *out_buf = NULL;
+    return 0;
+
+  }
+
+  vector<u32> m = *s;  // the cached structure itself must stay untouched
+  u32         i, m_size = (u32)m.size();
+
+  u32 rounds = MAX(8, MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score * afl_ptr->havoc_div / 256));
+  DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
+
+  for (i = 0; i < rounds; ++i) {
+
+    u32 item, new_item;
+
+    switch (rand_below(afl_ptr, 4)) {
+
+      /* CHANGE */
+      case 0: /* fall through */
+      case 1:
+        // With a single known id there is no *different* id to pick and the
+        // retry loop below would never terminate.
+        if (current_id < 2) { break; }
+        item = rand_below(afl_ptr, m_size);
+        do {
+
+          new_item = 1 + rand_below(afl_ptr, current_id);
+
+        } while (unlikely(new_item == m[item]));
+
+        m[item] = new_item;
+        break;
+
+      /* INSERT (+1 so we insert also after last place) */
+      case 2:
+        new_item = 1 + rand_below(afl_ptr, current_id);
+        m.insert(m.begin() + rand_below(afl_ptr, m_size + 1), new_item);
+        ++m_size;
+        break;
+
+      /* ERASE - only if large enough */
+      case 3:
+        if (m_size > 8) {
+
+          m.erase(m.begin() + rand_below(afl_ptr, m_size));
+          // only shrink the bookkeeping when an element was really removed,
+          // otherwise m_size drifts away from m.size() and can wrap around
+          --m_size;
+
+        }
+
+        break;
+
+    }
+
+  }
+
+  string output;
+
+  for (i = 0; i < m_size; ++i) {
+
+    if (i) { output += " "; }
+    output += id_to_token[m[i]];
+
+  }
+
+  // never hand back more than the caller allows
+  u32 mutated_size = (u32)output.size();
+  if (mutated_size > max_size) { mutated_size = (u32)max_size; }
+
+  if (unlikely(!mutated_size)) {
+
+    *out_buf = NULL;
+    return 0;
+
+  }
+
+  u8 *mutated_out = (u8 *)afl_realloc((void **)out_buf, mutated_size);
+
+  if (unlikely(!mutated_out)) {
+
+    *out_buf = NULL;
+    return 0;
+
+  }
+
+  memcpy(mutated_out, output.data(), mutated_size);
+  *out_buf = mutated_out;
+  // the buffer is not NUL terminated, so bound the print explicitly
+  DEBUG(stderr, "MUTATED to %u bytes:\n%.*s\n---\n", mutated_size,
+        (int)mutated_size, (const char *)mutated_out);
+  return mutated_size;
+
+}
+
+
+/* Splits [first, last) at runs of spaces/tabs and then breaks every chunk
+   into regex_word matches and the text between them.  All pieces are
+   appended to `tokens` in input order.  `input_size` is only used for the
+   debug output. */
+static void autotokens_split_range(string::const_iterator first,
+                                   string::const_iterator last,
+                                   vector<string> &tokens, size_t input_size) {
+
+  sregex_token_iterator it{first, last, regex_whitespace, -1};
+  vector<std::string>   tokenized{it, {}};
+  tokenized.erase(
+      std::remove_if(tokenized.begin(), tokenized.end(),
+                     [](std::string const &t) { return t.size() == 0; }),
+      tokenized.end());
+  tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+
+  DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
+        input_size);
+  if (unlikely(debug)) {
+
+    for (const string &x : tokenized) {
+
+      cerr << x << endl;
+
+    }
+
+  }
+
+  for (const string &token : tokenized) {
+
+    string::const_iterator c = token.begin(), e = token.end();
+    smatch                 m;
+
+    while (regex_search(c, e, m, regex_word)) {
+
+      string::const_iterator p = c, f = m[0].first;
+      c = m[0].second;
+
+      if (p < f) {
+
+        // there are items between search start and find
+        tokens.push_back(std::string(p, f));
+        DEBUG(stderr, "before string: \"%s\"\n", tokens.back().c_str());
+
+      }
+
+      // offsets are relative to the chunk the iterators belong to
+      DEBUG(stderr,
+            "SUBstring \"%s\" found at start %lu offset %lu continue at "
+            "%lu\n",
+            m[0].str().c_str(), (unsigned long)(p - token.begin()),
+            (unsigned long)m.position(), (unsigned long)(c - token.begin()));
+      tokens.push_back(m[0].str());
+
+    }
+
+    if (c < e) {
+
+      tokens.push_back(std::string(c, e));
+      DEBUG(stderr, "after string: \"%s\"\n", tokens.back().c_str());
+
+    }
+
+  }
+
+}
+
+/* We are not using afl_custom_queue_new_entry() because not every corpus entry
+   will be necessarily fuzzed. so we use afl_custom_queue_get() instead */
+
+/* Selects the token structure for the queue entry `filename`.
+
+   The first time a file is seen it is read, comments are stripped, the text
+   is tokenized and every token is mapped to a numeric id; the resulting id
+   list is cached in file_mapping.  Files that are too short or produce no
+   tokens are cached as NULL so they are not read again.
+
+   Sets the global `s` to the structure to mutate (or NULL) and returns 1 if
+   the entry should be fuzzed by this mutator, 0 otherwise.  `data` is
+   unused. */
+extern "C" unsigned char afl_custom_queue_get(void                *data,
+                                              const unsigned char *filename) {
+
+  if (likely(!debug)) {
+
+    if (!afl_ptr->queue_cur->is_ascii) {
+
+      s = NULL;
+      return 0;
+
+    }
+
+  }
+
+  string fn = (char *)filename;
+  auto   entry = file_mapping.find(fn);
+
+  if (entry != file_mapping.end()) {
+
+    // already analyzed earlier
+    s = entry->second;
+    if (s == NULL) {
+
+      DEBUG(stderr, "Skipping %s\n", filename);
+      return 0;
+
+    }
+
+    DEBUG(stderr, "OK %s\n", filename);
+    return 1;
+
+  }
+
+  // this input file was not analyzed for tokens yet, so let's do it!
+
+  FILE *fp = fopen((char *)filename, "rb");
+  if (!fp) {
+
+    s = NULL;
+    return 0;
+
+  }  // should not happen
+
+  // ftell() reports errors as -1, which must not be turned into a huge size
+  long pos = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;
+  if (pos < 0) {
+
+    fclose(fp);
+    s = NULL;
+    return 0;
+
+  }
+
+  size_t len = (size_t)pos;
+  if (len < AUTOTOKENS_LEN_MIN) {
+
+    fclose(fp);
+    file_mapping[fn] = NULL;  // NULL ptr so we don't read the file again
+    DEBUG(stderr, "Too short (%lu) %s\n", len, filename);
+    s = NULL;
+    return 0;
+
+  }
+
+  string input;
+  input.resize(len);
+  rewind(fp);
+  size_t got = fread(input.data(), 1, input.size(), fp);
+  fclose(fp);
+
+  if (got != len) {
+
+    // short read: do not tokenize partially filled data
+    s = NULL;
+    return 0;
+
+  }
+
+  input = regex_replace(input, regex_comment_slash, "$2");
+  input = regex_replace(input, regex_comment_star, "");
+
+  DEBUG(stderr, "After replace %lu bytes for %s\n%s\n", input.size(), filename,
+        input.c_str());
+
+  // now extract all tokens
+  vector<string>         tokens;
+  smatch                 match;
+  string::const_iterator cur = input.cbegin(), ende = input.cend();
+
+  DEBUG(stderr, "MATCHES:\n");
+  while (regex_search(cur, ende, match, regex_string)) {
+
+    // Work on the whole match: group 1 is unset for single-quoted literals
+    // and, for double-quoted ones, ends before the closing quote, which
+    // would then be picked up as the start of the next literal.
+    string::const_iterator prev = cur, found = match[0].first;
+    cur = match[0].second;
+    DEBUG(stderr, "string %s found at start %lu offset %lu continue at %lu\n",
+          match[0].str().c_str(), (unsigned long)(prev - input.cbegin()),
+          (unsigned long)match.position(),
+          (unsigned long)(cur - input.cbegin()));
+
+    // there are items between search start and find
+    if (prev < found) {
+
+      autotokens_split_range(prev, found, tokens, input.size());
+
+    }
+
+    if (match[0].length() > 0) { tokens.push_back(match[0].str()); }
+
+  }
+
+  if (cur < ende) {
+
+    DEBUG(stderr, "REST!\n");
+    autotokens_split_range(cur, ende, tokens, input.size());
+
+  }
+
+  if (unlikely(debug)) {
+
+    DEBUG(stderr, "DUMPING TOKENS:\n");
+    for (u32 i = 0; i < tokens.size(); ++i) {
+
+      DEBUG(stderr, "%s ", tokens[i].c_str());
+
+    }
+
+    DEBUG(stderr, "---------------------------\n");
+
+  }
+
+  if (tokens.empty()) {
+
+    // e.g. only whitespace/comments: an empty structure cannot be mutated
+    file_mapping[fn] = NULL;
+    s = NULL;
+    return 0;
+
+  }
+
+  /* Now we transform the tokens into an ID list and save that */
+
+  vector<u32> *structure = new vector<u32>();
+  structure->reserve(tokens.size());
+
+  for (const string &tok : tokens) {
+
+    // operator[] value-initializes unseen tokens to 0, which is never a
+    // valid id because ids start at 1
+    u32 &id = token_to_id[tok];
+    if (id == 0) {
+
+      // First time we see this token, add it to the list
+      id = ++current_id;
+      id_to_token[id] = tok;
+
+    }
+
+    structure->push_back(id);
+
+  }
+
+  // save the token structure to the file mapping
+  file_mapping[fn] = structure;
+  s = structure;
+
+  // we are done!
+  DEBUG(stderr, "DONE! We have %lu tokens in the structure\n",
+        structure->size());
+
+  return 1;  // we always fuzz unless non-ascii or too small
+
+}
+
+/* Creates the mutator state and remembers the fuzzer state in afl_ptr.
+   The seed is ignored.  Returns NULL (after reporting via perror) when the
+   state cannot be allocated. */
+extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
+
+  (void)seed;
+
+  my_mutator_t *state = static_cast<my_mutator_t *>(calloc(1, sizeof(*state)));
+
+  if (state == NULL) {
+
+    perror("afl_custom_init alloc");
+    return NULL;
+
+  }
+
+  afl_ptr = afl;
+  state->afl = afl;
+
+  return state;
+
+}
+
+/* Frees the state object created by afl_custom_init(). */
+extern "C" void afl_custom_deinit(my_mutator_t *data) {
+
+  if (data != NULL) { free(data); }
+
+}
+
-- 
cgit 1.4.1


From 35801bed7a5feb8cc3a363bafbd577f256c467f6 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sun, 15 Jan 2023 13:47:31 +0100
Subject: dictionary support

---
 custom_mutators/autotokens/TODO           |  15 +-
 custom_mutators/autotokens/autotokens.cpp | 248 +++++++++++++++++++++++-------
 include/config.h                          |   2 +-
 3 files changed, 198 insertions(+), 67 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 700b3fa7..2e5e384f 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -1,13 +1,12 @@
 whitespace belassen oder notieren?		MAYBE
 0=space 1=tab 2=linefeed
 
-dictionary mitverwenden?			JA aber nur ascii
--> neue liste?
-wie mache ich das bei honggfuzz?
-ansonsten neuer custom mutator entrypoint?
+cmplog: only add tokens that were found to fit?
+
+create from thin air if no good seed after a cycle and dict large enough?
+(static u32 no_of_struct_inputs;) 
+
+splice insert, splice overwrite
+(linefeed, semicolon)
 
-nur is_ascii wenn cmplog aktiv, ansonsten eigene implementierung
-die aber dann dafür sorgt dass eine leere struktur da ist.
-is is_ascii in afl-common.o ?
 
-cmplog: only add tokens that were found to fit?
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index afde8c26..2fad8dd7 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -1,5 +1,7 @@
 extern "C" {
+
 #include "afl-fuzz.h"
+
 }
 
 #include <stdio.h>
@@ -13,9 +15,7 @@ extern "C" {
 #include <regex>
 
 #define AUTOTOKENS_DEBUG 1
-#define AUTOTOKENS_LEN_MIN 12
-#define AUTOTOKENS_CHANGE_MIN_PERCENT 5
-#define AUTOTOKENS_CHANGE_MAX_PERCENT 10
+#define AUTOTOKENS_CHANGE_MIN 8
 
 using namespace std;
 
@@ -31,43 +31,55 @@ typedef struct my_mutator {
 static afl_state                           *afl_ptr;
 static int                                  debug = AUTOTOKENS_DEBUG;
 static u32                                  current_id = 0;
+static u32                                  valid_structures = 0;
+static u32                                  extras_cnt = 0, a_extras_cnt = 0;
 static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
-static regex regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
-static regex regex_comment_star("/\\*(.|\n)*?\\*/",
-                                regex::multiline | regex::optimize);
-static regex regex_string("\"(.*?)\"|'(.*?')", regex::optimize);
-static regex regex_word("[A-Za-z0-9_$]+", regex::optimize);
-static regex regex_whitespace(R"([ \t]+)", regex::optimize);
-static vector<u32> *s;
+static regex        regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
+static regex        regex_comment_star("/\\*(.|\n)*?\\*/",
+                                       regex::multiline | regex::optimize);
+static regex        regex_string("\"(.*?)\"|'(.*?')", regex::optimize);
+static regex        regex_word("[A-Za-z0-9_$]+", regex::optimize);
+static regex        regex_whitespace(R"([ \t]+)", regex::optimize);
+static vector<u32> *s;  // the structure of the currently selected input
 
-extern "C" size_t afl_custom_fuzz(my_mutator_t *data, uint8_t *buf, size_t buf_size,
-                       u8 **out_buf, uint8_t *add_buf,
-                       size_t add_buf_size, size_t max_size) {
+extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
+                                  u8 **out_buf, u8 *add_buf,
+                                  size_t add_buf_size, size_t max_size) {
 
-  DEBUG(stderr, "MUT!\n");
+  if (s == NULL) {
+
+    *out_buf = NULL;
+    return 0;
 
-  if (s == NULL) { return 0; }
+  }
 
-  vector<u32> m = *s;
-  u32 i, m_size = (u32)m.size();
+  vector<u32> m = *s;  // copy of the structure we will modify
+  u32         i, m_size = (u32)m.size();
 
-  u32 rounds = MAX(8, MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score * afl_ptr->havoc_div / 256));
-  DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
+  u32 rounds =
+      MAX(AUTOTOKENS_CHANGE_MIN,
+          MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score *
+                               afl_ptr->havoc_div / 256));
+  // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
   for (i = 0; i < rounds; ++i) {
-  
+
     u32 item, new_item;
-  
-    switch(rand_below(afl_ptr, 4)) {
+
+    switch (rand_below(afl_ptr, 4)) {
+
       /* CHANGE */
-      case 0: /* fall through */
+      case 0:                                               /* fall through */
       case 1:
         item = rand_below(afl_ptr, m_size);
         do {
+
           new_item = 1 + rand_below(afl_ptr, current_id);
-        } while(unlikely(new_item == m[item]));
+
+        } while (unlikely(new_item == m[item]));
+
         m[item] = new_item;
         break;
       /* INSERT (+1 so we insert also after last place) */
@@ -81,31 +93,32 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, uint8_t *buf, size_t buf_s
         if (m_size > 8) { m.erase(m.begin() + rand_below(afl_ptr, m_size)); }
         --m_size;
         break;
+        // TODO: add full line insert splice, replace splace, delete
+
     }
-  
+
   }
-  
+
   string output;
-  u32 m_size_1 = m_size - 1;
+  u32    m_size_1 = m_size - 1;
+
   for (i = 0; i < m_size; ++i) {
+
     output += id_to_token[m[i]];
     if (likely(i < m_size_1)) { output += " "; }
+
   }
 
   u32 mutated_size = output.size();
-  u8 *mutated_out = (u8*)afl_realloc((void**)out_buf, mutated_size);
+  u8 *mutated_out = (u8 *)afl_realloc((void **)out_buf, mutated_size);
 
   if (unlikely(!mutated_out)) {
-  
+
     *out_buf = NULL;
     return 0;
-  
+
   }
 
-  /*
-  *out_buf = buf;
-  return buf_size;
-  */
   memcpy(mutated_out, output.data(), mutated_size);
   *out_buf = mutated_out;
   DEBUG(stderr, "MUTATED to %u bytes:\n%s\n---\n", mutated_size, mutated_out);
@@ -113,29 +126,106 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, uint8_t *buf, size_t buf_s
 
 }
 
-
 /* We are not using afl_custom_queue_new_entry() because not every corpus entry
    will be necessarily fuzzed. so we use afl_custom_queue_get() instead */
 
 extern "C" unsigned char afl_custom_queue_get(void                *data,
                                               const unsigned char *filename) {
 
-  if (likely(!debug))
-    if (!afl_ptr->queue_cur->is_ascii) { s = NULL; return 0; }
+  if (likely(!debug)) {
+
+    if (afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) {
+
+      s = NULL;
+      return 0;
+
+    }
+
+  }
+
+  // check if there are new dictionary entries and add them to the tokens
+  if (valid_structures) {
+
+    while (extras_cnt < afl_ptr->extras_cnt) {
+
+      u32 ok = 1, l = afl_ptr->extras[extras_cnt].len;
+      u8 *ptr = afl_ptr->extras[extras_cnt].data;
+
+      for (u32 i = 0; i < l; ++i) {
+
+        if (!isascii((int)ptr[i]) && !isprint((int)ptr[i])) {
+
+          ok = 0;
+          break;
+
+        }
+
+      }
+
+      if (ok) {
+
+        ++current_id;
+        token_to_id[(char *)ptr] = current_id;
+        id_to_token[current_id] = (char *)ptr;
+
+      }
+
+      ++extras_cnt;
+      DEBUG(stderr, "Added from dictionary: \"%s\"\n", ptr);
+
+    }
+
+    while (a_extras_cnt < afl_ptr->a_extras_cnt) {
+
+      u32 ok = 1, l = afl_ptr->a_extras[a_extras_cnt].len;
+      u8 *ptr = afl_ptr->a_extras[a_extras_cnt].data;
+
+      for (u32 i = 0; i < l; ++i) {
+
+        if (!isascii((int)ptr[i]) && !isprint((int)ptr[i])) {
+
+          ok = 0;
+          break;
+
+        }
+
+      }
+
+      if (ok) {
+
+        ++current_id;
+        token_to_id[(char *)ptr] = current_id;
+        id_to_token[current_id] = (char *)ptr;
+
+      }
+
+      ++a_extras_cnt;
+      DEBUG(stderr, "Added from auto dictionary: \"%s\"\n", ptr);
+
+    }
+
+  }
 
   vector<u32> *structure = NULL;
   string       fn = (char *)filename;
+  auto         entry = file_mapping.find(fn);
 
-  auto entry = file_mapping.find(fn);
   if (entry == file_mapping.end()) {
 
     // this input file was not analyzed for tokens yet, so let's do it!
 
     FILE *fp = fopen((char *)filename, "rb");
-    if (!fp) { s = NULL; return 0; }  // should not happen
+    if (!fp) {
+
+      s = NULL;
+      return 0;
+
+    }  // should not happen
+
     fseek(fp, 0, SEEK_END);
     size_t len = (size_t)ftell(fp);
-    if (len < AUTOTOKENS_LEN_MIN) {
+
+    if (len < AFL_TXT_MIN_LEN) {
 
       fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
@@ -151,6 +241,30 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     fread(input.data(), input.size(), 1, fp);
     fclose(fp);
 
+    if (!afl_ptr->shm.cmplog_mode) {
+
+      // not running with CMPLOG? bad choice, but whatever ...
+      // we only want text inputs, so we have to check it ourselves.
+
+      u32 valid_chars = 0;
+      for (u32 i = 0; i < len; ++i) {
+
+        if (isascii((int)input[i]) || isprint((int)input[i])) { ++valid_chars; }
+
+      }
+
+      // we want at least 95% of text characters ...
+      if (((len * AFL_TXT_MIN_PERCENT) / 100) > valid_chars) {
+
+        file_mapping[fn] = NULL;
+        DEBUG(stderr, "Not text (%lu) %s\n", len, filename);
+        s = NULL;
+        return 0;
+
+      }
+
+    }
+
     // DEBUG(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
     // input.size(), filename, input.c_str());
 
@@ -175,7 +289,6 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     string::const_iterator cur = input.begin(), ende = input.end(), last = cur,
                            found, prev;
 
-    DEBUG(stderr, "MATCHES:\n");
     while (regex_search(cur, ende, match, regex_string)) {
 
       prev = cur;
@@ -196,11 +309,12 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
         DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
               input.size());
-        for (auto x : tokenized) {
+        if (unlikely(debug))
+          for (auto x : tokenized) {
 
-          cerr << x << endl;
+            cerr << x << endl;
 
-        }
+          }
 
         for (auto token : tokenized) {
 
@@ -232,8 +346,13 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
           if (c < e) {
 
-            string foo(c, e);
-            DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+            if (unlikely(debug)) {
+
+              string foo(c, e);
+              DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+            }
+
             tokens.push_back(std::string(c, e));
 
           }
@@ -248,8 +367,6 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     if (cur < ende) {
 
-      DEBUG(stderr, "REST!\n");
-
       sregex_token_iterator it{cur, ende, regex_whitespace, -1};
       vector<std::string>   tokenized{it, {}};
       tokenized.erase(
@@ -260,11 +377,12 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
             input.size());
-      for (auto x : tokenized) {
+      if (unlikely(debug))
+        for (auto x : tokenized) {
 
-        cerr << x << endl;
+          cerr << x << endl;
 
-      }
+        }
 
       for (auto token : tokenized) {
 
@@ -279,8 +397,13 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
           if (p < f) {
 
             // there are items between search start and find
-            string foo(p, f);
-            DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+            if (unlikely(debug)) {
+
+              string foo(p, f);
+              DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+
+            }
+
             tokens.push_back(std::string(p, f));
 
           }
@@ -296,8 +419,13 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
         if (c < e) {
 
-          string foo(c, e);
-          DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+          if (unlikely(debug)) {
+
+            string foo(c, e);
+            DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+          }
+
           tokens.push_back(std::string(c, e));
 
         }
@@ -306,15 +434,18 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
-    DEBUG(stderr, "DUMPING TOKENS:\n");
-    if (unlikely(debug))
+    if (unlikely(debug)) {
+
+      DEBUG(stderr, "DUMPING TOKENS:\n");
       for (u32 i = 0; i < tokens.size(); ++i) {
 
         DEBUG(stderr, "%s ", tokens[i].c_str());
 
       }
 
-    DEBUG(stderr, "---------------------------\n");
+      DEBUG(stderr, "---------------------------\n");
+
+    }
 
     /* Now we transform the tokens into an ID list and saved that */
 
@@ -342,6 +473,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     // save the token structure to the file mapping
     file_mapping[fn] = structure;
     s = structure;
+    ++valid_structures;
 
     // we are done!
     DEBUG(stderr, "DONE! We have %lu tokens in the structure\n",
diff --git a/include/config.h b/include/config.h
index a5a4c473..6cfaac11 100644
--- a/include/config.h
+++ b/include/config.h
@@ -494,7 +494,7 @@
 /* What is the minimum percentage of ascii characters present to be classifed
    as "is_ascii"? */
 
-#define AFL_TXT_MIN_PERCENT 94
+#define AFL_TXT_MIN_PERCENT 95
 
 /* How often to perform ASCII mutations 0 = disable, 1-8 are good values */
 
-- 
cgit 1.4.1


From 10b82c72772f40f703119fc7cd1c9063500a6bbe Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sun, 15 Jan 2023 18:17:28 +0100
Subject: fixes

---
 custom_mutators/autotokens/Makefile       |  2 +-
 custom_mutators/autotokens/autotokens.cpp | 40 ++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 12 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/Makefile b/custom_mutators/autotokens/Makefile
index 1ee7f5c4..5dd52dee 100644
--- a/custom_mutators/autotokens/Makefile
+++ b/custom_mutators/autotokens/Makefile
@@ -1,7 +1,7 @@
 all:	autotokens.so
 
 autotokens.so:	autotokens.cpp
-	$(CXX) -O3 -shared -fPIC -o autotokens.so -I../../include autotokens.cpp ../../src/afl-performance.o
+	$(CXX) -g -O3 $(CFLAGS) -shared -fPIC -o autotokens.so -I../../include autotokens.cpp ../../src/afl-performance.o
 
 clean:
 	rm -f autotokens.so *~ core
\ No newline at end of file
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 2fad8dd7..9fbdf52a 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #include <vector>
 #include <regex>
 
-#define AUTOTOKENS_DEBUG 1
+#define AUTOTOKENS_DEBUG 0
 #define AUTOTOKENS_CHANGE_MIN 8
 
 using namespace std;
@@ -64,11 +64,13 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                afl_ptr->havoc_div / 256));
   // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
+  u32 max_rand = 4;
+
   for (i = 0; i < rounds; ++i) {
 
     u32 item, new_item;
 
-    switch (rand_below(afl_ptr, 4)) {
+    switch (rand_below(afl_ptr, max_rand)) {
 
       /* CHANGE */
       case 0:                                               /* fall through */
@@ -90,9 +92,19 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         break;
       /* ERASE - only if large enough */
       case 3:
-        if (m_size > 8) { m.erase(m.begin() + rand_below(afl_ptr, m_size)); }
-        --m_size;
+        if (m_size > 8) {
+
+          m.erase(m.begin() + rand_below(afl_ptr, m_size));
+          --m_size;
+
+        } else {
+
+          max_rand = 3;
+
+        }
+
         break;
+
         // TODO: add full line insert splice, replace splace, delete
 
     }
@@ -119,9 +131,16 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   }
 
+  if (unlikely(debug)) {
+
+    DEBUG(stderr, "MUTATED to %u bytes:\n", mutated_size);
+    fwrite(output.data(), 1, mutated_size, stderr);
+    DEBUG(stderr, "\n---\n");
+
+  }
+
   memcpy(mutated_out, output.data(), mutated_size);
   *out_buf = mutated_out;
-  DEBUG(stderr, "MUTATED to %u bytes:\n%s\n---\n", mutated_size, mutated_out);
   return mutated_size;
 
 }
@@ -292,11 +311,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     while (regex_search(cur, ende, match, regex_string)) {
 
       prev = cur;
-      found = match[1].first;
-      cur = match[1].second;
-      DEBUG(stderr,
-            "string \"%s\" found at start %lu offset %lu continue at %lu\n",
-            match[1].str().c_str(), prev - input.begin(), match.position(),
+      found = match[0].first;
+      cur = match[0].second;
+      DEBUG(stderr, "string %s found at start %lu offset %lu continue at %lu\n",
+            match[0].str().c_str(), prev - input.begin(), match.position(),
             cur - input.begin());
       if (prev < found) {  // there are items between search start and find
         sregex_token_iterator it{prev, found, regex_whitespace, -1};
@@ -361,7 +379,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       }
 
-      if (match[1].length() > 0) { tokens.push_back(match[1]); }
+      if (match[0].length() > 0) { tokens.push_back(match[0]); }
 
     }
 
-- 
cgit 1.4.1


From 4b915207c42f8100f306778f617d7003c3e2193f Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 16 Jan 2023 17:05:04 +0100
Subject: autotokens - much better tokenizer

---
 custom_mutators/autotokens/autotokens.cpp | 307 +++++++++++++++++-------------
 1 file changed, 179 insertions(+), 128 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 9fbdf52a..850692a1 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -28,22 +28,41 @@ typedef struct my_mutator {
 #define DEBUG \
   if (unlikely(debug)) fprintf
 
-static afl_state                           *afl_ptr;
-static int                                  debug = AUTOTOKENS_DEBUG;
-static u32                                  current_id = 0;
-static u32                                  valid_structures = 0;
-static u32                                  extras_cnt = 0, a_extras_cnt = 0;
+static afl_state *afl_ptr;
+static int        debug = AUTOTOKENS_DEBUG;
+static u32        current_id;
+static u32        valid_structures;
+static u32        whitespace_ids;
+static u32        extras_cnt, a_extras_cnt;
+static u64        all_spaces, all_tabs, all_lf, all_ws;
 static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
-static regex        regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
-static regex        regex_comment_star("/\\*(.|\n)*?\\*/",
-                                       regex::multiline | regex::optimize);
-static regex        regex_string("\"(.*?)\"|'(.*?')", regex::optimize);
-static regex        regex_word("[A-Za-z0-9_$]+", regex::optimize);
-static regex        regex_whitespace(R"([ \t]+)", regex::optimize);
+// static regex        regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
+static regex regex_comment_star("/\\*([:print:]|\n)*?\\*/",
+                                regex::multiline | regex::optimize);
+static regex regex_string("\"[[:print:]]*?\"|'[[:print:]]*?'", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
 
+/* Returns a token id suitable as a separator: either a randomly drawn id
+   whose token text is a single character, or one of the ids 0, 1 and 2
+   picked according to the whitespace counters (2 being the linefeed).
+
+   NOTE(review): assumes all_spaces, all_tabs and all_lf are independent
+   per-kind counts and all_ws is their sum - the code that fills them is not
+   part of this hunk, please confirm. */
+u32 good_whitespace_or_singleval() {
+
+  u32 i = rand_below(afl_ptr, current_id);
+  if (id_to_token[i].size() == 1) { return i; }
+
+  u64 pick = rand_below(afl_ptr, all_ws);
+
+  if (pick < all_spaces) {
+
+    return 0;
+
+  } else if (pick < all_spaces + all_tabs) {
+
+    // the bound has to be cumulative, comparing against all_tabs alone
+    // makes this branch unreachable whenever all_tabs <= all_spaces
+    return 1;
+
+  } else
+
+    return 2;  // linefeed
+
+}
+
 extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                   u8 **out_buf, u8 *add_buf,
                                   size_t add_buf_size, size_t max_size) {
@@ -68,30 +87,76 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   for (i = 0; i < rounds; ++i) {
 
-    u32 item, new_item;
-
     switch (rand_below(afl_ptr, max_rand)) {
 
       /* CHANGE */
       case 0:                                               /* fall through */
-      case 1:
-        item = rand_below(afl_ptr, m_size);
+      case 1: {
+
+        u32 pos = rand_below(afl_ptr, m_size);
+        u32 cur_item = m[pos], new_item;
         do {
 
-          new_item = 1 + rand_below(afl_ptr, current_id);
+          new_item = rand_below(afl_ptr, current_id);
 
-        } while (unlikely(new_item == m[item]));
+        } while (unlikely(
 
-        m[item] = new_item;
+            new_item == cur_item ||
+            (whitespace_ids < new_item && whitespace_ids >= cur_item) ||
+            (whitespace_ids >= new_item && whitespace_ids < cur_item)));
+
+        DEBUG(stderr, "MUT: %u -> %u\n", cur_item, new_item);
+        m[pos] = new_item;
         break;
-      /* INSERT (+1 so we insert also after last place) */
-      case 2:
-        new_item = 1 + rand_below(afl_ptr, current_id);
-        m.insert(m.begin() + rand_below(afl_ptr, m_size + 1), new_item);
+
+      }
+
+      /* INSERT (m_size +1 so we insert also after last place) */
+      case 2: {
+
+        u32 new_item;
+        do {
+
+          new_item = rand_below(afl_ptr, current_id);
+
+        } while (new_item >= whitespace_ids);
+
+        u32 pos = rand_below(afl_ptr, m_size + 1);
+        m.insert(m.begin() + pos, new_item);
         ++m_size;
+
+        // if we insert an identifier or string we might need whitespace
+        if (id_to_token[new_item].size() > 1) {
+
+          // need to insert before?
+
+          if (pos && m[pos - 1] >= whitespace_ids &&
+              id_to_token[m[pos - 1]].size() > 1) {
+
+            m.insert(m.begin() + pos, good_whitespace_or_singleval());
+            ++m_size;
+
+          }
+
+          if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
+              id_to_token[m[pos + 1]].size() > 1) {
+
+            // need to insert after?
+
+            m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
+            ++m_size;
+
+          }
+
+        }
+
         break;
+
+      }
+
       /* ERASE - only if large enough */
-      case 3:
+      case 3: {
+
         if (m_size > 8) {
 
           m.erase(m.begin() + rand_below(afl_ptr, m_size));
@@ -105,6 +170,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         break;
 
+      }
+
         // TODO: add full line insert splice, replace splace, delete
 
     }
@@ -112,12 +179,10 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   }
 
   string output;
-  u32    m_size_1 = m_size - 1;
 
   for (i = 0; i < m_size; ++i) {
 
     output += id_to_token[m[i]];
-    if (likely(i < m_size_1)) { output += " "; }
 
   }
 
@@ -183,9 +248,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       if (ok) {
 
-        ++current_id;
         token_to_id[(char *)ptr] = current_id;
         id_to_token[current_id] = (char *)ptr;
+        ++current_id;
 
       }
 
@@ -212,9 +277,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       if (ok) {
 
-        ++current_id;
         token_to_id[(char *)ptr] = current_id;
         id_to_token[current_id] = (char *)ptr;
+        ++current_id;
 
       }
 
@@ -257,7 +322,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     string input;
     input.resize(len);
     rewind(fp);
-    fread(input.data(), input.size(), 1, fp);
+    fread((void *)input.data(), input.size(), 1, fp);
     fclose(fp);
 
     if (!afl_ptr->shm.cmplog_mode) {
@@ -287,28 +352,34 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     // DEBUG(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
     // input.size(), filename, input.c_str());
 
-    input = regex_replace(input, regex_comment_slash, "$2");
+    // input = regex_replace(input, regex_comment_slash, "$2");
     input = regex_replace(input, regex_comment_star, "");
 
     DEBUG(stderr, "After replace %lu bytes for %s\n%s\n", input.size(),
           filename, input.c_str());
 
-    /*
-    u32 spaces = count(input.begin(), input.end(), ' ');
-    u32 tabs = count(input.begin(), input.end(), '\t');
-    u32 linefeeds = count(input.begin(), input.end(), '\n');
+    u32  spaces = count(input.begin(), input.end(), ' ');
+    u32  tabs = count(input.begin(), input.end(), '\t');
+    u32  linefeeds = count(input.begin(), input.end(), '\n');
     bool ends_with_linefeed = input[input.length() - 1] == '\n';
     DEBUG(stderr, "spaces=%u tabs=%u linefeeds=%u ends=%u\n", spaces, tabs,
           linefeeds, ends_with_linefeed);
-    */
+    all_spaces += spaces;
+    all_tabs += tabs;
+    all_lf += linefeeds;
+    all_ws = all_spaces + all_tabs + all_lf;
 
     // now extract all tokens
     vector<string>         tokens;
     smatch                 match;
-    string::const_iterator cur = input.begin(), ende = input.end(), last = cur,
-                           found, prev;
+    string::const_iterator cur = input.begin(), ende = input.end(), found, prev;
 
-    while (regex_search(cur, ende, match, regex_string)) {
+    DEBUG(stderr, "START!\n");
+
+    while (regex_search(cur, ende, match, regex_string,
+                        regex_constants::match_any |
+                            regex_constants::match_not_null |
+                            regex_constants::match_continuous)) {
 
       prev = cur;
       found = match[0].first;
@@ -316,62 +387,42 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       DEBUG(stderr, "string %s found at start %lu offset %lu continue at %lu\n",
             match[0].str().c_str(), prev - input.begin(), match.position(),
             cur - input.begin());
+
       if (prev < found) {  // there are items between search start and find
-        sregex_token_iterator it{prev, found, regex_whitespace, -1};
-        vector<std::string>   tokenized{it, {}};
-        tokenized.erase(
-            std::remove_if(tokenized.begin(), tokenized.end(),
-                           [](std::string const &s) { return s.size() == 0; }),
-            tokenized.end());
-        tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+        while (prev < found) {
 
-        DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
-              input.size());
-        if (unlikely(debug))
-          for (auto x : tokenized) {
+          if (isspace(*prev)) {
 
-            cerr << x << endl;
+            auto start = prev;
+            while (isspace(*prev)) {
 
-          }
+              ++prev;
 
-        for (auto token : tokenized) {
+            }
 
-          string::const_iterator c = token.begin(), e = token.end(), f, p;
-          smatch                 m;
+            tokens.push_back(std::string(start, prev));
+            DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+                  tokens[tokens.size() - 1].c_str());
 
-          while (regex_search(c, e, m, regex_word)) {
+          } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
 
-            p = c;
-            f = m[0].first;
-            c = m[0].second;
-            if (p < f) {
+            auto start = prev;
+            while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
+                   *prev == '.' || *prev == '/') {
 
-              // there are items between search start and find
-              string foo(p, f);
-              DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
-              tokens.push_back(std::string(p, f));
+              ++prev;
 
             }
 
-            DEBUG(stderr,
-                  "SUBstring \"%s\" found at start %lu offset %lu continue at "
-                  "%lu\n",
-                  m[0].str().c_str(), p - input.begin(), m.position(),
-                  c - token.begin());
-            tokens.push_back(m[0].str());
-
-          }
-
-          if (c < e) {
-
-            if (unlikely(debug)) {
-
-              string foo(c, e);
-              DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+            tokens.push_back(std::string(start, prev));
+            DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+                  tokens[tokens.size() - 1].c_str());
 
-            }
+          } else {
 
-            tokens.push_back(std::string(c, e));
+            tokens.push_back(std::string(prev, prev + 1));
+            DEBUG(stderr, "OTHER \"%c\"\n", *prev);
+            ++prev;
 
           }
 
@@ -383,68 +434,44 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
-    if (cur < ende) {
-
-      sregex_token_iterator it{cur, ende, regex_whitespace, -1};
-      vector<std::string>   tokenized{it, {}};
-      tokenized.erase(
-          std::remove_if(tokenized.begin(), tokenized.end(),
-                         [](std::string const &s) { return s.size() == 0; }),
-          tokenized.end());
-      tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+    DEBUG(stderr, "AFTER all strings\n");
 
-      DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
-            input.size());
-      if (unlikely(debug))
-        for (auto x : tokenized) {
+    if (cur < ende) {
 
-          cerr << x << endl;
+      while (cur < ende) {
 
-        }
+        if (isspace(*cur)) {
 
-      for (auto token : tokenized) {
+          auto start = cur;
+          while (isspace(*cur)) {
 
-        string::const_iterator c = token.begin(), e = token.end(), f, p;
-        smatch                 m;
+            ++cur;
 
-        while (regex_search(c, e, m, regex_word)) {
-
-          p = c;
-          f = m[0].first;
-          c = m[0].second;
-          if (p < f) {
+          }
 
-            // there are items between search start and find
-            if (unlikely(debug)) {
+          tokens.push_back(std::string(start, cur));
+          DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
+                tokens[tokens.size() - 1].c_str());
 
-              string foo(p, f);
-              DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+        } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
 
-            }
+          auto start = cur;
+          while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
+                 *cur == '/') {
 
-            tokens.push_back(std::string(p, f));
+            ++cur;
 
           }
 
-          DEBUG(stderr,
-                "SUB2string \"%s\" found at start %lu offset %lu continue at "
-                "%lu\n",
-                m[0].str().c_str(), p - input.begin(), m.position(),
-                c - token.begin());
-          tokens.push_back(m[0].str());
-
-        }
-
-        if (c < e) {
-
-          if (unlikely(debug)) {
+          tokens.push_back(std::string(start, cur));
+          DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
+                tokens[tokens.size() - 1].c_str());
 
-            string foo(c, e);
-            DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
-
-          }
+        } else {
 
-          tokens.push_back(std::string(c, e));
+          tokens.push_back(std::string(cur, cur + 1));
+          DEBUG(stderr, "OTHER \"%c\"\n", *cur);
+          ++cur;
 
         }
 
@@ -457,7 +484,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       DEBUG(stderr, "DUMPING TOKENS:\n");
       for (u32 i = 0; i < tokens.size(); ++i) {
 
-        DEBUG(stderr, "%s ", tokens[i].c_str());
+        DEBUG(stderr, "%s", tokens[i].c_str());
 
       }
 
@@ -475,10 +502,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       if ((id = token_to_id[tokens[i]]) == 0) {
 
         // First time we see this token, add it to the list
-        ++current_id;
         token_to_id[tokens[i]] = current_id;
         id_to_token[current_id] = tokens[i];
         structure->push_back(current_id);
+        ++current_id;
 
       } else {
 
@@ -529,6 +556,30 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   data->afl = afl_ptr = afl;
 
+  // set common whitespace tokens
+  token_to_id[" "] = current_id;
+  id_to_token[current_id] = " ";
+  ++current_id;
+  token_to_id["\t"] = current_id;
+  id_to_token[current_id] = "\t";
+  ++current_id;
+  token_to_id["\n"] = current_id;
+  id_to_token[current_id] = "\n";
+  ++current_id;
+  token_to_id["\r\n"] = current_id;
+  id_to_token[current_id] = "\r\n";
+  ++current_id;
+  token_to_id[" \n"] = current_id;
+  id_to_token[current_id] = " \n";
+  ++current_id;
+  token_to_id["  "] = current_id;
+  id_to_token[current_id] = "  ";
+  ++current_id;
+  token_to_id["\t\t"] = current_id;
+  id_to_token[current_id] = "\t\t";
+  ++current_id;
+  whitespace_ids = current_id;
+
   return data;
 
 }
-- 
cgit 1.4.1


From 33f41e3974348d3b0b71b3a30a6483bb0418068c Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Tue, 17 Jan 2023 09:52:35 +0100
Subject: autotokens: print stats at exit

---
 custom_mutators/autotokens/README         |  7 ++++---
 custom_mutators/autotokens/autotokens.cpp | 12 ++++++++++++
 include/config.h                          |  4 ++--
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index 6849279e..0dcc6a3e 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -1,6 +1,6 @@
 # autotokens
 
-This implements an improved autotoken idea presented in
+This implements an improved autotoken grammar fuzzing idea presented in
 [Token-Level Fuzzing][https://www.usenix.org/system/files/sec21-salls.pdf].
 It is a grammar fuzzer without actually knowing the grammar.
 
@@ -8,5 +8,6 @@ It is recommended to run with together in an instance with `CMPLOG`.
 
 If you have a dictionary (`-x`) this improves this custom grammar mutator.
 
-If **not** run with `CMPLOG`, it is possible to set `AFL_CUSTOM_MUTATOR_ONLY`,
-to concentrate on grammar bug classes.
+If **not** running with `CMPLOG`, it is possible to set
+`AFL_CUSTOM_MUTATOR_ONLY` to concentrate on grammar bug classes.
+
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 850692a1..d6b269fd 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -35,6 +35,7 @@ static u32        valid_structures;
 static u32        whitespace_ids;
 static u32        extras_cnt, a_extras_cnt;
 static u64        all_spaces, all_tabs, all_lf, all_ws;
+static u64        all_structure_items;
 static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
@@ -519,6 +520,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     file_mapping[fn] = structure;
     s = structure;
     ++valid_structures;
+    all_structure_items += structure->size();
 
     // we are done!
     DEBUG(stderr, "DONE! We have %lu tokens in the structure\n",
@@ -586,6 +588,16 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
 extern "C" void afl_custom_deinit(my_mutator_t *data) {
 
+  /* Mutator teardown: print summary statistics, then release `data`.
+     Output needs to go to stderr as stdout is filtered. */
+
+  fprintf(stderr,
+          "\n\nAutotoken mutator statistics:\n"
+          "  Number of all seen tokens:  %u\n"
+          "  Number of input structures: %u\n"
+          "  Number of all items in structures: %llu\n\n",
+          current_id, valid_structures, (unsigned long long)all_structure_items);
+
   free(data);
 
 }
diff --git a/include/config.h b/include/config.h
index 6cfaac11..f8a742f2 100644
--- a/include/config.h
+++ b/include/config.h
@@ -364,9 +364,9 @@
  *                                                         *
  ***********************************************************/
 
-/* Call count interval between reseeding the libc PRNG from /dev/urandom: */
+/* Call count interval between reseeding the PRNG from /dev/urandom: */
 
-#define RESEED_RNG 100000
+#define RESEED_RNG 2500000
 
 /* The default maximum testcase cache size in MB, 0 = disable.
    A value between 50 and 250 is a good default value. Note that the
-- 
cgit 1.4.1


From efe57c936880608a2de452340d63f262470d9fcd Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Tue, 17 Jan 2023 09:57:23 +0100
Subject: more whitespace

---
 custom_mutators/autotokens/autotokens.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index d6b269fd..5580512a 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -559,6 +559,8 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   data->afl = afl_ptr = afl;
 
   // set common whitespace tokens
+  // we deliberately do not put uncommon ones here so these will count as
+  // identifier tokens.
   token_to_id[" "] = current_id;
   id_to_token[current_id] = " ";
   ++current_id;
@@ -580,6 +582,21 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   token_to_id["\t\t"] = current_id;
   id_to_token[current_id] = "\t\t";
   ++current_id;
+  token_to_id["\n\n"] = current_id;
+  id_to_token[current_id] = "\n\n";
+  ++current_id;
+  token_to_id["\r\n\r\n"] = current_id;
+  id_to_token[current_id] = "\r\n\r\n";
+  ++current_id;
+  token_to_id["    "] = current_id;
+  id_to_token[current_id] = "    ";
+  ++current_id;
+  token_to_id["\t\t\t\t"] = current_id;
+  id_to_token[current_id] = "\t\t\t\t";
+  ++current_id;
+  token_to_id["\n\n\n\n"] = current_id;
+  id_to_token[current_id] = "\n\n\n\n";
+  ++current_id;
   whitespace_ids = current_id;
 
   return data;
-- 
cgit 1.4.1


From a41fd5cc5c4a5073f38adf06270e2985c88da9d5 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 11:46:28 +0100
Subject: alternate tokenize, options

---
 custom_mutators/autotokens/README         |   9 +
 custom_mutators/autotokens/autotokens.cpp | 432 ++++++++++++++++++++++++------
 2 files changed, 365 insertions(+), 76 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index 0dcc6a3e..f6e9c753 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -11,3 +11,12 @@ If you have a dictionary (`-x`) this improves this custom grammar mutator.
 If **not** running with `CMPLOG`, it is possible to set
 `AFL_CUSTOM_MUTATOR_ONLY` to concentrate on grammar bug classes.
 
+## Configuration via environment variables
+
+`AUTOTOKENS_ONLY_FAV` - only use this mutator on favorite queue items
+`AUTOTOKENS_COMMENT` - what character or string starts a comment which will be
+                       removed. Default: `/* ... */`
+`AUTOTOKENS_ALTERNATIVE_TOKENIZE` - use an alternative tokenize implementation
+                                   (experimental)
+`AUTOTOKENS_WHITESPACE` - whitespace string to use for ALTERNATIVE_TOKENIZE,
+                          default is " "
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 5580512a..28ef91e2 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -15,7 +15,10 @@ extern "C" {
 #include <regex>
 
 #define AUTOTOKENS_DEBUG 0
+#define AUTOTOKENS_ONLY_FAV 0
+#define AUTOTOKENS_ALTERNATIVE_TOKENIZE 0
 #define AUTOTOKENS_CHANGE_MIN 8
+#define AUTOTOKENS_WHITESPACE " "
 
 using namespace std;
 
@@ -30,6 +33,8 @@ typedef struct my_mutator {
 
 static afl_state *afl_ptr;
 static int        debug = AUTOTOKENS_DEBUG;
+static int        only_fav = AUTOTOKENS_ONLY_FAV;
+static int        alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
 static u32        current_id;
 static u32        valid_structures;
 static u32        whitespace_ids;
@@ -39,9 +44,12 @@ static u64        all_structure_items;
 static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
-// static regex        regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
+static string                               whitespace = AUTOTOKENS_WHITESPACE;
+static regex                               *regex_comment_custom;
 static regex regex_comment_star("/\\*([:print:]|\n)*?\\*/",
                                 regex::multiline | regex::optimize);
+static regex regex_word("[A-Za-z0-9_$]+", regex::optimize);
+static regex regex_whitespace(R"([ \t]+)", regex::optimize);
 static regex regex_string("\"[[:print:]]*?\"|'[[:print:]]*?'", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
 
@@ -84,15 +92,15 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                afl_ptr->havoc_div / 256));
   // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
-  u32 max_rand = 4;
+  u32 max_rand = 7;
 
   for (i = 0; i < rounds; ++i) {
 
     switch (rand_below(afl_ptr, max_rand)) {
 
       /* CHANGE */
-      case 0:                                               /* fall through */
-      case 1: {
+      case 0 ... 3:                                         /* fall through */
+      {
 
         u32 pos = rand_below(afl_ptr, m_size);
         u32 cur_item = m[pos], new_item;
@@ -103,8 +111,9 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         } while (unlikely(
 
             new_item == cur_item ||
-            (whitespace_ids < new_item && whitespace_ids >= cur_item) ||
-            (whitespace_ids >= new_item && whitespace_ids < cur_item)));
+            (!alternative_tokenize &&
+             ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
+              (whitespace_ids >= new_item && whitespace_ids < cur_item)))));
 
         DEBUG(stderr, "MUT: %u -> %u\n", cur_item, new_item);
         m[pos] = new_item;
@@ -113,7 +122,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       }
 
       /* INSERT (m_size +1 so we insert also after last place) */
-      case 2: {
+      case 4 ... 5: {
 
         u32 new_item;
         do {
@@ -126,26 +135,30 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         m.insert(m.begin() + pos, new_item);
         ++m_size;
 
-        // if we insert an identifier or string we might need whitespace
-        if (id_to_token[new_item].size() > 1) {
+        if (likely(!alternative_tokenize)) {
 
-          // need to insert before?
+          // if we insert an identifier or string we might need whitespace
+          if (id_to_token[new_item].size() > 1) {
 
-          if (pos && m[pos - 1] >= whitespace_ids &&
-              id_to_token[m[pos - 1]].size() > 1) {
+            // need to insert before?
 
-            m.insert(m.begin() + pos, good_whitespace_or_singleval());
-            ++m_size;
+            if (pos && m[pos - 1] >= whitespace_ids &&
+                id_to_token[m[pos - 1]].size() > 1) {
 
-          }
+              m.insert(m.begin() + pos, good_whitespace_or_singleval());
+              ++m_size;
+
+            }
+
+            if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
+                id_to_token[m[pos + 1]].size() > 1) {
 
-          if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
-              id_to_token[m[pos + 1]].size() > 1) {
+              // need to insert after?
 
-            // need to insert after?
+              m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
+              ++m_size;
 
-            m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
-            ++m_size;
+            }
 
           }
 
@@ -156,7 +169,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       }
 
       /* ERASE - only if large enough */
-      case 3: {
+      case 6: {
 
         if (m_size > 8) {
 
@@ -165,7 +178,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         } else {
 
-          max_rand = 3;
+          max_rand = 6;
 
         }
 
@@ -180,10 +193,16 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   }
 
   string output;
+  u32    m_size_1 = m_size - 1;
 
   for (i = 0; i < m_size; ++i) {
 
     output += id_to_token[m[i]];
+    if (unlikely(alternative_tokenize && i < m_size_1)) {
+
+      output += whitespace;
+
+    }
 
   }
 
@@ -219,7 +238,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
   if (likely(!debug)) {
 
-    if (afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) {
+    if ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
+        (only_fav && !afl_ptr->queue_cur->favored)) {
 
       s = NULL;
       return 0;
@@ -353,8 +373,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     // DEBUG(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
     // input.size(), filename, input.c_str());
 
-    // input = regex_replace(input, regex_comment_slash, "$2");
-    input = regex_replace(input, regex_comment_star, "");
+    if (regex_comment_custom) {
+
+      input = regex_replace(input, *regex_comment_custom, "$2");
+
+    } else {
+
+      input = regex_replace(input, regex_comment_star, "");
+
+    }
 
     DEBUG(stderr, "After replace %lu bytes for %s\n%s\n", input.size(),
           filename, input.c_str());
@@ -377,53 +404,105 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     DEBUG(stderr, "START!\n");
 
-    while (regex_search(cur, ende, match, regex_string,
-                        regex_constants::match_any |
-                            regex_constants::match_not_null |
-                            regex_constants::match_continuous)) {
+    if (likely(!alternative_tokenize)) {
+
+      while (regex_search(cur, ende, match, regex_string,
+                          regex_constants::match_any |
+                              regex_constants::match_not_null |
+                              regex_constants::match_continuous)) {
+
+        prev = cur;
+        found = match[0].first;
+        cur = match[0].second;
+        DEBUG(stderr,
+              "string %s found at start %lu offset %lu continue at %lu\n",
+              match[0].str().c_str(), prev - input.begin(), match.position(),
+              cur - input.begin());
+
+        if (prev < found) {  // there are items between search start and find
+          while (prev < found) {
 
-      prev = cur;
-      found = match[0].first;
-      cur = match[0].second;
-      DEBUG(stderr, "string %s found at start %lu offset %lu continue at %lu\n",
-            match[0].str().c_str(), prev - input.begin(), match.position(),
-            cur - input.begin());
+            if (isspace(*prev)) {
 
-      if (prev < found) {  // there are items between search start and find
-        while (prev < found) {
+              auto start = prev;
+              while (isspace(*prev)) {
 
-          if (isspace(*prev)) {
+                ++prev;
 
-            auto start = prev;
-            while (isspace(*prev)) {
+              }
 
+              tokens.push_back(std::string(start, prev));
+              DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+                    tokens[tokens.size() - 1].c_str());
+
+            } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
+
+              auto start = prev;
+              while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
+                     *prev == '.' || *prev == '/') {
+
+                ++prev;
+
+              }
+
+              tokens.push_back(std::string(start, prev));
+              DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+                    tokens[tokens.size() - 1].c_str());
+
+            } else {
+
+              tokens.push_back(std::string(prev, prev + 1));
+              DEBUG(stderr, "OTHER \"%c\"\n", *prev);
               ++prev;
 
             }
 
-            tokens.push_back(std::string(start, prev));
-            DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+          }
+
+        }
+
+        if (match[0].length() > 0) { tokens.push_back(match[0]); }
+
+      }
+
+      DEBUG(stderr, "AFTER all strings\n");
+
+      if (cur < ende) {
+
+        while (cur < ende) {
+
+          if (isspace(*cur)) {
+
+            auto start = cur;
+            while (isspace(*cur)) {
+
+              ++cur;
+
+            }
+
+            tokens.push_back(std::string(start, cur));
+            DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
                   tokens[tokens.size() - 1].c_str());
 
-          } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
+          } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
 
-            auto start = prev;
-            while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
-                   *prev == '.' || *prev == '/') {
+            auto start = cur;
+            while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
+                   *cur == '/') {
 
-              ++prev;
+              ++cur;
 
             }
 
-            tokens.push_back(std::string(start, prev));
-            DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+            tokens.push_back(std::string(start, cur));
+            DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
                   tokens[tokens.size() - 1].c_str());
 
           } else {
 
-            tokens.push_back(std::string(prev, prev + 1));
-            DEBUG(stderr, "OTHER \"%c\"\n", *prev);
-            ++prev;
+            tokens.push_back(std::string(cur, cur + 1));
+            DEBUG(stderr, "OTHER \"%c\"\n", *cur);
+            ++cur;
 
           }
 
@@ -431,48 +510,227 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       }
 
-      if (match[0].length() > 0) { tokens.push_back(match[0]); }
+    } else {
 
-    }
+      // alternative tokenize
 
-    DEBUG(stderr, "AFTER all strings\n");
+      while (regex_search(cur, ende, match, regex_string)) {
 
-    if (cur < ende) {
+        prev = cur;
+        found = match[0].first;
+        cur = match[0].second;
+        DEBUG(stderr,
+              "string %s found at start %lu offset %lu continue at %lu\n",
+              match[0].str().c_str(), prev - input.begin(), match.position(),
+              cur - input.begin());
+        if (prev < found) {  // there are items between search start and find
+          sregex_token_iterator it{prev, found, regex_whitespace, -1};
+          vector<std::string>   tokenized{it, {}};
+          tokenized.erase(std::remove_if(tokenized.begin(), tokenized.end(),
+                                         [](std::string const &s) {
 
-      while (cur < ende) {
+                                           return s.size() == 0;
 
-        if (isspace(*cur)) {
+                                         }),
 
-          auto start = cur;
-          while (isspace(*cur)) {
+                          tokenized.end());
+          tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
 
-            ++cur;
+          if (unlikely(debug)) {
+
+            DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
+                  input.size());
+            for (auto x : tokenized) {
+
+              cerr << x << endl;
+
+            }
 
           }
 
-          tokens.push_back(std::string(start, cur));
-          DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
-                tokens[tokens.size() - 1].c_str());
+          for (auto token : tokenized) {
 
-        } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
+            string::const_iterator c = token.begin(), e = token.end(), f, p;
+            smatch                 m;
 
-          auto start = cur;
-          while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
-                 *cur == '/') {
+            while (regex_search(c, e, m, regex_word)) {
 
-            ++cur;
+              p = c;
+              f = m[0].first;
+              c = m[0].second;
+              if (p < f) {
+
+                // there are items between search start and find
+                while (p < f) {
+
+                  if (unlikely(debug)) {
+
+                    string foo(p, p + 1);
+                    DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+
+                  }
+
+                  tokens.push_back(std::string(p, p + 1));
+                  ++p;
+
+                }
+
+                /*
+                                string foo(p, f);
+                                DEBUG(stderr, "before string: \"%s\"\n",
+                   foo.c_str()); tokens.push_back(std::string(p, f));
+                */
+
+              }
+
+              DEBUG(
+                  stderr,
+                  "SUBstring \"%s\" found at start %lu offset %lu continue at "
+                  "%lu\n",
+                  m[0].str().c_str(), p - input.begin(), m.position(),
+                  c - token.begin());
+              tokens.push_back(m[0].str());
+
+            }
+
+            if (c < e) {
+
+              while (c < e) {
+
+                if (unlikely(debug)) {
+
+                  string foo(c, c + 1);
+                  DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+                }
+
+                tokens.push_back(std::string(c, c + 1));
+                ++c;
+
+              }
+
+              /*
+                            if (unlikely(debug)) {
+
+                              string foo(c, e);
+                              DEBUG(stderr, "after string: \"%s\"\n",
+                 foo.c_str());
+
+                            }
+
+                            tokens.push_back(std::string(c, e));
+              */
+
+            }
 
           }
 
-          tokens.push_back(std::string(start, cur));
-          DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
-                tokens[tokens.size() - 1].c_str());
+        }
+
+        if (match[0].length() > 0) { tokens.push_back(match[0]); }
 
-        } else {
+      }
+
+      if (cur < ende) {
+
+        sregex_token_iterator it{cur, ende, regex_whitespace, -1};
+        vector<std::string>   tokenized{it, {}};
+        tokenized.erase(
+            std::remove_if(tokenized.begin(), tokenized.end(),
+                           [](std::string const &s) { return s.size() == 0; }),
+            tokenized.end());
+        tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+
+        if (unlikely(debug)) {
+
+          DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
+                input.size());
+          for (auto x : tokenized) {
+
+            cerr << x << endl;
 
-          tokens.push_back(std::string(cur, cur + 1));
-          DEBUG(stderr, "OTHER \"%c\"\n", *cur);
-          ++cur;
+          }
+
+        }
+
+        for (auto token : tokenized) {
+
+          string::const_iterator c = token.begin(), e = token.end(), f, p;
+          smatch                 m;
+
+          while (regex_search(c, e, m, regex_word)) {
+
+            p = c;
+            f = m[0].first;
+            c = m[0].second;
+            if (p < f) {
+
+              // there are items between search start and find
+              while (p < f) {
+
+                if (unlikely(debug)) {
+
+                  string foo(p, p + 1);
+                  DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+
+                }
+
+                tokens.push_back(std::string(p, p + 1));
+                ++p;
+
+              }
+
+              /*
+                            if (unlikely(debug)) {
+
+                              string foo(p, f);
+                              DEBUG(stderr, "before string: \"%s\"\n",
+                 foo.c_str());
+
+                            }
+
+                            tokens.push_back(std::string(p, f));
+              */
+
+            }
+
+            DEBUG(stderr,
+                  "SUB2string \"%s\" found at start %lu offset %lu continue at "
+                  "%lu\n",
+                  m[0].str().c_str(), p - input.begin(), m.position(),
+                  c - token.begin());
+            tokens.push_back(m[0].str());
+
+          }
+
+          if (c < e) {
+
+            while (c < e) {
+
+              if (unlikely(debug)) {
+
+                string foo(c, c + 1);
+                DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+              }
+
+              tokens.push_back(std::string(c, c + 1));
+              ++c;
+
+            }
+
+            /*
+                        if (unlikely(debug)) {
+
+                          string foo(c, e);
+                          DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+                        }
+
+                        tokens.push_back(std::string(c, e));
+            */
+
+          }
 
         }
 
@@ -483,9 +741,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     if (unlikely(debug)) {
 
       DEBUG(stderr, "DUMPING TOKENS:\n");
+      u32 size_1 = tokens.size() - 1;
       for (u32 i = 0; i < tokens.size(); ++i) {
 
         DEBUG(stderr, "%s", tokens[i].c_str());
+        if (unlikely(alternative_tokenize && i < size_1)) {
+
+          DEBUG(stderr, "%s", whitespace.c_str());
+
+        }
 
       }
 
@@ -556,6 +820,22 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   }
 
+  if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
+  if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
+  if (getenv("AUTOTOKENS_WHITESPACE")) {
+
+    whitespace = getenv("AUTOTOKENS_WHITESPACE");
+
+  }
+
+  if (getenv("AUTOTOKENS_COMMENT")) {
+
+    char buf[256];
+    snprintf(buf, sizeof(buf), "(%s.*)([\r\n]?)", getenv("AUTOTOKENS_COMMENT"));
+    regex_comment_custom = new regex(buf, regex::optimize);
+
+  }
+
   data->afl = afl_ptr = afl;
 
   // set common whitespace tokens
-- 
cgit 1.4.1


From 70f4b456faf8e361f6e0a34246708380c94cb36e Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 13:58:27 +0100
Subject: fixes

---
 custom_mutators/autotokens/Makefile       |  7 ++++++-
 custom_mutators/autotokens/autotokens.cpp | 24 +++++++++++++++++-------
 2 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/Makefile b/custom_mutators/autotokens/Makefile
index 5dd52dee..8af63635 100644
--- a/custom_mutators/autotokens/Makefile
+++ b/custom_mutators/autotokens/Makefile
@@ -1,7 +1,12 @@
+ifdef debug
+	CFLAGS += "-fsanitize=address -Wall"
+	CXX := clang++
+endif
+
 all:	autotokens.so
 
 autotokens.so:	autotokens.cpp
 	$(CXX) -g -O3 $(CFLAGS) -shared -fPIC -o autotokens.so -I../../include autotokens.cpp ../../src/afl-performance.o
 
 clean:
-	rm -f autotokens.so *~ core
\ No newline at end of file
+	rm -f autotokens.so *~ core
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 28ef91e2..57c35846 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -48,7 +48,7 @@ static string                               whitespace = AUTOTOKENS_WHITESPACE;
 static regex                               *regex_comment_custom;
 static regex regex_comment_star("/\\*([:print:]|\n)*?\\*/",
                                 regex::multiline | regex::optimize);
-static regex regex_word("[A-Za-z0-9_$]+", regex::optimize);
+static regex regex_word("[A-Za-z0-9_$.-]+", regex::optimize);
 static regex regex_whitespace(R"([ \t]+)", regex::optimize);
 static regex regex_string("\"[[:print:]]*?\"|'[[:print:]]*?'", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
@@ -514,7 +514,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       // alternative tokenize
 
-      while (regex_search(cur, ende, match, regex_string)) {
+      while (regex_search(cur, ende, match, regex_string,
+                          regex_constants::match_any |
+                              regex_constants::match_not_null |
+                              regex_constants::match_continuous)) {
 
         prev = cur;
         found = match[0].first;
@@ -553,7 +556,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
             string::const_iterator c = token.begin(), e = token.end(), f, p;
             smatch                 m;
 
-            while (regex_search(c, e, m, regex_word)) {
+            while (regex_search(c, e, m, regex_word,
+                                regex_constants::match_any |
+                                    regex_constants::match_not_null |
+                                    regex_constants::match_continuous)) {
 
               p = c;
               f = m[0].first;
@@ -658,7 +664,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
           string::const_iterator c = token.begin(), e = token.end(), f, p;
           smatch                 m;
 
-          while (regex_search(c, e, m, regex_word)) {
+          while (regex_search(c, e, m, regex_word,
+                              regex_constants::match_any |
+                                  regex_constants::match_not_null |
+                                  regex_constants::match_continuous)) {
 
             p = c;
             f = m[0].first;
@@ -820,6 +829,7 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   }
 
+  if (getenv("AUTOTOKENS_DEBUG")) { debug = 1; }
   if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
   if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
   if (getenv("AUTOTOKENS_WHITESPACE")) {
@@ -890,9 +900,9 @@ extern "C" void afl_custom_deinit(my_mutator_t *data) {
 
   fprintf(stderr,
           "\n\nAutotoken mutator statistics:\n"
-          "  Number of all seen tokens:  %lu\n"
-          "  Number of input structures: %lu\n"
-          "  Number of all items in structures: %lu\n\n",
+          "  Number of all seen tokens:  %u\n"
+          "  Number of input structures: %u\n"
+          "  Number of all items in structures: %llu\n\n",
           current_id - 1, valid_structures, all_structure_items);
 
   free(data);
-- 
cgit 1.4.1


From 0db662db7b433a08b01de7f5a989843450919b88 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 14:21:44 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 78 ++++++++++++++++---------------
 1 file changed, 41 insertions(+), 37 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 57c35846..94f86413 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -851,43 +851,47 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   // set common whitespace tokens
   // we deliberately do not put uncommon ones here to these will count as
   // identifier tokens.
-  token_to_id[" "] = current_id;
-  id_to_token[current_id] = " ";
-  ++current_id;
-  token_to_id["\t"] = current_id;
-  id_to_token[current_id] = "\t";
-  ++current_id;
-  token_to_id["\n"] = current_id;
-  id_to_token[current_id] = "\n";
-  ++current_id;
-  token_to_id["\r\n"] = current_id;
-  id_to_token[current_id] = "\r\n";
-  ++current_id;
-  token_to_id[" \n"] = current_id;
-  id_to_token[current_id] = " \n";
-  ++current_id;
-  token_to_id["  "] = current_id;
-  id_to_token[current_id] = "  ";
-  ++current_id;
-  token_to_id["\t\t"] = current_id;
-  id_to_token[current_id] = "\t\t";
-  ++current_id;
-  token_to_id["\n\n"] = current_id;
-  id_to_token[current_id] = "\n\n";
-  ++current_id;
-  token_to_id["\r\n\r\n"] = current_id;
-  id_to_token[current_id] = "\r\n\r\n";
-  ++current_id;
-  token_to_id["    "] = current_id;
-  id_to_token[current_id] = "    ";
-  ++current_id;
-  token_to_id["\t\t\t\t"] = current_id;
-  id_to_token[current_id] = "\t\t\t\t";
-  ++current_id;
-  token_to_id["\n\n\n\n"] = current_id;
-  id_to_token[current_id] = "\n\n\n\n";
-  ++current_id;
-  whitespace_ids = current_id;
+  if (!alternative_tokenize) {
+
+    token_to_id[" "] = current_id;
+    id_to_token[current_id] = " ";
+    ++current_id;
+    token_to_id["\t"] = current_id;
+    id_to_token[current_id] = "\t";
+    ++current_id;
+    token_to_id["\n"] = current_id;
+    id_to_token[current_id] = "\n";
+    ++current_id;
+    token_to_id["\r\n"] = current_id;
+    id_to_token[current_id] = "\r\n";
+    ++current_id;
+    token_to_id[" \n"] = current_id;
+    id_to_token[current_id] = " \n";
+    ++current_id;
+    token_to_id["  "] = current_id;
+    id_to_token[current_id] = "  ";
+    ++current_id;
+    token_to_id["\t\t"] = current_id;
+    id_to_token[current_id] = "\t\t";
+    ++current_id;
+    token_to_id["\n\n"] = current_id;
+    id_to_token[current_id] = "\n\n";
+    ++current_id;
+    token_to_id["\r\n\r\n"] = current_id;
+    id_to_token[current_id] = "\r\n\r\n";
+    ++current_id;
+    token_to_id["    "] = current_id;
+    id_to_token[current_id] = "    ";
+    ++current_id;
+    token_to_id["\t\t\t\t"] = current_id;
+    id_to_token[current_id] = "\t\t\t\t";
+    ++current_id;
+    token_to_id["\n\n\n\n"] = current_id;
+    id_to_token[current_id] = "\n\n\n\n";
+    ++current_id;
+    whitespace_ids = current_id;
+
+  }
 
   return data;
 
-- 
cgit 1.4.1


From 22f757a169d3da3081306c0f861ef99a509073fe Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 14:33:06 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 94f86413..7aecb010 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -129,7 +129,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
           new_item = rand_below(afl_ptr, current_id);
 
-        } while (new_item >= whitespace_ids);
+        } while (!alternative_tokenize && new_item >= whitespace_ids);
 
         u32 pos = rand_below(afl_ptr, m_size + 1);
         m.insert(m.begin() + pos, new_item);
-- 
cgit 1.4.1


From 14d8eb9e40a6329abcb2f153174b543349c68c13 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 22:17:14 +0100
Subject: autotoken: splicing; splice_optout

---
 custom_mutators/autotokens/Makefile       |   6 +-
 custom_mutators/autotokens/autotokens.cpp | 103 +++++++++++++++++++++++++++---
 docs/custom_mutators.md                   |  11 ++++
 include/afl-fuzz.h                        |  14 ++++
 src/afl-fuzz-mutators.c                   |  13 ++++
 src/afl-fuzz-one.c                        |   3 +-
 src/afl-fuzz-python.c                     |  16 +++++
 7 files changed, 155 insertions(+), 11 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/Makefile b/custom_mutators/autotokens/Makefile
index 8af63635..ab1da4b6 100644
--- a/custom_mutators/autotokens/Makefile
+++ b/custom_mutators/autotokens/Makefile
@@ -1,5 +1,9 @@
 ifdef debug
-	CFLAGS += "-fsanitize=address -Wall"
+	CFLAGS += -fsanitize=address -Wall
+	CXX := clang++
+endif
+ifdef DEBUG
+	CFLAGS += -fsanitize=address -Wall
 	CXX := clang++
 endif
 
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 7aecb010..c9ec4352 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -19,6 +19,13 @@ extern "C" {
 #define AUTOTOKENS_ALTERNATIVE_TOKENIZE 0
 #define AUTOTOKENS_CHANGE_MIN 8
 #define AUTOTOKENS_WHITESPACE " "
+#define AUTOTOKENS_SIZE_MIN 8
+#define AUTOTOKENS_SPLICE_MIN 4
+#define AUTOTOKENS_SPLICE_MAX 64
+
+#if AUTOTOKENS_SPLICE_MIN >= AUTOTOKENS_SIZE_MIN
+  #error SPLICE_MIN must be lower than SIZE_MIN
+#endif
 
 using namespace std;
 
@@ -42,6 +49,7 @@ static u32        extras_cnt, a_extras_cnt;
 static u64        all_spaces, all_tabs, all_lf, all_ws;
 static u64        all_structure_items;
 static unordered_map<string, vector<u32> *> file_mapping;
+static unordered_map<u32, vector<u32> *>    id_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
 static string                               whitespace = AUTOTOKENS_WHITESPACE;
@@ -76,6 +84,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                   u8 **out_buf, u8 *add_buf,
                                   size_t add_buf_size, size_t max_size) {
 
+  (void)(data);
+
   if (s == NULL) {
 
     *out_buf = NULL;
@@ -92,14 +102,14 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                afl_ptr->havoc_div / 256));
   // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
-  u32 max_rand = 7;
+  u32 max_rand = 14;
 
   for (i = 0; i < rounds; ++i) {
 
     switch (rand_below(afl_ptr, max_rand)) {
 
       /* CHANGE */
-      case 0 ... 3:                                         /* fall through */
+      case 0 ... 7:                                         /* fall through */
       {
 
         u32 pos = rand_below(afl_ptr, m_size);
@@ -122,18 +132,19 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       }
 
       /* INSERT (m_size +1 so we insert also after last place) */
-      case 4 ... 5: {
+      case 8 ... 9: {
 
         u32 new_item;
         do {
 
           new_item = rand_below(afl_ptr, current_id);
 
-        } while (!alternative_tokenize && new_item >= whitespace_ids);
+        } while (unlikely(!alternative_tokenize && new_item >= whitespace_ids));
 
         u32 pos = rand_below(afl_ptr, m_size + 1);
         m.insert(m.begin() + pos, new_item);
         ++m_size;
+        DEBUG(stderr, "INS: %u at %u\n", new_item, pos);
 
         if (likely(!alternative_tokenize)) {
 
@@ -168,8 +179,63 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
       }
 
+      /* SPLICING */
+      case 10 ... 11: {
+
+        u32  strategy = rand_below(afl_ptr, 4), dst_off, n;
+        auto src = id_mapping[rand_below(afl_ptr, valid_structures)];
+        u32  src_size = src->size();
+        u32  src_off = rand_below(afl_ptr, src_size - AUTOTOKENS_SPLICE_MIN);
+        u32  rand_r = 1 + MAX(AUTOTOKENS_SPLICE_MIN,
+                              MIN(AUTOTOKENS_SPLICE_MAX, src_size - src_off));
+
+        switch (strategy) {
+
+          // insert
+          case 0: {
+
+            dst_off = rand_below(afl_ptr, m_size);
+            n = AUTOTOKENS_SPLICE_MIN +
+                rand_below(afl_ptr, MIN(AUTOTOKENS_SPLICE_MAX,
+                                        rand_r - AUTOTOKENS_SPLICE_MIN));
+            m.insert(m.begin() + dst_off, src->begin() + src_off,
+                     src->begin() + src_off + n);
+            m_size += n;
+            DEBUG(stderr, "SPLICE-INS: %u at %u\n", n, dst_off);
+            break;
+
+          }
+
+          // overwrite
+          default: {
+
+            dst_off = rand_below(afl_ptr, m_size - AUTOTOKENS_SPLICE_MIN);
+            n = AUTOTOKENS_SPLICE_MIN +
+                rand_below(
+                    afl_ptr,
+                    MIN(AUTOTOKENS_SPLICE_MAX - AUTOTOKENS_SPLICE_MIN,
+                        MIN(m_size - dst_off - AUTOTOKENS_SPLICE_MIN,
+                            src_size - src_off - AUTOTOKENS_SPLICE_MIN)));
+
+            for (u32 i = 0; i < n; ++i) {
+
+              m[dst_off + i] = (*src)[src_off + i];
+
+            }
+
+            DEBUG(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
+            break;
+
+          }
+
+        }
+
+        break;
+
+      }
+
       /* ERASE - only if large enough */
-      case 6: {
+      case 12 ... 13: {
 
         if (m_size > 8) {
 
@@ -178,7 +244,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         } else {
 
-          max_rand = 6;
+          max_rand = 12;
 
         }
 
@@ -236,12 +302,15 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 extern "C" unsigned char afl_custom_queue_get(void                *data,
                                               const unsigned char *filename) {
 
+  (void)(data);
+
   if (likely(!debug)) {
 
     if ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
         (only_fav && !afl_ptr->queue_cur->favored)) {
 
       s = NULL;
+      DEBUG(stderr, "cmplog not ascii or only_fav and not favorite\n");
       return 0;
 
     }
@@ -334,8 +403,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
-      DEBUG(stderr, "Too short (%lu) %s\n", len, filename);
       s = NULL;
+      DEBUG(stderr, "Too short (%lu) %s\n", len, filename);
       return 0;
 
     }
@@ -362,8 +431,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       if (((len * AFL_TXT_MIN_PERCENT) / 100) > valid_chars) {
 
         file_mapping[fn] = NULL;
-        DEBUG(stderr, "Not text (%lu) %s\n", len, filename);
         s = NULL;
+        DEBUG(stderr, "Not text (%lu) %s\n", len, filename);
         return 0;
 
       }
@@ -766,6 +835,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
+    if (tokens.size() < AUTOTOKENS_SIZE_MIN) {
+
+      file_mapping[fn] = NULL;
+      s = NULL;
+      DEBUG(stderr, "too few tokens\n");
+      return 0;
+
+    }
+
     /* Now we transform the tokens into an ID list and saved that */
 
     structure = new vector<u32>();
@@ -791,8 +869,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     // save the token structure to the file mapping
     file_mapping[fn] = structure;
-    s = structure;
+    id_mapping[valid_structures] = structure;
     ++valid_structures;
+    s = structure;
     all_structure_items += structure->size();
 
     // we are done!
@@ -897,6 +976,12 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
 }
 
+extern "C" void afl_custom_splice_optout(my_mutator_t *data) {
+
+  (void)(data);
+
+}
+
 extern "C" void afl_custom_deinit(my_mutator_t *data) {
 
   /* we use this to print statistics at exit :-)
diff --git a/docs/custom_mutators.md b/docs/custom_mutators.md
index 4ffeda7a..322caa5b 100644
--- a/docs/custom_mutators.md
+++ b/docs/custom_mutators.md
@@ -48,6 +48,7 @@ C/C++:
 ```c
 void *afl_custom_init(afl_state_t *afl, unsigned int seed);
 unsigned int afl_custom_fuzz_count(void *data, const unsigned char *buf, size_t buf_size);
+void afl_custom_splice_optout(void *data);
 size_t afl_custom_fuzz(void *data, unsigned char *buf, size_t buf_size, unsigned char **out_buf, unsigned char *add_buf, size_t add_buf_size, size_t max_size);
 const char *afl_custom_describe(void *data, size_t max_description_len);
 size_t afl_custom_post_process(void *data, unsigned char *buf, size_t buf_size, unsigned char **out_buf);
@@ -72,6 +73,9 @@ def init(seed):
 def fuzz_count(buf):
     return cnt
 
+def splice_optout():
+    pass
+
 def fuzz(buf, add_buf, max_size):
     return mutated_out
 
@@ -132,6 +136,13 @@ def deinit():  # optional for Python
     for a specific queue entry, use this function. This function is most useful
     if `AFL_CUSTOM_MUTATOR_ONLY` is **not** used.
 
+- `splice_optout` (optional):
+
+    If this function is present, no splicing target is passed to the `fuzz`
+    function. This saves time if splicing data is not needed by the custom
+    fuzzing function.
+    This function is never called; it only needs to be present to opt out.
+
 - `fuzz` (optional):
 
     This method performs custom mutations on a given input. It also accepts an
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index 69fea579..1e8d085d 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -344,6 +344,7 @@ enum {
   /* 12 */ PY_FUNC_INTROSPECTION,
   /* 13 */ PY_FUNC_DESCRIBE,
   /* 14 */ PY_FUNC_FUZZ_SEND,
+  /* 15 */ PY_FUNC_SPLICE_OPTOUT,
   PY_FUNC_COUNT
 
 };
@@ -495,6 +496,7 @@ typedef struct afl_state {
       no_unlink,                        /* do not unlink cur_input          */
       debug,                            /* Debug mode                       */
       custom_only,                      /* Custom mutator only mode         */
+      custom_splice_optout,             /* Custom mutator no splice buffer  */
       is_main_node,                     /* if this is the main node         */
       is_secondary_node,                /* if this is a secondary instance  */
       pizza_is_served;                  /* pizza mode                       */
@@ -828,6 +830,17 @@ struct custom_mutator {
    */
   u32 (*afl_custom_fuzz_count)(void *data, const u8 *buf, size_t buf_size);
 
+  /**
+   * Opt-out of a splicing input for the fuzz mutator
+   *
+   * Empty dummy function. Its presence tells afl-fuzz not to pass a
+   * splice data pointer and len.
+   *
+   * @param data pointer returned in afl_custom_init by this custom mutator
+   * @noreturn
+   */
+  void (*afl_custom_splice_optout)(void *data);
+
   /**
    * Perform custom mutations on a given input
    *
@@ -1057,6 +1070,7 @@ u8          havoc_mutation_probability_py(void *);
 u8          queue_get_py(void *, const u8 *);
 const char *introspection_py(void *);
 u8          queue_new_entry_py(void *, const u8 *, const u8 *);
+void        splice_optout(void *);
 void        deinit_py(void *);
 
 #endif
diff --git a/src/afl-fuzz-mutators.c b/src/afl-fuzz-mutators.c
index 22e5262e..ce43064a 100644
--- a/src/afl-fuzz-mutators.c
+++ b/src/afl-fuzz-mutators.c
@@ -358,6 +358,19 @@ struct custom_mutator *load_custom_mutator(afl_state_t *afl, const char *fn) {
 
   }
 
+  /* "afl_custom_splice_optout", optional, never called */
+  mutator->afl_custom_splice_optout = dlsym(dh, "afl_custom_splice_optout");
+  if (!mutator->afl_custom_splice_optout) {
+
+    ACTF("optional symbol 'afl_custom_splice_optout' not found.");
+
+  } else {
+
+    OKF("Found 'afl_custom_splice_optout'.");
+    afl->custom_splice_optout = 1;
+
+  }
+
   /* "afl_custom_fuzz_send", optional */
   mutator->afl_custom_fuzz_send = dlsym(dh, "afl_custom_fuzz_send");
   if (!mutator->afl_custom_fuzz_send) {
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index eaf65987..5e352dcb 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -1954,7 +1954,8 @@ custom_mutator_stage:
           u32                 target_len = 0;
 
           /* check if splicing makes sense yet (enough entries) */
-          if (likely(afl->ready_for_splicing_count > 1)) {
+          if (likely(!afl->custom_splice_optout &&
+                     afl->ready_for_splicing_count > 1)) {
 
             /* Pick a random other queue entry for passing to external API
                that has the necessary length */
diff --git a/src/afl-fuzz-python.c b/src/afl-fuzz-python.c
index b509b936..69c305f7 100644
--- a/src/afl-fuzz-python.c
+++ b/src/afl-fuzz-python.c
@@ -248,6 +248,8 @@ static py_mutator_t *init_py_module(afl_state_t *afl, u8 *module_name) {
         PyObject_GetAttrString(py_module, "queue_get");
     py_functions[PY_FUNC_FUZZ_SEND] =
         PyObject_GetAttrString(py_module, "fuzz_send");
+    py_functions[PY_FUNC_SPLICE_OPTOUT] =
+        PyObject_GetAttrString(py_module, "splice_optout");
     py_functions[PY_FUNC_QUEUE_NEW_ENTRY] =
         PyObject_GetAttrString(py_module, "queue_new_entry");
     py_functions[PY_FUNC_INTROSPECTION] =
@@ -394,6 +396,13 @@ void deinit_py(void *py_mutator) {
 
 }
 
+void splice_optout_py(void *py_mutator) {
+
+  // this is never called
+  (void)(py_mutator);
+
+}
+
 struct custom_mutator *load_custom_mutator_py(afl_state_t *afl,
                                               char        *module_name) {
 
@@ -474,6 +483,13 @@ struct custom_mutator *load_custom_mutator_py(afl_state_t *afl,
 
   }
 
+  if (py_functions[PY_FUNC_SPLICE_OPTOUT]) {
+
+    mutator->afl_custom_splice_optout = splice_optout_py;
+    afl->custom_splice_optout = 1;
+
+  }
+
   if (py_functions[PY_FUNC_QUEUE_NEW_ENTRY]) {
 
     mutator->afl_custom_queue_new_entry = queue_new_entry_py;
-- 
cgit 1.4.1


From 17752465e6b3c70fd0104fae7bb1f84c1cb8bb66 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 22:31:55 +0100
Subject: nit

---
 custom_mutators/autotokens/README         | 2 ++
 custom_mutators/autotokens/TODO           | 8 +-------
 custom_mutators/autotokens/autotokens.cpp | 7 ++-----
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index f6e9c753..f82dcd98 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -11,6 +11,8 @@ If you have a dictionary (`-x`) this improves this custom grammar mutator.
 If **not** running with `CMPLOG`, it is possible to set
 `AFL_CUSTOM_MUTATOR_ONLY` to concentrate on grammar bug classes.
 
+Do **not** set `AFL_DISABLE_TRIM` with this custom mutator!
+
 ## Configuration via environment variables
 
 `AUTOTOKENS_ONLY_FAV` - only use this mutator on favorite queue items
diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 2e5e384f..95b79373 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -1,12 +1,6 @@
-whitespace belassen oder notieren?		MAYBE
-0=space 1=tab 2=linefeed
-
 cmplog: only add tokens that were found to fit?
 
 create from thin air if no good seed after a cycle and dict large enough?
 (static u32 no_of_struct_inputs;) 
 
-splice insert, splice overwrite
-(linefeed, semicolon)
-
-
+splicing -> check if whitespace/token is needed
\ No newline at end of file
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index c9ec4352..5e683455 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -217,11 +217,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                         MIN(m_size - dst_off - AUTOTOKENS_SPLICE_MIN,
                             src_size - src_off - AUTOTOKENS_SPLICE_MIN)));
 
-            for (u32 i = 0; i < n; ++i) {
-
-              m[dst_off + i] = (*src)[src_off + i];
-
-            }
+            copy(src->begin() + src_off, src->begin() + src_off + n,
+                 m.begin() + dst_off);
 
             DEBUG(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
             break;
-- 
cgit 1.4.1


From 45567791c66e128361a7533481b385497ced881f Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 23:09:16 +0100
Subject: autotokens: define disable splice

---
 custom_mutators/autotokens/autotokens.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 5e683455..f6ab9ddd 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -22,6 +22,7 @@ extern "C" {
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
+#define AUTOTOKENS_SPLICE_DISABLE 0
 
 #if AUTOTOKENS_SPLICE_MIN >= AUTOTOKENS_SIZE_MIN
   #error SPLICE_MIN must be lower than SIZE_MIN
@@ -102,7 +103,13 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                afl_ptr->havoc_div / 256));
   // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
-  u32 max_rand = 14;
+#if AUTOTOKENS_SPLICE_DISABLE == 1
+  #define AUTOTOKENS_MUT_MAX 12
+#else
+  #define AUTOTOKENS_MUT_MAX 14
+#endif
+
+  u32 max_rand = AUTOTOKENS_MUT_MAX;
 
   for (i = 0; i < rounds; ++i) {
 
@@ -179,6 +186,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
       }
 
+#if AUTOTOKENS_SPLICE_DISABLE != 1
       /* SPLICING */
       case 10 ... 11: {
 
@@ -230,9 +238,10 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         break;
 
       }
+#endif
 
       /* ERASE - only if large enough */
-      case 12 ... 13: {
+      default: {
 
         if (m_size > 8) {
 
@@ -241,7 +250,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         } else {
 
-          max_rand = 12;
+          max_rand = AUTOTOKENS_MUT_MAX - 2;
 
         }
 
-- 
cgit 1.4.1


From 151a8facae2048a26c65658dfec507233a677fb0 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 18 Jan 2023 23:16:18 +0100
Subject: autotokens: stats

---
 custom_mutators/autotokens/autotokens.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index f6ab9ddd..4f3289c9 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -22,7 +22,9 @@ extern "C" {
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
-#define AUTOTOKENS_SPLICE_DISABLE 0
+#ifndef AUTOTOKENS_SPLICE_DISABLE
+  #define AUTOTOKENS_SPLICE_DISABLE 0
+#endif
 
 #if AUTOTOKENS_SPLICE_MIN >= AUTOTOKENS_SIZE_MIN
   #error SPLICE_MIN must be lower than SIZE_MIN
@@ -49,6 +51,7 @@ static u32        whitespace_ids;
 static u32        extras_cnt, a_extras_cnt;
 static u64        all_spaces, all_tabs, all_lf, all_ws;
 static u64        all_structure_items;
+static u64        fuzz_count;
 static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<u32, vector<u32> *>    id_mapping;
 static unordered_map<string, u32>           token_to_id;
@@ -238,6 +241,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         break;
 
       }
+
 #endif
 
       /* ERASE - only if large enough */
@@ -298,6 +302,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   memcpy(mutated_out, output.data(), mutated_size);
   *out_buf = mutated_out;
+  ++fuzz_count;
   return mutated_size;
 
 }
@@ -997,8 +1002,9 @@ extern "C" void afl_custom_deinit(my_mutator_t *data) {
           "\n\nAutotoken mutator statistics:\n"
           "  Number of all seen tokens:  %u\n"
           "  Number of input structures: %u\n"
-          "  Number of all items in structures: %llu\n\n",
-          current_id - 1, valid_structures, all_structure_items);
+          "  Number of all items in structures: %llu\n"
+          "  Number of total fuzzes: %llu\n\n",
+          current_id - 1, valid_structures, all_structure_items, fuzz_count);
 
   free(data);
 
-- 
cgit 1.4.1


From eeca3a0b2939c605497e9b3a615ee4a466f4a3f2 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Thu, 19 Jan 2023 11:52:19 +0100
Subject: lots of fixes

---
 custom_mutators/autotokens/TODO           |   2 +-
 custom_mutators/autotokens/autotokens.cpp | 424 +++++++++++++++++++-----------
 docs/custom_mutators.md                   |   1 +
 include/afl-fuzz.h                        |  11 +-
 src/afl-fuzz-one.c                        |   3 +-
 5 files changed, 279 insertions(+), 162 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 95b79373..2e39511c 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -3,4 +3,4 @@ cmplog: only add tokens that were found to fit?
 create from thin air if no good seed after a cycle and dict large enough?
 (static u32 no_of_struct_inputs;) 
 
-splicing -> check if whitespace/token is needed
\ No newline at end of file
+splicing -> check if whitespace/token is needed
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 4f3289c9..102bea0f 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -38,8 +38,10 @@ typedef struct my_mutator {
 
 } my_mutator_t;
 
-#define DEBUG \
+#undef DEBUGF
+#define DEBUGF \
   if (unlikely(debug)) fprintf
+#define IFDEBUG if (unlikely(debug))
 
 static afl_state *afl_ptr;
 static int        debug = AUTOTOKENS_DEBUG;
@@ -57,12 +59,12 @@ static unordered_map<u32, vector<u32> *>    id_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
 static string                               whitespace = AUTOTOKENS_WHITESPACE;
+static string                               output;
 static regex                               *regex_comment_custom;
-static regex regex_comment_star("/\\*([:print:]|\n)*?\\*/",
-                                regex::multiline | regex::optimize);
-static regex regex_word("[A-Za-z0-9_$.-]+", regex::optimize);
-static regex regex_whitespace(R"([ \t]+)", regex::optimize);
-static regex regex_string("\"[[:print:]]*?\"|'[[:print:]]*?'", regex::optimize);
+static regex        regex_comment_star("/\\*([:print:]|\n)*?\\*/",
+                                       regex::multiline | regex::optimize);
+static regex        regex_word("[A-Za-z0-9_$.-]+", regex::optimize);
+static regex        regex_whitespace(R"([ \t]+)", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
 
 u32 good_whitespace_or_singleval() {
@@ -104,7 +106,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       MAX(AUTOTOKENS_CHANGE_MIN,
           MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score *
                                afl_ptr->havoc_div / 256));
-  // DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
+  // DEBUGF(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
 #if AUTOTOKENS_SPLICE_DISABLE == 1
   #define AUTOTOKENS_MUT_MAX 12
@@ -112,7 +114,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   #define AUTOTOKENS_MUT_MAX 14
 #endif
 
-  u32 max_rand = AUTOTOKENS_MUT_MAX;
+  u32 max_rand = AUTOTOKENS_MUT_MAX, new_item, pos;
 
   for (i = 0; i < rounds; ++i) {
 
@@ -122,8 +124,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       case 0 ... 7:                                         /* fall through */
       {
 
-        u32 pos = rand_below(afl_ptr, m_size);
-        u32 cur_item = m[pos], new_item;
+        pos = rand_below(afl_ptr, m_size);
+        u32 cur_item = m[pos];
         do {
 
           new_item = rand_below(afl_ptr, current_id);
@@ -135,7 +137,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
              ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
               (whitespace_ids >= new_item && whitespace_ids < cur_item)))));
 
-        DEBUG(stderr, "MUT: %u -> %u\n", cur_item, new_item);
+        DEBUGF(stderr, "MUT: %u -> %u\n", cur_item, new_item);
         m[pos] = new_item;
         break;
 
@@ -144,7 +146,6 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       /* INSERT (m_size +1 so we insert also after last place) */
       case 8 ... 9: {
 
-        u32 new_item;
         do {
 
           new_item = rand_below(afl_ptr, current_id);
@@ -154,7 +155,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         u32 pos = rand_below(afl_ptr, m_size + 1);
         m.insert(m.begin() + pos, new_item);
         ++m_size;
-        DEBUG(stderr, "INS: %u at %u\n", new_item, pos);
+        DEBUGF(stderr, "INS: %u at %u\n", new_item, pos);
 
         if (likely(!alternative_tokenize)) {
 
@@ -212,7 +213,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
             m.insert(m.begin() + dst_off, src->begin() + src_off,
                      src->begin() + src_off + n);
             m_size += n;
-            DEBUG(stderr, "SPLICE-INS: %u at %u\n", n, dst_off);
+            DEBUGF(stderr, "SPLICE-INS: %u at %u\n", n, dst_off);
+
             break;
 
           }
@@ -231,13 +233,36 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
             copy(src->begin() + src_off, src->begin() + src_off + n,
                  m.begin() + dst_off);
 
-            DEBUG(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
+            DEBUGF(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
             break;
 
           }
 
         }
 
+        if (likely(!alternative_tokenize)) {
+
+          // do we need a whitespace/token at the beginning?
+          if (dst_off && id_to_token[m[dst_off - 1]].size() > 1 &&
+              id_to_token[m[dst_off]].size() > 1) {
+
+            m.insert(m.begin() + dst_off, good_whitespace_or_singleval());
+            ++m_size;
+
+          }
+
+          // do we need a whitespace/token at the end?
+          if (dst_off + n < m_size &&
+              id_to_token[m[dst_off + n - 1]].size() > 1 &&
+              id_to_token[m[dst_off + n]].size() > 1) {
+
+            m.insert(m.begin() + dst_off + n, good_whitespace_or_singleval());
+            ++m_size;
+
+          }
+
+        }
+
         break;
 
       }
@@ -249,11 +274,32 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         if (m_size > 8) {
 
-          m.erase(m.begin() + rand_below(afl_ptr, m_size));
-          --m_size;
+          do {
+
+            pos = rand_below(afl_ptr, m_size);
+
+          } while (unlikely(pos < whitespace_ids));
+
+          // if what we delete will result in a missing whitespace/token,
+          // instead of deleting we switch the item to a whitespace or token.
+          if (likely(!alternative_tokenize) && pos && pos < m_size &&
+              id_to_token[m[pos - 1]].size() > 1 &&
+              id_to_token[m[pos + 1]].size() > 1) {
+
+            m[pos] = good_whitespace_or_singleval();
+
+          } else {
+
+            m.erase(m.begin() + pos);
+            --m_size;
+
+          }
 
         } else {
 
+          // if the data is already too small do not try to make it smaller
+          // again this run.
+
           max_rand = AUTOTOKENS_MUT_MAX - 2;
 
         }
@@ -262,14 +308,12 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
       }
 
-        // TODO: add full line insert splice, replace splace, delete
-
     }
 
   }
 
-  string output;
-  u32    m_size_1 = m_size - 1;
+  u32 m_size_1 = m_size - 1;
+  output = "";
 
   for (i = 0; i < m_size; ++i) {
 
@@ -282,31 +326,108 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   }
 
-  u32 mutated_size = output.size();
-  u8 *mutated_out = (u8 *)afl_realloc((void **)out_buf, mutated_size);
+  u32 mutated_size = (u32)output.size();
+  u8 *mutated_out = (u8 *)output.data();
 
-  if (unlikely(!mutated_out)) {
+  if (unlikely(mutated_size > max_size)) { mutated_size = max_size; }
 
-    *out_buf = NULL;
-    return 0;
-
-  }
-
-  if (unlikely(debug)) {
+  IFDEBUG {
 
-    DEBUG(stderr, "MUTATED to %u bytes:\n", mutated_size);
+    DEBUGF(stderr, "MUTATED to %u bytes:\n", mutated_size);
     fwrite(output.data(), 1, mutated_size, stderr);
-    DEBUG(stderr, "\n---\n");
+    DEBUGF(stderr, "\n---\n");
 
   }
 
-  memcpy(mutated_out, output.data(), mutated_size);
   *out_buf = mutated_out;
   ++fuzz_count;
   return mutated_size;
 
 }
 
+/* std::regex overflows the stack when a pattern such as
+   "\"[[:print:]]*?\"" matches a long string, even with regex::optimize
+   enabled, so quoted strings are located with this hand-written scanner. */
+u8 my_search_string(string::const_iterator cur, string::const_iterator ende,
+                    string::const_iterator *match_begin,
+                    string::const_iterator *match_end) {
+
+  string::const_iterator start = cur, found_begin;
+  u8                     quote_type = 0;
+
+  while (cur < ende) {
+
+    switch (*cur) {
+
+      case '"': {
+
+        if (cur == start || *(cur - 1) != '\\') {
+
+          if (!quote_type) {
+
+            found_begin = cur;
+            quote_type = 1;
+
+          } else if (quote_type == 1) {
+
+            *match_begin = found_begin;
+            *match_end = cur + 1;
+            return 1;
+
+          }
+
+        }
+
+        break;
+
+      }
+
+      case '\'': {
+
+        if (cur == start || *(cur - 1) != '\\') {
+
+          if (!quote_type) {
+
+            found_begin = cur;
+            quote_type = 2;
+
+          } else if (quote_type == 2) {
+
+            *match_begin = found_begin;
+            *match_end = cur + 1;
+            return 1;
+
+          }
+
+        }
+
+        break;
+
+      }
+
+      case '\n':
+      case '\r':
+      case 0: {
+
+        quote_type = 0;
+        break;
+
+      }
+
+      default:
+        if (unlikely(quote_type && !isprint(*cur))) { quote_type = 0; }
+        break;
+
+    }
+
+    ++cur;
+
+  }
+
+  return 0;
+
+}
+
 /* We are not using afl_custom_queue_new_entry() because not every corpus entry
    will be necessarily fuzzed. so we use afl_custom_queue_get() instead */
 
@@ -321,7 +442,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
         (only_fav && !afl_ptr->queue_cur->favored)) {
 
       s = NULL;
-      DEBUG(stderr, "cmplog not ascii or only_fav and not favorite\n");
+      DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
       return 0;
 
     }
@@ -356,7 +477,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       }
 
       ++extras_cnt;
-      DEBUG(stderr, "Added from dictionary: \"%s\"\n", ptr);
+      DEBUGF(stderr, "Added from dictionary: \"%s\"\n", ptr);
 
     }
 
@@ -385,7 +506,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       }
 
       ++a_extras_cnt;
-      DEBUG(stderr, "Added from auto dictionary: \"%s\"\n", ptr);
+      DEBUGF(stderr, "Added from auto dictionary: \"%s\"\n", ptr);
 
     }
 
@@ -415,7 +536,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
       s = NULL;
-      DEBUG(stderr, "Too short (%lu) %s\n", len, filename);
+      DEBUGF(stderr, "Too short (%lu) %s\n", len, filename);
       return 0;
 
     }
@@ -443,14 +564,14 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
         file_mapping[fn] = NULL;
         s = NULL;
-        DEBUG(stderr, "Not text (%lu) %s\n", len, filename);
+        DEBUGF(stderr, "Not text (%lu) %s\n", len, filename);
         return 0;
 
       }
 
     }
 
-    // DEBUG(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
+    // DEBUGF(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
     // input.size(), filename, input.c_str());
 
     if (regex_comment_custom) {
@@ -463,15 +584,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
-    DEBUG(stderr, "After replace %lu bytes for %s\n%s\n", input.size(),
-          filename, input.c_str());
+    DEBUGF(stderr, "After replace %lu bytes for %s\n%s\n", input.size(),
+           filename, input.c_str());
 
     u32  spaces = count(input.begin(), input.end(), ' ');
     u32  tabs = count(input.begin(), input.end(), '\t');
     u32  linefeeds = count(input.begin(), input.end(), '\n');
     bool ends_with_linefeed = input[input.length() - 1] == '\n';
-    DEBUG(stderr, "spaces=%u tabs=%u linefeeds=%u ends=%u\n", spaces, tabs,
-          linefeeds, ends_with_linefeed);
+    DEBUGF(stderr, "spaces=%u tabs=%u linefeeds=%u ends=%u\n", spaces, tabs,
+           linefeeds, ends_with_linefeed);
     all_spaces += spaces;
     all_tabs += tabs;
     all_lf += linefeeds;
@@ -479,25 +600,28 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     // now extract all tokens
     vector<string>         tokens;
-    smatch                 match;
-    string::const_iterator cur = input.begin(), ende = input.end(), found, prev;
+    string::const_iterator cur = input.begin(), ende = input.end(), found, prev,
+                           match_begin, match_end;
 
-    DEBUG(stderr, "START!\n");
+    DEBUGF(stderr, "START!\n");
 
     if (likely(!alternative_tokenize)) {
 
-      while (regex_search(cur, ende, match, regex_string,
-                          regex_constants::match_any |
-                              regex_constants::match_not_null |
-                              regex_constants::match_continuous)) {
+      while (my_search_string(cur, ende, &match_begin, &match_end)) {
 
         prev = cur;
-        found = match[0].first;
-        cur = match[0].second;
-        DEBUG(stderr,
-              "string %s found at start %lu offset %lu continue at %lu\n",
-              match[0].str().c_str(), prev - input.begin(), match.position(),
-              cur - input.begin());
+        found = match_begin;
+        cur = match_end;
+
+        IFDEBUG {
+
+          string foo(match_begin, match_end);
+          DEBUGF(stderr,
+                 "string %s found at start %lu offset %lu continue at %lu\n",
+                 foo.c_str(), prev - input.begin(), found - prev,
+                 cur - input.begin());
+
+        }
 
         if (prev < found) {  // there are items between search start and find
           while (prev < found) {
@@ -512,8 +636,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
               }
 
               tokens.push_back(std::string(start, prev));
-              DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
-                    tokens[tokens.size() - 1].c_str());
+              DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+                     tokens[tokens.size() - 1].c_str());
 
             } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
 
@@ -525,14 +649,14 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
               }
 
-              tokens.push_back(std::string(start, prev));
-              DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
-                    tokens[tokens.size() - 1].c_str());
+              tokens.push_back(string(start, prev));
+              DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+                     tokens[tokens.size() - 1].c_str());
 
             } else {
 
-              tokens.push_back(std::string(prev, prev + 1));
-              DEBUG(stderr, "OTHER \"%c\"\n", *prev);
+              tokens.push_back(string(prev, prev + 1));
+              DEBUGF(stderr, "OTHER \"%c\"\n", *prev);
               ++prev;
 
             }
@@ -541,11 +665,12 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
         }
 
-        if (match[0].length() > 0) { tokens.push_back(match[0]); }
+        tokens.push_back(string(match_begin, match_end));
+        DEBUGF(stderr, "TOK: %s\n", tokens[tokens.size() - 1].c_str());
 
       }
 
-      DEBUG(stderr, "AFTER all strings\n");
+      DEBUGF(stderr, "AFTER all strings\n");
 
       if (cur < ende) {
 
@@ -561,8 +686,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
             }
 
             tokens.push_back(std::string(start, cur));
-            DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
-                  tokens[tokens.size() - 1].c_str());
+            DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
+                   tokens[tokens.size() - 1].c_str());
 
           } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
 
@@ -575,13 +700,13 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
             }
 
             tokens.push_back(std::string(start, cur));
-            DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
-                  tokens[tokens.size() - 1].c_str());
+            DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
+                   tokens[tokens.size() - 1].c_str());
 
           } else {
 
             tokens.push_back(std::string(cur, cur + 1));
-            DEBUG(stderr, "OTHER \"%c\"\n", *cur);
+            DEBUGF(stderr, "OTHER \"%c\"\n", *cur);
             ++cur;
 
           }
@@ -593,19 +718,21 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     } else {
 
       // alternative tokenize
-
-      while (regex_search(cur, ende, match, regex_string,
-                          regex_constants::match_any |
-                              regex_constants::match_not_null |
-                              regex_constants::match_continuous)) {
+      while (my_search_string(cur, ende, &match_begin, &match_end)) {
 
         prev = cur;
-        found = match[0].first;
-        cur = match[0].second;
-        DEBUG(stderr,
-              "string %s found at start %lu offset %lu continue at %lu\n",
-              match[0].str().c_str(), prev - input.begin(), match.position(),
-              cur - input.begin());
+        found = match_begin;
+        cur = match_end;
+        IFDEBUG {
+
+          string foo(match_begin, match_end);
+          DEBUGF(stderr,
+                 "string %s found at start %lu offset %lu continue at %lu\n",
+                 foo.c_str(), prev - input.begin(), found - prev,
+                 cur - input.begin());
+
+        }
+
         if (prev < found) {  // there are items between search start and find
           sregex_token_iterator it{prev, found, regex_whitespace, -1};
           vector<std::string>   tokenized{it, {}};
@@ -619,10 +746,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
                           tokenized.end());
           tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
 
-          if (unlikely(debug)) {
+          IFDEBUG {
 
-            DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
-                  input.size());
+            DEBUGF(stderr, "tokens1: %lu   input size: %lu\n", tokenized.size(),
+                   input.size());
             for (auto x : tokenized) {
 
               cerr << x << endl;
@@ -636,10 +763,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
             string::const_iterator c = token.begin(), e = token.end(), f, p;
             smatch                 m;
 
-            while (regex_search(c, e, m, regex_word,
-                                regex_constants::match_any |
-                                    regex_constants::match_not_null |
-                                    regex_constants::match_continuous)) {
+            while (regex_search(c, e, m, regex_word)) {
 
               p = c;
               f = m[0].first;
@@ -649,10 +773,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
                 // there are items between search start and find
                 while (p < f) {
 
-                  if (unlikely(debug)) {
+                  IFDEBUG {
 
                     string foo(p, p + 1);
-                    DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+                    DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
 
                   }
 
@@ -661,20 +785,21 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
                 }
 
-                /*
-                                string foo(p, f);
-                                DEBUG(stderr, "before string: \"%s\"\n",
-                   foo.c_str()); tokens.push_back(std::string(p, f));
-                */
+                IFDEBUG {
+
+                  string foo(p, f);
+                  DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
+                  tokens.push_back(std::string(p, f));
+
+                }
 
               }
 
-              DEBUG(
-                  stderr,
-                  "SUBstring \"%s\" found at start %lu offset %lu continue at "
-                  "%lu\n",
-                  m[0].str().c_str(), p - input.begin(), m.position(),
-                  c - token.begin());
+              DEBUGF(stderr,
+                     "SUBstring \"%s\" found at start %lu offset %lu continue "
+                     "at %lu\n",
+                     m[0].str().c_str(), p - input.begin(), m.position(),
+                     c - token.begin());
               tokens.push_back(m[0].str());
 
             }
@@ -683,10 +808,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
               while (c < e) {
 
-                if (unlikely(debug)) {
+                IFDEBUG {
 
                   string foo(c, c + 1);
-                  DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+                  DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
 
                 }
 
@@ -695,17 +820,14 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
               }
 
-              /*
-                            if (unlikely(debug)) {
+              IFDEBUG {
 
-                              string foo(c, e);
-                              DEBUG(stderr, "after string: \"%s\"\n",
-                 foo.c_str());
+                string foo(c, e);
+                DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
 
-                            }
+              }
 
-                            tokens.push_back(std::string(c, e));
-              */
+              tokens.push_back(std::string(c, e));
 
             }
 
@@ -713,7 +835,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
         }
 
-        if (match[0].length() > 0) { tokens.push_back(match[0]); }
+        tokens.push_back(string(match_begin, match_end));
 
       }
 
@@ -727,10 +849,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
             tokenized.end());
         tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
 
-        if (unlikely(debug)) {
+        IFDEBUG {
 
-          DEBUG(stderr, "tokens: %lu   input size: %lu\n", tokenized.size(),
-                input.size());
+          DEBUGF(stderr, "tokens2: %lu   input size: %lu\n", tokenized.size(),
+                 input.size());
           for (auto x : tokenized) {
 
             cerr << x << endl;
@@ -744,10 +866,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
           string::const_iterator c = token.begin(), e = token.end(), f, p;
           smatch                 m;
 
-          while (regex_search(c, e, m, regex_word,
-                              regex_constants::match_any |
-                                  regex_constants::match_not_null |
-                                  regex_constants::match_continuous)) {
+          while (regex_search(c, e, m, regex_word)) {
 
             p = c;
             f = m[0].first;
@@ -757,10 +876,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
               // there are items between search start and find
               while (p < f) {
 
-                if (unlikely(debug)) {
+                IFDEBUG {
 
                   string foo(p, p + 1);
-                  DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+                  DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
 
                 }
 
@@ -769,25 +888,22 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
               }
 
-              /*
-                            if (unlikely(debug)) {
+              IFDEBUG {
 
-                              string foo(p, f);
-                              DEBUG(stderr, "before string: \"%s\"\n",
-                 foo.c_str());
+                string foo(p, f);
+                DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
 
-                            }
+              }
 
-                            tokens.push_back(std::string(p, f));
-              */
+              tokens.push_back(std::string(p, f));
 
             }
 
-            DEBUG(stderr,
-                  "SUB2string \"%s\" found at start %lu offset %lu continue at "
-                  "%lu\n",
-                  m[0].str().c_str(), p - input.begin(), m.position(),
-                  c - token.begin());
+            DEBUGF(stderr,
+                   "SUB2string \"%s\" found at start %lu offset %lu continue "
+                   "at %lu\n",
+                   m[0].str().c_str(), p - input.begin(), m.position(),
+                   c - token.begin());
             tokens.push_back(m[0].str());
 
           }
@@ -796,10 +912,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
             while (c < e) {
 
-              if (unlikely(debug)) {
+              IFDEBUG {
 
                 string foo(c, c + 1);
-                DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+                DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
 
               }
 
@@ -808,16 +924,14 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
             }
 
-            /*
-                        if (unlikely(debug)) {
+            IFDEBUG {
 
-                          string foo(c, e);
-                          DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+              string foo(c, e);
+              DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
 
-                        }
+            }
 
-                        tokens.push_back(std::string(c, e));
-            */
+            tokens.push_back(std::string(c, e));
 
           }
 
@@ -827,22 +941,22 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
-    if (unlikely(debug)) {
+    IFDEBUG {
 
-      DEBUG(stderr, "DUMPING TOKENS:\n");
+      DEBUGF(stderr, "DUMPING TOKENS:\n");
       u32 size_1 = tokens.size() - 1;
       for (u32 i = 0; i < tokens.size(); ++i) {
 
-        DEBUG(stderr, "%s", tokens[i].c_str());
+        DEBUGF(stderr, "%s", tokens[i].c_str());
         if (unlikely(alternative_tokenize && i < size_1)) {
 
-          DEBUG(stderr, "%s", whitespace.c_str());
+          DEBUGF(stderr, "%s", whitespace.c_str());
 
         }
 
       }
 
-      DEBUG(stderr, "---------------------------\n");
+      DEBUGF(stderr, "---------------------------\n");
 
     }
 
@@ -850,7 +964,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       file_mapping[fn] = NULL;
       s = NULL;
-      DEBUG(stderr, "too few tokens\n");
+      DEBUGF(stderr, "too few tokens\n");
       return 0;
 
     }
@@ -886,21 +1000,23 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     all_structure_items += structure->size();
 
     // we are done!
-    DEBUG(stderr, "DONE! We have %lu tokens in the structure\n",
-          structure->size());
+    DEBUGF(stderr, "DONE! We have %lu tokens in the structure\n",
+           structure->size());
+
+  }
 
-  } else {
+  else {
 
     if (entry->second == NULL) {
 
-      DEBUG(stderr, "Skipping %s\n", filename);
+      DEBUGF(stderr, "Skipping %s\n", filename);
       s = NULL;
       return 0;
 
     }
 
     s = entry->second;
-    DEBUG(stderr, "OK %s\n", filename);
+    DEBUGF(stderr, "OK %s\n", filename);
 
   }
 
diff --git a/docs/custom_mutators.md b/docs/custom_mutators.md
index 322caa5b..82131c92 100644
--- a/docs/custom_mutators.md
+++ b/docs/custom_mutators.md
@@ -150,6 +150,7 @@ def deinit():  # optional for Python
     sense to use it. You would only skip this if `post_process` is used to fix
     checksums etc. so if you are using it, e.g., as a post processing library.
     Note that a length > 0 *must* be returned!
+    The returned output buffer is under **your** memory management!
 
 - `describe` (optional):
 
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index 1e8d085d..229bc025 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -844,15 +844,16 @@ struct custom_mutator {
   /**
    * Perform custom mutations on a given input
    *
-   * (Optional for now. Required in the future)
+   * (Optional)
    *
-   * @param data pointer returned in afl_custom_init by this custom mutator
+   * Getting an add_buf can be skipped by using afl_custom_splice_optout().
+   *
+   * @param[in] data Pointer returned in afl_custom_init by this custom mutator
    * @param[in] buf Pointer to the input data to be mutated and the mutated
    *     output
    * @param[in] buf_size Size of the input/output data
-   * @param[out] out_buf the new buffer. We may reuse *buf if large enough.
-   *             *out_buf = NULL is treated as FATAL.
-   * @param[in] add_buf Buffer containing the additional test case
+   * @param[out] out_buf The new buffer, under your memory mgmt.
+   * @param[in] add_buf Buffer containing an additional test case (splicing)
    * @param[in] add_buf_size Size of the additional test case
    * @param[in] max_size Maximum size of the mutated output. The mutation must
    * not produce data larger than max_size.
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index 5e352dcb..bd482562 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -564,8 +564,7 @@ u8 fuzz_one_original(afl_state_t *afl) {
       if (afl->cmplog_lvl == 3 ||
           (afl->cmplog_lvl == 2 && afl->queue_cur->tc_ref) ||
           afl->queue_cur->favored ||
-          !(afl->fsrv.total_execs % afl->queued_items) ||
-          get_cur_time() - afl->last_find_time > 300000) {  // 300 seconds
+          get_cur_time() - afl->last_find_time > 600000) {  // 600 seconds
 
         if (input_to_state_stage(afl, in_buf, out_buf, len)) {
 
-- 
cgit 1.4.1


From afff6f642c77e4986fdb8a4e9799c1a52e80ce32 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Thu, 19 Jan 2023 13:41:48 +0100
Subject: optimize

---
 custom_mutators/autotokens/autotokens.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 102bea0f..149ae430 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -109,9 +109,9 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   // DEBUGF(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
 #if AUTOTOKENS_SPLICE_DISABLE == 1
-  #define AUTOTOKENS_MUT_MAX 12
+  #define AUTOTOKENS_MUT_MAX 18
 #else
-  #define AUTOTOKENS_MUT_MAX 14
+  #define AUTOTOKENS_MUT_MAX 27
 #endif
 
   u32 max_rand = AUTOTOKENS_MUT_MAX, new_item, pos;
@@ -120,8 +120,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
     switch (rand_below(afl_ptr, max_rand)) {
 
-      /* CHANGE */
-      case 0 ... 7:                                         /* fall through */
+      /* CHANGE/MUTATE single item */
+      case 0 ... 9:
       {
 
         pos = rand_below(afl_ptr, m_size);
@@ -144,7 +144,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
       }
 
       /* INSERT (m_size +1 so we insert also after last place) */
-      case 8 ... 9: {
+      case 10 ... 13: {
 
         do {
 
@@ -192,7 +192,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
 #if AUTOTOKENS_SPLICE_DISABLE != 1
       /* SPLICING */
-      case 10 ... 11: {
+      case 14 ... 22: {
 
         u32  strategy = rand_below(afl_ptr, 4), dst_off, n;
         auto src = id_mapping[rand_below(afl_ptr, valid_structures)];
@@ -278,11 +278,11 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
             pos = rand_below(afl_ptr, m_size);
 
-          } while (unlikely(pos < whitespace_ids));
+          } while (unlikely(m[pos] < whitespace_ids));
 
           // if what we delete will result in a missing whitespace/token,
           // instead of deleting we switch the item to a whitespace or token.
-          if (likely(!alternative_tokenize) && pos && pos < m_size &&
+          if (likely(!alternative_tokenize) && pos && pos + 1 < m_size &&
               id_to_token[m[pos - 1]].size() > 1 &&
               id_to_token[m[pos + 1]].size() > 1) {
 
@@ -300,7 +300,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
           // if the data is already too small do not try to make it smaller
           // again this run.
 
-          max_rand = AUTOTOKENS_MUT_MAX - 2;
+          max_rand -= 4;
 
         }
 
@@ -734,6 +734,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
         }
 
         if (prev < found) {  // there are items between search start and find
+
           sregex_token_iterator it{prev, found, regex_whitespace, -1};
           vector<std::string>   tokenized{it, {}};
           tokenized.erase(std::remove_if(tokenized.begin(), tokenized.end(),
-- 
cgit 1.4.1


From 86d3c65559209ce12452e18daf96946222c19b46 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Thu, 19 Jan 2023 15:59:57 +0100
Subject: nit

---
 custom_mutators/autotokens/autotokens.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 149ae430..f4b96c7b 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -121,8 +121,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
     switch (rand_below(afl_ptr, max_rand)) {
 
       /* CHANGE/MUTATE single item */
-      case 0 ... 9:
-      {
+      case 0 ... 9: {
 
         pos = rand_below(afl_ptr, m_size);
         u32 cur_item = m[pos];
@@ -438,8 +437,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
   if (likely(!debug)) {
 
-    if ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
-        (only_fav && !afl_ptr->queue_cur->favored)) {
+    if (unlikely(!afl_ptr->custom_only) &&
+        ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
+         (only_fav && !afl_ptr->queue_cur->favored))) {
 
       s = NULL;
       DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
-- 
cgit 1.4.1


From 67cfe4f6d4a03c596a5c3e1aa97d64d79263746a Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Thu, 19 Jan 2023 22:24:24 +0100
Subject: nits

---
 custom_mutators/autotokens/autotokens.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index f4b96c7b..16ee8109 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -544,7 +544,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     string input;
     input.resize(len);
     rewind(fp);
-    fread((void *)input.data(), input.size(), 1, fp);
+
+    if (fread((void *)input.data(), 1, len, fp) != len) {
+
+      s = NULL;
+      DEBUGF(stderr, "Too short read %s\n", len, filename);
+      return 0;
+
+    }
+
     fclose(fp);
 
     if (!afl_ptr->shm.cmplog_mode) {
-- 
cgit 1.4.1


From bd2cb4cd1c2f07d5406875771cd41fb9a6e1f84d Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Fri, 20 Jan 2023 12:22:29 +0100
Subject: more default tokens

---
 custom_mutators/autotokens/autotokens.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 16ee8109..f9b5bd2e 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -1105,6 +1105,12 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
     id_to_token[current_id] = "\n\n\n\n";
     ++current_id;
     whitespace_ids = current_id;
+    token_to_id["\""] = current_id;
+    id_to_token[current_id] = "\"";
+    ++current_id;
+    token_to_id["'"] = current_id;
+    id_to_token[current_id] = "'";
+    ++current_id;
 
   }
 
-- 
cgit 1.4.1


From 47f35d29ac53ed1cdb87f65591b62947a7965060 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Fri, 27 Jan 2023 14:32:18 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index f9b5bd2e..4a2cc08f 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -61,8 +61,10 @@ static unordered_map<u32, string>           id_to_token;
 static string                               whitespace = AUTOTOKENS_WHITESPACE;
 static string                               output;
 static regex                               *regex_comment_custom;
-static regex        regex_comment_star("/\\*([:print:]|\n)*?\\*/",
-                                       regex::multiline | regex::optimize);
+// multiline requires g++-11 libs :(
+static regex regex_comment_star(
+    "/\\*([:print:]|\n)*?\\*/",
+    regex_constants::optimize /* | regex_constants::multiline */);
 static regex        regex_word("[A-Za-z0-9_$.-]+", regex::optimize);
 static regex        regex_whitespace(R"([ \t]+)", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
@@ -548,7 +550,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     if (fread((void *)input.data(), 1, len, fp) != len) {
 
       s = NULL;
-      DEBUGF(stderr, "Too short read %s\n", len, filename);
+      DEBUGF(stderr, "Too short read %s\n", filename);
       return 0;
 
     }
-- 
cgit 1.4.1


From b5d8d4c866137a8a6bd55225b0eaf723123c46c9 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sun, 29 Jan 2023 10:07:33 +0100
Subject: comment

---
 custom_mutators/autotokens/autotokens.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 4a2cc08f..0a010f0b 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -1,3 +1,9 @@
+/*
+   token level fuzzing custom mutator for afl++
+   (c) by Marc Heuse <mh@mh-sec.de>
+   License: Apache 2.0
+*/
+
 extern "C" {
 
 #include "afl-fuzz.h"
-- 
cgit 1.4.1


From 91ccbf3f68ab9e6e4bc277f86c3efed666867132 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 1 Feb 2023 17:16:51 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 18 ++++++++----------
 src/afl-fuzz-one.c                        |  5 +++--
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 0a010f0b..548e1be9 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -451,7 +451,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       s = NULL;
       DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
-      return 0;
+      return 1;
 
     }
 
@@ -532,7 +532,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     if (!fp) {
 
       s = NULL;
-      return 0;
+      return 1;
 
     }  // should not happen
 
@@ -545,7 +545,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
       s = NULL;
       DEBUGF(stderr, "Too short (%lu) %s\n", len, filename);
-      return 0;
+      return 1;
 
     }
 
@@ -557,7 +557,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       s = NULL;
       DEBUGF(stderr, "Too short read %s\n", filename);
-      return 0;
+      return 1;
 
     }
 
@@ -581,7 +581,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
         file_mapping[fn] = NULL;
         s = NULL;
         DEBUGF(stderr, "Not text (%lu) %s\n", len, filename);
-        return 0;
+        return 1;
 
       }
 
@@ -982,7 +982,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       file_mapping[fn] = NULL;
       s = NULL;
       DEBUGF(stderr, "too few tokens\n");
-      return 0;
+      return 1;
 
     }
 
@@ -1020,15 +1020,13 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     DEBUGF(stderr, "DONE! We have %lu tokens in the structure\n",
            structure->size());
 
-  }
-
-  else {
+  } else {
 
     if (entry->second == NULL) {
 
       DEBUGF(stderr, "Skipping %s\n", filename);
       s = NULL;
-      return 0;
+      return 1;
 
     }
 
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index b25398c4..2f016217 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -1988,7 +1988,8 @@ custom_mutator_stage:
 
           if (unlikely(!mutated_buf)) {
 
-            FATAL("Error in custom_fuzz. Size returned: %zu", mutated_size);
+            //FATAL("Error in custom_fuzz. Size returned: %zu", mutated_size);
+            break;
 
           }
 
@@ -2040,7 +2041,7 @@ custom_mutator_stage:
   new_hit_cnt = afl->queued_items + afl->saved_crashes;
 
   afl->stage_finds[STAGE_CUSTOM_MUTATOR] += new_hit_cnt - orig_hit_cnt;
-  afl->stage_cycles[STAGE_CUSTOM_MUTATOR] += afl->stage_max;
+  afl->stage_cycles[STAGE_CUSTOM_MUTATOR] += afl->stage_cur;
 #ifdef INTROSPECTION
   afl->queue_cur->stats_mutated += afl->stage_max;
 #endif
-- 
cgit 1.4.1


From e1434bcfcd8c13de838559fd7b797d1a3cd5a672 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sat, 4 Feb 2023 14:34:47 +0100
Subject: more autotoken options

---
 custom_mutators/autotokens/TODO           | 17 ++++++++++++
 custom_mutators/autotokens/autotokens.cpp | 45 ++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 2e39511c..3cae3060 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -4,3 +4,20 @@ create from thin air if no good seed after a cycle and dict large enough?
 (static u32 no_of_struct_inputs;) 
 
 splicing -> check if whitespace/token is needed
+
+whitespace/token check only AFTER mutation
+
+analyse welche einen DICT haben, und welche davon rein ascii
+
+corpus analyse:
+	+ libxml
+	- hardbuzz
+	- sqlite
+	- libpcap
+min len, max len, % wenn 95/98/99/100 ascii
+
+funktion und env für menge an mutationen
+
+env für menge an per mutation run
+
+only add inital dictionary, not furher finds, e.g. cmplog
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 548e1be9..a0125851 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -28,6 +28,9 @@ extern "C" {
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
+#define AUTOTOKENS_FUZZ_COUNT_SHIFT 0
+// 0 = no learning, 1 only from -x dict/autodict, 2 also from cmplog
+#define AUTOTOKENS_LEARN_DICT 2
 #ifndef AUTOTOKENS_SPLICE_DISABLE
   #define AUTOTOKENS_SPLICE_DISABLE 0
 #endif
@@ -53,6 +56,8 @@ static afl_state *afl_ptr;
 static int        debug = AUTOTOKENS_DEBUG;
 static int        only_fav = AUTOTOKENS_ONLY_FAV;
 static int        alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
+static int        learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
+static int        fuzz_count_shift = AUTOTOKENS_FUZZ_COUNT_SHIFT;
 static u32        current_id;
 static u32        valid_structures;
 static u32        whitespace_ids;
@@ -94,6 +99,22 @@ u32 good_whitespace_or_singleval() {
 
 }
 
+extern "C" u32 afl_custom_fuzz_count(void *data, const u8 *buf,
+                                     size_t buf_size) {
+
+  if (s == NULL) return 0;
+
+  u32 shift = unlikely(afl_ptr->custom_only) ? 7 : 8;
+  u32 stage_max = (u32)((HAVOC_CYCLES * afl_ptr->queue_cur->perf_score) /
+                        afl_ptr->havoc_div) >>
+                  shift;
+  if (fuzz_count_shift) { stage_max >>= (u32)fuzz_count_shift; };
+  DEBUGF(stderr, "fuzz count: %u\n", stage_max);
+
+  return stage_max;
+
+}
+
 extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
                                   u8 **out_buf, u8 *add_buf,
                                   size_t add_buf_size, size_t max_size) {
@@ -441,6 +462,7 @@ u8 my_search_string(string::const_iterator cur, string::const_iterator ende,
 extern "C" unsigned char afl_custom_queue_get(void                *data,
                                               const unsigned char *filename) {
 
+  static int learn_state;
   (void)(data);
 
   if (likely(!debug)) {
@@ -458,7 +480,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
   }
 
   // check if there are new dictionary entries and add them to the tokens
-  if (valid_structures) {
+  if (valid_structures && learn_state < learn_dictionary_tokens) {
+
+    if (unlikely(!learn_state)) { learn_state = 1; }
 
     while (extras_cnt < afl_ptr->extras_cnt) {
 
@@ -1053,6 +1077,25 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   if (getenv("AUTOTOKENS_DEBUG")) { debug = 1; }
   if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
   if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
+
+  if (getenv("AUTOTOKENS_LEARN_DICT")) {
+
+    learn_dictionary_tokens = atoi(getenv("AUTOTOKENS_LEARN_DICT"));
+    if (learn_dictionary_tokens < 0 || learn_dictionary_tokens > 2) {
+
+      learn_dictionary_tokens = 2;
+
+    }
+
+  }
+
+  if (getenv("AUTOTOKENS_FUZZ_COUNT_SHIFT")) {
+
+    fuzz_count_shift = atoi(getenv("AUTOTOKENS_FUZZ_COUNT_SHIFT"));
+    if (fuzz_count_shift < 0 || fuzz_count_shift > 16) { fuzz_count_shift = 0; }
+
+  }
+
   if (getenv("AUTOTOKENS_WHITESPACE")) {
 
     whitespace = getenv("AUTOTOKENS_WHITESPACE");
-- 
cgit 1.4.1


From 90f61552f794fc0fae5dc2585f81f31d32db1e89 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sat, 4 Feb 2023 15:39:03 +0100
Subject: changes

---
 custom_mutators/autotokens/TODO           |  9 ++++-----
 custom_mutators/autotokens/autotokens.cpp | 12 ++++++++++++
 include/config.h                          |  4 ++--
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 3cae3060..528dff1f 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -1,5 +1,3 @@
-cmplog: only add tokens that were found to fit?
-
 create from thin air if no good seed after a cycle and dict large enough?
 (static u32 no_of_struct_inputs;) 
 
@@ -16,8 +14,9 @@ corpus analyse:
 	- libpcap
 min len, max len, % wenn 95/98/99/100 ascii
 
-funktion und env für menge an mutationen
-
 env für menge an per mutation run
 
-only add inital dictionary, not furher finds, e.g. cmplog
+AFL_TXT_MAX_LEN 65535
+AFL_TXT_MIN_LEN 16
+AFL_TXT_MIN_PERCENT=99
+
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index a0125851..46a347f8 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -34,6 +34,9 @@ extern "C" {
 #ifndef AUTOTOKENS_SPLICE_DISABLE
   #define AUTOTOKENS_SPLICE_DISABLE 0
 #endif
+#ifndef AFL_TXT_MAX_LEN
+  #define AFL_TXT_MAX_LEN 65535
+#endif
 
 #if AUTOTOKENS_SPLICE_MIN >= AUTOTOKENS_SIZE_MIN
   #error SPLICE_MIN must be lower than SIZE_MIN
@@ -571,6 +574,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       DEBUGF(stderr, "Too short (%lu) %s\n", len, filename);
       return 1;
 
+    } else
+    if (len > AFL_TXT_MAX_LEN) {
+
+      fclose(fp);
+      file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
+      s = NULL;
+      DEBUGF(stderr, "Too long (%lu) %s\n", len, filename);
+      return 1;
+
     }
 
     string input;
diff --git a/include/config.h b/include/config.h
index f8a742f2..ed8b844c 100644
--- a/include/config.h
+++ b/include/config.h
@@ -489,12 +489,12 @@
 
 /* Minimum length of a queue input to be evaluated for "is_ascii"? */
 
-#define AFL_TXT_MIN_LEN 12
+#define AFL_TXT_MIN_LEN 16
 
 /* What is the minimum percentage of ascii characters present to be classifed
    as "is_ascii"? */
 
-#define AFL_TXT_MIN_PERCENT 95
+#define AFL_TXT_MIN_PERCENT 98
 
 /* How often to perform ASCII mutations 0 = disable, 1-8 are good values */
 
-- 
cgit 1.4.1


From f99656e22bffb4bfac8e201ad973a1ea5a6abaa0 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sun, 5 Feb 2023 13:15:06 +0100
Subject: create from thin air, max mutation

---
 custom_mutators/autotokens/autotokens.cpp | 97 +++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 19 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 46a347f8..f1263600 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -24,10 +24,12 @@ extern "C" {
 #define AUTOTOKENS_ONLY_FAV 0
 #define AUTOTOKENS_ALTERNATIVE_TOKENIZE 0
 #define AUTOTOKENS_CHANGE_MIN 8
+#define AUTOTOKENS_CHANGE_MAX 64
 #define AUTOTOKENS_WHITESPACE " "
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
+#define AUTOTOKENS_CREATE_FROM_THIN_AIR 1
 #define AUTOTOKENS_FUZZ_COUNT_SHIFT 0
 // 0 = no learning, 1 only from -x dict/autodict, 2 also from cmplog
 #define AUTOTOKENS_LEARN_DICT 2
@@ -61,6 +63,7 @@ static int        only_fav = AUTOTOKENS_ONLY_FAV;
 static int        alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
 static int        learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
 static int        fuzz_count_shift = AUTOTOKENS_FUZZ_COUNT_SHIFT;
+static int        create_from_thin_air = AUTOTOKENS_CREATE_FROM_THIN_AIR;
 static u32        current_id;
 static u32        valid_structures;
 static u32        whitespace_ids;
@@ -83,7 +86,18 @@ static regex        regex_word("[A-Za-z0-9_$.-]+", regex::optimize);
 static regex        regex_whitespace(R"([ \t]+)", regex::optimize);
 static vector<u32> *s;  // the structure of the currently selected input
 
-u32 good_whitespace_or_singleval() {
+// FUNCTIONS
+
+/* This function is called once after everything is set up but before
+   any fuzzing attempt has been performed.
+   This is called in afl_custom_queue_get() */
+static void first_run(void *data) {
+
+  (void)(data);
+
+}
+
+static u32 good_whitespace_or_singleval() {
 
   u32 i = rand_below(afl_ptr, current_id);
   if (id_to_token[i].size() == 1) { return i; }
@@ -105,6 +119,8 @@ u32 good_whitespace_or_singleval() {
 extern "C" u32 afl_custom_fuzz_count(void *data, const u8 *buf,
                                      size_t buf_size) {
 
+  (void)(data);
+
   if (s == NULL) return 0;
 
   u32 shift = unlikely(afl_ptr->custom_only) ? 7 : 8;
@@ -135,9 +151,10 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   u32         i, m_size = (u32)m.size();
 
   u32 rounds =
-      MAX(AUTOTOKENS_CHANGE_MIN,
-          MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score *
-                               afl_ptr->havoc_div / 256));
+      MIN(AUTOTOKENS_CHANGE_MAX,
+          MAX(AUTOTOKENS_CHANGE_MIN,
+              MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score *
+                                   afl_ptr->havoc_div / 256)));
   // DEBUGF(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
 
 #if AUTOTOKENS_SPLICE_DISABLE == 1
@@ -379,9 +396,10 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 /* I get f*cking stack overflow using C++ regex with a regex of
    "\"[[:print:]]*?\"" if this matches a long string even with regex::optimize
    enabled :-( */
-u8 my_search_string(string::const_iterator cur, string::const_iterator ende,
-                    string::const_iterator *match_begin,
-                    string::const_iterator *match_end) {
+static u8 my_search_string(string::const_iterator  cur,
+                           string::const_iterator  ende,
+                           string::const_iterator *match_begin,
+                           string::const_iterator *match_end) {
 
   string::const_iterator start = cur, found_begin;
   u8                     quote_type = 0;
@@ -460,25 +478,30 @@ u8 my_search_string(string::const_iterator cur, string::const_iterator ende,
 }
 
 /* We are not using afl_custom_queue_new_entry() because not every corpus entry
-   will be necessarily fuzzed. so we use afl_custom_queue_get() instead */
+   will be necessarily fuzzed with this custom mutator.
+   So we use afl_custom_queue_get() instead. */
 
 extern "C" unsigned char afl_custom_queue_get(void                *data,
                                               const unsigned char *filename) {
 
-  static int learn_state;
+  static int learn_state = 0;
+  static int is_first_run = 1;
   (void)(data);
 
-  if (likely(!debug)) {
+  if (unlikely(is_first_run)) {
 
-    if (unlikely(!afl_ptr->custom_only) &&
-        ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
-         (only_fav && !afl_ptr->queue_cur->favored))) {
+    is_first_run = 0;
+    first_run(data);
 
-      s = NULL;
-      DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
-      return 1;
+  }
 
-    }
+  if (unlikely(!afl_ptr->custom_only) && !create_from_thin_air &&
+      ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
+       (only_fav && !afl_ptr->queue_cur->favored))) {
+
+    s = NULL;
+    DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
+    return 1;
 
   }
 
@@ -551,6 +574,42 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
   string       fn = (char *)filename;
   auto         entry = file_mapping.find(fn);
 
+  // if there is only one active queue item at start and it is very small
+  // the we create once a structure randomly.
+  if (unlikely(create_from_thin_air)) {
+
+    if (current_id > whitespace_ids + 6 && afl_ptr->active_items == 1 &&
+        afl_ptr->queue_cur->len < AFL_TXT_MIN_LEN) {
+
+      DEBUGF(stderr, "Creating an entry from thin air...\n");
+      structure = new vector<u32>();
+      u32 item, prev, cnt = current_id >> 1;
+      structure->reserve(cnt + 4);
+      for (u32 i = 0; i < cnt; i++) {
+
+        item = rand_below(afl_ptr, current_id);
+        if (i && id_to_token[item].length() > 1 &&
+            id_to_token[prev].length() > 1) {
+
+          structure->push_back(good_whitespace_or_singleval());
+
+        }
+
+        structure->push_back(item);
+        prev = item;
+
+      }
+
+      file_mapping[fn] = structure;
+      s = structure;
+      return 1;
+
+    }
+
+    create_from_thin_air = 0;
+
+  }
+
   if (entry == file_mapping.end()) {
 
     // this input file was not analyzed for tokens yet, so let's do it!
@@ -574,8 +633,7 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       DEBUGF(stderr, "Too short (%lu) %s\n", len, filename);
       return 1;
 
-    } else
-    if (len > AFL_TXT_MAX_LEN) {
+    } else if (len > AFL_TXT_MAX_LEN) {
 
       fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
@@ -1088,6 +1146,7 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   if (getenv("AUTOTOKENS_DEBUG")) { debug = 1; }
   if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
+  if (getenv("AUTOTOKENS_CREATE_FROM_THIN_AIR")) { create_from_thin_air = 1; }
   if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
 
   if (getenv("AUTOTOKENS_LEARN_DICT")) {
-- 
cgit 1.4.1


From e6120282556e4df79c01236849e5f6f225b8e428 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Sun, 5 Feb 2023 14:19:10 +0100
Subject: dict fix

---
 custom_mutators/autotokens/README         |  3 +++
 custom_mutators/autotokens/autotokens.cpp | 22 +++++++++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index 86e7c9b3..d8613232 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -24,6 +24,9 @@ Do **not** set `AFL_DISABLE_TRIM` with this custom mutator!
                           0 = none
                           1 = only -x or autodict
                           2 = -x, autodict and `CMPLOG`
+`AUTOTOKENS_CREATE_FROM_THIN_AIR` - if only one small start file is present and
+                                    a dictionary loaded then create one initial
+                                    structure based on the dictionary.
 `AUTOTOKENS_ALTERNATIVE_TOKENIZE` - use an alternative tokenize implementation
                                    (experimental)
 `AUTOTOKENS_WHITESPACE` - whitespace string to use for ALTERNATIVE_TOKENIZE,
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index f1263600..d3ae7e9c 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -29,7 +29,7 @@ extern "C" {
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
-#define AUTOTOKENS_CREATE_FROM_THIN_AIR 1
+#define AUTOTOKENS_CREATE_FROM_THIN_AIR 0
 #define AUTOTOKENS_FUZZ_COUNT_SHIFT 0
 // 0 = no learning, 1 only from -x dict/autodict, 2 also from cmplog
 #define AUTOTOKENS_LEARN_DICT 2
@@ -506,14 +506,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
   }
 
   // check if there are new dictionary entries and add them to the tokens
-  if (valid_structures && learn_state < learn_dictionary_tokens) {
+  if (likely(valid_structures || create_from_thin_air) &&
+      learn_state < learn_dictionary_tokens) {
 
     if (unlikely(!learn_state)) { learn_state = 1; }
 
     while (extras_cnt < afl_ptr->extras_cnt) {
 
       u32 ok = 1, l = afl_ptr->extras[extras_cnt].len;
-      u8 *ptr = afl_ptr->extras[extras_cnt].data;
+      u8 *buf, *ptr = afl_ptr->extras[extras_cnt].data;
 
       for (u32 i = 0; i < l; ++i) {
 
@@ -528,14 +529,17 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       if (ok) {
 
-        token_to_id[(char *)ptr] = current_id;
-        id_to_token[current_id] = (char *)ptr;
+        buf = (u8 *)malloc(afl_ptr->extras[extras_cnt].len + 1);
+        memcpy(buf, afl_ptr->extras[extras_cnt].data,
+               afl_ptr->extras[extras_cnt].len);
+        buf[afl_ptr->extras[extras_cnt].len] = 0;
+        token_to_id[(char *)buf] = current_id;
+        id_to_token[current_id] = (char *)buf;
         ++current_id;
 
       }
 
       ++extras_cnt;
-      DEBUGF(stderr, "Added from dictionary: \"%s\"\n", ptr);
 
     }
 
@@ -600,8 +604,12 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       }
 
-      file_mapping[fn] = structure;
       s = structure;
+      file_mapping[fn] = structure;
+      id_mapping[valid_structures] = structure;
+      ++valid_structures;
+      all_structure_items += structure->size();
+
       return 1;
 
     }
-- 
cgit 1.4.1


From 8a2547073c500fcd637a7b276b7a38313bb70b5f Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 6 Feb 2023 08:51:20 +0100
Subject: more options

---
 custom_mutators/autotokens/README         |  2 ++
 custom_mutators/autotokens/TODO           |  4 +++-
 custom_mutators/autotokens/autotokens.cpp | 26 ++++++++++++++++++++++----
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index d8613232..e9c48662 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -24,6 +24,8 @@ Do **not** set `AFL_DISABLE_TRIM` with this custom mutator!
                           0 = none
                           1 = only -x or autodict
                           2 = -x, autodict and `CMPLOG`
+`AUTOTOKENS_CHANGE_MIN` - minimum number of mutations (1-256, default 8)
+`AUTOTOKENS_CHANGE_MAX` - maximum number of mutations (1-4096, default 64)
 `AUTOTOKENS_CREATE_FROM_THIN_AIR` - if only one small start file is present and
                                     a dictionary loaded then create one initial
                                     structure based on the dictionary.
diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 528dff1f..496bfd45 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -9,7 +9,6 @@ analyse welche einen DICT haben, und welche davon rein ascii
 
 corpus analyse:
 	+ libxml
-	- hardbuzz
 	- sqlite
 	- libpcap
 min len, max len, % wenn 95/98/99/100 ascii
@@ -20,3 +19,6 @@ AFL_TXT_MAX_LEN 65535
 AFL_TXT_MIN_LEN 16
 AFL_TXT_MIN_PERCENT=99
 
+-> KEIN FAV!
+
+change_min/_max werte
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index d3ae7e9c..ee35c68b 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -32,7 +32,7 @@ extern "C" {
 #define AUTOTOKENS_CREATE_FROM_THIN_AIR 0
 #define AUTOTOKENS_FUZZ_COUNT_SHIFT 0
 // 0 = no learning, 1 only from -x dict/autodict, 2 also from cmplog
-#define AUTOTOKENS_LEARN_DICT 2
+#define AUTOTOKENS_LEARN_DICT 1
 #ifndef AUTOTOKENS_SPLICE_DISABLE
   #define AUTOTOKENS_SPLICE_DISABLE 0
 #endif
@@ -64,6 +64,8 @@ static int        alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
 static int        learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
 static int        fuzz_count_shift = AUTOTOKENS_FUZZ_COUNT_SHIFT;
 static int        create_from_thin_air = AUTOTOKENS_CREATE_FROM_THIN_AIR;
+static int        change_min = AUTOTOKENS_CHANGE_MIN;
+static int        change_max = AUTOTOKENS_CHANGE_MAX;
 static u32        current_id;
 static u32        valid_structures;
 static u32        whitespace_ids;
@@ -151,8 +153,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   u32         i, m_size = (u32)m.size();
 
   u32 rounds =
-      MIN(AUTOTOKENS_CHANGE_MAX,
-          MAX(AUTOTOKENS_CHANGE_MIN,
+      MIN(change_max,
+          MAX(change_min,
               MIN(m_size >> 3, HAVOC_CYCLES * afl_ptr->queue_cur->perf_score *
                                    afl_ptr->havoc_div / 256)));
   // DEBUGF(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
@@ -1162,7 +1164,7 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
     learn_dictionary_tokens = atoi(getenv("AUTOTOKENS_LEARN_DICT"));
     if (learn_dictionary_tokens < 0 || learn_dictionary_tokens > 2) {
 
-      learn_dictionary_tokens = 2;
+      learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
 
     }
 
@@ -1175,6 +1177,22 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   }
 
+  if (getenv("AUTOTOKENS_CHANGE_MIN")) {
+
+    change_min = atoi(getenv("AUTOTOKENS_CHANGE_MIN"));
+    if (change_min < 1 || change_min > 256) { change_min = AUTOTOKENS_CHANGE_MIN; }
+
+  }
+
+  if (getenv("AUTOTOKENS_CHANGE_MAX")) {
+
+    change_max = atoi(getenv("AUTOTOKENS_CHANGE_MAX"));
+    if (change_max < 1 || change_max > 4096) { change_max = AUTOTOKENS_CHANGE_MAX; }
+
+  }
+
+  if (change_max < change_min) { change_max = change_min + 1; }
+
   if (getenv("AUTOTOKENS_WHITESPACE")) {
 
     whitespace = getenv("AUTOTOKENS_WHITESPACE");
-- 
cgit 1.4.1


From 7eaef449a1e92999c89df23ab474b3be3da595f8 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 13 Feb 2023 08:14:04 +0100
Subject: remove ALTERNATIVE_TOKENIZE

---
 custom_mutators/autotokens/autotokens.cpp | 522 ++++++++----------------------
 1 file changed, 136 insertions(+), 386 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index ee35c68b..a027ac2b 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -22,7 +22,6 @@ extern "C" {
 
 #define AUTOTOKENS_DEBUG 0
 #define AUTOTOKENS_ONLY_FAV 0
-#define AUTOTOKENS_ALTERNATIVE_TOKENIZE 0
 #define AUTOTOKENS_CHANGE_MIN 8
 #define AUTOTOKENS_CHANGE_MAX 64
 #define AUTOTOKENS_WHITESPACE " "
@@ -60,7 +59,6 @@ typedef struct my_mutator {
 static afl_state *afl_ptr;
 static int        debug = AUTOTOKENS_DEBUG;
 static int        only_fav = AUTOTOKENS_ONLY_FAV;
-static int        alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
 static int        learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
 static int        fuzz_count_shift = AUTOTOKENS_FUZZ_COUNT_SHIFT;
 static int        create_from_thin_air = AUTOTOKENS_CREATE_FROM_THIN_AIR;
@@ -142,7 +140,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   (void)(data);
 
-  if (s == NULL) {
+  if (unlikely(s == NULL)) {
 
     *out_buf = NULL;
     return 0;
@@ -183,9 +181,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         } while (unlikely(
 
             new_item == cur_item ||
-            (!alternative_tokenize &&
-             ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
-              (whitespace_ids >= new_item && whitespace_ids < cur_item)))));
+            ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
+             (whitespace_ids >= new_item && whitespace_ids < cur_item))));
 
         DEBUGF(stderr, "MUT: %u -> %u\n", cur_item, new_item);
         m[pos] = new_item;
@@ -200,37 +197,33 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
           new_item = rand_below(afl_ptr, current_id);
 
-        } while (unlikely(!alternative_tokenize && new_item >= whitespace_ids));
+        } while (unlikely(new_item >= whitespace_ids));
 
         u32 pos = rand_below(afl_ptr, m_size + 1);
         m.insert(m.begin() + pos, new_item);
         ++m_size;
         DEBUGF(stderr, "INS: %u at %u\n", new_item, pos);
 
-        if (likely(!alternative_tokenize)) {
+        // if we insert an identifier or string we might need whitespace
+        if (id_to_token[new_item].size() > 1) {
 
-          // if we insert an identifier or string we might need whitespace
-          if (id_to_token[new_item].size() > 1) {
+          // need to insert before?
 
-            // need to insert before?
+          if (pos && m[pos - 1] >= whitespace_ids &&
+              id_to_token[m[pos - 1]].size() > 1) {
 
-            if (pos && m[pos - 1] >= whitespace_ids &&
-                id_to_token[m[pos - 1]].size() > 1) {
-
-              m.insert(m.begin() + pos, good_whitespace_or_singleval());
-              ++m_size;
-
-            }
+            m.insert(m.begin() + pos, good_whitespace_or_singleval());
+            ++m_size;
 
-            if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
-                id_to_token[m[pos + 1]].size() > 1) {
+          }
 
-              // need to insert after?
+          if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
+              id_to_token[m[pos + 1]].size() > 1) {
 
-              m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
-              ++m_size;
+            // need to insert after?
 
-            }
+            m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
+            ++m_size;
 
           }
 
@@ -290,26 +283,22 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         }
 
-        if (likely(!alternative_tokenize)) {
-
-          // do we need a whitespace/token at the beginning?
-          if (dst_off && id_to_token[m[dst_off - 1]].size() > 1 &&
-              id_to_token[m[dst_off]].size() > 1) {
+        // do we need a whitespace/token at the beginning?
+        if (dst_off && id_to_token[m[dst_off - 1]].size() > 1 &&
+            id_to_token[m[dst_off]].size() > 1) {
 
-            m.insert(m.begin() + dst_off, good_whitespace_or_singleval());
-            ++m_size;
+          m.insert(m.begin() + dst_off, good_whitespace_or_singleval());
+          ++m_size;
 
-          }
+        }
 
-          // do we need a whitespace/token at the end?
-          if (dst_off + n < m_size &&
-              id_to_token[m[dst_off + n - 1]].size() > 1 &&
-              id_to_token[m[dst_off + n]].size() > 1) {
+        // do we need a whitespace/token at the end?
+        if (dst_off + n < m_size &&
+            id_to_token[m[dst_off + n - 1]].size() > 1 &&
+            id_to_token[m[dst_off + n]].size() > 1) {
 
-            m.insert(m.begin() + dst_off + n, good_whitespace_or_singleval());
-            ++m_size;
-
-          }
+          m.insert(m.begin() + dst_off + n, good_whitespace_or_singleval());
+          ++m_size;
 
         }
 
@@ -332,8 +321,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
           // if what we delete will result in a missing whitespace/token,
           // instead of deleting we switch the item to a whitespace or token.
-          if (likely(!alternative_tokenize) && pos && pos + 1 < m_size &&
-              id_to_token[m[pos - 1]].size() > 1 &&
+          if (pos && pos + 1 < m_size && id_to_token[m[pos - 1]].size() > 1 &&
               id_to_token[m[pos + 1]].size() > 1) {
 
             m[pos] = good_whitespace_or_singleval();
@@ -362,17 +350,11 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   }
 
-  u32 m_size_1 = m_size - 1;
   output = "";
 
   for (i = 0; i < m_size; ++i) {
 
     output += id_to_token[m[i]];
-    if (unlikely(alternative_tokenize && i < m_size_1)) {
-
-      output += whitespace;
-
-    }
 
   }
 
@@ -725,109 +707,57 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     DEBUGF(stderr, "START!\n");
 
-    if (likely(!alternative_tokenize)) {
+    while (my_search_string(cur, ende, &match_begin, &match_end)) {
 
-      while (my_search_string(cur, ende, &match_begin, &match_end)) {
+      prev = cur;
+      found = match_begin;
+      cur = match_end;
 
-        prev = cur;
-        found = match_begin;
-        cur = match_end;
+      IFDEBUG {
 
-        IFDEBUG {
-
-          string foo(match_begin, match_end);
-          DEBUGF(stderr,
-                 "string %s found at start %lu offset %lu continue at %lu\n",
-                 foo.c_str(), prev - input.begin(), found - prev,
-                 cur - input.begin());
-
-        }
-
-        if (prev < found) {  // there are items between search start and find
-          while (prev < found) {
-
-            if (isspace(*prev)) {
-
-              auto start = prev;
-              while (isspace(*prev)) {
-
-                ++prev;
-
-              }
-
-              tokens.push_back(std::string(start, prev));
-              DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
-                     tokens[tokens.size() - 1].c_str());
-
-            } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
-
-              auto start = prev;
-              while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
-                     *prev == '.' || *prev == '/') {
-
-                ++prev;
-
-              }
-
-              tokens.push_back(string(start, prev));
-              DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
-                     tokens[tokens.size() - 1].c_str());
-
-            } else {
-
-              tokens.push_back(string(prev, prev + 1));
-              DEBUGF(stderr, "OTHER \"%c\"\n", *prev);
-              ++prev;
-
-            }
-
-          }
-
-        }
-
-        tokens.push_back(string(match_begin, match_end));
-        DEBUGF(stderr, "TOK: %s\n", tokens[tokens.size() - 1].c_str());
+        string foo(match_begin, match_end);
+        DEBUGF(stderr,
+               "string %s found at start %lu offset %lu continue at %lu\n",
+               foo.c_str(), prev - input.begin(), found - prev,
+               cur - input.begin());
 
       }
 
-      DEBUGF(stderr, "AFTER all strings\n");
-
-      if (cur < ende) {
+      if (prev < found) {  // there are items between search start and find
+        while (prev < found) {
 
-        while (cur < ende) {
+          if (isspace(*prev)) {
 
-          if (isspace(*cur)) {
+            auto start = prev;
+            while (isspace(*prev)) {
 
-            auto start = cur;
-            while (isspace(*cur)) {
-
-              ++cur;
+              ++prev;
 
             }
 
-            tokens.push_back(std::string(start, cur));
-            DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
+            tokens.push_back(std::string(start, prev));
+            DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
                    tokens[tokens.size() - 1].c_str());
 
-          } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
+          } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
 
-            auto start = cur;
-            while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
-                   *cur == '/') {
+            auto start = prev;
+            while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
+                   *prev == '.' || *prev == '/') {
 
-              ++cur;
+              ++prev;
 
             }
 
-            tokens.push_back(std::string(start, cur));
-            DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
+            tokens.push_back(string(start, prev));
+            DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
                    tokens[tokens.size() - 1].c_str());
 
           } else {
 
-            tokens.push_back(std::string(cur, cur + 1));
-            DEBUGF(stderr, "OTHER \"%c\"\n", *cur);
-            ++cur;
+            tokens.push_back(string(prev, prev + 1));
+            DEBUGF(stderr, "OTHER \"%c\"\n", *prev);
+            ++prev;
 
           }
 
@@ -835,226 +765,49 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
       }
 
-    } else {
-
-      // alternative tokenize
-      while (my_search_string(cur, ende, &match_begin, &match_end)) {
-
-        prev = cur;
-        found = match_begin;
-        cur = match_end;
-        IFDEBUG {
-
-          string foo(match_begin, match_end);
-          DEBUGF(stderr,
-                 "string %s found at start %lu offset %lu continue at %lu\n",
-                 foo.c_str(), prev - input.begin(), found - prev,
-                 cur - input.begin());
-
-        }
-
-        if (prev < found) {  // there are items between search start and find
-
-          sregex_token_iterator it{prev, found, regex_whitespace, -1};
-          vector<std::string>   tokenized{it, {}};
-          tokenized.erase(std::remove_if(tokenized.begin(), tokenized.end(),
-                                         [](std::string const &s) {
-
-                                           return s.size() == 0;
-
-                                         }),
-
-                          tokenized.end());
-          tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
-
-          IFDEBUG {
-
-            DEBUGF(stderr, "tokens1: %lu   input size: %lu\n", tokenized.size(),
-                   input.size());
-            for (auto x : tokenized) {
-
-              cerr << x << endl;
-
-            }
-
-          }
-
-          for (auto token : tokenized) {
-
-            string::const_iterator c = token.begin(), e = token.end(), f, p;
-            smatch                 m;
-
-            while (regex_search(c, e, m, regex_word)) {
-
-              p = c;
-              f = m[0].first;
-              c = m[0].second;
-              if (p < f) {
-
-                // there are items between search start and find
-                while (p < f) {
-
-                  IFDEBUG {
+      tokens.push_back(string(match_begin, match_end));
+      DEBUGF(stderr, "TOK: %s\n", tokens[tokens.size() - 1].c_str());
 
-                    string foo(p, p + 1);
-                    DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
-
-                  }
-
-                  tokens.push_back(std::string(p, p + 1));
-                  ++p;
-
-                }
-
-                IFDEBUG {
-
-                  string foo(p, f);
-                  DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
-                  tokens.push_back(std::string(p, f));
-
-                }
-
-              }
-
-              DEBUGF(stderr,
-                     "SUBstring \"%s\" found at start %lu offset %lu continue "
-                     "at %lu\n",
-                     m[0].str().c_str(), p - input.begin(), m.position(),
-                     c - token.begin());
-              tokens.push_back(m[0].str());
-
-            }
-
-            if (c < e) {
-
-              while (c < e) {
-
-                IFDEBUG {
-
-                  string foo(c, c + 1);
-                  DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
-
-                }
-
-                tokens.push_back(std::string(c, c + 1));
-                ++c;
-
-              }
-
-              IFDEBUG {
-
-                string foo(c, e);
-                DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
-
-              }
-
-              tokens.push_back(std::string(c, e));
-
-            }
-
-          }
-
-        }
-
-        tokens.push_back(string(match_begin, match_end));
+    }
 
-      }
+    DEBUGF(stderr, "AFTER all strings\n");
 
-      if (cur < ende) {
+    if (cur < ende) {
 
-        sregex_token_iterator it{cur, ende, regex_whitespace, -1};
-        vector<std::string>   tokenized{it, {}};
-        tokenized.erase(
-            std::remove_if(tokenized.begin(), tokenized.end(),
-                           [](std::string const &s) { return s.size() == 0; }),
-            tokenized.end());
-        tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+      while (cur < ende) {
 
-        IFDEBUG {
+        if (isspace(*cur)) {
 
-          DEBUGF(stderr, "tokens2: %lu   input size: %lu\n", tokenized.size(),
-                 input.size());
-          for (auto x : tokenized) {
+          auto start = cur;
+          while (isspace(*cur)) {
 
-            cerr << x << endl;
+            ++cur;
 
           }
 
-        }
-
-        for (auto token : tokenized) {
-
-          string::const_iterator c = token.begin(), e = token.end(), f, p;
-          smatch                 m;
-
-          while (regex_search(c, e, m, regex_word)) {
-
-            p = c;
-            f = m[0].first;
-            c = m[0].second;
-            if (p < f) {
+          tokens.push_back(std::string(start, cur));
+          DEBUGF(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
+                 tokens[tokens.size() - 1].c_str());
 
-              // there are items between search start and find
-              while (p < f) {
+        } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
 
-                IFDEBUG {
+          auto start = cur;
+          while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
+                 *cur == '/') {
 
-                  string foo(p, p + 1);
-                  DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
-
-                }
-
-                tokens.push_back(std::string(p, p + 1));
-                ++p;
-
-              }
-
-              IFDEBUG {
-
-                string foo(p, f);
-                DEBUGF(stderr, "before string: \"%s\"\n", foo.c_str());
-
-              }
-
-              tokens.push_back(std::string(p, f));
-
-            }
-
-            DEBUGF(stderr,
-                   "SUB2string \"%s\" found at start %lu offset %lu continue "
-                   "at %lu\n",
-                   m[0].str().c_str(), p - input.begin(), m.position(),
-                   c - token.begin());
-            tokens.push_back(m[0].str());
+            ++cur;
 
           }
 
-          if (c < e) {
-
-            while (c < e) {
-
-              IFDEBUG {
-
-                string foo(c, c + 1);
-                DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
-
-              }
-
-              tokens.push_back(std::string(c, c + 1));
-              ++c;
-
-            }
+          tokens.push_back(std::string(start, cur));
+          DEBUGF(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
+                 tokens[tokens.size() - 1].c_str());
 
-            IFDEBUG {
-
-              string foo(c, e);
-              DEBUGF(stderr, "after string: \"%s\"\n", foo.c_str());
-
-            }
+        } else {
 
-            tokens.push_back(std::string(c, e));
-
-          }
+          tokens.push_back(std::string(cur, cur + 1));
+          DEBUGF(stderr, "OTHER \"%c\"\n", *cur);
+          ++cur;
 
         }
 
@@ -1065,15 +818,9 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     IFDEBUG {
 
       DEBUGF(stderr, "DUMPING TOKENS:\n");
-      u32 size_1 = tokens.size() - 1;
       for (u32 i = 0; i < tokens.size(); ++i) {
 
         DEBUGF(stderr, "%s", tokens[i].c_str());
-        if (unlikely(alternative_tokenize && i < size_1)) {
-
-          DEBUGF(stderr, "%s", whitespace.c_str());
-
-        }
 
       }
 
@@ -1157,7 +904,6 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   if (getenv("AUTOTOKENS_DEBUG")) { debug = 1; }
   if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
   if (getenv("AUTOTOKENS_CREATE_FROM_THIN_AIR")) { create_from_thin_air = 1; }
-  if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
 
   if (getenv("AUTOTOKENS_LEARN_DICT")) {
 
@@ -1180,14 +926,22 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   if (getenv("AUTOTOKENS_CHANGE_MIN")) {
 
     change_min = atoi(getenv("AUTOTOKENS_CHANGE_MIN"));
-    if (change_min < 1 || change_min > 256) { change_min = AUTOTOKENS_CHANGE_MIN; }
+    if (change_min < 1 || change_min > 256) {
+
+      change_min = AUTOTOKENS_CHANGE_MIN;
+
+    }
 
   }
 
   if (getenv("AUTOTOKENS_CHANGE_MAX")) {
 
     change_max = atoi(getenv("AUTOTOKENS_CHANGE_MAX"));
-    if (change_max < 1 || change_max > 4096) { change_max = AUTOTOKENS_CHANGE_MAX; }
+    if (change_max < 1 || change_max > 4096) {
+
+      change_max = AUTOTOKENS_CHANGE_MAX;
+
+    }
 
   }
 
@@ -1212,53 +966,49 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   // set common whitespace tokens
   // we deliberately do not put uncommon ones here to these will count as
   // identifier tokens.
-  if (!alternative_tokenize) {
-
-    token_to_id[" "] = current_id;
-    id_to_token[current_id] = " ";
-    ++current_id;
-    token_to_id["\t"] = current_id;
-    id_to_token[current_id] = "\t";
-    ++current_id;
-    token_to_id["\n"] = current_id;
-    id_to_token[current_id] = "\n";
-    ++current_id;
-    token_to_id["\r\n"] = current_id;
-    id_to_token[current_id] = "\r\n";
-    ++current_id;
-    token_to_id[" \n"] = current_id;
-    id_to_token[current_id] = " \n";
-    ++current_id;
-    token_to_id["  "] = current_id;
-    id_to_token[current_id] = "  ";
-    ++current_id;
-    token_to_id["\t\t"] = current_id;
-    id_to_token[current_id] = "\t\t";
-    ++current_id;
-    token_to_id["\n\n"] = current_id;
-    id_to_token[current_id] = "\n\n";
-    ++current_id;
-    token_to_id["\r\n\r\n"] = current_id;
-    id_to_token[current_id] = "\r\n\r\n";
-    ++current_id;
-    token_to_id["    "] = current_id;
-    id_to_token[current_id] = "    ";
-    ++current_id;
-    token_to_id["\t\t\t\t"] = current_id;
-    id_to_token[current_id] = "\t\t\t\t";
-    ++current_id;
-    token_to_id["\n\n\n\n"] = current_id;
-    id_to_token[current_id] = "\n\n\n\n";
-    ++current_id;
-    whitespace_ids = current_id;
-    token_to_id["\""] = current_id;
-    id_to_token[current_id] = "\"";
-    ++current_id;
-    token_to_id["'"] = current_id;
-    id_to_token[current_id] = "'";
-    ++current_id;
-
-  }
+  token_to_id[" "] = current_id;
+  id_to_token[current_id] = " ";
+  ++current_id;
+  token_to_id["\t"] = current_id;
+  id_to_token[current_id] = "\t";
+  ++current_id;
+  token_to_id["\n"] = current_id;
+  id_to_token[current_id] = "\n";
+  ++current_id;
+  token_to_id["\r\n"] = current_id;
+  id_to_token[current_id] = "\r\n";
+  ++current_id;
+  token_to_id[" \n"] = current_id;
+  id_to_token[current_id] = " \n";
+  ++current_id;
+  token_to_id["  "] = current_id;
+  id_to_token[current_id] = "  ";
+  ++current_id;
+  token_to_id["\t\t"] = current_id;
+  id_to_token[current_id] = "\t\t";
+  ++current_id;
+  token_to_id["\n\n"] = current_id;
+  id_to_token[current_id] = "\n\n";
+  ++current_id;
+  token_to_id["\r\n\r\n"] = current_id;
+  id_to_token[current_id] = "\r\n\r\n";
+  ++current_id;
+  token_to_id["    "] = current_id;
+  id_to_token[current_id] = "    ";
+  ++current_id;
+  token_to_id["\t\t\t\t"] = current_id;
+  id_to_token[current_id] = "\t\t\t\t";
+  ++current_id;
+  token_to_id["\n\n\n\n"] = current_id;
+  id_to_token[current_id] = "\n\n\n\n";
+  ++current_id;
+  whitespace_ids = current_id;
+  token_to_id["\""] = current_id;
+  id_to_token[current_id] = "\"";
+  ++current_id;
+  token_to_id["'"] = current_id;
+  id_to_token[current_id] = "'";
+  ++current_id;
 
   return data;
 
-- 
cgit 1.4.1


From 240f6421d8240b4b4d4d5bd509c0c3277a083896 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 13 Feb 2023 08:23:47 +0100
Subject: optimize performance

---
 custom_mutators/autotokens/autotokens.cpp | 80 +++++++++----------------------
 1 file changed, 23 insertions(+), 57 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index a027ac2b..ca738d0b 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -204,31 +204,6 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         ++m_size;
         DEBUGF(stderr, "INS: %u at %u\n", new_item, pos);
 
-        // if we insert an identifier or string we might need whitespace
-        if (id_to_token[new_item].size() > 1) {
-
-          // need to insert before?
-
-          if (pos && m[pos - 1] >= whitespace_ids &&
-              id_to_token[m[pos - 1]].size() > 1) {
-
-            m.insert(m.begin() + pos, good_whitespace_or_singleval());
-            ++m_size;
-
-          }
-
-          if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
-              id_to_token[m[pos + 1]].size() > 1) {
-
-            // need to insert after?
-
-            m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
-            ++m_size;
-
-          }
-
-        }
-
         break;
 
       }
@@ -283,25 +258,6 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
         }
 
-        // do we need a whitespace/token at the beginning?
-        if (dst_off && id_to_token[m[dst_off - 1]].size() > 1 &&
-            id_to_token[m[dst_off]].size() > 1) {
-
-          m.insert(m.begin() + dst_off, good_whitespace_or_singleval());
-          ++m_size;
-
-        }
-
-        // do we need a whitespace/token at the end?
-        if (dst_off + n < m_size &&
-            id_to_token[m[dst_off + n - 1]].size() > 1 &&
-            id_to_token[m[dst_off + n]].size() > 1) {
-
-          m.insert(m.begin() + dst_off + n, good_whitespace_or_singleval());
-          ++m_size;
-
-        }
-
         break;
 
       }
@@ -319,19 +275,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
           } while (unlikely(m[pos] < whitespace_ids));
 
-          // if what we delete will result in a missing whitespace/token,
-          // instead of deleting we switch the item to a whitespace or token.
-          if (pos && pos + 1 < m_size && id_to_token[m[pos - 1]].size() > 1 &&
-              id_to_token[m[pos + 1]].size() > 1) {
-
-            m[pos] = good_whitespace_or_singleval();
-
-          } else {
-
-            m.erase(m.begin() + pos);
-            --m_size;
-
-          }
+          m.erase(m.begin() + pos);
+          --m_size;
 
         } else {
 
@@ -350,10 +295,31 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   }
 
+  /* Now we create the output */
+
   output = "";
+  u32 prev_size = 0;
 
   for (i = 0; i < m_size; ++i) {
 
+    if (likely(i + 1 < m_size)) {
+
+      u32 this_size = id_to_token[m[i]].size();
+
+      /* The output we are generating might need repairing.
+         General rule: two items that have a size larger than 1 are strings
+         or identifiers and need a whitespace or an item of length 1 in
+         between. */
+      if (unlikely(prev_size > 1 && this_size > 1)) {
+
+        output += id_to_token[good_whitespace_or_singleval()];
+
+      }
+
+      prev_size = this_size;
+
+    }
+
     output += id_to_token[m[i]];
 
   }
-- 
cgit 1.4.1


From 61439859cece05cd3e204af60bb5ff08556c490d Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 13 Feb 2023 08:26:30 +0100
Subject: cleanup

---
 custom_mutators/autotokens/README         | 4 ----
 custom_mutators/autotokens/autotokens.cpp | 8 --------
 2 files changed, 12 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index e9c48662..904b5fa3 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -29,7 +29,3 @@ Do **not** set `AFL_DISABLE_TRIM` with this custom mutator!
 `AUTOTOKENS_CREATE_FROM_THIN_AIR` - if only one small start file is present and
                                     a dictionary loaded then create one initial
                                     structure based on the dictionary.
-`AUTOTOKENS_ALTERNATIVE_TOKENIZE` - use an alternative tokenize implementation
-                                   (experimental)
-`AUTOTOKENS_WHITESPACE` - whitespace string to use for ALTERNATIVE_TOKENIZE,
-                          default is " "
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index ca738d0b..10afa2c2 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -24,7 +24,6 @@ extern "C" {
 #define AUTOTOKENS_ONLY_FAV 0
 #define AUTOTOKENS_CHANGE_MIN 8
 #define AUTOTOKENS_CHANGE_MAX 64
-#define AUTOTOKENS_WHITESPACE " "
 #define AUTOTOKENS_SIZE_MIN 8
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
@@ -75,7 +74,6 @@ static unordered_map<string, vector<u32> *> file_mapping;
 static unordered_map<u32, vector<u32> *>    id_mapping;
 static unordered_map<string, u32>           token_to_id;
 static unordered_map<u32, string>           id_to_token;
-static string                               whitespace = AUTOTOKENS_WHITESPACE;
 static string                               output;
 static regex                               *regex_comment_custom;
 // multiline requires g++-11 libs :(
@@ -913,12 +911,6 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
 
   if (change_max < change_min) { change_max = change_min + 1; }
 
-  if (getenv("AUTOTOKENS_WHITESPACE")) {
-
-    whitespace = getenv("AUTOTOKENS_WHITESPACE");
-
-  }
-
   if (getenv("AUTOTOKENS_COMMENT")) {
 
     char buf[256];
-- 
cgit 1.4.1


From 54fa78d32ce6779117a656c72f5c630713e7033f Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Mon, 13 Feb 2023 09:52:57 +0100
Subject: autodisable and better performance

---
 custom_mutators/autotokens/Makefile       |  12 ++-
 custom_mutators/autotokens/TODO           |  21 -----
 custom_mutators/autotokens/autotokens.cpp | 143 +++++++++++++++++++++++-------
 include/config.h                          |   4 +
 src/afl-fuzz-queue.c                      |  89 ++++++++++++-------
 5 files changed, 179 insertions(+), 90 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/Makefile b/custom_mutators/autotokens/Makefile
index ab1da4b6..6ee7d324 100644
--- a/custom_mutators/autotokens/Makefile
+++ b/custom_mutators/autotokens/Makefile
@@ -1,16 +1,22 @@
 ifdef debug
-	CFLAGS += -fsanitize=address -Wall
+	CPPLAGS += -fsanitize=address
+	CXXFLAGS += -Wall
+	CC := clang
 	CXX := clang++
 endif
 ifdef DEBUG
-	CFLAGS += -fsanitize=address -Wall
+	CPPFLAGS += -fsanitize=address
+	CXXFLAGS += -Wall
+	CC := clang
 	CXX := clang++
 endif
 
 all:	autotokens.so
 
 autotokens.so:	autotokens.cpp
-	$(CXX) -g -O3 $(CFLAGS) -shared -fPIC -o autotokens.so -I../../include autotokens.cpp ../../src/afl-performance.o
+	$(CC) -D_STANDALONE_MODULE=1 -I../../include -g -O3 $(CPPFLAGS) -fPIC -c -o ./afl-fuzz-queue.o ../../src/afl-fuzz-queue.c
+	$(CC) -I../../include -g -O3 $(CPPFLAGS) -DBIN_PATH=\"dummy\" -Wno-pointer-sign -fPIC -c -o ./afl-common.o ../../src/afl-common.c
+	$(CXX) -Wno-deprecated -g -O3 $(CXXFLAGS) $(CPPFLAGS) -shared -fPIC -o autotokens.so -I../../include autotokens.cpp  ./afl-fuzz-queue.o ../../src/afl-performance.o ./afl-common.o
 
 clean:
 	rm -f autotokens.so *~ core
diff --git a/custom_mutators/autotokens/TODO b/custom_mutators/autotokens/TODO
index 496bfd45..2e99e147 100644
--- a/custom_mutators/autotokens/TODO
+++ b/custom_mutators/autotokens/TODO
@@ -1,24 +1,3 @@
-create from thin air if no good seed after a cycle and dict large enough?
-(static u32 no_of_struct_inputs;) 
-
-splicing -> check if whitespace/token is needed
-
-whitespace/token check only AFTER mutation
-
-analyse welche einen DICT haben, und welche davon rein ascii
-
-corpus analyse:
-	+ libxml
-	- sqlite
-	- libpcap
-min len, max len, % wenn 95/98/99/100 ascii
-
 env für menge an per mutation run
 
-AFL_TXT_MAX_LEN 65535
-AFL_TXT_MIN_LEN 16
-AFL_TXT_MIN_PERCENT=99
-
--> KEIN FAV!
-
 change_min/_max werte
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 10afa2c2..cda90a38 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -25,10 +25,12 @@ extern "C" {
 #define AUTOTOKENS_CHANGE_MIN 8
 #define AUTOTOKENS_CHANGE_MAX 64
 #define AUTOTOKENS_SIZE_MIN 8
+#define AUTOTOKENS_SIZE_MAX 65535
 #define AUTOTOKENS_SPLICE_MIN 4
 #define AUTOTOKENS_SPLICE_MAX 64
 #define AUTOTOKENS_CREATE_FROM_THIN_AIR 0
 #define AUTOTOKENS_FUZZ_COUNT_SHIFT 0
+#define AUTOTOKENS_AUTO_DISABLE 0
 // 0 = no learning, 1 only from -x dict/autodict, 2 also from cmplog
 #define AUTOTOKENS_LEARN_DICT 1
 #ifndef AUTOTOKENS_SPLICE_DISABLE
@@ -56,6 +58,8 @@ typedef struct my_mutator {
 #define IFDEBUG if (unlikely(debug))
 
 static afl_state *afl_ptr;
+static int        module_disabled = 0;
+static int        auto_disable = AUTOTOKENS_AUTO_DISABLE;
 static int        debug = AUTOTOKENS_DEBUG;
 static int        only_fav = AUTOTOKENS_ONLY_FAV;
 static int        learn_dictionary_tokens = AUTOTOKENS_LEARN_DICT;
@@ -93,6 +97,99 @@ static void first_run(void *data) {
 
   (void)(data);
 
+  /* For auto-loading this module we check whether the seed inputs (or, with
+     only one tiny seed, the dictionary entries) look like text, and disable
+     the module if they do not. */
+
+  if (afl_ptr->custom_only || !auto_disable) { return; }
+
+  if (unlikely(afl_ptr->active_items == 1 &&
+               afl_ptr->queue_cur->len < AFL_TXT_MIN_LEN)) {
+
+    if (afl_ptr->extras_cnt > 8) {
+
+      u32 valid = 0;
+
+      while (extras_cnt < afl_ptr->extras_cnt) {
+
+        u32 ok = 1, l = afl_ptr->extras[extras_cnt].len;
+        u8 *buf, *ptr = afl_ptr->extras[extras_cnt].data;
+
+        for (u32 i = 0; i < l; ++i) {
+
+          if (!isascii((int)ptr[i]) && !isprint((int)ptr[i])) {
+
+            ok = 0;
+            break;
+
+          }
+
+        }
+
+        if (ok) {
+
+          buf = (u8 *)malloc(afl_ptr->extras[extras_cnt].len + 1);
+          memcpy(buf, afl_ptr->extras[extras_cnt].data,
+                 afl_ptr->extras[extras_cnt].len);
+          buf[afl_ptr->extras[extras_cnt].len] = 0;
+          token_to_id[(char *)buf] = current_id;
+          id_to_token[current_id] = (char *)buf;
+          ++current_id;
+          ++valid;
+
+        }
+
+        ++extras_cnt;
+
+      }
+
+      if ((valid * 100) / afl_ptr->extras_cnt < 95) { module_disabled = 1; }
+
+    } else {
+
+      module_disabled = 1;
+
+    }
+
+    return;
+
+  }
+
+  u32 is_ascii = 0, valid = 0;
+
+  for (u32 i = 0; i < afl_ptr->queued_items; ++i) {
+
+    struct queue_entry *q;
+
+    q = afl_ptr->queue_buf[i];
+
+    if (!q->disabled && q->len >= AUTOTOKENS_SIZE_MIN &&
+        q->len <= AFL_TXT_MAX_LEN) {
+
+      ++valid;
+      u8 *input = queue_testcase_get(afl_ptr, q);
+
+      u32 valid_chars = 0;
+      for (u32 i = 0; i < q->len; ++i) {
+
+        if (isascii((int)input[i]) || isprint((int)input[i])) { ++valid_chars; }
+
+      }
+
+      // we want at least 99% of text characters ...
+      if (((q->len * AFL_TXT_MIN_PERCENT) / 100) <= valid_chars) {
+
+        ++is_ascii;
+        q->is_ascii = 1;
+
+      }
+
+    }
+
+  }
+
+  if ((is_ascii * 100) / valid < 70) { module_disabled = 1; }
+
 }
 
 static u32 good_whitespace_or_singleval() {
@@ -441,21 +538,25 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     is_first_run = 0;
     first_run(data);
 
+    if (module_disabled) { WARNF("Autotokens custom module is disabled."); }
+
   }
 
-  if (unlikely(!afl_ptr->custom_only) && !create_from_thin_air &&
-      ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
-       (only_fav && !afl_ptr->queue_cur->favored))) {
+  if (likely(module_disabled) ||
+      (unlikely(!afl_ptr->custom_only) && !create_from_thin_air &&
+       ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
+        (only_fav && !afl_ptr->queue_cur->favored)))) {
 
     s = NULL;
-    DEBUGF(stderr, "cmplog not ascii or only_fav and not favorite\n");
+    DEBUGF(stderr,
+           "cmplog not ascii or only_fav and not favorite or disabled\n");
     return 1;
 
   }
 
   // check if there are new dictionary entries and add them to the tokens
-  if (likely(valid_structures || create_from_thin_air) &&
-      learn_state < learn_dictionary_tokens) {
+  if (unlikely(learn_state < learn_dictionary_tokens) &&
+      likely(valid_structures || create_from_thin_air)) {
 
     if (unlikely(!learn_state)) { learn_state = 1; }
 
@@ -569,21 +670,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
   if (entry == file_mapping.end()) {
 
     // this input file was not analyzed for tokens yet, so let's do it!
-
-    FILE *fp = fopen((char *)filename, "rb");
-    if (!fp) {
-
-      s = NULL;
-      return 1;
-
-    }  // should not happen
-
-    fseek(fp, 0, SEEK_END);
-    size_t len = (size_t)ftell(fp);
+    size_t len = afl_ptr->queue_cur->len;
 
     if (len < AFL_TXT_MIN_LEN) {
 
-      fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
       s = NULL;
       DEBUGF(stderr, "Too short (%lu) %s\n", len, filename);
@@ -591,7 +681,6 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     } else if (len > AFL_TXT_MAX_LEN) {
 
-      fclose(fp);
       file_mapping[fn] = structure;  // NULL ptr so we don't read the file again
       s = NULL;
       DEBUGF(stderr, "Too long (%lu) %s\n", len, filename);
@@ -599,19 +688,8 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
 
     }
 
-    string input;
-    input.resize(len);
-    rewind(fp);
-
-    if (fread((void *)input.data(), 1, len, fp) != len) {
-
-      s = NULL;
-      DEBUGF(stderr, "Too short read %s\n", filename);
-      return 1;
-
-    }
-
-    fclose(fp);
+    u8    *input_buf = queue_testcase_get(afl_ptr, afl_ptr->queue_cur);
+    string input((char *)input_buf, afl_ptr->queue_cur->len);
 
     if (!afl_ptr->shm.cmplog_mode) {
 
@@ -866,6 +944,7 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
   }
 
   if (getenv("AUTOTOKENS_DEBUG")) { debug = 1; }
+  if (getenv("AUTOTOKENS_AUTO_DISABLE")) { auto_disable = 1; }
   if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
   if (getenv("AUTOTOKENS_CREATE_FROM_THIN_AIR")) { create_from_thin_air = 1; }
 
diff --git a/include/config.h b/include/config.h
index 49d09174..ad8b76a8 100644
--- a/include/config.h
+++ b/include/config.h
@@ -491,6 +491,10 @@
 
 #define AFL_TXT_MIN_LEN 16
 
+/* Maximum length of a queue input to be evaluated for "is_ascii"? */
+
+#define AFL_TXT_MAX_LEN 65535
+
 /* What is the minimum percentage of ascii characters present to be classifed
    as "is_ascii"? */
 
diff --git a/src/afl-fuzz-queue.c b/src/afl-fuzz-queue.c
index e3faa392..3c8a3e46 100644
--- a/src/afl-fuzz-queue.c
+++ b/src/afl-fuzz-queue.c
@@ -27,6 +27,22 @@
 #include <ctype.h>
 #include <math.h>
 
+#ifdef _STANDALONE_MODULE
+void minimize_bits(afl_state_t *afl, u8 *dst, u8 *src) {
+
+  return;
+
+}
+
+void run_afl_custom_queue_new_entry(afl_state_t *afl, struct queue_entry *q,
+                                    u8 *a, u8 *b) {
+
+  return;
+
+}
+
+#endif
+
 /* select next queue entry based on alias algo - fast! */
 
 inline u32 select_next_queue_entry(afl_state_t *afl) {
@@ -78,8 +94,8 @@ void create_alias_table(afl_state_t *afl) {
   afl->alias_probability = (double *)afl_realloc(
       (void **)&afl->alias_probability, n * sizeof(double));
   double *P = (double *)afl_realloc(AFL_BUF_PARAM(out), n * sizeof(double));
-  int    *S = (u32 *)afl_realloc(AFL_BUF_PARAM(out_scratch), n * sizeof(u32));
-  int    *L = (u32 *)afl_realloc(AFL_BUF_PARAM(in_scratch), n * sizeof(u32));
+  int    *S = (int *)afl_realloc(AFL_BUF_PARAM(out_scratch), n * sizeof(u32));
+  int    *L = (int *)afl_realloc(AFL_BUF_PARAM(in_scratch), n * sizeof(u32));
 
   if (!P || !S || !L || !afl->alias_table || !afl->alias_probability) {
 
@@ -247,11 +263,11 @@ void create_alias_table(afl_state_t *afl) {
 
 void mark_as_det_done(afl_state_t *afl, struct queue_entry *q) {
 
-  u8  fn[PATH_MAX];
-  s32 fd;
+  char fn[PATH_MAX];
+  s32  fd;
 
   snprintf(fn, PATH_MAX, "%s/queue/.state/deterministic_done/%s", afl->out_dir,
-           strrchr(q->fname, '/') + 1);
+           strrchr((char *)q->fname, '/') + 1);
 
   fd = open(fn, O_WRONLY | O_CREAT | O_EXCL, DEFAULT_PERMISSION);
   if (fd < 0) { PFATAL("Unable to create '%s'", fn); }
@@ -266,10 +282,10 @@ void mark_as_det_done(afl_state_t *afl, struct queue_entry *q) {
 
 void mark_as_variable(afl_state_t *afl, struct queue_entry *q) {
 
-  u8 fn[PATH_MAX];
-  u8 ldest[PATH_MAX];
+  char fn[PATH_MAX];
+  char ldest[PATH_MAX];
 
-  u8 *fn_name = strrchr(q->fname, '/') + 1;
+  char *fn_name = strrchr((char *)q->fname, '/') + 1;
 
   sprintf(ldest, "../../%s", fn_name);
   sprintf(fn, "%s/queue/.state/variable_behavior/%s", afl->out_dir, fn_name);
@@ -293,12 +309,12 @@ void mark_as_redundant(afl_state_t *afl, struct queue_entry *q, u8 state) {
 
   if (likely(state == q->fs_redundant)) { return; }
 
-  u8 fn[PATH_MAX];
+  char fn[PATH_MAX];
 
   q->fs_redundant = state;
 
   sprintf(fn, "%s/queue/.state/redundant_edges/%s", afl->out_dir,
-          strrchr(q->fname, '/') + 1);
+          strrchr((char *)q->fname, '/') + 1);
 
   if (state) {
 
@@ -409,7 +425,7 @@ u8 check_if_text_buf(u8 *buf, u32 len) {
 
 static u8 check_if_text(afl_state_t *afl, struct queue_entry *q) {
 
-  if (q->len < AFL_TXT_MIN_LEN) return 0;
+  if (q->len < AFL_TXT_MIN_LEN || q->len < AFL_TXT_MAX_LEN) return 0;
 
   u8     *buf;
   int     fd;
@@ -417,8 +433,8 @@ static u8 check_if_text(afl_state_t *afl, struct queue_entry *q) {
   ssize_t comp;
 
   if (len >= MAX_FILE) len = MAX_FILE - 1;
-  if ((fd = open(q->fname, O_RDONLY)) < 0) return 0;
-  buf = afl_realloc(AFL_BUF_PARAM(in_scratch), len + 1);
+  if ((fd = open((char *)q->fname, O_RDONLY)) < 0) return 0;
+  buf = (u8 *)afl_realloc(AFL_BUF_PARAM(in_scratch), len + 1);
   comp = read(fd, buf, len);
   close(fd);
   if (comp != (ssize_t)len) return 0;
@@ -520,7 +536,8 @@ static u8 check_if_text(afl_state_t *afl, struct queue_entry *q) {
 
 void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
 
-  struct queue_entry *q = ck_alloc(sizeof(struct queue_entry));
+  struct queue_entry *q =
+      (struct queue_entry *)ck_alloc(sizeof(struct queue_entry));
 
   q->fname = fname;
   q->len = len;
@@ -554,7 +571,7 @@ void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
 
   afl->cycles_wo_finds = 0;
 
-  struct queue_entry **queue_buf = afl_realloc(
+  struct queue_entry **queue_buf = (struct queue_entry **)afl_realloc(
       AFL_BUF_PARAM(queue), afl->queued_items * sizeof(struct queue_entry *));
   if (unlikely(!queue_buf)) { PFATAL("alloc"); }
   queue_buf[afl->queued_items - 1] = q;
@@ -574,7 +591,11 @@ void add_to_queue(afl_state_t *afl, u8 *fname, u32 len, u8 passed_det) {
   }
 
   /* only redqueen currently uses is_ascii */
-  if (afl->shm.cmplog_mode) q->is_ascii = check_if_text(afl, q);
+  if (unlikely(afl->shm.cmplog_mode && !q->is_ascii)) {
+
+    q->is_ascii = check_if_text(afl, q);
+
+  }
 
 }
 
@@ -704,7 +725,7 @@ void update_bitmap_score(afl_state_t *afl, struct queue_entry *q) {
       if (!q->trace_mini) {
 
         u32 len = (afl->fsrv.map_size >> 3);
-        q->trace_mini = ck_alloc(len);
+        q->trace_mini = (u8 *)ck_alloc(len);
         minimize_bits(afl, q->trace_mini, afl->fsrv.trace_bits);
 
       }
@@ -1090,19 +1111,19 @@ inline void queue_testcase_retake(afl_state_t *afl, struct queue_entry *q,
     if (len != old_len) {
 
       afl->q_testcase_cache_size = afl->q_testcase_cache_size + len - old_len;
-      q->testcase_buf = realloc(q->testcase_buf, len);
+      q->testcase_buf = (u8 *)realloc(q->testcase_buf, len);
 
       if (unlikely(!q->testcase_buf)) {
 
-        PFATAL("Unable to malloc '%s' with len %u", q->fname, len);
+        PFATAL("Unable to malloc '%s' with len %u", (char *)q->fname, len);
 
       }
 
     }
 
-    int fd = open(q->fname, O_RDONLY);
+    int fd = open((char *)q->fname, O_RDONLY);
 
-    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", q->fname); }
+    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", (char *)q->fname); }
 
     ck_read(fd, q->testcase_buf, len, q->fname);
     close(fd);
@@ -1122,7 +1143,7 @@ inline void queue_testcase_retake_mem(afl_state_t *afl, struct queue_entry *q,
 
     if (likely(len != old_len)) {
 
-      u8 *ptr = realloc(q->testcase_buf, len);
+      u8 *ptr = (u8 *)realloc(q->testcase_buf, len);
 
       if (likely(ptr)) {
 
@@ -1154,23 +1175,23 @@ inline u8 *queue_testcase_get(afl_state_t *afl, struct queue_entry *q) {
 
     if (unlikely(q == afl->queue_cur)) {
 
-      buf = afl_realloc((void **)&afl->testcase_buf, len);
+      buf = (u8 *)afl_realloc((void **)&afl->testcase_buf, len);
 
     } else {
 
-      buf = afl_realloc((void **)&afl->splicecase_buf, len);
+      buf = (u8 *)afl_realloc((void **)&afl->splicecase_buf, len);
 
     }
 
     if (unlikely(!buf)) {
 
-      PFATAL("Unable to malloc '%s' with len %u", q->fname, len);
+      PFATAL("Unable to malloc '%s' with len %u", (char *)q->fname, len);
 
     }
 
-    int fd = open(q->fname, O_RDONLY);
+    int fd = open((char *)q->fname, O_RDONLY);
 
-    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", q->fname); }
+    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", (char *)q->fname); }
 
     ck_read(fd, buf, len, q->fname);
     close(fd);
@@ -1214,7 +1235,7 @@ inline u8 *queue_testcase_get(afl_state_t *afl, struct queue_entry *q) {
 
         do_once = 1;
         // release unneeded memory
-        afl->q_testcase_cache = ck_realloc(
+        afl->q_testcase_cache = (struct queue_entry **)ck_realloc(
             afl->q_testcase_cache,
             (afl->q_testcase_max_cache_entries + 1) * sizeof(size_t));
 
@@ -1261,15 +1282,15 @@ inline u8 *queue_testcase_get(afl_state_t *afl, struct queue_entry *q) {
 
     /* Map the test case into memory. */
 
-    int fd = open(q->fname, O_RDONLY);
+    int fd = open((char *)q->fname, O_RDONLY);
 
-    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", q->fname); }
+    if (unlikely(fd < 0)) { PFATAL("Unable to open '%s'", (char *)q->fname); }
 
-    q->testcase_buf = malloc(len);
+    q->testcase_buf = (u8 *)malloc(len);
 
     if (unlikely(!q->testcase_buf)) {
 
-      PFATAL("Unable to malloc '%s' with len %u", q->fname, len);
+      PFATAL("Unable to malloc '%s' with len %u", (char *)q->fname, len);
 
     }
 
@@ -1332,11 +1353,11 @@ inline void queue_testcase_store_mem(afl_state_t *afl, struct queue_entry *q,
 
   /* Map the test case into memory. */
 
-  q->testcase_buf = malloc(len);
+  q->testcase_buf = (u8 *)malloc(len);
 
   if (unlikely(!q->testcase_buf)) {
 
-    PFATAL("Unable to malloc '%s' with len %u", q->fname, len);
+    PFATAL("Unable to malloc '%s' with len %u", (char *)q->fname, len);
 
   }
 
-- 
cgit 1.4.1


From 668f5e1fa9c126bb8c751a6e4ef038ae60a442fa Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 15 Feb 2023 09:32:32 +0100
Subject: debug output

---
 custom_mutators/autotokens/Makefile       |  8 ++++++--
 custom_mutators/autotokens/autotokens.cpp | 17 ++++++++++++++++-
 docs/Changelog.md                         |  1 +
 docs/env_variables.md                     |  2 ++
 include/afl-fuzz.h                        |  2 +-
 include/envs.h                            |  1 +
 src/afl-fuzz-init.c                       |  2 +-
 src/afl-fuzz-one.c                        |  2 +-
 src/afl-fuzz-run.c                        |  2 +-
 src/afl-fuzz-state.c                      |  7 +++++++
 10 files changed, 37 insertions(+), 7 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/Makefile b/custom_mutators/autotokens/Makefile
index 6ee7d324..0daba17d 100644
--- a/custom_mutators/autotokens/Makefile
+++ b/custom_mutators/autotokens/Makefile
@@ -13,10 +13,14 @@ endif
 
 all:	autotokens.so
 
-autotokens.so:	autotokens.cpp
+afl-fuzz-queue.o:	../../src/afl-fuzz-queue.c
 	$(CC) -D_STANDALONE_MODULE=1 -I../../include -g -O3 $(CPPFLAGS) -fPIC -c -o ./afl-fuzz-queue.o ../../src/afl-fuzz-queue.c
+
+afl-common.o:	../../src/afl-common.c
 	$(CC) -I../../include -g -O3 $(CPPFLAGS) -DBIN_PATH=\"dummy\" -Wno-pointer-sign -fPIC -c -o ./afl-common.o ../../src/afl-common.c
+
+autotokens.so:	afl-fuzz-queue.o afl-common.o autotokens.cpp
 	$(CXX) -Wno-deprecated -g -O3 $(CXXFLAGS) $(CPPFLAGS) -shared -fPIC -o autotokens.so -I../../include autotokens.cpp  ./afl-fuzz-queue.o ../../src/afl-performance.o ./afl-common.o
 
 clean:
-	rm -f autotokens.so *~ core
+	rm -f autotokens.so *.o *~ core
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index cda90a38..043d9588 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -145,6 +145,9 @@ static void first_run(void *data) {
 
       if ((valid * 100) / afl_ptr->extras_cnt < 95) { module_disabled = 1; }
 
+      DEBUGF(stderr, "DICT: valid %u, total %u, %u < 95 == disable\n", valid,
+             afl_ptr->extras_cnt, (u32)((valid * 100) / afl_ptr->extras_cnt));
+
     } else {
 
       module_disabled = 1;
@@ -190,6 +193,10 @@ static void first_run(void *data) {
 
   if ((is_ascii * 100) / valid < 70) { module_disabled = 1; }
 
+  DEBUGF(stderr, "seeds: total %u, valid %u, ascii %u, %u < 70 == disabled\n",
+         afl_ptr->active_items, valid, is_ascii,
+         (u32)((is_ascii * 100) / valid));
+
 }
 
 static u32 good_whitespace_or_singleval() {
@@ -538,7 +545,15 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     is_first_run = 0;
     first_run(data);
 
-    if (module_disabled) { WARNF("Autotokens custom module is disabled."); }
+    if (module_disabled) {
+
+      WARNF("Autotokens custom module is disabled.");
+
+    } else if (auto_disable) {
+
+      OKF("Autotokens custom module is enabled.");
+
+    }
 
   }
 
diff --git a/docs/Changelog.md b/docs/Changelog.md
index 89c37912..5f253064 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -6,6 +6,7 @@
 ### Version ++4.06a (dev)
   - afl-fuzz:
     - ensure temporary file descriptor is closed when not used
+    - added `AFL_NO_WARN_INSTABILITY`
   - afl-cc:
     - add CFI sanitizer variant to gcc targets
     - llvm 16 support (thanks to @devnexen!)
diff --git a/docs/env_variables.md b/docs/env_variables.md
index 61fb1e2b..7a574e59 100644
--- a/docs/env_variables.md
+++ b/docs/env_variables.md
@@ -482,6 +482,8 @@ checks or alter some of the more exotic semantics of the tool:
   - Setting `AFL_NO_STARTUP_CALIBRATION` will skip the initial calibration
     of all starting seeds, and start fuzzing at once.
 
+  - Setting `AFL_NO_WARN_INSTABILITY` will suppress instability warnings.
+
   - In QEMU mode (-Q) and FRIDA mode (-O), `AFL_PATH` will be searched for
     afl-qemu-trace and afl-frida-trace.so.
 
diff --git a/include/afl-fuzz.h b/include/afl-fuzz.h
index 229bc025..9bf91faf 100644
--- a/include/afl-fuzz.h
+++ b/include/afl-fuzz.h
@@ -399,7 +399,7 @@ typedef struct afl_env_vars {
       afl_cycle_schedules, afl_expand_havoc, afl_statsd, afl_cmplog_only_new,
       afl_exit_on_seed_issues, afl_try_affinity, afl_ignore_problems,
       afl_keep_timeouts, afl_pizza_mode, afl_no_crash_readme,
-      afl_ignore_timeouts, afl_no_startup_calibration;
+      afl_ignore_timeouts, afl_no_startup_calibration, afl_no_warn_instability;
 
   u8 *afl_tmpdir, *afl_custom_mutator_library, *afl_python_module, *afl_path,
       *afl_hang_tmout, *afl_forksrv_init_tmout, *afl_preload,
diff --git a/include/envs.h b/include/envs.h
index 5018b0f8..56675eda 100644
--- a/include/envs.h
+++ b/include/envs.h
@@ -172,6 +172,7 @@ static char *afl_environment_variables[] = {
     "AFL_NO_UI",
     "AFL_NO_PYTHON",
     "AFL_NO_STARTUP_CALIBRATION",
+    "AFL_NO_WARN_INSTABILITY",
     "AFL_UNTRACER_FILE",
     "AFL_LLVM_USE_TRACE_PC",
     "AFL_MAP_SIZE",
diff --git a/src/afl-fuzz-init.c b/src/afl-fuzz-init.c
index 1182bd41..c20965b4 100644
--- a/src/afl-fuzz-init.c
+++ b/src/afl-fuzz-init.c
@@ -1120,7 +1120,7 @@ void perform_dry_run(afl_state_t *afl) {
 
     }
 
-    if (q->var_behavior) {
+    if (unlikely(q->var_behavior && !afl->afl_env.afl_no_warn_instability)) {
 
       WARNF("Instrumentation output varies across runs.");
 
diff --git a/src/afl-fuzz-one.c b/src/afl-fuzz-one.c
index 2f016217..e97db273 100644
--- a/src/afl-fuzz-one.c
+++ b/src/afl-fuzz-one.c
@@ -1988,7 +1988,7 @@ custom_mutator_stage:
 
           if (unlikely(!mutated_buf)) {
 
-            //FATAL("Error in custom_fuzz. Size returned: %zu", mutated_size);
+            // FATAL("Error in custom_fuzz. Size returned: %zu", mutated_size);
             break;
 
           }
diff --git a/src/afl-fuzz-run.c b/src/afl-fuzz-run.c
index 7dd83150..f5425011 100644
--- a/src/afl-fuzz-run.c
+++ b/src/afl-fuzz-run.c
@@ -523,7 +523,7 @@ u8 calibrate_case(afl_state_t *afl, struct queue_entry *q, u8 *use_mem,
 
         }
 
-        if (unlikely(!var_detected)) {
+        if (unlikely(!var_detected && !afl->afl_env.afl_no_warn_instability)) {
 
           // note: from_queue seems to only be set during initialization
           if (afl->afl_env.afl_no_ui || from_queue) {
diff --git a/src/afl-fuzz-state.c b/src/afl-fuzz-state.c
index 104b1e4b..6d8c8758 100644
--- a/src/afl-fuzz-state.c
+++ b/src/afl-fuzz-state.c
@@ -204,6 +204,13 @@ void read_afl_environment(afl_state_t *afl, char **envp) {
             afl->afl_env.afl_no_affinity =
                 get_afl_env(afl_environment_variables[i]) ? 1 : 0;
 
+          } else if (!strncmp(env, "AFL_NO_WARN_INSTABILITY",
+
+                              afl_environment_variable_len)) {
+
+            afl->afl_env.afl_no_warn_instability =
+                get_afl_env(afl_environment_variables[i]) ? 1 : 0;
+
           } else if (!strncmp(env, "AFL_TRY_AFFINITY",
 
                               afl_environment_variable_len)) {
-- 
cgit 1.4.1


From 2090f17a9bb9cc225c1d24e8b21ed0c993a2665f Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 15 Feb 2023 11:23:42 +0100
Subject: opt

---
 custom_mutators/autotokens/autotokens.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 043d9588..a2b2814f 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -143,9 +143,9 @@ static void first_run(void *data) {
 
       }
 
-      if ((valid * 100) / afl_ptr->extras_cnt < 95) { module_disabled = 1; }
+      if ((valid * 100) / afl_ptr->extras_cnt <= 70) { module_disabled = 1; }
 
-      DEBUGF(stderr, "DICT: valid %u, total %u, %u < 95 == disable\n", valid,
+      DEBUGF(stderr, "DICT: valid %u, total %u, %u <= 70 == disable\n", valid,
              afl_ptr->extras_cnt, (u32)((valid * 100) / afl_ptr->extras_cnt));
 
     } else {
@@ -191,9 +191,9 @@ static void first_run(void *data) {
 
   }
 
-  if ((is_ascii * 100) / valid < 70) { module_disabled = 1; }
+  if ((is_ascii * 100) / valid <= 70) { module_disabled = 1; }
 
-  DEBUGF(stderr, "seeds: total %u, valid %u, ascii %u, %u < 70 == disabled\n",
+  DEBUGF(stderr, "seeds: total %u, valid %u, ascii %u, %u <= 70 == disabled\n",
          afl_ptr->active_items, valid, is_ascii,
          (u32)((is_ascii * 100) / valid));
 
-- 
cgit 1.4.1


From 04356ecbbe2c6cb72d279081702a6044fcc3ae92 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 15 Feb 2023 11:28:43 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index a2b2814f..b1f1542e 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -145,8 +145,9 @@ static void first_run(void *data) {
 
       if ((valid * 100) / afl_ptr->extras_cnt <= 70) { module_disabled = 1; }
 
-      DEBUGF(stderr, "DICT: valid %u, total %u, %u <= 70 == disable\n", valid,
-             afl_ptr->extras_cnt, (u32)((valid * 100) / afl_ptr->extras_cnt));
+      DEBUGF(stderr, "DICT: total %u, valid %u, %u <= 70 == disable\n",
+             afl_ptr->extras_cnt, valid,
+             (u32)((valid * 100) / afl_ptr->extras_cnt));
 
     } else {
 
-- 
cgit 1.4.1


From ae94499503596d1e7f45e1a93bc5f7148c6163b6 Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 15 Feb 2023 11:48:49 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index b1f1542e..e6b9931d 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -1078,6 +1078,8 @@ extern "C" void afl_custom_deinit(my_mutator_t *data) {
   /* we use this to print statistics at exit :-)
      needs to be stderr as stdout is filtered */
 
+  if (module_disabled) { return; }
+
   fprintf(stderr,
           "\n\nAutotoken mutator statistics:\n"
           "  Number of all seen tokens:  %u\n"
-- 
cgit 1.4.1


From 7f2bafbb8b709720cd3703789071c08064e518bd Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Wed, 15 Feb 2023 11:54:39 +0100
Subject: remove some debug

---
 custom_mutators/autotokens/autotokens.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index e6b9931d..22c78a60 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -287,7 +287,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
             ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
              (whitespace_ids >= new_item && whitespace_ids < cur_item))));
 
-        DEBUGF(stderr, "MUT: %u -> %u\n", cur_item, new_item);
+        // DEBUGF(stderr, "MUT: %u -> %u\n", cur_item, new_item);
         m[pos] = new_item;
         break;
 
@@ -305,7 +305,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
         u32 pos = rand_below(afl_ptr, m_size + 1);
         m.insert(m.begin() + pos, new_item);
         ++m_size;
-        DEBUGF(stderr, "INS: %u at %u\n", new_item, pos);
+        // DEBUGF(stderr, "INS: %u at %u\n", new_item, pos);
 
         break;
 
@@ -334,7 +334,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
             m.insert(m.begin() + dst_off, src->begin() + src_off,
                      src->begin() + src_off + n);
             m_size += n;
-            DEBUGF(stderr, "SPLICE-INS: %u at %u\n", n, dst_off);
+            // DEBUGF(stderr, "SPLICE-INS: %u at %u\n", n, dst_off);
 
             break;
 
@@ -354,7 +354,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
             copy(src->begin() + src_off, src->begin() + src_off + n,
                  m.begin() + dst_off);
 
-            DEBUGF(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
+            // DEBUGF(stderr, "SPLICE-MUT: %u at %u\n", n, dst_off);
             break;
 
           }
@@ -432,6 +432,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   if (unlikely(mutated_size > max_size)) { mutated_size = max_size; }
 
+  /*
   IFDEBUG {
 
     DEBUGF(stderr, "MUTATED to %u bytes:\n", mutated_size);
@@ -440,6 +441,8 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
 
   }
 
+  */
+
   *out_buf = mutated_out;
   ++fuzz_count;
   return mutated_size;
@@ -633,7 +636,6 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
       }
 
       ++a_extras_cnt;
-      DEBUGF(stderr, "Added from auto dictionary: \"%s\"\n", ptr);
 
     }
 
@@ -751,8 +753,10 @@ extern "C" unsigned char afl_custom_queue_get(void                *data,
     u32  tabs = count(input.begin(), input.end(), '\t');
     u32  linefeeds = count(input.begin(), input.end(), '\n');
     bool ends_with_linefeed = input[input.length() - 1] == '\n';
+
     DEBUGF(stderr, "spaces=%u tabs=%u linefeeds=%u ends=%u\n", spaces, tabs,
            linefeeds, ends_with_linefeed);
+
     all_spaces += spaces;
     all_tabs += tabs;
     all_lf += linefeeds;
-- 
cgit 1.4.1


From 1faf6f67313e726c645ac3b9ecd2d8b5e65f605a Mon Sep 17 00:00:00 2001
From: vanhauser-thc <vh@thc.org>
Date: Thu, 16 Feb 2023 07:47:36 +0100
Subject: fix

---
 custom_mutators/autotokens/autotokens.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'custom_mutators/autotokens/autotokens.cpp')

diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 22c78a60..8135aba1 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -401,25 +401,28 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
   /* Now we create the output */
 
   output = "";
-  u32 prev_size = 0;
+  u32 prev_size = 1, was_whitespace = 1;
 
   for (i = 0; i < m_size; ++i) {
 
     if (likely(i + 1 < m_size)) {
 
       u32 this_size = id_to_token[m[i]].size();
+      u32 is_whitespace = m[i] < whitespace_ids;
 
       /* The output we are generating might need repairing.
          General rule: two items that have a size larger than 1 are strings
          or identifiers and need a whitespace or an item of length 1 in
          between. */
-      if (unlikely(prev_size > 1 && this_size > 1)) {
+      if (unlikely(!(prev_size == 1 || was_whitespace || this_size == 1 ||
+                     is_whitespace))) {
 
         output += id_to_token[good_whitespace_or_singleval()];
 
       }
 
       prev_size = this_size;
+      was_whitespace = is_whitespace;
 
     }
 
-- 
cgit 1.4.1