author     vanhauser-thc <vh@thc.org>  2023-01-18 11:46:28 +0100
committer  vanhauser-thc <vh@thc.org>  2023-01-18 11:46:28 +0100
commit     a41fd5cc5c4a5073f38adf06270e2985c88da9d5 (patch)
tree       812f6065841e28ca36c51a7b81fae3e1adb016fd
parent     efe57c936880608a2de452340d63f262470d9fcd (diff)
download   afl++-a41fd5cc5c4a5073f38adf06270e2985c88da9d5.tar.gz
alternate tokenize, options
-rw-r--r--  custom_mutators/autotokens/README          |   9
-rw-r--r--  custom_mutators/autotokens/autotokens.cpp  | 432
2 files changed, 365 insertions(+), 76 deletions(-)
diff --git a/custom_mutators/autotokens/README b/custom_mutators/autotokens/README
index 0dcc6a3e..f6e9c753 100644
--- a/custom_mutators/autotokens/README
+++ b/custom_mutators/autotokens/README
@@ -11,3 +11,12 @@ If you have a dictionary (`-x`) this improves this custom grammar mutator.
If **not** running with `CMPLOG`, it is possible to set
`AFL_CUSTOM_MUTATOR_ONLY` to concentrate on grammar bug classes.
+## Configuration via environment variables
+
+`AUTOTOKENS_ONLY_FAV` - apply this mutator only to favored queue items
+`AUTOTOKENS_COMMENT` - the character or string that starts a comment, which
+  will be stripped from the input. Default: `/* ... */`
+`AUTOTOKENS_ALTERNATIVE_TOKENIZE` - use an alternative tokenize implementation
+  (experimental)
+`AUTOTOKENS_WHITESPACE` - the whitespace string to place between tokens with
+  `AUTOTOKENS_ALTERNATIVE_TOKENIZE`. Default: `" "`
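For orientation, a minimal sketch of how these variables are consumed at
startup (the helper name `read_autotokens_env` is illustrative; in the actual
code the parsing happens inline in `afl_custom_init()`, shown at the end of
this diff):

```cpp
#include <cstdlib>
#include <string>

static int         only_fav             = 0;    // AUTOTOKENS_ONLY_FAV
static int         alternative_tokenize = 0;    // AUTOTOKENS_ALTERNATIVE_TOKENIZE
static std::string whitespace           = " ";  // AUTOTOKENS_WHITESPACE

// Read the optional environment variables, falling back to the
// compile-time defaults above when they are unset.
static void read_autotokens_env(void) {

  if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
  if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
  if (getenv("AUTOTOKENS_WHITESPACE")) { whitespace = getenv("AUTOTOKENS_WHITESPACE"); }

}
```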
diff --git a/custom_mutators/autotokens/autotokens.cpp b/custom_mutators/autotokens/autotokens.cpp
index 5580512a..28ef91e2 100644
--- a/custom_mutators/autotokens/autotokens.cpp
+++ b/custom_mutators/autotokens/autotokens.cpp
@@ -15,7 +15,10 @@ extern "C" {
#include <regex>
#define AUTOTOKENS_DEBUG 0
+#define AUTOTOKENS_ONLY_FAV 0
+#define AUTOTOKENS_ALTERNATIVE_TOKENIZE 0
#define AUTOTOKENS_CHANGE_MIN 8
+#define AUTOTOKENS_WHITESPACE " "
using namespace std;
@@ -30,6 +33,8 @@ typedef struct my_mutator {
static afl_state *afl_ptr;
static int debug = AUTOTOKENS_DEBUG;
+static int only_fav = AUTOTOKENS_ONLY_FAV;
+static int alternative_tokenize = AUTOTOKENS_ALTERNATIVE_TOKENIZE;
static u32 current_id;
static u32 valid_structures;
static u32 whitespace_ids;
@@ -39,9 +44,12 @@ static u64 all_structure_items;
static unordered_map<string, vector<u32> *> file_mapping;
static unordered_map<string, u32> token_to_id;
static unordered_map<u32, string> id_to_token;
-// static regex regex_comment_slash("(//.*)([\r\n]?)", regex::optimize);
+static string whitespace = AUTOTOKENS_WHITESPACE;
+static regex *regex_comment_custom;
static regex regex_comment_star("/\\*([:print:]|\n)*?\\*/",
regex::multiline | regex::optimize);
+static regex regex_word("[A-Za-z0-9_$]+", regex::optimize);
+static regex regex_whitespace(R"([ \t]+)", regex::optimize);
static regex regex_string("\"[[:print:]]*?\"|'[[:print:]]*?'", regex::optimize);
static vector<u32> *s; // the structure of the currently selected input
@@ -84,15 +92,15 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
afl_ptr->havoc_div / 256));
// DEBUG(stderr, "structure size: %lu, rounds: %u \n", m.size(), rounds);
- u32 max_rand = 4;
+ u32 max_rand = 7;
for (i = 0; i < rounds; ++i) {
switch (rand_below(afl_ptr, max_rand)) {
/* CHANGE */
- case 0: /* fall through */
- case 1: {
+ case 0 ... 3: /* fall through */
+ {
u32 pos = rand_below(afl_ptr, m_size);
u32 cur_item = m[pos], new_item;
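The switch weighting changes here: `max_rand` goes from 4 to 7, and the
GCC/Clang case-range extension (`case 0 ... 3:`, not standard C++) gives
CHANGE four of the seven slots. A condensed sketch of the resulting
distribution, using `rand_below` and `afl_ptr` from the surrounding code:

```cpp
switch (rand_below(afl_ptr, max_rand)) {  // max_rand == 7

  case 0 ... 3:  /* CHANGE: 4/7 of all rounds */
    break;
  case 4 ... 5:  /* INSERT: 2/7 of all rounds */
    break;
  case 6:        /* ERASE: 1/7, and only while the structure holds more
                    than 8 items; otherwise max_rand is dropped to 6 and
                    this case becomes unreachable */
    break;

}
```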
@@ -103,8 +111,9 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
} while (unlikely(
new_item == cur_item ||
- (whitespace_ids < new_item && whitespace_ids >= cur_item) ||
- (whitespace_ids >= new_item && whitespace_ids < cur_item)));
+ (!alternative_tokenize &&
+ ((whitespace_ids < new_item && whitespace_ids >= cur_item) ||
+ (whitespace_ids >= new_item && whitespace_ids < cur_item)))));
DEBUG(stderr, "MUT: %u -> %u\n", cur_item, new_item);
m[pos] = new_item;
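The retry loop above keeps CHANGE from crossing the token-class boundary: ids
up to `whitespace_ids` are whitespace tokens, everything above is a regular
token, and a replacement is redrawn until both ids fall on the same side.
With the alternative tokenizer the check is skipped, since whitespace is
synthesized at output time anyway. A hypothetical predicate (not in the
patch) expressing the invariant:

```cpp
// True when both token ids are in the same class, i.e. both whitespace
// (<= whitespace_ids) or both regular tokens.
static bool same_token_class(u32 a, u32 b) {

  return (a <= whitespace_ids) == (b <= whitespace_ids);

}
```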
@@ -113,7 +122,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
}
/* INSERT (m_size +1 so we insert also after last place) */
- case 2: {
+ case 4 ... 5: {
u32 new_item;
do {
@@ -126,26 +135,30 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
m.insert(m.begin() + pos, new_item);
++m_size;
- // if we insert an identifier or string we might need whitespace
- if (id_to_token[new_item].size() > 1) {
+ if (likely(!alternative_tokenize)) {
- // need to insert before?
+ // if we insert an identifier or string we might need whitespace
+ if (id_to_token[new_item].size() > 1) {
- if (pos && m[pos - 1] >= whitespace_ids &&
- id_to_token[m[pos - 1]].size() > 1) {
+ // need to insert before?
- m.insert(m.begin() + pos, good_whitespace_or_singleval());
- ++m_size;
+ if (pos && m[pos - 1] >= whitespace_ids &&
+ id_to_token[m[pos - 1]].size() > 1) {
- }
+ m.insert(m.begin() + pos, good_whitespace_or_singleval());
+ ++m_size;
+
+ }
+
+ if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
+ id_to_token[m[pos + 1]].size() > 1) {
- if (pos + 1 < m_size && m[pos + 1] >= whitespace_ids &&
- id_to_token[m[pos + 1]].size() > 1) {
+ // need to insert after?
- // need to insert after?
+ m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
+ ++m_size;
- m.insert(m.begin() + pos + 1, good_whitespace_or_singleval());
- ++m_size;
+ }
}
@@ -156,7 +169,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
}
/* ERASE - only if large enough */
- case 3: {
+ case 6: {
if (m_size > 8) {
@@ -165,7 +178,7 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
} else {
- max_rand = 3;
+ max_rand = 6;
}
@@ -180,10 +193,16 @@ extern "C" size_t afl_custom_fuzz(my_mutator_t *data, u8 *buf, size_t buf_size,
}
string output;
+ u32 m_size_1 = m_size - 1;
for (i = 0; i < m_size; ++i) {
output += id_to_token[m[i]];
+ if (unlikely(alternative_tokenize && i < m_size_1)) {
+
+ output += whitespace;
+
+ }
}
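With the alternative tokenizer the stored structure contains no whitespace
tokens, so the output is rebuilt by joining tokens with the configured
separator. Condensed from the loop above:

```cpp
// Sketch: join token ids into the fuzz candidate. In alternative-tokenize
// mode the separator (AUTOTOKENS_WHITESPACE, default " ") goes between
// every pair of tokens instead of living in the structure itself.
std::string output;
for (u32 i = 0; i < m_size; ++i) {

  output += id_to_token[m[i]];
  if (alternative_tokenize && i < m_size - 1) { output += whitespace; }

}
```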
@@ -219,7 +238,8 @@ extern "C" unsigned char afl_custom_queue_get(void *data,
if (likely(!debug)) {
- if (afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) {
+ if ((afl_ptr->shm.cmplog_mode && !afl_ptr->queue_cur->is_ascii) ||
+ (only_fav && !afl_ptr->queue_cur->favored)) {
s = NULL;
return 0;
@@ -353,8 +373,15 @@ extern "C" unsigned char afl_custom_queue_get(void *data,
// DEBUG(stderr, "Read %lu bytes for %s\nBefore comment trim:\n%s\n",
// input.size(), filename, input.c_str());
- // input = regex_replace(input, regex_comment_slash, "$2");
- input = regex_replace(input, regex_comment_star, "");
+ if (regex_comment_custom) {
+
+ input = regex_replace(input, *regex_comment_custom, "$2");
+
+ } else {
+
+ input = regex_replace(input, regex_comment_star, "");
+
+ }
DEBUG(stderr, "After replace %lu bytes for %s\n%s\n", input.size(),
filename, input.c_str());
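Comment stripping now has two paths: if `AUTOTOKENS_COMMENT` is set, a
line-comment pattern `(<prefix>.*)([\r\n]?)` built in `afl_custom_init()` is
replaced with `$2`, dropping the comment but keeping the line ending;
otherwise the built-in regex removes `/* ... */` block comments. A
self-contained sketch of the custom path (the function name is illustrative;
note the prefix is spliced into the pattern unescaped, exactly like the
`snprintf()` at init time):

```cpp
#include <regex>
#include <string>

// Strip line comments that start with `prefix` (AUTOTOKENS_COMMENT),
// keeping the trailing newline via the $2 backreference.
static std::string strip_line_comments(const std::string &input,
                                       const std::string &prefix) {

  std::regex re("(" + prefix + ".*)([\r\n]?)", std::regex::optimize);
  return std::regex_replace(input, re, "$2");

}
```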
@@ -377,53 +404,105 @@ extern "C" unsigned char afl_custom_queue_get(void *data,
DEBUG(stderr, "START!\n");
- while (regex_search(cur, ende, match, regex_string,
- regex_constants::match_any |
- regex_constants::match_not_null |
- regex_constants::match_continuous)) {
+ if (likely(!alternative_tokenize)) {
+
+ while (regex_search(cur, ende, match, regex_string,
+ regex_constants::match_any |
+ regex_constants::match_not_null |
+ regex_constants::match_continuous)) {
+
+ prev = cur;
+ found = match[0].first;
+ cur = match[0].second;
+ DEBUG(stderr,
+ "string %s found at start %lu offset %lu continue at %lu\n",
+ match[0].str().c_str(), prev - input.begin(), match.position(),
+ cur - input.begin());
+
+ if (prev < found) { // there are items between search start and find
+ while (prev < found) {
- prev = cur;
- found = match[0].first;
- cur = match[0].second;
- DEBUG(stderr, "string %s found at start %lu offset %lu continue at %lu\n",
- match[0].str().c_str(), prev - input.begin(), match.position(),
- cur - input.begin());
+ if (isspace(*prev)) {
- if (prev < found) { // there are items between search start and find
- while (prev < found) {
+ auto start = prev;
+ while (isspace(*prev)) {
- if (isspace(*prev)) {
+ ++prev;
- auto start = prev;
- while (isspace(*prev)) {
+ }
+ tokens.push_back(std::string(start, prev));
+ DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+ tokens[tokens.size() - 1].c_str());
+
+ } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
+
+ auto start = prev;
+ while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
+ *prev == '.' || *prev == '/') {
+
+ ++prev;
+
+ }
+
+ tokens.push_back(std::string(start, prev));
+ DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+ tokens[tokens.size() - 1].c_str());
+
+ } else {
+
+ tokens.push_back(std::string(prev, prev + 1));
+ DEBUG(stderr, "OTHER \"%c\"\n", *prev);
++prev;
}
- tokens.push_back(std::string(start, prev));
- DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", prev - start,
+ }
+
+ }
+
+ if (match[0].length() > 0) { tokens.push_back(match[0]); }
+
+ }
+
+ DEBUG(stderr, "AFTER all strings\n");
+
+ if (cur < ende) {
+
+ while (cur < ende) {
+
+ if (isspace(*cur)) {
+
+ auto start = cur;
+ while (isspace(*cur)) {
+
+ ++cur;
+
+ }
+
+ tokens.push_back(std::string(start, cur));
+ DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
tokens[tokens.size() - 1].c_str());
- } else if (isalnum(*prev) || *prev == '$' || *prev == '_') {
+ } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
- auto start = prev;
- while (isalnum(*prev) || *prev == '$' || *prev == '_' ||
- *prev == '.' || *prev == '/') {
+ auto start = cur;
+ while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
+ *cur == '/') {
- ++prev;
+ ++cur;
}
- tokens.push_back(std::string(start, prev));
- DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", prev - start,
+ tokens.push_back(std::string(start, cur));
+ DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
tokens[tokens.size() - 1].c_str());
} else {
- tokens.push_back(std::string(prev, prev + 1));
- DEBUG(stderr, "OTHER \"%c\"\n", *prev);
- ++prev;
+ tokens.push_back(std::string(cur, cur + 1));
+ DEBUG(stderr, "OTHER \"%c\"\n", *cur);
+ ++cur;
}
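This is the default tokenizer: string literals are found with
`regex_string`, and the gaps between (and after) the matches are scanned by
hand into whitespace runs, identifier-like runs (`[A-Za-z0-9_$]`, plus `.`
and `/` once inside a run), and single "other" characters. The same scan
appears twice above, once for the gap before each string match and once for
the tail, so it condenses to one helper (the name `scan_gap` is illustrative;
explicit `p < end` checks are added here for the end-of-input case):

```cpp
#include <cctype>
#include <string>
#include <vector>

static void scan_gap(std::string::const_iterator p,
                     std::string::const_iterator end,
                     std::vector<std::string> &tokens) {

  while (p < end) {

    if (isspace(*p)) {

      auto start = p;
      while (p < end && isspace(*p)) { ++p; }
      tokens.emplace_back(start, p);  // WHITESPACE run

    } else if (isalnum(*p) || *p == '$' || *p == '_') {

      auto start = p;
      while (p < end && (isalnum(*p) || *p == '$' || *p == '_' ||
                         *p == '.' || *p == '/')) { ++p; }
      tokens.emplace_back(start, p);  // IDENTIFIER-like run

    } else {

      tokens.emplace_back(p, p + 1);  // OTHER: single character
      ++p;

    }

  }

}
```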
@@ -431,48 +510,227 @@ extern "C" unsigned char afl_custom_queue_get(void *data,
}
- if (match[0].length() > 0) { tokens.push_back(match[0]); }
+ } else {
- }
+ // alternative tokenize
- DEBUG(stderr, "AFTER all strings\n");
+ while (regex_search(cur, ende, match, regex_string)) {
- if (cur < ende) {
+ prev = cur;
+ found = match[0].first;
+ cur = match[0].second;
+ DEBUG(stderr,
+ "string %s found at start %lu offset %lu continue at %lu\n",
+ match[0].str().c_str(), prev - input.begin(), match.position(),
+ cur - input.begin());
+ if (prev < found) { // there are items between search start and find
+ sregex_token_iterator it{prev, found, regex_whitespace, -1};
+ vector<std::string> tokenized{it, {}};
+ tokenized.erase(std::remove_if(tokenized.begin(), tokenized.end(),
+ [](std::string const &s) {
- while (cur < ende) {
+ return s.size() == 0;
- if (isspace(*cur)) {
+ }),
- auto start = cur;
- while (isspace(*cur)) {
+ tokenized.end());
+ tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
- ++cur;
+ if (unlikely(debug)) {
+
+ DEBUG(stderr, "tokens: %lu input size: %lu\n", tokenized.size(),
+ input.size());
+ for (auto x : tokenized) {
+
+ cerr << x << endl;
+
+ }
}
- tokens.push_back(std::string(start, cur));
- DEBUG(stderr, "WHITESPACE %ld \"%s\"\n", cur - start,
- tokens[tokens.size() - 1].c_str());
+ for (auto token : tokenized) {
- } else if (isalnum(*cur) || *cur == '$' || *cur == '_') {
+ string::const_iterator c = token.begin(), e = token.end(), f, p;
+ smatch m;
- auto start = cur;
- while (isalnum(*cur) || *cur == '$' || *cur == '_' || *cur == '.' ||
- *cur == '/') {
+ while (regex_search(c, e, m, regex_word)) {
- ++cur;
+ p = c;
+ f = m[0].first;
+ c = m[0].second;
+ if (p < f) {
+
+ // there are items between search start and find
+ while (p < f) {
+
+ if (unlikely(debug)) {
+
+ string foo(p, p + 1);
+ DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(p, p + 1));
+ ++p;
+
+ }
+
+ /*
+ string foo(p, f);
+ DEBUG(stderr, "before string: \"%s\"\n",
+ foo.c_str()); tokens.push_back(std::string(p, f));
+ */
+
+ }
+
+ DEBUG(
+ stderr,
+ "SUBstring \"%s\" found at start %lu offset %lu continue at "
+ "%lu\n",
+ m[0].str().c_str(), p - input.begin(), m.position(),
+ c - token.begin());
+ tokens.push_back(m[0].str());
+
+ }
+
+ if (c < e) {
+
+ while (c < e) {
+
+ if (unlikely(debug)) {
+
+ string foo(c, c + 1);
+ DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(c, c + 1));
+ ++c;
+
+ }
+
+ /*
+ if (unlikely(debug)) {
+
+ string foo(c, e);
+ DEBUG(stderr, "after string: \"%s\"\n",
+ foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(c, e));
+ */
+
+ }
}
- tokens.push_back(std::string(start, cur));
- DEBUG(stderr, "IDENTIFIER %ld \"%s\"\n", cur - start,
- tokens[tokens.size() - 1].c_str());
+ }
+
+ if (match[0].length() > 0) { tokens.push_back(match[0]); }
- } else {
+ }
+
+ if (cur < ende) {
+
+ sregex_token_iterator it{cur, ende, regex_whitespace, -1};
+ vector<std::string> tokenized{it, {}};
+ tokenized.erase(
+ std::remove_if(tokenized.begin(), tokenized.end(),
+ [](std::string const &s) { return s.size() == 0; }),
+ tokenized.end());
+ tokens.reserve(tokens.size() + tokenized.size() * 2 + 1);
+
+ if (unlikely(debug)) {
+
+ DEBUG(stderr, "tokens: %lu input size: %lu\n", tokenized.size(),
+ input.size());
+ for (auto x : tokenized) {
+
+ cerr << x << endl;
- tokens.push_back(std::string(cur, cur + 1));
- DEBUG(stderr, "OTHER \"%c\"\n", *cur);
- ++cur;
+ }
+
+ }
+
+ for (auto token : tokenized) {
+
+ string::const_iterator c = token.begin(), e = token.end(), f, p;
+ smatch m;
+
+ while (regex_search(c, e, m, regex_word)) {
+
+ p = c;
+ f = m[0].first;
+ c = m[0].second;
+ if (p < f) {
+
+ // there are items between search start and find
+ while (p < f) {
+
+ if (unlikely(debug)) {
+
+ string foo(p, p + 1);
+ DEBUG(stderr, "before string: \"%s\"\n", foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(p, p + 1));
+ ++p;
+
+ }
+
+ /*
+ if (unlikely(debug)) {
+
+ string foo(p, f);
+ DEBUG(stderr, "before string: \"%s\"\n",
+ foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(p, f));
+ */
+
+ }
+
+ DEBUG(stderr,
+ "SUB2string \"%s\" found at start %lu offset %lu continue at "
+ "%lu\n",
+ m[0].str().c_str(), p - input.begin(), m.position(),
+ c - token.begin());
+ tokens.push_back(m[0].str());
+
+ }
+
+ if (c < e) {
+
+ while (c < e) {
+
+ if (unlikely(debug)) {
+
+ string foo(c, c + 1);
+ DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(c, c + 1));
+ ++c;
+
+ }
+
+ /*
+ if (unlikely(debug)) {
+
+ string foo(c, e);
+ DEBUG(stderr, "after string: \"%s\"\n", foo.c_str());
+
+ }
+
+ tokens.push_back(std::string(c, e));
+ */
+
+ }
}
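The alternative tokenizer takes a different route: each region is first split
on `[ \t]+` with an `sregex_token_iterator` (submatch index -1 yields the
parts *between* the matches), empty fragments are dropped, and each remaining
chunk is decomposed into `regex_word` matches plus one token per leftover
character. A condensed sketch of the per-chunk step (the helper name
`tokenize_chunk` is illustrative; `regex_word` is the global defined earlier
in this diff):

```cpp
#include <regex>
#include <string>
#include <vector>

// Split one whitespace-free chunk into word tokens ([A-Za-z0-9_$]+) and
// single-character tokens for everything in between and after.
static void tokenize_chunk(const std::string &chunk,
                           std::vector<std::string> &tokens) {

  auto        c = chunk.begin(), e = chunk.end();
  std::smatch m;

  while (std::regex_search(c, e, m, regex_word)) {

    for (auto p = c; p < m[0].first; ++p)
      tokens.emplace_back(p, p + 1);  // punctuation before the word

    tokens.push_back(m[0].str());     // the word itself
    c = m[0].second;

  }

  for (; c < e; ++c)
    tokens.emplace_back(c, c + 1);    // trailing characters

}
```

Called once per chunk after the whitespace split, mirroring the code above:

```cpp
std::sregex_token_iterator it{cur, ende, regex_whitespace, -1};
for (const std::string &chunk : std::vector<std::string>{it, {}})
  if (!chunk.empty()) tokenize_chunk(chunk, tokens);
```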
@@ -483,9 +741,15 @@ extern "C" unsigned char afl_custom_queue_get(void *data,
if (unlikely(debug)) {
DEBUG(stderr, "DUMPING TOKENS:\n");
+ u32 size_1 = tokens.size() - 1;
for (u32 i = 0; i < tokens.size(); ++i) {
DEBUG(stderr, "%s", tokens[i].c_str());
+ if (unlikely(alternative_tokenize && i < size_1)) {
+
+ DEBUG(stderr, "%s", whitespace.c_str());
+
+ }
}
@@ -556,6 +820,22 @@ extern "C" my_mutator_t *afl_custom_init(afl_state *afl, unsigned int seed) {
}
+ if (getenv("AUTOTOKENS_ONLY_FAV")) { only_fav = 1; }
+ if (getenv("AUTOTOKENS_ALTERNATIVE_TOKENIZE")) { alternative_tokenize = 1; }
+ if (getenv("AUTOTOKENS_WHITESPACE")) {
+
+ whitespace = getenv("AUTOTOKENS_WHITESPACE");
+
+ }
+
+ if (getenv("AUTOTOKENS_COMMENT")) {
+
+ char buf[256];
+ snprintf(buf, sizeof(buf), "(%s.*)([\r\n]?)", getenv("AUTOTOKENS_COMMENT"));
+ regex_comment_custom = new regex(buf, regex::optimize);
+
+ }
+
data->afl = afl_ptr = afl;
// set common whitespace tokens
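One caveat with the `AUTOTOKENS_COMMENT` handling above: the value is spliced
into the pattern via `snprintf()` without escaping, so a prefix containing
regex metacharacters (say `**`) changes the pattern's meaning or fails to
compile. A hedged sketch of an escaping variant, should that matter (not part
of this commit):

```cpp
#include <regex>
#include <string>

// Build the line-comment regex with the user-supplied prefix escaped, so
// AUTOTOKENS_COMMENT is always matched literally. $& re-inserts the whole
// match in ECMAScript replacement syntax.
static std::regex *build_comment_regex(const char *prefix) {

  static const std::regex metachars(R"([.^$|()\[\]{}*+?\\])");
  std::string             escaped =
      std::regex_replace(std::string(prefix), metachars, R"(\$&)");
  return new std::regex("(" + escaped + ".*)([\r\n]?)", std::regex::optimize);

}
```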