Diffstat (limited to 'libtokencap')
 -rw-r--r--  libtokencap/Makefile           38
 -rw-r--r--  libtokencap/README.tokencap    61
 -rw-r--r--  libtokencap/libtokencap.so.c  305
 3 files changed, 404 insertions(+), 0 deletions(-)
diff --git a/libtokencap/Makefile b/libtokencap/Makefile
new file mode 100644
index 00000000..a464f76d
--- /dev/null
+++ b/libtokencap/Makefile
@@ -0,0 +1,38 @@
+#
+# american fuzzy lop - libtokencap
+# --------------------------------
+#
+# Written by Michal Zalewski <lcamtuf@google.com>
+#
+# Copyright 2016 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+
+PREFIX      ?= /usr/local
+HELPER_PATH  = $(PREFIX)/lib/afl
+
+VERSION     = $(shell grep '^\#define VERSION ' ../config.h | cut -d '"' -f2)
+
+CFLAGS      ?= -O3 -funroll-loops
+CFLAGS      += -Wall -D_FORTIFY_SOURCE=2 -g -Wno-pointer-sign
+
+all: libtokencap.so
+
+libtokencap.so: libtokencap.so.c ../config.h
+	$(CC) $(CFLAGS) -shared -fPIC $< -o $@ $(LDFLAGS)
+
+.NOTPARALLEL: clean
+
+clean:
+	rm -f *.o *.so *~ a.out core core.[1-9][0-9]*
+	rm -f libtokencap.so
+
+install: all
+	install -m 755 libtokencap.so $${DESTDIR}$(HELPER_PATH)
+	install -m 644 README.tokencap $${DESTDIR}$(HELPER_PATH)
+
diff --git a/libtokencap/README.tokencap b/libtokencap/README.tokencap
new file mode 100644
index 00000000..650739f2
--- /dev/null
+++ b/libtokencap/README.tokencap
@@ -0,0 +1,61 @@
+=========================================
+strcmp() / memcmp() token capture library
+=========================================
+
+  (See ../docs/README for the general instruction manual.)
+
+This Linux-only companion library allows you to instrument strcmp(), memcmp(),
+and related functions to automatically extract syntax tokens passed to any of
+these libcalls. The resulting list of tokens may then be given to afl-fuzz as
+a starting dictionary (via the -x option) to improve coverage on subsequent
+fuzzing runs.
+
+This may help improve coverage in some targets, and do precisely nothing in
+others. In some cases, it may even make things worse: if libtokencap picks up
+syntax tokens that are not used to process the input data, but that are a part
+of - say - parsing a config file... well, you're going to end up wasting a lot
+of CPU time on trying them out in the input stream. In other words, use this
+feature with care. Manually screening the resulting dictionary is almost
+always a necessity.
+
+As for the actual operation: the library stores tokens, without any deduping,
+by appending them to a file specified via AFL_TOKEN_FILE. If the variable is not
+set, the tool uses stderr (which is probably not what you want).
+
+Similarly to afl-tmin, the library is not "proprietary" and can be used with
+other fuzzers or testing tools without the need for any code tweaks. It does not
+require AFL-instrumented binaries to work.
+
+To use the library, you *need* to make sure that your fuzzing target is compiled
+with -fno-builtin and is linked dynamically. If you wish to automate the first
+part without mucking with CFLAGS in Makefiles, you can set AFL_NO_BUILTIN=1
+when using afl-gcc. This setting specifically adds the following flags:
+
+  -fno-builtin-strcmp -fno-builtin-strncmp -fno-builtin-strcasecmp
+  -fno-builtin-strncasecmp -fno-builtin-memcmp -fno-builtin-strstr
+  -fno-builtin-strcasestr
+
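+For illustration, a build of a typical autoconf-based target with these flags
+might look roughly like this (the exact invocation depends on the project's
+build system; the paths are placeholders):
+
+  export AFL_NO_BUILTIN=1
+  CC=/path/to/afl-gcc ./configure
+  make clean all
+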
+The next step is simply loading this library via LD_PRELOAD. The optimal usage
+pattern is to allow afl-fuzz to fuzz normally for a while and build up a corpus,
+and then fire off the target binary, with libtokencap.so loaded, on every file
+found by AFL in that earlier run. This demonstrates the basic principle:
+
+  export AFL_TOKEN_FILE=$PWD/temp_output.txt
+
+  for i in <out_dir>/queue/id*; do
+    LD_PRELOAD=/path/to/libtokencap.so \
+      /path/to/target/program [...params, including $i...]
+  done
+
+  sort -u temp_output.txt >afl_dictionary.txt
+
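+The deduplicated file can then be passed back to afl-fuzz via the -x option on
+a subsequent run, along these lines (directory names are placeholders):
+
+  afl-fuzz -i testcase_dir -o findings_dir -x afl_dictionary.txt \
+    /path/to/target/program [...params...]
+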
+If you don't get any results, the target library is probably not using strcmp()
+and memcmp() to parse input; or you haven't compiled it with -fno-builtin; or
+the whole thing isn't dynamically linked, and LD_PRELOAD is having no effect.
+
+PS. The library is Linux-only because there is probably no particularly portable
+and non-invasive way to distinguish between read-only and read-write memory
+mappings. The __tokencap_load_mappings() function is the only thing that would
+need to be changed for other OSes. Porting to platforms with /proc/<pid>/maps
+(e.g., FreeBSD) should be trivial.
+
diff --git a/libtokencap/libtokencap.so.c b/libtokencap/libtokencap.so.c
new file mode 100644
index 00000000..54072279
--- /dev/null
+++ b/libtokencap/libtokencap.so.c
@@ -0,0 +1,305 @@
+/*
+
+   american fuzzy lop - extract tokens passed to strcmp / memcmp
+   -------------------------------------------------------------
+
+   Written and maintained by Michal Zalewski <lcamtuf@google.com>
+
+   Copyright 2016 Google Inc. All rights reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at:
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   This Linux-only companion library allows you to instrument strcmp(),
+   memcmp(), and related functions to automatically extract tokens.
+   See README.tokencap for more info.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "../types.h"
+#include "../config.h"
+
+#ifndef __linux__
+#  error "Sorry, this library is Linux-specific for now!"
+#endif /* !__linux__ */
+
+
+/* Mapping data and such */
+
+#define MAX_MAPPINGS 1024
+
+static struct mapping {
+  void *st, *en;
+} __tokencap_ro[MAX_MAPPINGS];
+
+static u32   __tokencap_ro_cnt;
+static u8    __tokencap_ro_loaded;
+static FILE* __tokencap_out_file;
+
+
+/* Identify read-only regions in memory. Only parameters that fall into these
+   ranges are worth dumping when passed to strcmp() and so on. Read-write
+   regions are far more likely to contain user input instead. */
+
+static void __tokencap_load_mappings(void) {
+
+  u8 buf[MAX_LINE];
+  FILE* f = fopen("/proc/self/maps", "r");
+
+  __tokencap_ro_loaded = 1;
+
+  if (!f) return;
+
+  while (fgets(buf, MAX_LINE, f)) {
+
+    u8 rf, wf;
+    void* st, *en;
+
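+    /* Each line of /proc/self/maps looks roughly like
+       "00400000-0040b000 r-xp 00000000 08:01 1234 /bin/foo"; we only need
+       the address range and the first two permission flags. */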
+    if (sscanf(buf, "%p-%p %c%c", &st, &en, &rf, &wf) != 4) continue;
+    if (wf == 'w' || rf != 'r') continue;
+
+    __tokencap_ro[__tokencap_ro_cnt].st = (void*)st;
+    __tokencap_ro[__tokencap_ro_cnt].en = (void*)en;
+
+    if (++__tokencap_ro_cnt == MAX_MAPPINGS) break;
+
+  }
+
+  fclose(f);
+
+}
+
+
+/* Check an address against the list of read-only mappings. */
+
+static u8 __tokencap_is_ro(const void* ptr) {
+
+  u32 i;
+
+  if (!__tokencap_ro_loaded) __tokencap_load_mappings();
+
+  for (i = 0; i < __tokencap_ro_cnt; i++) 
+    if (ptr >= __tokencap_ro[i].st && ptr <= __tokencap_ro[i].en) return 1;
+
+  return 0;
+
+}
+
+
+/* Dump an interesting token to output file, quoting and escaping it
+   properly. */
+
+static void __tokencap_dump(const u8* ptr, size_t len, u8 is_text) {
+
+  u8 buf[MAX_AUTO_EXTRA * 4 + 1];
+  u32 i;
+  u32 pos = 0;
+
+  if (len < MIN_AUTO_EXTRA || len > MAX_AUTO_EXTRA || !__tokencap_out_file)
+    return;
+
+  for (i = 0; i < len; i++) {
+
+    if (is_text && !ptr[i]) break;
+
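+    /* Escape control chars, high-bit chars, quotes, and backslashes using
+       the \xNN notation understood by afl-fuzz's -x dictionary parser. */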
+    switch (ptr[i]) {
+
+      case 0 ... 31:
+      case 127 ... 255:
+      case '\"':
+      case '\\':
+
+        sprintf(buf + pos, "\\x%02x", ptr[i]);
+        pos += 4;
+        break;
+
+      default:
+
+        buf[pos++] = ptr[i];
+
+    }
+
+  }
+
+  buf[pos] = 0;
+
+  fprintf(__tokencap_out_file, "\"%s\"\n", buf);    
+
+}
+
+
+/* Replacements for strcmp(), memcmp(), and so on. Note that these will be used
+   only if the target is compiled with -fno-builtin and linked dynamically. */
+
+#undef strcmp
+
+int strcmp(const char* str1, const char* str2) {
+
+  if (__tokencap_is_ro(str1)) __tokencap_dump(str1, strlen(str1), 1);
+  if (__tokencap_is_ro(str2)) __tokencap_dump(str2, strlen(str2), 1);
+
+  while (1) {
+
+    unsigned char c1 = *str1, c2 = *str2;
+
+    if (c1 != c2) return (c1 > c2) ? 1 : -1;
+    if (!c1) return 0;
+    str1++; str2++;
+
+  }
+
+}
+
+
+#undef strncmp
+
+int strncmp(const char* str1, const char* str2, size_t len) {
+
+  if (__tokencap_is_ro(str1)) __tokencap_dump(str1, len, 1);
+  if (__tokencap_is_ro(str2)) __tokencap_dump(str2, len, 1);
+
+  while (len--) {
+
+    unsigned char c1 = *str1, c2 = *str2;
+
+    if (c1 != c2) return (c1 > c2) ? 1 : -1;
+    if (!c1) return 0;
+    str1++; str2++;
+
+  }
+
+  return 0;
+
+}
+
+
+#undef strcasecmp
+
+int strcasecmp(const char* str1, const char* str2) {
+
+  if (__tokencap_is_ro(str1)) __tokencap_dump(str1, strlen(str1), 1);
+  if (__tokencap_is_ro(str2)) __tokencap_dump(str2, strlen(str2), 1);
+
+  while (1) {
+
+    unsigned char c1 = tolower((u8)*str1), c2 = tolower((u8)*str2);
+
+    if (c1 != c2) return (c1 > c2) ? 1 : -1;
+    if (!c1) return 0;
+    str1++; str2++;
+
+  }
+
+}
+
+
+#undef strncasecmp
+
+int strncasecmp(const char* str1, const char* str2, size_t len) {
+
+  if (__tokencap_is_ro(str1)) __tokencap_dump(str1, len, 1);
+  if (__tokencap_is_ro(str2)) __tokencap_dump(str2, len, 1);
+
+  while (len--) {
+
+    unsigned char c1 = tolower((u8)*str1), c2 = tolower((u8)*str2);
+
+    if (c1 != c2) return (c1 > c2) ? 1 : -1;
+    if (!c1) return 0;
+    str1++; str2++;
+
+  }
+
+  return 0;
+
+}
+
+
+#undef memcmp
+
+int memcmp(const void* mem1, const void* mem2, size_t len) {
+
+  if (__tokencap_is_ro(mem1)) __tokencap_dump(mem1, len, 0);
+  if (__tokencap_is_ro(mem2)) __tokencap_dump(mem2, len, 0);
+
+  while (len--) {
+
+    unsigned char c1 = *(const u8*)mem1, c2 = *(const u8*)mem2;
+    if (c1 != c2) return (c1 > c2) ? 1 : -1;
+    mem1++; mem2++;
+
+  }
+
+  return 0;
+
+}
+
+
+#undef strstr
+
+char* strstr(const char* haystack, const char* needle) {
+
+  if (__tokencap_is_ro(haystack))
+    __tokencap_dump(haystack, strlen(haystack), 1);
+
+  if (__tokencap_is_ro(needle))
+    __tokencap_dump(needle, strlen(needle), 1);
+
+  do {
+    const char* n = needle;
+    const char* h = haystack;
+
+    while(*n && *h && *n == *h) n++, h++;
+
+    if(!*n) return (char*)haystack;
+
+  } while (*(haystack++));
+
+  return 0;
+
+}
+
+
+#undef strcasestr
+
+char* strcasestr(const char* haystack, const char* needle) {
+
+  if (__tokencap_is_ro(haystack))
+    __tokencap_dump(haystack, strlen(haystack), 1);
+
+  if (__tokencap_is_ro(needle))
+    __tokencap_dump(needle, strlen(needle), 1);
+
+  do {
+
+    const char* n = needle;
+    const char* h = haystack;
+
+    while (*n && *h && tolower((u8)*n) == tolower((u8)*h)) n++, h++;
+
+    if(!*n) return (char*)haystack;
+
+  } while(*(haystack++));
+
+  return 0;
+
+}
+
+
+/* Init code to open the output file (or default to stderr). */
+
+__attribute__((constructor)) void __tokencap_init(void) {
+
+  u8* fn = getenv("AFL_TOKEN_FILE");
+  if (fn) __tokencap_out_file = fopen(fn, "a");
+  if (!__tokencap_out_file) __tokencap_out_file = stderr;
+
+}
+