-rwxr-xr-x  afl-cmin      | 474
-rwxr-xr-x  afl-cmin.bash | 470
2 files changed, 474 insertions, 470 deletions
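
The change replaces the 470-line bash implementation of afl-cmin with a
four-line POSIX sh wrapper that hands off to an awk reimplementation
(afl-cmin.awk, which is not part of this diff), and preserves the original
bash script as afl-cmin.bash. The command-line interface documented in the
script's usage text is unchanged; an illustrative invocation (directory and
target names are hypothetical):

    afl-cmin -i seeds/ -o seeds.min/ -- ./target_app @@
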
diff --git a/afl-cmin b/afl-cmin
index 1dd782d8..75dc63a7 100755
--- a/afl-cmin
+++ b/afl-cmin
@@ -1,470 +1,4 @@
-#!/usr/bin/env bash
-#
-# american fuzzy lop++ - corpus minimization tool
-# ---------------------------------------------
-#
-# Originally written by Michal Zalewski
-#
-# Copyright 2014, 2015 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# This tool tries to find the smallest subset of files in the input directory
-# that still trigger the full range of instrumentation data points seen in
-# the starting corpus. This has two uses:
-#
-# - Screening large corpora of input files before using them as a seed for
-# afl-fuzz. The tool will remove functionally redundant files and likely
-# leave you with a much smaller set.
-#
-# (In this case, you probably also want to consider running afl-tmin on
-# the individual files later on to reduce their size.)
-#
-# - Minimizing the corpus generated organically by afl-fuzz, perhaps when
-# planning to feed it to more resource-intensive tools. The tool achieves
-# this by removing all entries that used to trigger unique behaviors in the
-# past, but have been made obsolete by later finds.
-#
-# Note that the tool doesn't modify the files themselves. For that, you want
-# afl-tmin.
-#
-# This script must use bash because other shells may have hardcoded limits on
-# array sizes.
-#
-
-echo "corpus minimization tool for afl-fuzz by Michal Zalewski"
-echo
-
-#########
-# SETUP #
-#########
-
-# Process command-line options...
-
-MEM_LIMIT=200
-TIMEOUT=none
-
-unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
- AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE
-
-while getopts "+i:o:f:m:t:eQUCh" opt; do
-
- case "$opt" in
-
- "h")
- ;;
-
- "i")
- IN_DIR="$OPTARG"
- ;;
-
- "o")
- OUT_DIR="$OPTARG"
- ;;
- "f")
- STDIN_FILE="$OPTARG"
- ;;
- "m")
- MEM_LIMIT="$OPTARG"
- MEM_LIMIT_GIVEN=1
- ;;
- "t")
- TIMEOUT="$OPTARG"
- ;;
- "e")
- EXTRA_PAR="$EXTRA_PAR -e"
- ;;
- "C")
- export AFL_CMIN_CRASHES_ONLY=1
- ;;
- "Q")
- EXTRA_PAR="$EXTRA_PAR -Q"
- test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
- QEMU_MODE=1
- ;;
- "U")
- EXTRA_PAR="$EXTRA_PAR -U"
- test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
- UNICORN_MODE=1
- ;;
- "?")
- exit 1
- ;;
-
- esac
-
-done
-
-shift $((OPTIND-1))
-
-TARGET_BIN="$1"
-
-if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
-
- cat 1>&2 <<_EOF_
-Usage: $0 [ options ] -- /path/to/target_app [ ... ]
-
-Required parameters:
-
- -i dir - input directory with the starting corpus
- -o dir - output directory for minimized files
-
-Execution control settings:
-
- -f file - location read by the fuzzed program (stdin)
- -m megs - memory limit for child process ($MEM_LIMIT MB)
- -t msec - run time limit for child process (none)
- -Q - use binary-only instrumentation (QEMU mode)
- -U - use unicorn-based instrumentation (Unicorn mode)
-
-Minimization settings:
-
- -C - keep crashing inputs, reject everything else
- -e - solve for edge coverage only, ignore hit counts
-
-For additional tips, please consult docs/README.
-
-_EOF_
- exit 1
-fi
-
-# Do a sanity check to discourage the use of /tmp, since we can't really
-# handle this safely from a shell script.
-
-if [ "$AFL_ALLOW_TMP" = "" ]; then
-
- echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
- T1="$?"
-
- echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
- T2="$?"
-
- echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
- T3="$?"
-
- echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
- T4="$?"
-
- echo "$PWD" | grep -qE '^(/var)?/tmp/'
- T5="$?"
-
- if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
- echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
- exit 1
- fi
-
-fi
-
-# If @@ is specified, but there's no -f, let's come up with a temporary input
-# file name.
-
-TRACE_DIR="$OUT_DIR/.traces"
-
-if [ "$STDIN_FILE" = "" ]; then
-
- if echo "$*" | grep -qF '@@'; then
- STDIN_FILE="$TRACE_DIR/.cur_input"
- fi
-
-fi
-
-# Check for obvious errors.
-
-if [ ! "$MEM_LIMIT" = "none" ]; then
-
- if [ "$MEM_LIMIT" -lt "5" ]; then
- echo "[-] Error: dangerously low memory limit." 1>&2
- exit 1
- fi
-
-fi
-
-if [ ! "$TIMEOUT" = "none" ]; then
-
- if [ "$TIMEOUT" -lt "10" ]; then
- echo "[-] Error: dangerously low timeout." 1>&2
- exit 1
- fi
-
-fi
-
-if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
-
- TNEW="`which "$TARGET_BIN" 2>/dev/null`"
-
- if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
- echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
- exit 1
- fi
-
- TARGET_BIN="$TNEW"
-
-fi
-
-if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$UNICORN_MODE" = "" ]; then
-
- if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
- echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
- exit 1
- fi
-
-fi
-
-if [ ! -d "$IN_DIR" ]; then
- echo "[-] Error: directory '$IN_DIR' not found." 1>&2
- exit 1
-fi
-
-test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
-
-find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
-rm -rf "$TRACE_DIR" 2>/dev/null
-
-rmdir "$OUT_DIR" 2>/dev/null
-
-if [ -d "$OUT_DIR" ]; then
- echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
- exit 1
-fi
-
-mkdir -m 700 -p "$TRACE_DIR" || exit 1
-
-if [ ! "$STDIN_FILE" = "" ]; then
- rm -f "$STDIN_FILE" || exit 1
- touch "$STDIN_FILE" || exit 1
-fi
-
-if [ "$AFL_PATH" = "" ]; then
- SHOWMAP="${0%/afl-cmin}/afl-showmap"
-else
- SHOWMAP="$AFL_PATH/afl-showmap"
-fi
-
-if [ ! -x "$SHOWMAP" ]; then
- echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
- rm -rf "$TRACE_DIR"
- exit 1
-fi
-
-IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
-
-if [ "$IN_COUNT" = "0" ]; then
- echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
- rm -rf "$TRACE_DIR"
- exit 1
-fi
-
-FIRST_FILE=`ls "$IN_DIR" | head -1`
-
-# Make sure that we're not dealing with a directory.
-
-if [ -d "$IN_DIR/$FIRST_FILE" ]; then
- echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
- rm -rf "$TRACE_DIR"
- exit 1
-fi
-
-# Check for the more efficient way to copy files...
-
-if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
- CP_TOOL=ln
-else
- CP_TOOL=cp
-fi
-
-# Make sure that we can actually get anything out of afl-showmap before we
-# waste too much time.
-
-echo "[*] Testing the target binary..."
-
-if [ "$STDIN_FILE" = "" ]; then
-
- AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
-
-else
-
- cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
- AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
-
-fi
-
-FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
-
-if [ "$FIRST_COUNT" -gt "0" ]; then
-
- echo "[+] OK, $FIRST_COUNT tuples recorded."
-
-else
-
- echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
- test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
- exit 1
-
-fi
-
-# Let's roll!
-
-#############################
-# STEP 1: COLLECTING TRACES #
-#############################
-
-echo "[*] Obtaining traces for input files in '$IN_DIR'..."
-
-(
-
- CUR=0
-
- if [ "$STDIN_FILE" = "" ]; then
-
- ls "$IN_DIR" | while read -r fn; do
-
- CUR=$((CUR+1))
- printf "\\r Processing file $CUR/$IN_COUNT... "
-
- "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
-
- done
-
- else
-
- ls "$IN_DIR" | while read -r fn; do
-
- CUR=$((CUR+1))
- printf "\\r Processing file $CUR/$IN_COUNT... "
-
- cp "$IN_DIR/$fn" "$STDIN_FILE"
-
- "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
-
- done
-
-
- fi
-
-)
-
-echo
-
-##########################
-# STEP 2: SORTING TUPLES #
-##########################
-
-# With this out of the way, we sort all tuples by popularity across all
-# datasets. The reasoning here is that we won't be able to avoid the files
-# that trigger unique tuples anyway, so we will want to start with them and
-# see what's left.
-
-echo "[*] Sorting trace sets (this may take a while)..."
-
-ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
- sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
-
-TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
-
-echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
-
-#####################################
-# STEP 3: SELECTING CANDIDATE FILES #
-#####################################
-
-# The next step is to find the best candidate for each tuple. The "best"
-# part is understood simply as the smallest input that includes a particular
-# tuple in its trace. Empirical evidence suggests that this produces smaller
-# datasets than more involved algorithms that could be still pulled off in
-# a shell script.
-
-echo "[*] Finding best candidates for each tuple..."
-
-CUR=0
-
-ls -rS "$IN_DIR" | while read -r fn; do
-
- CUR=$((CUR+1))
- printf "\\r Processing file $CUR/$IN_COUNT... "
-
- sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
-
-done
-
-echo
-
-##############################
-# STEP 4: LOADING CANDIDATES #
-##############################
-
-# At this point, we have a file of tuple-file pairs, sorted by file size
-# in ascending order (as a consequence of ls -rS). By doing sort keyed
-# only by tuple (-k 1,1) and configured to output only the first line for
-# every key (-s -u), we end up with the smallest file for each tuple.
-
-echo "[*] Sorting candidate list (be patient)..."
-
-sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
- sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
-
-if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
- echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
- test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
- exit 1
-fi
-
-# The sed command converted the sorted list to a shell script that populates
-# BEST_FILE[tuple]="fname". Let's load that!
-
-. "$TRACE_DIR/.candidate_script"
-
-##########################
-# STEP 5: WRITING OUTPUT #
-##########################
-
-# The final trick is to grab the top pick for each tuple, unless said tuple is
-# already set due to the inclusion of an earlier candidate; and then put all
-# tuples associated with the newly-added file to the "already have" list. The
-# loop works from least popular tuples and toward the most common ones.
-
-echo "[*] Processing candidates and writing output files..."
-
-CUR=0
-
-touch "$TRACE_DIR/.already_have"
-
-while read -r cnt tuple; do
-
- CUR=$((CUR+1))
- printf "\\r Processing tuple $CUR/$TUPLE_COUNT... "
-
- # If we already have this tuple, skip it.
-
- grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
-
- FN=${BEST_FILE[tuple]}
-
- $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
-
- if [ "$((CUR % 5))" = "0" ]; then
- sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
- mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
- else
- cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
- fi
-
-done <"$TRACE_DIR/.all_uniq"
-
-echo
-
-OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
-
-if [ "$OUT_COUNT" = "1" ]; then
- echo "[!] WARNING: All test cases had the same traces, check syntax!"
-fi
-
-echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
-echo
-
-test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
-
-exit 0
+#!/usr/bin/env sh
+THISPATH=`dirname ${0}`
+export PATH=${THISPATH}:$PATH
+awk -f ${0}.awk -- ${@+"$@"}
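
The new wrapper prepends its own directory to PATH, so that companion tools
such as afl-showmap can be found next to it, and then forwards every argument
to the awk implementation located via ${0}.awk, i.e. afl-cmin.awk sitting
alongside the wrapper. The ${@+"$@"} idiom keeps zero-argument invocations
portable on historical shells where a bare "$@" expanded to a single empty
word. A functionally equivalent sketch with defensive quoting (an
illustration, not the committed code):

    #!/usr/bin/env sh
    # Resolve the directory this wrapper was invoked from.
    THISPATH=$(dirname "$0")
    # Make sibling tools such as afl-showmap reachable for the awk script.
    export PATH="$THISPATH:$PATH"
    # Run the awk implementation that lives alongside the wrapper,
    # forwarding all command-line arguments unchanged.
    exec awk -f "$0.awk" -- ${@+"$@"}
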
diff --git a/afl-cmin.bash b/afl-cmin.bash
new file mode 100755
index 00000000..1dd782d8
--- /dev/null
+++ b/afl-cmin.bash
@@ -0,0 +1,470 @@
+#!/usr/bin/env bash
+#
+# american fuzzy lop++ - corpus minimization tool
+# ---------------------------------------------
+#
+# Originally written by Michal Zalewski
+#
+# Copyright 2014, 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# This tool tries to find the smallest subset of files in the input directory
+# that still trigger the full range of instrumentation data points seen in
+# the starting corpus. This has two uses:
+#
+# - Screening large corpora of input files before using them as a seed for
+# afl-fuzz. The tool will remove functionally redundant files and likely
+# leave you with a much smaller set.
+#
+# (In this case, you probably also want to consider running afl-tmin on
+# the individual files later on to reduce their size.)
+#
+# - Minimizing the corpus generated organically by afl-fuzz, perhaps when
+# planning to feed it to more resource-intensive tools. The tool achieves
+# this by removing all entries that used to trigger unique behaviors in the
+# past, but have been made obsolete by later finds.
+#
+# Note that the tool doesn't modify the files themselves. For that, you want
+# afl-tmin.
+#
+# This script must use bash because other shells may have hardcoded limits on
+# array sizes.
+#
+
+echo "corpus minimization tool for afl-fuzz by Michal Zalewski"
+echo
+
+#########
+# SETUP #
+#########
+
+# Process command-line options...
+
+MEM_LIMIT=200
+TIMEOUT=none
+
+unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
+ AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE
+
+while getopts "+i:o:f:m:t:eQUCh" opt; do
+
+ case "$opt" in
+
+ "h")
+ ;;
+
+ "i")
+ IN_DIR="$OPTARG"
+ ;;
+
+ "o")
+ OUT_DIR="$OPTARG"
+ ;;
+ "f")
+ STDIN_FILE="$OPTARG"
+ ;;
+ "m")
+ MEM_LIMIT="$OPTARG"
+ MEM_LIMIT_GIVEN=1
+ ;;
+ "t")
+ TIMEOUT="$OPTARG"
+ ;;
+ "e")
+ EXTRA_PAR="$EXTRA_PAR -e"
+ ;;
+ "C")
+ export AFL_CMIN_CRASHES_ONLY=1
+ ;;
+ "Q")
+ EXTRA_PAR="$EXTRA_PAR -Q"
+ test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+ QEMU_MODE=1
+ ;;
+ "U")
+ EXTRA_PAR="$EXTRA_PAR -U"
+ test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+ UNICORN_MODE=1
+ ;;
+ "?")
+ exit 1
+ ;;
+
+ esac
+
+done
+
+shift $((OPTIND-1))
+
+TARGET_BIN="$1"
+
+if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
+
+ cat 1>&2 <<_EOF_
+Usage: $0 [ options ] -- /path/to/target_app [ ... ]
+
+Required parameters:
+
+ -i dir - input directory with the starting corpus
+ -o dir - output directory for minimized files
+
+Execution control settings:
+
+ -f file - location read by the fuzzed program (stdin)
+ -m megs - memory limit for child process ($MEM_LIMIT MB)
+ -t msec - run time limit for child process (none)
+ -Q - use binary-only instrumentation (QEMU mode)
+ -U - use unicorn-based instrumentation (Unicorn mode)
+
+Minimization settings:
+
+ -C - keep crashing inputs, reject everything else
+ -e - solve for edge coverage only, ignore hit counts
+
+For additional tips, please consult docs/README.
+
+_EOF_
+ exit 1
+fi
+
+# Do a sanity check to discourage the use of /tmp, since we can't really
+# handle this safely from a shell script.
+
+if [ "$AFL_ALLOW_TMP" = "" ]; then
+
+ echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
+ T1="$?"
+
+ echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
+ T2="$?"
+
+ echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
+ T3="$?"
+
+ echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
+ T4="$?"
+
+ echo "$PWD" | grep -qE '^(/var)?/tmp/'
+ T5="$?"
+
+ if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
+ echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
+ exit 1
+ fi
+
+fi
+
+# If @@ is specified, but there's no -f, let's come up with a temporary input
+# file name.
+
+TRACE_DIR="$OUT_DIR/.traces"
+
+if [ "$STDIN_FILE" = "" ]; then
+
+ if echo "$*" | grep -qF '@@'; then
+ STDIN_FILE="$TRACE_DIR/.cur_input"
+ fi
+
+fi
+
+# Check for obvious errors.
+
+if [ ! "$MEM_LIMIT" = "none" ]; then
+
+ if [ "$MEM_LIMIT" -lt "5" ]; then
+ echo "[-] Error: dangerously low memory limit." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! "$TIMEOUT" = "none" ]; then
+
+ if [ "$TIMEOUT" -lt "10" ]; then
+ echo "[-] Error: dangerously low timeout." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
+
+ TNEW="`which "$TARGET_BIN" 2>/dev/null`"
+
+ if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
+ echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
+ exit 1
+ fi
+
+ TARGET_BIN="$TNEW"
+
+fi
+
+if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$UNICORN_MODE" = "" ]; then
+
+ if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
+ echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! -d "$IN_DIR" ]; then
+ echo "[-] Error: directory '$IN_DIR' not found." 1>&2
+ exit 1
+fi
+
+test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
+
+find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
+rm -rf "$TRACE_DIR" 2>/dev/null
+
+rmdir "$OUT_DIR" 2>/dev/null
+
+if [ -d "$OUT_DIR" ]; then
+ echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
+ exit 1
+fi
+
+mkdir -m 700 -p "$TRACE_DIR" || exit 1
+
+if [ ! "$STDIN_FILE" = "" ]; then
+ rm -f "$STDIN_FILE" || exit 1
+ touch "$STDIN_FILE" || exit 1
+fi
+
+if [ "$AFL_PATH" = "" ]; then
+ SHOWMAP="${0%/afl-cmin}/afl-showmap"
+else
+ SHOWMAP="$AFL_PATH/afl-showmap"
+fi
+
+if [ ! -x "$SHOWMAP" ]; then
+ echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
+
+if [ "$IN_COUNT" = "0" ]; then
+ echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+FIRST_FILE=`ls "$IN_DIR" | head -1`
+
+# Make sure that we're not dealing with a directory.
+
+if [ -d "$IN_DIR/$FIRST_FILE" ]; then
+ echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+# Check for the more efficient way to copy files...
+
+if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
+ CP_TOOL=ln
+else
+ CP_TOOL=cp
+fi
+
+# Make sure that we can actually get anything out of afl-showmap before we
+# waste too much time.
+
+echo "[*] Testing the target binary..."
+
+if [ "$STDIN_FILE" = "" ]; then
+
+ AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
+
+else
+
+ cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
+ AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+fi
+
+FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
+
+if [ "$FIRST_COUNT" -gt "0" ]; then
+
+ echo "[+] OK, $FIRST_COUNT tuples recorded."
+
+else
+
+ echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
+ test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+ exit 1
+
+fi
+
+# Let's roll!
+
+#############################
+# STEP 1: COLLECTING TRACES #
+#############################
+
+echo "[*] Obtaining traces for input files in '$IN_DIR'..."
+
+(
+
+ CUR=0
+
+ if [ "$STDIN_FILE" = "" ]; then
+
+ ls "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
+
+ done
+
+ else
+
+ ls "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ cp "$IN_DIR/$fn" "$STDIN_FILE"
+
+ "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+ done
+
+
+ fi
+
+)
+
+echo
+
+##########################
+# STEP 2: SORTING TUPLES #
+##########################
+
+# With this out of the way, we sort all tuples by popularity across all
+# datasets. The reasoning here is that we won't be able to avoid the files
+# that trigger unique tuples anyway, so we will want to start with them and
+# see what's left.
+
+echo "[*] Sorting trace sets (this may take a while)..."
+
+ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
+ sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
+
+TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
+
+echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
+
+#####################################
+# STEP 3: SELECTING CANDIDATE FILES #
+#####################################
+
+# The next step is to find the best candidate for each tuple. The "best"
+# part is understood simply as the smallest input that includes a particular
+# tuple in its trace. Empirical evidence suggests that this produces smaller
+# datasets than more involved algorithms that could be still pulled off in
+# a shell script.
+
+echo "[*] Finding best candidates for each tuple..."
+
+CUR=0
+
+ls -rS "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
+
+done
+
+echo
+
+##############################
+# STEP 4: LOADING CANDIDATES #
+##############################
+
+# At this point, we have a file of tuple-file pairs, sorted by file size
+# in ascending order (as a consequence of ls -rS). By doing sort keyed
+# only by tuple (-k 1,1) and configured to output only the first line for
+# every key (-s -u), we end up with the smallest file for each tuple.
+
+echo "[*] Sorting candidate list (be patient)..."
+
+sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
+ sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
+
+if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
+ echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
+ test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+# The sed command converted the sorted list to a shell script that populates
+# BEST_FILE[tuple]="fname". Let's load that!
+
+. "$TRACE_DIR/.candidate_script"
+
+##########################
+# STEP 5: WRITING OUTPUT #
+##########################
+
+# The final trick is to grab the top pick for each tuple, unless said tuple is
+# already set due to the inclusion of an earlier candidate; and then put all
+# tuples associated with the newly-added file to the "already have" list. The
+# loop works from least popular tuples and toward the most common ones.
+
+echo "[*] Processing candidates and writing output files..."
+
+CUR=0
+
+touch "$TRACE_DIR/.already_have"
+
+while read -r cnt tuple; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing tuple $CUR/$TUPLE_COUNT... "
+
+ # If we already have this tuple, skip it.
+
+ grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
+
+ FN=${BEST_FILE[tuple]}
+
+ $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
+
+ if [ "$((CUR % 5))" = "0" ]; then
+ sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
+ mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
+ else
+ cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
+ fi
+
+done <"$TRACE_DIR/.all_uniq"
+
+echo
+
+OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
+
+if [ "$OUT_COUNT" = "1" ]; then
+ echo "[!] WARNING: All test cases had the same traces, check syntax!"
+fi
+
+echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
+echo
+
+test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+
+exit 0
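
A closing note on the candidate-selection trick in steps 3 and 4 of the bash
version: ls -rS lists files in ascending size order, so the "tuple filename"
pairs land in .candidate_list smallest-file-first, and a stable sort keyed on
the tuple alone, keeping one line per key, leaves exactly the smallest file
for every tuple. A toy demonstration under assumed tuple and file names:

    # Hypothetical "tuple filename" pairs, appended in ascending order of
    # file size, as ls -rS guarantees in the real script.
    printf '%s\n' \
      '001203 tiny_seed' \
      '001203 huge_seed' \
      '004817 huge_seed' > candidate_list

    # -k1,1 compares the tuple field only, -s keeps equal keys in input
    # order, and -u emits the first line per key: the smallest file wins.
    sort -k1,1 -s -u candidate_list
    # prints:
    # 001203 tiny_seed
    # 004817 huge_seed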