diff options
-rw-r--r-- | utils/autodict_ql/autodict_ql.py | 188 | ||||
-rw-r--r-- | utils/autodict_ql/build-codeql.sh | 17 | ||||
-rw-r--r-- | utils/autodict_ql/litan.py | 86 | ||||
-rw-r--r-- | utils/autodict_ql/qlpack.yml | 3 | ||||
-rw-r--r-- | utils/autodict_ql/readme.md | 81 | ||||
-rw-r--r-- | utils/autodict_ql/strtool.ql | 6 |
6 files changed, 378 insertions, 3 deletions
diff --git a/utils/autodict_ql/autodict_ql.py b/utils/autodict_ql/autodict_ql.py new file mode 100644 index 00000000..69d11f48 --- /dev/null +++ b/utils/autodict_ql/autodict_ql.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +import os +import string +import binascii +import codecs +import errno +import struct +import argparse +import shutil +import subprocess + +from binascii import unhexlify + +def ensure_dir(dir): + try: + os.makedirs(dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + +def parse_args(): + parser = argparse.ArgumentParser(description=( + "Helper - Specify input file analysis and output folder to save corpus for strings in the overall project --------------------------------------------------------------------------- Example usage : python2 thisfile.py outdir str.txt" )) + + #parser.add_argument("tokenpath", + #help="Destination directory for tokens") + parser.add_argument("cur", + help = "Current Path") + parser.add_argument("db", + help = "CodeQL database Path") + parser.add_argument("tokenpath", + help="Destination directory for tokens") + + return parser.parse_args() + +def static_analysis(file,file2,cur,db) : + with open(cur+"/"+file, "w") as f: + print(cur+"/"+file) + stream = os.popen("codeql query run " + cur +"/"+ file2 + " -d " + db ) + output = stream.read() + f.write(output) + f.close() + +def copy_tokens(cur, tokenpath) : + subprocess.call(["cp " + cur + "/" + "arrays-lits/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "strstr-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "strcmp-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "strncmp-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "local-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "memcmp-strs/*" + " " + cur + "/" + 
tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "global-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "lits/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "arrays-lits/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "arrays-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + subprocess.call(["cp " + cur + "/" + "strtool-strs/*" + " " + cur + "/" + tokenpath + "/."] ,shell=True) + #strtool-strs + + +def codeql_analysis(cur, db) : + static_analysis("litout.out","litool.ql", cur, db) + static_analysis("strcmp-strings.out","strcmp-str.ql", cur, db) + static_analysis("strncmp-strings.out","strncmp-str.ql", cur, db) + static_analysis("strstr-strings.out","strstr-str.ql", cur, db) + static_analysis("memcmp-strings.out","memcmp-str.ql", cur, db) + static_analysis("global-values-strings.out","globals-values.ql", cur, db) + static_analysis("local-strings.out","locals-strs.ql", cur, db) + static_analysis("strtool-strings.out","strtool.ql", cur, db) + static_analysis("arrays.out","array-literals.ql", cur, db) + start_aflql(0,cur) + #command1 = [ + # 'codeql','query', 'run', + # cur + '/litool.ql', + # '-d', + # db, '>','fff.txt' + # ] + #with open("litool2.log", "w") as f: + # stream = os.popen("codeql query run litool.ql -d " + db ) + # output = stream.read() + # f.write(output) + # f.close() + #worker1 = subprocess.Popen(command1) + #print(worker1.communicate()) + + +def start_aflql(tokenpath, cur): + command = [ + 'python3', + cur + '/litan.py', + cur+'/lits/', + cur+'/litout.out' + ] + worker1 = subprocess.Popen(command) + print(worker1.communicate()) + + command1 = [ + 'python3', + cur + '/strcmp-strings.py', + cur + '/strcmp-strs/', + cur + '/strcmp-strings.out' + ] + worker2 = subprocess.Popen(command1) + print(worker2.communicate()) + + command2 = [ + 'python3', + cur + '/strncmp-strings.py', + cur 
+ '/strncmp-strs/', + cur + '/strncmp-strings.out' + ] + worker3 = subprocess.Popen(command2) + print(worker3.communicate()) + + command3 = [ + 'python3', + cur + '/array-lits.py', + cur + '/arrays-lits/', + cur + '/arrays.out' + ] + worker4 = subprocess.Popen(command3) + print(worker4.communicate()) + + command4 = [ + 'python3', + cur + '/array-strings.py', + cur + '/arrays-strs/', + cur + '/arrays.out' + ] + worker5 = subprocess.Popen(command4) + print(worker5.communicate()) + + + command5 = [ + 'python3', + cur + '/memcmp-strings.py', + cur + '/memcmp-strs/', + cur + '/memcmp-strings.out' + ] + worker6 = subprocess.Popen(command5) + print(worker6.communicate()) + + command6 = [ + 'python3', + cur + '/globals-strings.py', + cur + '/global-strs/', + cur + '/global-values-strings.out' + ] + worker7 = subprocess.Popen(command6) + print(worker7.communicate()) + + command7 = [ + 'python3', + cur + '/strstr-strings.py', + cur + '/strstr-strs/', + cur + '/strstr-strings.out' + ] + worker8 = subprocess.Popen(command7) + print(worker8.communicate()) + + + #strtool-strings.out + + command8 = [ + 'python3', + cur + '/stan-strings.py', + cur + '/strtool-strs/', + cur + '/strtool-strings.out' + ] + worker9 = subprocess.Popen(command8) + print(worker9.communicate()) + + command9 = [ + 'python3', + cur + '/local-strings.py', + cur + '/local-strs/', + cur + '/local-strings.out' + ] + worker10 = subprocess.Popen(command9) + print(worker10.communicate()) + +def main(): + args = parse_args() + ensure_dir(args.tokenpath) + #copy_tokens(args.cur, args.tokenpath) + codeql_analysis(args.cur, args.db) + copy_tokens(args.cur, args.tokenpath) + #start_aflql(args.tokenpath, args.cur) +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/autodict_ql/build-codeql.sh b/utils/autodict_ql/build-codeql.sh new file mode 100644 index 00000000..ccff932e --- /dev/null +++ b/utils/autodict_ql/build-codeql.sh @@ -0,0 +1,17 @@ +cd ~ +if [ -d "codeql-home" ]; then + echo 
"Exist !" + exit 1 +fi +sudo apt install build-essential libtool-bin python3-dev automake git vim wget -y +mkdir codeql-home +cd codeql-home +git clone https://github.com/github/codeql.git codeql-repo +git clone https://github.com/github/codeql-go.git +wget https://github.com/github/codeql-cli-binaries/releases/download/v2.4.6/codeql-linux64.zip +unzip codeql-linux64.zip +mv codeql codeql-cli +export "PATH=~/codeql-home/codeql-cli/:$PATH" +codeql resolve languages +codeql resolve qlpacks +echo "export PATH=~/codeql-home/codeql-cli/:$PATH" >> ~/.bashrc \ No newline at end of file diff --git a/utils/autodict_ql/litan.py b/utils/autodict_ql/litan.py new file mode 100644 index 00000000..18c04c34 --- /dev/null +++ b/utils/autodict_ql/litan.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# Autodict-QL - Optimal token generation for fuzzing +# Part of AFL++ Project +# Author : Microsvuln - Arash.vre@gmail.com +import string +import os +import binascii +import codecs +import struct +import errno +import argparse +import re +import base64 +from binascii import unhexlify +def parse_args(): + parser = argparse.ArgumentParser(description=( + "Helper - Specify input file to analysis and output folder to save corpdirus for constants in the overall project ------- Example usage : python2 thisfile.py outdir o.txt")) + parser.add_argument("corpdir", + help="The path to the corpus directory to generate files.") + parser.add_argument("infile", + help="Specify file output of codeql analysis - ex. 
ooo-hex.txt, analysis take place on this file, example : python2 thisfile.py outdir out.txt") + return parser.parse_args() +def ensure_dir(dir): + try: + os.makedirs(dir) + except OSError as e: + if e.errno == errno.EEXIST: + #print "[-] Directory exists, specify another directory" + exit(1) +def do_analysis1(corpdir, infile): + with open(infile, "rb") as f: + lines = f.readlines()[1:] + f.close() + new_lst = [] + n = 1 + for i, num in enumerate(lines): + if i != 0: + new_lst.append(num) + str1 = str(num) + print ("num is " + str1) + str1 = str1.rstrip('\n\n') + #str1 = str1.replace("0x",""); + str1 = str1.replace("|","") + str1 = str1.rstrip('\r\n') + str1 = str1.rstrip('\n') + str1 = str1.replace(" ","") + #str1 = str1.translate(None, string.punctuation) + translator=str.maketrans('','',string.punctuation) + str1=str1.translate(translator) + str1 = str1[1:] + str1 = str1[:-1] + print("After cleanup : " + str1) + if (str1 != '0') and (str1 != 'ffffffff') and (str1 != 'fffffffe') or (len(str1) == 4) or (len(str1) == 8): + print ("first : "+str1) + if len(str1) > 8 : + str1 = str1[:-1] + elif (len(str1) == 5) : + str1 = str1 = "0" + try: + #str1 = str1.decode("hex") + with open(corpdir+'/lit-seed{0}'.format(n), 'w') as file: + str1 = str1.replace("0x",""); + print (str1) + str1 = int(str1,base=16) + str1 = str1.to_bytes(4, byteorder='little') + file.write(str(str1)) + file.close() + with open (corpdir+'/lit-seed{0}'.format(n), 'r') as q : + a = q.readline() + a = a[1:] + print ("AFL++ Autodict-QL by Microsvuln : Writing Token :" + str(a)) + q.close() + with open (corpdir+'/lit-seed{0}'.format(n), 'w') as w1 : + w1.write(str(a)) + print ("Done!") + w1.close() + except: + print("Error!") + n = n+1 + +def main(): + args = parse_args() + ensure_dir(args.corpdir) + do_analysis1(args.corpdir, args.infile) +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/autodict_ql/qlpack.yml b/utils/autodict_ql/qlpack.yml new file mode 100644 index 
00000000..c037a344 --- /dev/null +++ b/utils/autodict_ql/qlpack.yml @@ -0,0 +1,3 @@ +name: automate +version: 0.0.0 +libraryPathDependencies: codeql-cpp diff --git a/utils/autodict_ql/readme.md b/utils/autodict_ql/readme.md new file mode 100644 index 00000000..77a15f8e --- /dev/null +++ b/utils/autodict_ql/readme.md @@ -0,0 +1,81 @@ +# Autodict-QL - Optimal Token Generation for Fuzzing + +## What is this? + +Autodict-QL is a plugin system that enables fast generation of tokens/dictionaries in a handy way that can be manipulated by the user (unlike the LLVM passes, which are hard to modify). This means that Autodict-QL is a scriptable feature which uses CodeQL (a powerful semantic code analysis engine) to fetch information from a code base. + +Tokens are useful when you perform fuzzing on different parsers. The AFL++ `-x` switch enables the use of dictionaries throughout your fuzzing campaign. If you are not familiar with dictionaries in fuzzing, take a look [here](https://github.com/AFLplusplus/AFLplusplus/tree/stable/dictionaries). + + +## Why CodeQL? +We developed this plugin on top of the CodeQL engine because it gives the user scripting features, it's easier to use, and it's independent of the LLVM system. This means that a user can write their own CodeQL scripts or modify the current scripts to improve or change the token generation algorithms based on different program analysis concepts. + + +## CodeQL scripts +Currently, we have pushed some scripts as defaults for token generation. In addition, we provide every CodeQL script as a standalone script because it's easier to modify or test. + +Currently we provide the following CodeQL scripts: + +`strcmp-str.ql` is used to extract strings that are related to the `strcmp` function. + +`strncmp-str.ql` is used to extract the strings from the `strncmp` function. + +`memcmp-str.ql` is used to extract the strings from the `memcmp` function. + +`litool.ql` extracts magic numbers in hexadecimal format. 
+ +`strtool.ql` extracts strings using a regex and dataflow analysis to capture calls to string comparison functions. If strcmp is rewritten in a project as Mystrcmp or something like strmycmp, then this script can still catch the arguments, and these are valuable tokens. + +You can write other CodeQL scripts to extract possibly effective tokens if you think they can be useful. + + +## Usage +Using Autodict-QL is pretty easy. The steps are as follows: + +1. First of all, you need to have CodeQL installed on the system. We make this possible with the `build-codeql.sh` bash script. This script will install CodeQL completely and will set the required environment variables for your system, so : + +` # chmod +x build-codeql.sh` + +` # codeql ` + +Then you should get : + +` Usage: codeql <command> <argument>... +Create and query CodeQL databases, or work with the QL language. + +GitHub makes this program freely available for the analysis of open-source software and certain other uses, but it is +not itself free software. Type codeql --license to see the license terms. + + --license Show the license terms for the CodeQL toolchain. +Common options: + -h, --help Show this help text. + -v, --verbose Incrementally increase the number of progress messages printed. + -q, --quiet Incrementally decrease the number of progress messages printed. +Some advanced options have been hidden; try --help -v for a fuller view. +Commands: + query Compile and execute QL code. + bqrs Get information from .bqrs files. + database Create, analyze and process CodeQL databases. + dataset [Plumbing] Work with raw QL datasets. + test Execute QL unit tests. + resolve [Deep plumbing] Helper commands to resolve disk locations etc. + execute [Deep plumbing] Low-level commands that need special JVM options. + version Show the version of the CodeQL toolchain. + generate Generate formatted QL documentation. + github Commands useful for interacting with the GitHub API through CodeQL. +` + +2. 
Compile your project with CodeQL: To use the Autodict-QL plugin, you need to compile the source of the target you want to fuzz with CodeQL. This is not hard. + - First you need to create a CodeQL database of the project codebase. Suppose we want to compile libxml with CodeQL: go to the libxml directory and issue the following commands: + - `./configure --disable-shared` + - `codeql database create libxml-db --language=cpp --command=make` + - Now you have the CodeQL database of the project :-) +3. To run Autodict-QL, the next step is to create a folder named `automate` in the project you want to fuzz. + - `mkdir automate` (inside the libxml directory) +4. The last preparation step is to upgrade the CodeQL database you created in step 2, from inside the `automate` dir you created in step 3: + - `codeql database upgrade ../libxml-db` +5. Everything is set! :-) Now issue the following to get the tokens: + - `python3 autodict_ql.py [CURRENT_DIR] [CODEQL_DATABASE_PATH] [TOKEN_PATH]` + - example : `python3 autodict_ql.py /home/user/libxml/automate /home/user/libxml/libxml-db tokens` + - This will create the final `tokens` dir for you and you are done; then pass the tokens path to the afl `-x` flag. +6. Done! 
\ No newline at end of file diff --git a/utils/autodict_ql/strtool.ql b/utils/autodict_ql/strtool.ql index f78aabbb..253d1555 100644 --- a/utils/autodict_ql/strtool.ql +++ b/utils/autodict_ql/strtool.ql @@ -3,8 +3,8 @@ import semmle.code.cpp.dataflow.DataFlow class StringLiteralNode extends DataFlow::Node { StringLiteralNode() { this.asExpr() instanceof StringLiteral } } -class MemcmpArgNode extends DataFlow::Node { - MemcmpArgNode() { +class CmpArgNode extends DataFlow::Node { + CmpArgNode() { exists(FunctionCall fc | fc.getTarget().getName().regexpMatch(".*(str|mem|strn|b)*(cmp|str)*") and fc.getArgument(0) = this.asExpr() @@ -17,7 +17,7 @@ class MemcmpArgNode extends DataFlow::Node { } } -from StringLiteralNode src, MemcmpArgNode arg +from StringLiteralNode src, CmpArgNode arg where DataFlow::localFlow(src, arg) |