Diffstat (limited to 'custom_mutators/gramatron/preprocess/construct_automata.py')
-rw-r--r--  custom_mutators/gramatron/preprocess/construct_automata.py  275
1 files changed, 275 insertions, 0 deletions
diff --git a/custom_mutators/gramatron/preprocess/construct_automata.py b/custom_mutators/gramatron/preprocess/construct_automata.py
new file mode 100644
index 00000000..b9e84aa8
--- /dev/null
+++ b/custom_mutators/gramatron/preprocess/construct_automata.py
@@ -0,0 +1,275 @@
+import sys
+import json
+import re
+from collections import defaultdict
+# import pygraphviz as pgv
+
+gram_data = None
+state_count = 1
+pda = []
+worklist = []
+state_stacks = {} 
+
+# === If user provides upper bound on the stack size during FSA creation ===
+# Specifies the upper bound to which the stack is allowed to grow
+# If for any generated state, the stack size is >= stack_limit then this
+# state is not expanded further.
+stack_limit = None 
+# Holds the set of unexpanded rules owing to the user-passed stack constraint limit
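+# Example (illustrative): with --limit 5, any rule whose expansion would grow
+# the pending stack beyond 5 symbols is recorded here instead of being expanded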
+unexpanded_rules = set()
+
+def main(grammar, limit):
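+    '''
+    Loads the GNF grammar, seeds the worklist with the start symbol, expands
+    it transition-by-transition via prep_transitions, and finally dumps the
+    automaton, transition and (if limited) disallowed-rule JSON files
+    '''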
+    global worklist, gram_data, stack_limit
+    current = '0'
+    stack_limit = limit
+    if stack_limit:
+        print ('[X] Operating in bounded stack mode')
+
+    with open(grammar, 'r') as fd:
+        gram_data = json.load(fd)
+    start_symbol = gram_data["Start"][0]
+    worklist.append([current, [start_symbol]])
+    # print (grammar)
+    filename = (grammar.split('/')[-1]).split('.')[0]
+    
+
+    while worklist:
+        # Take an element from the worklist
+        # print ('================')
+        # print ('Worklist:', worklist)
+        element = worklist.pop(0)
+        prep_transitions(element)
+    
+    pda_file = filename + '_transition.json'
+    graph_file = filename + '.png'
+    # print ('XXXXXXXXXXXXXXXX')
+    # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file))
+    # XXX Commented out because visualizing the current version of the PHP grammar causes a segfault
+    # Create the graph and dump the transitions to a file
+    # create_graph(filename)
+    transformed = postprocess()
+    with open(filename + '_automata.json', 'w+') as fd:
+        json.dump(transformed, fd)
+    with open(filename + '_transition.json', 'w+') as fd:
+        json.dump(pda, fd)
+    if not unexpanded_rules:
+        print ('[X] No unexpanded rules, absolute FSA formed')
+        sys.exit(0)
+    else:
+        print ('[X] Certain rules were not expanded due to the stack size limit. An inexact approximation has been created and the disallowed rules have been written to {}_disallowed.json'.format(filename))
+        print ('[X] Number of unexpanded rules:', len(unexpanded_rules))
+        with open(filename + '_disallowed.json', 'w+') as fd:
+            json.dump(list(unexpanded_rules), fd)
+
+def create_graph(filename):
+    '''
+    Creates a DOT representation of the PDA
+    '''
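+    # Note: requires pygraphviz; re-enable the "import pygraphviz as pgv" line
+    # at the top of this file before calling this helper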
+    global pda
+    G = pgv.AGraph(strict = False, directed = True)
+    for transition in pda:
+        print ('Transition:', transition)
+        G.add_edge(transition['source'], transition['dest'], 
+                label = 'Term:{}'.format(transition['terminal']))
+    G.layout(prog = 'dot')
+    print ('Do it up 2')
+    G.draw(filename + '.png')
+
+def prep_transitions(element):
+    '''
+    Generates transitions
+    '''
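+    # element is a [state, stack] pair taken off the worklist; the head of the
+    # stack is the nonterminal expanded in this step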
+    global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules
+    state = element[0]
+    try:
+        nonterminal = element[1][0] 
+    except IndexError:
+        # Final state was encountered, pop from worklist without doing anything
+        return
+    rules = gram_data[nonterminal]
+    count = 1
+    for rule in rules:
+        isRecursive = False
+        # print ('Current state:', state)
+        terminal, ss, termIsRegex = tokenize(rule)
+        transition = get_template()
+        transition['trigger'] = '_'.join([state, str(count)])
+        transition['source'] = state
+        transition['dest'] = str(state_count) 
+        transition['ss'] = ss 
+        transition['terminal'] = terminal
+        transition['rule'] = "{} -> {}".format(nonterminal, rule )
+        if termIsRegex:
+            transition['termIsRegex'] = True
+        
+        # Creating a state stack for the new state
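+        # Example (illustrative): if the inherited stack is [A, B] and the rule
+        # pushes ss = [C, D], then A is popped and the new stack is [C, D, B]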
+        try:
+            state_stack = state_stacks[state][:]
+        except KeyError:
+            state_stack = []
+        if len(state_stack):
+            state_stack.pop(0)
+        if ss:
+            for symbol in ss[::-1]:
+                state_stack.insert(0, symbol)
+        transition['stack'] = state_stack 
+
+        # Check if a recursive transition state being created, if so make a backward
+        # edge and don't add anything to the worklist
+        # print (state_stacks)
+        if state_stacks:
+            for state_element, stack in state_stacks.items():
+                # print ('Stack:', sorted(stack))
+                # print ('State stack:', sorted(state_stack))
+                if sorted(stack) == sorted(state_stack):
+                    transition['dest'] = state_element
+                    # print ('Recursive:', transition)
+                    pda.append(transition)
+                    count += 1
+                    isRecursive = True
+                    break 
+        # If a recursive transition exercised don't add the same transition as a new
+        # edge, continue onto the next transitions
+        if isRecursive:
+            continue
+            
+        # If the generated state has a stack size > stack_limit then that state is abandoned
+        # and not added to the FSA or the worklist for further expansion
+        if stack_limit:
+            if (len(transition['stack']) > stack_limit):
+                unexpanded_rules.add(transition['rule'])
+                continue
+
+        # Create transitions for the non-recursive relations and add to the worklist
+        # print ('Normal:', transition)
+        # print ('State2:', state)
+        pda.append(transition)
+        worklist.append([transition['dest'], transition['stack']])
+        state_stacks[transition['dest']] = state_stack
+        state_count += 1
+        count += 1
+
+def tokenize(rule):
+    '''
+    Gets the terminal and the corresponding stack symbols from a rule in GNF form
+    '''
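+    # Example (illustrative): "'a' A B" yields terminal="a", ss=["A", "B"] and
+    # termIsRegex=False, while "r'[0-9]+'" yields terminal="[0-9]+",
+    # termIsRegex=True and no stack symbols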
+    pattern = re.compile(r"([r])*'([\s\S]+)'([\s\S]*)")
+    terminal = None
+    ss = None
+    termIsRegex = False
+    match = pattern.match(rule)
+    if match is None:
+        raise AssertionError("Rule is not in GNF form")
+    if match.group(1):
+        termIsRegex = True
+    if match.group(2):
+        terminal = match.group(2)
+    else:
+        raise AssertionError("Rule is not in GNF form")
+
+    if match.group(3):
+        ss = (match.group(3)).split()
+
+    return terminal, ss, termIsRegex
+
+def get_template():
+    transition_template = {
+            'trigger':None,
+            'source': None,
+            'dest': None,
+            'termIsRegex': False,
+            'terminal' : None,
+            'stack': []
+            }
+    return transition_template
+
+def postprocess():
+    '''
+    Creates a representation to be passed on to the C-module
+    '''
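+    # The result maps each source state to its outgoing transitions as
+    # [trigger, dest, terminal] triples and records init_state, final_state
+    # and numstates for consumption by the C module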
+    global pda
+    final_struct = {}
+    memoized = defaultdict(list)
+    # Supporting data structures for if stack limit is imposed
+    culled_pda = []
+    culled_final = []
+    num_transitions = 0 # Keep track of number of transitions
+
+
+    states, final, initial = _get_states()
+
+    print ('[X] Initial state:', initial)
+    assert len(initial) == 1, 'More than one init state found'
+
+    # Cull transitions to states which were not expanded owing to the stack limit
+    if stack_limit:
+
+        blocklist = []
+        for final_state in final:
+            for transition in pda:
+                if (transition["dest"] == final_state) and (len(transition["stack"]) > 0):
+                    blocklist.append(transition["dest"])
+                    continue
+                else:
+                    culled_pda.append(transition)
+        
+        culled_final = [state for state in final if state not in blocklist]
+
+        assert len(culled_final) == 1, 'More than one final state found'
+
+        for transition in culled_pda:
+            state = transition["source"]
+            if transition["dest"] in blocklist:
+                continue
+            num_transitions += 1
+            memoized[state].append([transition["trigger"], transition["dest"], 
+                transition["terminal"]])
+        final_struct["init_state"] = initial
+        final_struct["final_state"] = culled_final[0]
+        # The reason we do this is because when states are culled, the indexing is
+        # still relative to the actual number of states hence we keep numstates recorded
+        # as the original number of states
+        print ('[X] Actual Number of states:', len(memoized.keys()))
+        print ('[X] Number of transitions:', num_transitions)
+        print ('[X] Original Number of states:', len(states))
+        final_struct["numstates"] = len(states) 
+        final_struct["pda"] = memoized
+        return final_struct
+    
+    # Running FSA construction in exact approximation mode and postprocessing it like so
+    for transition in pda:
+        state = transition["source"]
+        memoized[state].append([transition["trigger"], transition["dest"],
+            transition["terminal"]])
+
+    final_struct["init_state"] = initial
+    final_struct["final_state"] = final[0]
+    print ('[X] Actual Number of states:', len(memoized.keys()))
+    final_struct["numstates"] = len(memoized.keys()) 
+    final_struct["pda"] = memoized
+    return final_struct
+
+
+def _get_states():
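+    '''
+    Derives the state sets from the accumulated transitions: returns all
+    states, the final states (destinations that never act as a source) and
+    the initial state (the source that never appears as a destination)
+    '''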
+    source = set()
+    dest = set()
+    global pda
+    for transition in pda:
+        source.add(transition["source"])
+        dest.add(transition["dest"])
+    source_copy = source.copy()
+    source_copy.update(dest)
+    return list(source_copy), list(dest.difference(source)), str(''.join(list(source.difference(dest))))
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description = 'Script to convert GNF grammar to PDA')
+    parser.add_argument(
+            '--gf',
+            type = str,
+            help = 'Location of GNF grammar')
+    parser.add_argument(
+            '--limit',
+            type = int,
+            default = None,
+            help = 'Specify the upper bound for the stack size')
+    args = parser.parse_args()
+    main(args.gf, args.limit)