diff options
Diffstat (limited to 'custom_mutators/grammatron/preprocess/construct_automata.py')
| -rw-r--r-- | custom_mutators/grammatron/preprocess/construct_automata.py | 275 |
1 files changed, 0 insertions, 275 deletions
diff --git a/custom_mutators/grammatron/preprocess/construct_automata.py b/custom_mutators/grammatron/preprocess/construct_automata.py deleted file mode 100644 index b9e84aa8..00000000 --- a/custom_mutators/grammatron/preprocess/construct_automata.py +++ /dev/null @@ -1,275 +0,0 @@ -import sys -import json -import re -from collections import defaultdict -# import pygraphviz as pgv - -gram_data = None -state_count = 1 -pda = [] -worklist = [] -state_stacks = {} - -# === If user provides upper bound on the stack size during FSA creation === -# Specifies the upper bound to which the stack is allowed to grow -# If for any generated state, the stack size is >= stack_limit then this -# state is not expanded further. -stack_limit = None -# Holds the set of unexpanded rules owing to the user-passed stack constraint limit -unexpanded_rules = set() - -def main(grammar, limit): - global worklist, gram_data, stack_limit - current = '0' - stack_limit = limit - if stack_limit: - print ('[X] Operating in bounded stack mode') - - with open(grammar, 'r') as fd: - gram_data = json.load(fd) - start_symbol = gram_data["Start"][0] - worklist.append([current, [start_symbol]]) - # print (grammar) - filename = (grammar.split('/')[-1]).split('.')[0] - - - while worklist: - # Take an element from the worklist - # print ('================') - # print ('Worklist:', worklist) - element = worklist.pop(0) - prep_transitions(element) - - pda_file = filename + '_transition.json' - graph_file = filename + '.png' - # print ('XXXXXXXXXXXXXXXX') - # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file)) - # XXX Commented out because visualization of current version of PHP causes segfault - # Create the graph and dump the transitions to a file - # create_graph(filename) - transformed = postprocess() - with open(filename + '_automata.json', 'w+') as fd: - json.dump(transformed, fd) - with open(filename + '_transition.json', 'w+') as fd: - json.dump(pda, fd) - if not unexpanded_rules: - print ('[X] No unexpanded rules, absolute FSA formed') - exit(0) - else: - print ('[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json'.format(filename)) - print ('[X] Number of unexpanded rules:', len(unexpanded_rules)) - with open(filename + '_disallowed.json', 'w+') as fd: - json.dump(list(unexpanded_rules), fd) - -def create_graph(filename): - ''' - Creates a DOT representation of the PDA - ''' - global pda - G = pgv.AGraph(strict = False, directed = True) - for transition in pda: - print ('Transition:', transition) - G.add_edge(transition['source'], transition['dest'], - label = 'Term:{}'.format(transition['terminal'])) - G.layout(prog = 'dot') - print ('Do it up 2') - G.draw(filename + '.png') - -def prep_transitions(element): - ''' - Generates transitions - ''' - global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules - state = element[0] - try: - nonterminal = element[1][0] - except IndexError: - # Final state was encountered, pop from worklist without doing anything - return - rules = gram_data[nonterminal] - count = 1 - for rule in rules: - isRecursive = False - # print ('Current state:', state) - terminal, ss, termIsRegex = tokenize(rule) - transition = get_template() - transition['trigger'] = '_'.join([state, str(count)]) - transition['source'] = state - transition['dest'] = str(state_count) - transition['ss'] = ss - transition['terminal'] = terminal - transition['rule'] = "{} -> {}".format(nonterminal, rule ) - if termIsRegex: - transition['termIsRegex'] = True - - # Creating a state stack for the new state - try: - state_stack = state_stacks[state][:] - except: - state_stack = [] - if len(state_stack): - state_stack.pop(0) - if ss: - for symbol in ss[::-1]: - state_stack.insert(0, symbol) - transition['stack'] = state_stack - - # Check if a recursive transition state being created, if so make a backward - # edge and don't add anything to the worklist - # print (state_stacks) - if state_stacks: - for state_element, stack in state_stacks.items(): - # print ('Stack:', sorted(stack)) - # print ('State stack:', sorted(state_stack)) - if sorted(stack) == sorted(state_stack): - transition['dest'] = state_element - # print ('Recursive:', transition) - pda.append(transition) - count += 1 - isRecursive = True - break - # If a recursive transition exercised don't add the same transition as a new - # edge, continue onto the next transitions - if isRecursive: - continue - - # If the generated state has a stack size > stack_limit then that state is abandoned - # and not added to the FSA or the worklist for further expansion - if stack_limit: - if (len(transition['stack']) > stack_limit): - unexpanded_rules.add(transition['rule']) - continue - - # Create transitions for the non-recursive relations and add to the worklist - # print ('Normal:', transition) - # print ('State2:', state) - pda.append(transition) - worklist.append([transition['dest'], transition['stack']]) - state_stacks[transition['dest']] = state_stack - state_count += 1 - count += 1 - -def tokenize(rule): - ''' - Gets the terminal and the corresponding stack symbols from a rule in GNF form - ''' - pattern = re.compile("([r])*\'([\s\S]+)\'([\s\S]*)") - terminal = None - ss = None - termIsRegex = False - match = pattern.match(rule) - if match.group(1): - termIsRegex = True - if match.group(2): - terminal = match.group(2) - else: - raise AssertionError("Rule is not in GNF form") - - if match.group(3): - ss = (match.group(3)).split() - - return terminal, ss, termIsRegex - -def get_template(): - transition_template = { - 'trigger':None, - 'source': None, - 'dest': None, - 'termIsRegex': False, - 'terminal' : None, - 'stack': [] - } - return transition_template - -def postprocess(): - ''' - Creates a representation to be passed on to the C-module - ''' - global pda - final_struct = {} - memoized = defaultdict(list) - # Supporting data structures for if stack limit is imposed - culled_pda = [] - culled_final = [] - num_transitions = 0 # Keep track of number of transitions - - - states, final, initial = _get_states() - - print (initial) - assert len(initial) == 1, 'More than one init state found' - - # Cull transitions to states which were not expanded owing to the stack limit - if stack_limit: - - blocklist = [] - for final_state in final: - for transition in pda: - if (transition["dest"] == final_state) and (len(transition["stack"]) > 0): - blocklist.append(transition["dest"]) - continue - else: - culled_pda.append(transition) - - culled_final = [state for state in final if state not in blocklist] - - assert len(culled_final) == 1, 'More than one final state found' - - for transition in culled_pda: - state = transition["source"] - if transition["dest"] in blocklist: - continue - num_transitions += 1 - memoized[state].append([transition["trigger"], transition["dest"], - transition["terminal"]]) - final_struct["init_state"] = initial - final_struct["final_state"] = culled_final[0] - # The reason we do this is because when states are culled, the indexing is - # still relative to the actual number of states hence we keep numstates recorded - # as the original number of states - print ('[X] Actual Number of states:', len(memoized.keys())) - print ('[X] Number of transitions:', num_transitions) - print ('[X] Original Number of states:', len(states)) - final_struct["numstates"] = len(states) - final_struct["pda"] = memoized - return final_struct - - # Running FSA construction in exact approximation mode and postprocessing it like so - for transition in pda: - state = transition["source"] - memoized[state].append([transition["trigger"], transition["dest"], - transition["terminal"]]) - - final_struct["init_state"] = initial - final_struct["final_state"] = final[0] - print ('[X] Actual Number of states:', len(memoized.keys())) - final_struct["numstates"] = len(memoized.keys()) - final_struct["pda"] = memoized - return final_struct - - -def _get_states(): - source = set() - dest = set() - global pda - for transition in pda: - source.add(transition["source"]) - dest.add(transition["dest"]) - source_copy = source.copy() - source_copy.update(dest) - return list(source_copy), list(dest.difference(source)), str(''.join(list(source.difference(dest)))) - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser(description = 'Script to convert GNF grammar to PDA') - parser.add_argument( - '--gf', - type = str, - help = 'Location of GNF grammar') - parser.add_argument( - '--limit', - type = int, - default = None, - help = 'Specify the upper bound for the stack size') - args = parser.parse_args() - main(args.gf, args.limit) |
