'''
Convert a grammar (a .g4 grammar file or an already-preprocessed .json file) into
Greibach Normal Form (GNF) and dump it as JSON, writing intermediate debug_*.json
files for each normalization stage.
'''
import sys
import re
import copy
import json
from string import ascii_uppercase
from itertools import combinations
from collections import defaultdict

NONTERMINALSET = []
COUNT = 1


def main(grammar_file, out, start):
    grammar = None
    # If the grammar file is a preprocessed NT file, then skip preprocessing
    if '.json' in grammar_file:
        with open(grammar_file, 'r') as fd:
            grammar = json.load(fd)
    elif '.g4' in grammar_file:
        with open(grammar_file, 'r') as fd:
            data = fd.readlines()
        grammar = preprocess(data)
    else:
        raise ValueError('Unknown file format passed. Accepts (.g4/.json)')

    with open('debug_preprocess.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = remove_unit(grammar)    # eliminate unit productions
    with open('debug_unit.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = remove_mixed(grammar)   # eliminate terminals mixed in with non-terminals
    with open('debug_mixed.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = break_rules(grammar)    # eliminate rules with more than two non-terminals
    with open('debug_break.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = gnf(grammar)

    # Dump GNF form of the grammar with only reachable rules
    # reachable_grammar = get_reachable(grammar, start)
    # with open('debug_gnf_reachable.json', 'w+') as fd:
    #     json.dump(reachable_grammar, fd)
    with open('debug_gnf.json', 'w+') as fd:
        json.dump(grammar, fd)

    grammar["Start"] = [start]
    with open(out, 'w+') as fd:
        json.dump(grammar, fd)


def get_reachable(grammar, start):
    '''
    Returns a grammar without dead rules
    '''
    reachable_nt = set()
    worklist = list()
    processed = set()
    reachable_grammar = dict()
    worklist.append(start)
    while worklist:
        nt = worklist.pop(0)
        processed.add(nt)
        reachable_grammar[nt] = grammar[nt]
        rules = grammar[nt]
        for rule in rules:
            tokens = gettokens(rule)
            for token in tokens:
                if not isTerminal(token):
                    if token not in processed:
                        worklist.append(token)
    return reachable_grammar


def gettokens(rule):
    # Split a rule into tokens; quoted strings (terminals) are kept as single tokens
    pattern = re.compile(r"([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
    return [matched.group(0) for matched in pattern.finditer(rule)]


def gnf(grammar):
    # Repeatedly expand leading non-terminals until every rule starts with a terminal
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    isgnf = False
    while not isgnf:
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    new_grammar[lhs].append(rule)
                    continue
                startoken = tokens[0]
                endrule = tokens[1:]
                if not isTerminal(startoken):
                    # Substitute the leading non-terminal with each of its expansions
                    newrules = []
                    extendrules = old_grammar[startoken]
                    for extension in extendrules:
                        temprule = endrule[:]
                        temprule.insert(0, extension)
                        newrules.append(temprule)
                    for newnew in newrules:
                        new_grammar[lhs].append(' '.join(newnew))
                else:
                    new_grammar[lhs].append(rule)

        # Check whether every rule now starts with a terminal
        isgnf = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    continue
                startoken = tokens[0]
                if not isTerminal(startoken):
                    isgnf = False
                    break
        if not isgnf:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar


def preprocess(data):
    # Group the grammar file into productions separated by blank lines
    productions = []
    production = []
    for line in data:
        if line != '\n':
            production.append(line)
        else:
            productions.append(production)
            production = []
    if production:
        # Keep the last production even if the file does not end with a blank line
        productions.append(production)

    final_rule_set = {}
    for production in productions:
        rules = []
        init = production[0]
        nonterminal = init.split(':')[0]
        rules.append(strip_chars(init.split(':')[1]).strip('| '))
        for production_rule in production[1:]:
            rules.append(strip_chars(production_rule.split('|')[0]))
        final_rule_set[nonterminal] = rules
    return final_rule_set

def remove_unit(grammar):
    # Eliminate unit productions (rules whose RHS is a single non-terminal)
    nounitproductions = False
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    while not nounitproductions:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                # Checking if the rule is a unit production rule
                if len(gettokens(rhs)) == 1:
                    if not isTerminal(rhs):
                        # Inline the rules of the referenced non-terminal
                        new_grammar[lhs].extend([rule for rule in old_grammar[rhs]])
                    else:
                        new_grammar[lhs].append(rhs)
                else:
                    new_grammar[lhs].append(rhs)

        # Checking there are no unit productions left in the grammar
        nounitproductions = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                if len(gettokens(rhs)) == 1:
                    if not isTerminal(rhs):
                        nounitproductions = False
                        break
            if not nounitproductions:
                break

        # Unit productions are still there in the grammar -- repeat the process
        if not nounitproductions:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar


def isTerminal(rule):
    # A terminal is a single-quoted string literal
    pattern = re.compile(r"\'(.*?)\'")
    match = pattern.match(rule)
    if match:
        return True
    else:
        return False


def remove_mixed(grammar):
    '''
    Remove rules where there are terminals mixed in with non-terminals
    '''
    new_grammar = defaultdict(list)
    for lhs, rules in grammar.items():
        for rhs in rules:
            regen_rule = []
            tokens = gettokens(rhs)
            if len(tokens) == 1:
                new_grammar[lhs].append(rhs)
                continue
            for token in tokens:
                # Identify if there is a terminal in the RHS
                if isTerminal(token):
                    # Check if a corresponding non-terminal already exists
                    nonterminal = terminal_exist(token, new_grammar)
                    if nonterminal:
                        regen_rule.append(nonterminal)
                    else:
                        new_nonterm = get_nonterminal()
                        new_grammar[new_nonterm].append(token)
                        regen_rule.append(new_nonterm)
                else:
                    regen_rule.append(token)
            new_grammar[lhs].append(' '.join(regen_rule))
    return new_grammar


def break_rules(grammar):
    # Break rules with more than two tokens on the RHS into binary rules
    new_grammar = defaultdict(list)
    old_grammar = copy.deepcopy(grammar)
    nomulti = False
    while not nomulti:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                tokens = gettokens(rhs)
                if len(tokens) > 2 and (not isTerminal(rhs)):
                    # Factor out all but the last token into a (possibly new) non-terminal
                    split = tokens[:-1]
                    nonterminal = terminal_exist(' '.join(split), new_grammar)
                    if nonterminal:
                        newrule = ' '.join([nonterminal, tokens[-1]])
                        new_grammar[lhs].append(newrule)
                    else:
                        nonterminal = get_nonterminal()
                        new_grammar[nonterminal].append(' '.join(split))
                        newrule = ' '.join([nonterminal, tokens[-1]])
                        new_grammar[lhs].append(newrule)
                else:
                    new_grammar[lhs].append(rhs)

        # Check whether any rule still has more than two tokens
        nomulti = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                tokens = gettokens(rhs)
                if len(tokens) > 2 and (not isTerminal(rhs)):
                    nomulti = False
                    break
        if not nomulti:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar


def strip_chars(rule):
    return rule.strip('\n\t ')


def get_nonterminal():
    # Return a fresh non-terminal name, repopulating the pool when it runs out
    global NONTERMINALSET
    if NONTERMINALSET:
        return NONTERMINALSET.pop(0)
    else:
        _repopulate()
        return NONTERMINALSET.pop(0)


def _repopulate():
    global COUNT
    global NONTERMINALSET
    NONTERMINALSET = [''.join(x) for x in combinations(ascii_uppercase, COUNT)]
    COUNT += 1


def terminal_exist(token, grammar):
    # Return the non-terminal whose rules already contain this token, if any
    for nonterminal, rules in grammar.items():
        if token in rules:
            return nonterminal
    return None


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Script to convert grammar to GNF form')
    parser.add_argument(
        '--gf',
        type=str,
        required=True,
        help='Location of grammar file')
    parser.add_argument(
        '--out',
        type=str,
        required=True,
        help='Location of output file')
    parser.add_argument(
        '--start',
        type=str,
        required=True,
        help='Start token')

    args = parser.parse_args()

    main(args.gf, args.out, args.start)
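
# Example invocation (the script and grammar file names below are illustrative,
# not taken from this repository):
#
#   python gnf_converter.py --gf php.g4 --out php_gnf.json --start program
#
# Each normalization stage also writes a debug_*.json snapshot (debug_preprocess,
# debug_unit, debug_mixed, debug_break, debug_gnf) to the current working directory,
# which is useful for inspecting the grammar after each transformation.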