# -*- coding: utf-8 -*-
import ply.lex as lex
import ply.yacc as yacc
from statement import *
from ply.yacc import YaccError
import netaddr


class Lexer(object):
    """PLY-based tokenizer for the pipeline description language.

    NOTE: PLY uses the docstring of every ``t_*`` method as the regular
    expression of that token, and tries function rules in the order in
    which they are defined.  Therefore the ``t_*`` methods are documented
    with ``#`` comments only and must not be reordered.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to ply.lex.lex().
        self.lexer = lex.lex(module=self, **kwargs)

    # Maps a reserved word (as written in the input) to its token type.
    reserved = {
        'splitter': 'splitterKeyword',
        'groupfilter': 'groupFilterKeyword',
        'filter': 'filterKeyword',
        'grouper': 'grouperKeyword',
        'module': 'moduleKeyword',
        'merger': 'mergerKeyword',
        'export': 'exportKeyword',
        'ungrouper': 'ungrouperKeyword',
        'branches': 'branchesKeyword',
        'branch': 'branchKeyword',
        'aggregate': 'aggregateKeyword',
        'as': 'asKeyword',
        'min': 'minKeyword',
        'max': 'maxKeyword',
        'avg': 'avgKeyword',
        'sum': 'sumKeyword',
        'count': 'countKeyword',
        'union': 'unionKeyword',
        'in': 'inKeyword',
        'notin': 'notinKeyword',
        'OR': 'ORKeyword',
        'NOT': 'NOTKeyword',
        'bitOR': 'bitORKeyword',
        'bitAND': 'bitANDKeyword',
        'm': 'mKeyword',
        'mi': 'miKeyword',
        'o': 'oKeyword',
        'oi': 'oiKeyword',
        's': 'sKeyword',
        'si': 'siKeyword',
        'd': 'dKeyword',
        'di': 'diKeyword',
        'f': 'fKeyword',
        'fi': 'fiKeyword',
        'eq': 'eqKeyword',  # prevent clash with = for match rules
        'delta': 'deltaKeyword',
        'rdelta': 'rdeltaKeyword',
        'ms': 'msKeyword',
    }

    # Comparison operators.  The two-character operators are defined first
    # so that they are tried before their one-character prefixes.  Each
    # rule replaces the matched text with the operator's symbolic name.
    def t_LTEQ(self, t):
        r'<='
        t.value = 'LTEQ'
        return t

    def t_GTEQ(self, t):
        r'>='
        t.value = 'GTEQ'
        return t

    def t_ML(self, t):
        r'<<'
        t.value = 'ML'
        return t

    def t_MG(self, t):
        r'>>'
        t.value = 'MG'
        return t

    def t_LT(self, t):
        r'<'
        t.value = 'LT'
        return t

    def t_EQ(self, t):
        r'='
        t.value = 'EQ'
        return t

    def t_GT(self, t):
        r'>'
        t.value = 'GT'
        return t

    tokens = ['id', 'LT', 'EQ', 'GT', 'LTEQ', 'GTEQ', 'ML', 'MG',
              'MAC', 'IPv4', 'IPv6', 'int', 'float', 'hex',
              'string'] + list(reserved.values())

    # Spaces and tabs are skipped; '#' starts a comment running to the
    # end of the line (PLY discards tokens named t_ignore_*).
    t_ignore = ' \t'
    t_ignore_comment = r'\#.*'

    literals = "+-*/(){},."

    # Double-quoted string.  The token value becomes an Arg holding the
    # text without the surrounding quotes and with all backslashes removed.
    def t_string(self, t):
        r'"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"'
        t.value = Arg("string", t.value[1:-1].replace("\\", ''), t.value)
        return t

    # Dotted-quad IPv4 address, converted to its integer value.
    def t_IPv4(self, t):
        r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
        # The regex also matches invalid IPs, but they are rejected by
        # netaddr during the conversion below.
        try:
            t.value = Arg("addr_IPv4", int(netaddr.IP(t.value)), t.value)
            return t
        except netaddr.AddrFormatError:
            message = 'Bad IPv4 format %s at line %s' % (t.value,
                                                        t.lexer.lineno)
            raise SyntaxError(message)

    # MAC address with ':' or '-' separators, converted to its integer value.
    def t_MAC(self, t):
        r'([a-fA-F0-9]{2}[:\-]){5}[a-fA-F0-9]{2}'
        try:
            t.value = Arg("addr_MAC", int(netaddr.EUI(t.value)), t.value)
            return t
        except netaddr.AddrFormatError:
            message = 'Bad MAC format %s at line %s' % (t.value,
                                                       t.lexer.lineno)
            raise SyntaxError(message)

    # IPv6 address, converted to its integer value.
    def t_IPv6(self, t):
        r'(::[0-9a-f]{1,4}[0-9a-f:]*)|([0-9a-f]:[0-9a-f:]*)'
        # The regular expression is very general, so this rule must stay
        # after the other address rules.
        try:
            t.value = Arg("addr_IPv6", int(netaddr.IP(t.value)), t.value)
            return t
        except netaddr.AddrFormatError:
            message = 'Bad IPv6 format %s at line %s' % (t.value,
                                                        t.lexer.lineno)
            raise SyntaxError(message)

    # Floating point literal with optional exponent.
    def t_float(self, t):
        r'[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?'
        t.value = Arg("float", float(t.value), t.value)
        return t

    # Hexadecimal literal; stored as an "int" Arg.
    def t_hex(self, t):
        r'0[xX][0-9a-fA-F]+'
        t.value = Arg("int", int(t.value, 0), t.value)
        return t

    # Decimal integer literal.
    def t_int(self, t):
        r'\d+'
        t.value = Arg("int", int(t.value), t.value)
        return t

    # Identifiers.  All the reserved words are matched by this rule too;
    # the token type is switched to the keyword type when the text is a
    # key of ``reserved``.
    def t_id(self, t):
        r'[a-zA-Z_][a-zA-Z_0-9]*'
        t.type = self.reserved.get(t.value, 'id')
        return t

    # Newlines only advance the line counter; no token is produced.
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)

    # Error handling rule: any character no other rule accepts.
    def t_error(self, t):
        msg = "Illegal character '%s'" % t.value[0]
        raise SyntaxError(msg)

    def test(self, data):
        """Tokenize ``data`` and print every token (debugging helper)."""
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            print(tok)


class Parser(object):
    """PLY (yacc) parser that collects the statements of a pipeline file.

    After ``parse(text)`` the parsed statements are available through the
    instance attributes initialised in ``__init__``.

    NOTE: PLY uses the docstring of every ``p_*`` method as its grammar
    production and takes the first production defined as the start symbol.
    Therefore the ``p_*`` methods are documented with ``#`` comments only
    and must not be reordered.

    Raises (from the grammar actions and ``parse``):
        SyntaxError: on malformed input or inconsistent branch statements.
    """

    # the tokens from the lexer class:
    tokens = Lexer.tokens

    def __init__(self):
        self.filters = []
        self.groupers = []
        self.splitter = None        # at most one splitter per file
        self.group_filters = []
        self.mergers = []
        self.branch_names = set()
        self.ungroupers = []
        self.branches = []          # each branch is a list of nodes
        self.input = None           # at most one input per file
        self.outputs = []
        self.names = {}             # statement name -> statement object
        self.lexer = Lexer().lexer
        self.parser = yacc.yacc(module=self)

    def p_file(self, p):
        '''file : pipeline_stage_1n'''

    def p_pipeline_stage_1n(self, p):
        'pipeline_stage_1n : pipeline_stage pipeline_stage_1n'
        # Add a name mapping.  Branch statements are plain lists and do
        # not have a name, so they are skipped via the AttributeError.
        try:
            self.names[p[1].name] = p[1]
        except AttributeError:
            pass

    def p_pipeline_stage_end(self, p):
        'pipeline_stage_1n :'

    def p_pipeline_stage(self, p):
        '''
        pipeline_stage : splitter
                       | filter
                       | composite_filter
                       | branch
                       | ungrouper
                       | grouper
                       | group_filter
                       | merger
        '''
        p[0] = p[1]

    def p_splitter(self, p):
        '''
        splitter : splitterKeyword id '{' '}'
        '''
        p[0] = Splitter(p[2], p.lineno(2))
        if self.splitter is not None:
            raise SyntaxError(
                "More than one splitter defined in file at line %s"
                % p.lineno(2))
        self.splitter = p[0]

    def p_filter(self, p):
        '''
        filter : filterKeyword id '{' filter_rule_1n '}'
        '''
        # Note that p[4] is a list of lists of rules.
        # If the list has one element the rule is simple.
        # If the rule has more than one element, the
        # rule is OR-ed of all the rules in the list
        p[0] = Filter(p[2], p.lineno(2), p[4])
        self.filters.append(p[0])

    def p_composite_filter(self, p):
        '''
        composite_filter : filterKeyword id '{' filter_ref_rule_1n '}'
        '''
        # p[4] is a list of lists of filter references; the references
        # inside one inner list are OR-ed together.
        p[0] = Filter(p[2], p.lineno(2), p[4])
        self.filters.append(p[0])

    def p_group_filter(self, p):
        '''
        group_filter : groupFilterKeyword id '{' filter_rule_1n '}'
        '''
        # Same structure as a plain filter, but stored in group_filters.
        p[0] = Filter(p[2], p.lineno(2), p[4])
        self.group_filters.append(p[0])

    def p_filter_rule_1n(self, p):
        'filter_rule_1n : filter_rule filter_rule_1n'
        # The current rule is appended after the rules that follow it.
        p[2].append(p[1])
        p[0] = p[2]

    def p_filter_rule_0(self, p):
        'filter_rule_1n :'
        p[0] = []

    def p_filter_rule(self, p):
        '''
        filter_rule : or_rule
        '''
        p[0] = p[1]

    def p_filter_ref_rule_1n(self, p):
        'filter_ref_rule_1n : filter_ref_rule filter_ref_rule_1n'
        p[2].append(p[1])
        p[0] = p[2]

    def p_filter_ref_rule_0(self, p):
        'filter_ref_rule_1n : filter_ref_rule'
        p[0] = [p[1]]

    def p_filter_ref_rule(self, p):
        '''
        filter_ref_rule : or_id
        '''
        p[0] = p[1]

    def p_or_id(self, p):
        'or_id : not_id opt_or_id'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_opt_or_id(self, p):
        '''
        opt_or_id : ORKeyword not_id opt_or_id
        '''
        p[2].extend(p[3])
        p[0] = p[2]

    def p_opt_or_id_end(self, p):
        'opt_or_id :'
        p[0] = []

    def p_not_id(self, p):
        '''
        not_id : NOTKeyword id
               | id
        '''
        # len(p) == 3 means the "NOTKeyword id" alternative matched.
        if len(p) == 3:
            p[0] = [FilterRef(p[2], p.lineno(2), True)]
        else:
            p[0] = [FilterRef(p[1], p.lineno(1))]

    def p_or_optrule(self, p):
        'or_rule : rule_or_not opt_rule'
        # Result: list with the first rule followed by the OR-ed ones.
        p[0] = [p[1]] + p[2]

    def p_or_rule(self, p):
        'opt_rule : ORKeyword rule_or_not opt_rule'
        p[0] = [p[2]] + p[3]

    def p_term_opt_rule(self, p):
        'opt_rule :'
        p[0] = []

    def p_rule_or_not(self, p):
        '''
        rule_or_not : rule
                    | NOTKeyword rule
        '''
        # len(p) == 3 means the "NOTKeyword rule" alternative matched.
        if len(p) == 3:
            p[2].NOT = True
            p[0] = p[2]
        else:
            p[0] = p[1]

    def p_rule(self, p):
        '''
        rule : infix_rule
             | prefix_rule
        '''
        p[0] = p[1]

    def p_infix_rule(self, p):
        'infix_rule : arg op arg'
        p[1].extend(p[3])  # concatenate args to get [arg, arg]
        # for some unknown reason p.lineno(2) does not work in this
        # production, so p[2] is (op, lineno)
        p[0] = Rule(p[2][0], p[2][1], p[1])  # (op, line, args)

    def p_op(self, p):
        '''
        op : EQ
           | LT
           | GT
           | LTEQ
           | GTEQ
           | ML
           | MG
           | inKeyword
           | notinKeyword
        '''
        # The line number is captured here and passed up with the operator.
        p[0] = (p[1], p.lineno(1))

    def p_rule_prefix(self, p):
        '''
        prefix_rule : id '(' args ')'
                    | bitANDKeyword '(' args ')'
                    | bitORKeyword '(' args ')'
        '''
        p[0] = Rule(p[1], p.lineno(1), p[3])

    def p_args(self, p):
        '''
        args : arg ',' args
        '''
        p[0] = p[1]
        p[0].extend(p[3])  # concatenate the rest of the args to arg

    def p_args_more(self, p):
        'args : arg'
        p[0] = p[1]

    def p_no_args(self, p):
        'args :'
        p[0] = []

    def p_arg(self, p):
        '''
        arg : id
            | IPv4
            | IPv6
            | CIDR
            | MAC
            | int
            | float
            | hex
            | prefix_rule
            | string
        '''
        # A plain string value is an identifier; wrap it in a Field.
        # (Field was defined in filter.py, but the definition was
        # commented out.)
        if isinstance(p[1], str):
            p[1] = Field(p[1])
        p[0] = [p[1]]  # list of one element for easy [].extend later

    def p_cidr(self, p):
        '''
        CIDR : IPv4 '/' int
             | IPv6 '/' int
        '''
        # NOTE(review): other productions call Rule(op, line, args); here
        # the address is passed in the second position and the prefix
        # length in the third -- confirm this is what consumers expect.
        p[0] = Rule('cidr_mask', p[1], p[3])

    def p_start_branch(self, p):
        '''
        branch : id arrow mid_branch
        '''
        br = [BranchNode(p[1], p.lineno(1))]
        br.extend(p[3])
        p[0] = br
        self.branches.append(p[0])

    def p_input_branch(self, p):
        '''
        branch : string arrow mid_branch
        '''
        # A branch starting with a string literal declares the input.
        if self.input is not None:
            raise SyntaxError(
                "More than one input defined in file at line %s"
                % p.lineno(1))
        self.input = Input(p[1].value, p.lineno(1))
        br = [self.input]
        br.extend(p[3])
        p[0] = br
        self.branches.append(p[0])

    def p_split_branch(self, p):
        '''
        branch : id branchKeyword mid_branch
        '''
        br = [BranchNode(p[1], p.lineno(1))]
        # The first node after the 'branch' keyword names the branch.
        p[3][0] = Branch(p[3][0].name, p[3][0].line)
        br.extend(p[3])
        p[0] = br
        self.branches.append(p[0])

    def p_mid_branch(self, p):
        '''
        mid_branch : id arrow mid_branch
        '''
        br = [BranchNode(p[1], p.lineno(1))]
        br.extend(p[3])
        p[0] = br

    def p_mid_branch_terminate(self, p):
        '''
        mid_branch : end_branch
        '''
        p[0] = p[1]

    def p_end_branch(self, p):
        'end_branch : id'
        p[0] = [BranchNode(p[1], p.lineno(1))]

    def p_output_branch(self, p):
        'end_branch : string'
        # A branch ending with a string literal declares an output.
        out = Output(p[1].value, p.lineno(1))
        self.outputs.append(out)
        p[0] = [out]

    def p_arrow(self, p):
        """arrow : "-" GT"""
        pass

    def p_ungrouper(self, p):
        '''
        ungrouper : ungrouperKeyword id '{' '}'
        '''
        p[0] = Ungrouper(p[2], p.lineno(2))
        self.ungroupers.append(p[0])

    def p_grouper(self, p):
        "grouper : grouperKeyword id '{' module1_n aggregate '}'"
        p[0] = Grouper(p[2], p.lineno(2), p[4], p[5])
        # insert aggregation of record ids (needed for ungrouping later)
        p[0].aggr.insert(0, (Rule('union', p.lineno(2),
                                  [Field('rec_id'), 'records'])))
        p[0].aggr.insert(0, (Rule('min', p.lineno(2),
                                  [Field('First'), 'First'])))
        p[0].aggr.insert(0, (Rule('max', p.lineno(2),
                                  [Field('Last'), 'Last'])))
        self.groupers.append(p[0])

    def p_module1_n(self, p):
        'module1_n : module module1_n'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_module0(self, p):
        'module1_n :'
        p[0] = []

    def p_module(self, p):
        "module : moduleKeyword id '{' grouper_rule1_n '}'"
        p[0] = [Module(p[2], p.lineno(2), p[4])]

    def p_grouper_rule1_n(self, p):
        'grouper_rule1_n : grouper_rule grouper_rule1_n'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_grouper_rule0(self, p):
        'grouper_rule1_n :'
        p[0] = []

    # GrouperRule args are [field, field, delta, is_relative_delta].
    def p_grouper_rule(self, p):
        'grouper_rule : id grouper_op id'
        p[0] = [[GrouperRule(p[2], p.lineno(2),
                             [Field(p[1]), Field(p[3]), None, False])]]

    def p_grouper_rule_delta(self, p):
        '''
        grouper_rule : id grouper_op id deltaKeyword delta_arg
        '''
        p[0] = [[GrouperRule(p[2], p.lineno(2),
                             [Field(p[1]), Field(p[3]), p[5], False])]]

    def p_grouper_rule_rel_delta(self, p):
        '''
        grouper_rule : id grouper_op id rdeltaKeyword delta_arg
        '''
        p[0] = [[GrouperRule(p[2], p.lineno(2),
                             [Field(p[1]), Field(p[3]), p[5], True])]]

    def p_grouper_op(self, p):
        '''
        grouper_op : EQ
                   | LT
                   | GT
                   | GTEQ
                   | LTEQ
        '''
        p[0] = p[1]

    def p_delta_arg(self, p):
        '''
        delta_arg : time
                  | int
        '''
        p[0] = p[1]

    def p_time(self, p):
        '''
        time : int sKeyword
             | int msKeyword
             | int minKeyword
        '''
        # the number should be in ms:
        if p[2] == 's':
            p[1].value = p[1].value * 1000
        elif p[2] == 'min':
            p[1].value = p[1].value * 60 * 1000
        p[0] = p[1]

    def p_aggregate(self, p):
        'aggregate : aggregateKeyword aggr1_n'
        # Aggregations without a usable line number (0) inherit the line
        # of the 'aggregate' keyword.
        for aggr in p[2]:
            if aggr.line == 0:
                aggr.line = p.lineno(1)
        p[0] = p[2]

    def p_aggr1_n(self, p):
        'aggr1_n : aggr opt_aggr'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_opt_aggr(self, p):
        "opt_aggr : ',' aggr opt_aggr"
        p[2].extend(p[3])
        p[0] = p[2]

    def p_opt_aggr_end(self, p):
        'opt_aggr :'
        p[0] = []

    def p_aggr(self, p):
        "aggr : aggr_op '(' id_or_qid ')' asKeyword id"
        args = [Field(p[3]), p[6]]  # [id_or_qid, id]
        p[0] = [Rule(p[1], p.lineno(4), args)]

    def p_simple_agg(self, p):
        'aggr : id_or_qid asKeyword id'
        # Without an explicit operator the 'last' aggregation is used.
        args = [Field(p[1]), p[3]]  # [qid, id]
        p[0] = [Rule('last', p.lineno(2), args)]

    def p_simple_agg_same_name(self, p):
        'aggr : id_or_qid'
        args = [Field(p[1]), p[1]]  # [qid, id]
        p[0] = [Rule('last', p.lineno(1), args)]

    def p_qid(self, p):
        '''
        qid : id '.' id
        '''
        # Qualified id is kept as the plain string "left.right".
        p[0] = p[1] + p[2] + p[3]

    def p_id_or_qid(self, p):
        '''
        id_or_qid : id
                  | qid
        '''
        p[0] = p[1]

    def p_aggr_op(self, p):
        '''
        aggr_op : minKeyword
                | maxKeyword
                | sumKeyword
                | avgKeyword
                | unionKeyword
                | countKeyword
                | bitANDKeyword
                | bitORKeyword
        '''
        p[0] = p[1]

    def p_merger(self, p):
        "merger : mergerKeyword id '{' merger_module1_n export '}'"
        p[0] = Merger(p[2], p.lineno(2), p[4], p[5])
        self.mergers.append(p[0])

    def p_merger_module1_n(self, p):
        'merger_module1_n : merger_module merger_module1_n'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_merger_module0(self, p):
        'merger_module1_n : '
        p[0] = []

    def p_merger_module(self, p):
        """
        merger_module : moduleKeyword id '{' merger_branches merger_rule1_n '}'
        """
        # Module(name, line, rules, branches)
        p[0] = [Module(p[2], p.lineno(2), p[5], p[4])]

    def p_merger_branches(self, p):
        'merger_branches : branchesKeyword branches1_n'
        p[0] = p[2]

    def p_branches1_n(self, p):
        """
        branches1_n : id ',' branches1_n
        """
        p[0] = [p[1]]
        p[0].extend(p[3])

    def p_branches1(self, p):
        ' branches1_n : id'
        p[0] = [p[1]]

    def p_export(self, p):
        'export : exportKeyword id'
        p[0] = p[2]

    def p_merger_rule1_n(self, p):
        'merger_rule1_n : merger_rule merger_rule1_n'
        p[1].extend(p[2])
        p[0] = p[1]

    def p_merger_rule0(self, p):
        'merger_rule1_n :'
        p[0] = []

    def p_merger_rule(self, p):
        '''
        merger_rule : merger_prefix_rule
                    | merger_infix_rule
        '''
        p[0] = [[p[1]]]

    def p_not_merger_rule(self, p):
        '''
        merger_rule : NOTKeyword merger_prefix_rule
                    | NOTKeyword merger_infix_rule
        '''
        p[2].NOT = True
        p[0] = [[p[2]]]

    def p_merger_infix_rule(self, p):
        'merger_infix_rule : qid_arg op qid_arg'
        p[1].extend(p[3])
        # p[2] is the (op, lineno) tuple produced by the 'op' production.
        p[0] = Rule(p[2][0], p[2][1], p[1])

    def p_merger_prefix_rule(self, p):
        '''
        merger_prefix_rule : id '(' qid_args ')'
        '''
        p[0] = Rule(p[1], p.lineno(1), p[3])

    def p_qid_args(self, p):
        '''
        qid_args : qid_arg ',' qid_args
        '''
        p[0] = p[1]
        p[0].extend(p[3])  # concatenate the rest of the args to arg

    def p__qid_args_more(self, p):
        'qid_args : qid_arg'
        p[0] = p[1]

    def p_no_qid_args(self, p):
        'qid_args :'
        p[0] = []

    def p_qid_arg(self, p):
        '''
        qid_arg : qid
                | IPv4
                | IPv6
                | CIDR
                | MAC
                | int
                | float
                | hex
                | merger_prefix_rule
                | string
        '''
        # A plain string value is a qualified id; wrap it in a Field.
        if isinstance(p[1], str):
            p[1] = Field(p[1])
        p[0] = [p[1]]  # list of one element for easy [].extend later

    def p_merger_rule_al_op(self, p):
        'merger_rule : allen_rule opt_or_allen_rule'
        p[1].extend(p[2])
        p[0] = [p[1]]

    def p_opt_or_allen_rule(self, p):
        'opt_or_allen_rule : ORKeyword allen_rule opt_or_allen_rule'
        p[2].extend(p[3])
        p[0] = p[2]

    def p_opt_op_rule_end(self, p):
        'opt_or_allen_rule : '
        p[0] = []

    def p_allen_rule(self, p):
        'allen_rule : id allen_op id opt_allen_delta'
        args = [Field(p[1]), Field(p[3])]
        args.extend(p[4])  # add the delta time to [arg, arg]
        p[0] = [AllenRule(p[2], p.lineno(1), args)]  # (op, line, args)

    def p_opt_allen_delta(self, p):
        '''
        opt_allen_delta : deltaKeyword time
        '''
        p[0] = [p[2]]

    def p_no_allen_delta(self, p):
        'opt_allen_delta :'
        p[0] = []

    def p_allen_op(self, p):
        '''
        allen_op : LT
                 | GT
                 | EQ
                 | mKeyword
                 | miKeyword
                 | oKeyword
                 | oiKeyword
                 | sKeyword
                 | siKeyword
                 | dKeyword
                 | diKeyword
                 | fKeyword
                 | fiKeyword
                 | eqKeyword
        '''
        # for some strange reason upper level refuses to recognize lineno:
        p[0] = p[1]

    def p_error(self, p):
        # PLY passes None instead of a token when the input ends
        # unexpectedly; report that as a syntax error as well.
        if p is None:
            msg = "Syntax error. Unexpected end of input"
        else:
            msg = "Syntax error. Unexpected token "
            msg += "%s (%s)" % (p.value, p.type)
        msg += " at line %s" % self.lexer.lineno
        raise SyntaxError(msg)

    def parse(self, text):
        """Parse ``text`` and resolve the branch statements found in it.

        Results are stored on the parser instance; nothing is returned.
        Raises SyntaxError for invalid input.
        """
        self.parser.parse(text, lexer=self.lexer)
        self.resolve_branches()

    def find_io_nodes(self):
        '''
        Finds which branch nodes are inputs and which are outputs.
        The rest of the branches are processing stages.
        '''
        pass

    def check_branching(self):
        pass

    def check_branch_nodes(self):
        """Validate the input/output counts of every branch node.

        NOTE(review): reads ``self.branch_nodes``, which ``__init__`` does
        not create -- it must be set by the caller before this is used.

        Raises SyntaxError on the first inconsistent node.
        """
        for b in self.branch_nodes.values():
            if not b.is_branch:
                try:
                    node = self.names[b.name]
                    if len(b.inputs) == 0:
                        msg = "Node %s at line" % b.name
                        msg += " %s does not have input." % b.line
                        raise SyntaxError(msg)
                    if len(b.outputs) == 0:
                        msg = "Node %s at line" % b.name
                        msg += " %s does not have output." % b.line
                        raise SyntaxError(msg)
                    if len(b.inputs) > 1 and type(node) is not Merger:
                        msg = "Non-Merger node %s at line" % b.name
                        msg += " %s has more than one input." % b.line
                        raise SyntaxError(msg)
                    if len(b.outputs) > 1 and type(node) is not Splitter:
                        msg = "Non-Splitter node %s at line" % b.name
                        msg += " %s has more than one output." % b.line
                        raise SyntaxError(msg)
                except KeyError:
                    # The name is not a defined statement.
                    # check whether this is some middle node
                    if len(b.inputs) != 0 and len(b.outputs) != 0:
                        msg = "Node %s refferenced at line" % b.name
                        msg += " %s not defined" % b.line
                        raise SyntaxError(msg)
                    # check whether the node name is actually a parser
                    # string (Arg)
                    if type(b.name) is not Arg:
                        msg = "Node %s refferenced at line" % b.name
                        msg += " %s not defined" % b.line
                        raise SyntaxError(msg)
            else:
                if len(b.inputs) != 1 or len(b.outputs) != 1:
                    msg = "Branch Node %s at line" % b.name
                    msg += " %s must have 1 input and 1 output." % b.line
                    raise SyntaxError(msg)

    def resolve_branches(self):
        """Replace branch placeholders by the statements they name.

        For every parsed branch: each BranchNode is replaced by the
        statement registered under its name, a Branch node (if any) is
        removed and its name is added to the ``branches`` set of the
        remaining nodes.  Branches without a Branch node afterwards share
        the union of the ``branches`` sets of their nodes.

        Raises SyntaxError for undefined names and misplaced input/output
        nodes.
        """
        noname_branchings = []
        for branch in self.branches:
            br_name = False
            br_index = 0
            for i, node in enumerate(branch):
                # Exact type checks are kept on purpose: these must not
                # match subclasses of the node types.
                if type(node) is BranchNode:
                    try:
                        branch[i] = self.names[node.name]
                    except KeyError:
                        msg = "Node %s refferenced at line" % node.name
                        msg += " %s not defined" % node.line
                        raise SyntaxError(msg)
                if type(node) is Branch:
                    br_name = node.name
                    br_index = i
                    self.branch_names.add(br_name)
                if type(node) is Input and i != 0:
                    msg = "Input node %s at line" % node.name
                    msg += " %s should be at first position" % node.line
                    msg += " of branching statement"
                    raise SyntaxError(msg)
                if type(node) is Output and i != (len(branch) - 1):
                    msg = "Output node %s at line" % node.name
                    msg += " %s should be at last position" % node.line
                    msg += " of branching statement"
                    raise SyntaxError(msg)

            if br_name:
                del branch[br_index]
                for node in branch:
                    node.branches.add(br_name)
            else:
                noname_branchings.append(branch)

        # second iteration to fix the remaining nodes, which don't have
        # branches
        for branch in noname_branchings:
            s = set()
            for node in branch:
                s.update(node.branches)
            for node in branch:
                node.branches.update(s)


class ParsedFile(object):
    """Plain container bundling the results of a parsed pipeline file.

    Every constructor argument is stored unchanged under the attribute of
    the same name.
    """

    def __init__(self, filters, groupers, splitters, group_filters,
                 mergers, branches, ungroupers, input, output, names):
        self.filters = filters
        self.groupers = groupers
        self.splitters = splitters
        self.group_filters = group_filters
        self.mergers = mergers
        self.branches = branches
        self.ungroupers = ungroupers
        self.input = input
        self.output = output
        self.names = names