Initial flowy commit

This commit is contained in:
Git User 2010-11-05 18:57:01 +01:00
parent b9362ec8f5
commit d6fe38272e
370 changed files with 12160 additions and 0 deletions

72
http-download.flw Normal file
View file

@ -0,0 +1,72 @@
splitter S {}
filter www_req {
dstport = 80
}
filter www_res {
srcport = 80
}
filter www_res1 {
srcport = 80
}
grouper g_www_req {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(srcport) as srcports
}
grouper g_www_res {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(dstport) as dstports
}
grouper g_www_res1 {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 5s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(dstport) as dstports
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches C, B, A
A.srcip = B.dstip
A.srcip = C.dstip
A.srcports = B.dstports
A.srcports = C.dstports
A.bytes < B.bytes
A.bytes < C.bytes
B oi A OR B d A
C o B
C m A
}
export m1
}
ungrouper U {}
"./netflow-trace.h5" -> S
S branch A -> www_req -> g_www_req -> ggf -> M
S branch B -> www_res -> g_www_res -> ggf -> M
S branch C -> www_res1 -> g_www_res1 -> ggf -> M
M->U->"./ungroped.h5"

17
README Normal file
View file

@ -0,0 +1,17 @@
Flowy - Network Flow Analysis Application
Requirements:
Python 2.5 or higher (tested with Python 2.6)
Pytables 2.1 or higher
PLY (Python Lex-Yacc) 2.5 or higher
pyflowtools 3.1 or higher
Usage:
ft2hdf.py - convert flow-tools traces to an HDF file
printhdf.py - print a Flowy HDF file
print_hdf_in_step.py - print two or more HDF files, one record from each
file at each step
flowy.py - the main Flowy program
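Example session (illustrative paths; the ft2hdf.py and flowy.py invocations
follow the scripts' own usage strings, while the printhdf.py argument is an
assumption):
    ./ft2hdf.py /path/to/flow-tools-captures netflow-trace.h5
    ./flowy.py http-download.flw
    ./printhdf.py ungroped.h5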

0
__init__.py Normal file
View file

152
aggr_operators.py Normal file
View file

@ -0,0 +1,152 @@
import options
from tables import UInt32Col, UInt64Col
if options.import_grouper_ops:
external_import = __import__(options.import_grouper_ops)
class last(object):
__slots__ = ['field', 'gr_field', 'field_type', 'last']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.last = None
def __call__(self, record = None):
if record == None:
return self.last
else:
self.last = getattr(record, self.field)
return self.last
class sum(object):
__slots__ = ['field', 'gr_field', 'field_type','sum']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.sum = 0
def __call__(self, record = None):
if record == None:
return self.sum
else:
self.sum += getattr(record, self.field)
return self.sum
class avg(object):
__slots__ = ['field', 'gr_field', 'field_type','sum','n','avg']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.sum = 0
self.n = 0
self.avg = None
def __call__(self, record = None):
if record == None:
if str(self.field_type).find('Int') != -1:
return int(round(self.avg))
else:
return self.avg
else:
self.sum += getattr(record, self.field)
self.n += 1
self.avg = self.sum / self.n
return self.avg
class max(object):
__slots__ = ['field', 'gr_field', 'field_type','max']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.max = float("-inf")
def __call__(self, record = None):
if record == None:
return self.max
else:
new_val = getattr(record, self.field)
if self.max < new_val:
self.max = new_val
return self.max
class min(object):
__slots__ = ['field', 'gr_field', 'field_type','min']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.min = float("inf")
def __call__(self, record = None):
if record == None:
return self.min
else:
new_val = getattr(record, self.field)
if self.min > new_val:
self.min = new_val
return self.min
class count(object):
__slots__ = ['field', 'gr_field', 'field_type','count']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.count = 0
def __call__(self, record = None):
if record == None:
return self.count
else:
self.count += 1
return self.count
class union(object):
__slots__ = ['field', 'gr_field', 'field_type','union']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.union = []
def __call__(self, record = None):
if record == None:
return sorted(set(self.union))
else:
self.union.append(getattr(record, self.field))
return self.union
class bitAND(object):
__slots__ = ['field', 'gr_field', 'field_type','bitAND']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.bitAND = pow(2,field_type.size) - 1 # all 1s for the given size
def __call__(self, record = None):
if record == None:
return self.bitAND
else:
self.bitAND &= getattr(record, self.field)
return self.bitAND
class bitOR(object):
__slots__ = ['field', 'gr_field', 'field_type','bitOR']
def __init__(self, field, gr_field, field_type):
self.field = field
self.gr_field = gr_field
self.field_type = field_type
self.bitOR = 0
def __call__(self, record = None):
if record == None:
return self.bitOR
else:
self.bitOR |= getattr(record, self.field)
return self.bitOR
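All operators above share one calling convention: construct with (field,
gr_field, field_type), call once per matched record to accumulate, and call
with no argument to read the aggregate. A minimal usage sketch (the namedtuple
record is a stand-in for a real flow record, not part of this module):

import aggr_operators
from collections import namedtuple
from tables import UInt64Col

Rec = namedtuple('Rec', ['bytes'])            # stand-in flow record
op = aggr_operators.sum('bytes', 'bytes', UInt64Col())
for r in (Rec(100), Rec(250), Rec(50)):
    op(r)                                     # accumulate per record
print op()                                    # no argument -> aggregate: 400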

BIN
aggr_operators.pyc Normal file

Binary file not shown.

172
allen_index.py Normal file
View file

@ -0,0 +1,172 @@
class LT(object):
"""
X < Y
x before y
"""
def __init__(self, src, target, delta):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.etime, x.etime + self.delta
class GT(object):
"""
X > Y
x after y
"""
def __init__(self, src, target, delta):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime - self.delta, x.stime
class m(object):
"""
X m Y
x meets y (x starts before y)
y should occur at end of x
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.etime, x.etime + self.delta
class mi(object):
"""
X mi Y
inverse x meets y (x starts after y)
y should occur at the beginning of x
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime - self.delta, x.stime
class o(object):
"""
X o Y
x overlaps y (x starts before y)
y should occur at the end of x
"""
def __init__(self, src, target, delta=0):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.etime-self.delta, x.etime+self.delta
class oi(object):
"""
X oi Y
inverse x overlaps y (x starts after y)
"""
def __init__(self, src, target, delta=0):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime, x.stime
class d(object):
"""
X d Y
x during y
"""
def __init__(self, src, target, delta=0):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime, x.stime
class di(object):
"""
X di Y
inverse x during y (y during x)
"""
def __init__(self, src, target, delta=0):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime, x.etime
class f(object):
"""
X f Y
x finishes y (x starts after y, x and y end together)
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.etime - self.delta, x.etime + self.delta
class fi(object):
"""
X fi Y
inverse x finishes y (x is finished by y)
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.etime - self.delta, x.etime + self.delta
class s(object):
"""
X s Y
x starts y (x ends before y, x and y start together)
"""
def __init__(self, src, target, delta=0):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime - self.delta, x.stime + self.delta
class si(object):
"""
X si Y
inverse x starts y (x is started by y)
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime - self.delta, x.stime + self.delta
class EQ(object):
"""
X = Y
X lasts the same time as Y and both start together.
"""
def __init__(self, src, target, delta=1):
self.delta = delta
self.src = src
self.target = target
def __call__(self, x):
return x.stime - self.delta, x.stime + self.delta

BIN
allen_index.pyc Normal file

Binary file not shown.

232
allen_ops.py Normal file
View file

@ -0,0 +1,232 @@
from math import floor, ceil
def inv_op_str(op_name_string):
inverse = {
'LT' : 'GT',
'GT' : 'LT',
'm' : 'mi',
'mi' : 'm',
'o' : 'oi',
'oi' : 'o',
's' : 'si',
'si' : 's',
'd' : 'di',
'di' : 'd',
'f' : 'fi',
'fi' : 'f',
'=' : '='
}
return inverse[op_name_string]
class AllenOpIndex(object):
def __init__(self, index):
self.index = index
def LT(self, x, delta):
"""
X < Y
x before y
"""
return x.etime, x.etime + delta
def GT(self, x, delta):
"""
X > Y
x after y
"""
return x.stime - delta, x.stime
def m(self, x, delta=1):
"""
X m Y
x meets y (x starts before y)
y should occur at end of x
"""
return x.etime, x.etime + delta
def mi(self, x, delta=1):
"""
X mi Y
inverse x meets y (x starts after y)
y should occur at the beginning of x
"""
return x.stime - delta, x.stime
def o(self, x, delta=1):
"""
X o Y
x overlaps y (x starts before y)
y should occur at the end of x
"""
return x.etime-delta, x.etime+delta
def oi(self, x, delta=1):
"""
X oi Y
inverse x overlaps y (x starts after y)
"""
return x.stime, x.stime
def d(self, x, delta=0):
"""
X d Y
x during y
"""
return x.stime, x.stime
def di(self, x, delta=0):
"""
X di Y
inverse x during y (y during x)
"""
return x.stime, x.etime
def f(self, x, delta=1):
"""
X f Y
x finishes y (x starts after y, x and y end together)
"""
# delta disregarded here
return x.etime - delta, x.etime + delta
def fi(self, x, delta=1):
"""
X fi Y
inverse x finishes y (x is finished by y)
"""
return x.etime - delta, x.etime + delta
def s(self, x, delta=1):
"""
X s Y
x starts y (x ends before y, x and y start together)
"""
return x.stime - delta, x.stime + delta
def si(self, x, delta=1):
"""
X si Y
inverse x starts y (x is started by y)
"""
# delta disregarded here
return x.stime - delta, x.stime + delta
def EQ(self, x, delta=1):
"""
X = Y
X lasts the same time as Y
"""
# delta disregarded here
return int((x.stime + x.etime)/2) - delta, int((x.stime +
x.etime)/2) + delta
def composite_intervals(self, op_x_delta_tuples):
intervals = set()
for op_x_delta in op_x_delta_tuples:
op = op_x_delta[0]
args = op_x_delta[1:]
intervals.update(getattr(self, op)(*args))
res = list(intervals)
res.sort()
return res
def LT(x, y, delta=0):
"""
X < Y
x before y
"""
return x.etime < y.stime
def GT(x, y, delta=1):
"""
X > Y
x after y
"""
return x.stime > y.etime
def m(x, y, delta=1):
"""
X m Y
x meets y (x starts before y)
y should occur at end of x
"""
return abs(x.etime - y.stime) < delta
def mi(x, y, delta=1):
"""
X mi Y
inverse x meets y (x starts after y)
y should occur at the beginning of x
"""
return abs(x.stime - y.etime) < delta
def o(x, y, delta=1):
"""
X o Y
x overlaps y (x starts before y)
y should occur at the end of x
"""
return y.stime < x.etime < y.etime
def oi(x, y, delta=1):
"""
X oi Y
inverse x overlaps y (x starts after y)
"""
return y.stime < x.stime < y.etime
def d(x, y, delta=0):
"""
X d Y
x during y
"""
return y.stime < x.stime and x.etime < y.etime
def di(x, y, delta=0):
"""
X di Y
inverse x during y (y during x)
"""
return y.stime > x.stime and x.etime > y.etime
def f(x, y, delta=1):
"""
X f Y
x finishes y (x starts after y, x and y end together)
"""
# delta disregarded here
return x.stime > y.etime and abs(x.etime - y.etime) < delta
def fi(x, y, delta=1):
"""
X fi Y
inverse x finishes y (x is finished by y)
"""
return x.stime < y.etime and abs(x.etime - y.etime) < delta
def s(x, y, delta=1):
"""
X s Y
x starts y (x ends before y, x and y start together)
"""
return x.etime < y.etime and abs(x.stime - y.stime) < delta
def si(x, y, delta=1):
"""
X si Y
inverse x starts y (x is started by y)
"""
# delta disregarded here
return x.etime > y.etime and abs(x.stime - y.stime) < delta
def EQ(x, y, delta=1):
"""
X = Y
x and y start together and end together (within delta)
"""
# delta disregarded here
return abs(x.stime - y.stime) < delta and abs(x.etime - y.etime) < delta
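The module-level predicates only need objects exposing stime and etime, so
they can be exercised in isolation. An illustrative check (the namedtuple is
a stand-in for a group record):

import allen_ops
from collections import namedtuple

Span = namedtuple('Span', ['stime', 'etime'])
x = Span(stime=0, etime=10)
y = Span(stime=12, etime=20)
print allen_ops.LT(x, y)              # True: x ends before y starts
print allen_ops.d(y, x)               # False: y does not lie inside x
print allen_ops.m(x, Span(10, 15))    # True: x meets the second span (within delta=1)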

BIN
allen_ops.pyc Normal file

Binary file not shown.

5
custops.py Normal file
View file

@ -0,0 +1,5 @@
def minus(*args):
res = args[0]
for arg in args[1:]:
res -= arg
return res

BIN
custops.pyc Normal file

Binary file not shown.

197
filter.py Normal file
View file

@ -0,0 +1,197 @@
from copy import deepcopy
from copy import copy
from statement import Field
from record import RecordReader
import time
import profiler
class NoMatch(Exception):
pass
class Filter(object):
def __init__(self,rules, records, br_mask, nbranches):
self.rules = rules
self.records = records
self.br_mask = br_mask
# print "The filter has just been initiated"
# Iteration of the filter happens at the splitter function go()
# In this iteration function, each of the records is matched
# against all of the conditions in each of the filters, and based
# on what condition it matches, it is assigned an appropriate
# branch mask. I.e., if Branch A has a srcport=443, then the record
# that matches this requirement gets a mask of [True, False], else
# if Branch B's filter is matched, then a mask of [False, True] is
# assigned.
def __iter__(self):
print "Started filtering"
# start = time.clock()
# print "Fitlering time started at:", start
for record in self.records:
self.br_mask.reset()
try:
for rule in self.rules:
rule_result = rule.match(record)
self.br_mask.mask(rule.branch_mask, rule_result)
except NoMatch:
continue
branches = self.br_mask.final_result()
if True in branches:
yield record, branches
# print "Finished filtering"
# time_elapsed = (time.clock() - start)
# print "Filtering required:", time_elapsed
#class Field(object):
# def __init__(self, name):
# self.name = name
# def __repr__(self):
# return "Field('%s')"%self.name
# A self-defined deep-copy function that works only for
# simple data types.
def deep_copy(org):
out = dict().fromkeys(org)
for k,v in org.iteritems():
try:
out[k] = v.copy() # dicts, sets
except AttributeError:
try:
out[k] = v[:] # lists, tuples, strings, unicode
except TypeError:
out[k] = v # ints
return out
class BranchMask(object):
def __init__(self, branch_masks, pseudo_branches, n_real_branches):
self.masks = branch_masks
# self.orig_mask = deepcopy(branch_masks)
self.orig_mask = deep_copy(branch_masks)
# self.pseudo_branches = deepcopy(pseudo_branches)
self.pseudo_branches = deep_copy(pseudo_branches)
self.n_real_branches = n_real_branches
def reset(self):
# self.masks = deepcopy(self.orig_mask)
self.masks = deep_copy(self.orig_mask)
#self.masks = copy(self.orig_mask)
# self.masks = self.orig_mask
def mask(self, sub_branches, result):
for br, sub_br, NOT in sub_branches:
res = not result if NOT else result
if sub_br == 0:
self.masks[br][sub_br] = self.masks[br][sub_br] and res
else:
self.masks[br][sub_br] = self.masks[br][sub_br] or res
def final_result(self):
final_mask = {}
for br, mask in self.masks.iteritems():
final_mask[br] = True if False not in mask else False
result = []
for id in xrange(self.n_real_branches):
try:
result.append(final_mask[id])
except KeyError:
gr_res = True
for or_group in self.pseudo_branches[id]:
res = False
for ref in or_group:
if ref[1]:
res = res or not final_mask[ref[0]]
else:
res = res or final_mask[ref[0]]
gr_res = gr_res and res
result.append(gr_res)
return result
class Rule(object):
def __init__(self, branch_mask, operation, args):
self.operation = operation
self.args = args
self.branch_mask = branch_mask
# This match operation is used at both the filtering and group-filtering
# stages, since the group-filter also relies on this Rule class.
def match(self, record):
args = []
for arg in self.args:
if type(arg) is Field: # Used at both the filtering and group-filtering stages
args.append(getattr(record, arg.name))
elif type(arg) is Rule: # Used only at the group-filtering stage
args.append(arg.match(record))
else: # Used at both stages, for literal argument values, e.g., port 80
args.append(arg)
return self.operation(*args)
class PreSplitRule(Rule):
def match(self,record):
result = Rule.match(self,record)
if not result:
raise NoMatch()
class GroupFilter(object):
def __init__(self, rules, records, branch_name, groups_table, index):
self.rules = rules
self.records = records
self.branch_name = branch_name
self.index = index
self.groups_table = groups_table
self.record_reader = RecordReader(self.groups_table)
def go(self):
count = 0
for record in self.records:
for or_rules in self.rules:
matched = False
for rule in or_rules:
if rule.match(record):
matched = True
break
if not matched:
break
if matched:
record.rec_id = count
count += 1
self.index.add(record)
self.groups_table.append(record)
print "Finished filtering groups for branch " + self.branch_name
self.groups_table.flush()
def __iter__(self):
for rec in self.record_reader:
yield rec
class AcceptGroupFilter(GroupFilter):
def __init__(self, records, branch_name, groups_table, index):
GroupFilter.__init__(self, None, records, branch_name, groups_table,
index)
def go(self):
count = 0
for record in self.records:
record.rec_id = count
count += 1
self.index.add(record)
self.groups_table.append(record)
print "Finished filtering groups for branch " + self.branch_name
self.groups_table.flush()
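For illustration, a primitive filter term such as "dstport = 80" corresponds
to a Rule whose args mix a Field reference and a literal. In the sketch below,
operator.eq is only a stand-in for the operator function Flowy actually
resolves, the branch mask is omitted, and statement.Field is assumed to be
constructed from the field name (as the commented-out class above suggests):

import operator
from collections import namedtuple
from statement import Field
from filter import Rule

Rec = namedtuple('Rec', ['dstport'])         # stand-in flow record
rule = Rule(None, operator.eq, [Field('dstport'), 80])
print rule.match(Rec(dstport=80))            # True
print rule.match(Rec(dstport=443))           # False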

BIN
filter.pyc Normal file

Binary file not shown.

178
filter_validator.py Normal file
View file

@ -0,0 +1,178 @@
from validator_common import *
from copy import deepcopy
from record import RecordReader
from statement import FilterRef
from filter import Rule as RuleImpl
from filter import Filter as FilterImpl
from filter import BranchMask
class FilterValidator(object):
def __init__(self, parser):
self.parser = parser
self.n_real_branches = len(self.parser.branch_names)
self.filters = deepcopy(parser.filters)
self.filter_names = dict((filter.name, filter) for filter in self.filters)
self.branch_names = self.parser.branch_names # note! not a copy
# get_input_fields_types() comes from validator_common.py
# get_input_reader()comes from validator_common.py, takes parsed query
# as an input and returns a reader for the parser's input - a reader
# object for an HDF table of flow records
self.fields = get_input_fields_types(get_input_reader(self.parser)).keys()
self.pseudo_branches = {}
# Argument is a reader object that has an access to the description of the
# stored records, and can create a list of available fields
self.input_reader = RecordReader(get_input_reader(parser))
self.impl = self.create_impl()
def check_for_unused_filters(self):
for filter in self.filters:
if len(filter.branches) == 0:
msg = "Warning filter %s "%filter.name
msg += "defined on line %s"%filter.line
msg += " is not used in any branch."
print msg
continue # skips unused filters
def check_duplicate_filter_names(self):
duplicates = {}
for filter in self.filters:
old_val = duplicates.setdefault(filter.name, 0)
duplicates[filter.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Filter(s) %s"%duplicate_names
msg += " is/are all defined more than once."
raise SyntaxError(msg)
def check_field_refs(self):
"Check record field references, for unknown fields"
for filter in self.filters:
for rule in iterate_rules(filter):
check_rule_fields(rule, self.fields)
def change_branch_names_to_id(self):
"""
Turn branch names into numerical ids. This helps with mask creation.
"""
# create numerical branch id's:
self.branches_ids = dict((branch, id)
for id, branch in enumerate(self.parser.branch_names))
self.ids_branches = dict((id, branch)
for id, branch in enumerate(self.parser.branch_names))
for filter in self.filters:
filter.branches = [self.branches_ids[br] for br in filter.branches]
def create_pseudobranches(self):
"""
Finds all Filter ref's and adds their branches to the referenced
filters. If a filter is ORed with another a new branch is created for
each OR-ed rule.
"""
max_id = len(self.branches_ids)
for filter in self.filters:
for or_rule in filter.rules:
if type(or_rule[0]) is not FilterRef:
# Not a composite rule, so there can't be need for
# pseudo branches
break
if len(or_rule) == 1:
# Not an ORed FilterRef. Just add FilterRef's branches
# to the referenced filter
ref_filt = self.parser.names[or_rule[0].name]
ref_filt.branches.update(filter.branches)
else:
# ORed FilterRef: create pseudo branches
pseudo_branch_group = []
for br in filter.branches:
for filter_ref in or_rule:
try:
ref_filt = self.filter_names[filter_ref.name]
except KeyError, ex:
msg = "Filter %s referenced in "%ex.message
msg += "%s is not defined"%filter.name
raise SyntaxError(msg)
id = max_id
max_id += 1
self.branch_names.add(id)
ref_filt.branches.append(id)
pseudo_branch_group.append((id, filter_ref.NOT))
ps_br_set = self.pseudo_branches.setdefault(br, [])
ps_br_set.append(pseudo_branch_group)
def create_masks(self):
branches_masks = {}
rule_masks = {}
for filter in self.filters:
if type(filter.rules[0][0]) is FilterRef:
continue
for branch in filter.branches:
for or_rule in filter.rules:
if len(or_rule) == 1:
#not an OR rule:
branches_masks.setdefault(branch,[True])[0] = True
sub_br_id = 0
else:
branches_masks.setdefault(branch,
[True]).append(False)
sub_br_id = len(branches_masks[branch]) - 1
for rule in or_rule:
rule_masks.setdefault(rule,[]).append((branch,
sub_br_id,
rule.NOT))
self.branches_masks = branches_masks
self.rule_masks = rule_masks
def create_rule_implementations(self):
rules = []
for rule, br_mask in self.rule_masks.iteritems():
# print rule, br_mask
self.replace_nested_rules(rule)
# print rule, br_mask
op = find_op(rule)
args = rule.args
rules.append(RuleImpl(br_mask, op, args))
return rules
def replace_nested_rules(self, rule):
if Rule not in map(type, rule.args):
op = find_op(rule)
args = rule.args
return RuleImpl(None, op, args)
for i, arg in enumerate(rule.args):
if type(arg) is Rule:
rule.args[i] = self.replace_nested_rules(arg)
def validate(self):
self.check_duplicate_filter_names()
self.check_field_refs()
self.change_branch_names_to_id()
for filter in self.filters:
replace_bound_rules(filter)
replace_with_vals(filter)
self.create_pseudobranches()
self.check_for_unused_filters()
self.create_masks()
def create_impl(self):
self.validate()
rules = self.create_rule_implementations()
pseudo_branches = self.pseudo_branches
branch_masks = self.branches_masks
br_mask = BranchMask(branch_masks, pseudo_branches,
self.n_real_branches)
filter_impl = FilterImpl(rules, self.input_reader, br_mask,
self.n_real_branches)
return filter_impl

BIN
filter_validator.pyc Normal file

Binary file not shown.

BIN
flowy-py-files.tar.gz Normal file

Binary file not shown.

BIN
flowy-run/GroupsA-merged.h5 Normal file

Binary file not shown.

BIN
flowy-run/GroupsA.h5 Normal file

Binary file not shown.

BIN
flowy-run/GroupsB-merged.h5 Normal file

Binary file not shown.

BIN
flowy-run/GroupsB.h5 Normal file

Binary file not shown.

BIN
flowy-run/GroupsC.h5 Normal file

Binary file not shown.

BIN
flowy-run/GroupsD.h5 Normal file

Binary file not shown.

BIN
flowy-run/MergedM.h5 Normal file

Binary file not shown.

32
flowy.py Executable file
View file

@ -0,0 +1,32 @@
#!/usr/bin/python
import options
from optparse import OptionParser
import flowy_exec
import sys
import ply
if __name__ == '__main__':
usage = 'usage: %prog [options] input_file.flw'
p = OptionParser(usage)
option_names = ['--time_index_interval_ms', '--max_unsatisfiable_deltas',
'--unsat_delta_threshold_mul', '--do_not_expand_groups']
for opt_name in option_names:
p.add_option(opt_name)
opts, arguments = p.parse_args()
for opt_name in map(lambda x: x[2:], option_names):
opt = getattr(opts, opt_name)
if opt:
setattr(options, opt_name, opt)
if len(arguments) != 1:
sys.stderr.write('Exactly one argument expected\n')
exit(1)
file = arguments[0]
try:
flowy_exec.run(file)
except (ply.yacc.YaccError, SyntaxError) as e:
import sys
sys.stderr.write(str(e)+'\n')

142
flowy_exec.py Normal file
View file

@ -0,0 +1,142 @@
from parser import Parser
from filter_validator import FilterValidator
from splitter_validator import SplitterValidator
from grouper_validator import GrouperValidator
from groupfilter_validator import GroupFilterValidator
from merger_validator import MergerValidator
from ungrouper_validator import UngrouperValidator
from threading import Thread
import options
import profiler
import time
#profiler.profile_on()
start = time.clock()
print start
def run(filename):
#valstart_elapsed = (time.clock() - start)
#print "Parsing and validation started:", valstart_elapsed
p = Parser()
file = open(filename)
doc = file.read()
p.parse(doc)
#inps = get_inputs_list(p)
#print get_input_fields_types(inps[0])
# hdf_file = "../testFT2.h5"
# r = pytables.FlowRecordsTable(hdf_file)
# recordReader = record.RecordReader(r)
f = FilterValidator(p)
# fl = f.impl
s = SplitterValidator(p, f)
spl = s.impl
gr = GrouperValidator(p, s)
# grs = gr.impl
gr_filt = GroupFilterValidator(p, gr)
# Returns a number of group-filter instances
# with accordance to the number of branches.
gr_filters = gr_filt.impl
mr = MergerValidator(p, gr_filt)
mergers = mr.impl
#valend_elapsed = (time.clock() - start)
#print "Parsing and validation finished:", valend_elapsed
splitter_thread = Thread(target=spl.go)
gf_threads = [Thread(target=gf.go)for gf in gr_filters]
splitter_elapsed = (time.clock() - start)
print "Splitter time estarted:", splitter_elapsed
splitter_thread.start()
groupfil_start= (time.clock() - start)
print "Group filter time started:", groupfil_start
for gf_thread in gf_threads:
gf_thread.start()
#Originally it was after gf_thread.start()
splitter_thread.join()
print "Splitter finished"
splitter_elapsed = (time.clock() - start)
print "Splitter time elapsed:", splitter_elapsed
for gf_thread in gf_threads:
gf_thread.join()
groupfil_elapsed = (time.clock() - start)
print "Group filter threads joined:", groupfil_elapsed
merger_threads = [Thread(target=m.go) for m in mergers] # pass the bound method; do not call it here
for merger_thread in merger_threads:
merger_thread.start()
for merger_thread in merger_threads:
merger_thread.join()
merger_elapsed = (time.clock() - start)
print "Merger time elapsed:", merger_elapsed
ung = UngrouperValidator(p, mr)
ungroupers = ung.impl
ungrouper_threads = [Thread(target=u.go) for u in ungroupers]
for ungrouper_thread in ungrouper_threads:
ungrouper_thread.start()
for ungrouper_thread in ungrouper_threads:
ungrouper_thread.join()
# profiler.profile_off()
# import pickle
# stats = profiler.get_profile_stats()
# sorted_stats = sorted(stats.iteritems(), key=lambda a: a[1][1]/a[1][0])
# for st in sorted_stats:
# print st
# print ' '
print "FINISHED!"
overall_elapsed = (time.clock() - start)
print "Overall time elapsed:", overall_elapsed
# fname = mergers[0].merger_table.tuples_table.file_path
# print fname
import ft2hdf
if __name__ == '__main__':
options.delete_temp_files = True
import ply
# import profiler
# profiler.profile_on()
run('www_one_dir.flw')
#
#
# profiler.profile_off()
# import pickle
# stats = profiler.get_profile_stats()
# sorted_stats = sorted(stats.iteritems(), key=lambda a: a[1][0])
# for st in sorted_stats:
# print st
#
# f = open('./profile_stats1', 'w')
# pickle.dump(sorted_stats,f)
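The pipeline can also be driven directly from Python rather than through
flowy.py (the query file name here is illustrative):

import flowy_exec
flowy_exec.run('http-download.flw')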

BIN
flowy_exec.pyc Normal file

Binary file not shown.

183
ft2hdf.py Executable file
View file

@ -0,0 +1,183 @@
#!/usr/bin/python
from pytables import FlowRecordsTable
import pytables
import ftreader
import record
import os
from os.path import split, join, islink
import re
import sys
from bisect import bisect, bisect_left
from operator import itemgetter
from optparse import OptionParser
#def ft2hdf(ft_file, hdf_file):
# ft_fields = ftreader.find_fields(ft_file)
# fields = ftreader.translate_field_names(ft_fields,
# ftreader.default_names_dict)
# field_types = dict((field,pytables.default_ft_types[field])
# for field in fields)
## print field_types
# pytables.create_table_file(hdf_file, field_types)
# rec_table = pytables.FlowRecordsTable(hdf_file)
# # since pytables is initiated with dictionary there is no way to
# # sort the fields order, so we have to translate back in order
# # to keep the fields names order
# ordered_ft_fields = ftreader.translate_field_names(rec_table.fields,
# ftreader.reverse_names_dict)
# flow_set = ftreader.FlowToolsReader(ft_file, ordered_ft_fields)
# for flow in flow_set:
# rec_table.append(flow)
# rec_table.close()
def ft2hdf_single(ft_file, hdf_file):
ft_fields = ftreader.find_fields(ft_file)
fields = ftreader.translate_field_names(ft_fields,
ftreader.default_names_dict)
field_types = dict((field,pytables.default_ft_types[field])
for field in fields)
# print field_types
pytables.create_table_file(hdf_file, field_types)
rec_table = pytables.FlowRecordsTable(hdf_file)
# since pytables is initialized with a dictionary there is no way to
# control the field order, so we have to translate the names back in
# order to keep the original field-name order
ordered_ft_fields = ftreader.translate_field_names(rec_table.fields,
ftreader.reverse_names_dict)
flow_set = ftreader.FlowToolsReader(ft_file,
ordered_ft_fields, rec_table.fields[1:])
rec_set = record.RecordReader(flow_set)
for flow in rec_set:
rec_table.append(flow)
rec_table.close()
def ft2hdf(many_files, hdf_file):
ft_file = many_files[0]
ft_fields = ftreader.find_fields(ft_file) # returns fields present in the flow record
fields = ftreader.translate_field_names(ft_fields, ftreader.default_names_dict)
field_types = dict((field,pytables.default_ft_types[field]) for field in fields)
# print ft_fields
# print fields
pytables.create_table_file(hdf_file, field_types)
rec_table = pytables.FlowRecordsTable(hdf_file)
# since pytables is initialized with a dictionary there is no way to
# control the field order, so we have to translate the names back in
# order to keep the original field-name order
ordered_ft_fields = ftreader.translate_field_names(rec_table.fields, ftreader.reverse_names_dict)
for ft_file in many_files:
flow_set = ftreader.FlowToolsReader(ft_file, ordered_ft_fields, rec_table.fields[1:]) # all fields except 'id_rec'
rec_set = record.RecordReader(flow_set)
for flow in rec_set:
rec_table.append(flow)
rec_table.close()
def printHDF(hdf_file):
r = pytables.FlowRecordsTable(hdf_file)
recordReader = record.RecordReader(r)
for rec in recordReader:
print rec
class FSLoop(Exception):
pass
def findFiles(path, start_time, end_time, filter_files = False):
timeExp = re.compile(r"ft-v05\.(\d{4})-(\d{2})-(\d{2}).(\d{6}).(\d{4})")
time_file_list = []
dir_links = [path]
def walkDirs(dir_links):
file_list = []
more_dir_links = []
for link in dir_links:
for root, dirs, files in os.walk(link):
for file in files:
match = timeExp.search(file)
if match:
element = (int(''.join(match.groups()[:-1])), join(root,file))
if element in time_file_list:
raise FSLoop
file_list.append(element)
for dir in dirs:
if islink(join(root,dir)):
print file
more_dir_links.append(join(root,dir))
return file_list, more_dir_links
while len(dir_links) > 0:
tf, dir_links = walkDirs(dir_links)
time_file_list.extend(tf)
def cmp((a,x),(b,y)):
if a-b < 0:
return -1
elif a-b>0:
return 1
else:
return 0
time_file_list.sort(cmp)
if (filter_files):
keys = [r[0] for r in time_file_list]
begin = 0
end = len(time_file_list)
if start_time is not None:
begin = bisect_left(keys, long(start_time))
if end_time is not None:
end = bisect(keys, long(end_time))
# the start and end time must be converted to long
time_file_list = time_file_list[begin:end]
time_file_list = map(lambda (x,y):y,time_file_list)
return time_file_list
def dateToInt(date):
number_of_digits = [4, 2, 2, 2, 2, 2]
separators = '[- :/]*'
expr = "\d{%s}"%number_of_digits[0]
for digit in number_of_digits[1:]:
expr += separators + "(\d{%s})"%digit
timeExp = re.compile(expr)
result = timeExp.match(date)
if result is None:
raise ValueError("invalid date format")
return date.translate(None, '- :/')
def lotsOfFolders(paths, start_time=None, end_time=None):
full_file_paths=[]
start_time, end_time = [dateToInt(d) if d != None else d for d in (start_time, end_time)]
for path in paths:
full_file_paths.extend(findFiles(path, start_time, end_time, True))
# sort the results
split_paths = map(split, full_file_paths)
split_paths = set(split_paths)
split_paths = sorted(split_paths, key=itemgetter(1))
full_file_paths = [join(x, y) for x, y in split_paths]
return full_file_paths
def main():
usage = 'usage: %prog [options] input_path1 [input_path2 [...]] output_file.h5'
p = OptionParser(usage)
p.add_option('--start-time', '-s')
p.add_option('--end-time', '-e')
options, arguments = p.parse_args()
start_time = options.start_time
end_time = options.end_time
folders = arguments[:-1]
output = arguments[-1]
if not (output[output.find('.h5'):] == '.h5'):
sys.stderr.write('Output file should have an .h5 extension\n')
exit(1)
file_paths = lotsOfFolders(folders, start_time,end_time)
if len(file_paths) < 1:
sys.stderr.write('No flow-tools files found\n')
exit(1)
ft2hdf(file_paths, output)
if __name__ == "__main__":
main()
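The --start-time/--end-time values go through dateToInt, which accepts
YYYY MM DD HH MM SS digit groups separated by any mix of '-', ' ', ':' and '/',
e.g. dateToInt('2008-10-02 12:00:00') returns '20081002120000'. An illustrative
invocation (paths are placeholders):
    ./ft2hdf.py -s '2008-10-02 12:00:00' -e '2008-10-03 12:00:00' /data/flow-tools netflow-trace.h5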

BIN
ft2hdf.pyc Normal file

Binary file not shown.

109
ftreader.py Normal file
View file

@ -0,0 +1,109 @@
import flowtools
from os.path import exists
default_names_dict = {
'dFlows': 'dFlows', 'dOctets': 'bytes', 'dPkts': 'dPkts',
'dst_as': 'dst_as', 'dst_mask': 'dst_mask', 'dst_tag': 'dst_tag',
'dstaddr_raw': 'dstip', 'dstport': 'dstport',
'engine_id': 'engine_id', 'engine_type': 'engine_type',
'exaddr_raw': 'exaddr', 'extra_pkts': 'extra_pkts',
'first_raw': 'stime', 'in_encaps': 'in_encaps',
'input': 'input', 'last_raw': 'etime', 'marked_tos': 'marked_tos',
'nexthop_raw': 'nexthop', 'out_encaps': 'out_encaps',
'output': 'output', 'peer_nexthop_raw': 'peer_nexthop',
'prot': 'prot', 'router_sc': 'router_sc', 'src_as': 'src_as',
'src_mask': 'src_mask', 'src_tag': 'src_tag',
'srcaddr_raw': 'srcip', 'srcport': 'srcport',
'sysUpTime': 'sysUpTime', 'tcp_flags': 'tcp_flags',
'tos': 'tos', 'unix_nsecs': 'unix_nsecs',
'unix_secs': 'unix_secs'}
reverse_names_dict = dict(zip(default_names_dict.values(),
default_names_dict.keys()))
# list of the possible fields in the flow tools file
flow_tools_fields = ['dFlows', 'dOctets', 'dPkts', 'dst_as', 'dst_mask',
'dst_tag', 'dstaddr_raw', 'dstport', 'engine_id',
'engine_type', 'exaddr_raw', 'extra_pkts', 'first_raw',
'in_encaps', 'input', 'last_raw', 'marked_tos',
'nexthop_raw', 'out_encaps', 'output', 'peer_nexthop_raw',
'prot', 'router_sc', 'src_as', 'src_mask', 'src_tag',
'srcaddr_raw', 'srcport', 'sysUpTime', 'tcp_flags', 'tos',
'unix_nsecs', 'unix_secs']
def find_fields(flowtools_file, fields_of_interest=flow_tools_fields):
'''
Returns the list of fields_of_interest which are present in
flowtools_file.
Arguments:
flowtools_file - path to flowtools records file
fields_of_interest - names of the fields for which to check
if none is given all possible fields are searched for.
'''
# read first record to see which fields are present:
flowset = flowtools.FlowSet(flowtools_file)
flow = iter(flowset).next()
# Find which fields are present in the file
# (The flow record should have these attributes):
present_fields = [k for k in fields_of_interest if hasattr(flow,k)]
return present_fields
def translate_field_names(fields_list, dictionary):
'''
Translates names of fields which have keys in the dictionary.
Names not present in the dictionary remain unchanged.
'''
return [dictionary[k] for k in fields_list if dictionary.has_key(k)]
def create_flowtools_value_reader(fields):
def get_fields(record):
x = tuple(getattr(record,attr) for attr in fields)
return x
return get_fields
class FlowToolsReader(object):
def __init__(self, path, ft_fields=None, fields=None):
self.ft_fields = ft_fields if ft_fields else flow_tools_fields
self.fields = fields if fields else ft_fields
self.fields = ('rec_id',) + self.fields
self.get_vals = create_flowtools_value_reader(self.ft_fields)
if exists(path):
self.path = path
else:
raise IOError("File %s cannot be accessed."%path)
def __iter__(self):
flowset = flowtools.FlowSet(self.path)
for id, flow in enumerate(flowset):
yield (id,) + self.get_vals(flow)
raise StopIteration
#ft_file = "../ft-v05.2008-10-02.120001+0200"
#ft_fields = find_fields(ft_file)
#print ft_fields
#fields = translate_field_names(ft_fields, default_names_dict)
#import pytables
#field_types = dict((field,pytables.default_ft_types[field]) for field in fields)
#ordered_ft_fields = translate_field_names(field_types.keys(), reverse_names_dict)
#print ordered_ft_fields
#flow_set = FlowToolsReader(ft_file, ft_fields, ft_fields)
#import record
#rec_set = record.RecordReader(flow_set)
#print len(flow_set.fields)
#unix_secs = 0
#sysuptime = 0
#uptime_set = set()
#for i, flow in enumerate(rec_set):
# if sysuptime != flow.sysUpTime:
# sysuptime = flow.sysUpTime
# uptime_set.add(sysuptime)
# print i, 'ut', flow.sysUpTime - flow.last_raw, 'usecs', flow.unix_secs, 'first - last', flow.last_raw - flow.first_raw
#
#print uptime_set

BIN
ftreader.pyc Normal file

Binary file not shown.

7
gnuplot-http.dat Normal file
View file

@ -0,0 +1,7 @@
# Records # Splitter(s) Grouper(s) Merger(s) Branch A Branch B Records Match
3811 0.6 0.74 2.81 146 143 68
26521 24.8 34.95 144.75 1800 1816 1683
56992 53.06 57.68 443.36 1985 2004 2438
99925 100.03 136.09 960.34 3644 3684 4038
298063 475.83 1415.34 11485 16412 16666 15131
916633 1706.32 50141 50338

6
gnuplot-https.dat Normal file
View file

@ -0,0 +1,6 @@
# Records Splitter(s) Grouper(s) Merger(s) Branch A Branch B Records Match
26521 6.1 6.17 6.23 243 243 486
56992 13.2 13.2 13.23 158 61 219
99925
298063
916633

5
gnuplot.dat Normal file
View file

@ -0,0 +1,5 @@
# Records # Splitter Grouper Merger
3811
26521
56992
298063

238
grouper.py Normal file
View file

@ -0,0 +1,238 @@
import record
import options
from aggr_operators import count
import time
import profiler
class UnsatisfiableDelta(Exception):
pass
class Grouper(object):
def __init__(self, id, modules, aggr_ops, records, branch_name):
self.modules = modules
self.records = records
self.aggr_ops = aggr_ops
self.group_record_fields = self.create_gr_record_fields_list()
self.group_record_fields = ('rec_id',) + self.group_record_fields
self.group_record_types = self.create_gr_record_fields_types()
self.group_records = []
self.branch_name = branch_name
self.Record = record.get_record_class(self.group_record_fields)
#profiler.profile_on()
#profiler.profile_off()
#import pickle
#stats = profiler.get_profile_stats()
#sorted_stats = sorted(stats.iteritems(), key=lambda a: a[1][1]/a[1][0])
#for st in sorted_stats:
# print st
# print
def new_group(self, record):
group = Group(record, self.modules, self.aggr_ops)
return group
def __iter__(self):
count = 0
start2 = time.clock()
#print "Grouping started at:", start2
# For each of the records that have passed either
# of the branched conditions we try to find a
for record in self.records:
# print record
matched = False
count = count + 1
# print len(self.group_records)
del_list = []
try:
for i, group_record in enumerate(self.group_records):
# print i
if group_record.satisfiable:
if group_record.match(record): # match from Group class
matched = True
break
else:
yield self.Record(*((count,)+group_record.export()))
count += 1
del_list.append(i)
except ValueError:
# Group Records list is empty
# WARNING may catch ValueError from another place
# group list is still empty
matched = False # this will signal new group creation
if not matched:
self.group_records.append(self.new_group(record))
# remove exported groups:
for n_removed, i in enumerate(del_list):
# count removed elements with n_removed and compensate
# because positions change when removing elements
# Fortunately del_list is sorted so '-' works as
# a compensation, as all removed elements are before the
# current one
del self.group_records[i - n_removed]
print "Number of records in branch "+self.branch_name, count
for group_record in self.group_records:
yield self.Record(*((count,)+group_record.export()))
count += 1
print "Finished grouping branch "+self.branch_name
#time_elapsed2 = (time.clock() - start2)
#print "Grouping time finished for branch "+self.branch_name, time_elapsed2
#print "Current time is: ", time.clock()
def create_gr_record_fields_list(self):
field_list = []
for module in self.modules:
for op in module.aggr_ops:
field_list.append(op.gr_field)
for op in self.aggr_ops:
field_list.append(op.gr_field)
return tuple(field_list)
def create_gr_record_fields_types(self):
type_list = [None]
for module in self.modules:
for op in module.aggr_ops:
type_list.append(op.field_type)
for op in self.aggr_ops:
if type(op) == count:
type_list[0] = op.field_type # set the type for rec_id
type_list.append(op.field_type)
return tuple(type_list)
class AggrOp(object):
def __init__(self, op, field, gr_field, field_type):
self.op = op
self.field = field
self.gr_field = gr_field # field name used for the grouping of a set of common entries
self.field_type = field_type
def new_op(self):
return self.op(self.field, self.gr_field, self.field_type)
class GrouperModule(object):
def __init__(self, name, rules, aggr_ops):
self.name = name
self.rules = rules
self.aggr_ops = aggr_ops # set of the defined aggregation operations, plus 3 implicit operations
def match(self, record, group):
for rule in self.rules:
if not rule.match(record, group):
return False
return True
class GrouperRule(object):
def __init__(self, op, old_rec_field, new_record_field,
delta=None, relative=False):
self.op = op
self.old_rec_field = old_rec_field
self.new_rec_field = new_record_field
self.delta = delta
self.relative = relative
self.is_shortcut = self.check_is_shortcut()
# print self.op, self.old_rec_field, self.new_rec_field
def check_is_shortcut(self):
if self.delta:
if (self.old_rec_field in ('stime', 'etime') and
self.new_rec_field in ('stime', 'etime')):
return True
return False
def match(self, record, group):
new = getattr(record, self.new_rec_field)
if self.relative:
old = getattr(group.last_record, self.old_rec_field)
else:
old = getattr(group.first_record, self.old_rec_field)
if self.delta:
if self.op(abs(new - old), self.delta):
return True
elif (self.is_shortcut and
not self.op(abs(new - old),
self.delta * options.unsat_delta_threshold_mul )):
# print abs(new - old)/1000.0, (self.delta * options.unsat_delta_threshold_mul)/1000.0
raise UnsatisfiableDelta
else:
return True
else:
return self.op(old, new)
class Group(object):
__slots__ = ['modules', 'modules_aggr_ops', 'aggr_ops', 'records',
'first_record', 'last_record', 'satisfiable',
'n_unsatisfiable_deltas', 'max_unsat_deltas']
def __init__(self, first_record, modules, aggr_ops,
max_unsat_deltas=options.max_unsatisfiable_deltas):
self.first_record = first_record
self.last_record = first_record # changes with each new matched record
self.modules = modules
# list of lists of aggr_ops each corresponding to a module
self.modules_aggr_ops = self.create_modules_aggr()
self.aggr_ops = self.create_aggr_ops(aggr_ops)
self.satisfiable = True
self.n_unsatisfiable_deltas = 0
self.max_unsat_deltas = max_unsat_deltas
def create_modules_aggr(self):
modules_aggr_ops = []
for module in self.modules:
aggr = [op.new_op() for op in module.aggr_ops]
for op in aggr:
op(self.first_record)
modules_aggr_ops.append(aggr)
return modules_aggr_ops
def create_aggr_ops(self, aggr_ops):
aggr = [op.new_op() for op in aggr_ops]
for op in aggr:
op(self.first_record)
return aggr
def match(self, record):
matched = False
for module, aggr_ops in zip(self.modules, self.modules_aggr_ops):
try:
if module.match(record, self):
for op in aggr_ops:
op(record)
matched = True
except UnsatisfiableDelta:
if matched:
continue
self.n_unsatisfiable_deltas += 1
if self.n_unsatisfiable_deltas > self.max_unsat_deltas:
self.satisfiable = False
if matched:
# self.aggr_ops contains the fields from the aggregation statement of the grouper module
# as well as 3 other implicitly stated aggregation operations (etime, stime, records...)
for aggr_op in self.aggr_ops:
aggr_op(record)
# print aggr_op.gr_field, aggr_op()
# print self.records
self.n_unsatisfiable_deltas = 0
return True
else:
return False
def export(self):
fields = []
for aggr_ops in self.modules_aggr_ops:
for op in aggr_ops:
fields.append(op())
for op in self.aggr_ops:
fields.append(op())
return tuple(fields)
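For illustration, a grouper rule such as "etime < stime delta 1s" becomes a
GrouperRule that compares a candidate record against the group's first record.
In this sketch operator.lt stands in for the real operator function, the
namedtuples are stand-ins for flow/group records, and the delta is assumed to
be in milliseconds:

import operator
from collections import namedtuple
from grouper import GrouperRule

Rec = namedtuple('Rec', ['stime', 'etime'])
FakeGroup = namedtuple('FakeGroup', ['first_record', 'last_record'])

rule = GrouperRule(operator.lt, 'etime', 'stime', delta=1000)
grp = FakeGroup(first_record=Rec(stime=0, etime=500),
                last_record=Rec(stime=0, etime=500))
print rule.match(Rec(stime=900, etime=1200), grp)   # |900 - 500| < 1000 -> True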

BIN
grouper.pyc Normal file

Binary file not shown.

179
grouper_validator.py Normal file
View file

@ -0,0 +1,179 @@
from validator_common import *
from copy import deepcopy
from tables import UIntAtom, UIntCol
from grouper import GrouperModule as GrouperModuleImpl
from grouper import Grouper as GrouperImpl
from grouper import GrouperRule as GrouperRuleImpl
from grouper import AggrOp as AggrOpImpl
import profiler
class GrouperValidator(object):
def __init__(self, parser, splitter_validator):
self.parser = parser
self.fields_types = get_input_fields_types(
get_input_reader(self.parser))
self.groupers = deepcopy(parser.groupers)
# print splitter_validator.br_name_to_br
self.br_name_to_br = splitter_validator.br_name_to_br
self.br_name_to_grouper = {}
self.impl = self.create_impl()
' '
def validate(self):
self.check_field_refs()
self.check_duplicate_grouper_names()
for grouper in self.groupers:
self.check_duplicate_module_names(grouper)
for module in grouper.modules:
# Both of these come from the validator_common.py
# module in this case is/are the modules present in
# each instance of the grouper
#print module
replace_bound_rules(module)
replace_with_vals(module)
def check_duplicate_grouper_names(self):
duplicates = {}
for grouper in self.groupers:
old_val = duplicates.setdefault(grouper.name, 0)
duplicates[grouper.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Grouper(s) %s"%duplicate_names
msg += " is/are all defined more than once."
raise SyntaxError(msg)
def check_duplicate_module_names(self, grouper):
duplicates = {}
for module in grouper.modules:
old_val = duplicates.setdefault(module.name, 0)# Insert (key, value) into the dictionary
duplicates[module.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Module(s) %s"%duplicate_names
msg += " is/are all defined more than once in grouper"
msg += " %s."%grouper.name
raise SyntaxError(msg)
# Check for presence of the reference fields
def check_field_refs(self):
for grouper in self.groupers:
for module in grouper.modules:
for rule in module.rules:
# Checks if the rule names of modules match those that were established
# from the flow records (passed as a second argument here). Defined in
# validator_common
check_rule_fields(rule[0], self.fields_types.keys())
# This section checks the correctness of the field names passed to the aggregator
# section of the grouper stage. field_types are defined in init and are also
# obtained in the validator_common module.
for aggr in grouper.aggr:
for arg in aggr.args:
if type(arg) == Field:
mod, _, field = arg.name.partition('.')
if field != '':
if field not in self.fields_types.keys():
msg = 'There is no such field %s, '%arg.name
msg += 'referenced at line %s'%aggr.line
raise SyntaxError(msg)
else:
if mod not in self.fields_types.keys():
msg = 'There is no such field %s, '%arg.name
msg += 'referenced at line %s'%aggr.line
raise SyntaxError(msg)
#
def create_grouper_rules_impl(self, grouper):
modules_list = []
for module in grouper.modules:
rule_impl_list = self.convert_module_rules(module)
aggr_ops_list = self.convert_module_aggr_ops(grouper, module)
module_impl = GrouperModuleImpl(module.name, rule_impl_list,
aggr_ops_list)
modules_list.append(module_impl)
grouper_aggr_ops = []
for aggr in grouper.aggr:
init_args = self.create_aggr_impl_init_args(aggr)
# print init_args
spl = str.split(init_args[1], '.')
if len(spl) > 1:
msg = 'There is no such grouper module %s, '%spl
msg += 'referenced on line %s'%aggr.line
raise SyntaxError(msg)
impl = AggrOpImpl(*init_args)
grouper_aggr_ops.append(impl)
groupers = [GrouperImpl(grouper.name, modules_list, grouper_aggr_ops,
self.br_name_to_br[br_name], br_name)
for br_name in grouper.branches]
for grouper in groupers:
self.br_name_to_grouper[grouper.branch_name] = grouper
# print self.br_name_to_grouper
return groupers
def convert_module_aggr_ops(self, grouper, module):
aggr_ops_list = []
del_list = []
for aggr in grouper.aggr:
op, field, gr_field, field_type = self.create_aggr_impl_init_args(
aggr)
mod_name, _, f = str.partition(field, '.')
if f != '':
if module.name == mod_name:
impl = AggrOpImpl(op, f, gr_field, field_type)
aggr_ops_list.append(impl)
del_list.append(aggr)
for a in del_list:
grouper.aggr.remove(a)
return aggr_ops_list
def create_aggr_impl_init_args(self, aggr):
field = aggr.args[0].name
if '.' in field:
_, _, non_qid_field = field.partition('.')
else:
non_qid_field = field
gr_field = aggr.args[1]
if aggr.op == 'count':
field_type = UIntCol(self.fields_types['rec_id'].itemsize)
elif aggr.op == 'union':
field_type = UIntAtom(self.fields_types[non_qid_field].itemsize)
else:
field_type = UIntCol(self.fields_types[non_qid_field].itemsize)
op = find_op(aggr, 'aggr_operators')
return op, field, gr_field, field_type
def convert_module_rules(self, module):
rule_impl_list = []
for rules in module.rules:
for rule in rules:
op = find_op(rule)
args = [arg.name if type(arg) is Field else arg
for arg in rule.args]
rule_impl_list.append(GrouperRuleImpl(op, *args))
return rule_impl_list
def create_impl(self):
self.validate()
groupers_impls = []
for grouper in self.groupers:
groupers_impls.extend(self.create_grouper_rules_impl(grouper))
# print self.br_name_to_grouper
for br_name in self.br_name_to_br.keys():
if br_name not in self.br_name_to_grouper.keys():
msg = 'There is no grouper for branch %s.'%br_name
raise SyntaxError(msg)
return groupers_impls

BIN
grouper_validator.pyc Normal file

Binary file not shown.

62
groupfilter.py Normal file
View file

@ -0,0 +1,62 @@
from record import RecordReader
from filter import Rule
import profiler
class GroupFilter(object):
def __init__(self, rules, records, branch_name, groups_table, index):
self.rules = rules
self.records = records
self.branch_name = branch_name
self.index = index
self.groups_table = groups_table
self.record_reader = RecordReader(self.groups_table)
def go(self):
count = 0
for record in self.records: # These are the grouped records according to the groupers/modules
# print record
matched = False
for or_rules in self.rules:
# matched = False
for rule in or_rules: # This for-loop, just extracts the rule from the list
# print rule
if rule.match(record):
# print rule.operation
matched = True
break
if not matched:
break
if matched:
record.rec_id = count
count += 1
# Add the record to the TimeIndex: it is indexed under every
# time interval that its start and end times cover.
self.index.add(record)
self.groups_table.append(record)
print "Finished group-filtering for branch " + self.branch_name
self.groups_table.flush()
def __iter__(self):
for rec in self.record_reader:
yield rec
class AcceptGroupFilter(GroupFilter):
def __init__(self, records, branch_name, groups_table, index):
GroupFilter.__init__(self, None, records, branch_name, groups_table,
index)
# NIK commented out on Feb 08
# This function is not used anywhere
# in the code
# def go(self):
# count = 0
# for record in self.records:
# record.rec_id = count
# count += 1
# self.index.add(record)
# self.groups_table.append(record)
# print "Finished filtering groups for branch " + self.branch_name
# self.groups_table.flush()

BIN
groupfilter.pyc Normal file

Binary file not shown.

141
groupfilter_validator.py Normal file
View file

@ -0,0 +1,141 @@
import options
from copy import deepcopy
from validator_common import *
from groupfilter import Rule as RuleImpl
from groupfilter import GroupFilter as GroupFilterImpl
from groupfilter import AcceptGroupFilter as AcceptGroupFilterImpl
from operators import NOT
import pytables
from timeindex import TimeIndex
import time
class GroupFilterValidator(object):
# The initiation of the GroupFilterValidator happens only once.
def __init__(self, parser, grouper_validator):
self.parser = parser
self.grouper_validator = grouper_validator
self.filters = deepcopy(parser.group_filters)
self.branches_fields = self.get_branches_fields()
self.br_name_to_grouper = grouper_validator.br_name_to_grouper
self.br_name_to_gr_filter = {}
self.impl = self.create_impl()
def check_duplicate_filter_names(self):
duplicates = {}
for filter in self.filters:
old_val = duplicates.setdefault(filter.name, 0)
duplicates[filter.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Group filter(s) %s"%duplicate_names
msg += " is/are all defined more than once."
raise SyntaxError(msg)
def check_field_refs(self):
"Check record field references, for unknown fields"
for filter in self.filters:
for rule in iterate_rules(filter):
for branch in filter.branches:
check_rule_fields(rule, self.branches_fields[branch])
def get_branches_fields(self):
branches_fields = {}
for grouper in self.grouper_validator.impl:
branches_fields[grouper.branch_name] = grouper.group_record_fields
return branches_fields
def validate(self):
self.check_for_unused_filters()
self.check_field_refs()
self.check_duplicate_filter_names()
def check_for_unused_filters(self):
for filter in self.filters:
if len(filter.branches) == 0:
msg = "Warning groupfilter %s "%filter.name
msg += "defined on line %s"%filter.line
msg += " is not used in any branch."
print msg
continue # skips unused filters
def get_rule_impl(self, rule):
op = find_op(rule)
args = [self.get_rule_impl(arg) if type(arg) == Rule else arg
for arg in rule.args]
impl = RuleImpl(None, NOT(op) if rule.NOT else op, args)
return impl
def get_rules_impl(self, filter):
replace_bound_rules(filter)
replace_with_vals(filter)
rules_impl = []
for or_rule in filter.rules:
or_rule_list = []
for rule in or_rule:
impl = self.get_rule_impl(rule)
or_rule_list.append(impl)
rules_impl.append(or_rule_list)
return rules_impl
def create_impl(self):
#start = time.clock()
#print "GF validation started at:", start
self.validate()
group_filters_impl = []
for filter in self.filters:
rules_impl = self.get_rules_impl(filter)
for br_name in filter.branches:
records = self.br_name_to_grouper[br_name]
index = TimeIndex(5000)
grouper = records
field_types = dict(zip(grouper.group_record_fields,
grouper.group_record_types))
# print records
fname = options.temp_path + options.groups_file_prefix
fname += br_name+".h5"
if options.delete_temp_files: if_exists_delete(fname)
file = pytables.create_table_file(fname, field_types)
groups_table = pytables.FlowRecordsTable(fname) # Create separate table files for each of the branches
filt_impl = GroupFilterImpl(rules_impl, records, br_name,
groups_table, index)
group_filters_impl.append(filt_impl)
self.br_name_to_gr_filter = dict((filt.branch_name, filt)
for filt in group_filters_impl)
# Check for branches that don't have group filters and put accept
# filters on them
for br_name in self.br_name_to_grouper.keys():
if br_name not in self.br_name_to_gr_filter.keys():
# print "We get here if the group-filter is removed"
records = self.br_name_to_grouper[br_name]
index = TimeIndex(5000)
grouper = records
field_types = dict(zip(grouper.group_record_fields,
grouper.group_record_types))
fname = options.temp_path + options.groups_file_prefix
fname += br_name+".h5"
if options.delete_temp_files: if_exists_delete(fname)
file = pytables.create_table_file(fname, field_types)
groups_table = pytables.FlowRecordsTable(fname)
filt_impl = AcceptGroupFilterImpl(records, br_name,
groups_table, index) # This class is called in case some branch is missing
# the definition of a group-filter. Essentially a plain
# GroupFilter, but with no rules as an argument.
self.br_name_to_gr_filter[br_name] = filt_impl
group_filters_impl.append(filt_impl)
#time_elapsed = (time.clock() - start)
#print "GF Validation required:", time_elapsed
return group_filters_impl

BIN
groupfilter_validator.pyc Normal file

Binary file not shown.

BIN
h5ports.h5 Normal file

Binary file not shown.

66
http-download.flw Normal file
View file

@ -0,0 +1,66 @@
splitter S {}
filter www_req {
dstport = 80
}
filter www_res {
srcport = 80
}
filter www_res1 {
srcport = 80
}
grouper g_www_req {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(srcport) as srcports
}
grouper g_www_res {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(dstport) as dstports
}
grouper g_www_res1 {
module g1 {
srcip = srcip
dstip = dstip
etime < stime delta 5s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(dstport) as dstports
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches C, B, A
A.srcip = B.dstip
A.srcports = B.dstports
A.bytes < B.bytes
B oi A OR B d A
}
export m1
}
ungrouper U {}
"./netflow-trace.h5" -> S
S branch A -> www_req -> g_www_req -> ggf -> M
S branch B -> www_res -> g_www_res -> ggf -> M
S branch C -> www_res1 -> g_www_res1 -> ggf -> M
M->U->"./ungroped.h5"

44
https-flows.flw Normal file
View file

@ -0,0 +1,44 @@
splitter S {}
filter fil_dstport {
dstport = 443
}
filter fil_srcport {
srcport = 443
}
grouper g_fil_dstport {
module g1 {
}
aggregate sum(bytes) as bytes, bitOR(tcp_flags) as flags
# aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
# bitOR(tcp_flags) as flags, union(srcport) as srcports
}
grouper g_fil_srcport {
module g1 {
}
aggregate sum(bytes) as bytes, bitOR(tcp_flags) as flags
# aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
# bitOR(tcp_flags) as flags, union(dstport) as dstports
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches B, A
A m B delta 1440min
}
export m1
}
ungrouper U {}
"./netflow-trace.h5" -> S
S branch A -> fil_dstport -> g_fil_dstport -> ggf -> M
S branch B -> fil_srcport -> g_fil_srcport -> ggf -> M
M->U->"./ungroped.h5"

190
merger.py Normal file
View file

@ -0,0 +1,190 @@
class MergerStorage(object):
def __init__(self, id, tuples_table, record_class):
self.id = id
self.tuples_table = tuples_table
self.RecordClass = record_class
def put(self, gr_rec_tuple):
self.tuples_table.append(self.RecordClass(*gr_rec_tuple))
def flush(self):
self.tuples_table.flush()
class MergerRule(object):
def __init__(self, op, args, br_to_record):
# The records are changed externally from branches:
self.br_to_record = br_to_record
self.args = args
self.op = op
def match(self):
# The records are changed externally by another object
args = []
for arg in self.args:
if type(arg) is MergerRule:
args.append(arg.match())
elif type(arg) is tuple:
br, field = arg
record = self.br_to_record[br]
if field:
# normal rule get field of the record
args.append(getattr(record, field))
else:
# allen rule, argument is the record
args.append(record)
else:
args.append(arg)
return self.op(*args)
class Reject(Exception):
pass
class Accept(Exception):
pass
# This class implements only the outermost branch loop; the nested loops are handled by the MergerBranch classes below
class Merger(object):
def __init__ (self, name, br_name, records, name_to_branch,
next_branches_names, export_branches, br_to_record,
index, index_rules, rules, merger_table):
self.name = name
self.merger_table = merger_table
self.records = records
self.export_branches = export_branches
self.br_name = br_name
self.name_to_branch = name_to_branch
self.rules = rules
self.index = index
self.br_to_record = br_to_record
self.next_branches_names = next_branches_names
self.remaining_rec = dict((name, None) for name
in next_branches_names)
self.index_rules = index_rules
@property
def next_branch(self):
if not self.next_branches_names:
return False
return self.name_to_branch[self.next_branches_names[0]]
def match(self):
for rule in self.rules:
if not rule.match():
return False
return True
def pass_allen_indices_down(self, record):
new_br_remaining_rec = {}
for rules in self.index_rules:
br_name = rules[0].target
rec_set = set()
branch = self.name_to_branch[br_name]
index = branch.index
for rule in rules:
interval = rule(record)
rec_set.update(index.get_interval_records(*interval))
# note: remaining_rec is pre-filled with None for branches the parent has not restricted yet
set_from_parent = self.remaining_rec[br_name]
if set_from_parent:
# there is a set of records defined by parent
# do an intersection
new_br_remaining_rec[br_name] = rec_set & set_from_parent
else:
# no set from parent, just add this rec_set
new_br_remaining_rec[br_name] = rec_set
# pass to next branch
if len(new_br_remaining_rec) == 0:
self.next_branch.remaining_rec = self.remaining_rec
else:
self.next_branch.remaining_rec = new_br_remaining_rec
# print "passing",self.next_branch.remaining_rec
def go(self):
for rec in self.records.record_reader:
self.br_to_record[self.br_name] = rec
self.pass_allen_indices_down(rec)
self.next_branch.next()
print "Finished merging branches: ",
print [self.br_name] + self.next_branches_names
self.merger_table.flush()
self.merger_table.tuples_table.close()
class MergerBranch(Merger):
def __init__ (self, br_name, records, name_to_branch, next_branches_names,
export_branches, br_to_record ,index, index_rules, rules,
merger_table):
Merger.__init__(self, None, br_name, records, name_to_branch,
next_branches_names, export_branches, br_to_record,
index, index_rules, rules, merger_table)
def next(self):
remaining = self.remaining_rec[self.br_name]
for rec in self.records.record_reader.read_rows_list(remaining):
self.br_to_record[self.br_name] = rec
if not self.match():
continue
self.pass_allen_indices_down(rec)
try:
self.next_branch.next()
except Accept:
# the reject modules did not reject this tuple
res = tuple(self.br_to_record[br].rec_id for br
in self.export_branches)
self.merger_table.put(res)
except Reject:
# this tuple matched reject module so we go on
pass
class MergerLastBranch(Merger):
def __init__ (self, br_name, records, name_to_branch, next_branches_names,
export_branches, br_to_record ,index, index_rules, rules,
merger_table):
Merger.__init__(self, None, br_name, records, name_to_branch,
next_branches_names, export_branches, br_to_record,
index, index_rules, rules, merger_table)
def next(self):
remaining = self.remaining_rec[self.br_name]
for rec in self.records.record_reader.read_rows_list(remaining):
self.br_to_record[self.br_name] = rec
if not self.match():
continue
# last branch and no reject branches
# append the record
res = tuple(self.br_to_record[br].rec_id for br
in self.export_branches)
self.merger_table.put(res)
class MergerRejectBranch(Merger):
def __init__ (self, br_name, records, name_to_branch, next_branches_names,
export_branches, br_to_record ,index, index_rules, rules,
merger_table):
Merger.__init__(self, None, br_name, records, name_to_branch,
next_branches_names, export_branches, br_to_record,
index, index_rules, rules, merger_table)
def next(self):
remaining = self.remaining_rec[self.br_name]
for rec in self.records.record_reader.read_rows_list(remaining):
self.br_to_record[self.br_name] = rec
if self.match():
raise Reject # goes all the way up to last normal branch
else:
try:
if self.next_branch:
self.pass_allen_indices_down(rec)
self.next_branch.next()
else:
# this is the last branch, so go on
pass
except Accept:
# this Accept is from lower reject-branch so just
# go on and raise Accept when this branch finishes
pass
raise Accept
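# --- Minimal usage sketch (not part of the original module) -----------------
# Illustrates two pieces of the machinery above on made-up, in-memory data:
# a MergerRule evaluating "A.bytes < B.bytes", and the Accept/Reject
# exceptions that steer the nested branch loops: a reject-style branch raises
# Reject to drop the current tuple and Accept once no candidate matched,
# which the normal branch above catches. The records and the reject_branch
# function below are hypothetical stand-ins, not Flowy data structures.
if __name__ == '__main__':
    import operator
    from collections import namedtuple

    Rec = namedtuple('Rec', 'bytes')
    br_to_record = {'A': Rec(bytes=100), 'B': Rec(bytes=250)}
    rule = MergerRule(operator.lt, [('A', 'bytes'), ('B', 'bytes')], br_to_record)
    print rule.match()                    # True, since 100 < 250

    def reject_branch(candidates, current):
        for rec in candidates:
            if rec == current:            # a reject rule matched
                raise Reject
        raise Accept                      # nothing matched: the tuple survives

    accepted = []
    for rec in [1, 2, 3]:                 # stand-in for the export branch loop
        try:
            reject_branch([2], rec)
        except Accept:
            accepted.append(rec)
        except Reject:
            pass
    print accepted                        # [1, 3]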

BIN
merger.pyc Normal file

Binary file not shown.

505
merger_validator.py Normal file
View file

@ -0,0 +1,505 @@
from validator_common import *
from copy import deepcopy
from tables import UIntCol
from merger import MergerStorage
from merger import Merger as MergerImpl
from merger import MergerBranch as MergerBranchImpl
from merger import MergerLastBranch as MergerLastBranchImpl
from merger import MergerRejectBranch as MergerRejectBranchImpl
from merger import MergerRule as MergerRuleImpl
import itertools
import allen_ops
import pytables
import record
import options
class MergerValidator(object):
def __init__(self, parser, gr_filter_validator):
self.parser = parser
self.gr_filter_validator = gr_filter_validator
self.mergers = deepcopy(parser.mergers)
# The last field returns a list of the present fields for each branch
# ('rec_id', 'etime', 'stime', 'records', 'srcip', 'dstip', 'bytes', 'n', 'flags', 'srcports')
# ('rec_id', 'etime', 'stime', 'records', 'srcip', 'dstip', 'bytes', 'n', 'flags', 'dstports')
self.branches_fields = gr_filter_validator.branches_fields
# A simple dictionary mapping of branch name to a GroupFilter
# {'A': <groupfilter.GroupFilter object at 0x9c3d66c>, 'B': <groupfilter.GroupFilter object at 0x9c43ccc>}
self.br_name_to_gr_filter = gr_filter_validator.br_name_to_gr_filter
# Finds each merger's export module and checks that it is defined
# Returns a dictionary mapping merger name to its export module
self.megers_export_modules = self.find_mergers_export_modules()
# Returns the size of the field type of the 'records' field, 4 bytes
self.id_size = self.get_id_size()
self.impl = self.get_mergers_impl()
# Returns the size of the field type of the 'records' field, 4 bytes
def get_id_size(self):
rec_reader = self.gr_filter_validator.impl[0].records
field_types = dict(zip(rec_reader.group_record_fields,
rec_reader.group_record_types))
id_size = field_types['records'].itemsize
return id_size
# Check for duplicate merger names
def check_duplicate_merger_names(self):
duplicates = {}
for merger in self.mergers:
old_val = duplicates.setdefault(merger.name, 0)
duplicates[merger.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Merger(s) %s"%duplicate_names
msg += " is/are all defined more than once."
raise SyntaxError(msg)
# Check for duplicate module names
def check_duplicate_module_names(self, merger):
duplicates = {}
for module in merger.modules:
old_val = duplicates.setdefault(module.name, 0)
duplicates[module.name] = old_val + 1
duplicate_names = [k for k,v in duplicates.iteritems() if v > 1]
if len(duplicate_names) > 0:
msg = "Module(s) %s"%duplicate_names
msg += " is/are all defined more than once in merger"
msg += " %s."%merger.name
raise SyntaxError(msg)
# Finds each merger's export module and checks that it is defined
# Returns a dictionary mapping merger name to its export module
def find_mergers_export_modules(self):
merger_to_export_module = {}
for merger in self.mergers:
exp = None
for module in merger.modules:
if merger.export == module.name:
exp = module
break
if exp:
merger_to_export_module[merger.name] = exp
# print merger_to_export_module
else:
msg = "Merger %s"%merger.name
msg += " export module %s is not defined."%merger.export
return merger_to_export_module
#--------------------------------------ALLEN CHECKS-------------------------------------#
#All the operations on rules are around a sample set like: {'M': Module('m1', 38, [[Rule('EQ', 40, [Field('A.srcip'), Field('B.dstip')], False)], [Rule('EQ', 41, [Field('A.srcports'), Field('B.dstports')], False)], [Rule('LT', 42, [Field('A.bytes'), Field('B.bytes')], False)], [AllenRule('oi', 43, [Field('B'), Field('A')], False), AllenRule('d', 43, [Field('B'), Field('A')], False)]], ['B', 'A'])}
#Returns only the Allen rules
def iterate_module_allen_op_groups(self, merger):
for module in merger.modules:
for rules in module.rules:
if type(rules[0]) is not AllenRule:
continue
else:
for op in rules:
yield op
# Reorders the arguments of allen operations (inverting the op) to follow the merger branch order, if not already so
def order_allen_ops_args(self, merger):
order = self.get_merger_branches_order(merger)#Orders merger branches, exported module's branches being first
arg_combinations = tuple(itertools.combinations(order, 2))#combinations('ABCD', 2) --> AB AC AD BC BD CD
for allen_op in self.iterate_module_allen_op_groups(merger):#Returns only the Allen rules
first, second = allen_op.args[:2] # Returns Field('B') Field('A')
op = allen_op.op # operations like oi, d
if (first.name, second.name) not in arg_combinations:
allen_op.args = [second, first] + allen_op.args[2:]# reverse names
allen_op.op = allen_ops.inv_op_str(op)# and operations
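# e.g. with branch order (A, B) a statement "B oi A" is rewritten as "A o B"
# (assuming allen_ops.inv_op_str maps each relation to its Allen inverse), so
# the branch that comes earlier in the merger order is always the left argument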
# A number of different checks of the AllenRule
def check_allen_ops(self, merger):
allen_arg_pairs = []
arg_pairs_to_line = {}
for module in merger.modules:
for rules in module.rules:
if type(rules[0]) is not AllenRule:
continue
first_arg = rules[0].args[0].name # Get the branch names influenced by the AllenRule
second_arg = rules[0].args[1].name
line = rules[0].line
order = (first_arg, second_arg)
allen_arg_pairs.append(order)# [('B', 'A')]
self.check_allen_satisfiability(arg_pairs_to_line, order, line)
self.check_allen_consistency(first_arg, second_arg, rules)
self.check_allen_deltas(rules)
self.check_allen_reachability(allen_arg_pairs, merger)
# The following 3 methods run different tests on the allen arguments and rules
def check_allen_satisfiability(self, arg_pairs_to_line, order, line):
if arg_pairs_to_line.has_key(order):
msg = "Unsatisfiable Allen op group. "
msg += "All allen ops concerning a pair of branches should"
msg += " be connected with and OR into a single group "
msg += "within a single module.\n"
msg += "Argument pair %s on line %s"%(order, line)
msg += " is also used on line %s."%arg_pairs_to_line[order]
raise SyntaxError(msg)
else:
arg_pairs_to_line[order] = line
def check_allen_consistency(self, first_arg, second_arg, rules):
for al_op in rules:
first = al_op.args[0].name
second = al_op.args[1].name
if (first != first_arg or second != second_arg):
msg = "Inconsistent group of Allen statements "
msg += "on line %s"%rules[0].line
msg += ": %s, %s.\n"%(first, second)
msg += "All branches in this group should have "
msg += "%s and %s"%(first_arg, second_arg)
msg += " as left and righthand side arguments "
msg += "respectively."
raise SyntaxError(msg)
def check_allen_deltas(self, rules):
for al_op in rules:
if al_op.op == 'LT' or al_op.op == 'GT':
if len(al_op.args) < 3:
msg = "Allen op < or > on line %s "%al_op.line
msg += " should have delta explicitly stated."
raise SyntaxError(msg)
# A check for reachability of subsequent branches from the first one
def check_allen_reachability(self, allen_arg_pairs, merger):
br_order = self.get_merger_branches_order(merger)
# check reachability through allen index from initial branch
# of export module:
reachable = br_order[0:1] # list of first branch of exp module
unreachable = br_order[1:]
change = True
while(change):
change = False
for arg1, arg2 in allen_arg_pairs:
if arg1 in reachable and arg2 in unreachable:
unreachable.remove(arg2)
reachable.append(arg2)
change = True
if len(unreachable) > 0:
msg = "Branch(es): %s"%unreachable
msg += " in merger %s"%merger.name
msg += " is/are unreachable through an allen op or chain of"
msg += " allen ops from the first branch of the exported module"
raise SyntaxError(msg)
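# e.g. with branch order [A, B, C] and only the allen pair ('A', 'B') present,
# branch C is never reached from A and a SyntaxError is raised for it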
#--------------------------------------END ALLEN CHECKS---------------------------------#
# Orders the merger modules s.t. the exported module comes first
def order_modules(self):
for merger in self.mergers:
exp_module = self.megers_export_modules[merger.name]
new_modules_order = [exp_module]
new_modules_order += [m for m in merger.modules if m != exp_module]
merger.modules = new_modules_order
# Checks that the modules are interconnected among each other with at least one branch
def check_for_disjoint_modules(self):
for merger in self.mergers:
exp_module = self.megers_export_modules[merger.name]
exp_branches = set(exp_module.branches)
for module in merger.modules:
branches = set(module.branches)
# NOTE & is set intersection
if len(exp_branches & branches) < 1:
msg = "Merger module %s.%s"%(merger.name,module.name)
msg += " in has no overlaping branches with the"
msg += " export module."
raise SyntaxError(msg)
# Check the validity of the AllenRule, by seeing if the branch names are all defined
def check_branch_id_ref(self, rule, module_branches):
for arg in rule.args:
if type(arg) is Field:
id_ref = arg.name
if id_ref not in self.br_name_to_gr_filter.keys():
msg = 'Branch %s referenced on line'%id_ref
msg += ' %s is not defined.'%rule.line
raise SyntaxError(msg)
if id_ref not in module_branches:
msg = 'Branch %s referenced on line'%id_ref
msg += " %s "%rule.line
msg += "is not in module's branches statement."
raise SyntaxError(msg)
# Check the validity of the Rule, GrouperRule and statements like A.bytes
def check_qid_field_ref(self, rule, module_branches):
for arg in rule.args:
if type(arg) is Field:
qid_field = arg.name
branch, _, field = qid_field.partition('.') #Separates statements like A.bytes
try:
if field not in self.branches_fields[branch]:
msg = 'Wrong field %s on line %s. '%(qid_field,
rule.line)
msg += 'Branch %s does not have field %s.'%(branch,
field)
raise SyntaxError(msg)
except KeyError:
msg = 'Branch %s referenced on line'%branch
msg += ' %s is not defined'%rule.line
raise SyntaxError(msg)
if branch not in module_branches:
msg = 'Branch %s referenced on line'%branch
msg += " %s "%rule.line
msg += "is not in module's branches statement."
raise SyntaxError(msg)
# Orders merger branches with the exported module's branches being first
def get_merger_branches_order(self, merger):
br_order = []
# first add export module
for module in merger.modules:
if module.name == merger.export:
for br in module.branches:
if br not in br_order:
br_order.append(br)
# add all the others:
for module in merger.modules:
for br in module.branches:
if br not in br_order:
br_order.append(br)
return br_order
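# e.g. if the export module declares branches B, A and another module declares
# C, A, the resulting order is ['B', 'A', 'C']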
#
def order_merger_rules(self, merger):
"""
Produces mapping between incrementally larger available branches tuples
(A,B,C,etc) ordered as they will appear in the implementation.
"""
br_order = self.get_merger_branches_order(merger)
needed_brs_to_rule = {}
for module in merger.modules:
replace_with_vals(module)
replace_bound_rules(module)
for rules in module.rules:
rule_branches = self.get_rule_needed_branches(rules[0])
ordered_branches = tuple(br for br in br_order
if br in rule_branches)
if len(rules) > 1:
rule = Rule('or_op', 0, rules)
else:
rule = rules[0]
needed_brs_to_rule.setdefault(ordered_branches,
[]).append(rule)
avail_to_rules = {}
tup = ()
# create sets - needed for the set intersection operation
needed_sets = map(set, needed_brs_to_rule.keys())
# incrementaly add branches to the tuple of available branches
# and check which rules have their branch needs satisfied
for br in br_order:
tup += (br,)
# find how many of the needed branches are in this tuple
# of branches. It makes elementwise intesection of the sets
# of the needed branches and the tuple of available branches
intersect = map(set(tup).intersection , needed_sets )
for el, intersection, key in zip(needed_sets , intersect,
needed_brs_to_rule.keys()):
if len(intersection) == len(el):
# Length is the same, which means all needed branches
# are present. Remove these elements, take the rules from
# the needed_brs_to_rule and delete the key there to
# keep the zip() in sync
needed_sets.remove(el)
avail_to_rules[tup] = needed_brs_to_rule[key]
del needed_brs_to_rule[key]
return avail_to_rules
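# e.g. with branch order (A, B, C) a rule that only needs A and B is keyed
# under the tuple ('A', 'B'), while a rule that also touches C only becomes
# available under ('A', 'B', 'C')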
#
def get_rule_needed_branches(self, rule):
args_list = set()
for sub_rule in iterate_subrules(rule):
for arg in sub_rule.args:
if type(arg) is Field:
args_list.add(arg.name)
for arg in rule.args:
if type(arg) is Field:
args_list.add(arg.name)
if type(rule) is AllenRule:
return list(args_list)
else:
return [qid.partition('.')[0] for qid in args_list]
# Validates the correctness of the merger stage
def validate(self):
self.check_duplicate_merger_names()
for merger in self.mergers:
self.check_duplicate_module_names(merger)
for module in merger.modules:
# Checks the whole rule list to see that all
# the rules fall into [Rule, GrouperRule, AllenRule]
# Returns the actual rules
for rule in iterate_rules(module):
# Checks that all the rule entries are correctly specified
if type(rule) is AllenRule:
self.check_branch_id_ref(rule, module.branches)
else:
self.check_qid_field_ref(rule, module.branches)
# Reorders allen operation arguments (inverting the op) to follow the merger branch order
self.order_allen_ops_args(merger)
# Performs several checks on the branches and the operations (consistency, reachability, etc.)
self.check_allen_ops(merger)
# Orders the merger modules s.t. the exported module comes first
self.order_modules()
# Checks that the modules are interconnected among each other with at least one branch
self.check_for_disjoint_modules()
# Get the allen indexing operations for each branch.
def get_branches_allen_index_ops(self, merger):
"""
Get the allen indexing operations for each branch.
"""
br_to_allen_ind_ops = {}
for module in merger.modules:
for rules in module.rules:
if type(rules[0]) != AllenRule:
continue
br = rules[0].args[0].name
br_to_allen_ind_ops.setdefault(br, []).append(rules)
return br_to_allen_ind_ops
#
def get_rule_impl(self, rule, br_to_record):
if type(rule) == AllenRule:
op = find_op(rule, module='allen_ops')
args = [ (arg.name, None)
if type(arg) == Field else arg
for arg in rule.args]
else:
args = []
op = find_op(rule)
for arg in rule.args:
if type(arg) == Rule:
arg_impl = self.get_rule_impl(arg, br_to_record)
elif type(arg) == Field:
branch, _, field = arg.name.partition('.')
arg_impl = (branch, field)
else:
arg_impl = arg
args.append(arg_impl)
return MergerRuleImpl(op, args, br_to_record)
# Create indexing rules implementation for AllenRules
def get_index_rule_impl(self, rules):
res = []
for or_rules in rules:
or_rules_impl = []
for rule in or_rules:
op = find_op(rule, 'allen_index')
args = [arg.name if type(arg) == Field else arg
for arg in rule.args]
# replace with values
args = [arg.value if type(arg) == Arg else arg
for arg in args]
#[<allen_index.oi object at 0x9f5adcc>, <allen_index.d object at 0x9f5ae0c>]
or_rules_impl.append(op(*args))
res.append(or_rules_impl)
return res
# Creates a file MergedM.h5 for further storage of the merged files
def get_merger_table_impl(self, merger):
fields = self.megers_export_modules[merger.name].branches
types = [UIntCol(self.id_size) for _ in fields]
field_types = dict(zip(fields,types))
recordClass = record.get_record_class(fields, types)
# TODO fix file names
fname = options.temp_path + options.merger_file_prefix
fname += merger.name + ".h5"
if options.delete_temp_files: if_exists_delete(fname)
pytables.create_table_file(fname, field_types)
mergerTable = pytables.FlowRecordsTable(fname)
return MergerStorage(merger.name, mergerTable, recordClass)
# Actual implementation of the merger stage
def get_merger_impl(self, merger):
# Create merger storage
merger_table = self.get_merger_table_impl(merger)
# Create indexing rules implementation
br_to_index_rule_impl = {}
# {'B': [[AllenRule('oi', 43, [Field('B'), Field('A')], False), AllenRule('d', 43, [Field('B'), Field('A')], False)]]}
for br, rules in self.get_branches_allen_index_ops(merger).iteritems():
br_to_index_rule_impl[br] = self.get_index_rule_impl(rules)# List of allen index rules implemented
for br in self.get_merger_branches_order(merger):#orders branches with the exported branch being first
if br not in br_to_index_rule_impl.keys():
br_to_index_rule_impl[br] = []
# some "globals" shared among branches or needed for their creation
needed_brs = self.order_merger_rules(merger) # Re-orders the rules as they will appear in the implementation
tup = () # tuple of available branches
name = merger.name
br_order = self.get_merger_branches_order(merger) # Returns reversely-ordered branch names of the merger
export_branches = self.megers_export_modules[merger.name].branches # Returns branch names contained in the export module
br_to_record = {}
name_to_branch = {}
merger_impl = None
for br_name in br_order: # For each branch in the ordered branch set
tup += (br_name,)
next_branches_names = [br for br in br_order if br not in tup]
records = self.br_name_to_gr_filter[br_name] # Group-filters associated with each branch
index_rules = br_to_index_rule_impl[br_name] # Allen index rule associated with each branch
index = records.index # Time index object
if len(tup)<2: # If tuple contains only one branch, then execute the initial Merger class
# first branch
rules = []
impl = MergerImpl(name, br_name, records, name_to_branch,
next_branches_names, export_branches,
br_to_record, index, index_rules, rules,
merger_table)
merger_impl = impl
else:
unimpl_rules = needed_brs[tup]
rules = [self.get_rule_impl(rule, br_to_record)
for rule in unimpl_rules]
if br_name not in export_branches:
# Reject branch
impl = MergerRejectBranchImpl(br_name, records,
name_to_branch, next_branches_names,
export_branches, br_to_record, index,
index_rules, rules, merger_table)
elif not next_branches_names:
# Last non-rejecting branch
impl = MergerLastBranchImpl(br_name, records,
name_to_branch, next_branches_names,
export_branches, br_to_record, index,
index_rules, rules, merger_table)
else:
# For normal middle branches execute the MergerBranch class
impl = MergerBranchImpl(br_name, records, name_to_branch,
next_branches_names, export_branches,
br_to_record, index, index_rules,
rules, merger_table)
name_to_branch[br_name] = impl
return merger_impl
def get_mergers_impl(self):
self.validate()
mergers_impl = [self.get_merger_impl(merger)
for merger in self.mergers]
return mergers_impl

BIN
merger_validator.pyc Normal file

Binary file not shown.

BIN
netflow-trace.h5 Normal file

Binary file not shown.

111
operators.py Normal file
View file

@ -0,0 +1,111 @@
import options
from socket import getprotobyname
if options.import_ops:
external_import = __import__(options.import_ops)
def NOT(op):
def not_op(*args):
op_result = op(*args)
return not op_result
return not_op
def and_op(*args, **kwargs):
res = True
for arg in args:
res = res and arg
for arg in kwargs.values():
res = res and arg
return res
def bitAND(*args):
res = args[0]
for arg in args[1:]:
res &= arg
return res
def bitOR(*args):
res = args[0]
for arg in args[1:]:
res |= arg
return res
def or_op(*args, **kwargs):
res = False
for arg in args:
res = res or arg
for arg in kwargs.values():
res = res or arg
return res
def protocol(name):
return getprotobyname(name)
def SUM(*args):
sum = 0
for arg in args:
sum += arg
return sum
def EQ(*args):
prev_arg = args[0]
result = True
for arg in args[1:]:
result = result and prev_arg == arg
prev_arg = arg
return result
def LT(*args):
prev_arg = args[0]
result = True
for arg in args[1:]:
result = result and prev_arg < arg
prev_arg = arg
return result
def GT(*args):
prev_arg = args[0]
result = True
for arg in args[1:]:
result = result and prev_arg > arg
prev_arg = arg
return result
def GTEQ(*args):
prev_arg = args[0]
result = True
for arg in args[1:]:
result = result and prev_arg >= arg
prev_arg = arg
return result
def LTEQ(*args):
prev_arg = args[0]
result = True
for arg in args[1:]:
result = result and prev_arg <= arg
prev_arg = arg
return result
def IN(*args):
last_arg = args[-1] # probably subnet mask
result = True
for arg in args[:-1]:
result = result and arg & last_arg
return result
def true(*args):
return True
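# --- Minimal usage sketch (not part of the original module) -----------------
# The comparison operators are variadic and chain pairwise, while bitAND and
# bitOR fold all of their arguments; this is how a group-filter rule such as
# "bitAND(flags, 0x13) = 0x13" is evaluated. The values below are made up and
# the block assumes the module's own imports (options, its import_ops module)
# resolve when run standalone.
if __name__ == '__main__':
    print LT(1, 2, 3)                     # 1 < 2 < 3              -> True
    print EQ(80, 80, 80)                  # chained equality       -> True
    print bitAND(0x1b, 0x13)              # 0x1b & 0x13            -> 19 (0x13)
    print EQ(bitAND(0x1b, 0x13), 0x13)    # the group-filter check -> True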

BIN
operators.pyc Normal file

Binary file not shown.

19
options.py Normal file
View file

@ -0,0 +1,19 @@
import_ops = "custops"
import_grouper_ops = None
delete_temp_files = True
time_index_interval_ms = 5000
unsat_delta_threshold_mul = 10
max_unsatisfiable_deltas = 20
do_not_expand_groups = False
temp_path = "./flowy-run/"
import os
try:
os.mkdir(temp_path)
except OSError:
pass
groups_file_prefix = "Groups"
merger_file_prefix = "Merged"

BIN
options.pyc Normal file

Binary file not shown.

4298
parser.out Normal file

File diff suppressed because it is too large

931
parser.py Normal file
View file

@ -0,0 +1,931 @@
# -*- coding: utf-8 -*-
import ply.lex as lex
import ply.yacc as yacc
from statement import *
from ply.yacc import YaccError
import netaddr
class Lexer(object):
def __init__(self,**kwargs):
self.lexer = lex.lex(module=self, **kwargs)
reserved = {
'splitter' : 'splitterKeyword',
'groupfilter' : 'groupFilterKeyword',
'filter' : 'filterKeyword',
'grouper' : 'grouperKeyword',
'module' : 'moduleKeyword',
'merger' : 'mergerKeyword',
'export' : 'exportKeyword',
'ungrouper' : 'ungrouperKeyword',
'branches' : 'branchesKeyword',
'branch' : 'branchKeyword',
'aggregate' : 'aggregateKeyword',
'as' : 'asKeyword',
'min' : 'minKeyword',
'max' : 'maxKeyword',
'avg' : 'avgKeyword',
'sum' : 'sumKeyword',
'count' : 'countKeyword',
'union' : 'unionKeyword',
'in' : 'inKeyword',
'notin' : 'notinKeyword',
'OR' : 'ORKeyword',
'NOT' : 'NOTKeyword',
'bitOR': 'bitORKeyword',
'bitAND' : 'bitANDKeyword',
'm' : 'mKeyword',
'mi' : 'miKeyword',
'o' : 'oKeyword',
'oi' : 'oiKeyword',
's' : 'sKeyword',
'si' : 'siKeyword',
'd' : 'dKeyword',
'di' : 'diKeyword',
'f' : 'fKeyword',
'fi' : 'fiKeyword',
'eq' : 'eqKeyword', # prevent clash with = for match rules
'delta': 'deltaKeyword',
'rdelta' : 'rdeltaKeyword',
'ms' : 'msKeyword'
}
def t_LTEQ(self, t):
r'<='
t.value = 'LTEQ'
return t
def t_GTEQ(self, t):
r'>='
t.value = 'GTEQ'
return t
def t_ML(self, t):
r'<<'
t.value = 'ML'
return t
def t_MG(self, t):
r'>>'
t.value = 'MG'
return t
def t_LT(self, t):
r'<'
t.value = 'LT'
return t
def t_EQ(self, t):
r'='
t.value = 'EQ'
return t
def t_GT(self, t):
r'>'
t.value = 'GT'
return t
tokens = ['id', 'LT', 'EQ', 'GT',
'LTEQ', 'GTEQ', 'ML', 'MG',
'MAC', 'IPv4', 'IPv6',
'int', 'float', 'hex',
'string'] + list(reserved.values())
t_ignore = ' \t'
t_ignore_comment = r'\#.*'
literals = "+-*/(){},."
def t_string(self, t):
r'"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"'
t.value = Arg("string", t.value[1:-1].replace("\\",''), t.value)
return t
def t_IPv4(self, t):
r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
#the regex does include invalid IPs but they are
#checked later during conversion
try:
t.value =Arg("addr_IPv4", int(netaddr.IP(t.value)), t.value)
return t
except netaddr.AddrFormatError:
message = 'Bad IPv4 format %s at line %s' %(t.value,
t.lexer.lineno)
raise SyntaxError(message)
def t_MAC(self, t):
r'([a-fA-F0-9]{2}[:\-]){5}[a-fA-F0-9]{2}'
try:
t.value = Arg("addr_MAC", int(netaddr.EUI(t.value)), t.value)
return t
except netaddr.AddrFormatError:
message = 'Bad MAC format %s at line %s' %(t.value,
t.lexer.lineno)
raise SyntaxError(message)
def t_IPv6(self, t):
r'(::[0-9a-f]{1,4}[0-9a-f:]*)|([0-9a-f]:[0-9a-f:]*)'
# the regular expression is very general, so this rule should be
# after the other address rules.
try:
t.value = Arg("addr_IPv6", int(netaddr.IP(t.value)), t.value)
return t
except netaddr.AddrFormatError:
message = 'Bad IPv6 format %s at line %s' %(t.value,
t.lexer.lineno)
raise SyntaxError(message)
def t_float(self, t):
r'[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?'
t.value = Arg("float", float(t.value), t.value)
return t
def t_hex(self, t):
r'0[xX][0-9a-fA-F]+'
t.value = Arg("int", int(t.value, 0), t.value)
return t
def t_int(self, t):
r'\d+'
t.value = Arg("int", int(t.value), t.value)
return t
#All the reserved words are matched in this rule
def t_id(self, t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
# matches also keywords, so be careful
t.type = self.reserved.get(t.value,'id') # Check for reserved words
return t
def t_newline(self, t):
r'\n+'
t.lexer.lineno += len(t.value)
# Error handling rule
def t_error(self,t):
msg = "Illegal character '%s'" % t.value[0]
raise SyntaxError(msg)
# Test it output
def test(self,data):
self.lexer.input(data)
while True:
tok = self.lexer.token()
if not tok: break
print tok
class Parser(object):
# the tokens from the lexer class:
tokens = Lexer.tokens
def __init__(self):
self.filters = []
self.groupers = []
self.splitter = None
self.group_filters = []
self.mergers = []
self.branch_names = set()
self.ungroupers = []
self.branches = []
self.input = None
self.outputs = []
self.names = {}
self.lexer = Lexer().lexer
self.parser = yacc.yacc(module=self)
def p_file(self,p):
'''file : pipeline_stage_1n'''
# for k, v in self.names.iteritems():
# print k, v
def p_pipeline_stage_1n(self,p):
'pipeline_stage_1n : pipeline_stage pipeline_stage_1n'
# add a name mapping:
try:
# branch statements dont have names
# so we skip them with try/except
self.names[p[1].name] = p[1]
except AttributeError:
pass
def p_pipeline_stage_end(self,p):
'pipeline_stage_1n :'
def p_pipeline_stage(self,p):
'''
pipeline_stage : splitter
| filter
| composite_filter
| branch
| ungrouper
| grouper
| group_filter
| merger
'''
p[0] = p[1]
def p_splitter(self,p):
'''
splitter : splitterKeyword id '{' '}'
'''
p[0] = Splitter(p[2], p.lineno(2))
if self.splitter != None:
raise SyntaxError(
"More than one splitter defined in file at line %s",p.lineno(2))
self.splitter = p[0]
def p_filter(self,p):
'''
filter : filterKeyword id '{' filter_rule_1n '}'
'''
# Note that p[4] is a list of lists of rules.
# If the list has one element the rule is simple.
# If the rule has more than one element, the
# rule is OR-ed of all the rules in the list
p[0] = Filter(p[2], p.lineno(2), p[4])
self.filters.append(p[0])
def p_composite_filter(self, p):
'''
composite_filter : filterKeyword id '{' filter_ref_rule_1n '}'
'''
# Note that p[4] is a list of lists of rules.
# If the list has one element the rule is simple.
# If the rule has more than one element, the
# rule is OR-ed of all the rules in the list
p[0] = Filter(p[2], p.lineno(2), p[4])
self.filters.append(p[0])
def p_group_filter(self, p):
'''
group_filter : groupFilterKeyword id '{' filter_rule_1n '}'
'''
# Note that p[4] is a list of lists of rules.
# If the list has one element the rule is simple.
# If the rule has more than one element, the
# rule is OR-ed of all the rules in the list
p[0] = Filter(p[2], p.lineno(2), p[4])
self.group_filters.append(p[0])
def p_filter_rule_1n(self,p):
'filter_rule_1n : filter_rule filter_rule_1n'
p[2].extend([p[1]])
p[0] = p[2]
def p_filter_rule_0(self,p):
'filter_rule_1n :'
p[0] = []
def p_filter_rule(self,p):
'''
filter_rule : or_rule
'''
p[0] = p[1]
def p_filter_ref_rule_1n(self,p):
'filter_ref_rule_1n : filter_ref_rule filter_ref_rule_1n'
p[2].extend([p[1]])
p[0] = p[2]
def p_filter_ref_rule_0(self,p):
'filter_ref_rule_1n : filter_ref_rule'
p[0] = [p[1]]
def p_filter_ref_rule(self,p):
'''
filter_ref_rule : or_id
'''
p[0] = p[1]
def p_or_id(self, p):
'or_id : not_id opt_or_id'
p[1].extend(p[2])
p[0] = p[1]
def p_opt_or_id(self, p):
'''
opt_or_id : ORKeyword not_id opt_or_id
'''
p[2].extend(p[3])
p[0] = p[2]
def p_opt_or_id_end(self, p):
'opt_or_id :'
p[0] = []
def p_not_id(self, p):
'''
not_id : NOTKeyword id
| id
'''
try:
p[0] = [FilterRef(p[2], p.lineno(2), True)]
except IndexError:
p[0] = [FilterRef(p[1], p.lineno(1))]
def p_or_optrule(self,p):
'or_rule : rule_or_not opt_rule'
if len(p[2]) > 0:
ors = [p[1]]
ors.extend(p[2])
p[0] = ors
else:
p[0] = [p[1]]
def p_or_rule(self, p):
'opt_rule : ORKeyword rule_or_not opt_rule'
res = [p[2]]
res.extend(p[3])
p[0] = res
def p_term_opt_rule(self,p):
'opt_rule :'
p[0] = []
def p_rule_or_not(self, p):
'''
rule_or_not : rule
| NOTKeyword rule
'''
try:
p[2].NOT = True
p[0] = p[2]
except IndexError:
p[0] = p[1]
def p_rule(self,p):
'''
rule : infix_rule
| prefix_rule
'''
p[0] = p[1]
def p_infix_rule(self,p):
'infix_rule : arg op arg'
p[1].extend(p[3]) # concatenate args to get [arg, arg]
# for some unknown reason p.lineno(2) does not work in this production
# so p[2] is (op, lineno)
p[0] = Rule(p[2][0], p[2][1], p[1]) # (op, line, args) From filter.py
def p_op(self, p):
'''
op : EQ
| LT
| GT
| LTEQ
| GTEQ
| ML
| MG
| inKeyword
| notinKeyword
'''
p[0] = (p[1], p.lineno(1))
def p_rule_prefix(self,p):
'''
prefix_rule : id '(' args ')'
| bitANDKeyword '(' args ')'
| bitORKeyword '(' args ')'
'''
p[0] = Rule(p[1], p.lineno(1), p[3])
def p_args(self,p):
'''
args : arg ',' args
'''
p[0] = p[1]
p[0].extend(p[3]) # concatenate the rest of the args to arg
def p_args_more(self,p):
'args : arg'
p[0] = p[1]
def p_no_args(self, p):
'args :'
p[0] = []
def p_arg(self, p):
'''
arg : id
| IPv4
| IPv6
| CIDR
| MAC
| int
| float
| hex
| prefix_rule
| string
'''
if type(p[1]) is type("string"):
p[1] = Field(p[1]) # Was defined in filter.py, but the definition was commented out.
p[0] = [p[1]] # list of one element for easy [].extend later
def p_cidr(self, p):
'''
CIDR : IPv4 '/' int
| IPv6 '/' int
'''
p[0] = Rule('cidr_mask', p[1], p[3])
def p_start_branch(self, p):
'''
branch : id arrow mid_branch
'''
br = [BranchNode(p[1], p.lineno(1))] # In statement.py
br.extend(p[3])
p[0] = br
self.branches.append(p[0])
def p_input_branch(self, p):
'''
branch : string arrow mid_branch
'''
if self.input != None:
raise SyntaxError("More than one input defined in file at line %s",
p.lineno(1))
self.input = Input(p[1].value, p.lineno(1))
br = [self.input]
br.extend(p[3])
p[0] = br
self.branches.append(p[0])
def p_split_branch(self, p):
'''
branch : id branchKeyword mid_branch
'''
br = [BranchNode(p[1], p.lineno(1))]
p[3][0] = Branch(p[3][0].name, p[3][0].line)
br.extend(p[3])
p[0] = br
self.branches.append(p[0])
def p_mid_branch(self, p):
'''
mid_branch : id arrow mid_branch
'''
br = [BranchNode(p[1], p.lineno(1))]
br.extend(p[3])
p[0] = br
def p_mid_branch_terminate(self, p):
'''
mid_branch : end_branch
'''
p[0] = p[1]
def p_end_branch(self, p):
'end_branch : id'
p[0] = [BranchNode(p[1], p.lineno(1))]
def p_output_branch(self, p):
'end_branch : string'
out = Output(p[1].value, p.lineno(1))
self.outputs.append(out)
p[0] = [out]
def p_arrow(self, p):
"""arrow : "-" GT"""
pass
def p_ungrouper(self, p):
'''
ungrouper : ungrouperKeyword id '{' '}'
'''
p[0] = Ungrouper(p[2], p.lineno(2))
self.ungroupers.append(p[0])
def p_grouper(self, p):
"grouper : grouperKeyword id '{' module1_n aggregate '}'"
p[0] = Grouper(p[2], p.lineno(2), p[4], p[5])
# insert aggregation of record ids (needed for ungrouping later)
p[0].aggr.insert(0,(Rule('union', p.lineno(2), [Field('rec_id'),
'records'])))
p[0].aggr.insert(0,(Rule('min', p.lineno(2), [Field('stime'),
'stime'])))
p[0].aggr.insert(0,(Rule('max', p.lineno(2), [Field('etime'),
'etime'])))
self.groupers.append(p[0])
def p_module1_n(self, p):
'module1_n : module module1_n'
p[1].extend(p[2])
p[0] = p[1]
def p_module0(self, p):
'module1_n :'
p[0] = []
def p_module(self, p):
"module : moduleKeyword id '{' grouper_rule1_n '}'"
p[0] = [Module(p[2], p.lineno(2), p[4])]
def p_grouper_rule1_n(self, p):
'grouper_rule1_n : grouper_rule grouper_rule1_n'
p[1].extend(p[2])
p[0] = p[1]
def p_grouper_rule0(self, p):
'grouper_rule1_n :'
p[0] = []
def p_grouper_rule(self, p):
'grouper_rule : id grouper_op id'
p[0] = [[GrouperRule(p[2], p.lineno(2), [Field(p[1]), Field(p[3]),
None, False])]]
def p_grouper_rule_delta(self, p):
'''
grouper_rule : id grouper_op id deltaKeyword delta_arg
'''
p[0] = [[GrouperRule(p[2], p.lineno(2), [Field(p[1]), Field(p[3]),
p[5], False])]]
def p_grouper_rule_rel_delta(self, p):
'''
grouper_rule : id grouper_op id rdeltaKeyword delta_arg
'''
p[0] = [[GrouperRule(p[2], p.lineno(2), [Field(p[1]), Field(p[3]),
p[5], True])]]
def p_grouper_op(self, p):
'''
grouper_op : EQ
| LT
| GT
| GTEQ
| LTEQ
'''
p[0] = p[1]
def p_delta_arg(self, p):
'''
delta_arg : time
| int
'''
p[0] = p[1]
def p_time(self, p):
'''
time : int sKeyword
| int msKeyword
| int minKeyword
'''
# the number should be in ms:
if p[2] == 's':
p[1].value = p[1].value * 1000
if p[2] == 'min':
p[1].value = p[1].value * 60 * 1000
p[0] = p[1]
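# e.g. "1s" becomes 1000 and "1440min" becomes 86400000 (both in milliseconds)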
def p_aggregate(self, p):
'aggregate : aggregateKeyword aggr1_n'
for aggr in p[2]:
if aggr.line == 0:
aggr.line = p.lineno(1)
p[0] = p[2]
def p_aggr1_n(self, p):
'aggr1_n : aggr opt_aggr'
p[1].extend(p[2])
p[0] = p[1]
def p_opt_aggr(self, p):
"opt_aggr : ',' aggr opt_aggr"
p[2].extend(p[3])
p[0] = p[2]
def p_opt_aggr_end(self, p):
'opt_aggr :'
p[0] = []
def p_aggr(self, p):
"aggr : aggr_op '(' id_or_qid ')' asKeyword id"
args = [Field(p[3]), p[6]] # [id_or_qid, id, aggr_op]
p[0] = [Rule(p[1], p.lineno(4), args)]
def p_simple_agg(self, p):
'aggr : id_or_qid asKeyword id'
args = [Field(p[1]), p[3]] # [qid, id]
p[0] = [Rule('last', p.lineno(2), args)]
def p_simple_agg_same_name(self, p):
'aggr : id_or_qid'
args = [Field(p[1]), p[1]] # [qid, id]
p[0] = [Rule('last', p.lineno(1), args)]
def p_qid(self, p):
'''
qid : id '.' id
'''
p[0] = p[1] + p[2] + p[3]
def p_id_or_qid(self, p):
'''
id_or_qid : id
| qid
'''
p[0] = p[1]
def p_aggr_op(self, p):
'''
aggr_op : minKeyword
| maxKeyword
| sumKeyword
| avgKeyword
| unionKeyword
| countKeyword
| bitANDKeyword
| bitORKeyword
'''
p[0] = p[1]
def p_merger(self, p):
"merger : mergerKeyword id '{' merger_module1_n export '}'"
p[0] = Merger(p[2], p.lineno(2), p[4], p[5])
self.mergers.append(p[0])
def p_merger_module1_n(self, p):
'merger_module1_n : merger_module merger_module1_n'
p[1].extend(p[2])
p[0] = p[1]
def p_merger_module0(self, p):
'merger_module1_n : '
p[0] = []
def p_merger_module(self, p):
"""
merger_module : moduleKeyword id '{' merger_branches merger_rule1_n '}'
"""
p[0] = [Module(p[2], p.lineno(2), p[5], p[4])]
def p_merger_branches(self, p):
'merger_branches : branchesKeyword branches1_n'
p[0] = p[2]
def p_branches1_n(self, p):
"""
branches1_n : id ',' branches1_n
"""
p[0] = [p[1]]
p[0].extend(p[3])
def p_branches1(self, p):
' branches1_n : id'
p[0] = [p[1]]
def p_export(self, p):
'export : exportKeyword id'
p[0] = p[2]
def p_merger_rule1_n(self, p):
'merger_rule1_n : merger_rule merger_rule1_n'
p[1].extend(p[2])
p[0] = p[1]
def p_merger_rule0(self,p):
'merger_rule1_n :'
p[0] = []
def p_merger_rule(self, p):
'''
merger_rule : merger_prefix_rule
| merger_infix_rule
'''
p[0] = [[p[1]]]
def p_not_merger_rule(self, p):
'''
merger_rule : NOTKeyword merger_prefix_rule
| NOTKeyword merger_infix_rule
'''
p[2].NOT = True
p[0] = [[p[2]]]
def p_merger_infix_rule(self, p):
'merger_infix_rule : qid_arg op qid_arg'
p[1].extend(p[3])
p[0] = Rule(p[2][0], p[2][1], p[1])
def p_merger_prefix_rule(self,p):
'''
merger_prefix_rule : id '(' qid_args ')'
'''
p[0] = Rule(p[1], p.lineno(1), p[3])
def p_qid_args(self,p):
'''
qid_args : qid_arg ',' qid_args
'''
p[0] = p[1]
p[0].extend(p[3]) # concatenate the rest of the args to arg
def p_qid_args_more(self,p):
'qid_args : qid_arg'
p[0] = p[1]
def p_no_qid_args(self, p):
'qid_args :'
p[0] = []
def p_qid_arg(self, p):
'''
qid_arg : qid
| IPv4
| IPv6
| CIDR
| MAC
| int
| float
| hex
| merger_prefix_rule
| string
'''
if type(p[1]) is type("string"):
p[1] = Field(p[1])
p[0] = [p[1]] # list of one element for easy [].extend later
def p_merger_rule_al_op(self, p):
'merger_rule : allen_rule opt_or_allen_rule'
p[1].extend(p[2])
p[0] = [p[1]]
def p_opt_or_allen_rule(self, p):
'opt_or_allen_rule : ORKeyword allen_rule opt_or_allen_rule'
p[2].extend(p[3])
p[0] = p[2]
def p_opt_op_rule_end(self, p):
'opt_or_allen_rule : '
p[0] = []
def p_allen_rule(self, p):
'allen_rule : id allen_op id opt_allen_delta'
args = [Field(p[1]), Field(p[3])]
args.extend(p[4]) # add the delta time to [arg, arg]
p[0] = [AllenRule(p[2], p.lineno(1), args)] # (op, line, args)
def p_opt_allen_delta(self, p):
'''
opt_allen_delta : deltaKeyword time
'''
p[0] = [p[2]]
def p_no_allen_delta(self, p):
'opt_allen_delta :'
p[0] = []
def p_allen_op(self, p):
'''
allen_op : LT
| GT
| EQ
| mKeyword
| miKeyword
| oKeyword
| oiKeyword
| sKeyword
| siKeyword
| dKeyword
| diKeyword
| fKeyword
| fiKeyword
| eqKeyword
'''
# for some strange reason upper level refuses to recognize lineno:
p[0] = p[1]
def p_error(self, p):
msg ="Syntax error. Unexpected token "
msg +="%s (%s)"%(p.value, p.type)
msg += " at line %s"% self.lexer.lineno
raise SyntaxError(msg)
def parse(self, text):
self.parser.parse(text, lexer=self.lexer) # parse method is called from ply.yacc
self.resolve_branches()
def find_io_nodes(self):
'''
Finds which branch nodes are inputs and which are outputs.
The rest of the branches are processing stages.
'''
pass
def check_branching(self):
pass
def check_branch_nodes(self):
for b in self.branch_nodes.values():
if not b.is_branch:
try:
node = self.names[b.name]
if len(b.inputs) == 0:
msg = "Node %s at line" % b.name
msg += " %s does not have input." % b.line
raise SyntaxError(msg)
if len(b.outputs) == 0:
msg = "Node %s at line" % b.name
msg += " %s does not have output." % b.line
raise SyntaxError(msg)
if len(b.inputs) > 1 and type(node) is not Merger:
msg = "Non-Merger node %s at line" % b.name
msg += " %s has more than one input." % b.line
raise SyntaxError(msg)
if len(b.outputs) > 1 and type(node) is not Splitter:
msg = "Non-Splitter node %s at line" % b.name
msg += " %s has more than one output." % b.line
raise SyntaxError(msg)
except KeyError:
# check whether this is some middle node
if len(b.inputs) != 0 and len(b.outputs) !=0:
msg = "Node %s refferenced at line" % b.name
msg += " %s not defined" % b.line
raise SyntaxError(msg)
# check whether the node name is actually a parsed string (Arg)
if type(b.name) is not Arg:
msg = "Node %s referenced at line" % b.name
msg += " %s not defined" % b.line
raise SyntaxError(msg)
else:
if len(b.inputs) != 1 or len(b.outputs) != 1:
msg = "Branch Node %s at line" % b.name
msg += " %s must have 1 input and 1 output." % b.line
raise SyntaxError(msg)
def resolve_branches(self):
noname_branchings = []
for branch in self.branches:
# print branch
# print ""
br_name = False
br_index = 0
for i, node in enumerate(branch):
if type(node) is BranchNode:
try:
branch[i] = self.names[node.name]
except KeyError:
msg = "Node %s refferenced at line" % node.name
msg += " %s not defined" % node.line
raise SyntaxError(msg)
if type(node) is Branch:
br_name = node.name
br_index = i
self.branch_names.add(br_name)
if type(node) is Input and i != 0:
msg = "Input node %s at line" % node.name
msg += " %s should be at first posigion" % node.line
msg += " of branching statement"
raise SyntaxError(msg)
if type(node) is Output and i != (len(branch) - 1):
msg = "Output node %s at line" % node.name
msg += " %s should be at position posigion" % node.line
msg += " of branching statement"
raise SyntaxError(msg)
if br_name:
del(branch[br_index])
for node in branch:
node.branches.add(br_name)
else:
noname_branchings.append(branch)
# second iteration to fix the remaining nodes, which don't have branches
for branch in noname_branchings:
s = set()
for node in branch:
s.update(node.branches)
for node in branch:
node.branches.update(s)
class ParsedFile(object):
def __init__(self, filters, groupers, splitters, group_filters,
mergers, branches, ungroupers, input, output, names):
self.filters = filters
self.groupers = groupers
self.splitters = splitters
self.group_filters = group_filters
self.mergers = mergers
self.branches = branches
self.ungroupers = ungroupers
self.input = input
self.output = output
self.names = names
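# --- Minimal usage sketch (not part of the original module) -----------------
# Parses a flow query file given on the command line and lists the named
# pipeline stages. Purely illustrative; flowy.py is the real entry point.
if __name__ == '__main__':
    import sys
    parser = Parser()
    parser.parse(open(sys.argv[1]).read())
    for name, node in parser.names.iteritems():
        print name, node.__class__.__name__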

BIN
parser.pyc Normal file

Binary file not shown.

185
parsetab.py Normal file

File diff suppressed because one or more lines are too long

BIN
parsetab.pyc Normal file

Binary file not shown.

48
port-filter.flw Normal file
View file

@ -0,0 +1,48 @@
splitter S {}
filter www_req {
dstport = 80
}
filter www_res {
dstport = 80
}
grouper g_www_req {
module g1 {
dstport = dstport
etime < stime rdelta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(srcport) as srcports
}
grouper g_www_res {
module g1 {
srcport = srcport
etime < stime rdelta 1s
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(srcport) as srcports
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches A, B
}
export m1
}
ungrouper U {}
"./netflow-trace.h5" -> S
S branch A -> www_req -> g_www_req -> ggf -> M
S branch B -> www_res -> g_www_res -> ggf -> M
M->U->"./ungroped.h5"

46
ports.flw Normal file
View file

@ -0,0 +1,46 @@
splitter S {}
filter www_req {
dstport = 443 OR dstport = 80 OR dstport = 8080
unix_secs > 1259413200
unix_secs < 1259445600
}
filter www_res {
unix_secs < 1259445600
unix_secs > 1259413200
srcport = 443 OR srcport = 80 OR srcport = 8080
}
grouper g_www_req {
module g1 {
}
aggregate bitOR(tcp_flags) as flags
}
grouper g_www_res {
module g1 {
}
aggregate bitOR(tcp_flags) as flags
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches B, A
# B.stime = 1259413200 AND B.etime = 1259445600
A d B OR B d A
# B o A delta 32400s
}
export m1
}
ungrouper U {}
"./h5ports.h5" -> S
S branch A -> www_req -> g_www_req -> ggf -> M
S branch B -> www_res -> g_www_res -> ggf -> M
M->U->"./portsungroped.h5"

BIN
portsungroped.h5 Normal file

Binary file not shown.

18
print_hdf_in_step.py Executable file
View file

@ -0,0 +1,18 @@
#!/usr/bin/python
from record import RecordReader
from pytables import FlowRecordsTable
from itertools import izip
from optparse import OptionParser
if __name__ == '__main__':
usage = 'usage: %prog [options] input files'
p = OptionParser(usage)
opts, arguments = p.parse_args()
mg_readers = [RecordReader(FlowRecordsTable(f)) for f in arguments]
for rec_tuple in izip(*mg_readers):
print ""
for r in rec_tuple:
print r

21
printhdf.py Executable file
View file

@ -0,0 +1,21 @@
#!/usr/bin/python
from optparse import OptionParser
import pytables
import record
import sys
def printHDF(hdf_file):
r = pytables.FlowRecordsTable(hdf_file)
recordReader = record.RecordReader(r)
for rec in recordReader:
print rec
if __name__ == "__main__":
usage = 'usage: %prog file_name.h5'
p = OptionParser(usage)
options, arguments = p.parse_args()
if len(arguments) != 1:
sys.stderr.write('Exactly one argument expected\n')
exit(1)
printHDF(arguments[0])

98
profiler.py Normal file
View file

@ -0,0 +1,98 @@
from time import time
import threading
import sys
from collections import deque
try:
from resource import getrusage, RUSAGE_SELF
except ImportError:
RUSAGE_SELF = 0
def getrusage(who=0):
return [0.0, 0.0] # on non-UNIX platforms cpu_time always 0.0
p_stats = None
p_start_time = None
def profiler(frame, event, arg):
if event not in ('call','return'): return profiler
#### gather stats ####
rusage = getrusage(RUSAGE_SELF)
t_cpu = rusage[0] + rusage[1] # user time + system time
code = frame.f_code
fun = (code.co_name, code.co_filename, code.co_firstlineno)
#### get stack with functions entry stats ####
ct = threading.currentThread()
try:
p_stack = ct.p_stack
except AttributeError:
ct.p_stack = deque()
p_stack = ct.p_stack
#### handle call and return ####
if event == 'call':
p_stack.append((time(), t_cpu, fun))
elif event == 'return':
try:
t,t_cpu_prev,f = p_stack.pop()
assert f == fun
except IndexError: # TODO investigate
t,t_cpu_prev,f = p_start_time, 0.0, None
call_cnt, t_sum, t_cpu_sum = p_stats.get(fun, (0, 0.0, 0.0))
p_stats[fun] = (call_cnt+1, t_sum+time()-t, t_cpu_sum+t_cpu-t_cpu_prev)
return profiler
def profile_on():
global p_stats, p_start_time
p_stats = {}
p_start_time = time()
threading.setprofile(profiler)
sys.setprofile(profiler)
def profile_off():
threading.setprofile(None)
sys.setprofile(None)
def get_profile_stats():
"""
returns dict[function_tuple] -> stats_tuple
where
function_tuple = (function_name, filename, lineno)
stats_tuple = (call_cnt, real_time, cpu_time)
"""
return p_stats
#### EXAMPLE ##################################################################
if __name__ == '__main__':
from time import sleep
from threading import Thread
import random
def test_function():
pass
class T(Thread):
def __init__(self):
Thread.__init__(self)
def run(self): # takes about 5 seconds
for i in xrange(100):
self.test_method()
test_function()
def test_method(self):
sleep(random.random() / 10)
profile_on()
#######################
threads = [T() for i in xrange(3)]
for t in threads:
t.start()
for i in xrange(100):
test_function()
for t in threads:
t.join()
#######################
profile_off()
from pprint import pprint
pprint(get_profile_stats())

BIN
profiler.pyc Normal file

Binary file not shown.

View file

@ -0,0 +1,885 @@
/var/netflow/ft-data-fall09/sne-ft-data/2009/sneze/2009-11-29/
26521 records
1683 records matched the http request
deepcopy
Splitter initiated
Parsing and validation finished: 0.31
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 27.19
Finished grouping branch A
Finished grouping branch B
Finished filtering groups for branch A
Finished filtering groups for branch B
Group filter time elapsed: 45.0
Finished merging branches: ['B', 'A']
Merger time elapsed: 162.3
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 168.99
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done
real 2m49.129s
user 2m44.070s
sys 0m5.824s
Splitter initiated
Parsing and validation finished: 0.33
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 30.16
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch B
Finished filtering groups for branch A
Group filter time elapsed: 34.2
Finished merging branches: ['B', 'A']
Merger time elapsed: 138.3
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 143.71
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done
real 2m24.193s
user 2m19.957s
sys 0m4.608s
deep_copy
Splitter initiated
Parsing and validation finished: 0.36
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 24.02
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch B
Finished filtering groups for branch A
Group filter time elapsed: 32.74
Finished merging branches: ['B', 'A']
Merger time elapsed: 155.7
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 162.56
Closing remaining open files: ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done
real 2m43.294s
user 2m38.782s
sys 0m4.628s
Splitter initiated
Parsing and validation finished: 0.26
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 24.8
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch A
Finished filtering groups for branch B
Group filter time elapsed: 34.95
Finished merging branches: ['B', 'A']
Merger time elapsed: 144.75
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 149.73
Closing remaining open files: ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done
real 2m36.640s
user 2m27.385s
sys 0m3.508s
Splitter initiated
Parsing and validation finished: 0.3
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 24.2
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch A
Finished filtering groups for branch B
Group filter time elapsed: 31.15
Finished merging branches: ['B', 'A']
Merger time elapsed: 145.9
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 151.19
real 2m31.325s
user 2m26.629s
sys 0m5.412s
modified reset/deepcopy
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 64), (26521, 11.015153884887695, 11.560714000001838))
(('new_group', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 21), (1466, 6.5672850608825684, 5.3123339999998507))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 25), (1468, 775.12532043457031, 766.78390699999591))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 15), (3228, 155.0828640460968, 160.51002500000152))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/splitter.py', 37), (3229, 87.616034030914307, 89.193573000000356))
(('append', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 118), (3490, 35.743690967559814, 30.529941999999664))
(('notify', '/usr/lib/python2.6/threading.py', 270), (6570, 10.859287977218628, 10.72066600000062))
(('_is_owned', '/usr/lib/python2.6/threading.py', 219), (6695, 9.4564809799194336, 9.1245670000004111))
(('final_result', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 57), (26521, 5.4859673976898193, 5.0482840000003648))
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 45), (26521, 85.135001659393311, 88.205508000023968))
(('_deepcopy_dict', '/usr/lib/python2.6/copy.py', 251), (26712, 73.298033714294434, 75.524687000011454))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 129), (27270, 27.118208885192871, 27.781735000003209))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 72), (97059, 33.632721662521362, 30.013754000007793))
(('read_row', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 99), (99360, 518.74268817901611, 468.40537100055235))
(('iterrows', '/usr/local/lib/python2.6/dist-packages/tables/table.py', 1441), (99377, 118.15105223655701, 106.11463399998161))
(('read_rows_list', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 105), (99797, 522.83437442779541, 472.12965100054475))
(('read_rows_list', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 133), (99797, 550.52120852470398, 497.50723100058826))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 23), (147484, 24.74915337562561, 21.317261000004237))
(('deepcopy', '/usr/lib/python2.6/copy.py', 144), (187567, 161.90160441398621, 165.33823200019515))
(('read', '/usr/local/lib/python2.6/dist-packages/tables/vlarray.py', 700), (195364, 274.85678458213806, 246.25141199899576))
(('<genexpr>', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 102), (294714, 294.22120332717896, 264.55258099813909))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 178), (856942, 596.70967555046082, 576.32406800022113))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 101), (861729, 430.92800951004028, 418.1861820004529))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 126), (989422, 290.51547265052795, 272.90903400041935))
(('idx2long', '/usr/local/lib/python2.6/dist-packages/tables/utils.py', 66), (784846, 75.146798133850098, 69.772329999996373))
(('is_idx', '/usr/local/lib/python2.6/dist-packages/tables/utils.py', 44), (784846, 26.284930467605591, 21.873351000002572))
/var/netflow/ft-data-fall09/kur-ft-data/2009-11-17/
56992 records
2438 records matched the http request
With profiler off:
real 8m8.700s
user 7m47.945s
sys 0m12.909s
Splitter initiated
Parsing and validation finished: 1.29
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 58.21
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch A
Finished filtering groups for branch B
Group filter time elapsed: 59.8
Finished merging branches: ['B', 'A']
Merger time elapsed: 471.27
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 480.68
56992
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py http-download.flw
0.72
Splitter initiated
Started filtering
Group filter time started: 0.29
3955
Filters ready
Splitter finished
Splitter time elapsed: 53.06
Number of records in branch A 1985
Number of records in branch B 2004
Finished grouping branch A
Finished group-filtering for branch A
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 57.68
Finished merging branches: ['B', 'A']
Merger time elapsed: 443.36
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 452.1
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done
real 7m46.456s
user 7m21.036s
sys 0m11.921s
(('new_group', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 21), (1824, 9.5541517734527588, 9.8766150000006974))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 25), (1828, 1249.1410629749298, 1300.497268999989))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/splitter.py', 37), (3955, 59.615991353988647, 62.479928999999061))
(('split', '/home/melnikovkolya/classes/semester-3-project/flowy/splitter.py', 17), (3955, 30.423548460006714, 32.126016000000902))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 15), (3956, 456.31715869903564, 475.12168400000229))
(('get', '/usr/lib/python2.6/Queue.py', 150), (3957, 35.274902582168579, 37.742364999999495))
(('append', '/usr/local/lib/python2.6/dist-packages/tables/vlarray.py', 452), (5486, 76.012235879898071, 76.39678599999479))
(('append', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 118), (5785, 81.44921350479126, 81.341101000000435))
(('notify', '/usr/lib/python2.6/threading.py', 270), (8002, 17.408251523971558, 17.825101000000359))
(('_is_owned', '/usr/lib/python2.6/threading.py', 219), (8101, 14.244855642318726, 15.092936000000464))
(('final_result', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 57), (56992, 15.892577886581421, 15.040958000006583))
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 45), (56992, 255.76119065284729, 262.48040000008808))
(('_deepcopy_dict', '/usr/lib/python2.6/copy.py', 251), (57183, 218.50618243217468, 224.26205200008098))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 89), (58365, 30.709211587905884, 31.189945000012358))
(('iterate_fixed_fields', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 93), (58365, 19.963983297348022, 19.749231000007512))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 129), (58365, 86.714945554733276, 88.23755700004449))
(('_deepcopy_list', '/usr/lib/python2.6/copy.py', 224), (114144, 72.901082038879395, 73.184596000045076))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 90), (117636, 47.137009859085083, 43.210651000023745))
(('_deepcopy_atomic', '/usr/lib/python2.6/copy.py', 197), (171331, 14.566928386688232, 13.152824000005694))
(('_keep_alive', '/usr/lib/python2.6/copy.py', 261), (343098, 47.557926893234253, 39.274455000023863))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 72), (343759, 89.168351411819458, 86.809352999718158))
(('read_row', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 99), (347405, 1355.7759656906128, 1345.6080879980259))
(('iterrows', '/usr/local/lib/python2.6/dist-packages/tables/table.py', 1441), (347422, 306.37827634811401, 304.82301899932509))
(('read_rows_list', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 105), (348171, 1369.7901601791382, 1360.4090329980108))
(('deepcopy', '/usr/lib/python2.6/copy.py', 144), (400864, 485.14781737327576, 489.78665900019996))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 82), (408197, 79.613070487976074, 80.693067999662162))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 23), (527995, 64.410658597946167, 62.123842999773387))
(('read', '/usr/local/lib/python2.6/dist-packages/tables/vlarray.py', 700), (689950, 714.14480590820312, 706.58424299669286))
(('<genexpr>', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 102), (1037339, 765.8496515750885, 758.55947299578656))
(('_processRangeRead', '/usr/local/lib/python2.6/dist-packages/tables/leaf.py', 449), (1037372, 470.43238306045532, 463.84111299771757))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 101), (1380363, 747.47748589515686, 753.67501099601259))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 178), (1380426, 1028.9652721881866, 1053.8537989941979))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 126), (1740570, 498.78313732147217, 495.35881499854258))
(('EQ', '/home/melnikovkolya/classes/semester-3-project/flowy/operators.py', 63), (2370745, 182.36606240272522, 156.70575899921459))
(('idx2long', '/usr/local/lib/python2.6/dist-packages/tables/utils.py', 66), (2764694, 214.65504741668701, 203.63286399914659))
(('is_idx', '/usr/local/lib/python2.6/dist-packages/tables/utils.py', 44), (2764694, 75.347645044326782, 63.899976999761293))
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 64), (56992, 31.726502895355225, 31.213908000036554))
(('deep_copy', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 41), (56994, 15.406083345413208, 16.889049000018872))
7 days of data
python ft2hdf.py /var/netflow/ft-data-fall09/sne-ft-data/2009/sneze/2009-12-0* netflow-trace.h5
246350 records in total
12394 records match the query
profiling:
Splitter initiated
Parsing and validation finished: 2.22
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 1130.1
Finished grouping branch B
Finished filtering groups for branch B
Finished grouping branch A
Finished filtering groups for branch A
Group filter time elapsed: 2123.665408
Finished merging branches: ['B', 'A']
Merger time elapsed: -185.553776
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: -114.543776
no profiling:
Splitter initiated
Parsing and validation finished: 0.26
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 320.43
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch B
Finished filtering groups for branch A
Group filter time elapsed: 922.42
Finished merging branches: ['B', 'A']
Merger time elapsed: 1039.122704
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 1074.252704
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done
start()real 90m16.511s
user 86m23.020s
sys 3m7.356s
Splitter initiated
Parsing and validation finished: 0.31
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 346.66
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch B
Finished filtering groups for branch A
Group filter time elapsed: 916.19
Finished merging branches: ['B', 'A']
Merger time elapsed: 1037.532704
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 1073.552704
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 64), (246349, 940.52704691886902, 994.15005099796895))
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 64), (246349, 111.18868279457092, 105.20649999988791))
(('deep_copy', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 41), (246351, 61.105264902114868, 52.447237999959725))
(('read_rows_list', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 105), (3155228, 13582.554839611053, 13318.368595361764))
(('read_rows_list', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 133), (3155228, 14223.106386899948, 13948.747855334786))
(('read', '/usr/local/lib/python2.6/dist-packages/tables/vlarray.py', 700), (6280932, 6694.1691343784332, 6541.9808274548959))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 178), (30651429, 17337.516788959503, 17566.637794171394))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 101), (30677828, 12477.594463348389, 12583.8665639143))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 126), (35383022, 8230.0888061523438, 8037.6824171527333))
(('EQ', '/home/melnikovkolya/classes/semester-3-project/flowy/operators.py', 63), (40143460, 2728.9575715065002, 2304.1001345953482))
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time flow-cat /var/netflow/ft-data-fall09/kur-ft-data/2009-11-16/ | flow-print | wc -l
99925
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py http-download.flw
0.77
Splitter initiated
Started filtering
Group filter time started: 0.33
7222
Filters ready
Splitter finished
Splitter time elapsed: 100.03
Number of records in branch B 3684
Number of records in branch A 3644
Finished grouping branch A
Finished group-filtering for branch A
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 136.09
Finished merging branches: ['B', 'A']
Merger time elapsed: 960.34
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 974.11
Closing remaining open files: ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done
real 16m39.728s
user 15m49.067s
sys 0m26.002s
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python ft2hdf.py /var/netflow/ft-data-fall09/sne-ft-data/2009/sneze/2009-11-* netflow-trace.h5
298063
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py http-download.flw
0.84
Splitter initiated
Started filtering
29448
Filters ready
Splitter finished
Splitter time elapsed: 475.83
Number of records in branch B 16666
Number of records in branch A 16412
Finished grouping branch B
Finished group-filtering for branch B
Finished grouping branch A
Finished group-filtering for branch A
Group filter threads joined: 1415.34
Finished merging branches: ['B', 'A']
Merger time elapsed: -1347.101888 = 11485
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: -1301.531888 = 11531
Closing remaining open files: ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done
real 192m11.507s = 11531
user 185m39.648s
sys 7m25.104s
The following example shows how heavily the match operations (and the consequences when a match is found) influence running time when two branches carry different loads.
We can see that processing of branch A, which matches records against port number 443, runs for about 17 seconds, while the branch that only checks whether a record uses the TCP protocol takes around 90 seconds. The reason for the larger running time is that far more entries satisfying the prot = TCP requirement are found, and each such match requires additional processing to index the record and append it to a group.
Although each execution of the reset function takes on average 9-10 times longer than a single match call, the match function is called at least 30 times more often than reset.
After spot-profiling (running the multi-threaded profiler on selected sections of the code only), I could verify that the time spent in the match calls of the grouper module caused the most significant slow-down of all the code up to the merger module. Depending on the complexity of each match() call, the execution time varied for the same number of function calls. The three match() calls from different classes form a nested chain, where one match() relies on another. The heaviest of the three (in time per execution) is the top-most match(), which comes from the Group class of the grouper module. Besides relying on a doubly nested match call from two other classes, it also performs a
and calculates the first and the last records of the newly formed group, which is necessary for relative comparisons.
with an average time spent per cycle (including the profiler overhead) being:
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 193), (280246, 151.20158743858337, 154.237679000132))
A very simple GrouperModule class match, with a single loop over its rules:
def match(self, record, group):
    for rule in self.rules:
        if not rule.match(record, group):
            return False
    return True
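For context, a minimal sketch of the top-most match() described above is given below. The class layout and the names (modules, records, first_record, last_record) are illustrative assumptions only, not the actual code of grouper.py:

# Hypothetical sketch: the Group-level match delegates to each grouper
# module's match (which in turn loops over its rules, as shown above) and,
# on success, appends the record and refreshes the first/last records that
# are needed for relative (delta) comparisons.
class Group(object):
    def __init__(self, modules):
        self.modules = modules      # GrouperModule-like objects
        self.records = []           # records collected into this group
        self.first_record = None
        self.last_record = None

    def match(self, record):
        # nested chain: Group.match -> GrouperModule.match -> Rule.match
        for module in self.modules:
            if not module.match(record, self):
                return False
        # this extra bookkeeping is what makes the top-most match the
        # heaviest of the three per execution
        self.records.append(record)
        if self.first_record is None:
            self.first_record = record
        self.last_record = record
        return True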
This was followed by the filtering operation
Splitter initiated
GF validation started at: 0.89
GF Validation required: 0.09
Parsing and validation finished: 0.32
Started filtering
Grouping started at:Fitlering time started at: Grouping started at: 1.0
1.0
1.0
Finished filtering
Filtering required: 16.87
Filters ready
Splitter time elapsed: 17.11
Finished grouping branch A
Grouping time required branch A 17.34
Current time is: 18.34
Finished filtering groups for branch A
Finished grouping branch B
Grouping time required branch B 90.08
Current time is: 91.08
Finished filtering groups for branch B
Group filter time elapsed: 90.41
Finished merging branches: ['B', 'A']
Merger time elapsed: 111.58
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 111.75
Closing remaining open files: ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done
real 1m52.516s
user 1m50.411s
sys 0m2.136s
The performance of the group-filters differs significantly, depending on how many matching records were found in the filtering stage.
For example, a filter for port 443 yields relatively few records, while a filter for port numbers > 30000 yields many. The matching records have to be processed and stored for the subsequent group-filters, which try to form groups out of the matched records. An example run of a query that identifies flows with destination port 443 and source port > 30000 is shown next. It can be seen that group-filtering of branch B, which handles the srcport > 30000 part of the request, has a much larger running time than that of branch A, which only looks for the few records with destination port 443.
Splitter initiated
Parsing and validation finished: 0.28
Started filtering
Fitlering time started at: 0.98
Finished filtering
Filtering required: 33.49
Filters ready
Splitter time elapsed: 33.61
Finished grouping branch A
Grouping time finished for branch A 40.49
Finished filtering groups for branch A
Finished grouping branch B
Grouping time finished for branch B 228.46
Finished filtering groups for branch B
Group filter time elapsed: 227.86
Finished merging branches: ['B', 'A']
Merger time elapsed: 252.77
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 253.31
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done
real 4m11.671s
user 4m9.296s
sys 0m4.880s
==================================HTTPS PROFILE===========================================
A query that matches flows by their source and destination ports is defined as follows. We used this query to compare the plain running times of different tools:
splitter S {}
filter www_tcp {
dstport = 443
}
filter www_port {
srcport = 443
}
grouper g_www_tcp {
module g1 {
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(srcport) as srcports
}
grouper g_www_port {
module g1 {
}
aggregate srcip, dstip, sum(bytes) as bytes, count(rec_id) as n,
bitOR(tcp_flags) as flags, union(dstport) as dstports
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches B, A
A m B delta 1440min
}
export m1
}
ungrouper U {}
"./netflow-trace.h5" -> S
S branch A -> www_tcp -> g_www_tcp -> ggf -> M
S branch B -> www_port -> g_www_port -> ggf -> M
M->U->"./ungroped.h5"
The same number of
/var/netflow/ft-data-fall09/sne-ft-data/2009/sneze/2009-11-29/
26521 records in total
486 records match
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py https-flows.flw
0.73
Splitter initiated
Started filtering
486
Filters ready
Splitter finished
Number of records in branch A 243 Number of records in branch B 243
Finished grouping branch A
Finished group-filtering for branch A
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined
Finished merging branches: ['B', 'A']
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 6.61
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done ./netflow-trace.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsB-merged.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA-merged.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/MergedM.h5... done ./ungroped.h5... done ./flowy-run/GroupsA.h5... done
real 0m14.245s
user 0m7.168s
sys 0m0.280s
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py https-flows.flw
0.81
Splitter initiated
Started filtering
486
Filters ready
Splitter finished
Number of records in branch B 243Number of records in branch A 243
Finished grouping branch A
Finished group-filtering for branch A
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined
Finished merging branches: ['B', 'A']
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 6.31
Closing remaining open files: ./netflow-trace.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/GroupsB-merged.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done ./ungroped.h5... done ./netflow-trace.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/GroupsA-merged.h5... done ./flowy-run/GroupsB.h5... done
real 0m9.051s
user 0m7.072s
sys 0m0.160s
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py https-flows.flw
0.83
Splitter initiated
Started filtering
Group filter time started: 0.23
486
Filters ready
Number of records in branch A 243
Splitter finished
Splitter time elapsed: 6.1
Finished grouping branch A
Finished group-filtering for branch A
Number of records in branch B 243
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 6.17
Finished merging branches: ['B', 'A']
Merger time elapsed: 6.23
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 7.36
Closing remaining open files: ./flowy-run/GroupsB.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsB-merged.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/GroupsA-merged.h5... done ./ungroped.h5... done ./flowy-run/MergedM.h5... done
real 0m15.893s
user 0m7.440s
sys 0m0.868s
Most frequent:
(('final_result', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 99), (26521, 1.8366894721984863, 1.7001189999999156))
(('reset', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 83), (26521, 3.138737678527832, 3.0042079999998066))
(('deep_copy', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 58), (26523, 1.7581963539123535, 1.6681159999999338))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 92), (26526, 3.3419792652130127, 3.0921969999998495))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 130), (26526, 9.8621282577514648, 9.6565820000015421))
(('iterate_fixed_fields', '/home/melnikovkolya/classes/semester-3-project/flowy/pytables.py', 96), (26526, 1.9721605777740479, 1.7561189999999769))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/record.py', 82), (27015, 4.6438140869140625, 4.6482780000005732))
(('mask', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 90), (53042, 1.6173598766326904, 1.5800989999999153))
(('EQ', '/home/melnikovkolya/classes/semester-3-project/flowy/operators.py', 63), (53044, 1.4263303279876709, 1.1120729999999632))
(('match', '/home/melnikovkolya/classes/semester-3-project/flowy/filter.py', 134), (53046, 5.1699655055999756, 4.6562810000002663))
Heaviest:
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/groupfilter_validator.py', 16), (1, 0.18725490570068359, 0.18801199999999962))
(('get_interval_records', '/home/melnikovkolya/classes/semester-3-project/flowy/timeindex.py', 57), (1, 0.2019498348236084, 0.20001300000000199))
(('pass_allen_indices_down', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 79), (1, 0.20258498191833496, 0.20001300000000199))
(('go', '/home/melnikovkolya/classes/semester-3-project/flowy/merger.py', 108), (1, 0.2162168025970459, 0.21201300000000245))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper_validator.py', 11), (1, 0.22698211669921875, 0.22401300000000002))
(('__iter__', '/home/melnikovkolya/classes/semester-3-project/flowy/grouper.py', 36), (4, 1.1266498565673828, 1.1920739999999945))
(('_form_master_re', '/usr/lib/pymodules/python2.6/ply/lex.py', 482), (1, 0.30334997177124023, 0.22801499999999986))
(('validate_rules', '/usr/lib/pymodules/python2.6/ply/lex.py', 723), (1, 0.33556008338928223, 0.31602000000000008))
(('validate_all', '/usr/lib/pymodules/python2.6/ply/lex.py', 567), (1, 0.33656787872314453, 0.31602000000000008))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/filter_validator.py', 10), (1, 0.37907099723815918, 0.3560230000000002))
(('go', '/home/melnikovkolya/classes/semester-3-project/flowy/groupfilter.py', 14), (2, 1.1871206760406494, 1.248076999999995))
(('create_impl', '/home/melnikovkolya/classes/semester-3-project/flowy/ungrouper_validator.py', 76), (1, 0.60985612869262695, 0.60803800000000052))
(('lex', '/usr/lib/pymodules/python2.6/ply/lex.py', 865), (1, 0.65552186965942383, 0.56003499999999995))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/parser.py', 9), (1, 0.6572871208190918, 0.56403499999999995))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/ungrouper_validator.py', 10), (1, 0.67348289489746094, 0.67204200000000114))
(('__init__', '/home/melnikovkolya/classes/semester-3-project/flowy/parser.py', 182), (1, 0.71254801750183105, 0.6200389999999999))
(('go', '/home/melnikovkolya/classes/semester-3-project/flowy/ungrouper.py', 29), (1, 1.85223388671875, 1.8081130000000023))
/var/netflow/ft-data-fall09/kur-ft-data/2009-11-17/
56992 records
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python ft2hdf.py /var/netflow/ft-data-fall09/kur-ft-data/2009-11-17/ netflow-trace.h5
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py https-flows.flw
0.7
Splitter initiated
Started filtering
Group filter time started: 0.27
219
Filters ready
Splitter finished
Splitter time elapsed: 13.2
Number of records in branch A 158
Finished grouping branch A
Finished group-filtering for branch A
Number of records in branch B 61
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 13.18
Finished merging branches: ['B', 'A']
Merger time elapsed: 13.23
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 13.83
Closing remaining open files: ./netflow-trace.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsB-merged.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/GroupsA-merged.h5... done ./ungroped.h5... done ./flowy-run/MergedM.h5... done
real 0m15.696s
user 0m13.653s
sys 0m1.004s
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python printhdf.py ungroped.h5 | wc -l
Closing remaining open files: ungroped.h5... done
219
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python ft2hdf.py /var/netflow/ft-data-fall09/kur-ft-data/2009-11-16/ netflow-trace.h5
99924
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# time python flowy.py https-flows.flw
0.71
Splitter initiated
Started filtering
Group filter time started: 0.27
1434
Filters ready
Splitter finished
Splitter time elapsed: 23.19
Number of records in branch A 748
Finished grouping branch A
Finished group-filtering for branch A
Number of records in branch B 686
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 23.23
Finished merging branches: ['B', 'A']
Merger time elapsed: 23.31
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 26.48
Closing remaining open files: ./netflow-trace.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsA.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsB-merged.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/GroupsA-merged.h5... done ./ungroped.h5... done ./flowy-run/MergedM.h5... done
real 0m28.767s
user 0m24.486s
sys 0m2.840s
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python printhdf.py ungroped.h5 | wc -l
Closing remaining open files: ungroped.h5... done
1434
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python ft2hdf.py /var/netflow/ft-data-fall09/sne-ft-data/2009/sneze/2009-11-* netflow-trace.h5
298063
root@melnikovkolya-laptop:/home/melnikovkolya/classes/semester-3-project/flowy# python printhdf.py ungroped.h5 | wc -l
Closing remaining open files: ungroped.h5... done
4087
0.76
Splitter initiated
Started filtering
4087
Filters ready
Splitter finished
Group filter time started: 53.73
Splitter time elapsed: 53.73
Number of records in branch A 2041
Finished grouping branch A
Finished group-filtering for branch A
Number of records in branch B 2046
Finished grouping branch B
Finished group-filtering for branch B
Group filter threads joined: 54.37
Finished merging branches: ['B', 'A']
Merger time elapsed: 54.47
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 63.47
Closing remaining open files: ./flowy-run/GroupsB-merged.h5... done ./netflow-trace.h5... done ./ungroped.h5... done ./flowy-run/MergedM.h5... done ./flowy-run/MergedM.h5... done ./netflow-trace.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsA.h5... done ./flowy-run/GroupsA-merged.h5... done ./flowy-run/GroupsB.h5... done ./flowy-run/GroupsB.h5... done
real 1m8.146s
user 1m3.576s
sys 0m0.776s
\begin{mytinylisting}
\begin{verbatim}
splitter S {}
filter www_req {
dstport = 443 OR dstport = 80 OR dstport = 8080
unix_secs > 1259413200
unix_secs < 1259445600
}
filter www_res {
srcport = 443 OR srcport = 80 OR srcport = 8080
unix_secs < 1259445600
unix_secs > 1259413200
}
grouper g_www_req {
module g1 {
}
aggregate bitOR(tcp_flags) as flags
}
grouper g_www_res {
module g1 {
}
aggregate bitOR(tcp_flags) as flags
}
groupfilter ggf {
bitAND(flags, 0x13) = 0x13
}
merger M {
module m1 {
branches B, A
A d B OR B d A
}
export m1
}
ungrouper U {}
"./h5ports.h5" -> S
S branch A -> www_req -> g_www_req -> ggf -> M
S branch B -> www_res -> g_www_res -> ggf -> M
M->U->"./portsungroped.h5"
\end{verbatim}
\end{mytinylisting}
%
Execution of that query produced the following output:
\begin{verbatim}
flowy# time python flowy.py ports.flw
0.83
Splitter initiated
Started filtering
Group filter time started: 0.3
1463
Filters ready
Splitter finished
Splitter time elapsed: 7.12
Number of records in branch B 1463
Finished grouping branch B
Finished group-filtering for branch B
Number of records in branch A 1463
Finished grouping branch A
Finished group-filtering for branch A
Group filter threads joined: 7.26
Finished merging branches: ['B', 'A']
Merger time elapsed: 7.26
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 13.92
real 0m14.788s
user 0m13.969s
sys 0m0.900s
\end{verbatim}

383
pytables.py Normal file
View file

@ -0,0 +1,383 @@
import tables
import os.path
from itertools import izip
from math import ceil, floor
default_fields = []
default_types = []
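# Builds a closure that pulls the listed attribute names out of a
# flow-tools record and returns them as a tuple.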
def create_flowtools_value_reader(fields):
def get_fields(record):
x = tuple(getattr(record,attr) for attr in fields)
return x
return get_fields
#class RecordsTable(object):
# def __init__(self, file_path, tree_path, id_size):
# if os.path.exists(file_path):
# self.file_path = file_path
# else:
# raise IOError("File %s cannot be accessed."%file_path)
# self.tree_path = tree_path
# # open for reading
# self.file = tables.openFile(self.file_path, mode="r+")
# try:
# #try to open the table as specified by path if node does not
# #exist create it
# self.table = self.file.getNode(self.tree_path)
# except tables.exceptions.NoSuchNodeError:
# raise IOError("file %s does not contain table %s"%
# (self.file_path,self.tree_path))
# self.fields = tuple(self.table.description._v_names)
# self.types = tuple([self.table.description._v_dtypes[name]
# for name in self.fields])
# # add the id field base on row number:
# self.fields += ('rec_id',)
# self.types += (id_size,)
#
# def __del__(self):
# self.file.close()
#
# def close(self):
# self.file.close()
class Table(object):
def __init__(self, file_path, id_size):
if os.path.exists(file_path):
self.file_path = file_path
else:
raise IOError("File %s cannot be accessed."%file_path)
# open for reading
self.file = tables.openFile(self.file_path, mode="r+")
# Returns the names of the fields that matter at the filter stage.
# i.e. srcport/dstport/prot/srcip/...
var_nodes = ['/' + field
for field in self.file.root._v_attrs.variable_fields]
self.table = self.file.getNode("/fixed_length_fields")
# print var_nodes
self.tables =[self.table.iterrows()] + map(self.file.getNode, var_nodes)
# print self.tables
self.fixed_fields = self.file.root._v_attrs.fixed_fields
# print self.fixed_fields
self.variable_fields = self.file.root._v_attrs.variable_fields
# add the id field base on row number:
self.fields = tuple(['rec_id'] +
self.fixed_fields +
self.variable_fields)
self.types = tuple([id_size] +
self.file.root._v_attrs.fixed_types +
self.file.root._v_attrs.variable_types)
# print self.file.root._v_attrs.variable_fields
def __del__(self):
self.file.close()
def close(self):
self.file.close()
def flush(self):
self.file.flush()
class FlowRecordsTable(Table):
"""A reader object for an HDF table of flow records"""
def __init__(self, file_path, expected_id_size = tables.UInt32Col()):
Table.__init__(self, file_path, id_size = expected_id_size)
def __iter__(self):
for row in izip(self.iterate_fixed_fields(), *self.tables[1:]):
yield row[0] + tuple(row[1:])
def iterate_fixed_fields(self):
for row in self.table:
yield tuple([row.nrow] +
[row[field] for field in self.fixed_fields])
raise StopIteration
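# Reconstructs one full record by row number: the fixed-width fields come
# from the main table row, and every variable-length field costs one
# read() call on its VLArray.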
def read_row(self, row_n):
row = [r for r in self.table.iterrows(row_n, row_n + 1)][0] #Is iterrows actually heavy itself?
fixed = tuple([row[field] for field in self.fixed_fields])
variable = tuple(table.read(row_n)[0] for table in self.tables[1:])
# print (row_n,) + fixed + variable
return (row_n,) + fixed + variable
def read_rows_list(self, rows_list):
for row_n in rows_list:
yield self.read_row(row_n)
def iter_ids(self, id_list):
return self.table.readCoordinates(id_list)
def get_record_by_id(self,id):
return self.table[id]
def __del__(self):
self.file.close()
def append(self, record):
self.row = self.table.row
for field in self.fixed_fields:
self.row[field] = getattr(record, field)
self.row.append()
for field in self.variable_fields:
getattr(self.file.root, field).append(getattr(record, field))
def get_current_row(self):
return self.row.nrow
@property
def len(self):
return self.table.nrows
class TimeIndex(FlowRecordsTable):
def __init__(self, fname, id_size_bytes=4):
FlowRecordsTable.__init__(self, fname, id_size_bytes)
self.start_time = self.file.root._v_attrs.start_time
self.delta = self.file.root._v_attrs.delta
self.id_size = id_size_bytes
self.index = self.tables[0]
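# Maps an absolute [stime, etime) range onto the bucket numbers of the
# time index, where each bucket spans self.delta time units.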
def get_intervals_list(self, stime, etime):
start_interval = int(floor((stime - self.start_time) / self.delta))
end_interval = int(ceil((etime - self.start_time) / self.delta))
if start_interval < 1 or end_interval < 1:
raise ValueError("Something's wrong with index intervals")
return xrange(start_interval, end_interval)
def get_intervals_before(self, record, time_before):
res = self.get_intervals_list(record.stime - time_before, record.stime)
return res
def get_intervals_after(self, record, time_after):
res = self.get_intervals_list(record.etime, record.etime + time_after)
return res
def during(self, record):
return self.index.get_intervals_list
def time_to_index_row(self, time):
return int(floor((time - self.start_time) / self.delta))
def index(self, record):
for i in self.get_intervals_list(record.stime, record.etime):
self.index[i] = self.index[i].append(record.rec_id)
#class FlowRecordsTable(RecordsTable):
# """A reader object for an HDF table of flow records"""
# def __init__(self, file_path, expected_id_size = tables.UInt32Col()):
# RecordsTable.__init__(self, file_path, "/flow_records",
# id_size = expected_id_size)
#
# def __iter__(self):
# for row in self.table:
# yield row[:] + (row.nrow,) # tuple concatenation
#
# raise StopIteration
#
# def iter_ids(self, id_list):
# return self.table.readCoordinates(id_list)
#
# def get_record_by_id(self,id):
# return self.table[id]
#
# def __del__(self):
# self.file.close()
#
# def append(self,args):
# self.row = self.table.row
## print zip(self.fields, args)
# for field, val in zip(self.fields, args):
# self.row[field]= val
# self.row.append()
def create_Table(file, fields, table_name, field_types, filters):
file.createTable(file.root, table_name, field_types,
"Records Table", filters=filters)
def create_VLArray(file, name, atom, description, filters):
array = file.createVLArray(file.root, name,
atom,
"variable length field "+name,
filters=filters)
array.flavor = 'python'
#def create_table_file(file_path, field_types, table_name="flow_records",
# complib='lzo', complevel=9):
# if os.path.exists(file_path):
# raise IOError("File %s already exists"%file_path)
#
# file = tables.openFile(file_path, mode="w")
# filters = tables.Filters(complevel=complevel, complib=complib)
# file.createTable(file.root, table_name, field_types,
# "Records Table", filters=filters)
# file.close()
def create_index_file(file_path, start_time, delta, id_size_bytes,
complib='lzo', complevel=9, itemsize_in_bytes = 4):
if os.path.exists(file_path):
raise IOError("File %s already exists"%file_path)
file = tables.openFile(file_path, mode="w")
filters = tables.Filters(complevel=complevel, complib=complib)
# create_VLArray expects the open file (not its root node) and already sets
# the 'python' flavor itself, so it does not return an array to adjust here.
create_VLArray(file, 'time_index',
tables.UIntAtom(itemsize=itemsize_in_bytes),
"time_index", filters=filters)
file.root._v_attrs.variable_fields = ['time_index']
file.root._v_attrs.variable_types = [
tables.UIntAtom(itemsize=itemsize_in_bytes)]
file.root._v_attrs.start_time = start_time
file.root._v_attrs.delta = delta
file.close()
def create_table_file(file_path, field_types,
complib='lzo', complevel=9):
if os.path.exists(file_path):
raise IOError("File %s already exists"%file_path)
file = tables.openFile(file_path, mode="w")
filters = tables.Filters(complevel=complevel, complib=complib)
# filters = tables.Filters()
if 'rec_id' in field_types:
del field_types['rec_id']
fixed_length_fields = {}
variable_length_fields = {}
for k, v in field_types.iteritems():
# print str(type(v)), str(type(v)).find('atom')
if str(type(v)).find('atom') == -1:
fixed_length_fields[k] = v
else:
variable_length_fields[k] = v
file.createTable(file.root, "fixed_length_fields", fixed_length_fields,
"Records Table", filters=filters)
for field_name, atom in variable_length_fields.iteritems():
array = file.createVLArray(file.root, field_name, atom, "field "
+ field_name, filters)
array.flavor = 'python'
file.root._v_attrs.fixed_fields = fixed_length_fields.keys()
file.root._v_attrs.fixed_types = fixed_length_fields.values()
file.root._v_attrs.variable_fields = variable_length_fields.keys()
file.root._v_attrs.variable_types = variable_length_fields.values()
file.close()
class GroupsMembersTable(object):
def __init__(self, file_path, tree_path):
self.file_path = file_path
self.tree_path = tree_path
# open for reading
self.file = tables.openFile(self.file_path, mode="r+")
try:
#try to open the table as specified by path if node does not
#exist create it
self.table = self.file.getNode(self.tree_path)
except tables.exceptions.NoSuchNodeError:
raise IOError("file %s does not contain table %s"%
(self.file_path,self.tree_path))
def __iter__(self):
for row in self.table:
yield row
raise StopIteration
def iter_ids(self, id_list):
for id in id_list:
yield self.table[id]
def get_group_by_id(self, id):
return self.table[id]
def __del__(self):
# self.table.flush()
self.file.close()
def append(self, val_list):
self.table.append(val_list)
# Performs ungrouping, based on the iterator of group records and an
# iterator over flow records
class GroupsExpander(object):
def __init__(self, groups_file_path, records_file_path):
self.groups = GroupsMembersTable(groups_file_path, "gr1")
self.records = FlowRecordsTable(records_file_path)
def group_members(self,group_id):
grp_member_ids = self.groups.get_group_by_id(group_id)
return self.records.iter_ids(grp_member_ids)
default_ft_types = {
'dFlows' : tables.UInt32Col(), 'bytes' : tables.UInt32Col(),
'dPkts' : tables.UInt32Col(), 'dst_as' : tables.UInt16Col(),
'dst_mask' : tables.UInt8Col(), 'dst_tag' : tables.UInt32Col(),
'dstip' : tables.UInt32Col(), 'dstport' : tables.UInt16Col(),
'engine_id' : tables.UInt8Col(), 'engine_type' : tables.UInt8Col(),
'exaddr' : tables.UInt32Col(), 'extra_pkts' : tables.UInt32Col(),
'stime' : tables.UInt32Col(), 'in_encaps' : tables.UInt8Col(),
'input' : tables.UInt16Col(), 'etime' : tables.UInt32Col(),
'marked_tos' : tables.UInt8Col(), 'nexthop' : tables.UInt32Col(),
'out_encaps' : tables.UInt8Col(), 'output' : tables.UInt16Col(),
'peer_nexthop' : tables.UInt32Col(), 'prot' : tables.UInt8Col(),
'router_sc' : tables.UInt32Col(), 'src_as' : tables.UInt16Col(),
'src_mask' : tables.UInt8Col(), 'src_tag' : tables.UInt32Col(),
'srcip' : tables.UInt32Col(), 'srcport' : tables.UInt16Col(),
'sysUpTime' : tables.UInt32Col(), 'tcp_flags' : tables.UInt8Col(),
'tos' : tables.UInt8Col(), 'unix_nsecs' : tables.UInt32Col(),
'unix_secs' : tables.UInt32Col()
}
#tab = FlowRecordsTable("../dynZip9.h5")
#for x in tab:
# print x
#print tab.fields
#wr = TableWriter("../test.h5","/dumps/table1")
#create_group_file("../grptest.h5", "gr1")
#grp = GroupsMembersTable("../grptest.h5", "/gr1")
#grp.append([1,3,5])
#grp.append([2,4])
#grp.append([4324904231490123,98])
#
#for ls in grp.iter_ids([1,2]):
# print ls
#grp.__del__()
#print [1,4,543,32]
#from os import remove
#fname = "../comp.h5"
#remove(fname)
#field_types = {'info': tables.UInt8Col(),
# 'records': tables.UInt8Atom(), 'info_list': tables.UInt8Atom()}
#create_complex_file(fname, field_types)
##h5f = tables.openFile(fname, 'r')
##print h5f.root._v_attrs.fixed_fields
##print h5f.root._v_attrs.fixed_types
##print h5f.root._v_attrs.variable_fields
##print h5f.root._v_attrs.variable_types
#
#cread = FlRecordsTable(fname)
#
#cread.append((999,[1,3],[1]))
#cread.append((2,[1,4],[2,4,999]))
#cread.close()
#
#read = FlRecordsTable(fname)
#for r in read:
# print r

BIN
pytables.pyc Normal file

Binary file not shown.

165
record.py Normal file
View file

@ -0,0 +1,165 @@
"""
This module provides methods for dynamically creating flow and
group record classes.
"""
def get_record_class(attributes, types=None, default_vals=None):
'''
Creates a record class for given attribute names.
Arguments:
attributes - a sequence of attribute names
types - optional sequence of attribute types, which
correspond to the attribute names in attributes.
Types may be of any type, and are not used by the
Record class, but are useful for external storage,
where data type has to be predetermined.
default_val - a sequence of default values which
correspond to the attribute names in attributes
Lists are used instead of dictionaries because the order
may be important.
Return:
Record class which has attributes with the names given
by attributes list. The class uses __slots__ to lower
memory usage as potentially millions of instance will
be present during runtime. The class has a constructor,
which takes as argument values for the attributes ordered
the same way as in the attributes list. If default values
are specified there is a default(no argument) constructor
as well.
NOTE that this method returns a class not an instance.
Raises:
ValueError if number of types or default values doesn't
match number of attributes.
'''
if default_vals and len(attributes) != len(default_vals):
raise ValueError(
"Number of attributes(%d) and number of default values(%d)"%
(len(attributes),len(default_vals))+" don't match")
if types and len(attributes) != len(types):
raise ValueError(
"Number of attributes(%d) and number of default types(%d)"%
(len(attributes),len(default_vals))+" don't match")
elif types:
types_dict = dict(zip(attributes, types))
else:
types_dict = {}
class Record(object):
'''
Record class contains flow or group record information.
It uses __slots__ to save memory because potentially millions of
FlowRecords will be used during run time.
Attributes:
attribute names are specified in cls.__slots__
defaults - contains the default values for attributes used
with default constructor.
attr_types - contains a dictionary of the types of
the attributes.
Methods:
__init__ - when defaults is specified __init__()
creates an object with default values. If no
defaults are specified during class creation
__init__() raises TypeError.
__init__(*args) takes exactly the same number
of arguments as the classes' number of attributes,
and creates new instance with the given values.
Argument order corresponds to the order of
attributes in cls.__slots__
'''
# set slots to conserve memory
# copy ([:]) don't reference to protect from unexpected changes
__slots__ = attributes[:]
attr_types = types_dict
num_of_fields = len(__slots__)
defaults = default_vals[:] if default_vals else None
def __init__(self, *args):
num_args = len(args)
if num_args == self.num_of_fields:
for name, value in zip(self.__slots__,args):
setattr(self, name, value)
elif num_args == 0 and self.defaults != None:
for name, value in zip(self.__slots__,self.defaults):
setattr(self, name, value)
elif self.defaults == None:
raise TypeError(
"__init__() takes %d arguments (%d given)"%
( self.num_of_fields + 1, num_args+1))
else:
raise TypeError(
"__init__() takes either 1 or %d arguments (%d given)"%
( self.num_of_fields + 1, num_args+1))
def tuple(self):
return tuple(getattr(self, field) for field in self.__slots__)
def __repr__(self):
res = "Recod("
for field in self.__slots__:
val = getattr(self, field)
if type(val) is str:
val = "'" + str(val) + "'"
else:
val = str(val)
res += val + ", "
res =res[:-2] + ")"
return res
def __str__(self):
res = "Recod: "
for field in self.__slots__:
val = getattr(self, field)
res += field + "->" + str(val) + ", "
res =res[:-2]
return res
return Record
class RecordReader(object):
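# Wraps a low-level reader (e.g. a FlowRecordsTable) and yields Record
# instances built from the tuples the reader produces.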
def __init__(self, reader_object):
self.reader = reader_object
#print self.reader.fields
self.Record = get_record_class(self.reader.fields)
def __iter__(self):
for tuple in self.reader:
yield self.Record(*tuple)
def read_rows_list(self, rows_list):
for tuple in self.reader.read_rows_list(rows_list):
yield self.Record(*tuple)
def read_row(self, row_n):
tup = self.reader.read_row(row_n)
return self.Record(*tup)
#from flowy import pytables
#ptread = pytables.FlowRecordsTable("../testFT.h5" )
#rr = RecordReader(ptread)
#for i in rr:
# print i.dOctets
#
#
#FlowRecord = get_record_class(["a","b"],["str","uint"],[1,6])
#
#def printSth(self):
# print "sth"
#
#FlowRecord.p = printSth
#
#x = FlowRecord(1,6)
#
#
#print x.a, x.b
#print x.__slots__
#
#t = FlowRecord()
#print t.a
#t.p()

BIN
record.pyc Normal file

Binary file not shown.

1387
run-output.txt Normal file

File diff suppressed because it is too large Load diff

25
run-output2.txt Normal file
View file

@ -0,0 +1,25 @@
0.82
[Input('./netflow-trace.h5', 50, set([]), set([]), set([])), BranchNode('S', 50, set([]), set([]))]
[BranchNode('S', 51, set([]), set([])), Branch('A', 51, None, set([]), set([])), BranchNode('www_req', 51, set([]), set([])), BranchNode('g_www_req', 51, set([]), set([])), BranchNode('ggf', 51, set([]), set([])), BranchNode('M', 51, set([]), set([]))]
[BranchNode('S', 52, set([]), set([])), Branch('B', 52, None, set([]), set([])), BranchNode('www_res', 52, set([]), set([])), BranchNode('g_www_res', 52, set([]), set([])), BranchNode('ggf', 52, set([]), set([])), BranchNode('M', 52, set([]), set([]))]
[BranchNode('M', 53, set([]), set([])), BranchNode('U', 53, set([]), set([])), Output('./ungroped.h5', 53, set([]), set([]), set([]))]
Splitter initiated
Parsing and validation finished: 0.31
Started filtering
Finished filtering
Filters ready
Splitter time elapsed: 346.66
Finished grouping branch B
Finished grouping branch A
Finished filtering groups for branch B
Finished filtering groups for branch A
Group filter time elapsed: 916.19
Finished merging branches: ['B', 'A']
Merger time elapsed: 1037.532704
Ungrouper U finished exectution
FINISHED!
Overall time elapsed: 1073.552704

64
splitter.py Normal file
View file

@ -0,0 +1,64 @@
from Queue import Queue
from Queue import Empty
import profiler
class Splitter(object):
def __init__(self, name_to_br, filter):
self.branches = name_to_br.values() # Returns the actual implementaion of Branches A and B, their values
self.name_to_branch = name_to_br
self.filter = filter
print "Splitter initiated"
def go(self):
count = 0
# Exactly rec and branch are returned, since that is specified
# by the 'generator' function, denoted by 'yield' inside the
# __iter__ function. Every time an __iter__ is called, one tuple
# of (rec, branch) is returned
for rec, branch in self.filter:
self.split(branch, rec)
count = count + 1
print count
self.ready()
def split(self, branch_mask, record):
# print zip(self.branches, branch_mask)
for branch, active in zip(self.branches, branch_mask):
# print active, branch
if active:
branch.put(record)
# if branch.name == 'A': print record
# if branch.name == 'B': print record
# print branch
def ready(self):
print "Filters ready"
for br in self.branches:
br.ready = True
class Branch(Queue):
def __init__(self, name):
Queue.__init__(self, 0)
self.name = name
self.ready = False
def __iter__(self):
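# Drain the queue until the splitter marks this branch ready and no records
# remain; the 3-second timeout lets the loop re-check the ready flag.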
while(True):
if self.empty() and self.ready:
raise StopIteration
try:
record = self.get(timeout=3)
yield record
self.task_done()
except Empty:
if self.ready:
raise StopIteration

BIN
splitter.pyc Normal file

Binary file not shown.

25
splitter_validator.py Normal file
View file

@ -0,0 +1,25 @@
from copy import copy, deepcopy
from splitter import Branch as BranchImpl
from splitter import Splitter as SplitterImpl
class SplitterValidator(object):
def __init__(self, parser, filter_validator):
self.splitter = copy(parser.splitter)
self.branches = deepcopy(parser.branches)
self.branch_ids = filter_validator.branches_ids
self.filter_impl = filter_validator.impl
self.br_name_to_br = {}
self.impl = self.create_impl()
def sort_branches(self):
id_to_branch = dict(zip(self.branch_ids.values(),
self.branch_ids.keys()))
sorted_br = [id_to_branch[k] for k in sorted(id_to_branch.keys())]
return sorted_br
def create_impl(self):
br_names = self.sort_branches()
branches = [BranchImpl(name) for name in br_names]# Actual branch instances are being initiated
name_to_br = dict(zip(br_names, branches))# These instances are being mapped to the corresponding names, i.e. A, B
self.br_name_to_br = name_to_br
return SplitterImpl(name_to_br, self.filter_impl)

BIN
splitter_validator.pyc Normal file

Binary file not shown.

181
statement.py Normal file
View file

@ -0,0 +1,181 @@
class Splitter(object):
def __init__(self, name, line, branches=None):
self.name = name
self.line = line
self.branches = branches if branches else set()
def __repr__(self):
str = "Splitter('%s', %s, %s)"%(self.name, self.line, self.branches)
return str
class Ungrouper(object):
def __init__(self, name, line, branches=None):
self.name = name
self.line = line
self.branches = branches if branches else set()
def __repr__(self):
str = "Ungrouper('%s', %s, %s)"%(self.name, self.line, self.branches)
return str
class Input(object):
def __init__(self, name, line, inputs=None, outputs=None, branches=None):
self.name = name
self.line = line
self.branches = branches if branches else set()
self.inputs = inputs if inputs != None else set()
self.outputs = outputs if outputs != None else set()
def __repr__(self):
str = "Input('%s', %s, %s, %s, %s)"%(self.name, self.line,
self.branches, self.inputs,
self.outputs)
return str
class Output(object):
def __init__(self, name, line, inputs=None, outputs=None, branches=None):
self.name = name
self.line = line
self.branches = branches if branches else set()
self.inputs = inputs if inputs != None else set()
self.outputs = outputs if outputs != None else set()
def __repr__(self):
str = "Output('%s', %s, %s, %s, %s)"%(self.name, self.line,
self.branches, self.inputs,
self.outputs)
return str
class Branch(object):
def __init__(self, name, line, members=None, inputs=None, outputs=None):
self.name = name
self.members = members if members != None else set()
self.line = line
self.inputs = inputs if inputs != None else set()
self.outputs = outputs if outputs != None else set()
def __repr__(self):
str = "Branch('%s', %s, %s, %s, %s)"%(self.name, self.line,
self.members, self.inputs,
self.outputs)
return str
class BranchNode(object):
def __init__(self, name, line, inputs=None, outputs=None):
self.name = name
self.line = line
self.inputs = inputs if inputs != None else set()
self.outputs = outputs if outputs != None else set()
def __repr__(self):
str = "BranchNode('%s', %s, %s, %s)"%(self.name, self.line ,
self.inputs, self.outputs)
return str
class Rule(object):
def __init__(self, op, line, args, NOT=False):
self.op = op
self.args = args
self.NOT = NOT
self.line = line
def __repr__(self):
str = "Rule('%s', %s, %s, %s)"%(self.op, self.line,
self.args, self.NOT)
return str
def __str__(self):
return "%s%s" % (self.op, self.args)
def __eq__(self, other):
return str(self)== str(other)
def __hash__(self):
return hash(str(self))
class AllenRule(Rule):
def __repr__(self):
str = "AllenRule('%s', %s, %s, %s)"%(self.op, self.line,
self.args, self.NOT)
return str
class Field(object):
def __init__(self, name):
self.name = name
def __repr__(self):
return "Field('%s')"%self.name
class GrouperRule(object):
def __init__(self, op, line, args):
self.line = line
self.args = args
self.op = op
def __repr__(self):
str = "GrouperRule('%s', %s, %s)"%(self.op, self.line, self.args)
return str
class Filter(object):
def __init__(self, name, line, rules, branches=None):
self.name = name
self.rules = rules
self.line = line
self.branches = branches if branches else set()
def __repr__(self):
str = "Filter('%s', %s, %s, %s)"%(self.name, self.line, self.rules,
self.branches)
return str
class Module(Filter):
def __repr__(self):
str = "Module('%s', %s, %s, %s)"%(self.name, self.line,
self.rules, self.branches)
return str
class Grouper(object):
def __init__(self, name, line, modules, aggr, branches=None):
self.name = name
self.aggr = aggr
self.modules = modules
self.line = line
self.branches = branches if branches else set()
def __repr__(self):
str = "Grouper('%s', %s, %s, %s, %s)"%(self.name, self.line,
self.modules, self.aggr, self.branches)
return str
class Merger(object):
def __init__(self, name, line, modules, export, branches=None):
self.name = name
self.export = export
self.modules = modules
self.line = line
self.branches = branches if branches else set()
def __repr__(self):
str = "Merger('%s', %s, %s, %s, %s)"%(self.name, self.line,
self.modules, self.export, self.branches)
return str
class FilterRef(object):
def __init__(self, name, line, NOT=False):
self.name = name
self.NOT = NOT
self.line = line
def __repr__(self):
str = "FilterRef('%s', %s, %s)"%(self.name, self.line, self.NOT)
return str
class Arg(object):
def __init__(self, type, value, str=''):
self.type = type
self.value = value
self.str = str
def __repr__(self):
str = "Arg('%s', %s, '%s')"%(self.type, repr(self.value), self.str)
return str

BIN
statement.pyc Normal file

Binary file not shown.

13
stats_print.py Normal file
View file

@ -0,0 +1,13 @@
import pickle
f = open('./profile_stats1')
stats = pickle.load(f)
#for st in sorted(filter(lambda a: a[1][0]>1 and a[1][1]>10, stats), key=lambda a: a[1][1]/a[1][0], reverse=True):
# print st[0], st[1][1]/st[1][0], st[1][1], st[1][0]
for st in sorted(stats, key=lambda a: a[1][1], reverse=True):
print st
#for st in sorted(stats, key=lambda a: a[1][0], reverse=True):
# if st[0][1].find('flowy/src/flowy') != -1:
# print (st[0][1].partition('flowy/src/flowy/'))[2], st[0][0], st[0][2], st[1][0], st[1][1]

62
timeindex.py Normal file
View file

@ -0,0 +1,62 @@
#from pytables import FlowRecordsTable
#from pytables import create_table_file
#from tables import UIntAtom
from math import floor
#class TimeIndex(object):
# def __init__(self, start_time, delta, id_size_bytes=4):
# self.start_time = start_time
# self.delta = delta
# self.id_size = id_size_bytes
# self.index = self.get_index_file()
#
# def get_index_file(self):
# if self.index: return self.index
# create_table_file(self.file, {'t': UIntAtom(self.id_size)})
#
# self.index = FlowRecordsTable(self.file)
# self.index.
# return self.index
class TimeIndex(object):
def __init__(self, interval=1000, maxsize=10**5):
self.interval = float(interval)
self.index = {}
self.maxsize = maxsize
self.mintime = float('inf') # later replaced with int
self.maxtime = float('-inf') # later replaced with int
@property
def len(self):
return len(self.index)
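# Maps a [stime, etime] span to the xrange of bucket numbers it covers;
# each bucket is self.interval time units wide.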
def get_interval(self, stime, etime):
start = int(floor(stime/self.interval))
end = int(floor(etime/self.interval) + 1)
return xrange(start, end)
def update_min_max_time(self, record):
if self.mintime > record.stime:
self.mintime = record.stime
if self.maxtime < record.etime:
self.maxtime = record.etime
def get_total_interval(self):
return self.get_interval(self.mintime, self.maxtime)
def add(self, record):
interval = self.get_interval(record.stime, record.etime)
for i in interval:
self.index.setdefault(i, set()).add(record.rec_id)
self.update_min_max_time(record)
if self.len > self.maxsize:
print "Warning large index"
def get_interval_records(self, stime, etime):
res = set()
for i in self.get_interval(stime, etime):
res |= self.index.setdefault(i, set()) # set union
return sorted(res)

BIN
timeindex.pyc Normal file

Binary file not shown.

Some files were not shown because too many files have changed in this diff.