flowy/ft2hdf.py

184 lines
6.7 KiB
Python
Raw Permalink Normal View History

2010-11-05 17:57:01 +00:00
#!/usr/bin/python
from pytables import FlowRecordsTable
import pytables
import ftreader
import record
import os
from os.path import split, join, islink
import re
import sys
from bisect import bisect, bisect_left
from operator import itemgetter
from optparse import OptionParser
#def ft2hdf(ft_file, hdf_file):
# ft_fields = ftreader.find_fields(ft_file)
# fields = ftreader.translate_field_names(ft_fields,
# ftreader.default_names_dict)
# field_types = dict((field,pytables.default_ft_types[field])
# for field in fields)
## print field_types
# pytables.create_table_file(hdf_file, field_types)
# rec_table = pytables.FlowRecordsTable(hdf_file)
# # since pytables is initiated with dictionary there is no way to
# # sort the fields order, so we have to translate back in order
# # to keep the fields names order
# ordered_ft_fields = ftreader.translate_field_names(rec_table.fields,
# ftreader.reverse_names_dict)
# flow_set = ftreader.FlowToolsReader(ft_file, ordered_ft_fields)
# for flow in flow_set:
# rec_table.append(flow)
# rec_table.close()
def ft2hdf_single(ft_file, hdf_file):
    """Convert a single flow-tools file into a newly created HDF table file.

    The fields reported by ``ftreader.find_fields`` for ``ft_file`` are
    translated with ``ftreader.default_names_dict``, their column types are
    looked up in ``pytables.default_ft_types``, the table file is created
    and every record read from ``ft_file`` is appended to it.

    :param ft_file: path of the flow-tools file to read.
    :param hdf_file: path of the HDF table file to create and fill.
    """
    ft_fields = ftreader.find_fields(ft_file)
    fields = ftreader.translate_field_names(ft_fields,
                                            ftreader.default_names_dict)
    field_types = dict((field, pytables.default_ft_types[field])
                       for field in fields)
    pytables.create_table_file(hdf_file, field_types)
    rec_table = pytables.FlowRecordsTable(hdf_file)
    try:
        # since pytables is initiated with dictionary there is no way to
        # sort the fields order, so we have to translate back in order
        # to keep the fields names order
        ordered_ft_fields = ftreader.translate_field_names(
            rec_table.fields, ftreader.reverse_names_dict)
        # The first table field is skipped when naming the reader's fields
        # (the sibling ft2hdf notes that it is 'id_rec').
        flow_set = ftreader.FlowToolsReader(ft_file, ordered_ft_fields,
                                            rec_table.fields[1:])
        for flow in record.RecordReader(flow_set):
            rec_table.append(flow)
    finally:
        # Close the table even when reading or appending fails, so the
        # output file handle is not leaked.
        rec_table.close()
def ft2hdf(many_files, hdf_file):
    """Convert several flow-tools files into one newly created HDF table file.

    The table layout is derived from the first file only: its fields are
    translated with ``ftreader.default_names_dict`` and typed through
    ``pytables.default_ft_types``.  Records from every file in
    ``many_files`` are then appended, in the order given.

    :param many_files: sequence of flow-tools file paths; indexing an empty
        list or tuple raises ``IndexError``.
    :param hdf_file: path of the HDF table file to create and fill.
    """
    first_file = many_files[0]
    # returns fields present in the flow record
    ft_fields = ftreader.find_fields(first_file)
    fields = ftreader.translate_field_names(ft_fields,
                                            ftreader.default_names_dict)
    field_types = dict((field, pytables.default_ft_types[field])
                       for field in fields)
    pytables.create_table_file(hdf_file, field_types)
    rec_table = pytables.FlowRecordsTable(hdf_file)
    try:
        # since pytables is initiated with dictionary there is no way to
        # sort the fields order, so we have to translate back in order
        # to keep the fields names order
        ordered_ft_fields = ftreader.translate_field_names(
            rec_table.fields, ftreader.reverse_names_dict)
        for ft_file in many_files:
            # all fields except 'id_rec'
            flow_set = ftreader.FlowToolsReader(ft_file, ordered_ft_fields,
                                                rec_table.fields[1:])
            for flow in record.RecordReader(flow_set):
                rec_table.append(flow)
    finally:
        # Close the table even when one of the inputs fails part-way, so
        # the output file handle is not leaked.
        rec_table.close()
def printHDF(hdf_file):
    """Print every record stored in ``hdf_file`` to standard output.

    :param hdf_file: path of the HDF table file to open.
    """
    table = pytables.FlowRecordsTable(hdf_file)
    try:
        for rec in record.RecordReader(table):
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print(rec)
    finally:
        # The table was previously left open after printing.
        table.close()
class FSLoop(Exception):
    """Raised by findFiles when a directory walk produces a
    (timestamp, path) entry that an earlier walk already collected."""
def findFiles(path, start_time, end_time, filter_files = False):
    """Collect flow-tools capture files below ``path``, ordered by timestamp.

    A file is selected when its name matches the ``ft-v05.`` pattern below.
    Its sort key is the integer built from the year, month, day and
    six-digit time groups of the name (the last captured group is ignored).
    ``os.walk`` does not descend into symlinked directories, so those are
    collected and walked in follow-up rounds until none are left.

    :param path: directory to start searching from.
    :param start_time: lower bound (inclusive), a value ``int()`` accepts in
        the same digit layout as the key; ``None`` means no lower bound.
    :param end_time: upper bound (inclusive), same form as ``start_time``;
        ``None`` means no upper bound.
    :param filter_files: the bounds are applied only when this is true.
    :returns: list of file paths sorted by ascending timestamp key.
    :raises FSLoop: when a walk yields a (key, path) pair that an earlier
        round already collected.
    """
    timeExp = re.compile(r"ft-v05\.(\d{4})-(\d{2})-(\d{2}).(\d{6}).(\d{4})")
    time_file_list = []
    # Mirrors time_file_list so the duplicate check is a constant-time set
    # lookup instead of a scan of the whole list for every file.
    seen = set()
    dir_links = [path]

    def walkDirs(dir_links):
        # One round: walk every given directory, returning the matching
        # (key, path) pairs and the symlinked directories met on the way.
        file_list = []
        more_dir_links = []
        for link in dir_links:
            for root, dirs, files in os.walk(link):
                for file_name in files:
                    match = timeExp.search(file_name)
                    if match:
                        key = int(''.join(match.groups()[:-1]))
                        element = (key, join(root, file_name))
                        # NOTE(review): paths reached through a symlink
                        # differ from the direct path, so this may not
                        # catch every link cycle -- confirm.
                        if element in seen:
                            raise FSLoop
                        file_list.append(element)
                for dir_name in dirs:
                    dir_path = join(root, dir_name)
                    if islink(dir_path):
                        # A leftover debug print of the unrelated last file
                        # name used to be emitted here; it was dropped.
                        more_dir_links.append(dir_path)
        return file_list, more_dir_links

    while dir_links:
        found, dir_links = walkDirs(dir_links)
        time_file_list.extend(found)
        seen.update(found)

    # Stable sort on the timestamp key only, as the former cmp function did.
    time_file_list.sort(key=itemgetter(0))

    if filter_files:
        keys = [entry[0] for entry in time_file_list]
        begin = 0
        end = len(time_file_list)
        # the start and end time must be converted to integers before they
        # can be compared with the keys
        if start_time is not None:
            begin = bisect_left(keys, int(start_time))
        if end_time is not None:
            end = bisect(keys, int(end_time))
        time_file_list = time_file_list[begin:end]

    return [file_path for _, file_path in time_file_list]
def dateToInt(date):
    """Validate a date/time string and return its digits as one string.

    The accepted layout is year (4 digits), month, day, hour, minute and
    second (2 digits each), with any run of ``-``, space, ``:`` or ``/``
    allowed between the parts, e.g. ``"2010-11-05 17:57:01"`` or
    ``"20101105175701"``.  Trailing separators and whitespace are tolerated.

    The whole string must match: input with trailing garbage or extra
    digits is rejected instead of leaking into the result.

    :param date: the date/time string to check.
    :returns: the 14 digits of the date as a string.
    :raises ValueError: if ``date`` does not have the layout above.
    """
    number_of_digits = [4, 2, 2, 2, 2, 2]
    separators = '[- :/]*'
    expr = separators.join(r"(\d{%s})" % digits
                           for digits in number_of_digits)
    # Anchor the end too; match() alone only anchors the start.
    timeExp = re.compile(expr + separators + r"\s*\Z")
    result = timeExp.match(date)
    if result is None:
        raise ValueError("invalid date format")
    # Joining the captured groups works for any string type, unlike the
    # two-argument str.translate that exists only on Python 2 byte strings.
    return ''.join(result.groups())
def lotsOfFolders(paths, start_time=None, end_time=None):
    """Gather the flow-tools files of several folders, sorted by file name.

    Both bounds are normalised with ``dateToInt`` (``None`` is passed
    through), every folder is searched with ``findFiles`` with filtering
    enabled, duplicate paths are dropped and the result is ordered by file
    name.  Equal file names in different folders are ordered by folder so
    the result does not depend on set iteration order.

    :param paths: iterable of folders to search.
    :param start_time: optional lower bound as a date string.
    :param end_time: optional upper bound as a date string.
    :returns: list of full file paths.
    :raises ValueError: propagated from ``dateToInt`` for a malformed bound.
    """
    start_time, end_time = [dateToInt(d) if d is not None else d
                            for d in (start_time, end_time)]
    full_file_paths = []
    for path in paths:
        full_file_paths.extend(findFiles(path, start_time, end_time, True))
    # Split into (folder, name) pairs; the set removes duplicate paths.
    split_paths = set(split(file_path) for file_path in full_file_paths)
    # Sort on the name first and the folder second for a stable result.
    split_paths = sorted(split_paths, key=itemgetter(1, 0))
    return [join(folder, name) for folder, name in split_paths]
def main():
    """Command-line entry point: convert flow-tools folders to one .h5 file.

    The last positional argument is the output file and must end in
    ``.h5``; all preceding positional arguments are input folders.  The
    optional ``--start-time``/``-s`` and ``--end-time``/``-e`` values are
    handed to ``lotsOfFolders`` to restrict the files by date.

    Exits with status 2 and a usage message when no positional argument is
    given, and with status 1 when the output name has the wrong extension
    or no input files are found.
    """
    usage = 'usage: %prog [options] input_path1 [input_path2 [...]] output_file.h5'
    p = OptionParser(usage)
    p.add_option('--start-time', '-s')
    p.add_option('--end-time', '-e')
    options, arguments = p.parse_args()
    if not arguments:
        # Indexing arguments[-1] used to fail with an IndexError traceback.
        p.error('no input paths or output file given')
    folders = arguments[:-1]
    output = arguments[-1]
    # endswith() also accepts names that contain an earlier '.h5', which
    # the former find()-based check wrongly rejected.
    if not output.endswith('.h5'):
        sys.stderr.write('Output file should have an .h5 extension\n')
        sys.exit(1)
    file_paths = lotsOfFolders(folders, options.start_time, options.end_time)
    if not file_paths:
        sys.stderr.write('No flow-tools files found\n')
        sys.exit(1)
    ft2hdf(file_paths, output)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()