initial commit

This commit is contained in:
Johannes Schauer 2010-06-18 13:28:40 +02:00
commit 4dd9907c39

479
parser.py Normal file
View file

@ -0,0 +1,479 @@
from urllib2 import Request, urlopen
from urllib import urlencode, quote_plus, urlretrieve
from gzip import GzipFile
from cStringIO import StringIO
from zlib import decompress
from struct import unpack
from time import sleep
from datetime import timedelta, date, datetime
from lxml import etree
def get_id(station):
    """
    return the first id string that get_ids() yields for station+"!"

    raises IndexError when get_ids() returns an empty list
    """
    # NOTE(review): the trailing "!" presumably asks the server for an exact
    # match (search_ids uses "?") - confirm against the service
    matches = get_ids("%s!" % station)
    return matches[0]
def search_ids(station):
    """
    return every id string that get_ids() yields for station+"?"
    """
    # NOTE(review): the trailing "?" presumably asks the server for a fuzzy
    # search (get_id uses "!") - confirm against the service
    candidates = get_ids("%s?" % station)
    return candidates
def get_ids(query):
    """
    post a station lookup to the railnavigator service and return the "i"
    attribute of every <MLc> element of the answer

    query is placed into the n="..." attribute of the request, so it is
    XML-escaped first (otherwise names containing ", & or < would produce a
    malformed request)

    returns a list of id strings like "A=1@O=...@X=...@"; elements without
    an "i" attribute are left out
    """
    # local import: nothing else in the file needs it
    from xml.sax.saxutils import escape

    body = ('<?xml version="1.0" encoding="UTF-8" ?>'
            '<ReqC ver="1.1" prod="String" lang="DE">'
            '<MLcReq><MLc n="%s" t="ST" /></MLcReq></ReqC>'
            % escape(query, {'"': "&quot;"}))
    req = Request("http://railnavigator.bahn.de/bin/rnav/query.exe/dn",
                  body,
                  {"User-Agent": "Java/1.6.0_0"})

    # the server does not escape quotes inside attribute values, e.g.
    #   <MLc t="ST" n="Bahlen Germann "Bahler Buur", Dinklage" i="A=1@O=...
    # so a recovering parser is needed to get through the answer at all
    parser = etree.XMLParser(recover=True, encoding='iso-8859-1')
    response = urlopen(req)
    try:
        tree = etree.parse(response, parser)
    finally:
        response.close()

    # make sure None entries from badly parsed nodes are not in the output
    return [elem.get("i") for elem in tree.findall(".//MLc") if elem.get("i")]
def parse_i(i):
    """
    split an id string of the form "A=1@O=Aachen Hbf@X=6092250@" into a dict

    fields are separated by "@", key and value by the first "="; a value may
    therefore contain further "=" characters
    empty segments (empty input, "@@") are ignored
    raises ValueError for a non-empty segment that has no "="
    """
    d = dict()
    for t in i.strip('@').split("@"):
        if not t:
            # "".split("@") yields [""] - nothing to parse in there
            continue
        key, value = t.split("=", 1)
        d[key] = value
    return d
def compile_i_from_station(station):
    """
    build an id string from a station name by delegating to get_id()
    """
    station_i = get_id(station)
    return station_i
def compile_i_from_stationid(stationid):
    """
    build an id string "A=1@L=<id>@" from a numeric station id

    stationid may be an int (8000013) or a numeric string ("008000013", the
    form parse_i() produces for the "L" field); the id is zero padded to
    nine digits
    raises ValueError if a string is given that is not a number
    """
    return "A=1@L=%09d@" % int(stationid)
def compile_i_from_coords(x, y, name=None):
    """
    build an id string "A=16@O=<name>@X=<x>@Y=<y>@" from coordinates

    x and y are given in degrees and written as integer micro degrees
    name is optional, "---" is used when it is missing or empty
    """
    if not name:
        name = "---"
    # round instead of truncating: x*1000000 can end up a hair below the
    # intended integer because of float representation
    return "A=16@O=%s@X=%d@Y=%d@" % (name,
                                     int(round(x * 1000000)),
                                     int(round(y * 1000000)))
#infile = open("../cities_germany_211009.clean.txt", "r")
#outfile = open("../cities_germany_291009.out.txt", "w")
#for city in infile:
# print city,
# for station in search_ids(city):
# outfile.write("%s\t%s\t%s\t%s\n"%(station["O"], station["X"], station["Y"], station["L"]))
# sleep(10)
#infile.close()
#outfile.close()
#exit()
#big plan
def get_big_pln_data():
    """
    request a "pln" plan for the hard coded route Aachen Hbf -> Augsburg Hbf
    from persoenlicherfahrplan.bahn.de

    the answer is a whitespace separated list of key=value tokens; the
    values of the "url" and "time" tokens are returned as a tuple
    (url, time), each being "" when the token is missing
    """
    req = Request("http://persoenlicherfahrplan.bahn.de/bin/pf/query-p2w.exe/dn",
                  urlencode({"start": "1",
                             "pp": "5",
                             "ZID": "A=1@X=10885568@Y=48365444@O=Augsburg Hbf@L=008000013@U=80@K=S1-0N1@G=8000013@C=49@a=128@B=1@",
                             "output": "pln",
                             "hcount": "0",
                             "h2gversion": "6.20.7",
                             "spmo": "1",
                             "htype": "MicroEmulator-2.0",
                             "SID": "A=1@X=6092250@Y=50767829@O=Aachen Hbf@L=008000001@U=80@K=S1-0N1@G=8000001@C=49@a=128@B=1@",
                             "L": "vs_javapln",
                             "p2wIVRoute": "1",
                             "p2wCreateMaps": "1"}),
                  {"User-Agent": "Java/1.6.0_0"})
    response = urlopen(req)
    try:
        tokens = response.read().split()
    finally:
        response.close()
    time = ""
    url = ""
    for t in tokens:
        key, value = t.split("=", 1)
        if key == "url":
            url = value
        elif key == "time":
            time = value
    # the values used to be computed and then thrown away
    return url, time


def test_pln(url=None):
    """
    poll url until the answer contains a "url" token, then download the
    file that token points to into the local file "pln"

    url defaults to the status url handed out by get_big_pln_data()
    every "time" token seen while waiting causes a 10 second sleep and is
    printed
    """
    if url is None:
        url, _ = get_big_pln_data()
    url2 = ""
    while url2 == "":
        response = urlopen(url)
        try:
            tokens = response.read().strip().split("\n")
        finally:
            response.close()
        print(tokens)
        for t in tokens:
            key, value = t.split("=", 1)
            if key == "url":
                url2 = value
                break
            elif key == "time":
                sleep(10)
                print(value)
    urlretrieve(url2, "pln")
# next: ignoreMinuteRound=yes&REQ0HafasScrollDir=1&h2g-direct=1&seqnr=1&ident=ox.0348695.1256054390&
def get_departures(station, dt):
    """
    fetch the departure board of a station from mobile.bahn.de

    station  either a station name (str), which is resolved through
             get_id()/parse_i() to its "L" value, or anything else, which
             is sent unchanged as the "input" field
    dt       datetime providing the date and time of the request

    returns a list with one dict per "<Journey " record of the answer: the
    two fields of the first line are stored as "L" and "O", every further
    key=value line with a non-empty value is stored under its own key
    """
    if isinstance(station, str):
        station = parse_i(get_id(station))["L"]
    req = Request("http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?",
                  urlencode({
                      "start": "yes",
                      "date": dt.strftime("%d.%m.%Y"),
                      "time": dt.strftime("%H:%M"),
                      "boardType": "dep",
                      "sTI": "0",
                      "L": "vs_java3",
                      # was the undefined name dep_station (NameError)
                      "input": station,
                      "productsFilter": "1"*14
                  }),
                  {"User-Agent": "Java/1.6.0_0"})
    response = urlopen(req)
    try:
        raw = response.read()
    finally:
        response.close()
    # the first and last four characters are cut off before the records are
    # separated at the "<Journey " tag boundaries
    stops = raw[4:-4].split("\n/>\n<Journey ")
    result = list()
    for stop in stops:
        tokens = stop.split("\n")
        evaid, name = tokens[0].split(" ", 1)
        s = dict()
        s["L"] = evaid.split("=")[1].strip("\"")
        s["O"] = name.split("=")[1].strip("\"")
        for t in tokens[1:]:
            key, value = t.split("=", 1)
            value = value.strip("\" ")
            if value:
                s[key] = value
        result.append(s)
    return result
def get_stops(train, dep_station, arr_station, dt):
    """
    fetch the list of stops of a train from mobile.bahn.de

    train        sent as "REQTrain_name"
    dep_station  either a station name (str), which is resolved through
                 get_id()/parse_i() to its "L" value, or anything else,
                 which is sent unchanged as the "input" field
    arr_station  sent unchanged as the "dirInput" field
    dt           datetime providing the date and time of the request

    returns a list with one dict per "<St " record of the answer: the two
    fields of the first line are stored as "L" and "O", every further
    key=value line with a non-empty value is stored under its own key
    """
    # the check used to test the undefined name "station" (NameError)
    # NOTE(review): arr_station is not resolved - confirm whether the
    # service expects a station id for dirInput as well
    if isinstance(dep_station, str):
        dep_station = parse_i(get_id(dep_station))["L"]
    req = Request("http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?",
                  urlencode({
                      "start": "yes",
                      "REQTrain_name": train,
                      "date": dt.strftime("%d.%m.%Y"),
                      "time": dt.strftime("%H:%M"),
                      "boardType": "dep",
                      "sTI": "1",
                      "L": "vs_java3",
                      "input": dep_station,
                      "dirInput": arr_station,
                      "productsFilter": "1"*14
                  }),
                  {"User-Agent": "Java/1.6.0_0"})
    response = urlopen(req)
    try:
        raw = response.read()
    finally:
        response.close()
    # the first and last four characters are cut off before the records are
    # separated at the "<St " tag boundaries
    stops = raw[4:-4].split("\n/>\n<St ")
    result = list()
    for stop in stops:
        tokens = stop.split("\n")
        evaid, name = tokens[0].split(" ", 1)
        s = dict()
        s["L"] = evaid.split("=")[1].strip("\"")
        s["O"] = name.split("=")[1].strip("\"")
        for t in tokens[1:]:
            key, value = t.split("=", 1)
            value = value.strip("\" ")
            if value:
                s[key] = value
        result.append(s)
    return result
#late:
#http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?
# start=yes
# &time=actual
# &rT.2=13%3A51
# &rT.1=13%3A51
# &boardType=dep
# &hcount=0
# &h2gversion=6.20.7
# &rZ.2=RE%2010125
# &htype=MicroEmulator-2.0
# &rZ.1=RE%2010125
# &L=vs_java
# &input=%238000001&
def get_pln_data(start, end, stop1=None, stop2=None, dt=None, bike=False,
        ice=True, ic=True, ir=True, re=True, sbahn=True, bus=True, boat=True,
        subway=True, tram=True, taxi=True):
    """
    query mobile.bahn.de for connections and return the gzip compressed
    answer as a readable, seekable file object

    start, end    id strings sent as "SID" and "ZID"
    stop1, stop2  optional id strings sent as "VID1" and "VID2"
    dt            datetime of the query, defaults to now
    bike          when true "REQ0JourneyProduct_opt3" is set to "1"
    ice .. taxi   product switches; a product list is only sent when at
                  least one of them is false

    the result is what PlnParse expects as its input
    """
    if dt is None:
        dt = datetime.now()
    query = {"ignoreMinuteRound": "yes",
             "date": dt.strftime("%d.%m.%Y"),
             "ZID": end,
             "h2g-direct": "1",
             "time": dt.strftime("%H:%M"),
             "SID": start,
             "start": "1"}
    products = [ice, ic, ir, re, sbahn, bus, boat, subway, tram, taxi]
    if not all(products):
        # ten switches followed by four positions that are always "1"
        query["REQ0JourneyProduct_prod_list_1"] = \
            "".join(str(int(x)) for x in products)+"1111"
    if bike:
        query["REQ0JourneyProduct_opt3"] = "1"
    if stop1:
        query["VID1"] = stop1
    if stop2:
        query["VID2"] = stop2
    req = Request("http://mobile.bahn.de/bin/mobil/query.exe/dn",
                  urlencode(query),
                  {"User-Agent": "Java/1.6.0_0"})
    r = urlopen(req)  # is a file like object but does not have tell()
    try:
        # this is why a stringio has to be created for gzip
        s = StringIO(r.read())
    finally:
        # everything is buffered in s, the connection is no longer needed
        r.close()
    # possibly replacable by manually reading the gzip header
    return GzipFile(mode="r", fileobj=s)
class PlnParse:
    """
    parser for the binary connection data ("pln") that get_pln_data() hands
    out

    all data in little endian
    offsets are always counted from start of the file
    latitude, longitude are in wgs84 format
    ushort is of size 2
    uint is of size 4

    after construction the following attributes are available:
      version         format version, only 5 is accepted
      start_station   dict with "name", "X", "Y"
      end_station     dict with "name", "X", "Y"
      timetable_info  dict with "timetable_begin", "timetable_end", "today"
                      (dates) and "timetable_remaining" (days)
      request_id      request id string (also available under the historic
                      misspelling reqest_id)
      products        dict product name -> bool, or None
      connections     list of dicts with "freq", "number_of_changes",
                      "duration" and "trains"
    """

    def __init__(self, fh):
        """
        parse the whole file

        fh must be a seekable file like object opened for reading bytes
        raises IOError if the file is not of version 5 or if a string is
        not terminated, struct.error if the file is too short
        """
        self.f = fh
        self.strings = dict()
        self.connections = list()

        # pos   type    description
        # 0x00  ushort  version
        self.version, = self._unpack_at(0x00, "<H", 2)
        if self.version != 5:
            raise IOError("unknown version: %d" % self.version)

        # pos   type    description
        # 0x02  ushort  string reference to start station name
        # 0x04  uint    unknown
        # 0x08  uint    longitude of start station
        # 0x0C  uint    latitude of start station
        start_station, _, sX, sY = self._unpack_at(0x02, "<H3I", 14)

        # pos   type    description
        # 0x10  ushort  string reference to end station name
        # 0x12  uint    unknown
        # 0x16  uint    longitude of end station
        # 0x1A  uint    latitude of end station
        end_station, _, eX, eY = self._unpack_at(0x10, "<H3I", 14)

        # pos   type    description
        # 0x1e  ushort  number of connections found
        # 0x20  uint    position of connection frequency data
        # 0x24  uint    position of string block
        number_of_conn, self.frequencies_offset, self.strings_offset \
            = self._unpack_at(0x1e, "<HII", 10)

        # now that we read the string_offset, get the station names
        self.start_station = {"name": self.get_string(start_station), "X": sX, "Y": sY}
        self.end_station = {"name": self.get_string(end_station), "X": eX, "Y": eY}

        # pos   type    description
        # 0x28  ushort  begin date of current timetable version
        # 0x2a  ushort  end date of current timetable version
        # 0x2c  ushort  date of query
        # 0x2e  ushort  remaining days in current timetable version
        timetable_begin, timetable_end, today, timetable_remaining \
            = self._unpack_at(0x28, "<4H", 8)
        self.timetable_info = {
            "timetable_begin": self.parse_date(timetable_begin),
            "timetable_end": self.parse_date(timetable_end),
            "today": self.parse_date(today),
            "timetable_remaining": timetable_remaining}

        # 0x30: six bytes holding an unknown string, not read

        # pos   type    description
        # 0x36  uint    position of city descriptions
        # 0x3a  uint    position of train property data
        self.cities_offset, self.train_props_offset \
            = self._unpack_at(0x36, "<II", 8)

        #TODO: <begin big (46 bytes) ugly unknown section>
        # a lot of unknown stuff which consists either of never changing data
        # or of zero-ed data
        # but seems to be not important for the rest as its size is static and
        # no static information seems to be missing
        # probably 46 bytes of obsolete data?
        additional_offset1, _, additional_offset2 \
            = self._unpack_at(0x3e, "<3I", 12)
        # one unknown ushort; the value is not used but the read is kept so
        # that a file which is cut off here is still rejected
        self._unpack_at(additional_offset1, "<H", 2)
        unknown = self._unpack_at(additional_offset2, "<IIHHIHHI4H", 32)
        # the only useful stuff: two string references
        request_id, products = unknown[3], unknown[6]
        self.reqest_id = self.get_string(request_id)
        # correctly spelled alias, the misspelled name is kept for callers
        self.request_id = self.reqest_id
        self.products = self.parse_products(self.get_string(products))
        #TODO: </end big ugly unknown section>

        for i in range(number_of_conn):
            self.connections.append(self._read_connection(i))

    def _unpack_at(self, offset, fmt, size):
        """
        seek to offset and unpack size bytes according to fmt
        """
        self.f.seek(offset)
        return unpack(fmt, self.f.read(size))

    def _read_connection(self, i):
        """
        read connection number i together with all of its trains

        0 <= i < number_of_conn
        pos         type    description
        0x4a+12*i   ushort  frequency of this connection
        0x4c+12*i   uint    position of train list after train_list_offset
        0x50+12*i   ushort  number of trains
        0x52+12*i   ushort  number of changes
        0x54+12*i   ushort  duration of this connection
        """
        freq, train_list_offset, number_of_trains, number_of_changes, duration \
            = self._unpack_at(0x4a + 12*i, "<HI3H", 12)
        trains = [self._read_train(train_list_offset, j)
                  for j in range(number_of_trains)]
        return {
            "freq": self.get_frequency(freq),
            "number_of_changes": number_of_changes,
            "duration": self.parse_time(duration),
            "trains": trains}

    def _read_train(self, train_list_offset, j):
        """
        read train number j of the train list at train_list_offset

        0 <= j < number_of_trains, base = 0x4a+train_list_offset+j*20
        pos         type    description
        base+0x00   ushort  departure time
        base+0x02   ushort  departure station
        base+0x04   ushort  arrival time
        base+0x06   ushort  arrival station
        base+0x08   ushort  transportation method
        base+0x0a   ushort  train identification
        base+0x0c   ushort  arrival track
        base+0x0e   ushort  departure track
        base+0x10   ushort  notes like final destination of tram
        base+0x12   ushort  train properties
        """
        dep_time, dep_station, arr_time, arr_station, transportation, \
            train, arr_track, dep_track, note, train_properties \
            = self._unpack_at(0x4a + train_list_offset + j*20, "<10H", 20)
        return {
            "dep_time": self.parse_time(dep_time),
            "dep_station": self.get_city(dep_station),
            "arr_time": self.parse_time(arr_time),
            "arr_station": self.get_city(arr_station),
            "transportation_type": self.parse_transportation(transportation),
            "train": self.get_string(train),
            "arr_track": self.get_string(arr_track),
            "dep_track": self.get_string(dep_track),
            "note": self.get_string(note),
            "train_properties": self.get_train_props(train_properties)}

    def parse_products(self, products):
        """
        used means of transportation for a returned connection set is given as
        14 chars being "1" or "0"; the first 10 tell whether the product is
        being considered or not.
        this method maps this string to a dict
        returns None if the string is missing or not 14 chars long
        """
        if not products or len(products) != 14:
            return None
        p = ["ice", "ic", "ir", "re", "sbahn", "bus", "boat", "subway", "tram", "taxi"]
        return dict(zip(p, (bool(int(c)) for c in products[:10])))

    def parse_time(self, time):
        """
        time is stored as an integer which, when represented as a string can be
        split to get a string representation of the time
        1345 => 13:45
        512 => 5:12
        5 => 0:05
        """
        return "%d:%02d" % divmod(time, 100)

    def parse_date(self, d):
        """
        dates are expressed as integers of days since 01.01.1980
        """
        return date(1980, 1, 1)+timedelta(days=d)

    def parse_transportation(self, t):
        """
        transportation can be by some train or by foot
        raises Exception for any value other than 1 and 2
        """
        if t == 1:
            return "feet"
        elif t == 2:
            return "train"
        else:
            raise Exception("transportation %d unexpected" % t)

    def get_frequency(self, offset):
        """
        given the offset, get the frequency a connection is scheduled from the
        data block beginning at frequencies_offset
        returns a tuple of the frequency string and three ushorts
        the last three values are still a myth as they dont seem to correspond
        with the information in the string referenced by the first value
        """
        #TODO: what do the last values mean?
        #TODO: where are the days of service properly encoded?
        freq, u1, u2, u3 = self._unpack_at(self.frequencies_offset+offset, "<4H", 8)
        return self.get_string(freq), u1, u2, u3

    def get_string(self, offset):
        """
        given the offset, return a zero terminated string from the stringblock
        surrounding whitespace is removed and "---" is returned as None
        raises IOError if the file ends before the terminating zero
        """
        if offset in self.strings:
            # get it from dict to prevent ugly f.read(1)
            return self.strings[offset]
        self.f.seek(self.strings_offset+offset)
        chars = []
        # read zero terminated string.. ugly in py..
        while True:
            b = self.f.read(1)
            if b == b"\0":
                break
            if not b:
                # end of file: without this check the loop never ends
                raise IOError("unterminated string at offset %d" % offset)
            chars.append(b)
        # some strings come with many whitespaces at the end
        result = b"".join(chars).strip()
        if not isinstance(result, str):
            # python 3 hands out bytes; latin-1 maps every byte to exactly
            # one character so nothing can get lost
            # NOTE(review): the real encoding of the string block is not
            # known from this file - confirm
            result = result.decode("iso-8859-1")
        # by convention of the format "---" means None
        if result == "---":
            result = None
        # fill a dict with strings to prevent too much ugly f.read(1)
        self.strings[offset] = result
        return result

    def get_train_props(self, offset):
        """
        given the offset, get the string list of train properties from the
        data block beginning at train_props_offset. the first ushort marks the
        amount of properties
        """
        self.f.seek(self.train_props_offset+offset)
        n, = unpack("<H", self.f.read(2))
        # read all references first, get_string() moves the file position
        refs = unpack("<%dH" % n, self.f.read(2*n))
        return [self.get_string(ref) for ref in refs]

    def get_city(self, offset):
        """
        given the offset, get data about a city from the data block beginning
        at cities_offset. offset counts records of 14 bytes each.
        the data includes the city name, its code and latitude, longitude
        """
        name, L, X, Y = self._unpack_at(self.cities_offset+offset*14, "<H3I", 14)
        return {"name": self.get_string(name), "L": L, "X": X, "Y": Y}
#f = open("pln", "r")
# demo run, executed whenever the module is loaded: query the connections
# from Bremen to Karlsruhe for this time tomorrow and print each of them
a = PlnParse(
    get_pln_data(
        compile_i_from_station("Bremen"),
        compile_i_from_station("Karlsruhe"),
        dt=datetime.today() + timedelta(days=1)
    )
)
#a = PlnParse(get_pln_data(compile_i_from_coords(8.6535, 53.1667), compile_i_from_station("Bremen"), dt = datetime.today()+timedelta(days=4)))
#a = PlnParse(f)
#def bin(i):
# return "".join(str((i >> y) & 1) for y in range(16-1, -1, -1))
#print a.timetable_info
for conn in a.connections:
    print(conn)
#for conn in a.connections:
# print "\t".join([conn["freq"][0], bin(conn["freq"][1]), bin(conn["freq"][2]), bin(conn["freq"][3])])
#for conn in a.connections:
# print conn["freq"], conn["trains"][0]["dep_station"]["L"], conn["trains"][0]["train"], conn["trains"][0]["dep_time"]