# hafaspln/parser.py

from urllib2 import Request, urlopen
from urllib import urlencode, quote_plus, urlretrieve
from gzip import GzipFile
from cStringIO import StringIO
from struct import unpack
from time import sleep
from datetime import timedelta, date, time, datetime
from lxml import etree

# verbosity threshold for debug(): a message is printed when DEBUG >= its level
# the parser below emits messages at levels 1 (read positions), 2 (raw values)
# and 3 (derived values), so 3 prints everything and 0 silences the output
DEBUG = 3

def get_id(station):
	"""
	return the first id string the location service reports for `station`

	the name is sent with a trailing "!" (presumably asking the server for an
	exact match - not verifiable from this file)
	raises IndexError when the service returns no usable entry
	"""
	exact_query = station + "!"
	matches = get_ids(exact_query)
	return matches[0]

def search_ids(station):
	"""
	return all id strings the location service reports for `station`

	the name is sent with a trailing "?" (presumably asking the server for a
	fuzzy search - not verifiable from this file)
	"""
	search_query = station + "?"
	return get_ids(search_query)

def get_ids(query):
	"""
	send a location query to the railnavigator service and return the "i"
	attribute of every MLc element found in the answer

	query	station name including the "!" or "?" suffix added by
		get_id/search_ids
	returns a list of id strings like "A=1@O=...@X=...@", possibly empty
	"""
	# local import so the module level imports stay untouched
	from xml.sax.saxutils import escape
	# the name ends up inside a double quoted xml attribute, so quotes,
	# ampersands and angle brackets have to be escaped or the request breaks
	safe_query = escape(query, {'"': "&quot;"})
	req = Request("http://railnavigator.bahn.de/bin/rnav/query.exe/dn",
			'<?xml version="1.0" encoding="UTF-8" ?><ReqC ver="1.1" prod="String" lang="DE"><MLcReq><MLc n="%s" t="ST" /></MLcReq></ReqC>'%safe_query,
			{"User-Agent":"Java/1.6.0_0"})
	# the answer may contain unescaped quotes inside attributes, e.g.
	# <MLc t="ST" n="Bahlen Germann "Bahler Buur", Dinklage" i="A=1@O=Bahlen Germann "Bahler Buur", Dinklage@X=81
	# so the parser has to run in recover mode
	parser = etree.XMLParser(recover=True, encoding='iso-8859-1')
	response = urlopen(req)
	try:
		tree = etree.parse(response, parser)
	finally:
		# do not leak the connection when parsing fails
		response.close()
	# make sure None entries from badly parsed nodes are not in the output
	return [elem.get("i") for elem in tree.findall(".//MLc") if elem.get("i")]

def parse_i(i):
	"""
	split an id string like "A=1@O=Aachen Hbf@L=008000001@" into a dict

	i	"@" separated list of key=value tokens; only the first "=" of a
		token separates key and value
	returns a dict mapping each key to its (string) value; an empty dict
	for an empty id string
	raises ValueError for a non-empty token without "="
	"""
	d = dict()
	for t in i.split("@"):
		if not t:
			# leading, trailing or doubled "@" produce empty tokens
			continue
		key, value = t.split("=", 1)
		d[key] = value
	return d

def compile_i_from_station(station):
	"""
	build an id string for a station given by name

	simply delegates to get_id, so the service is queried on every call
	"""
	station_i = get_id(station)
	return station_i

def compile_i_from_stationid(stationid):
	"""
	build an id string from a numeric station id

	stationid	the station number, either as an integer or as a string
			of digits (parse_i hands the "L" value back as a zero
			padded string like "008000013")
	returns "A=1@L=<id zero padded to 9 digits>@"
	raises ValueError when a string does not contain a number
	"""
	# int() accepts both forms; "%09d" alone rejects strings
	return "A=1@L=%09d@"%(int(stationid),)

def compile_i_from_coords(x, y, name=None):
	"""
	build an id string for an arbitrary coordinate

	x, y	coordinate in degrees; both are written as integers scaled by
		1000000
	name	optional label, "---" is used when it is missing or empty
	returns "A=16@O=<name>@X=<x*10^6>@Y=<y*10^6>@"
	"""
	if not name:
		name="---"
	# round instead of truncating: the float product can end up a hair below
	# the intended integer, which int() alone would cut down by one
	return "A=16@O=%s@X=%d@Y=%d@"%(name, int(round(x*1000000)), int(round(y*1000000)))

#infile = open("../cities_germany_211009.clean.txt", "r")
#outfile = open("../cities_germany_291009.out.txt", "w")
#for city in infile:
#	print city,
#	for station in search_ids(city):
#		outfile.write("%s\t%s\t%s\t%s\n"%(station["O"], station["X"], station["Y"], station["L"]))
#	sleep(10)
#infile.close()
#outfile.close()
#exit()


#big plan
def get_big_pln_data():
	"""
	request the "big plan" for the hard coded connection Aachen Hbf ->
	Augsburg Hbf from the persoenlicherfahrplan service

	the answer is read as whitespace separated key=value tokens of which
	"url" and "time" are picked up
	returns the tuple (url, time), both as strings; a value the answer does
	not contain is the empty string
	"""
	req = Request("http://persoenlicherfahrplan.bahn.de/bin/pf/query-p2w.exe/dn",
			urlencode({"start":"1",
				   "pp":"5",
					   "ZID":"A=1@X=10885568@Y=48365444@O=Augsburg Hbf@L=008000013@U=80@K=S1-0N1@G=8000013@C=49@a=128@B=1@",
					   "output":"pln",
					   "hcount":"0",
					   "h2gversion":"6.20.7",
					   "spmo":"1",
					   "htype":"MicroEmulator-2.0",
					   "SID":"A=1@X=6092250@Y=50767829@O=Aachen Hbf@L=008000001@U=80@K=S1-0N1@G=8000001@C=49@a=128@B=1@",
					   "L":"vs_javapln",
					   "p2wIVRoute":"1",
					   "p2wCreateMaps":"1"}),
			{"User-Agent":"Java/1.6.0_0"})
	tokens = urlopen(req).read().split()
	# not called "time" to keep datetime.time usable inside this function
	wait_time = ""
	url = ""
	for t in tokens:
		if "=" not in t:
			# nothing to split, would otherwise abort the whole parse
			continue
		key, value = t.split("=", 1)
		if key == "url":
			url = value
		elif key == "time":
			wait_time = value
	# hand the parsed values back instead of silently dropping them
	return url, wait_time

def test_pln(url=None):
	"""
	poll `url` until the answer contains a "url=..." line and download what
	that line points to into the file "pln" in the current directory

	every answer is read as newline separated key=value lines; a "time" line
	makes the function wait 10 seconds before it goes on

	url	address to poll; when left out the module level name `url` is used
		(the function relied on that global before it had a parameter)
	raises NameError when no url is available at all
	"""
	if url is None:
		url = globals().get("url")
		if url is None:
			raise NameError("test_pln: no url given and no module level 'url' defined")
	url2 = ""
	while url2 == "":
		tokens = urlopen(url).read().strip().split("\n")
		print(tokens)
		for t in tokens:
			key, value = t.split("=", 1)
			if key == "url":
				url2 = value
				break
			elif key == "time":
				sleep(10)
				print(value)
	urlretrieve(url2, "pln")

# next: ignoreMinuteRound=yes&REQ0HafasScrollDir=1&h2g-direct=1&seqnr=1&ident=ox.0348695.1256054390&


def get_departures(station, dt):
	"""
	fetch the departure board of a station from mobile.bahn.de

	station	station id, or a station name (str) which is resolved to its
		id through get_id first
	dt	datetime the board is requested for
	returns a list with one dict per "\\n/>\\n<Journey " separated record of
	the answer: "L" and "O" come from the first line of a record, every
	further key=value line with a non-empty value is added under its own key
	"""
	if isinstance(station, str):
		# a name was passed in, look up the matching id
		station = parse_i(get_id(station))["L"]
	form = urlencode({
		"start":"yes",
		"date":dt.strftime("%d.%m.%Y"),
		"time":dt.strftime("%H:%M"),
		"boardType":"dep",
		"sTI":"0", # can be left out
		"L":"vs_java3", # also vs_java
		"input":station,
		"productsFilter":"1"*14
		})
	req = Request("http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?",
			form,
			{"User-Agent":"Java/1.6.0_0"})
	answer = urlopen(req).read()
	result = []
	# the first and last four characters of the answer are cut off before
	# the records are separated
	for journey in answer[4:-4].split("\n/>\n<Journey "):
		lines = journey.split("\n")
		evaid, name = lines[0].split(" ", 1)
		entry = {}
		entry["L"] = evaid.split("=")[1].strip("\"")
		entry["O"] = name.split("=")[1].strip("\"")
		for line in lines[1:]:
			key, value = line.split("=", 1)
			value = value.strip("\" ")
			if not value:
				continue
			entry[key] = value
		result.append(entry)
	return result

def get_stops(train, dep_station, arr_station, dt):
	"""
	fetch the stops of one train from mobile.bahn.de

	train		train name sent as REQTrain_name, e.g. 'RE 36072'
	dep_station	station id, or a station name (str) which is resolved
			to its id through get_id first
	arr_station	same as dep_station, sent as direction (dirInput)
	dt		datetime the board is requested for
	returns a list with one dict per "\\n/>\\n<St " separated record of the
	answer: "L" and "O" come from the first line of a record, every further
	key=value line with a non-empty value is added under its own key
	"""
	# resolve names into the variables that are actually sent below
	if isinstance(dep_station, str):
		dep_station = parse_i(get_id(dep_station))["L"]
	if isinstance(arr_station, str):
		arr_station = parse_i(get_id(arr_station))["L"]
	req = Request("http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?",
			urlencode({
				"start":"yes",
				"REQTrain_name":train,
				"date":dt.strftime("%d.%m.%Y"),
				"time":dt.strftime("%H:%M"),
				"boardType":"dep",
				"sTI":"1",
				"L":"vs_java3",
				"input":dep_station,
				"dirInput":arr_station,
				"productsFilter":"1"*14
				}),
			{"User-Agent":"Java/1.6.0_0"})
	# the first and last four characters of the answer are cut off before
	# the records are separated
	stops = urlopen(req).read()[4:-4].split("\n/>\n<St ")
	result = list()
	for stop in stops:
		tokens = stop.split("\n")
		evaid, name = tokens[0].split(" ", 1)
		s = dict()
		s["L"] = evaid.split("=")[1].strip("\"")
		s["O"] = name.split("=")[1].strip("\"")
		for t in tokens[1:]:
			key, value = t.split("=", 1)
			value = value.strip("\" ")
			if value:
				s[key] = value
		result.append(s)
	return result

#print get_stops('RE 36072', 8010334, 8010310, datetime(2010, 06, 18, 15, 43))

#late:
#http://mobile.bahn.de/bin/mobil/bhftafel.exe/dn?
#	start=yes
#	&time=actual
#	&rT.2=13%3A51
#	&rT.1=13%3A51
#	&boardType=dep
#	&hcount=0
#	&h2gversion=6.20.7
#	&rZ.2=RE%2010125
#	&htype=MicroEmulator-2.0
#	&rZ.1=RE%2010125
#	&L=vs_java
#	&input=%238000001&

def get_pln_data(start, end, stop1=None, stop2=None, dt=None, bike=False,
	ice=True, ic=True, ir=True, re=True, sbahn=True, bus=True, boat=True,
	subway=True, tram=True, taxi=True):
	"""
	query mobile.bahn.de for connections and return the answer as a file
	like object which decompresses it on the fly

	start, end	id strings of origin and destination (see the
			compile_i_from_* helpers), sent as SID and ZID
	stop1, stop2	optional id strings of intermediate stops (VID1, VID2);
			stop2 is only used together with stop1
	dt		datetime of the query, defaults to now
	bike		when true REQ0JourneyProduct_opt3 is set to "1"
	ice .. taxi	means of transport to consider; the product filter is
			only sent when at least one of them is switched off
	returns a GzipFile reading from the in-memory answer
	"""
	if dt is None:
		dt = datetime.now()
	query = {"ignoreMinuteRound":"yes",
			"date":dt.strftime("%d.%m.%Y"),
			"ZID":end,
			"h2g-direct":"1",
			"time":dt.strftime("%H:%M"),
			"SID":start,
			"start":"1"}

	products = [ice, ic, ir, re, sbahn, bus, boat, subway, tram, taxi]
	if not all(products):
		# bool() first so any truthy value becomes a clean "1"
		query["REQ0JourneyProduct_prod_list_1"] = \
				"".join(str(int(bool(x))) for x in products)+"1111"
	if bike:
		query["REQ0JourneyProduct_opt3"] = "1"
	if stop1:
		query["VID1"]=stop1
		if stop2:
			query["VID2"]=stop2

	req = Request("http://mobile.bahn.de/bin/mobil/query.exe/dn",
			urlencode(query),
			{"User-Agent":"Java/1.6.0_0"})
	r = urlopen(req) # is a file like object but does not have tell()
	try:
		s = StringIO(r.read()) # this is why a stringio has to be created for gzip
	finally:
		# everything is buffered in s, the connection is not needed anymore
		r.close()
	return GzipFile(mode="r", fileobj=s) # possibly replacable by manually reading the gzip header

def debug(level, *args):
	"""
	print a %-formatted message in bold when DEBUG is at least `level`

	level	verbosity level of the message
	args	format string followed by the values to fill in
	"""
	if DEBUG < level:
		return
	message = args[0]%args[1:]
	# \033[1m switches bold on, \033[0m resets the terminal again
	print("\033[1m"+message+"\033[0m")

class PlnParse:
	"""
	parser for version 5 of the binary pln connection data

	all data in little endian
	offsets are always counted from start of the file
	latitude, longitude are in wgs84 format
	ushort is of size 2
	uint is of size 4

	fh has to be a file like object supporting seek(), tell() and read()
	the whole file is parsed in the constructor, afterwards the results are
	available as attributes:
	version			format version (always 5)
	start_station		dict with "name", "X", "Y"
	end_station		dict with "name", "X", "Y"
	timetable_info		dict with timetable dates
	request_id		request id string (reqest_id is the old,
				misspelled name and is kept for compatibility)
	products		dict as returned by parse_products
	connections		list of dicts with "freq", "number_of_changes",
				"duration" and "trains"
	raises IOError for an unknown version or a string without terminator
	"""
	def __init__(self, fh):
		self.f = fh
		# cache of already read strings, keyed by their offset
		self.strings = dict()
		self.connections = list()

		# pos	 type	description
		# 0x00	ushort  version
		self.f.seek(0x00)
		debug(1, "%08x: read %d bytes for version", self.f.tell(), 2)
		self.version, = unpack("<H", self.f.read(2))
		if self.version != 5:
			raise IOError("unknown version: %d"%self.version)

		# pos	 type	description
		# 0x02	ushort  string reference to start station name
		# 0x04	uint	unknown
		# 0x08	uint	longitude of start station
		# 0x0C	uint	latitude of start station
		self.f.seek(0x02)
		debug(1, "%08x: read %d bytes for start station, unknown, longitude and latitude", self.f.tell(), 14)
		start_station, u1, sX, sY = unpack("<H3I", self.f.read(14))
		debug(2, "\tstart_station: %d, u1: %d, sX: %d, sY: %d", start_station, u1, sX, sY)

		# pos	 type	description
		# 0x10	ushort  string reference to end station name
		# 0x12	uint	unknown
		# 0x16	uint	longitude of end station
		# 0x1A	uint	latitude of end station
		self.f.seek(0x10)
		debug(1, "%08x: read %d bytes for end station, unknown, longitude and latitude", self.f.tell(), 14)
		end_station, u1, eX, eY = unpack("<H3I", self.f.read(14))
		debug(2, "\tend_station: %d, u1: %d, eX: %d, eY: %d", end_station, u1, eX, eY)

		# pos	 type	description
		# 0x1e	ushort  number of connections found
		# 0x20	uint	position of connection frequency data
		# 0x24	uint	position of string block
		self.f.seek(0x1e)
		debug(1, "%08x: read %d bytes for number of connections, frequency offset and string offset", self.f.tell(), 10)
		number_of_conn, self.frequencies_offset, self.strings_offset = unpack("<HII", self.f.read(10))
		debug(2, "\tnumber_of_conn: %d, frequencies_offset: %d, strings_offset: %d", number_of_conn, self.frequencies_offset, self.strings_offset)
		debug(3, "\t\thex(frequencies_offset) = %08x", self.frequencies_offset)
		debug(3, "\t\thex(strings_offset) = %08x", self.strings_offset)

		#now that we read the string_offset, get the station names
		self.start_station = {"name":self.get_string(start_station), "X":sX, "Y":sY}
		self.end_station = {"name":self.get_string(end_station), "X":eX, "Y":eY}
		debug(3, "\t\tget_string(start_station) = %s", self.get_string(start_station))
		debug(3, "\t\tget_string(end_station) = %s", self.get_string(end_station))

		# pos	 type	description
		# 0x28	ushort  begin date of current timetable version
		# 0x2a	ushort  end date of current timetable version
		# 0x2c	ushort  date of query
		# 0x2e	ushort  remaining days in current timetable version
		self.f.seek(0x28)
		debug(1, "%08x: read %d bytes for dates of beginning, ending and remaining days of the current timetable", self.f.tell(), 8)
		timetable_begin, timetable_end, today, timetable_remaining = unpack("<4H", self.f.read(8))
		self.timetable_info = {
			"timetable_begin":self.parse_date(timetable_begin),
			"timetable_end":self.parse_date(timetable_end),
			"today":self.parse_date(today),
			"timetable_remaining":timetable_remaining}
		debug(2, "\ttimetable_begin: %d, timetable_end: %d, today: %d, timetable_remaining: %d", timetable_begin, timetable_end, today, timetable_remaining)

		# 6 bytes of unknown meaning, only shown in the debug output
		self.f.seek(0x30)
		debug(1, "%08x: read %d bytes for unknown string", self.f.tell(), 6)
		debug(2, "\tunknown string: %s", self.f.read(6))

		# pos	 type	description
		# 0x36	uint	position of city descriptions
		# 0x3a	uint	position of train property data
		self.f.seek(0x36)
		debug(1, "%08x: read %d bytes for city and train props offset", self.f.tell(), 8)
		self.cities_offset, self.train_props_offset = unpack("<II", self.f.read(8))
		debug(2, "\tcities_offset: %d, train_props_offset: %d", self.cities_offset, self.train_props_offset)

		#TODO: <begin big (46 bytes) ugly unknown section>
		# a lot of mostly unknown stuff which consists either of never changing data
		# or of zero-ed data
		# but seems not to be important as its size is static and
		# no static information seems to be missing
		# probably 46 bytes of obsolete data?
		# u5_offset and u8_offset are only non-zero for partial pln data
		self.f.seek(0x3e)
		debug(1, "%08x: read %d bytes for lots of mostly useless stuff", self.f.tell(), 12)
		additional_offset1, u1, additional_offset2 = unpack("<3I", self.f.read(12))
		debug(2, "\tadditional_offset1: %d, u1: %d, additional_offset2: %d", additional_offset1, u1, additional_offset2)
		debug(3, "\t\thex(additional_offset1) = %08x", additional_offset1)
		debug(3, "\t\thex(additional_offset2) = %08x", additional_offset2)

		self.f.seek(additional_offset1)
		debug(1, "%08x: read %d bytes for some unknown", self.f.tell(), 2)
		u1, = unpack("<H", self.f.read(2))
		debug(2, "\tu1: %d", u1)

		self.f.seek(additional_offset2)
		debug(1, "%08x: read %d bytes for a huge bunch of unknowns", self.f.tell(), 32)
		u1, u2, request_number, request_id, u5_offset, u6, products, u8_offset, u9, u10, u11, u12 = unpack("<IIHHIHHI4H", self.f.read(32))
		debug(2, "\tu1: %d, u2: %d, request_number: %d, request_id: %d, u5_offset: %d, u6: %d, products: %d, u8_offset: %d, u9: %d, u10: %d, u11: %d, u12: %d", u1, u2, request_number, request_id, u5_offset, u6, products, u8_offset, u9, u10, u11, u12)
		debug(3, "\t\tget_string(request_id) = %s", self.get_string(request_id))
		debug(3, "\t\tget_string(products) = %s", self.get_string(products))
		debug(3, "\t\thex(u5_offset) = %08x", u5_offset)
		debug(3, "\t\thex(u8_offset) = %08x", u8_offset)

		# the only useful stuff:
		# reqest_id is the historic (misspelled) attribute name, keep it
		# next to the correct one so existing users do not break
		self.request_id = self.reqest_id = self.get_string(request_id)
		self.products = self.parse_products(self.get_string(products))
		#TODO: </end big ugly unknown section>

		for i in range(number_of_conn):
			# 0 <= i < number_of_conn
			# pos		 type	description
			# 0x4a+12*i   ushort  offset of the frequency record of this connection
			# 0x4c+12*i   uint	position of train list, counted from 0x4a
			# 0x50+12*i   ushort  number of trains
			# 0x52+12*i   ushort  number of changes
			# 0x54+12*i   ushort  duration of this connection
			self.f.seek(0x4a + 12*i)
			freq, train_list_offset, number_of_trains, number_of_changes, duration \
				= unpack("<HI3H", self.f.read(12))
			trains = list()
			for j in range(number_of_trains):
				# 0 <= j < number_of_trains, base = 0x4a+train_list_offset+j*20
				# pos		type	description
				# base+0x00	ushort	departure time
				# base+0x02	ushort	departure station
				# base+0x04	ushort	arrival time
				# base+0x06	ushort	arrival station
				# base+0x08	ushort	transportation method
				# base+0x0a	ushort	train identification
				# base+0x0c	ushort	arrival track
				# base+0x0e	ushort	departure track
				# base+0x10	ushort	notes like final destination of tram
				# base+0x12	ushort	train properties
				self.f.seek(0x4a + train_list_offset + j*20)
				dep_time, dep_station, arr_time, arr_station, transportation, \
				train, arr_track, dep_track, note, train_proporties \
					= unpack("<10H", self.f.read(20))
				trains.append({
					"dep_time":self.parse_time(dep_time),
					"dep_station":self.get_city(dep_station),
					"arr_time":self.parse_time(arr_time),
					"arr_station":self.get_city(arr_station),
					"transportation_type":self.parse_transportation(transportation),
					"train":self.get_string(train),
					"arr_track":self.get_string(arr_track),
					"dep_track":self.get_string(dep_track),
					"note":self.get_string(note),
					"train_properties":self.get_train_props(train_proporties)})
			self.connections.append({
				"freq":self.get_frequency(freq),
				"number_of_changes":number_of_changes,
				"duration":self.parse_timedelta(duration),
				"trains":trains})

	def parse_products(self, products):
		"""
		used means of transportation for a returned connection set is given as
		14 chars of which the first 10 are "1" or "0" depending wether the
		product is being considered or not.
		this method maps this string to a dict of booleans
		returns None when the string is missing or not 14 chars long
		"""
		if not products or len(products) != 14:
			return None
		p = ["ice", "ic", "ir", "re", "sbahn", "bus", "boat", "subway", "tram", "taxi"]
		return dict((name, bool(int(flag))) for name, flag in zip(p, products[:10]))

	def parse_timedelta(self, t):
		"""
		a duration is stored as an integer which, when represented as a string
		can be split into hours and minutes
		1345 => 13:45
		512 => 5:12
		returns a timedelta
		"""
		t = "%03d"%t
		return timedelta(hours=int(t[:-2]), minutes=int(t[-2:]))

	def parse_time(self, t):
		"""
		time is stored as an integer which, when represented as a string can be
		split to get a string representation of the time
		1345 => 13:45
		512 => 5:12
		returns a datetime.time; hours are taken modulo 24
		"""
		t = "%03d"%t
		hour, minute = int(t[:-2]), int(t[-2:])
		# TODO: what to do with hour>=24 ?
		hour %= 24
		return time(hour, minute)

	def parse_date(self, d):
		"""
		dates are expressed as integers of days since 01.01.1980
		returns a datetime.date
		"""
		return date(1980, 1, 1)+timedelta(days=d)

	def parse_transportation(self, t):
		"""
		transportation can be by some train or by foot
		returns "feet" for 1 and "train" for 2, raises Exception otherwise
		"""
		if t == 1:
			return "feet"
		elif t == 2:
			return "train"
		else:
			raise Exception("transportation %d unexpected"%t)

	def get_frequency(self, offset):
		"""
		given the offset, get the frequency a connection is scheduled from the
		data block beginning at frequencies_offset
		the last three values are still a myth as they dont seem to correspond
		with the information in the string referenced by the first value
		returns the tuple (frequency string, u1, u2, u3)
		"""
		self.f.seek(self.frequencies_offset+offset)
		#TODO: what do the last values mean?
		#TODO: where are the days of service properly encoded?
		freq, u1, u2, u3 = unpack("<4H", self.f.read(8))
		return self.get_string(freq), u1, u2, u3

	def get_string(self, offset):
		"""
		given the offset, return a zero terminated string from the stringblock
		surrounding whitespace is stripped and "---" is returned as None
		raises IOError when the file ends before the terminating zero
		"""
		if offset in self.strings:
			# get it from dict to prevent ugly f.read(1)
			return self.strings[offset]
		self.f.seek(self.strings_offset+offset)
		chars = []
		# read zero terminated string.. ugly in py..
		while True:
			b = self.f.read(1)
			if not b:
				# end of file: without this check the loop never ends
				raise IOError("unterminated string at offset %d"%offset)
			if b == "\0":
				break
			chars.append(b)
		# some strings come with many whitespaces at the end
		result = "".join(chars).strip()
		# by convention of the format "---" means None
		if result == "---":
			result = None
		# fill a dict with strings to prevent too much ugly f.read(1)
		self.strings[offset] = result
		return result

	def get_train_props(self, offset):
		"""
		given the offset, get the string list of train properties from the
		data block beginning at train_props_offset. the first ushort marks the
		amount of properties
		"""
		self.f.seek(self.train_props_offset+offset)
		n, = unpack("<H", self.f.read(2))
		# read all references first, get_string moves the file position
		refs = unpack("<%dH"%n, self.f.read(2*n))
		return [self.get_string(ref) for ref in refs]

	def get_city(self, offset):
		"""
		given the index of a city, get its data from the data block beginning
		at cities_offset (every entry is 14 bytes long).
		the data includes the city name, its code and longitude, latitude
		returns a dict with "name", "L", "X" and "Y"
		"""
		self.f.seek(self.cities_offset+offset*14)
		name, L, X, Y = unpack("<H3I", self.f.read(14))
		return {"name":self.get_string(name), "L":L, "X":X, "Y":Y}

# driver: parse a captured answer from disk and dump every connection found
#f = open("stendal-salzwedel-1st.bin", "r")
f = GzipFile(filename="stendal-salzwedel-1st.bin", mode="r")
#a = PlnParse(get_pln_data(compile_i_from_station("Bremen"), compile_i_from_station("Karlsruhe"), dt = datetime.today()+timedelta(days=1)))
#a = PlnParse(get_pln_data(compile_i_from_coords(8.6535, 53.1667), compile_i_from_station("Bremen"), dt = datetime.today()+timedelta(days=4)))
a = PlnParse(f)

#def bin(i):
#	return "".join(str((i >> y) & 1) for y in range(16-1, -1, -1))
#print a.timetable_info
for conn in a.connections:
	print(conn)
#for conn in a.connections:
#	print "\t".join([conn["freq"][0], bin(conn["freq"][1]), bin(conn["freq"][2]), bin(conn["freq"][3])])
#for conn in a.connections:
#	print conn["freq"], conn["trains"][0]["dep_station"]["L"], conn["trains"][0]["train"], conn["trains"][0]["dep_time"]