heroes-renaissance/pyglet/text/formats/html.py

365 lines
13 KiB
Python
Raw Permalink Normal View History

2008-11-23 20:07:47 +00:00
# ----------------------------------------------------------------------------
# pyglet
# Copyright (c) 2006-2008 Alex Holkner
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of pyglet nor the names of its
# contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# ----------------------------------------------------------------------------
'''Decode HTML into attributed text.

A subset of HTML 4.01 Transitional is implemented.  The following elements are
supported fully::

    B BLOCKQUOTE BR CENTER CODE DD DIR DL EM FONT H1 H2 H3 H4 H5 H6 I IMG KBD
    LI MENU OL P PRE Q SAMP STRONG SUB SUP TT U UL VAR

The mark (bullet or number) of a list item is separated from the body of the
list item with a tab, as the pyglet document model does not allow
out-of-stream text.  This means lists display as expected, but behave a little
oddly if edited.

No CSS styling is supported.
'''
__docformat__ = 'restructuredtext'
__version__ = '$Id: $'
import HTMLParser
import htmlentitydefs
import os
import re
import pyglet
from pyglet.text.formats import structured
def _hex_color(val):
return [(val >> 16) & 0xff, (val >> 8) & 0xff, val & 0xff, 255]
_color_names = {
'black': _hex_color(0x000000),
'silver': _hex_color(0xc0c0c0),
'gray': _hex_color(0x808080),
'white': _hex_color(0xffffff),
'maroon': _hex_color(0x800000),
'red': _hex_color(0xff0000),
'purple': _hex_color(0x800080),
'fucsia': _hex_color(0x008000),
'green': _hex_color(0x00ff00),
'lime': _hex_color(0xffff00),
'olive': _hex_color(0x808000),
'yellow': _hex_color(0xff0000),
'navy': _hex_color(0x000080),
'blue': _hex_color(0x0000ff),
'teal': _hex_color(0x008080),
'aqua': _hex_color(0x00ffff),
}
def _parse_color(value):
    '''Convert an HTML color attribute value into an RGBA list.

    A value beginning with ``#`` is read as a hexadecimal number and passed
    to `_hex_color`; any other value is looked up, case-insensitively, in
    `_color_names`.

    :Parameters:
        `value` : str
            Attribute text such as ``'#ff8000'`` or ``'Navy'``.

    :rtype: list of int
    :return: ``[red, green, blue, alpha]``.

    :raise ValueError: if the hexadecimal digits cannot be parsed or the
        name is not in `_color_names`.
    '''
    if not value.startswith('#'):
        rgba = _color_names.get(value.lower())
        if rgba is None:
            raise ValueError()
        return rgba
    return _hex_color(int(value[1:], 16))
_whitespace_re = re.compile(u'[\u0020\u0009\u000c\u200b\r\n]+', re.DOTALL)
_metadata_elements = ['head', 'title']
_block_elements = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'dir', 'menu',
'pre', 'dl', 'div', 'center',
'noscript', 'noframes', 'blockquote', 'form',
'isindex', 'hr', 'table', 'fieldset', 'address',
# Incorrect, but we treat list items as blocks:
'li', 'dd', 'dt', ]
_block_containers = ['_top_block',
'body', 'div', 'center', 'object', 'applet',
'blockquote', 'ins', 'del', 'dd', 'li', 'form',
'fieldset', 'button', 'th', 'td', 'iframe', 'noscript',
'noframes',
# Incorrect, but we treat list items as blocks:
'ul', 'ol', 'dir', 'menu', 'dl']
class HTMLDecoder(HTMLParser.HTMLParser, structured.StructuredTextDecoder):
    '''Decoder for HTML documents.

    The markup is run through `HTMLParser`; the ``handle_*`` callbacks below
    turn tags and character data into text, style runs and inline elements
    by calling `add_text`, `add_element`, `push_style` and `pop_style`.
    Those helpers, together with `current_style` and `list_stack`, are not
    defined in this class (presumably they come from
    `StructuredTextDecoder`).
    '''

    #: Default style attributes for unstyled text in the HTML document.
    #:
    #: :type: dict
    default_style = {
        'font_name': 'Times New Roman',
        'font_size': 12,
        'margin_bottom': '12pt',
    }

    #: Map HTML font sizes to actual font sizes, in points.
    #:
    #: :type: dict
    font_sizes = {
        1: 8,
        2: 10,
        3: 12,
        4: 14,
        5: 18,
        6: 24,
        7: 48
    }

    def decode_structured(self, text, location):
        '''Reset the per-document state, then parse `text`.

        :Parameters:
            `text` : str
                HTML markup to decode.
            `location`
                Object whose ``open(filename)`` method `get_image` uses to
                read images referenced by the markup.
        '''
        self.location = location
        # HTML font size 3 maps to the 12pt default in `font_sizes`.
        self._font_size_stack = [3]
        # Base builder so an LI outside any list still finds a builder on
        # the stack; handle_endtag never pops this last entry.
        self.list_stack.append(structured.UnorderedListBuilder({}))
        self.strip_leading_space = True
        self.block_begin = True
        self.need_block_begin = False
        self.element_stack = ['_top_block']
        self.in_metadata = False
        self.in_pre = False

        self.push_style('_default', self.default_style)

        self.feed(text)
        self.close()

    def get_image(self, filename):
        '''Load the image called `filename` relative to the decode location.

        :Parameters:
            `filename` : str
                Value of an IMG element's ``src`` attribute.
        '''
        return pyglet.image.load(filename, file=self.location.open(filename))

    def prepare_for_data(self):
        '''Emit the paragraph break owed by a previously closed block.'''
        if self.need_block_begin:
            self.add_text('\n')
            self.block_begin = True
            self.need_block_begin = False

    def handle_data(self, data):
        '''Add character data, collapsing white space outside PRE.'''
        if self.in_metadata:
            return

        if self.in_pre:
            self.add_text(data)
        else:
            data = _whitespace_re.sub(' ', data)
            if data.strip():
                self.prepare_for_data()
                if self.block_begin or self.strip_leading_space:
                    data = data.lstrip()
                    self.block_begin = False
            self.add_text(data)
            # A trailing space makes the next run drop its leading space,
            # so two adjacent runs never produce a double space.
            self.strip_leading_space = data.endswith(' ')

    def handle_starttag(self, tag, case_attrs):
        '''Open an element: update parser state and push its style.

        :Parameters:
            `tag` : str
                Element name as found in the markup (any case).
            `case_attrs` : list of (str, str or None)
                Attribute name/value pairs.  A value is ``None`` when the
                attribute was written without one; such attributes are
                treated as absent.
        '''
        if self.in_metadata:
            return

        element = tag.lower()
        attrs = {}
        for key, value in case_attrs:
            attrs[key.lower()] = value

        if element in _metadata_elements:
            self.in_metadata = True
        elif element in _block_elements:
            # Pop off elements until we get to a block container.
            while self.element_stack[-1] not in _block_containers:
                self.handle_endtag(self.element_stack[-1])
            if not self.block_begin:
                self.add_text('\n')
                self.block_begin = True
                self.need_block_begin = False
        self.element_stack.append(element)

        style = {}
        if element in ('b', 'strong'):
            style['bold'] = True
        elif element in ('i', 'em', 'var'):
            style['italic'] = True
        elif element in ('tt', 'code', 'samp', 'kbd'):
            style['font_name'] = 'Courier New'
        elif element == 'u':
            # The underline takes the current text color, or opaque black.
            color = self.current_style.get('color')
            if color is None:
                color = [0, 0, 0, 255]
            style['underline'] = color
        elif element == 'font':
            face = attrs.get('face')
            if face is not None:
                style['font_name'] = face.split(',')
            size = attrs.get('size')
            if size is not None:
                try:
                    if size.startswith('+'):
                        size = self._font_size_stack[-1] + int(size[1:])
                    elif size.startswith('-'):
                        size = self._font_size_stack[-1] - int(size[1:])
                    else:
                        size = int(size)
                except ValueError:
                    size = 3
                self._font_size_stack.append(size)
                if size in self.font_sizes:
                    style['font_size'] = self.font_sizes[size]
            else:
                # Every FONT pushes exactly one entry so that closing it can
                # pop exactly one.
                self._font_size_stack.append(self._font_size_stack[-1])
            color = attrs.get('color')
            if color is not None:
                try:
                    style['color'] = _parse_color(color)
                except ValueError:
                    # Unknown color: leave the inherited color in place.
                    pass
        elif element == 'sup':
            style['font_size'] = self._smaller_font_size()
            style['baseline'] = '3pt'
        elif element == 'sub':
            style['font_size'] = self._smaller_font_size()
            style['baseline'] = '-3pt'
        elif element == 'h1':
            style['font_size'] = 24
            style['bold'] = True
            style['align'] = 'center'
        elif element == 'h2':
            style['font_size'] = 18
            style['bold'] = True
        elif element == 'h3':
            style['font_size'] = 16
            style['bold'] = True
        elif element == 'h4':
            style['font_size'] = 14
            style['bold'] = True
        elif element == 'h5':
            style['font_size'] = 12
            style['bold'] = True
        elif element == 'h6':
            style['font_size'] = 12
            style['italic'] = True
        elif element == 'br':
            # U+2028 LINE SEPARATOR: a line break without a new paragraph.
            self.add_text(u'\u2028')
            self.strip_leading_space = True
        elif element == 'p':
            if attrs.get('align') in ('left', 'center', 'right'):
                style['align'] = attrs['align']
        elif element == 'center':
            style['align'] = 'center'
        elif element == 'pre':
            style['font_name'] = 'Courier New'
            style['margin_bottom'] = 0
            self.in_pre = True
        elif element == 'blockquote':
            left_margin = self.current_style.get('margin_left') or 0
            right_margin = self.current_style.get('margin_right') or 0
            style['margin_left'] = left_margin + 60
            style['margin_right'] = right_margin + 60
        elif element == 'q':
            # Emitted before push_style, so the opening quotation mark is
            # added while the enclosing element's style is still current.
            self.handle_data(u'\u201c')
        elif element == 'ol':
            try:
                start = int(attrs.get('start', 1))
            except (TypeError, ValueError):
                start = 1
            list_type = attrs.get('type')
            if list_type is None:
                list_type = '1'
            list_format = list_type + '.'
            builder = structured.OrderedListBuilder(start, list_format)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element in ('ul', 'dir', 'menu'):
            mark_type = attrs.get('type')
            if mark_type is None:
                mark_type = 'disc'
            mark_type = mark_type.lower()
            if mark_type == 'circle':
                mark = u'\u25cb'
            elif mark_type == 'square':
                mark = u'\u25a1'
            else:
                mark = u'\u25cf'
            builder = structured.UnorderedListBuilder(mark)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element == 'li':
            self.list_stack[-1].item(self, style)
            self.strip_leading_space = True
        elif element == 'dl':
            style['margin_bottom'] = 0
        elif element == 'dd':
            left_margin = self.current_style.get('margin_left') or 0
            style['margin_left'] = left_margin + 30
        elif element == 'img':
            # An IMG without a SRC names nothing to load, so it is skipped
            # rather than handing None to get_image.
            src = attrs.get('src')
            image = None
            if src:
                image = self.get_image(src)
            if image:
                width = self._parse_dimension(attrs.get('width'))
                height = self._parse_dimension(attrs.get('height'))
                self.prepare_for_data()
                self.add_element(structured.ImageElement(image, width, height))
                self.strip_leading_space = False

        self.push_style(element, style)

    def handle_endtag(self, tag):
        '''Close an element, together with anything still open inside it.

        End tags for elements that are not open are ignored.

        :Parameters:
            `tag` : str
                Element name as found in the markup (any case).
        '''
        element = tag.lower()
        if element not in self.element_stack:
            return

        self.pop_style(element)
        # Unwind up to and including `element`.  Elements closed implicitly
        # on the way (for example a FONT left open inside a P) release their
        # parser state too, so the stacks stay in step with element_stack.
        while True:
            closed = self.element_stack.pop()
            self._release_element_state(closed)
            if closed == element:
                break

        if element in _metadata_elements:
            self.in_metadata = False
        elif element in _block_elements:
            self.block_begin = False
            self.need_block_begin = True

        if element == 'q':
            self.handle_data(u'\u201d')

    def handle_entityref(self, name):
        '''Add the character for a named entity; unknown names are dropped.'''
        if name in htmlentitydefs.name2codepoint:
            self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))

    def handle_charref(self, name):
        '''Add the character for a decimal or ``x``-prefixed hex reference.

        References that are not numbers, or that are outside the range
        `unichr` accepts, are dropped.
        '''
        name = name.lower()
        try:
            if name.startswith('x'):
                self.handle_data(unichr(int(name[1:], 16)))
            else:
                self.handle_data(unichr(int(name)))
        except (ValueError, OverflowError):
            pass

    def _release_element_state(self, element):
        '''Undo the parser state that handle_starttag set up for `element`.'''
        if element == 'font':
            if len(self._font_size_stack) > 1:
                self._font_size_stack.pop()
        elif element == 'pre':
            self.in_pre = False
        elif element in ('ul', 'ol', 'dir', 'menu'):
            # Each of these pushed a list builder when it was opened.
            if len(self.list_stack) > 1:
                self.list_stack.pop()

    def _smaller_font_size(self):
        '''Return the point size one HTML step below the current font size.

        When that step is not a key of `font_sizes` the nearest key is used
        instead, so SUB/SUP text always gets a size taken from the table.
        '''
        sizes = self.font_sizes
        wanted = self._font_size_stack[-1] - 1
        if wanted not in sizes and sizes:
            wanted = min(sizes, key=lambda known: abs(known - wanted))
        return sizes.get(wanted, 1)

    @staticmethod
    def _parse_dimension(value):
        '''Return an IMG width/height attribute as an int, or None.

        None is returned when the attribute is missing, empty or not a
        plain integer (for example ``50%``).
        '''
        try:
            return int(value)
        except (TypeError, ValueError):
            return None