# ---------------------------------------------------------------------------- # pyglet # Copyright (c) 2006-2008 Alex Holkner # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of pyglet nor the names of its # contributors may be used to endorse or promote products # derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # ---------------------------------------------------------------------------- '''Decode HTML into attributed text. A subset of HTML 4.01 Transitional is implemented. The following elements are supported fully:: B BLOCKQUOTE BR CENTER CODE DD DIR DL EM FONT H1 H2 H3 H4 H5 H6 I IMG KBD LI MENU OL P PRE Q SAMP STRONG SUB SUP TT U UL VAR The mark (bullet or number) of a list item is separated from the body of the list item with a tab, as the pyglet document model does not allow out-of-stream text. This means lists display as expected, but behave a little oddly if edited. No CSS styling is supported. ''' __docformat__ = 'restructuredtext' __version__ = '$Id: $' import HTMLParser import htmlentitydefs import os import re import pyglet from pyglet.text.formats import structured def _hex_color(val): return [(val >> 16) & 0xff, (val >> 8) & 0xff, val & 0xff, 255] _color_names = { 'black': _hex_color(0x000000), 'silver': _hex_color(0xc0c0c0), 'gray': _hex_color(0x808080), 'white': _hex_color(0xffffff), 'maroon': _hex_color(0x800000), 'red': _hex_color(0xff0000), 'purple': _hex_color(0x800080), 'fucsia': _hex_color(0x008000), 'green': _hex_color(0x00ff00), 'lime': _hex_color(0xffff00), 'olive': _hex_color(0x808000), 'yellow': _hex_color(0xff0000), 'navy': _hex_color(0x000080), 'blue': _hex_color(0x0000ff), 'teal': _hex_color(0x008080), 'aqua': _hex_color(0x00ffff), } def _parse_color(value): if value.startswith('#'): return _hex_color(int(value[1:], 16)) else: try: return _color_names[value.lower()] except KeyError: raise ValueError() _whitespace_re = re.compile(u'[\u0020\u0009\u000c\u200b\r\n]+', re.DOTALL) _metadata_elements = ['head', 'title'] _block_elements = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'dir', 'menu', 'pre', 'dl', 'div', 'center', 'noscript', 'noframes', 'blockquote', 'form', 'isindex', 'hr', 'table', 'fieldset', 'address', # Incorrect, but we treat list items as blocks: 'li', 'dd', 'dt', ] _block_containers = ['_top_block', 'body', 'div', 'center', 'object', 'applet', 'blockquote', 'ins', 'del', 'dd', 'li', 'form', 'fieldset', 'button', 'th', 'td', 'iframe', 'noscript', 'noframes', # Incorrect, but we treat list items as blocks: 'ul', 'ol', 'dir', 'menu', 'dl'] class HTMLDecoder(HTMLParser.HTMLParser, structured.StructuredTextDecoder): '''Decoder for HTML documents. ''' #: Default style attributes for unstyled text in the HTML document. #: #: :type: dict default_style = { 'font_name': 'Times New Roman', 'font_size': 12, 'margin_bottom': '12pt', } #: Map HTML font sizes to actual font sizes, in points. #: #: :type: dict font_sizes = { 1: 8, 2: 10, 3: 12, 4: 14, 5: 18, 6: 24, 7: 48 } def decode_structured(self, text, location): self.location = location self._font_size_stack = [3] self.list_stack.append(structured.UnorderedListBuilder({})) self.strip_leading_space = True self.block_begin = True self.need_block_begin = False self.element_stack = ['_top_block'] self.in_metadata = False self.in_pre = False self.push_style('_default', self.default_style) self.feed(text) self.close() def get_image(self, filename): return pyglet.image.load(filename, file=self.location.open(filename)) def prepare_for_data(self): if self.need_block_begin: self.add_text('\n') self.block_begin = True self.need_block_begin = False def handle_data(self, data): if self.in_metadata: return if self.in_pre: self.add_text(data) else: data = _whitespace_re.sub(' ', data) if data.strip(): self.prepare_for_data() if self.block_begin or self.strip_leading_space: data = data.lstrip() self.block_begin = False self.add_text(data) self.strip_leading_space = data.endswith(' ') def handle_starttag(self, tag, case_attrs): if self.in_metadata: return element = tag.lower() attrs = {} for key, value in case_attrs: attrs[key.lower()] = value if element in _metadata_elements: self.in_metadata = True elif element in _block_elements: # Pop off elements until we get to a block container. while self.element_stack[-1] not in _block_containers: self.handle_endtag(self.element_stack[-1]) if not self.block_begin: self.add_text('\n') self.block_begin = True self.need_block_begin = False self.element_stack.append(element) style = {} if element in ('b', 'strong'): style['bold'] = True elif element in ('i', 'em', 'var'): style['italic'] = True elif element in ('tt', 'code', 'samp', 'kbd'): style['font_name'] = 'Courier New' elif element == 'u': color = self.current_style.get('color') if color is None: color = [0, 0, 0, 255] style['underline'] = color elif element == 'font': if 'face' in attrs: style['font_name'] = attrs['face'].split(',') if 'size' in attrs: size = attrs['size'] try: if size.startswith('+'): size = self._font_size_stack[-1] + int(size[1:]) elif size.startswith('-'): size = self._font_size_stack[-1] - int(size[1:]) else: size = int(size) except ValueError: size = 3 self._font_size_stack.append(size) if size in self.font_sizes: style['font_size'] = self.font_sizes.get(size, 3) else: self._font_size_stack.append(self._font_size_stack[-1]) if 'color' in attrs: try: style['color'] = _parse_color(attrs['color']) except ValueError: pass elif element == 'sup': size = self._font_size_stack[-1] - 1 style['font_size'] = self.font_sizes.get(size, 1) style['baseline'] = '3pt' elif element == 'sub': size = self._font_size_stack[-1] - 1 style['font_size'] = self.font_sizes.get(size, 1) style['baseline'] = '-3pt' elif element == 'h1': style['font_size'] = 24 style['bold'] = True style['align'] = 'center' elif element == 'h2': style['font_size'] = 18 style['bold'] = True elif element == 'h3': style['font_size'] = 16 style['bold'] = True elif element == 'h4': style['font_size'] = 14 style['bold'] = True elif element == 'h5': style['font_size'] = 12 style['bold'] = True elif element == 'h6': style['font_size'] = 12 style['italic'] = True elif element == 'br': self.add_text(u'\u2028') self.strip_leading_space = True elif element == 'p': if attrs.get('align') in ('left', 'center', 'right'): style['align'] = attrs['align'] elif element == 'center': style['align'] = 'center' elif element == 'pre': style['font_name'] = 'Courier New' style['margin_bottom'] = 0 self.in_pre = True elif element == 'blockquote': left_margin = self.current_style.get('margin_left') or 0 right_margin = self.current_style.get('margin_right') or 0 style['margin_left'] = left_margin + 60 style['margin_right'] = right_margin + 60 elif element == 'q': self.handle_data(u'\u201c') elif element == 'ol': try: start = int(attrs.get('start', 1)) except ValueError: start = 1 format = attrs.get('type', '1') + '.' builder = structured.OrderedListBuilder(start, format) builder.begin(self, style) self.list_stack.append(builder) elif element in ('ul', 'dir', 'menu'): type = attrs.get('type', 'disc').lower() if type == 'circle': mark = u'\u25cb' elif type == 'square': mark = u'\u25a1' else: mark = u'\u25cf' builder = structured.UnorderedListBuilder(mark) builder.begin(self, style) self.list_stack.append(builder) elif element == 'li': self.list_stack[-1].item(self, style) self.strip_leading_space = True elif element == 'dl': style['margin_bottom'] = 0 elif element == 'dd': left_margin = self.current_style.get('margin_left') or 0 style['margin_left'] = left_margin + 30 elif element == 'img': image = self.get_image(attrs.get('src')) if image: width = attrs.get('width') if width: width = int(width) height = attrs.get('height') if height: height = int(height) self.prepare_for_data() self.add_element(structured.ImageElement(image, width, height)) self.strip_leading_space = False self.push_style(element, style) def handle_endtag(self, tag): element = tag.lower() if element not in self.element_stack: return self.pop_style(element) while self.element_stack.pop() != element: pass if element in _metadata_elements: self.in_metadata = False elif element in _block_elements: self.block_begin = False self.need_block_begin = True if element == 'font' and len(self._font_size_stack) > 1: self._font_size_stack.pop() elif element == 'pre': self.in_pre = False elif element == 'q': self.handle_data(u'\u201d') elif element in ('ul', 'ol'): if len(self.list_stack) > 1: self.list_stack.pop() def handle_entityref(self, name): if name in htmlentitydefs.name2codepoint: self.handle_data(unichr(htmlentitydefs.name2codepoint[name])) def handle_charref(self, name): name = name.lower() try: if name.startswith('x'): self.handle_data(unichr(int(name[1:], 16))) else: self.handle_data(unichr(int(name))) except ValueError: pass