Admin/website/build/xhtmlparse.py
author haftmann
Wed, 12 Jul 2006 17:00:31 +0200
changeset 20107 239a0efd38b2
parent 19533 fc4c6458d569
permissions -rw-r--r--
class_of_param instead of class_of
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
19533
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     1
#!/usr/bin/env python
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     2
# -*- coding: Latin-1 -*-
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     3
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     4
"""
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     5
    Common services for parsing xhtml.
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     6
"""
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     7
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     8
# Names exported by a star-import of this module. Note that parseWithER is
# defined further down in this file but is not listed here.
__all__ = ['TransformerHandler']
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
     9
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    10
__author__ = 'Florian Haftmann, florian.haftmann@informatik.tu-muenchen.de'
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    11
__revision__ = '$Id$'
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    12
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    13
from os import path
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    14
import codecs
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    15
import posixpath
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    16
from xml.sax.saxutils import escape
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    17
from xml.sax.saxutils import quoteattr
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    18
from xml.sax import make_parser as makeParser
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    19
from xml.sax.handler import ContentHandler
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    20
from xml.sax.handler import EntityResolver
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    21
from xml.sax.xmlreader import AttributesImpl as Attributes
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    22
from xml.sax import SAXException
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    23
from xml.sax import SAXParseException
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    24
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    25
# Non-breaking space (code point 160) as a unicode string.  Written as an
# escape rather than via unichr(), which does not exist on Python 3.
nbsp = u'\xa0'
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    26
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
    27
class TransformerHandler(ContentHandler, EntityResolver, object):
    """SAX handler that writes the events it receives back out as XML.

    The instance acts both as content handler and as entity resolver.
    Character data is buffered until the next structural event, and the
    closing '>' of a start tag is delayed so that an element without any
    content is written in the short form '<name/>'.

    'object' is listed last among the bases: putting it first makes
    Python 3 (where the SAX base classes are already new-style) reject the
    class with an inconsistent method resolution order.
    """

    # Namespace URI that is implicitly bound to the 'xml' prefix and is
    # therefore never announced through startPrefixMapping.
    _XML_NAMESPACE = u'http://www.w3.org/XML/1998/namespace'

    def __init__(self, out, encoding, dtd):
        """Set up the handler.

        out      -- byte stream the serialized document is written to
        encoding -- name of the codec used to encode everything written
        dtd      -- directory in which resolveEntity looks up DTD files
        """
        ContentHandler.__init__(self)
        self._out = codecs.getwriter(encoding)(out)
        self._encoding = encoding
        self._dtd = dtd
        self._ns_contexts = [{}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self._undeclared_ns_maps = []
        # Must be a list: characters() appends to it, also before the
        # first flush has happened.
        self._characterBuffer = []
        # True while a start tag has been written without its closing '>'.
        self._lastStart = False
        # Stack of the names of the currently open elements.
        self._currentXPath = []
        # True once the XML declaration has been written.
        self._init = False

    def _qualifiedName(self, name):
        """Turn a (namespace uri, local name) pair into 'prefix:local'.

        Names without a namespace, and names whose namespace is bound as
        the default namespace (prefix None), yield the bare local name.
        """
        uri, local = name
        if not uri:
            return local
        if uri == self._XML_NAMESPACE:
            return u'xml:' + local
        prefix = self._current_context[uri]
        if not prefix:
            return local
        return prefix + u':' + local

    def closeLastStart(self):
        """Write the pending '>' of the most recent start tag, if any."""
        if self._lastStart:
            self._out.write(u'>')
            self._lastStart = False

    def currentContent(self):
        """Return the buffered character data as one unicode string."""
        return u"".join(self._characterBuffer)

    def flushCharacterBuffer(self):
        """Write the buffered character data, XML-escaped, and clear it."""
        content = escape(self.currentContent())
        self._out.write(content)
        self._characterBuffer = []

    def startDocument(self):
        """Write the XML declaration; repeated calls write it only once.

        The encoding attribute is left out when the encoding is UTF-8.
        """
        if not self._init:
            if self._encoding.upper() != 'UTF-8':
                self._out.write(u'<?xml version="1.0" encoding="%s"?>\n' %
                                self._encoding)
            else:
                self._out.write(u'<?xml version="1.0"?>\n')
            self._init = True

    def startPrefixMapping(self, prefix, uri):
        """Record a new prefix binding; it is declared on the next start tag."""
        # Save the context as it was, then extend the active one in place.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix
        self._undeclared_ns_maps.append((prefix, uri))

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping."""
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElement(self, name, attrs):
        """Write '<name attr="value" ...' and leave the tag open."""
        self.closeLastStart()
        self.flushCharacterBuffer()
        self._out.write(u'<' + name)
        for (key, value) in attrs.items():
            self._out.write(u' %s=%s' % (key, quoteattr(value)))
        self._currentXPath.append(name)
        self._lastStart = True

    def endElement(self, name):
        """Close the element: '/>' if it had no content, else '</name>'."""
        self.flushCharacterBuffer()
        if self._lastStart:
            self._out.write(u'/>')
            self._lastStart = False
        else:
            self._out.write(u'</%s>' % name)
        self._currentXPath.pop()

    def startElementNS(self, name, qname, attrs):
        """Namespace-aware counterpart of startElement.

        name is a (uri, local name) pair; the prefix is restored from the
        current namespace context.  Prefix mappings announced since the
        previous start tag are written as xmlns declarations.
        """
        self.closeLastStart()
        self.flushCharacterBuffer()
        tag = self._qualifiedName(name)
        self._out.write(u'<' + tag)

        for (prefix, uri) in self._undeclared_ns_maps:
            if prefix:
                self._out.write(u' xmlns:%s=%s' % (prefix, quoteattr(uri)))
            else:
                # default namespace declaration
                self._out.write(u' xmlns=%s' % quoteattr(uri))
        self._undeclared_ns_maps = []

        # A separate loop variable keeps the element name intact for the
        # path stack below.
        for (attrName, value) in attrs.items():
            self._out.write(u' %s=%s' % (self._qualifiedName(attrName),
                                         quoteattr(value)))
        self._currentXPath.append(tag)
        # Leave the tag open, as startElement does, so that endElementNS
        # can emit the short form for empty elements.
        self._lastStart = True

    def endElementNS(self, name, qname):
        """Namespace-aware counterpart of endElement."""
        self.flushCharacterBuffer()
        tag = self._qualifiedName(name)
        if self._lastStart:
            self._out.write(u'/>')
            self._lastStart = False
        else:
            self._out.write(u'</%s>' % tag)
        self._currentXPath.pop()

    def characters(self, content):
        """Buffer character data; it is written on the next flush."""
        self.closeLastStart()
        self._characterBuffer.append(content)

    def ignorableWhitespace(self, content):
        """Write ignorable whitespace straight through, unescaped."""
        self.closeLastStart()
        self.flushCharacterBuffer()
        self._out.write(content)

    def resolveEntity(self, publicId, systemId):
        """Redirect XHTML 1 DTD lookups to the local DTD directory.

        System ids located under http://www.w3.org/TR/xhtml1/DTD, or
        given without any directory part, are replaced by the absolute
        path of the file of the same name inside the dtd directory
        passed to the constructor.  Other system ids are passed on
        unchanged to EntityResolver.resolveEntity.
        """
        loc, name = posixpath.split(systemId)
        if loc == u"http://www.w3.org/TR/xhtml1/DTD" or loc == u"":
            systemId = path.abspath(path.join(self._dtd, name))
        return EntityResolver.resolveEntity(self, publicId, systemId)

    def processingInstruction(self, target, data):
        """Reject processing instructions; this class does not handle them."""
        raise Exception("no handler defined for processing instructions")
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
   157
fc4c6458d569 added obfuscation for mails
haftmann
parents:
diff changeset
   158
def parseWithER(istream, handler):
    """Parse istream with a fresh SAX parser driven by handler.

    handler is installed both as the parser's content handler and as its
    entity resolver, so it has to provide both interfaces.

    istream -- the input, handed unchanged to the parser's parse() method
    handler -- object receiving the content events and entity lookups
    """
    parser = makeParser()
    parser.setContentHandler(handler)
    parser.setEntityResolver(handler)
    parser.parse(istream)