Admin/website/build/obfusmail.py
author haftmann
Thu, 04 May 2006 10:13:55 +0200
changeset 19555 7938d8e0c52d
parent 19554 bc0bef4a124e
child 19556 a3951e34269f
permissions -rw-r--r--
fixed some flaws

#!/usr/bin/env python
# -*- coding: Latin-1 -*-

"""
    Obfuscating mail addresses
"""

# Module metadata.
__author__ = 'Florian Haftmann, florian.haftmann@informatik.tu-muenchen.de'
# NOTE(review): '$Id$' looks like a version-control keyword placeholder,
# presumably expanded on checkout -- confirm.
__revision__ = '$Id$'

import sys
import os
from os import path
import posixpath
import optparse
from cStringIO import StringIO

from xml.sax.saxutils import escape
from xml.sax.saxutils import quoteattr

from xhtmlparse import TransformerHandler, parseWithER

# global configuration
# Encoding used for the rewritten page unless the page itself declares a
# charset in a <meta http-equiv="content-type"> element, in which case
# FindHandler.startElement records that charset instead.
outputEncoding = 'UTF-8'

def split_mail(mail):
    """Split the target of a ``mailto:`` link into address parts and query.

    *mail* is the text following ``mailto:``: ``name@host``, optionally
    followed by ``?`` and a query string (e.g. ``subject=...``).

    Returns ``((name, host), arg)`` where *arg* is everything after the
    first ``?``, or ``None`` when *mail* contains no ``?``.

    Raises ValueError if the address part does not contain exactly one ``@``.
    """

    # maxsplit=1: cut at the first "?" only, so a query string that itself
    # contains "?" is kept intact instead of being silently dropped.
    mail_arg = mail.split("?", 1)
    if len(mail_arg) == 2:
        mail, arg = mail_arg
    else:
        mail = mail_arg[0]
        arg = None

    parts = mail.split("@")
    if len(parts) != 2:
        raise ValueError("Malformed mail address: '%s'" % mail)
    name, host = parts

    return ((name, host), arg)

class FindHandler(TransformerHandler):
    """First pass over a page: collect mailto targets and the page encoding.

    Every ``<a href="mailto:...">`` target is recorded as a key of the
    ``mails`` dict handed to the constructor.  A charset declared in a
    ``<meta http-equiv="content-type">`` element replaces ``self.enc``.
    Whatever the base handler emits is thrown away.
    """

    class DevZero(object):
        """Write-only sink that ignores everything written to it."""

        def write(self, s):
            """Discard *s*."""

    def __init__(self, dtd, mails, enc):
        """Remember the result dict *mails* and the fallback encoding *enc*."""
        # This pass only inspects the document, so output goes to a sink.
        sink = self.DevZero()
        super(FindHandler, self).__init__(sink, 'UTF-8', dtd)
        self.mails = mails
        self.enc = enc
        self.pending_mail = None

    def startElement(self, name, attrs):
        """Note a mailto target on ``<a>`` and a charset on ``<meta>``."""
        if name == u'a':
            target = attrs.get(u'href', u'')
            if target.startswith(u'mailto:'):
                # keep only what follows the "mailto:" scheme prefix
                self.pending_mail = target[len(u'mailto:'):]

        super(FindHandler, self).startElement(name, attrs)

        if name != u'meta':
            return
        if attrs.get(u'http-equiv', u'').lower() != u'content-type':
            return
        declared = attrs.get(u'content', u'')
        prefix = u'text/html; charset='
        if declared.startswith(prefix):
            self.enc = declared[len(prefix):]

    def endElement(self, name):
        """Register the pending mail address when its ``<a>`` closes.

        Raises Exception if the result of ``currentContent()`` differs from
        the ``name@host`` part of the href.
        """
        if name == u'a' and self.pending_mail is not None:
            address = split_mail(self.pending_mail)[0]
            baremail = "%s@%s" % address
            # NOTE(review): currentContent() presumably yields the text
            # inside the element being closed -- confirm in xhtmlparse.
            if self.currentContent() != baremail:
                raise Exception("Inconsistent mail address: '%s' vs. '%s'" % (self.currentContent(), baremail))
            self.mails[self.pending_mail] = True
            self.pending_mail = None
        super(FindHandler, self).endElement(name)

    def processingInstruction(self, target, data):
        """Ignore processing instructions."""

class ReplaceHandler(TransformerHandler):
    """Second pass over a page: swap each mailto link for its replacement.

    ``mails`` maps a mailto target (the href text after ``mailto:``) to the
    markup written in place of the whole ``<a>...</a>`` element.
    """

    def __init__(self, out, dtd, encoding, mails):
        """Set up the base handler and remember the substitution table."""
        super(ReplaceHandler, self).__init__(out, encoding, dtd)
        self.pending_mail = None
        self.mails = mails

    def startElement(self, name, attrs):
        """Swallow the opening tag of a mailto link; forward anything else."""
        is_mail_link = False
        if name == u'a':
            target = attrs.get(u'href', u'')
            is_mail_link = target.startswith(u'mailto:')

        if is_mail_link:
            # keep only what follows the "mailto:" scheme prefix
            self.pending_mail = target[len(u'mailto:'):]
        else:
            super(ReplaceHandler, self).startElement(name, attrs)

    def endElement(self, name):
        """Write the replacement when a mailto link closes; else forward."""
        if name != u'a' or self.pending_mail is None:
            super(ReplaceHandler, self).endElement(name)
            return

        # NOTE(review): flushCharacterBuffer() and self._out come from
        # TransformerHandler; presumably buffered text is emitted before
        # the replacement is written to the output stream -- confirm.
        self.flushCharacterBuffer()
        self._out.write(self.mails[self.pending_mail])
        self.pending_mail = None

    def characters(self, content):
        """Drop text inside a mailto link; forward all other text."""
        if self.pending_mail is not None:
            return
        super(ReplaceHandler, self).characters(content)

    def processingInstruction(self, target, data):
        """Ignore processing instructions."""

def obfuscate(mailaddr, htmlfile):
    """Build the obfuscated markup that replaces one mailto link.

    *mailaddr* is the mailto target (``name@host`` with an optional
    ``?query``); *htmlfile* is the page the link belongs to.

    The bare address is rendered into a PNG next to *htmlfile* by running
    the external ``convert`` program.  The returned markup consists of a
    script writing the opening ``<a href="mailto:...">`` tag piecewise, an
    ``<img>`` showing the rendered address (with a ``{name} AT [host]`` style
    alt text), and a script writing the closing ``</a>``.

    Raises Exception if ``convert`` exits with a non-zero status, OSError
    if it cannot be started, and ValueError (from split_mail) if *mailaddr*
    is malformed.
    """
    import subprocess  # local import: only this function needs it

    def mk_line(s):
        # Escape backslashes first, then single quotes, so that s survives
        # unchanged inside a single-quoted JavaScript string literal.
        return u"document.write('%s');" % s.replace("\\", "\\\\").replace("'", "\\'")

    def mk_script(s):
        return u'<script type="text/javascript">/*<![CDATA[*/%s/*]]>*/</script>' % s

    def cmd(argv):
        # Run the program directly from an argument list (no shell), so
        # quotes or shell metacharacters in the mail address can neither
        # break nor extend the command.
        print("[shell cmd] %s" % " ".join(argv))
        n = subprocess.call(argv)
        if n != 0:
            raise Exception("shell cmd error: %s" % n)

    ((name, host), arg) = split_mail(mailaddr)
    baremail = "%s@%s" % (name, host)
    imgname = (name + "_" + host).replace(".", "_") + ".png"
    # the image is stored in the same directory as the HTML file
    imgfile = path.join(path.split(htmlfile)[0], imgname)
    cmd(["convert", "label:%s" % baremail, imgfile])

    # The link is written in several document.write() pieces so the full
    # address never appears contiguously in the page source.
    link_parts = ['<a href="', "mailto:", name, "@", host]
    if arg is not None:
        mailsimple = u"{%s} AT [%s] WITH (%s)" % (name, host, arg)
        link_parts += ["?", arg]
    else:
        mailsimple = u"{%s} AT [%s]" % (name, host)
    link_parts.append('">')
    mailscript = u" ".join([mk_line(part) for part in link_parts])
    mailimg = '<img src=%s style="vertical-align:middle" alt=%s />' % (quoteattr(imgname), quoteattr(mailsimple))

    return (mk_script(mailscript) + mailimg + mk_script(mk_line("</a>")))

def main():
    """Command-line entry point: obfuscate all mailto links of one HTML file.

    Usage: ``prog [options] htmlfile``.  The file is parsed once with a
    FindHandler to collect mailto targets and the page encoding.  If any
    were found, a replacement is generated for each with obfuscate() and
    the file is rewritten in place through a ReplaceHandler.

    Exits via ``OptionParser.error`` (usage message, status 2) unless
    exactly one file name is given.
    """

    # parse command line
    cmdlineparser = optparse.OptionParser(
        usage = '%prog [options] htmlfile',
        conflict_handler = "error",
        description = '''Protecting mail adresses in html files by obfuscating''',
        add_help_option = True,
    )
    cmdlineparser.add_option("-t", "--dtd",
        action="store", dest="dtd",
        type="string", default=".",
        help="local mirror of XHTML DTDs", metavar='location')

    options, args = cmdlineparser.parse_args(sys.argv[1:])
    if len(args) != 1:
        # report a usage error instead of failing on tuple unpacking
        cmdlineparser.error("expected exactly one htmlfile argument")
    filename = args[0]

    # find mails
    mails = {}
    findhandler = FindHandler(options.dtd, mails, outputEncoding)
    istream = open(filename, 'r')
    try:
        parseWithER(istream, findhandler)
    finally:
        istream.close()
    enc = findhandler.enc

    if not mails:
        # nothing to obfuscate: leave the file untouched
        return

    # transform mails
    mails_subst = {}
    for mail in mails:
        mails_subst[mail] = obfuscate(mail, filename)

    # transform pages
    # The whole input is read into memory first, because opening the same
    # file for writing below truncates it.
    source = open(filename, 'r')
    try:
        content = source.read()
    finally:
        source.close()

    istream = StringIO(content)
    try:
        ostream = open(filename, 'wb')
        try:
            print("writing %s with %s" % (filename, enc))
            replacehandler = ReplaceHandler(ostream, options.dtd, enc, mails_subst)
            parseWithER(istream, replacehandler)
        finally:
            ostream.close()
    finally:
        istream.close()

# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()

# Placeholder for open tasks; currently holds no entries.
__todo__ = '''
'''