1 files changed, 144 insertions, 83 deletions
diff --git a/bleach/__init__.py b/bleach/__init__.py
index af75d0f..b110972 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,6 +1,8 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
 import logging
 import re
-import sys
 
 import html5lib
 from html5lib.sanitizer import HTMLSanitizer
@@ -11,8 +13,8 @@ from .encoding import force_unicode
 from .sanitizer import BleachSanitizer
 
 
-VERSION = (1, 2, 1)
-__version__ = '1.2.1'
+VERSION = (1, 4, 0)
+__version__ = '1.4'
 
 __all__ = ['clean', 'linkify']
 
@@ -61,12 +63,12 @@ TLDS.reverse()
 
 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-    (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
         # /path/zz (excluding "unsafe" chars from RFC 1738,
         # except for # and ~, which happen in practice)
-    """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
     re.IGNORECASE | re.VERBOSE | re.UNICODE)
 
 proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$')
 
 email_re = re.compile(
     r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
+    (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
+        (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)*  # dot-atom
     |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
         |\\[\001-011\013\014\016-\177])*"  # quoted-string
     )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
@@ -85,17 +87,18 @@ email_re = re.compile(
 
 NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
 
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
+# Returns the tag name ``x`` prefixed with the XHTML namespace, i.e. in the
+# same form that etree's Element.tag attribute reports it.
 
-PY_26 = (sys.version_info < (2, 7))
-RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
 
 
 def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
           styles=ALLOWED_STYLES, strip=False, strip_comments=True):
     """Clean an HTML fragment and return it"""
     if not text:
-        return u''
+        return ''
 
     text = force_unicode(text)
 
@@ -123,22 +126,38 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
     text = force_unicode(text)
 
     if not text:
-        return u''
+        return ''
 
     parser = html5lib.HTMLParser(tokenizer=tokenizer)
 
     forest = parser.parseFragment(text)
+    _seen = set([])
 
-    def replace_nodes(tree, new_frag, node):
+    def replace_nodes(tree, new_frag, node, index=0):
+        """
+        Doesn't really replace nodes, but inserts the nodes contained in
+        new_frag into the tree at position index and returns the number
+        of nodes inserted.
+        If node is not None, it is removed from the tree.
+        """
+        count = 0
         new_tree = parser.parseFragment(new_frag)
-        for n in new_tree.childNodes:
-            # Prevent us from re-parsing links new links as existing links.
-            if n.name == 'a':
-                n._seen = True
-            tree.insertBefore(n, node)
-        tree.removeChild(node)
-        # Return the number of new nodes.
-        return len(new_tree.childNodes) - 1
+        # keep the fragment's leading text; existing text/tail may be None
+        if new_tree.text:
+            if index == 0:
+                tree.text = (tree.text or '') + new_tree.text
+            else:
+                tree[index-1].tail = (tree[index-1].tail or '') + new_tree.text
+        # then put the tagged elements into the old tree
+        for n in new_tree:
+            if n.tag == ETREE_TAG('a'):
+                _seen.add(n)
+            tree.insert(index+count, n)
+            count += 1
+        # if we got a node to remove...
+        if node is not None:
+            tree.remove(node)
+        return count
 
     def strip_wrapping_parentheses(fragment):
         """Strips wrapping parentheses.
@@ -189,58 +208,102 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
                 return None
         return attrs
 
+    def _render_inner(node):
+        """Return node's own text followed by each rendered child and tail."""
+        pieces = [node.text if node.text is not None else '']
+        for child in node:
+            pieces.append(_render(child))
+            pieces.append(child.tail or '')
+        return ''.join(pieces)
+
     def linkify_nodes(tree, parse_text=True):
-        # I know this isn't Pythonic, but we're sometimes mutating
-        # tree.childNodes, which ends up breaking the loop and causing us to
-        # reparse code.
-        children = len(tree.childNodes)
-        current = 0  # A pointer to the "current" node.
-        while current < children:
-            node = tree.childNodes[current]
-            if node.type == NODE_TEXT and parse_text:
-                new_frag = _render(node)
-                # Look for email addresses?
-                if parse_email:
-                    new_frag = re.sub(email_re, email_repl, new_frag)
-                    if new_frag != _render(node):
-                        adj = replace_nodes(tree, new_frag, node)
+        children = len(tree)
+        current_child = -1
+        # start at -1 so the parent's own text is processed first
+        while current_child < len(tree):
+            if current_child < 0:
+                node = tree
+                if parse_text and node.text:
+                    new_txt = old_txt = node.text
+                    if parse_email:
+                        new_txt = re.sub(email_re, email_repl, node.text)
+                        if new_txt and new_txt != node.text:
+                            node.text = ''
+                            adj = replace_nodes(tree, new_txt, None, 0)
+                            children += adj
+                            current_child += adj
+                            linkify_nodes(tree, True)
+                            continue
+
+                    new_txt = re.sub(url_re, link_repl, new_txt)
+                    if new_txt != old_txt:
+                        node.text = ''
+                        adj = replace_nodes(tree, new_txt, None, 0)
                         children += adj
-                        current += adj
-                        linkify_nodes(tree)
+                        current_child += adj
                         continue
-                new_frag = re.sub(url_re, link_repl, new_frag)
-                if new_frag != _render(node):
-                    adj = replace_nodes(tree, new_frag, node)
+            else:
+                node = tree[current_child]
+
+            if parse_text and node.tail:
+                new_tail = old_tail = node.tail
+                if parse_email:
+                    new_tail = re.sub(email_re, email_repl, new_tail)
+                    if new_tail != node.tail:
+                        node.tail = ''
+                        adj = replace_nodes(tree, new_tail, None,
+                                            current_child+1)
+                        # the new nodes made from my tail go right after me
+                        children += adj
+                        continue  # rescan; else url_re re-inserts new_tail
+
+                new_tail = re.sub(url_re, link_repl, new_tail)
+                if new_tail != old_tail:
+                    node.tail = ''
+                    adj = replace_nodes(tree, new_tail, None, current_child+1)
                     children += adj
-                    current += adj
-            elif node.name == 'a' and not getattr(node, '_seen', False):
-                if 'href' in node.attributes:
-                    attrs = node.attributes
-                    _text = attrs['_text'] = ''.join(c.toxml() for
-                                                     c in node.childNodes)
+
+            if node.tag == ETREE_TAG('a') and node not in _seen:
+                if node.get('href') is not None:
+                    attrs = dict(node.items())
+
+                    _text = attrs['_text'] = _render_inner(node)
+
                     attrs = apply_callbacks(attrs, False)
-                    if attrs is not None:
+
+                    if attrs is None:
+                        # <a> tag replaced by the text within it
+                        adj = replace_nodes(tree, _text, node,
+                                            current_child)
+                        current_child -= 1
+                        # pull back current_child by 1 to scan the
+                        # new nodes again.
+                    else:
                         text = force_unicode(attrs.pop('_text'))
-                        node.attributes = attrs
-                        for n in reversed(node.childNodes):
-                            node.removeChild(n)
+                        node.attrib.clear()  # honour attrs a callback removed
+                        for attr_key, attr_val in attrs.items():
+                            node.set(attr_key, attr_val)
+                        for n in reversed(list(node)):
+                            node.remove(n)
                         text = parser.parseFragment(text)
-                        for n in text.childNodes:
-                            node.appendChild(n)
-                        node._seen = True
-                    else:
-                        replace_nodes(tree, _text, node)
-            elif skip_pre and node.name == 'pre':
-                linkify_nodes(node, False)
-            elif not getattr(node, '_seen', False):
-                linkify_nodes(node)
-            current += 1
+                        node.text = text.text
+                        for n in text:
+                            node.append(n)
+                        _seen.add(node)
+
+            elif current_child >= 0:
+                if node.tag == ETREE_TAG('pre') and skip_pre:
+                    linkify_nodes(node, False)
+                elif node not in _seen:
+                    linkify_nodes(node, True)
+
+            current_child += 1
 
     def email_repl(match):
         addr = match.group(0).replace('"', '&quot;')
         link = {
             '_text': addr,
-            'href': 'mailto:%s' % addr,
+            'href': 'mailto:{0!s}'.format(addr),
         }
         link = apply_callbacks(link, True)
 
@@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
         _href = link.pop('href')
         _text = link.pop('_text')
 
-        repl = '<a href="%s" %s>%s</a>'
-        attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
-        return repl % (_href, attribs, _text)
+        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
+        attr = '{0!s}="{1!s}"'
+        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+        return repl.format(_href, attribs, _text)
 
     def link_repl(match):
         url = match.group(0)
         open_brackets = close_brackets = 0
         if url.startswith('('):
-            url, open_brackets, close_brackets = (
-                    strip_wrapping_parentheses(url)
-            )
-        end = u''
+            _wrapping = strip_wrapping_parentheses(url)
+            url, open_brackets, close_brackets = _wrapping
+        end = ''
         m = re.search(punct_re, url)
         if m:
             end = m.group(0)
@@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
         if re.search(proto_re, url):
             href = url
         else:
-            href = u''.join([u'http://', url])
+            href = ''.join(['http://', url])
 
         link = {
             '_text': url,
@@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
         _text = link.pop('_text')
         _href = link.pop('href')
 
-        repl = u'%s<a href="%s" %s>%s</a>%s%s'
-        attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
+        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
+        attr = '{0!s}="{1!s}"'
+        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
 
-        return repl % ('(' * open_brackets,
-                       _href, attribs, _text, end,
-                       ')' * close_brackets)
+        return repl.format('(' * open_brackets,
+                           _href, attribs, _text, end,
+                           ')' * close_brackets)
 
     try:
         linkify_nodes(forest)
-    except (RECURSION_EXCEPTION), e:
+    except RuntimeError as e:
         # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: %r' % e)
+        log.exception('Probable recursion error: {0!r}'.format(e))
 
     return _render(forest)
 
 
 def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    try:
-        return force_unicode(_serialize(tree))
-    except AssertionError:  # The treewalker throws this sometimes.
-        return force_unicode(tree.toxml())
+    """Serialize the tree as HTML and pass the result to force_unicode."""
+    return force_unicode(_serialize(tree))
 
 
 def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('simpletree')
+    walker = html5lib.treewalkers.getTreeWalker('etree')
     stream = walker(domtree)
     serializer = HTMLSerializer(quote_attr_values=True,
                                 omit_optional_tags=False)