Diffstat (limited to 'bleach/__init__.py')
-rw-r--r--  bleach/__init__.py | 230
1 file changed, 102 insertions(+), 128 deletions(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index bc8e49c..af75d0f 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,19 +1,18 @@
-import itertools
import logging
import re
import sys
-import urlparse
import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer.htmlserializer import HTMLSerializer
-from encoding import force_unicode
-from sanitizer import BleachSanitizer
+from . import callbacks as linkify_callbacks
+from .encoding import force_unicode
+from .sanitizer import BleachSanitizer
-VERSION = (1, 1, 5)
-__version__ = '.'.join(map(str, VERSION))
+VERSION = (1, 2, 1)
+__version__ = '1.2.1'
__all__ = ['clean', 'linkify']
@@ -56,18 +55,21 @@ TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
zw""".split()
+PROTOCOLS = HTMLSanitizer.acceptable_protocols
+
TLDS.reverse()
url_re = re.compile(
r"""\(* # Match any opening parentheses.
- \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
+ \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
(?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+ """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+ re.IGNORECASE | re.VERBOSE | re.UNICODE)
-proto_re = re.compile(r'^[\w-]+:/{0,3}')
+proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
punct_re = re.compile(r'([\.,]+)$')
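
The URL regex is now built from the sanitizer's protocol whitelist instead of accepting any \w-style scheme, and both it and proto_re compile with re.IGNORECASE so uppercase schemes like HTTP: still hit the lowercase whitelist. A minimal sketch of the same construction, with an illustrative TLD subset standing in for bleach's full list:

    import re
    from html5lib.sanitizer import HTMLSanitizer

    PROTOCOLS = HTMLSanitizer.acceptable_protocols   # lowercase scheme whitelist
    TLDS = ['com', 'net', 'org']                     # illustrative subset only

    url_re = re.compile(r"""\(*\b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)?
        ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b
        (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?""" % (u'|'.join(PROTOCOLS),
                                                u'|'.join(TLDS)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)

    assert url_re.search(u'HTTP://example.com')   # uppercase scheme now matches
    assert url_re.search(u'example.com')          # bare domains match as before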
@@ -83,7 +85,10 @@ email_re = re.compile(
NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
-identity = lambda x: x # The identity function.
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+PY_26 = (sys.version_info < (2, 7))
+RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
@@ -93,8 +98,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
return u''
text = force_unicode(text)
- if text.startswith(u'<!--'):
- text = u' ' + text
class s(BleachSanitizer):
allowed_elements = tags
@@ -105,32 +108,17 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
parser = html5lib.HTMLParser(tokenizer=s)
- return _render(parser.parseFragment(text)).strip()
+ return _render(parser.parseFragment(text))
-def linkify(text, nofollow=True, target=None, filter_url=identity,
- filter_text=identity, skip_pre=False, parse_email=False,
- tokenizer=HTMLSanitizer):
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
+ parse_email=False, tokenizer=HTMLSanitizer):
"""Convert URL-like strings in an HTML fragment to links.
linkify() converts strings that look like URLs or domain names in a
blob of text that may be an HTML fragment to links, while preserving
(a) links already in the string, (b) urls found in attributes, and
(c) email addresses.
-
- If the nofollow argument is True (the default) then rel="nofollow"
- will be added to links created by linkify() as well as links already
- found in the text.
-
- The target argument will optionally add a target attribute with the
- given value to links created by linkify() as well as links already
- found in the text.
-
- linkify() uses up to two filters on each link. For links created by
- linkify(), the href attribute is passed through filter_url()
- and the text of the link is passed through filter_text(). For links
- already found in the document, the href attribute is passed through
- filter_url(), but the text is untouched.
"""
text = force_unicode(text)
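
The four removed keyword arguments collapse into the callbacks list: each callback receives the link's attribute dict (with the link text under the '_text' key) plus a new flag, and returns a possibly modified dict, or None to veto the link. A sketch of replicating the old target= argument under the new protocol:

    import bleach

    def set_target(attrs, new=False):
        # attrs holds href, _text, and any attributes already present.
        attrs['target'] = '_blank'
        return attrs

    html = bleach.linkify(u'see example.com',
                          callbacks=bleach.DEFAULT_CALLBACKS + [set_target])
    # The generated <a> carries rel="nofollow" (from DEFAULT_CALLBACKS)
    # and target="_blank"; attribute order may vary.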
@@ -141,16 +129,16 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
forest = parser.parseFragment(text)
- if nofollow:
- rel = u'rel="nofollow"'
- else:
- rel = u''
-
def replace_nodes(tree, new_frag, node):
new_tree = parser.parseFragment(new_frag)
for n in new_tree.childNodes:
+ # Prevent us from re-parsing new links as existing links.
+ if n.name == 'a':
+ n._seen = True
tree.insertBefore(n, node)
tree.removeChild(node)
+ # Return the net change in the number of nodes.
+ return len(new_tree.childNodes) - 1
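
replace_nodes() now reports that net change in child count so the caller can repair its cursor after the splice; the same arithmetic on a plain list, purely illustrative:

    # Stand-ins for tree.childNodes and a freshly parsed fragment.
    children = ['before', 'text-node', 'after']
    new_nodes = ['leading text', '<a>link</a>', 'trailing text']

    i = children.index('text-node')
    children[i:i + 1] = new_nodes       # one node out, three in

    adj = len(new_nodes) - 1            # replace_nodes' return value
    # The caller adds adj to both its length counter and its index so
    # the loop neither skips nor re-visits the nodes after the splice.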
def strip_wrapping_parentheses(fragment):
"""Strips wrapping parentheses.
@@ -194,34 +182,77 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
return fragment, opening_parentheses, closing_parentheses
+ def apply_callbacks(attrs, new):
+ for cb in callbacks:
+ attrs = cb(attrs, new)
+ if attrs is None:
+ return None
+ return attrs
+
def linkify_nodes(tree, parse_text=True):
- for node in tree.childNodes:
+ # I know this isn't Pythonic, but we're sometimes mutating
+ # tree.childNodes, which ends up breaking the loop and causing us to
+ # reparse nodes we have already handled.
+ children = len(tree.childNodes)
+ current = 0 # A pointer to the "current" node.
+ while current < children:
+ node = tree.childNodes[current]
if node.type == NODE_TEXT and parse_text:
- new_frag = node.toxml()
+ new_frag = _render(node)
+ # Look for email addresses?
if parse_email:
new_frag = re.sub(email_re, email_repl, new_frag)
- if new_frag != node.toxml():
- replace_nodes(tree, new_frag, node)
+ if new_frag != _render(node):
+ adj = replace_nodes(tree, new_frag, node)
+ children += adj
+ current += adj
linkify_nodes(tree)
continue
new_frag = re.sub(url_re, link_repl, new_frag)
- replace_nodes(tree, new_frag, node)
- elif node.name == 'a':
+ if new_frag != _render(node):
+ adj = replace_nodes(tree, new_frag, node)
+ children += adj
+ current += adj
+ elif node.name == 'a' and not getattr(node, '_seen', False):
if 'href' in node.attributes:
- if nofollow:
- node.attributes['rel'] = 'nofollow'
- if target is not None:
- node.attributes['target'] = target
- href = node.attributes['href']
- node.attributes['href'] = filter_url(href)
+ attrs = node.attributes
+ _text = attrs['_text'] = ''.join(c.toxml() for
+ c in node.childNodes)
+ attrs = apply_callbacks(attrs, False)
+ if attrs is not None:
+ text = force_unicode(attrs.pop('_text'))
+ node.attributes = attrs
+ for n in reversed(node.childNodes):
+ node.removeChild(n)
+ text = parser.parseFragment(text)
+ for n in text.childNodes:
+ node.appendChild(n)
+ node._seen = True
+ else:
+ replace_nodes(tree, _text, node)
elif skip_pre and node.name == 'pre':
linkify_nodes(node, False)
- else:
+ elif not getattr(node, '_seen', False):
linkify_nodes(node)
+ current += 1
def email_repl(match):
- repl = u'<a href="mailto:%(mail)s">%(mail)s</a>'
- return repl % {'mail': match.group(0).replace('"', '&quot;')}
+ addr = match.group(0).replace('"', '&quot;')
+ link = {
+ '_text': addr,
+ 'href': 'mailto:%s' % addr,
+ }
+ link = apply_callbacks(link, True)
+
+ if link is None:
+ return addr
+
+ _href = link.pop('href')
+ _text = link.pop('_text')
+
+ repl = '<a href="%s" %s>%s</a>'
+ attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
+ return repl % (_href, attribs, _text)
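
Because email_repl() falls back to the bare address whenever apply_callbacks() returns None, a callback can veto individual links. A sketch, with the address invented for illustration:

    import bleach

    def no_mailto(attrs, new=False):
        if new and attrs['href'].startswith(u'mailto:'):
            return None   # veto: the address stays plain text
        return attrs

    bleach.linkify(u'write to me@example.com',
                   callbacks=[no_mailto], parse_email=True)
    # u'write to me@example.com' -- no link was created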
def link_repl(match):
url = match.group(0)
@@ -240,98 +271,41 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
else:
href = u''.join([u'http://', url])
- repl = u'%s<a href="%s" %s>%s</a>%s%s'
-
- attribs = [rel]
- if target is not None:
- attribs.append('target="%s"' % target)
-
- return repl % ('(' * open_brackets,
- filter_url(href), ' '.join(attribs), filter_text(url),
- end, ')' * close_brackets)
+ link = {
+ '_text': url,
+ 'href': href,
+ }
- linkify_nodes(forest)
+ link = apply_callbacks(link, True)
- return _render(forest)
+ if link is None:
+ return url
+ _text = link.pop('_text')
+ _href = link.pop('href')
-def delinkify(text, allow_domains=None, allow_relative=False):
- """Remove links from text, except those allowed to stay."""
- text = force_unicode(text)
- if not text:
- return u''
+ repl = u'%s<a href="%s" %s>%s</a>%s%s'
+ attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
- parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
- forest = parser.parseFragment(text)
+ return repl % ('(' * open_brackets,
+ _href, attribs, _text, end,
+ ')' * close_brackets)
- if allow_domains is None:
- allow_domains = []
- elif isinstance(allow_domains, basestring):
- allow_domains = [allow_domains]
+ try:
+ linkify_nodes(forest)
+ except (RECURSION_EXCEPTION), e:
+ # If we hit the max recursion depth, just return what we've got.
+ log.exception('Probable recursion error: %r' % e)
- def delinkify_nodes(tree):
- """Remove <a> tags and replace them with their contents."""
- for node in tree.childNodes:
- if node.name == 'a':
- if 'href' not in node.attributes:
- continue
- parts = urlparse.urlparse(node.attributes['href'])
- host = parts.hostname
- if any(_domain_match(host, d) for d in allow_domains):
- continue
- if host is None and allow_relative:
- continue
- # Replace the node with its children.
- # You can't nest <a> tags, and html5lib takes care of that
- # for us in the tree-building step.
- for n in node.childNodes:
- tree.insertBefore(n, node)
- tree.removeChild(node)
- elif node.type != NODE_TEXT: # Don't try to delinkify text.
- delinkify_nodes(node)
-
- delinkify_nodes(forest)
return _render(forest)
-def _domain_match(test, compare):
- test = test.lower()
- compare = compare.lower()
- if '*' not in compare:
- return test == compare
- c = compare.split('.')[::-1]
- if '**' in c and (c.count('**') > 1 or not compare.startswith('**')):
- raise ValidationError(
- 'Only 1 ** is allowed, and must start the domain.')
- t = test.split('.')[::-1]
- z = itertools.izip_longest(c, t)
- for c, t in z:
- if c == t:
- continue
- elif c == '*':
- continue
- elif c == '**':
- return True
- return False
- # Got all the way through and everything matched.
- return True
-
-
-class ValidationError(ValueError):
- pass
-
-
def _render(tree):
"""Try rendering as HTML, then XML, then give up."""
try:
return force_unicode(_serialize(tree))
- except Exception, e:
- log.error('HTML: %r' % e, exc_info=sys.exc_info())
- try:
- return force_unicode(tree.toxml())
- except Exception, e:
- log.error('XML: %r' % e, exc_info=sys.exc_info())
- return u''
+ except AssertionError: # The treewalker throws this sometimes.
+ return force_unicode(tree.toxml())
def _serialize(domtree):
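
delinkify() and its domain-matching helpers are gone, but the callback protocol subsumes them: returning None for an existing <a> takes the replace_nodes(tree, _text, node) branch above, swapping the link for its text. A rough equivalent of the removed function, assuming Python 2's urlparse as the rest of this module does:

    import urlparse
    import bleach

    ALLOWED = ('example.com',)   # illustrative allow-list

    def delinkify_other_domains(attrs, new=False):
        host = urlparse.urlparse(attrs.get('href', u'')).hostname
        if host not in ALLOWED:
            return None   # existing links collapse to their text
        return attrs

    bleach.linkify(u'<a href="http://evil.test/x">click</a>',
                   callbacks=[delinkify_other_domains])
    # u'click'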