author     Per Andersson <avtobiff@gmail.com>  2014-01-30 01:25:11 +0100
committer  Per Andersson <avtobiff@gmail.com>  2014-01-30 01:25:11 +0100
commit     6cff86ce6de27fbd4f9fc07716fb1205b14ffae4 (patch)
tree       15d1f032fcf000801b69307d2463c10004489ee4
parent     fac84c6d90e0875e6c1b10c5ef02d577ee008af4 (diff)
download   python-bleach-6cff86ce6de27fbd4f9fc07716fb1205b14ffae4.tar
           python-bleach-6cff86ce6de27fbd4f9fc07716fb1205b14ffae4.tar.gz

Imported Upstream version 1.4 (tag: upstream/1.4)
-rw-r--r--  CHANGES                         20
-rw-r--r--  CONTRIBUTORS                     5
-rw-r--r--  LICENSE                         35
-rw-r--r--  README.rst                       2
-rw-r--r--  bleach/__init__.py             227
-rw-r--r--  bleach/callbacks.py              7
-rw-r--r--  bleach/encoding.py              60
-rw-r--r--  bleach/sanitizer.py             24
-rw-r--r--  bleach/tests/test_basics.py     65
-rw-r--r--  bleach/tests/test_css.py        13
-rw-r--r--  bleach/tests/test_links.py     241
-rw-r--r--  bleach/tests/test_security.py   12
-rw-r--r--  bleach/tests/test_unicode.py    47
-rw-r--r--  bleach/tests/tools.py            7
-rw-r--r--  docs/clean.rst                   2
-rw-r--r--  docs/conf.py                     4
-rw-r--r--  docs/index.rst                  12
-rw-r--r--  requirements.txt                 8
-rw-r--r--  setup.py                        21
-rw-r--r--  tox.ini                         12
20 files changed, 514 insertions(+), 310 deletions(-)
diff --git a/CHANGES b/CHANGES
index d9bad9c..1def1a2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,26 @@
Bleach Changes
==============
+Version 1.4
+-----------
+
+- Update linkify to use an etree TreeWalker instead of simpletree.
+- Update html5lib requirement to version >= 0.999.
+- Update all code to be compatible with Python 3 and 2 using six.
+- Switch to Apache License.
+
+
+Version 1.3
+-----------
+
+- Used by Python 3-only fork.
+
+
+Version 1.2.2
+-------------
+
+- Pin html5lib to version 0.95 for now due to major API break.
+
Version 1.2.1
-------------
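
Taken together, the 1.4 entries above mean the same calls behave identically
under Python 2 and 3. A minimal sketch of the public API they describe (the
rel="nofollow" default and the escaping behavior are both pinned down by the
test changes later in this diff; attribute order in the serialized output may
vary between interpreter versions)::

    import bleach

    # Bare URLs are wrapped in anchors; the nofollow callback runs by default.
    print(bleach.linkify('a http://example.com link'))
    # a <a href="http://example.com" rel="nofollow">http://example.com</a> link

    # Markup outside the whitelist is escaped, not executed.
    print(bleach.clean('Yeah right <sarcasm/>'))
    # Yeah right &lt;sarcasm/&gt;
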
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index f014916..c2d052a 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -3,7 +3,7 @@ within and without the Mozilla Corporation and Foundation.
Lead Developer:
-- James Socol <james@mozilla.com>
+- James Socol <me@jamessocol.com>
Contributors:
@@ -23,3 +23,6 @@ Patches:
- Anton Kovalyov
- Mark Paschal
- Alex Ehlke
+- Marc DM
+- mdxs
+- Marc Abramowitz
diff --git a/LICENSE b/LICENSE
index b2df30c..f7afaef 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,28 +1,13 @@
-Copyright (c) 2010, Mozilla Foundation
-All rights reserved.
+Copyright (c) 2014, Mozilla Foundation
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. Neither the name of bleach nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/README.rst b/README.rst
index 093edc1..5e52cae 100644
--- a/README.rst
+++ b/README.rst
@@ -60,7 +60,7 @@ Then install it by running::
$ python setup.py install
-.. _html5lib: http://code.google.com/p/html5lib/
+.. _html5lib: https://github.com/html5lib/html5lib-python
.. _GitHub: https://github.com/jsocol/bleach
.. _ReadTheDocs: http://bleach.readthedocs.org/
.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/bleach/__init__.py b/bleach/__init__.py
index af75d0f..b110972 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,6 +1,8 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
import logging
import re
-import sys
import html5lib
from html5lib.sanitizer import HTMLSanitizer
@@ -11,8 +13,8 @@ from .encoding import force_unicode
from .sanitizer import BleachSanitizer
-VERSION = (1, 2, 1)
-__version__ = '1.2.1'
+VERSION = (1, 4, 0)
+__version__ = '1.4'
__all__ = ['clean', 'linkify']
@@ -61,12 +63,12 @@ TLDS.reverse()
url_re = re.compile(
r"""\(* # Match any opening parentheses.
- \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
- ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
- (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+ \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
+ ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
+ (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+ """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
re.IGNORECASE | re.VERBOSE | re.UNICODE)
proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$')
email_re = re.compile(
r"""(?<!//)
- (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
- (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom
+ (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
+ (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom
|^"([\001-\010\013\014\016-\037!#-\[\]-\177]
|\\[\001-011\013\014\016-\177])*" # quoted-string
)@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain
@@ -85,17 +87,18 @@ email_re = re.compile(
NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
+# a simple routine that returns the tag name with the namespace prefix
+# as returned by etree's Element.tag attribute
-PY_26 = (sys.version_info < (2, 7))
-RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES, strip=False, strip_comments=True):
"""Clean an HTML fragment and return it"""
if not text:
- return u''
+ return ''
text = force_unicode(text)
@@ -123,22 +126,38 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
text = force_unicode(text)
if not text:
- return u''
+ return ''
parser = html5lib.HTMLParser(tokenizer=tokenizer)
forest = parser.parseFragment(text)
+ _seen = set([])
- def replace_nodes(tree, new_frag, node):
+ def replace_nodes(tree, new_frag, node, index=0):
+ """
+ Doesn't really replace nodes, but inserts the nodes contained in
+    new_frag into the tree at position index and returns the number
+    of nodes inserted.
+    If node is passed in, it is removed from the tree.
+ """
+ count = 0
new_tree = parser.parseFragment(new_frag)
- for n in new_tree.childNodes:
- # Prevent us from re-parsing links new links as existing links.
- if n.name == 'a':
- n._seen = True
- tree.insertBefore(n, node)
- tree.removeChild(node)
- # Return the number of new nodes.
- return len(new_tree.childNodes) - 1
+ # capture any non-tag text at the start of the fragment
+ if new_tree.text:
+ if index == 0:
+ tree.text += new_tree.text
+ else:
+ tree[index-1].tail += new_tree.text
+    # then put the tagged elements into the old tree
+ for n in new_tree:
+ if n.tag == ETREE_TAG('a'):
+ _seen.add(n)
+ tree.insert(index+count, n)
+ count += 1
+ # if we got a node to remove...
+ if node is not None:
+ tree.remove(node)
+ return count
def strip_wrapping_parentheses(fragment):
"""Strips wrapping parentheses.
@@ -189,58 +208,102 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
return None
return attrs
+ def _render_inner(node):
+ out = ['' if node.text is None else node.text]
+ for subnode in node:
+ out.append(_render(subnode))
+ if subnode.tail:
+ out.append(subnode.tail)
+ return ''.join(out)
+
def linkify_nodes(tree, parse_text=True):
- # I know this isn't Pythonic, but we're sometimes mutating
- # tree.childNodes, which ends up breaking the loop and causing us to
- # reparse code.
- children = len(tree.childNodes)
- current = 0 # A pointer to the "current" node.
- while current < children:
- node = tree.childNodes[current]
- if node.type == NODE_TEXT and parse_text:
- new_frag = _render(node)
- # Look for email addresses?
- if parse_email:
- new_frag = re.sub(email_re, email_repl, new_frag)
- if new_frag != _render(node):
- adj = replace_nodes(tree, new_frag, node)
+ children = len(tree)
+ current_child = -1
+ # start at -1 to process the parent first
+ while current_child < len(tree):
+ if current_child < 0:
+ node = tree
+ if parse_text and node.text:
+ new_txt = old_txt = node.text
+ if parse_email:
+ new_txt = re.sub(email_re, email_repl, node.text)
+ if new_txt and new_txt != node.text:
+ node.text = ''
+ adj = replace_nodes(tree, new_txt, None, 0)
+ children += adj
+ current_child += adj
+ linkify_nodes(tree, True)
+ continue
+
+ new_txt = re.sub(url_re, link_repl, new_txt)
+ if new_txt != old_txt:
+ node.text = ''
+ adj = replace_nodes(tree, new_txt, None, 0)
children += adj
- current += adj
- linkify_nodes(tree)
+ current_child += adj
continue
- new_frag = re.sub(url_re, link_repl, new_frag)
- if new_frag != _render(node):
- adj = replace_nodes(tree, new_frag, node)
+ else:
+ node = tree[current_child]
+
+ if parse_text and node.tail:
+ new_tail = old_tail = node.tail
+ if parse_email:
+ new_tail = re.sub(email_re, email_repl, new_tail)
+ if new_tail != node.tail:
+ node.tail = ''
+ adj = replace_nodes(tree, new_tail, None,
+ current_child+1)
+                    # insert the new nodes made from my tail into
+                    # the tree right after me (at current_child+1)
+ children += adj
+
+ new_tail = re.sub(url_re, link_repl, new_tail)
+ if new_tail != old_tail:
+ node.tail = ''
+ adj = replace_nodes(tree, new_tail, None, current_child+1)
children += adj
- current += adj
- elif node.name == 'a' and not getattr(node, '_seen', False):
- if 'href' in node.attributes:
- attrs = node.attributes
- _text = attrs['_text'] = ''.join(c.toxml() for
- c in node.childNodes)
+
+ if node.tag == ETREE_TAG('a') and not (node in _seen):
+ if not node.get('href', None) is None:
+ attrs = dict(node.items())
+
+ _text = attrs['_text'] = _render_inner(node)
+
attrs = apply_callbacks(attrs, False)
- if attrs is not None:
+
+ if attrs is None:
+ # <a> tag replaced by the text within it
+ adj = replace_nodes(tree, _text, node,
+ current_child)
+ current_child -= 1
+ # pull back current_child by 1 to scan the
+ # new nodes again.
+ else:
text = force_unicode(attrs.pop('_text'))
- node.attributes = attrs
- for n in reversed(node.childNodes):
- node.removeChild(n)
+ for attr_key, attr_val in attrs.items():
+ node.set(attr_key, attr_val)
+
+ for n in reversed(list(node)):
+ node.remove(n)
text = parser.parseFragment(text)
- for n in text.childNodes:
- node.appendChild(n)
- node._seen = True
- else:
- replace_nodes(tree, _text, node)
- elif skip_pre and node.name == 'pre':
- linkify_nodes(node, False)
- elif not getattr(node, '_seen', False):
- linkify_nodes(node)
- current += 1
+ node.text = text.text
+ for n in text:
+ node.append(n)
+ _seen.add(node)
+
+ elif current_child >= 0:
+ if node.tag == ETREE_TAG('pre') and skip_pre:
+ linkify_nodes(node, False)
+ elif not (node in _seen):
+ linkify_nodes(node, True)
+
+ current_child += 1
def email_repl(match):
addr = match.group(0).replace('"', '&quot;')
link = {
'_text': addr,
- 'href': 'mailto:%s' % addr,
+ 'href': 'mailto:{0!s}'.format(addr),
}
link = apply_callbacks(link, True)
@@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
_href = link.pop('href')
_text = link.pop('_text')
- repl = '<a href="%s" %s>%s</a>'
- attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
- return repl % (_href, attribs, _text)
+ repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+ return repl.format(_href, attribs, _text)
def link_repl(match):
url = match.group(0)
open_brackets = close_brackets = 0
if url.startswith('('):
- url, open_brackets, close_brackets = (
- strip_wrapping_parentheses(url)
- )
- end = u''
+ _wrapping = strip_wrapping_parentheses(url)
+ url, open_brackets, close_brackets = _wrapping
+ end = ''
m = re.search(punct_re, url)
if m:
end = m.group(0)
@@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
if re.search(proto_re, url):
href = url
else:
- href = u''.join([u'http://', url])
+ href = ''.join(['http://', url])
link = {
'_text': url,
@@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
_text = link.pop('_text')
_href = link.pop('href')
- repl = u'%s<a href="%s" %s>%s</a>%s%s'
- attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
+ repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
- return repl % ('(' * open_brackets,
- _href, attribs, _text, end,
- ')' * close_brackets)
+ return repl.format('(' * open_brackets,
+ _href, attribs, _text, end,
+ ')' * close_brackets)
try:
linkify_nodes(forest)
- except (RECURSION_EXCEPTION), e:
+ except RuntimeError as e:
# If we hit the max recursion depth, just return what we've got.
- log.exception('Probable recursion error: %r' % e)
+ log.exception('Probable recursion error: {0!r}'.format(e))
return _render(forest)
def _render(tree):
"""Try rendering as HTML, then XML, then give up."""
- try:
- return force_unicode(_serialize(tree))
- except AssertionError: # The treewalker throws this sometimes.
- return force_unicode(tree.toxml())
+ return force_unicode(_serialize(tree))
def _serialize(domtree):
- walker = html5lib.treewalkers.getTreeWalker('simpletree')
+ walker = html5lib.treewalkers.getTreeWalker('etree')
stream = walker(domtree)
serializer = HTMLSerializer(quote_attr_values=True,
omit_optional_tags=False)
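
The new ETREE_TAG helper and the 'etree' TreeWalker above exist because
html5lib's etree treebuilder namespaces every element and splits surrounding
text across .text and .tail. A sketch of the shapes linkify now traverses
(assumes html5lib >= 0.999, whose parseFragment defaults to the etree
treebuilder)::

    import html5lib

    forest = html5lib.parseFragment('check <a href="#">this</a> out')
    node = forest[0]
    print(forest.text)  # 'check '  -- text before the first child
    print(node.tag)     # '{http://www.w3.org/1999/xhtml}a'
    print(node.text)    # 'this'
    print(node.tail)    # ' out'    -- text after the element lives on .tail

    # hence the namespaced comparison used throughout linkify_nodes:
    ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
    print(node.tag == ETREE_TAG('a'))  # True
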
diff --git a/bleach/callbacks.py b/bleach/callbacks.py
index cc4682d..227f089 100644
--- a/bleach/callbacks.py
+++ b/bleach/callbacks.py
@@ -1,10 +1,15 @@
"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
def nofollow(attrs, new=False):
if attrs['href'].startswith('mailto:'):
return attrs
- attrs['rel'] = 'nofollow'
+ rel = [x for x in attrs.get('rel', '').split(' ') if x]
+ if not 'nofollow' in [x.lower() for x in rel]:
+ rel.append('nofollow')
+ attrs['rel'] = ' '.join(rel)
+
return attrs
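
The rewritten callback appends nofollow to an existing rel value instead of
clobbering it, which the new test_rel_already_there test exercises below.
Calling it directly::

    from bleach.callbacks import nofollow

    # mailto: links are returned untouched
    print(nofollow({'href': 'mailto:me@example.com'}))
    # {'href': 'mailto:me@example.com'}

    # an existing rel is extended, not replaced
    print(nofollow({'href': 'http://example.com', 'rel': 'tooltip'}))
    # {'href': 'http://example.com', 'rel': 'tooltip nofollow'}
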
diff --git a/bleach/encoding.py b/bleach/encoding.py
index b9a989d..707adaa 100644
--- a/bleach/encoding.py
+++ b/bleach/encoding.py
@@ -1,6 +1,7 @@
import datetime
from decimal import Decimal
import types
+import six
def is_protected_type(obj):
@@ -10,45 +11,52 @@ def is_protected_type(obj):
force_unicode(strings_only=True).
"""
return isinstance(obj, (
- types.NoneType,
- int, long,
- datetime.datetime, datetime.date, datetime.time,
- float, Decimal)
+ six.integer_types +
+ (types.NoneType,
+ datetime.datetime, datetime.date, datetime.time,
+ float, Decimal))
)
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
"""
- Similar to smart_unicode, except that lazy instances are resolved to
+ Similar to smart_text, except that lazy instances are resolved to
strings, rather than kept as lazy objects.
If strings_only is True, don't convert (some) non-string-like objects.
"""
+ # Handle the common case first, saves 30-40% when s is an instance of
+ # six.text_type. This function gets called often in that setting.
+ if isinstance(s, six.text_type):
+ return s
if strings_only and is_protected_type(s):
return s
try:
- if not isinstance(s, basestring,):
+ if not isinstance(s, six.string_types):
if hasattr(s, '__unicode__'):
- s = unicode(s)
+ s = s.__unicode__()
else:
- try:
- s = unicode(str(s), encoding, errors)
- except UnicodeEncodeError:
- if not isinstance(s, Exception):
- raise
- # If we get to here, the caller has passed in an Exception
- # subclass populated with non-ASCII data without special
- # handling to display as a string. We need to handle this
- # without raising a further exception. We do an
- # approximation to what the Exception's standard str()
- # output should be.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
- elif not isinstance(s, unicode):
- # Note: We use .decode() here, instead of unicode(s, encoding,
- # errors), so that if s is a SafeString, it ends up being a
- # SafeUnicode at the end.
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ else:
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s,
+ # encoding, errors), so that if s is a SafeBytes, it ends up being
+ # a SafeText at the end.
s = s.decode(encoding, errors)
- except UnicodeDecodeError, e:
- raise UnicodeDecodeError(*e.args)
+ except UnicodeDecodeError as e:
+ if not isinstance(s, Exception):
+ raise UnicodeDecodeError(*e.args)
+ else:
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII bytestring data without a
+ # working unicode method. Try to handle this without raising a
+ # further exception by individually forcing the exception args
+ # to unicode.
+ s = ' '.join([force_unicode(arg, encoding, strings_only,
+ errors) for arg in s])
return s
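
force_unicode now fast-paths values that are already text and decodes bytes
explicitly on both interpreters. A sketch of the observable behavior
(Python 3 shown; under Python 2 the same calls return unicode objects)::

    from bleach.encoding import force_unicode

    print(force_unicode(b'caf\xc3\xa9'))         # 'café' -- bytes decoded as UTF-8
    print(force_unicode('already text'))         # returned as-is via the fast path
    print(force_unicode(42, strings_only=True))  # 42 -- protected types pass through
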
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 4640012..88246f8 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape
@@ -14,8 +15,6 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
"""Mixin to replace sanitize_token() and sanitize_css()."""
allowed_svg_properties = []
- # TODO: When the next html5lib version comes out, nuke this.
- attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
def sanitize_token(self, token):
"""Sanitize a token either by HTML-encoding or dropping.
@@ -30,7 +29,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
"""
if (getattr(self, 'wildcard_attributes', None) is None and
- isinstance(self.allowed_attributes, dict)):
+ isinstance(self.allowed_attributes, dict)):
self.wildcard_attributes = self.allowed_attributes.get('*', [])
if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
@@ -56,7 +55,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
unescape(attrs[attr])).lower()
# Remove replacement characters from unescaped
# characters.
- val_unescaped = val_unescaped.replace(u"\ufffd", "")
+ val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
and (val_unescaped.split(':')[0] not in
self.allowed_protocols)):
@@ -67,8 +66,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
' ',
unescape(attrs[attr]))
if (token['name'] in self.svg_allow_local_href and
- 'xlink:href' in attrs and
- re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+ 'xlink:href' in attrs and
+ re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
del attrs['xlink:href']
if 'style' in attrs:
attrs['style'] = self.sanitize_css(attrs['style'])
@@ -79,13 +78,14 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
pass
else:
if token['type'] == tokenTypes['EndTag']:
- token['data'] = '</%s>' % token['name']
+ token['data'] = '</{0!s}>'.format(token['name'])
elif token['data']:
- attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+ attr = ' {0!s}="{1!s}"'
+ attrs = ''.join([attr.format(k, escape(v)) for k, v in
token['data']])
- token['data'] = '<%s%s>' % (token['name'], attrs)
+ token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
else:
- token['data'] = '<%s>' % token['name']
+ token['data'] = '<{0!s}>'.format(token['name'])
if token['selfClosing']:
token['data'] = token['data'][:-1] + '/>'
token['type'] = tokenTypes['Characters']
@@ -112,8 +112,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
# TODO: Make sure this does what it's meant to - I *think* it wants to
# validate style attribute contents.
parts = style.split(';')
- gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
- """|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+ gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
+ """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
for part in parts:
if not gauntlet.match(part):
return ''
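
The sanitizer changes above only restyle how dropped tokens are re-serialized;
the contract of clean() is unchanged and is what the updated tests below
assert. In short::

    from bleach import clean

    # disallowed tags are escaped by default...
    print(clean('an <script>evil()</script> example'))
    # an &lt;script&gt;evil()&lt;/script&gt; example

    # ...or removed entirely with strip=True
    print(clean('a test <em>with</em> <img src="http://example.com/"> '
                '<b>html</b> tags', strip=True))
    # a test <em>with</em> <b>html</b> tags
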
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
index 9eca687..822407f 100644
--- a/bleach/tests/test_basics.py
+++ b/bleach/tests/test_basics.py
@@ -1,7 +1,9 @@
+import six
import html5lib
from nose.tools import eq_
import bleach
+from bleach.tests.tools import in_
def test_empty():
@@ -9,7 +11,12 @@ def test_empty():
def test_nbsp():
- eq_(u'\xa0test string\xa0', bleach.clean('&nbsp;test string&nbsp;'))
+ if six.PY3:
+ expected = '\xa0test string\xa0'
+ else:
+ expected = six.u('\\xa0test string\\xa0')
+
+ eq_(expected, bleach.clean('&nbsp;test string&nbsp;'))
def test_comments_only():
@@ -18,8 +25,8 @@ def test_comments_only():
eq_('', bleach.clean(comment))
eq_('', bleach.clean(open_comment))
eq_(comment, bleach.clean(comment, strip_comments=False))
- eq_('%s-->' % open_comment, bleach.clean(open_comment,
- strip_comments=False))
+ eq_('{0!s}-->'.format(open_comment), bleach.clean(open_comment,
+ strip_comments=False))
def test_with_comments():
@@ -55,9 +62,11 @@ def test_function_arguments():
def test_named_arguments():
ATTRS = {'a': ['rel', 'href']}
- s = u'<a href="http://xx.com" rel="alternate">xx.com</a>'
- eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s))
- eq_(s, bleach.clean(s, attributes=ATTRS))
+ s = ('<a href="http://xx.com" rel="alternate">xx.com</a>',
+ '<a rel="alternate" href="http://xx.com">xx.com</a>')
+
+ eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s[0]))
+ in_(s, bleach.clean(s[0], attributes=ATTRS))
def test_disallowed_html():
@@ -81,19 +90,19 @@ def test_bare_entities():
def test_escaped_entities():
- s = u'&lt;em&gt;strong&lt;/em&gt;'
+ s = '&lt;em&gt;strong&lt;/em&gt;'
eq_(s, bleach.clean(s))
def test_serializer():
- s = u'<table></table>'
+ s = '<table></table>'
eq_(s, bleach.clean(s, tags=['table']))
- eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>'))
- eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p']))
+ eq_('test<table></table>', bleach.linkify('<table>test</table>'))
+ eq_('<p>test</p>', bleach.clean('<p>test</p>', tags=['p']))
def test_no_href_links():
- s = u'<a name="anchor">x</a>'
+ s = '<a name="anchor">x</a>'
eq_(s, bleach.linkify(s))
@@ -112,7 +121,7 @@ def test_stripping():
bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True))
eq_('a test <em>with</em> <b>html</b> tags',
bleach.clean('a test <em>with</em> <img src="http://example.com/"> '
- '<b>html</b> tags', strip=True))
+ '<b>html</b> tags', strip=True))
s = '<p><a href="http://example.com/">link text</a></p>'
eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True))
@@ -138,7 +147,7 @@ def test_allowed_styles():
def test_idempotent():
"""Make sure that applying the filter twice doesn't change anything."""
- dirty = u'<span>invalid & </span> < extra http://link.com<em>'
+ dirty = '<span>invalid & </span> < extra http://link.com<em>'
clean = bleach.clean(dirty)
eq_(clean, bleach.clean(clean))
@@ -147,10 +156,23 @@ def test_idempotent():
eq_(linked, bleach.linkify(linked))
+def test_rel_already_there():
+ """Make sure rel attribute is updated not replaced"""
+ linked = ('Click <a href="http://example.com" rel="tooltip">'
+ 'here</a>.')
+ link_good = (('Click <a href="http://example.com" rel="tooltip nofollow">'
+ 'here</a>.'),
+ ('Click <a rel="tooltip nofollow" href="http://example.com">'
+ 'here</a>.'))
+
+ in_(link_good, bleach.linkify(linked))
+ in_(link_good, bleach.linkify(link_good[0]))
+
+
def test_lowercase_html():
"""We should output lowercase HTML."""
- dirty = u'<EM CLASS="FOO">BAR</EM>'
- clean = u'<em class="FOO">BAR</em>'
+ dirty = '<EM CLASS="FOO">BAR</EM>'
+ clean = '<em class="FOO">BAR</em>'
eq_(clean, bleach.clean(dirty, attributes=['class']))
@@ -160,14 +182,15 @@ def test_wildcard_attributes():
'img': ['src'],
}
TAG = ['img', 'em']
- dirty = (u'both <em id="foo" style="color: black">can</em> have '
- u'<img id="bar" src="foo"/>')
- clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">'
- eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+ dirty = ('both <em id="foo" style="color: black">can</em> have '
+ '<img id="bar" src="foo"/>')
+ clean = ('both <em id="foo">can</em> have <img src="foo" id="bar">',
+ 'both <em id="foo">can</em> have <img id="bar" src="foo">')
+ in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
def test_sarcasm():
"""Jokes should crash.<sarcasm/>"""
- dirty = u'Yeah right <sarcasm/>'
- clean = u'Yeah right &lt;sarcasm/&gt;'
+ dirty = 'Yeah right <sarcasm/>'
+ clean = 'Yeah right &lt;sarcasm/&gt;'
eq_(clean, bleach.clean(dirty))
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
index 588c8ce..b40596f 100644
--- a/bleach/tests/test_css.py
+++ b/bleach/tests/test_css.py
@@ -29,14 +29,14 @@ def test_allowed_css():
('font-family: "Arial";', 'font-family: "Arial";', ['font-family']),
)
- p_single = '<p style="%s">bar</p>'
- p_double = "<p style='%s'>bar</p>"
+ p_single = '<p style="{0!s}">bar</p>'
+ p_double = "<p style='{0!s}'>bar</p>"
def check(i, o, s):
if '"' in i:
- eq_(p_double % o, clean(p_double % i, styles=s))
+ eq_(p_double.format(o), clean(p_double.format(i), styles=s))
else:
- eq_(p_single % o, clean(p_single % i, styles=s))
+ eq_(p_single.format(o), clean(p_single.format(i), styles=s))
for i, o, s in tests:
yield check, i, o, s
@@ -70,12 +70,13 @@ def test_style_hang():
"""font: normal normal normal 100%/normal 'Courier New', """
"""'Andale Mono', monospace; background-position: initial """
"""initial; background-repeat: initial initial;""")
- html = '<p style="%s">Hello world</p>' % style
+ html = '<p style="{0!s}">Hello world</p>'.format(style)
styles = [
'border', 'float', 'overflow', 'min-height', 'vertical-align',
'white-space',
'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
- 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right',
+ 'padding', 'padding-left', 'padding-top', 'padding-bottom',
+ 'padding-right',
'background',
'background-color',
'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index ac593c4..abf889d 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -1,18 +1,20 @@
-import urllib
+try:
+ from urllib.parse import quote_plus
+except ImportError:
+ from urllib import quote_plus
from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_
from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
-
-
+from bleach.tests.tools import in_
def test_url_re():
def no_match(s):
match = url_re.search(s)
if match:
- assert not match, 'matched %s' % s[slice(*match.span())]
+ assert not match, 'matched {0!s}'.format(s[slice(*match.span())])
yield no_match, 'just what i am looking for...it'
@@ -21,36 +23,48 @@ def test_empty():
def test_simple_link():
- eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
+ in_(('a <a href="http://example.com" rel="nofollow">http://example.com'
'</a> link',
+ 'a <a rel="nofollow" href="http://example.com">http://example.com'
+ '</a> link'),
linkify('a http://example.com link'))
- eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
+ in_(('a <a href="https://example.com" rel="nofollow">https://example.com'
'</a> link',
+ 'a <a rel="nofollow" href="https://example.com">https://example.com'
+ '</a> link'),
linkify('a https://example.com link'))
- eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
- linkify('an example.com link'))
+ in_(('a <a href="http://example.com" rel="nofollow">example.com</a> link',
+ 'a <a rel="nofollow" href="http://example.com">example.com</a> link'),
+ linkify('a example.com link'))
def test_trailing_slash():
- eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
- linkify('http://example.com/'))
- eq_('<a href="http://example.com/foo/" rel="nofollow">'
- 'http://example.com/foo/</a>',
- linkify('http://example.com/foo/'))
- eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
- 'http://example.com/foo/bar/</a>',
- linkify('http://example.com/foo/bar/'))
+ in_(('<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>',
+ '<a rel="nofollow" href="http://examp.com/">http://examp.com/</a>'),
+ linkify('http://examp.com/'))
+ in_(('<a href="http://example.com/foo/" rel="nofollow">'
+ 'http://example.com/foo/</a>',
+ '<a rel="nofollow" href="http://example.com/foo/">'
+ 'http://example.com/foo/</a>'),
+ linkify('http://example.com/foo/'))
+ in_(('<a href="http://example.com/foo/bar/" rel="nofollow">'
+ 'http://example.com/foo/bar/</a>',
+ '<a rel="nofollow" href="http://example.com/foo/bar/">'
+ 'http://example.com/foo/bar/</a>'),
+ linkify('http://example.com/foo/bar/'))
def test_mangle_link():
"""We can muck with the href attribute of the link."""
def filter_url(attrs, new=False):
- attrs['href'] = (u'http://bouncer/?u=%s' %
- urllib.quote_plus(attrs['href']))
+ quoted = quote_plus(attrs['href'])
+ attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted)
return attrs
- eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
- 'http://example.com</a>',
+ in_(('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
+ 'http://example.com</a>',
+ '<a rel="nofollow" href="http://bouncer/?u=http%3A%2F%2Fexample.com">'
+ 'http://example.com</a>'),
linkify('http://example.com', DC + [filter_url]))
@@ -76,13 +90,19 @@ def test_email_link():
'james@example.com.au</a> mailto', True,
'aussie james@example.com.au mailto'),
# This is kind of a pathological case. I guess we do our best here.
- ('email to <a href="james@example.com" rel="nofollow">'
- 'james@example.com</a>', True,
- 'email to <a href="james@example.com">james@example.com</a>'),
+ (('email to <a href="james@example.com" rel="nofollow">'
+ 'james@example.com</a>',
+ 'email to <a rel="nofollow" href="james@example.com">'
+ 'james@example.com</a>'),
+ True,
+ 'email to <a href="james@example.com">james@example.com</a>'),
)
def _check(o, p, i):
- eq_(o, linkify(i, parse_email=p))
+ if isinstance(o, (list, tuple)):
+ in_(o, linkify(i, parse_email=p))
+ else:
+ eq_(o, linkify(i, parse_email=p))
for (o, p, i) in tests:
yield _check, o, p, i
@@ -151,7 +171,8 @@ def test_set_attrs():
attrs['rev'] = 'canonical'
return attrs
- eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
+ in_(('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
+ '<a rev="canonical" href="http://ex.mp">ex.mp</a>'),
linkify('ex.mp', [set_attr]))
@@ -179,15 +200,19 @@ def test_stop_email():
def test_tlds():
- eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
+ in_(('<a href="http://example.com" rel="nofollow">example.com</a>',
+ '<a rel="nofollow" href="http://example.com">example.com</a>'),
linkify('example.com'))
- eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+ in_(('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+ '<a rel="nofollow" href="http://example.co.uk">example.co.uk</a>'),
linkify('example.co.uk'))
- eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+ in_(('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+ '<a rel="nofollow" href="http://example.edu">example.edu</a>'),
linkify('example.edu'))
eq_('example.xxx', linkify('example.xxx'))
eq_(' brie', linkify(' brie'))
- eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+ in_(('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+ '<a rel="nofollow" href="http://bit.ly/fun">bit.ly/fun</a>'),
linkify('bit.ly/fun'))
@@ -197,61 +222,81 @@ def test_escaping():
def test_nofollow_off():
eq_('<a href="http://example.com">example.com</a>',
- linkify(u'example.com', []))
+ linkify('example.com', []))
def test_link_in_html():
- eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+ in_(('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+ '<i><a rel="nofollow" href="http://yy.com">http://yy.com</a></i>'),
linkify('<i>http://yy.com</i>'))
- eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
- '</strong></em>',
+
+ in_(('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com'
+ '</a></strong></em>',
+ '<em><strong><a rel="nofollow" href="http://xx.com">http://xx.com'
+ '</a></strong></em>'),
linkify('<em><strong>http://xx.com</strong></em>'))
def test_links_https():
- eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+ in_(('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+ '<a rel="nofollow" href="https://yy.com">https://yy.com</a>'),
linkify('https://yy.com'))
def test_add_rel_nofollow():
"""Verify that rel="nofollow" is added to an existing link"""
- eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+ in_(('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+ '<a rel="nofollow" href="http://yy.com">http://yy.com</a>'),
linkify('<a href="http://yy.com">http://yy.com</a>'))
def test_url_with_path():
- eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
- 'http://example.com/path/to/file</a>',
+ in_(('<a href="http://example.com/path/to/file" rel="nofollow">'
+ 'http://example.com/path/to/file</a>',
+ '<a rel="nofollow" href="http://example.com/path/to/file">'
+ 'http://example.com/path/to/file</a>'),
linkify('http://example.com/path/to/file'))
def test_link_ftp():
- eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
- 'ftp://ftp.mozilla.org/some/file</a>',
+ in_(('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
+ 'ftp://ftp.mozilla.org/some/file</a>',
+ '<a rel="nofollow" href="ftp://ftp.mozilla.org/some/file">'
+ 'ftp://ftp.mozilla.org/some/file</a>'),
linkify('ftp://ftp.mozilla.org/some/file'))
def test_link_query():
- eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
'http://xx.com/?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com/?test=win">'
+ 'http://xx.com/?test=win</a>'),
linkify('http://xx.com/?test=win'))
- eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
'xx.com/?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com/?test=win">'
+ 'xx.com/?test=win</a>'),
linkify('xx.com/?test=win'))
- eq_('<a href="http://xx.com?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com?test=win" rel="nofollow">'
'xx.com?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com?test=win">'
+ 'xx.com?test=win</a>'),
linkify('xx.com?test=win'))
def test_link_fragment():
- eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
- 'http://xx.com/path#frag</a>',
+ in_(('<a href="http://xx.com/path#frag" rel="nofollow">'
+ 'http://xx.com/path#frag</a>',
+ '<a rel="nofollow" href="http://xx.com/path#frag">'
+ 'http://xx.com/path#frag</a>'),
linkify('http://xx.com/path#frag'))
def test_link_entities():
- eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
+ in_(('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
'http://xx.com/?a=1&amp;b=2</a>',
+ '<a rel="nofollow" href="http://xx.com/?a=1&amp;b=2">'
+ 'http://xx.com/?a=1&amp;b=2</a>'),
linkify('http://xx.com/?a=1&b=2'))
@@ -262,9 +307,12 @@ def test_escaped_html():
def test_link_http_complete():
- eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
+ in_(('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
'&amp;e#f" rel="nofollow">'
'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
+ '<a rel="nofollow" href="https://user:pass@ftp.mozilla.org/x/'
+ 'y.exe?a=b&amp;c=d&amp;e#f">'
+ 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>'),
linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
@@ -282,8 +330,10 @@ def test_javascript_url():
def test_unsafe_url():
"""Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
- eq_('All your{"<a href="http://xx.yy.com/grover.png" '
- 'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+ in_(('All your{"<a href="http://xx.yy.com/grover.png" '
+ 'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+ 'All your{"<a rel="nofollow" href="http://xx.yy.com/grover.png"'
+ '>xx.yy.com/grover.png</a>"}base are'),
linkify('All your{"xx.yy.com/grover.png"}base are'))
@@ -291,17 +341,23 @@ def test_skip_pre():
"""Skip linkification in <pre> tags."""
simple = 'http://xx.com <pre>http://xx.com</pre>'
linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+ '<pre>http://xx.com</pre>',
+ '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
'<pre>http://xx.com</pre>')
all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
'<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
+ '</a></pre>',
+ '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
+ '<pre><a rel="nofollow" href="http://xx.com">http://xx.com'
'</a></pre>')
- eq_(linked, linkify(simple, skip_pre=True))
- eq_(all_linked, linkify(simple))
+ in_(linked, linkify(simple, skip_pre=True))
+ in_(all_linked, linkify(simple))
already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
- nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
- eq_(nofollowed, linkify(already_linked))
- eq_(nofollowed, linkify(already_linked, skip_pre=True))
+ nofollowed = ('<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>',
+ '<pre><a rel="nofollow" href="http://xx.com">xx</a></pre>')
+ in_(nofollowed, linkify(already_linked))
+ in_(nofollowed, linkify(already_linked, skip_pre=True))
def test_libgl():
@@ -311,11 +367,13 @@ def test_libgl():
def test_end_of_sentence():
"""example.com. should match."""
- out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
- in_ = u'%s%s'
+ outs = ('<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}',
+ '<a rel="nofollow" href="http://{0!s}">{0!s}</a>{1!s}')
+ intxt = '{0!s}{1!s}'
def check(u, p):
- eq_(out % (u, u, p), linkify(in_ % (u, p)))
+ in_([out.format(u, p) for out in outs],
+ linkify(intxt.format(u, p)))
tests = (
('example.com', '.'),
@@ -330,49 +388,50 @@ def test_end_of_sentence():
def test_end_of_clause():
"""example.com/foo, shouldn't include the ,"""
- eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+ in_(('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+ '<a rel="nofollow" href="http://ex.com/foo">ex.com/foo</a>, bar'),
linkify('ex.com/foo, bar'))
def test_sarcasm():
"""Jokes should crash.<sarcasm/>"""
- dirty = u'Yeah right <sarcasm/>'
- clean = u'Yeah right &lt;sarcasm/&gt;'
+ dirty = 'Yeah right <sarcasm/>'
+ clean = 'Yeah right &lt;sarcasm/&gt;'
eq_(clean, linkify(dirty))
def test_wrapping_parentheses():
"""URLs wrapped in parantheses should not include them."""
- out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
+ outs = ('{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}',
+ '{0!s}<a rel="nofollow" href="http://{1!s}">{2!s}</a>{3!s}')
tests = (
- ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
- ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
- ('(example.com/foo)', out % ('(', 'example.com/foo',
- 'example.com/foo', ')')),
- ('(((example.com/))))', out % ('(((', 'example.com/)',
- 'example.com/)', ')))')),
- ('example.com/))', out % ('', 'example.com/))',
- 'example.com/))', '')),
+ ('(example.com)', ('(', 'example.com', 'example.com', ')')),
+ ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
+ ('(example.com/foo)', ('(', 'example.com/foo',
+ 'example.com/foo', ')')),
+ ('(((example.com/))))', ('(((', 'example.com/)',
+ 'example.com/)', ')))')),
+ ('example.com/))', ('', 'example.com/))', 'example.com/))', '')),
('http://en.wikipedia.org/wiki/Test_(assessment)',
- out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
- 'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
+ ('', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
('(http://en.wikipedia.org/wiki/Test_(assessment))',
- out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
- 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
+ ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
('((http://en.wikipedia.org/wiki/Test_(assessment))',
- out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
- 'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
+ ('((', 'en.wikipedia.org/wiki/Test_(assessment',
+ 'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
('(http://en.wikipedia.org/wiki/Test_(assessment)))',
- out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
- 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
+ ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
+ 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
('(http://en.wikipedia.org/wiki/)Test_(assessment',
- out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
- 'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
+ ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
+ 'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
)
def check(test, expected_output):
- eq_(expected_output, linkify(test))
+ in_([o.format(*expected_output) for o in outs], linkify(test))
for test, expected_output in tests:
yield check, test, expected_output
@@ -389,7 +448,9 @@ def test_ports():
)
def check(test, output):
- eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
+ outs = ('<a href="{0}" rel="nofollow">{0}</a>{1}',
+ '<a rel="nofollow" href="{0}">{0}</a>{1}')
+ in_([out.format(*output) for out in outs],
linkify(test))
for test, output in tests:
@@ -406,8 +467,9 @@ def test_tokenizer():
def test_ignore_bad_protocols():
eq_('foohttp://bar',
linkify('foohttp://bar'))
- eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
- linkify('foohttp://exampl.com'))
+ in_(('fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
+ 'fohttp://<a rel="nofollow" href="http://exampl.com">exampl.com</a>'),
+ linkify('fohttp://exampl.com'))
def test_max_recursion_depth():
@@ -420,21 +482,28 @@ def test_link_emails_and_urls():
"""parse_email=True shouldn't prevent URLs from getting linkified."""
output = ('<a href="http://example.com" rel="nofollow">'
'http://example.com</a> <a href="mailto:person@example.com">'
+ 'person@example.com</a>',
+ '<a rel="nofollow" href="http://example.com">'
+ 'http://example.com</a> <a href="mailto:person@example.com">'
'person@example.com</a>')
- eq_(output, linkify('http://example.com person@example.com',
+ in_(output, linkify('http://example.com person@example.com',
parse_email=True))
def test_links_case_insensitive():
"""Protocols and domain names are case insensitive."""
expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
+ 'HTTP://EXAMPLE.COM</a>',
+ '<a rel="nofollow" href="HTTP://EXAMPLE.COM">'
'HTTP://EXAMPLE.COM</a>')
- eq_(expect, linkify('HTTP://EXAMPLE.COM'))
+ in_(expect, linkify('HTTP://EXAMPLE.COM'))
def test_elements_inside_links():
- eq_(u'<a href="#" rel="nofollow">hello<br></a>',
+ in_(('<a href="#" rel="nofollow">hello<br></a>',
+ '<a rel="nofollow" href="#">hello<br></a>'),
linkify('<a href="#">hello<br></a>'))
- eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
+ in_(('<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
+ '<a rel="nofollow" href="#"><strong>bold</strong> hello<br></a>'),
linkify('<a href="#"><strong>bold</strong> hello<br></a>'))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
index 6c2b33f..6adab59 100644
--- a/bleach/tests/test_security.py
+++ b/bleach/tests/test_security.py
@@ -25,10 +25,10 @@ def test_invalid_attr():
clean('<a onclick="evil" href="test">test</a>'))
eq_('<img src="test">',
clean('<img onclick="evil" src="test" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
eq_('<img src="test">',
clean('<img href="invalid" src="test" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
def test_unquoted_attr():
@@ -57,7 +57,7 @@ def test_invalid_filter_attr():
eq_('<img src="http://example.com/">',
clean('<img onclick="evil" src="http://example.com/" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />',
tags=IMG, attributes=IMG_ATTR))
@@ -91,9 +91,9 @@ def test_nasty():
"""Nested, broken up, multiple tags, are still foiled!"""
test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</'
'<script></script>script<del></del>>')
- expect = (u'&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
- u'&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
- u'&gt;')
+ expect = ('&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
+ '&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
+ '&gt;')
eq_(expect, clean(test))
diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py
index 67123cc..796924d 100644
--- a/bleach/tests/test_unicode.py
+++ b/bleach/tests/test_unicode.py
@@ -1,54 +1,59 @@
# -*- coding: utf-8 -*-
-
+from __future__ import unicode_literals
from nose.tools import eq_
from bleach import clean, linkify
+from bleach.tests.tools import in_
def test_japanese_safe_simple():
- eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル'))
- eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル'))
+ eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル'))
+ eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル'))
def test_japanese_strip():
- eq_(u'<em>ヘルプとチュートリアル</em>',
- clean(u'<em>ヘルプとチュートリアル</em>'))
- eq_(u'&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
- clean(u'<span>ヘルプとチュートリアル</span>'))
+ eq_('<em>ヘルプとチュートリアル</em>',
+ clean('<em>ヘルプとチュートリアル</em>'))
+ eq_('&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
+ clean('<span>ヘルプとチュートリアル</span>'))
def test_russian_simple():
- eq_(u'Домашняя', clean(u'Домашняя'))
- eq_(u'Домашняя', linkify(u'Домашняя'))
+ eq_('Домашняя', clean('Домашняя'))
+ eq_('Домашняя', linkify('Домашняя'))
def test_mixed():
- eq_(u'Домашняяヘルプとチュートリアル',
- clean(u'Домашняяヘルプとチュートリアル'))
+ eq_('Домашняяヘルプとチュートリアル',
+ clean('Домашняяヘルプとチュートリアル'))
def test_mixed_linkify():
- eq_(u'Домашняя <a href="http://example.com" rel="nofollow">'
- u'http://example.com</a> ヘルプとチュートリアル',
- linkify(u'Домашняя http://example.com ヘルプとチュートリアル'))
+ in_(('Домашняя <a href="http://example.com" rel="nofollow">'
+ 'http://example.com</a> ヘルプとチュートリアル',
+ 'Домашняя <a rel="nofollow" href="http://example.com">'
+ 'http://example.com</a> ヘルプとチュートリアル'),
+ linkify('Домашняя http://example.com ヘルプとチュートリアル'))
def test_url_utf8():
"""Allow UTF8 characters in URLs themselves."""
- out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>'
+ outs = ('<a href="{0!s}" rel="nofollow">{0!s}</a>',
+ '<a rel="nofollow" href="{0!s}">{0!s}</a>')
+
+ out = lambda url: [x.format(url) for x in outs]
tests = (
- ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}),
- ('http://éxámplé.com/íàñá/',
- out % {'url': u'http://éxámplé.com/íàñá/'}),
+ ('http://éxámplé.com/', out('http://éxámplé.com/')),
+ ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')),
('http://éxámplé.com/íàñá/?foo=bar',
- out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}),
+ out('http://éxámplé.com/íàñá/?foo=bar')),
('http://éxámplé.com/íàñá/?fóo=bár',
- out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}),
+ out('http://éxámplé.com/íàñá/?fóo=bár')),
)
def check(test, expected_output):
- eq_(expected_output, linkify(test))
+ in_(expected_output, linkify(test))
for test, expected_output in tests:
yield check, test, expected_output
diff --git a/bleach/tests/tools.py b/bleach/tests/tools.py
new file mode 100644
index 0000000..87f926c
--- /dev/null
+++ b/bleach/tests/tools.py
@@ -0,0 +1,7 @@
+
+
+def in_(l, a, msg=None):
+ """Shorthand for 'assert a in l, "%r not in %r" % (a, l)
+ """
+ if not a in l:
+ raise AssertionError(msg or "%r not in %r" % (a, l))
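
The in_() helper exists because the etree serializer does not guarantee
attribute order across Python versions, so the linkify tests above were
rewritten to accept any member of a tuple of equivalent serializations::

    from bleach import linkify
    from bleach.tests.tools import in_

    expected = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a>',
                '<a rel="nofollow" href="http://xx.com">http://xx.com</a>')
    # passes whichever attribute ordering the serializer emits
    in_(expected, linkify('http://xx.com'))
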
diff --git a/docs/clean.rst b/docs/clean.rst
index a31dc89..2fb888b 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -85,7 +85,7 @@ allowed but no values will be.
For example, to allow users to set the color and font-weight of text::
attrs = {
- '*': 'style'
+ '*': ['style']
}
tags = ['p', 'em', 'strong']
styles = ['color', 'font-weight']
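
With the corrected wildcard key above (a list, not a bare string), per-tag and
wildcard attribute rules compose with the styles whitelist; a sketch mirroring
test_wildcard_attributes and test_allowed_styles from this diff::

    import bleach

    attrs = {'*': ['style']}
    styles = ['color', 'font-weight']
    print(bleach.clean('<p style="color: red; display: none">hi</p>',
                       tags=['p'], attributes=attrs, styles=styles))
    # <p style="color: red;">hi</p>
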
diff --git a/docs/conf.py b/docs/conf.py
index a63aedf..96b2fc8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -48,9 +48,9 @@ copyright = u'2012, James Socol'
# built documents.
#
# The short X.Y version.
-version = '1.2'
+version = '1.3'
# The full version, including alpha/beta/rc tags.
-release = '1.2.0'
+release = '1.3.1'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/index.rst b/docs/index.rst
index 0929e53..0439786 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,14 +16,9 @@ regular-expression-based sanitizers.
Bleach's ``linkify`` function is highly configurable and can be used to find,
edit, and filter links most other auto-linkers can't.
-The version of bleach on GitHub_ is the always the most up-to-date and the
+The version of bleach on GitHub_ is always the most up-to-date and the
``master`` branch should always work.
-.. warn::
-
- Bleach is currently incompatible with html5lib 1.0b and any versions below
- 0.9.5.
-
Installing Bleach
=================
@@ -56,7 +51,6 @@ Contents:
goals
-
Indices and tables
==================
@@ -64,6 +58,6 @@ Indices and tables
* :ref:`modindex`
* :ref:`search`
-.. _html5lib: http://code.google.com/p/html5lib/
+.. _html5lib: https://github.com/html5lib/html5lib-python
.. _GitHub: https://github.com/jsocol/bleach
-.. _PyPI: http://pypi.python.org/pypi/bleach
+.. _PyPI: https://pypi.python.org/pypi/bleach
diff --git a/requirements.txt b/requirements.txt
index 1500a14..d6e9357 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
-# These are the requirements to run the test suite.
-nose==1.3.0
-html5lib==0.9.5
+six
+html5lib>=0.999
+# Requirements to run the test suite:
+nose
+flake8
diff --git a/setup.py b/setup.py
index e48c3f7..6d5cfb4 100644
--- a/setup.py
+++ b/setup.py
@@ -2,26 +2,35 @@ from setuptools import setup, find_packages
setup(
name='bleach',
- version='1.2.2',
+ version='1.4',
description='An easy whitelist-based HTML-sanitizing tool.',
long_description=open('README.rst').read(),
author='James Socol',
- author_email='james@mozilla.com',
+ author_email='me@jamessocol.com',
url='http://github.com/jsocol/bleach',
- license='BSD',
+ license='Apache Software License',
packages=find_packages(),
include_package_data=True,
package_data={'': ['README.rst']},
zip_safe=False,
- install_requires=['html5lib==0.95'],
+ install_requires=[
+ 'six',
+ 'html5lib>=0.999',
+ ],
classifiers=[
- 'Development Status :: 4 - Beta',
+ 'Development Status :: 5 - Production/Stable',
'Environment :: Web Environment',
'Environment :: Web Environment :: Mozilla',
'Intended Audience :: Developers',
- 'License :: OSI Approved :: BSD License',
+ 'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python',
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.2',
+ 'Programming Language :: Python :: 3.3',
'Topic :: Software Development :: Libraries :: Python Modules',
]
)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..4d8e5f6
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,12 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27, py32, py33, pypy
+
+[testenv]
+commands = nosetests {posargs:-v}
+deps =
+ nose