author     Per Andersson <avtobiff@gmail.com>  2014-01-30 01:25:11 +0100
committer  Per Andersson <avtobiff@gmail.com>  2014-01-30 01:25:11 +0100
commit     6cff86ce6de27fbd4f9fc07716fb1205b14ffae4 (patch)
tree       15d1f032fcf000801b69307d2463c10004489ee4
parent     fac84c6d90e0875e6c1b10c5ef02d577ee008af4 (diff)
download   python-bleach-6cff86ce6de27fbd4f9fc07716fb1205b14ffae4.tar
           python-bleach-6cff86ce6de27fbd4f9fc07716fb1205b14ffae4.tar.gz

Imported Upstream version 1.4 (tag: upstream/1.4)
-rw-r--r--  CHANGES                         20
-rw-r--r--  CONTRIBUTORS                     5
-rw-r--r--  LICENSE                         35
-rw-r--r--  README.rst                       2
-rw-r--r--  bleach/__init__.py             227
-rw-r--r--  bleach/callbacks.py              7
-rw-r--r--  bleach/encoding.py              60
-rw-r--r--  bleach/sanitizer.py             24
-rw-r--r--  bleach/tests/test_basics.py     65
-rw-r--r--  bleach/tests/test_css.py        13
-rw-r--r--  bleach/tests/test_links.py     241
-rw-r--r--  bleach/tests/test_security.py   12
-rw-r--r--  bleach/tests/test_unicode.py    47
-rw-r--r--  bleach/tests/tools.py            7
-rw-r--r--  docs/clean.rst                   2
-rw-r--r--  docs/conf.py                     4
-rw-r--r--  docs/index.rst                  12
-rw-r--r--  requirements.txt                 8
-rw-r--r--  setup.py                        21
-rw-r--r--  tox.ini                         12
20 files changed, 514 insertions(+), 310 deletions(-)
diff --git a/CHANGES b/CHANGES
index d9bad9c..1def1a2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,26 @@
Bleach Changes
==============
+Version 1.4
+-----------
+
+- Update linkify to use an etree TreeWalker instead of simpletree.
+- Update html5lib requirement to version >= 0.999.
+- Update all code to be compatible with Python 3 and 2 using six.
+- Switch to Apache License.
+
+
+Version 1.3
+-----------
+
+- Used by Python 3-only fork.
+
+
+Version 1.2.2
+-------------
+
+- Pin html5lib to version 0.95 for now due to major API break.
+
Version 1.2.1
-------------
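
Taken together, the 1.4 entries above mean the same calls behave identically
under Python 2 and 3. A minimal sketch of the public API they describe (the
rel="nofollow" default and the escaping behavior are both pinned down by the
test changes later in this diff; attribute order in the serialized output may
vary between interpreter versions)::

    import bleach

    # Bare URLs are wrapped in anchors; the nofollow callback runs by default.
    print(bleach.linkify('a http://example.com link'))
    # a <a href="http://example.com" rel="nofollow">http://example.com</a> link

    # Markup outside the whitelist is escaped, not executed.
    print(bleach.clean('Yeah right <sarcasm/>'))
    # Yeah right &lt;sarcasm/&gt;
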
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index f014916..c2d052a 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -3,7 +3,7 @@ within and without the Mozilla Corporation and Foundation.
Lead Developer:
-- James Socol <james@mozilla.com>
+- James Socol <me@jamessocol.com>
Contributors:
@@ -23,3 +23,6 @@ Patches:
- Anton Kovalyov
- Mark Paschal
- Alex Ehlke
+- Marc DM
+- mdxs
+- Marc Abramowitz
diff --git a/LICENSE b/LICENSE
index b2df30c..f7afaef 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,28 +1,13 @@
-Copyright (c) 2010, Mozilla Foundation
-All rights reserved.
+Copyright (c) 2014, Mozilla Foundation
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. Neither the name of bleach nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/README.rst b/README.rst
index 093edc1..5e52cae 100644
--- a/README.rst
+++ b/README.rst
@@ -60,7 +60,7 @@ Then install it by running::
$ python setup.py install
-.. _html5lib: http://code.google.com/p/html5lib/
+.. _html5lib: https://github.com/html5lib/html5lib-python
.. _GitHub: https://github.com/jsocol/bleach
.. _ReadTheDocs: http://bleach.readthedocs.org/
.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/bleach/__init__.py b/bleach/__init__.py
index af75d0f..b110972 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,6 +1,8 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
import logging
import re
-import sys
import html5lib
from html5lib.sanitizer import HTMLSanitizer
@@ -11,8 +13,8 @@ from .encoding import force_unicode
from .sanitizer import BleachSanitizer
-VERSION = (1, 2, 1)
-__version__ = '1.2.1'
+VERSION = (1, 4, 0)
+__version__ = '1.4'
__all__ = ['clean', 'linkify']
@@ -61,12 +63,12 @@ TLDS.reverse()
url_re = re.compile(
r"""\(* # Match any opening parentheses.
- \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
- ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
- (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+ \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
+ ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
+ (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+ """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
re.IGNORECASE | re.VERBOSE | re.UNICODE)
proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
@@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$')
email_re = re.compile(
r"""(?<!//)
- (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
- (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom
+ (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
+ (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom
|^"([\001-\010\013\014\016-\037!#-\[\]-\177]
|\\[\001-011\013\014\016-\177])*" # quoted-string
)@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain
@@ -85,17 +87,18 @@ email_re = re.compile(
NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
+# a simple routine that returns the tag name with the namespace prefix
+# as returned by etree's Element.tag attribute
-PY_26 = (sys.version_info < (2, 7))
-RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES, strip=False, strip_comments=True):
"""Clean an HTML fragment and return it"""
if not text:
- return u''
+ return ''
text = force_unicode(text)
@@ -123,22 +126,38 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
text = force_unicode(text)
if not text:
- return u''
+ return ''
parser = html5lib.HTMLParser(tokenizer=tokenizer)
forest = parser.parseFragment(text)
+ _seen = set([])
- def replace_nodes(tree, new_frag, node):
+ def replace_nodes(tree, new_frag, node, index=0):
+ """
+ Doesn't really replace nodes, but inserts the nodes contained in
+    new_frag into the tree at position index and returns the number
+    of nodes inserted.
+    If node is passed in, it is removed from the tree.
+ """
+ count = 0
new_tree = parser.parseFragment(new_frag)
- for n in new_tree.childNodes:
- # Prevent us from re-parsing links new links as existing links.
- if n.name == 'a':
- n._seen = True
- tree.insertBefore(n, node)
- tree.removeChild(node)
- # Return the number of new nodes.
- return len(new_tree.childNodes) - 1
+ # capture any non-tag text at the start of the fragment
+ if new_tree.text:
+ if index == 0:
+ tree.text += new_tree.text
+ else:
+ tree[index-1].tail += new_tree.text
+    # then put the tagged elements into the old tree
+ for n in new_tree:
+ if n.tag == ETREE_TAG('a'):
+ _seen.add(n)
+ tree.insert(index+count, n)
+ count += 1
+ # if we got a node to remove...
+ if node is not None:
+ tree.remove(node)
+ return count
def strip_wrapping_parentheses(fragment):
"""Strips wrapping parentheses.
@@ -189,58 +208,102 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
return None
return attrs
+ def _render_inner(node):
+ out = ['' if node.text is None else node.text]
+ for subnode in node:
+ out.append(_render(subnode))
+ if subnode.tail:
+ out.append(subnode.tail)
+ return ''.join(out)
+
def linkify_nodes(tree, parse_text=True):
- # I know this isn't Pythonic, but we're sometimes mutating
- # tree.childNodes, which ends up breaking the loop and causing us to
- # reparse code.
- children = len(tree.childNodes)
- current = 0 # A pointer to the "current" node.
- while current < children:
- node = tree.childNodes[current]
- if node.type == NODE_TEXT and parse_text:
- new_frag = _render(node)
- # Look for email addresses?
- if parse_email:
- new_frag = re.sub(email_re, email_repl, new_frag)
- if new_frag != _render(node):
- adj = replace_nodes(tree, new_frag, node)
+ children = len(tree)
+ current_child = -1
+ # start at -1 to process the parent first
+ while current_child < len(tree):
+ if current_child < 0:
+ node = tree
+ if parse_text and node.text:
+ new_txt = old_txt = node.text
+ if parse_email:
+ new_txt = re.sub(email_re, email_repl, node.text)
+ if new_txt and new_txt != node.text:
+ node.text = ''
+ adj = replace_nodes(tree, new_txt, None, 0)
+ children += adj
+ current_child += adj
+ linkify_nodes(tree, True)
+ continue
+
+ new_txt = re.sub(url_re, link_repl, new_txt)
+ if new_txt != old_txt:
+ node.text = ''
+ adj = replace_nodes(tree, new_txt, None, 0)
children += adj
- current += adj
- linkify_nodes(tree)
+ current_child += adj
continue
- new_frag = re.sub(url_re, link_repl, new_frag)
- if new_frag != _render(node):
- adj = replace_nodes(tree, new_frag, node)
+ else:
+ node = tree[current_child]
+
+ if parse_text and node.tail:
+ new_tail = old_tail = node.tail
+ if parse_email:
+ new_tail = re.sub(email_re, email_repl, new_tail)
+ if new_tail != node.tail:
+ node.tail = ''
+ adj = replace_nodes(tree, new_tail, None,
+ current_child+1)
+                    # insert the new nodes made from my tail into
+                    # the tree right after me (at current_child+1)
+ children += adj
+
+ new_tail = re.sub(url_re, link_repl, new_tail)
+ if new_tail != old_tail:
+ node.tail = ''
+ adj = replace_nodes(tree, new_tail, None, current_child+1)
children += adj
- current += adj
- elif node.name == 'a' and not getattr(node, '_seen', False):
- if 'href' in node.attributes:
- attrs = node.attributes
- _text = attrs['_text'] = ''.join(c.toxml() for
- c in node.childNodes)
+
+ if node.tag == ETREE_TAG('a') and not (node in _seen):
+ if not node.get('href', None) is None:
+ attrs = dict(node.items())
+
+ _text = attrs['_text'] = _render_inner(node)
+
attrs = apply_callbacks(attrs, False)
- if attrs is not None:
+
+ if attrs is None:
+ # <a> tag replaced by the text within it
+ adj = replace_nodes(tree, _text, node,
+ current_child)
+ current_child -= 1
+ # pull back current_child by 1 to scan the
+ # new nodes again.
+ else:
text = force_unicode(attrs.pop('_text'))
- node.attributes = attrs
- for n in reversed(node.childNodes):
- node.removeChild(n)
+ for attr_key, attr_val in attrs.items():
+ node.set(attr_key, attr_val)
+
+ for n in reversed(list(node)):
+ node.remove(n)
text = parser.parseFragment(text)
- for n in text.childNodes:
- node.appendChild(n)
- node._seen = True
- else:
- replace_nodes(tree, _text, node)
- elif skip_pre and node.name == 'pre':
- linkify_nodes(node, False)
- elif not getattr(node, '_seen', False):
- linkify_nodes(node)
- current += 1
+ node.text = text.text
+ for n in text:
+ node.append(n)
+ _seen.add(node)
+
+ elif current_child >= 0:
+ if node.tag == ETREE_TAG('pre') and skip_pre:
+ linkify_nodes(node, False)
+ elif not (node in _seen):
+ linkify_nodes(node, True)
+
+ current_child += 1
def email_repl(match):
addr = match.group(0).replace('"', '&quot;')
link = {
'_text': addr,
- 'href': 'mailto:%s' % addr,
+ 'href': 'mailto:{0!s}'.format(addr),
}
link = apply_callbacks(link, True)
@@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
_href = link.pop('href')
_text = link.pop('_text')
- repl = '<a href="%s" %s>%s</a>'
- attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
- return repl % (_href, attribs, _text)
+ repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+ return repl.format(_href, attribs, _text)
def link_repl(match):
url = match.group(0)
open_brackets = close_brackets = 0
if url.startswith('('):
- url, open_brackets, close_brackets = (
- strip_wrapping_parentheses(url)
- )
- end = u''
+ _wrapping = strip_wrapping_parentheses(url)
+ url, open_brackets, close_brackets = _wrapping
+ end = ''
m = re.search(punct_re, url)
if m:
end = m.group(0)
@@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
if re.search(proto_re, url):
href = url
else:
- href = u''.join([u'http://', url])
+ href = ''.join(['http://', url])
link = {
'_text': url,
@@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
_text = link.pop('_text')
_href = link.pop('href')
- repl = u'%s<a href="%s" %s>%s</a>%s%s'
- attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
+ repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
- return repl % ('(' * open_brackets,
- _href, attribs, _text, end,
- ')' * close_brackets)
+ return repl.format('(' * open_brackets,
+ _href, attribs, _text, end,
+ ')' * close_brackets)
try:
linkify_nodes(forest)
- except (RECURSION_EXCEPTION), e:
+ except RuntimeError as e:
# If we hit the max recursion depth, just return what we've got.
- log.exception('Probable recursion error: %r' % e)
+ log.exception('Probable recursion error: {0!r}'.format(e))
return _render(forest)
def _render(tree):
"""Try rendering as HTML, then XML, then give up."""
- try:
- return force_unicode(_serialize(tree))
- except AssertionError: # The treewalker throws this sometimes.
- return force_unicode(tree.toxml())
+ return force_unicode(_serialize(tree))
def _serialize(domtree):
- walker = html5lib.treewalkers.getTreeWalker('simpletree')
+ walker = html5lib.treewalkers.getTreeWalker('etree')
stream = walker(domtree)
serializer = HTMLSerializer(quote_attr_values=True,
omit_optional_tags=False)
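
The new ETREE_TAG helper and the 'etree' TreeWalker above exist because
html5lib's etree treebuilder namespaces every element and splits surrounding
text across .text and .tail. A sketch of the shapes linkify now traverses
(assumes html5lib >= 0.999, whose parseFragment defaults to the etree
treebuilder)::

    import html5lib

    forest = html5lib.parseFragment('check <a href="#">this</a> out')
    node = forest[0]
    print(forest.text)  # 'check '  -- text before the first child
    print(node.tag)     # '{http://www.w3.org/1999/xhtml}a'
    print(node.text)    # 'this'
    print(node.tail)    # ' out'    -- text after the element lives on .tail

    # hence the namespaced comparison used throughout linkify_nodes:
    ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
    print(node.tag == ETREE_TAG('a'))  # True
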
diff --git a/bleach/callbacks.py b/bleach/callbacks.py
index cc4682d..227f089 100644
--- a/bleach/callbacks.py
+++ b/bleach/callbacks.py
@@ -1,10 +1,15 @@
"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
def nofollow(attrs, new=False):
if attrs['href'].startswith('mailto:'):
return attrs
- attrs['rel'] = 'nofollow'
+ rel = [x for x in attrs.get('rel', '').split(' ') if x]
+ if not 'nofollow' in [x.lower() for x in rel]:
+ rel.append('nofollow')
+ attrs['rel'] = ' '.join(rel)
+
return attrs
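
The rewritten callback appends nofollow to an existing rel value instead of
clobbering it, which the new test_rel_already_there test exercises below.
Calling it directly::

    from bleach.callbacks import nofollow

    # mailto: links are returned untouched
    print(nofollow({'href': 'mailto:me@example.com'}))
    # {'href': 'mailto:me@example.com'}

    # an existing rel is extended, not replaced
    print(nofollow({'href': 'http://example.com', 'rel': 'tooltip'}))
    # {'href': 'http://example.com', 'rel': 'tooltip nofollow'}
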
diff --git a/bleach/encoding.py b/bleach/encoding.py
index b9a989d..707adaa 100644
--- a/bleach/encoding.py
+++ b/bleach/encoding.py
@@ -1,6 +1,7 @@
import datetime
from decimal import Decimal
import types
+import six
def is_protected_type(obj):
@@ -10,45 +11,52 @@ def is_protected_type(obj):
force_unicode(strings_only=True).
"""
return isinstance(obj, (
- types.NoneType,
- int, long,
- datetime.datetime, datetime.date, datetime.time,
- float, Decimal)
+ six.integer_types +
+ (types.NoneType,
+ datetime.datetime, datetime.date, datetime.time,
+ float, Decimal))
)
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
"""
- Similar to smart_unicode, except that lazy instances are resolved to
+ Similar to smart_text, except that lazy instances are resolved to
strings, rather than kept as lazy objects.
If strings_only is True, don't convert (some) non-string-like objects.
"""
+ # Handle the common case first, saves 30-40% when s is an instance of
+ # six.text_type. This function gets called often in that setting.
+ if isinstance(s, six.text_type):
+ return s
if strings_only and is_protected_type(s):
return s
try:
- if not isinstance(s, basestring,):
+ if not isinstance(s, six.string_types):
if hasattr(s, '__unicode__'):
- s = unicode(s)
+ s = s.__unicode__()
else:
- try:
- s = unicode(str(s), encoding, errors)
- except UnicodeEncodeError:
- if not isinstance(s, Exception):
- raise
- # If we get to here, the caller has passed in an Exception
- # subclass populated with non-ASCII data without special
- # handling to display as a string. We need to handle this
- # without raising a further exception. We do an
- # approximation to what the Exception's standard str()
- # output should be.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
- elif not isinstance(s, unicode):
- # Note: We use .decode() here, instead of unicode(s, encoding,
- # errors), so that if s is a SafeString, it ends up being a
- # SafeUnicode at the end.
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ else:
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s,
+ # encoding, errors), so that if s is a SafeBytes, it ends up being
+ # a SafeText at the end.
s = s.decode(encoding, errors)
- except UnicodeDecodeError, e:
- raise UnicodeDecodeError(*e.args)
+ except UnicodeDecodeError as e:
+ if not isinstance(s, Exception):
+ raise UnicodeDecodeError(*e.args)
+ else:
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII bytestring data without a
+ # working unicode method. Try to handle this without raising a
+ # further exception by individually forcing the exception args
+ # to unicode.
+ s = ' '.join([force_unicode(arg, encoding, strings_only,
+ errors) for arg in s])
return s
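
force_unicode now fast-paths values that are already text and decodes bytes
explicitly on both interpreters. A sketch of the observable behavior
(Python 3 shown; under Python 2 the same calls return unicode objects)::

    from bleach.encoding import force_unicode

    print(force_unicode(b'caf\xc3\xa9'))         # 'café' -- bytes decoded as UTF-8
    print(force_unicode('already text'))         # returned as-is via the fast path
    print(force_unicode(42, strings_only=True))  # 42 -- protected types pass through
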
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 4640012..88246f8 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape
@@ -14,8 +15,6 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
"""Mixin to replace sanitize_token() and sanitize_css()."""
allowed_svg_properties = []
- # TODO: When the next html5lib version comes out, nuke this.
- attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
def sanitize_token(self, token):
"""Sanitize a token either by HTML-encoding or dropping.
@@ -30,7 +29,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
"""
if (getattr(self, 'wildcard_attributes', None) is None and
- isinstance(self.allowed_attributes, dict)):
+ isinstance(self.allowed_attributes, dict)):
self.wildcard_attributes = self.allowed_attributes.get('*', [])
if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
@@ -56,7 +55,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
unescape(attrs[attr])).lower()
# Remove replacement characters from unescaped
# characters.
- val_unescaped = val_unescaped.replace(u"\ufffd", "")
+ val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
and (val_unescaped.split(':')[0] not in
self.allowed_protocols)):
@@ -67,8 +66,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
' ',
unescape(attrs[attr]))
if (token['name'] in self.svg_allow_local_href and
- 'xlink:href' in attrs and
- re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+ 'xlink:href' in attrs and
+ re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
del attrs['xlink:href']
if 'style' in attrs:
attrs['style'] = self.sanitize_css(attrs['style'])
@@ -79,13 +78,14 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
pass
else:
if token['type'] == tokenTypes['EndTag']:
- token['data'] = '</%s>' % token['name']
+ token['data'] = '</{0!s}>'.format(token['name'])
elif token['data']:
- attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+ attr = ' {0!s}="{1!s}"'
+ attrs = ''.join([attr.format(k, escape(v)) for k, v in
token['data']])
- token['data'] = '<%s%s>' % (token['name'], attrs)
+ token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
else:
- token['data'] = '<%s>' % token['name']
+ token['data'] = '<{0!s}>'.format(token['name'])
if token['selfClosing']:
token['data'] = token['data'][:-1] + '/>'
token['type'] = tokenTypes['Characters']
@@ -112,8 +112,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
# TODO: Make sure this does what it's meant to - I *think* it wants to
# validate style attribute contents.
parts = style.split(';')
- gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
- """|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+ gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
+ """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
for part in parts:
if not gauntlet.match(part):
return ''
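
The sanitizer changes above only restyle how dropped tokens are re-serialized;
the contract of clean() is unchanged and is what the updated tests below
assert. In short::

    from bleach import clean

    # disallowed tags are escaped by default...
    print(clean('an <script>evil()</script> example'))
    # an &lt;script&gt;evil()&lt;/script&gt; example

    # ...or removed entirely with strip=True
    print(clean('a test <em>with</em> <img src="http://example.com/"> '
                '<b>html</b> tags', strip=True))
    # a test <em>with</em> <b>html</b> tags
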
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
index 9eca687..822407f 100644
--- a/bleach/tests/test_basics.py
+++ b/bleach/tests/test_basics.py
@@ -1,7 +1,9 @@
+import six
import html5lib
from nose.tools import eq_
import bleach
+from bleach.tests.tools import in_
def test_empty():
@@ -9,7 +11,12 @@ def test_empty():
def test_nbsp():
- eq_(u'\xa0test string\xa0', bleach.clean('&nbsp;test string&nbsp;'))
+ if six.PY3:
+ expected = '\xa0test string\xa0'
+ else:
+ expected = six.u('\\xa0test string\\xa0')
+
+ eq_(expected, bleach.clean('&nbsp;test string&nbsp;'))
def test_comments_only():
@@ -18,8 +25,8 @@ def test_comments_only():
eq_('', bleach.clean(comment))
eq_('', bleach.clean(open_comment))
eq_(comment, bleach.clean(comment, strip_comments=False))
- eq_('%s-->' % open_comment, bleach.clean(open_comment,
- strip_comments=False))
+ eq_('{0!s}-->'.format(open_comment), bleach.clean(open_comment,
+ strip_comments=False))
def test_with_comments():
@@ -55,9 +62,11 @@ def test_function_arguments():
def test_named_arguments():
ATTRS = {'a': ['rel', 'href']}
- s = u'<a href="http://xx.com" rel="alternate">xx.com</a>'
- eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s))
- eq_(s, bleach.clean(s, attributes=ATTRS))
+ s = ('<a href="http://xx.com" rel="alternate">xx.com</a>',
+ '<a rel="alternate" href="http://xx.com">xx.com</a>')
+
+ eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s[0]))
+ in_(s, bleach.clean(s[0], attributes=ATTRS))
def test_disallowed_html():
@@ -81,19 +90,19 @@ def test_bare_entities():
def test_escaped_entities():
- s = u'&lt;em&gt;strong&lt;/em&gt;'
+ s = '&lt;em&gt;strong&lt;/em&gt;'
eq_(s, bleach.clean(s))
def test_serializer():
- s = u'<table></table>'
+ s = '<table></table>'
eq_(s, bleach.clean(s, tags=['table']))
- eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>'))
- eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p']))
+ eq_('test<table></table>', bleach.linkify('<table>test</table>'))
+ eq_('<p>test</p>', bleach.clean('<p>test</p>', tags=['p']))
def test_no_href_links():
- s = u'<a name="anchor">x</a>'
+ s = '<a name="anchor">x</a>'
eq_(s, bleach.linkify(s))
@@ -112,7 +121,7 @@ def test_stripping():
bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True))
eq_('a test <em>with</em> <b>html</b> tags',
bleach.clean('a test <em>with</em> <img src="http://example.com/"> '
- '<b>html</b> tags', strip=True))
+ '<b>html</b> tags', strip=True))
s = '<p><a href="http://example.com/">link text</a></p>'
eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True))
@@ -138,7 +147,7 @@ def test_allowed_styles():
def test_idempotent():
"""Make sure that applying the filter twice doesn't change anything."""
- dirty = u'<span>invalid & </span> < extra http://link.com<em>'
+ dirty = '<span>invalid & </span> < extra http://link.com<em>'
clean = bleach.clean(dirty)
eq_(clean, bleach.clean(clean))
@@ -147,10 +156,23 @@ def test_idempotent():
eq_(linked, bleach.linkify(linked))
+def test_rel_already_there():
+ """Make sure rel attribute is updated not replaced"""
+ linked = ('Click <a href="http://example.com" rel="tooltip">'
+ 'here</a>.')
+ link_good = (('Click <a href="http://example.com" rel="tooltip nofollow">'
+ 'here</a>.'),
+ ('Click <a rel="tooltip nofollow" href="http://example.com">'
+ 'here</a>.'))
+
+ in_(link_good, bleach.linkify(linked))
+ in_(link_good, bleach.linkify(link_good[0]))
+
+
def test_lowercase_html():
"""We should output lowercase HTML."""
- dirty = u'<EM CLASS="FOO">BAR</EM>'
- clean = u'<em class="FOO">BAR</em>'
+ dirty = '<EM CLASS="FOO">BAR</EM>'
+ clean = '<em class="FOO">BAR</em>'
eq_(clean, bleach.clean(dirty, attributes=['class']))
@@ -160,14 +182,15 @@ def test_wildcard_attributes():
'img': ['src'],
}
TAG = ['img', 'em']
- dirty = (u'both <em id="foo" style="color: black">can</em> have '
- u'<img id="bar" src="foo"/>')
- clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">'
- eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+ dirty = ('both <em id="foo" style="color: black">can</em> have '
+ '<img id="bar" src="foo"/>')
+ clean = ('both <em id="foo">can</em> have <img src="foo" id="bar">',
+ 'both <em id="foo">can</em> have <img id="bar" src="foo">')
+ in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
def test_sarcasm():
"""Jokes should crash.<sarcasm/>"""
- dirty = u'Yeah right <sarcasm/>'
- clean = u'Yeah right &lt;sarcasm/&gt;'
+ dirty = 'Yeah right <sarcasm/>'
+ clean = 'Yeah right &lt;sarcasm/&gt;'
eq_(clean, bleach.clean(dirty))
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
index 588c8ce..b40596f 100644
--- a/bleach/tests/test_css.py
+++ b/bleach/tests/test_css.py
@@ -29,14 +29,14 @@ def test_allowed_css():
('font-family: "Arial";', 'font-family: "Arial";', ['font-family']),
)
- p_single = '<p style="%s">bar</p>'
- p_double = "<p style='%s'>bar</p>"
+ p_single = '<p style="{0!s}">bar</p>'
+ p_double = "<p style='{0!s}'>bar</p>"
def check(i, o, s):
if '"' in i:
- eq_(p_double % o, clean(p_double % i, styles=s))
+ eq_(p_double.format(o), clean(p_double.format(i), styles=s))
else:
- eq_(p_single % o, clean(p_single % i, styles=s))
+ eq_(p_single.format(o), clean(p_single.format(i), styles=s))
for i, o, s in tests:
yield check, i, o, s
@@ -70,12 +70,13 @@ def test_style_hang():
"""font: normal normal normal 100%/normal 'Courier New', """
"""'Andale Mono', monospace; background-position: initial """
"""initial; background-repeat: initial initial;""")
- html = '<p style="%s">Hello world</p>' % style
+ html = '<p style="{0!s}">Hello world</p>'.format(style)
styles = [
'border', 'float', 'overflow', 'min-height', 'vertical-align',
'white-space',
'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
- 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right',
+ 'padding', 'padding-left', 'padding-top', 'padding-bottom',
+ 'padding-right',
'background',
'background-color',
'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index ac593c4..abf889d 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -1,18 +1,20 @@
-import urllib
+try:
+ from urllib.parse import quote_plus
+except ImportError:
+ from urllib import quote_plus
from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_
from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
-
-
+from bleach.tests.tools import in_
def test_url_re():
def no_match(s):
match = url_re.search(s)
if match:
- assert not match, 'matched %s' % s[slice(*match.span())]
+ assert not match, 'matched {0!s}'.format(s[slice(*match.span())])
yield no_match, 'just what i am looking for...it'
@@ -21,36 +23,48 @@ def test_empty():
def test_simple_link():
- eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
+ in_(('a <a href="http://example.com" rel="nofollow">http://example.com'
'</a> link',
+ 'a <a rel="nofollow" href="http://example.com">http://example.com'
+ '</a> link'),
linkify('a http://example.com link'))
- eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
+ in_(('a <a href="https://example.com" rel="nofollow">https://example.com'
'</a> link',
+ 'a <a rel="nofollow" href="https://example.com">https://example.com'
+ '</a> link'),
linkify('a https://example.com link'))
- eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
- linkify('an example.com link'))
+ in_(('a <a href="http://example.com" rel="nofollow">example.com</a> link',
+ 'a <a rel="nofollow" href="http://example.com">example.com</a> link'),
+ linkify('a example.com link'))
def test_trailing_slash():
- eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
- linkify('http://example.com/'))
- eq_('<a href="http://example.com/foo/" rel="nofollow">'
- 'http://example.com/foo/</a>',
- linkify('http://example.com/foo/'))
- eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
- 'http://example.com/foo/bar/</a>',
- linkify('http://example.com/foo/bar/'))
+ in_(('<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>',
+ '<a rel="nofollow" href="http://examp.com/">http://examp.com/</a>'),
+ linkify('http://examp.com/'))
+ in_(('<a href="http://example.com/foo/" rel="nofollow">'
+ 'http://example.com/foo/</a>',
+ '<a rel="nofollow" href="http://example.com/foo/">'
+ 'http://example.com/foo/</a>'),
+ linkify('http://example.com/foo/'))
+ in_(('<a href="http://example.com/foo/bar/" rel="nofollow">'
+ 'http://example.com/foo/bar/</a>',
+ '<a rel="nofollow" href="http://example.com/foo/bar/">'
+ 'http://example.com/foo/bar/</a>'),
+ linkify('http://example.com/foo/bar/'))
def test_mangle_link():
"""We can muck with the href attribute of the link."""
def filter_url(attrs, new=False):
- attrs['href'] = (u'http://bouncer/?u=%s' %
- urllib.quote_plus(attrs['href']))
+ quoted = quote_plus(attrs['href'])
+ attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted)
return attrs
- eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
- 'http://example.com</a>',
+ in_(('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
+ 'http://example.com</a>',
+ '<a rel="nofollow" href="http://bouncer/?u=http%3A%2F%2Fexample.com">'
+ 'http://example.com</a>'),
linkify('http://example.com', DC + [filter_url]))
@@ -76,13 +90,19 @@ def test_email_link():
'james@example.com.au</a> mailto', True,
'aussie james@example.com.au mailto'),
# This is kind of a pathological case. I guess we do our best here.
- ('email to <a href="james@example.com" rel="nofollow">'
- 'james@example.com</a>', True,
- 'email to <a href="james@example.com">james@example.com</a>'),
+ (('email to <a href="james@example.com" rel="nofollow">'
+ 'james@example.com</a>',
+ 'email to <a rel="nofollow" href="james@example.com">'
+ 'james@example.com</a>'),
+ True,
+ 'email to <a href="james@example.com">james@example.com</a>'),
)
def _check(o, p, i):
- eq_(o, linkify(i, parse_email=p))
+ if isinstance(o, (list, tuple)):
+ in_(o, linkify(i, parse_email=p))
+ else:
+ eq_(o, linkify(i, parse_email=p))
for (o, p, i) in tests:
yield _check, o, p, i
@@ -151,7 +171,8 @@ def test_set_attrs():
attrs['rev'] = 'canonical'
return attrs
- eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
+ in_(('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
+ '<a rev="canonical" href="http://ex.mp">ex.mp</a>'),
linkify('ex.mp', [set_attr]))
@@ -179,15 +200,19 @@ def test_stop_email():
def test_tlds():
- eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
+ in_(('<a href="http://example.com" rel="nofollow">example.com</a>',
+ '<a rel="nofollow" href="http://example.com">example.com</a>'),
linkify('example.com'))
- eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+ in_(('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+ '<a rel="nofollow" href="http://example.co.uk">example.co.uk</a>'),
linkify('example.co.uk'))
- eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+ in_(('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+ '<a rel="nofollow" href="http://example.edu">example.edu</a>'),
linkify('example.edu'))
eq_('example.xxx', linkify('example.xxx'))
eq_(' brie', linkify(' brie'))
- eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+ in_(('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+ '<a rel="nofollow" href="http://bit.ly/fun">bit.ly/fun</a>'),
linkify('bit.ly/fun'))
@@ -197,61 +222,81 @@ def test_escaping():
def test_nofollow_off():
eq_('<a href="http://example.com">example.com</a>',
- linkify(u'example.com', []))
+ linkify('example.com', []))
def test_link_in_html():
- eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+ in_(('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+ '<i><a rel="nofollow" href="http://yy.com">http://yy.com</a></i>'),
linkify('<i>http://yy.com</i>'))
- eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
- '</strong></em>',
+
+ in_(('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com'
+ '</a></strong></em>',
+ '<em><strong><a rel="nofollow" href="http://xx.com">http://xx.com'
+ '</a></strong></em>'),
linkify('<em><strong>http://xx.com</strong></em>'))
def test_links_https():
- eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+ in_(('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+ '<a rel="nofollow" href="https://yy.com">https://yy.com</a>'),
linkify('https://yy.com'))
def test_add_rel_nofollow():
"""Verify that rel="nofollow" is added to an existing link"""
- eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+ in_(('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+ '<a rel="nofollow" href="http://yy.com">http://yy.com</a>'),
linkify('<a href="http://yy.com">http://yy.com</a>'))
def test_url_with_path():
- eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
- 'http://example.com/path/to/file</a>',
+ in_(('<a href="http://example.com/path/to/file" rel="nofollow">'
+ 'http://example.com/path/to/file</a>',
+ '<a rel="nofollow" href="http://example.com/path/to/file">'
+ 'http://example.com/path/to/file</a>'),
linkify('http://example.com/path/to/file'))
def test_link_ftp():
- eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
- 'ftp://ftp.mozilla.org/some/file</a>',
+ in_(('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
+ 'ftp://ftp.mozilla.org/some/file</a>',
+ '<a rel="nofollow" href="ftp://ftp.mozilla.org/some/file">'
+ 'ftp://ftp.mozilla.org/some/file</a>'),
linkify('ftp://ftp.mozilla.org/some/file'))
def test_link_query():
- eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
'http://xx.com/?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com/?test=win">'
+ 'http://xx.com/?test=win</a>'),
linkify('http://xx.com/?test=win'))
- eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
'xx.com/?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com/?test=win">'
+ 'xx.com/?test=win</a>'),
linkify('xx.com/?test=win'))
- eq_('<a href="http://xx.com?test=win" rel="nofollow">'
+ in_(('<a href="http://xx.com?test=win" rel="nofollow">'
'xx.com?test=win</a>',
+ '<a rel="nofollow" href="http://xx.com?test=win">'
+ 'xx.com?test=win</a>'),
linkify('xx.com?test=win'))
def test_link_fragment():
- eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
- 'http://xx.com/path#frag</a>',
+ in_(('<a href="http://xx.com/path#frag" rel="nofollow">'
+ 'http://xx.com/path#frag</a>',
+ '<a rel="nofollow" href="http://xx.com/path#frag">'
+ 'http://xx.com/path#frag</a>'),
linkify('http://xx.com/path#frag'))
def test_link_entities():
- eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
+ in_(('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
'http://xx.com/?a=1&amp;b=2</a>',
+ '<a rel="nofollow" href="http://xx.com/?a=1&amp;b=2">'
+ 'http://xx.com/?a=1&amp;b=2</a>'),
linkify('http://xx.com/?a=1&b=2'))
@@ -262,9 +307,12 @@ def test_escaped_html():
def test_link_http_complete():
- eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
+ in_(('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
'&amp;e#f" rel="nofollow">'
'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
+ '<a rel="nofollow" href="https://user:pass@ftp.mozilla.org/x/'
+ 'y.exe?a=b&amp;c=d&amp;e#f">'
+ 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>'),
linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
@@ -282,8 +330,10 @@ def test_javascript_url():
def test_unsafe_url():
"""Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
- eq_('All your{"<a href="http://xx.yy.com/grover.png" '
- 'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+ in_(('All your{"<a href="http://xx.yy.com/grover.png" '
+ 'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+ 'All your{"<a rel="nofollow" href="http://xx.yy.com/grover.png"'
+ '>xx.yy.com/grover.png</a>"}base are'),
linkify('All your{"xx.yy.com/grover.png"}base are'))
@@ -291,17 +341,23 @@ def test_skip_pre():
"""Skip linkification in <pre> tags."""
simple = 'http://xx.com <pre>http://xx.com</pre>'
linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+ '<pre>http://xx.com</pre>',
+ '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
'<pre>http://xx.com</pre>')
all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
'<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
+ '</a></pre>',
+ '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
+ '<pre><a rel="nofollow" href="http://xx.com">http://xx.com'
'</a></pre>')
- eq_(linked, linkify(simple, skip_pre=True))
- eq_(all_linked, linkify(simple))
+ in_(linked, linkify(simple, skip_pre=True))
+ in_(all_linked, linkify(simple))
already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
- nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
- eq_(nofollowed, linkify(already_linked))
- eq_(nofollowed, linkify(already_linked, skip_pre=True))
+ nofollowed = ('<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>',
+ '<pre><a rel="nofollow" href="http://xx.com">xx</a></pre>')
+ in_(nofollowed, linkify(already_linked))
+ in_(nofollowed, linkify(already_linked, skip_pre=True))
def test_libgl():
@@ -311,11 +367,13 @@ def test_libgl():
def test_end_of_sentence():
"""example.com. should match."""
- out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
- in_ = u'%s%s'
+ outs = ('<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}',
+ '<a rel="nofollow" href="http://{0!s}">{0!s}</a>{1!s}')
+ intxt = '{0!s}{1!s}'
def check(u, p):
- eq_(out % (u, u, p), linkify(in_ % (u, p)))
+ in_([out.format(u, p) for out in outs],
+ linkify(intxt.format(u, p)))
tests = (
('example.com', '.'),
@@ -330,49 +388,50 @@ def test_end_of_sentence():
def test_end_of_clause():
"""example.com/foo, shouldn't include the ,"""
- eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+ in_(('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+ '<a rel="nofollow" href="http://ex.com/foo">ex.com/foo</a>, bar'),
linkify('ex.com/foo, bar'))
def test_sarcasm():
"""Jokes should crash.<sarcasm/>"""
- dirty = u'Yeah right <sarcasm/>'
- clean = u'Yeah right &lt;sarcasm/&gt;'
+ dirty = 'Yeah right <sarcasm/>'
+ clean = 'Yeah right &lt;sarcasm/&gt;'
eq_(clean, linkify(dirty))
def test_wrapping_parentheses():
"""URLs wrapped in parantheses should not include them."""
- out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
+ outs = ('{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}',
+ '{0!s}<a rel="nofollow" href="http://{1!s}">{2!s}</a>{3!s}')
tests = (
- ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
- ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
- ('(example.com/foo)', out % ('(', 'example.com/foo',
- 'example.com/foo', ')')),
- ('(((example.com/))))', out % ('(((', 'example.com/)',
- 'example.com/)', ')))')),
- ('example.com/))', out % ('', 'example.com/))',
- 'example.com/))', '')),
+ ('(example.com)', ('(', 'example.com', 'example.com', ')')),
+ ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
+ ('(example.com/foo)', ('(', 'example.com/foo',
+ 'example.com/foo', ')')),
+ ('(((example.com/))))', ('(((', 'example.com/)',
+ 'example.com/)', ')))')),
+ ('example.com/))', ('', 'example.com/))', 'example.com/))', '')),
('http://en.wikipedia.org/wiki/Test_(assessment)',
- out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
- 'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
+ ('', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
('(http://en.wikipedia.org/wiki/Test_(assessment))',
- out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
- 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
+ ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
('((http://en.wikipedia.org/wiki/Test_(assessment))',
- out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
- 'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
+ ('((', 'en.wikipedia.org/wiki/Test_(assessment',
+ 'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
('(http://en.wikipedia.org/wiki/Test_(assessment)))',
- out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
- 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
+ ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
+ 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
('(http://en.wikipedia.org/wiki/)Test_(assessment',
- out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
- 'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
+ ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
+ 'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
)
def check(test, expected_output):
- eq_(expected_output, linkify(test))
+ in_([o.format(*expected_output) for o in outs], linkify(test))
for test, expected_output in tests:
yield check, test, expected_output
@@ -389,7 +448,9 @@ def test_ports():
)
def check(test, output):
- eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
+ outs = ('<a href="{0}" rel="nofollow">{0}</a>{1}',
+ '<a rel="nofollow" href="{0}">{0}</a>{1}')
+ in_([out.format(*output) for out in outs],
linkify(test))
for test, output in tests:
@@ -406,8 +467,9 @@ def test_tokenizer():
def test_ignore_bad_protocols():
eq_('foohttp://bar',
linkify('foohttp://bar'))
- eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
- linkify('foohttp://exampl.com'))
+ in_(('fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
+ 'fohttp://<a rel="nofollow" href="http://exampl.com">exampl.com</a>'),
+ linkify('fohttp://exampl.com'))
def test_max_recursion_depth():
@@ -420,21 +482,28 @@ def test_link_emails_and_urls():
"""parse_email=True shouldn't prevent URLs from getting linkified."""
output = ('<a href="http://example.com" rel="nofollow">'
'http://example.com</a> <a href="mailto:person@example.com">'
+ 'person@example.com</a>',
+ '<a rel="nofollow" href="http://example.com">'
+ 'http://example.com</a> <a href="mailto:person@example.com">'
'person@example.com</a>')
- eq_(output, linkify('http://example.com person@example.com',
+ in_(output, linkify('http://example.com person@example.com',
parse_email=True))
def test_links_case_insensitive():
"""Protocols and domain names are case insensitive."""
expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
+ 'HTTP://EXAMPLE.COM</a>',
+ '<a rel="nofollow" href="HTTP://EXAMPLE.COM">'
'HTTP://EXAMPLE.COM</a>')
- eq_(expect, linkify('HTTP://EXAMPLE.COM'))
+ in_(expect, linkify('HTTP://EXAMPLE.COM'))
def test_elements_inside_links():
- eq_(u'<a href="#" rel="nofollow">hello<br></a>',
+ in_(('<a href="#" rel="nofollow">hello<br></a>',
+ '<a rel="nofollow" href="#">hello<br></a>'),
linkify('<a href="#">hello<br></a>'))
- eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
+ in_(('<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
+ '<a rel="nofollow" href="#"><strong>bold</strong> hello<br></a>'),
linkify('<a href="#"><strong>bold</strong> hello<br></a>'))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
index 6c2b33f..6adab59 100644
--- a/bleach/tests/test_security.py
+++ b/bleach/tests/test_security.py
@@ -25,10 +25,10 @@ def test_invalid_attr():
clean('<a onclick="evil" href="test">test</a>'))
eq_('<img src="test">',
clean('<img onclick="evil" src="test" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
eq_('<img src="test">',
clean('<img href="invalid" src="test" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
def test_unquoted_attr():
@@ -57,7 +57,7 @@ def test_invalid_filter_attr():
eq_('<img src="http://example.com/">',
clean('<img onclick="evil" src="http://example.com/" />',
- tags=IMG, attributes=IMG_ATTR))
+ tags=IMG, attributes=IMG_ATTR))
eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />',
tags=IMG, attributes=IMG_ATTR))
@@ -91,9 +91,9 @@ def test_nasty():
"""Nested, broken up, multiple tags, are still foiled!"""
test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</'
'<script></script>script<del></del>>')
- expect = (u'&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
- u'&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
- u'&gt;')
+ expect = ('&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
+ '&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
+ '&gt;')
eq_(expect, clean(test))
diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py
index 67123cc..796924d 100644
--- a/bleach/tests/test_unicode.py
+++ b/bleach/tests/test_unicode.py
@@ -1,54 +1,59 @@
# -*- coding: utf-8 -*-
-
+from __future__ import unicode_literals
from nose.tools import eq_
from bleach import clean, linkify
+from bleach.tests.tools import in_
def test_japanese_safe_simple():
- eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル'))
- eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル'))
+ eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル'))
+ eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル'))
def test_japanese_strip():
- eq_(u'<em>ヘルプとチュートリアル</em>',
- clean(u'<em>ヘルプとチュートリアル</em>'))
- eq_(u'&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
- clean(u'<span>ヘルプとチュートリアル</span>'))
+ eq_('<em>ヘルプとチュートリアル</em>',
+ clean('<em>ヘルプとチュートリアル</em>'))
+ eq_('&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
+ clean('<span>ヘルプとチュートリアル</span>'))
def test_russian_simple():
- eq_(u'Домашняя', clean(u'Домашняя'))
- eq_(u'Домашняя', linkify(u'Домашняя'))
+ eq_('Домашняя', clean('Домашняя'))
+ eq_('Домашняя', linkify('Домашняя'))
def test_mixed():
- eq_(u'Домашняяヘルプとチュートリアル',
- clean(u'Домашняяヘルプとチュートリアル'))
+ eq_('Домашняяヘルプとチュートリアル',
+ clean('Домашняяヘルプとチュートリアル'))
def test_mixed_linkify():
- eq_(u'Домашняя <a href="http://example.com" rel="nofollow">'
- u'http://example.com</a> ヘルプとチュートリアル',
- linkify(u'Домашняя http://example.com ヘルプとチュートリアル'))
+ in_(('Домашняя <a href="http://example.com" rel="nofollow">'
+ 'http://example.com</a> ヘルプとチュートリアル',
+ 'Домашняя <a rel="nofollow" href="http://example.com">'
+ 'http://example.com</a> ヘルプとチュートリアル'),
+ linkify('Домашняя http://example.com ヘルプとチュートリアル'))
def test_url_utf8():
"""Allow UTF8 characters in URLs themselves."""
- out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>'
+ outs = ('<a href="{0!s}" rel="nofollow">{0!s}</a>',
+ '<a rel="nofollow" href="{0!s}">{0!s}</a>')
+
+ out = lambda url: [x.format(url) for x in outs]
tests = (
- ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}),
- ('http://éxámplé.com/íàñá/',
- out % {'url': u'http://éxámplé.com/íàñá/'}),
+ ('http://éxámplé.com/', out('http://éxámplé.com/')),
+ ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')),
('http://éxámplé.com/íàñá/?foo=bar',
- out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}),
+ out('http://éxámplé.com/íàñá/?foo=bar')),
('http://éxámplé.com/íàñá/?fóo=bár',
- out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}),
+ out('http://éxámplé.com/íàñá/?fóo=bár')),
)
def check(test, expected_output):
- eq_(expected_output, linkify(test))
+ in_(expected_output, linkify(test))
for test, expected_output in tests:
yield check, test, expected_output
diff --git a/bleach/tests/tools.py b/bleach/tests/tools.py
new file mode 100644
index 0000000..87f926c
--- /dev/null
+++ b/bleach/tests/tools.py
@@ -0,0 +1,7 @@
+
+
+def in_(l, a, msg=None):
+ """Shorthand for 'assert a in l, "%r not in %r" % (a, l)
+ """
+ if not a in l:
+ raise AssertionError(msg or "%r not in %r" % (a, l))
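
The in_() helper exists because the etree serializer does not guarantee
attribute order across Python versions, so the linkify tests above were
rewritten to accept any member of a tuple of equivalent serializations::

    from bleach import linkify
    from bleach.tests.tools import in_

    expected = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a>',
                '<a rel="nofollow" href="http://xx.com">http://xx.com</a>')
    # passes whichever attribute ordering the serializer emits
    in_(expected, linkify('http://xx.com'))
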
diff --git a/docs/clean.rst b/docs/clean.rst
index a31dc89..2fb888b 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -85,7 +85,7 @@ allowed but no values will be.
For example, to allow users to set the color and font-weight of text::
attrs = {
- '*': 'style'
+ '*': ['style']
}
tags = ['p', 'em', 'strong']
styles = ['color', 'font-weight']
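
With the corrected wildcard key above (a list, not a bare string), per-tag and
wildcard attribute rules compose with the styles whitelist; a sketch mirroring
test_wildcard_attributes and test_allowed_styles from this diff::

    import bleach

    attrs = {'*': ['style']}
    styles = ['color', 'font-weight']
    print(bleach.clean('<p style="color: red; display: none">hi</p>',
                       tags=['p'], attributes=attrs, styles=styles))
    # <p style="color: red;">hi</p>
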
diff --git a/docs/conf.py b/docs/conf.py
index a63aedf..96b2fc8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -48,9 +48,9 @@ copyright = u'2012, James Socol'
# built documents.
#
# The short X.Y version.
-version = '1.2'
+version = '1.3'
# The full version, including alpha/beta/rc tags.
-release = '1.2.0'
+release = '1.3.1'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/index.rst b/docs/index.rst
index 0929e53..0439786 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,14 +16,9 @@ regular-expression-based sanitizers.
Bleach's ``linkify`` function is highly configurable and can be used to find,
edit, and filter links most other auto-linkers can't.
-The version of bleach on GitHub_ is the always the most up-to-date and the
+The version of bleach on GitHub_ is always the most up-to-date and the
``master`` branch should always work.
-.. warn::
-
- Bleach is currently incompatible with html5lib 1.0b and any versions below
- 0.9.5.
-
Installing Bleach
=================
@@ -56,7 +51,6 @@ Contents:
goals
-
Indices and tables
==================
@@ -64,6 +58,6 @@ Indices and tables
* :ref:`modindex`
* :ref:`search`
-.. _html5lib: http://code.google.com/p/html5lib/
+.. _html5lib: https://github.com/html5lib/html5lib-python
.. _GitHub: https://github.com/jsocol/bleach
-.. _PyPI: http://pypi.python.org/pypi/bleach
+.. _PyPI: https://pypi.python.org/pypi/bleach
diff --git a/requirements.txt b/requirements.txt
index 1500a14..d6e9357 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
-# These are the requirements to run the test suite.
-nose==1.3.0
-html5lib==0.9.5
+six
+html5lib>=0.999
+# Requirements to run the test suite:
+nose
+flake8
diff --git a/setup.py b/setup.py
index e48c3f7..6d5cfb4 100644
--- a/setup.py
+++ b/setup.py
@@ -2,26 +2,35 @@ from setuptools import setup, find_packages
setup(
name='bleach',
- version='1.2.2',
+ version='1.4',
description='An easy whitelist-based HTML-sanitizing tool.',
long_description=open('README.rst').read(),
author='James Socol',
- author_email='james@mozilla.com',
+ author_email='me@jamessocol.com',
url='http://github.com/jsocol/bleach',
- license='BSD',
+ license='Apache Software License',
packages=find_packages(),
include_package_data=True,
package_data={'': ['README.rst']},
zip_safe=False,
- install_requires=['html5lib==0.95'],
+ install_requires=[
+ 'six',
+ 'html5lib>=0.999',
+ ],
classifiers=[
- 'Development Status :: 4 - Beta',
+ 'Development Status :: 5 - Production/Stable',
'Environment :: Web Environment',
'Environment :: Web Environment :: Mozilla',
'Intended Audience :: Developers',
- 'License :: OSI Approved :: BSD License',
+ 'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python',
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.2',
+ 'Programming Language :: Python :: 3.3',
'Topic :: Software Development :: Libraries :: Python Modules',
]
)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..4d8e5f6
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,12 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27, py32, py33, pypy
+
+[testenv]
+commands = nosetests {posargs:-v}
+deps =
+ nose