diff options
-rw-r--r-- | CHANGES | 20 | ||||
-rw-r--r-- | CONTRIBUTORS | 5 | ||||
-rw-r--r-- | LICENSE | 35 | ||||
-rw-r--r-- | README.rst | 2 | ||||
-rw-r--r-- | bleach/__init__.py | 227 | ||||
-rw-r--r-- | bleach/callbacks.py | 7 | ||||
-rw-r--r-- | bleach/encoding.py | 60 | ||||
-rw-r--r-- | bleach/sanitizer.py | 24 | ||||
-rw-r--r-- | bleach/tests/test_basics.py | 65 | ||||
-rw-r--r-- | bleach/tests/test_css.py | 13 | ||||
-rw-r--r-- | bleach/tests/test_links.py | 241 | ||||
-rw-r--r-- | bleach/tests/test_security.py | 12 | ||||
-rw-r--r-- | bleach/tests/test_unicode.py | 47 | ||||
-rw-r--r-- | bleach/tests/tools.py | 7 | ||||
-rw-r--r-- | docs/clean.rst | 2 | ||||
-rw-r--r-- | docs/conf.py | 4 | ||||
-rw-r--r-- | docs/index.rst | 12 | ||||
-rw-r--r-- | requirements.txt | 8 | ||||
-rw-r--r-- | setup.py | 21 | ||||
-rw-r--r-- | tox.ini | 12 |
20 files changed, 514 insertions, 310 deletions
@@ -1,6 +1,26 @@ Bleach Changes ============== +Version 1.4 +----------- + +- Update linkify to use etree type Treewalker instead of simpletree. +- Updated html5lib to version >= 0.999. +- Update all code to be compatible with Python 3 and 2 using six. +- Switch to Apache License. + + +Version 1.3 +----------- + +- Used by Python 3-only fork. + + +Version 1.2.2 +------------- + +- Pin html5lib to version 0.95 for now due to major API break. + Version 1.2.1 ------------- diff --git a/CONTRIBUTORS b/CONTRIBUTORS index f014916..c2d052a 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -3,7 +3,7 @@ within and without the Mozilla Corporation and Foundation. Lead Developer: -- James Socol <james@mozilla.com> +- James Socol <me@jamessocol.com> Contributors: @@ -23,3 +23,6 @@ Patches: - Anton Kovalyov - Mark Paschal - Alex Ehlke +- Marc DM +- mdxs +- Marc Abramowitz @@ -1,28 +1,13 @@ -Copyright (c) 2010, Mozilla Foundation -All rights reserved. +Copyright (c) 2014, Mozilla Foundation -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of bleach nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. @@ -60,7 +60,7 @@ Then install it by running:: $ python setup.py install -.. _html5lib: http://code.google.com/p/html5lib/ +.. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/jsocol/bleach .. _ReadTheDocs: http://bleach.readthedocs.org/ .. 
_PyPI: http://pypi.python.org/pypi/bleach diff --git a/bleach/__init__.py b/bleach/__init__.py index af75d0f..b110972 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,6 +1,8 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals import logging import re -import sys import html5lib from html5lib.sanitizer import HTMLSanitizer @@ -11,8 +13,8 @@ from .encoding import force_unicode from .sanitizer import BleachSanitizer -VERSION = (1, 2, 1) -__version__ = '1.2.1' +VERSION = (1, 4, 0) +__version__ = '1.4' __all__ = ['clean', 'linkify'] @@ -61,12 +63,12 @@ TLDS.reverse() url_re = re.compile( r"""\(* # Match any opening parentheses. - \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http:// - ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)? - (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)? + \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http:// + ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)? + (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)), + """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$') email_re = re.compile( r"""(?<!//) - (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+ - (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom + (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+ + (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom |^"([\001-\010\013\014\016-\037!#-\[\]-\177] |\\[\001-011\013\014\016-\177])*" # quoted-string )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain @@ -85,17 +87,18 @@ email_re = re.compile( NODE_TEXT = 4 # The numeric ID of a text node in simpletree. 
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] +ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) +# a simple routine that returns the tag name with the namespace prefix +# as returned by etree's Element.tag attribute -PY_26 = (sys.version_info < (2, 7)) -RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError +DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, strip=False, strip_comments=True): """Clean an HTML fragment and return it""" if not text: - return u'' + return '' text = force_unicode(text) @@ -123,22 +126,38 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return u'' + return '' parser = html5lib.HTMLParser(tokenizer=tokenizer) forest = parser.parseFragment(text) + _seen = set([]) - def replace_nodes(tree, new_frag, node): + def replace_nodes(tree, new_frag, node, index=0): + """ + Doesn't really replace nodes, but inserts the nodes contained in + new_frag into the treee at position index and returns the number + of nodes inserted. + If node is passed in, it is removed from the tree + """ + count = 0 new_tree = parser.parseFragment(new_frag) - for n in new_tree.childNodes: - # Prevent us from re-parsing links new links as existing links. - if n.name == 'a': - n._seen = True - tree.insertBefore(n, node) - tree.removeChild(node) - # Return the number of new nodes. - return len(new_tree.childNodes) - 1 + # capture any non-tag text at the start of the fragment + if new_tree.text: + if index == 0: + tree.text += new_tree.text + else: + tree[index-1].tail += new_tree.text + # the put in the tagged elements into the old tree + for n in new_tree: + if n.tag == ETREE_TAG('a'): + _seen.add(n) + tree.insert(index+count, n) + count += 1 + # if we got a node to remove... 
+ if node is not None: + tree.remove(node) + return count def strip_wrapping_parentheses(fragment): """Strips wrapping parentheses. @@ -189,58 +208,102 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, return None return attrs + def _render_inner(node): + out = ['' if node.text is None else node.text] + for subnode in node: + out.append(_render(subnode)) + if subnode.tail: + out.append(subnode.tail) + return ''.join(out) + def linkify_nodes(tree, parse_text=True): - # I know this isn't Pythonic, but we're sometimes mutating - # tree.childNodes, which ends up breaking the loop and causing us to - # reparse code. - children = len(tree.childNodes) - current = 0 # A pointer to the "current" node. - while current < children: - node = tree.childNodes[current] - if node.type == NODE_TEXT and parse_text: - new_frag = _render(node) - # Look for email addresses? - if parse_email: - new_frag = re.sub(email_re, email_repl, new_frag) - if new_frag != _render(node): - adj = replace_nodes(tree, new_frag, node) + children = len(tree) + current_child = -1 + # start at -1 to process the parent first + while current_child < len(tree): + if current_child < 0: + node = tree + if parse_text and node.text: + new_txt = old_txt = node.text + if parse_email: + new_txt = re.sub(email_re, email_repl, node.text) + if new_txt and new_txt != node.text: + node.text = '' + adj = replace_nodes(tree, new_txt, None, 0) + children += adj + current_child += adj + linkify_nodes(tree, True) + continue + + new_txt = re.sub(url_re, link_repl, new_txt) + if new_txt != old_txt: + node.text = '' + adj = replace_nodes(tree, new_txt, None, 0) children += adj - current += adj - linkify_nodes(tree) + current_child += adj continue - new_frag = re.sub(url_re, link_repl, new_frag) - if new_frag != _render(node): - adj = replace_nodes(tree, new_frag, node) + else: + node = tree[current_child] + + if parse_text and node.tail: + new_tail = old_tail = node.tail + if parse_email: + new_tail = 
re.sub(email_re, email_repl, new_tail) + if new_tail != node.tail: + node.tail = '' + adj = replace_nodes(tree, new_tail, None, + current_child+1) + #insert the new nodes made from my tail into + # the tree right after me. current_child+1 + children += adj + + new_tail = re.sub(url_re, link_repl, new_tail) + if new_tail != old_tail: + node.tail = '' + adj = replace_nodes(tree, new_tail, None, current_child+1) children += adj - current += adj - elif node.name == 'a' and not getattr(node, '_seen', False): - if 'href' in node.attributes: - attrs = node.attributes - _text = attrs['_text'] = ''.join(c.toxml() for - c in node.childNodes) + + if node.tag == ETREE_TAG('a') and not (node in _seen): + if not node.get('href', None) is None: + attrs = dict(node.items()) + + _text = attrs['_text'] = _render_inner(node) + attrs = apply_callbacks(attrs, False) - if attrs is not None: + + if attrs is None: + # <a> tag replaced by the text within it + adj = replace_nodes(tree, _text, node, + current_child) + current_child -= 1 + # pull back current_child by 1 to scan the + # new nodes again. 
+ else: text = force_unicode(attrs.pop('_text')) - node.attributes = attrs - for n in reversed(node.childNodes): - node.removeChild(n) + for attr_key, attr_val in attrs.items(): + node.set(attr_key, attr_val) + + for n in reversed(list(node)): + node.remove(n) text = parser.parseFragment(text) - for n in text.childNodes: - node.appendChild(n) - node._seen = True - else: - replace_nodes(tree, _text, node) - elif skip_pre and node.name == 'pre': - linkify_nodes(node, False) - elif not getattr(node, '_seen', False): - linkify_nodes(node) - current += 1 + node.text = text.text + for n in text: + node.append(n) + _seen.add(node) + + elif current_child >= 0: + if node.tag == ETREE_TAG('pre') and skip_pre: + linkify_nodes(node, False) + elif not (node in _seen): + linkify_nodes(node, True) + + current_child += 1 def email_repl(match): addr = match.group(0).replace('"', '"') link = { '_text': addr, - 'href': 'mailto:%s' % addr, + 'href': 'mailto:{0!s}'.format(addr), } link = apply_callbacks(link, True) @@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _href = link.pop('href') _text = link.pop('_text') - repl = '<a href="%s" %s>%s</a>' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) - return repl % (_href, attribs, _text) + repl = '<a href="{0!s}" {1!s}>{2!s}</a>' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) + return repl.format(_href, attribs, _text) def link_repl(match): url = match.group(0) open_brackets = close_brackets = 0 if url.startswith('('): - url, open_brackets, close_brackets = ( - strip_wrapping_parentheses(url) - ) - end = u'' + _wrapping = strip_wrapping_parentheses(url) + url, open_brackets, close_brackets = _wrapping + end = '' m = re.search(punct_re, url) if m: end = m.group(0) @@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, if re.search(proto_re, url): href = url else: - href = u''.join([u'http://', url]) + href = 
''.join(['http://', url]) link = { '_text': url, @@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _text = link.pop('_text') _href = link.pop('href') - repl = u'%s<a href="%s" %s>%s</a>%s%s' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) + repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl % ('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) + return repl.format('(' * open_brackets, + _href, attribs, _text, end, + ')' * close_brackets) try: linkify_nodes(forest) - except (RECURSION_EXCEPTION), e: + except RuntimeError as e: # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: %r' % e) + log.exception('Probable recursion error: {0!r}'.format(e)) return _render(forest) def _render(tree): """Try rendering as HTML, then XML, then give up.""" - try: - return force_unicode(_serialize(tree)) - except AssertionError: # The treewalker throws this sometimes. 
- return force_unicode(tree.toxml()) + return force_unicode(_serialize(tree)) def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('simpletree') + walker = html5lib.treewalkers.getTreeWalker('etree') stream = walker(domtree) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) diff --git a/bleach/callbacks.py b/bleach/callbacks.py index cc4682d..227f089 100644 --- a/bleach/callbacks.py +++ b/bleach/callbacks.py @@ -1,10 +1,15 @@ """A set of basic callbacks for bleach.linkify.""" +from __future__ import unicode_literals def nofollow(attrs, new=False): if attrs['href'].startswith('mailto:'): return attrs - attrs['rel'] = 'nofollow' + rel = [x for x in attrs.get('rel', '').split(' ') if x] + if not 'nofollow' in [x.lower() for x in rel]: + rel.append('nofollow') + attrs['rel'] = ' '.join(rel) + return attrs diff --git a/bleach/encoding.py b/bleach/encoding.py index b9a989d..707adaa 100644 --- a/bleach/encoding.py +++ b/bleach/encoding.py @@ -1,6 +1,7 @@ import datetime from decimal import Decimal import types +import six def is_protected_type(obj): @@ -10,45 +11,52 @@ def is_protected_type(obj): force_unicode(strings_only=True). """ return isinstance(obj, ( - types.NoneType, - int, long, - datetime.datetime, datetime.date, datetime.time, - float, Decimal) + six.integer_types + + (types.NoneType, + datetime.datetime, datetime.date, datetime.time, + float, Decimal)) ) def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): """ - Similar to smart_unicode, except that lazy instances are resolved to + Similar to smart_text, except that lazy instances are resolved to strings, rather than kept as lazy objects. If strings_only is True, don't convert (some) non-string-like objects. """ + # Handle the common case first, saves 30-40% when s is an instance of + # six.text_type. This function gets called often in that setting. 
+ if isinstance(s, six.text_type): + return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: - try: - s = unicode(str(s), encoding, errors) - except UnicodeEncodeError: - if not isinstance(s, Exception): - raise - # If we get to here, the caller has passed in an Exception - # subclass populated with non-ASCII data without special - # handling to display as a string. We need to handle this - # without raising a further exception. We do an - # approximation to what the Exception's standard str() - # output should be. - s = ' '.join([force_unicode(arg, encoding, strings_only, - errors) for arg in s]) - elif not isinstance(s, unicode): - # Note: We use .decode() here, instead of unicode(s, encoding, - # errors), so that if s is a SafeString, it ends up being a - # SafeUnicode at the end. + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + # Note: We use .decode() here, instead of six.text_type(s, + # encoding, errors), so that if s is a SafeBytes, it ends up being + # a SafeText at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: - raise UnicodeDecodeError(*e.args) + except UnicodeDecodeError as e: + if not isinstance(s, Exception): + raise UnicodeDecodeError(*e.args) + else: + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII bytestring data without a + # working unicode method. Try to handle this without raising a + # further exception by individually forcing the exception args + # to unicode. 
+ s = ' '.join([force_unicode(arg, encoding, strings_only, + errors) for arg in s]) return s diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 4640012..88246f8 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import re from xml.sax.saxutils import escape, unescape @@ -14,8 +15,6 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): """Mixin to replace sanitize_token() and sanitize_css().""" allowed_svg_properties = [] - # TODO: When the next html5lib version comes out, nuke this. - attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster'] def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping. @@ -30,7 +29,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): """ if (getattr(self, 'wildcard_attributes', None) is None and - isinstance(self.allowed_attributes, dict)): + isinstance(self.allowed_attributes, dict)): self.wildcard_attributes = self.allowed_attributes.get('*', []) if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], @@ -56,7 +55,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): unescape(attrs[attr])).lower() # Remove replacement characters from unescaped # characters. 
- val_unescaped = val_unescaped.replace(u"\ufffd", "") + val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols)): @@ -67,8 +66,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): ' ', unescape(attrs[attr])) if (token['name'] in self.svg_allow_local_href and - 'xlink:href' in attrs and - re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): + 'xlink:href' in attrs and + re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): del attrs['xlink:href'] if 'style' in attrs: attrs['style'] = self.sanitize_css(attrs['style']) @@ -79,13 +78,14 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): pass else: if token['type'] == tokenTypes['EndTag']: - token['data'] = '</%s>' % token['name'] + token['data'] = '</{0!s}>'.format(token['name']) elif token['data']: - attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in + attr = ' {0!s}="{1!s}"' + attrs = ''.join([attr.format(k, escape(v)) for k, v in token['data']]) - token['data'] = '<%s%s>' % (token['name'], attrs) + token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) else: - token['data'] = '<%s>' % token['name'] + token['data'] = '<{0!s}>'.format(token['name']) if token['selfClosing']: token['data'] = token['data'][:-1] + '/>' token['type'] = tokenTypes['Characters'] @@ -112,8 +112,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): # TODO: Make sure this does what it's meant to - I *think* it wants to # validate style attribute contents. 
parts = style.split(';') - gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*""" - """|"[\s\w]+"|\([\d,%\.\s]+\))*$""") + gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'""" + """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""") for part in parts: if not gauntlet.match(part): return '' diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py index 9eca687..822407f 100644 --- a/bleach/tests/test_basics.py +++ b/bleach/tests/test_basics.py @@ -1,7 +1,9 @@ +import six import html5lib from nose.tools import eq_ import bleach +from bleach.tests.tools import in_ def test_empty(): @@ -9,7 +11,12 @@ def test_empty(): def test_nbsp(): - eq_(u'\xa0test string\xa0', bleach.clean(' test string ')) + if six.PY3: + expected = '\xa0test string\xa0' + else: + expected = six.u('\\xa0test string\\xa0') + + eq_(expected, bleach.clean(' test string ')) def test_comments_only(): @@ -18,8 +25,8 @@ def test_comments_only(): eq_('', bleach.clean(comment)) eq_('', bleach.clean(open_comment)) eq_(comment, bleach.clean(comment, strip_comments=False)) - eq_('%s-->' % open_comment, bleach.clean(open_comment, - strip_comments=False)) + eq_('{0!s}-->'.format(open_comment), bleach.clean(open_comment, + strip_comments=False)) def test_with_comments(): @@ -55,9 +62,11 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = u'<a href="http://xx.com" rel="alternate">xx.com</a>' - eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s)) - eq_(s, bleach.clean(s, attributes=ATTRS)) + s = ('<a href="http://xx.com" rel="alternate">xx.com</a>', + '<a rel="alternate" href="http://xx.com">xx.com</a>') + + eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s[0])) + in_(s, bleach.clean(s[0], attributes=ATTRS)) def test_disallowed_html(): @@ -81,19 +90,19 @@ def test_bare_entities(): def test_escaped_entities(): - s = u'<em>strong</em>' + s = '<em>strong</em>' eq_(s, bleach.clean(s)) def test_serializer(): - s = 
u'<table></table>' + s = '<table></table>' eq_(s, bleach.clean(s, tags=['table'])) - eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>')) - eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p'])) + eq_('test<table></table>', bleach.linkify('<table>test</table>')) + eq_('<p>test</p>', bleach.clean('<p>test</p>', tags=['p'])) def test_no_href_links(): - s = u'<a name="anchor">x</a>' + s = '<a name="anchor">x</a>' eq_(s, bleach.linkify(s)) @@ -112,7 +121,7 @@ def test_stripping(): bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True)) eq_('a test <em>with</em> <b>html</b> tags', bleach.clean('a test <em>with</em> <img src="http://example.com/"> ' - '<b>html</b> tags', strip=True)) + '<b>html</b> tags', strip=True)) s = '<p><a href="http://example.com/">link text</a></p>' eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True)) @@ -138,7 +147,7 @@ def test_allowed_styles(): def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" - dirty = u'<span>invalid & </span> < extra http://link.com<em>' + dirty = '<span>invalid & </span> < extra http://link.com<em>' clean = bleach.clean(dirty) eq_(clean, bleach.clean(clean)) @@ -147,10 +156,23 @@ def test_idempotent(): eq_(linked, bleach.linkify(linked)) +def test_rel_already_there(): + """Make sure rel attribute is updated not replaced""" + linked = ('Click <a href="http://example.com" rel="tooltip">' + 'here</a>.') + link_good = (('Click <a href="http://example.com" rel="tooltip nofollow">' + 'here</a>.'), + ('Click <a rel="tooltip nofollow" href="http://example.com">' + 'here</a>.')) + + in_(link_good, bleach.linkify(linked)) + in_(link_good, bleach.linkify(link_good[0])) + + def test_lowercase_html(): """We should output lowercase HTML.""" - dirty = u'<EM CLASS="FOO">BAR</EM>' - clean = u'<em class="FOO">BAR</em>' + dirty = '<EM CLASS="FOO">BAR</EM>' + clean = '<em class="FOO">BAR</em>' eq_(clean, bleach.clean(dirty, attributes=['class'])) @@ 
-160,14 +182,15 @@ def test_wildcard_attributes(): 'img': ['src'], } TAG = ['img', 'em'] - dirty = (u'both <em id="foo" style="color: black">can</em> have ' - u'<img id="bar" src="foo"/>') - clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">' - eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) + dirty = ('both <em id="foo" style="color: black">can</em> have ' + '<img id="bar" src="foo"/>') + clean = ('both <em id="foo">can</em> have <img src="foo" id="bar">', + 'both <em id="foo">can</em> have <img id="bar" src="foo">') + in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) def test_sarcasm(): """Jokes should crash.<sarcasm/>""" - dirty = u'Yeah right <sarcasm/>' - clean = u'Yeah right <sarcasm/>' + dirty = 'Yeah right <sarcasm/>' + clean = 'Yeah right <sarcasm/>' eq_(clean, bleach.clean(dirty)) diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py index 588c8ce..b40596f 100644 --- a/bleach/tests/test_css.py +++ b/bleach/tests/test_css.py @@ -29,14 +29,14 @@ def test_allowed_css(): ('font-family: "Arial";', 'font-family: "Arial";', ['font-family']), ) - p_single = '<p style="%s">bar</p>' - p_double = "<p style='%s'>bar</p>" + p_single = '<p style="{0!s}">bar</p>' + p_double = "<p style='{0!s}'>bar</p>" def check(i, o, s): if '"' in i: - eq_(p_double % o, clean(p_double % i, styles=s)) + eq_(p_double.format(o), clean(p_double.format(i), styles=s)) else: - eq_(p_single % o, clean(p_single % i, styles=s)) + eq_(p_single.format(o), clean(p_single.format(i), styles=s)) for i, o, s in tests: yield check, i, o, s @@ -70,12 +70,13 @@ def test_style_hang(): """font: normal normal normal 100%/normal 'Courier New', """ """'Andale Mono', monospace; background-position: initial """ """initial; background-repeat: initial initial;""") - html = '<p style="%s">Hello world</p>' % style + html = '<p style="{0!s}">Hello world</p>'.format(style) styles = [ 'border', 'float', 'overflow', 'min-height', 'vertical-align', 'white-space', 'margin', 
'margin-left', 'margin-top', 'margin-bottom', 'margin-right', - 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right', + 'padding', 'padding-left', 'padding-top', 'padding-bottom', + 'padding-right', 'background', 'background-color', 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py index ac593c4..abf889d 100644 --- a/bleach/tests/test_links.py +++ b/bleach/tests/test_links.py @@ -1,18 +1,20 @@ -import urllib +try: + from urllib.parse import quote_plus +except ImportError: + from urllib import quote_plus from html5lib.tokenizer import HTMLTokenizer from nose.tools import eq_ from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC - - +from bleach.tests.tools import in_ def test_url_re(): def no_match(s): match = url_re.search(s) if match: - assert not match, 'matched %s' % s[slice(*match.span())] + assert not match, 'matched {0!s}'.format(s[slice(*match.span())]) yield no_match, 'just what i am looking for...it' @@ -21,36 +23,48 @@ def test_empty(): def test_simple_link(): - eq_('a <a href="http://example.com" rel="nofollow">http://example.com' + in_(('a <a href="http://example.com" rel="nofollow">http://example.com' '</a> link', + 'a <a rel="nofollow" href="http://example.com">http://example.com' + '</a> link'), linkify('a http://example.com link')) - eq_('a <a href="https://example.com" rel="nofollow">https://example.com' + in_(('a <a href="https://example.com" rel="nofollow">https://example.com' '</a> link', + 'a <a rel="nofollow" href="https://example.com">https://example.com' + '</a> link'), linkify('a https://example.com link')) - eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link', - linkify('an example.com link')) + in_(('a <a href="http://example.com" rel="nofollow">example.com</a> link', + 'a <a rel="nofollow" href="http://example.com">example.com</a> link'), + linkify('a example.com link')) def test_trailing_slash(): - 
eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>', - linkify('http://example.com/')) - eq_('<a href="http://example.com/foo/" rel="nofollow">' - 'http://example.com/foo/</a>', - linkify('http://example.com/foo/')) - eq_('<a href="http://example.com/foo/bar/" rel="nofollow">' - 'http://example.com/foo/bar/</a>', - linkify('http://example.com/foo/bar/')) + in_(('<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>', + '<a rel="nofollow" href="http://examp.com/">http://examp.com/</a>'), + linkify('http://examp.com/')) + in_(('<a href="http://example.com/foo/" rel="nofollow">' + 'http://example.com/foo/</a>', + '<a rel="nofollow" href="http://example.com/foo/">' + 'http://example.com/foo/</a>'), + linkify('http://example.com/foo/')) + in_(('<a href="http://example.com/foo/bar/" rel="nofollow">' + 'http://example.com/foo/bar/</a>', + '<a rel="nofollow" href="http://example.com/foo/bar/">' + 'http://example.com/foo/bar/</a>'), + linkify('http://example.com/foo/bar/')) def test_mangle_link(): """We can muck with the href attribute of the link.""" def filter_url(attrs, new=False): - attrs['href'] = (u'http://bouncer/?u=%s' % - urllib.quote_plus(attrs['href'])) + quoted = quote_plus(attrs['href']) + attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs - eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">' - 'http://example.com</a>', + in_(('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">' + 'http://example.com</a>', + '<a rel="nofollow" href="http://bouncer/?u=http%3A%2F%2Fexample.com">' + 'http://example.com</a>'), linkify('http://example.com', DC + [filter_url])) @@ -76,13 +90,19 @@ def test_email_link(): 'james@example.com.au</a> mailto', True, 'aussie james@example.com.au mailto'), # This is kind of a pathological case. I guess we do our best here. 
- ('email to <a href="james@example.com" rel="nofollow">' - 'james@example.com</a>', True, - 'email to <a href="james@example.com">james@example.com</a>'), + (('email to <a href="james@example.com" rel="nofollow">' + 'james@example.com</a>', + 'email to <a rel="nofollow" href="james@example.com">' + 'james@example.com</a>'), + True, + 'email to <a href="james@example.com">james@example.com</a>'), ) def _check(o, p, i): - eq_(o, linkify(i, parse_email=p)) + if isinstance(o, (list, tuple)): + in_(o, linkify(i, parse_email=p)) + else: + eq_(o, linkify(i, parse_email=p)) for (o, p, i) in tests: yield _check, o, p, i @@ -151,7 +171,8 @@ def test_set_attrs(): attrs['rev'] = 'canonical' return attrs - eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>', + in_(('<a href="http://ex.mp" rev="canonical">ex.mp</a>', + '<a rev="canonical" href="http://ex.mp">ex.mp</a>'), linkify('ex.mp', [set_attr])) @@ -179,15 +200,19 @@ def test_stop_email(): def test_tlds(): - eq_('<a href="http://example.com" rel="nofollow">example.com</a>', + in_(('<a href="http://example.com" rel="nofollow">example.com</a>', + '<a rel="nofollow" href="http://example.com">example.com</a>'), linkify('example.com')) - eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>', + in_(('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>', + '<a rel="nofollow" href="http://example.co.uk">example.co.uk</a>'), linkify('example.co.uk')) - eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>', + in_(('<a href="http://example.edu" rel="nofollow">example.edu</a>', + '<a rel="nofollow" href="http://example.edu">example.edu</a>'), linkify('example.edu')) eq_('example.xxx', linkify('example.xxx')) eq_(' brie', linkify(' brie')) - eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>', + in_(('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>', + '<a rel="nofollow" href="http://bit.ly/fun">bit.ly/fun</a>'), linkify('bit.ly/fun')) @@ -197,61 +222,81 @@ def 
test_escaping(): def test_nofollow_off(): eq_('<a href="http://example.com">example.com</a>', - linkify(u'example.com', [])) + linkify('example.com', [])) def test_link_in_html(): - eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>', + in_(('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>', + '<i><a rel="nofollow" href="http://yy.com">http://yy.com</a></i>'), linkify('<i>http://yy.com</i>')) - eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>' - '</strong></em>', + + in_(('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com' + '</a></strong></em>', + '<em><strong><a rel="nofollow" href="http://xx.com">http://xx.com' + '</a></strong></em>'), linkify('<em><strong>http://xx.com</strong></em>')) def test_links_https(): - eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>', + in_(('<a href="https://yy.com" rel="nofollow">https://yy.com</a>', + '<a rel="nofollow" href="https://yy.com">https://yy.com</a>'), linkify('https://yy.com')) def test_add_rel_nofollow(): """Verify that rel="nofollow" is added to an existing link""" - eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>', + in_(('<a href="http://yy.com" rel="nofollow">http://yy.com</a>', + '<a rel="nofollow" href="http://yy.com">http://yy.com</a>'), linkify('<a href="http://yy.com">http://yy.com</a>')) def test_url_with_path(): - eq_('<a href="http://example.com/path/to/file" rel="nofollow">' - 'http://example.com/path/to/file</a>', + in_(('<a href="http://example.com/path/to/file" rel="nofollow">' + 'http://example.com/path/to/file</a>', + '<a rel="nofollow" href="http://example.com/path/to/file">' + 'http://example.com/path/to/file</a>'), linkify('http://example.com/path/to/file')) def test_link_ftp(): - eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">' - 'ftp://ftp.mozilla.org/some/file</a>', + in_(('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">' + 'ftp://ftp.mozilla.org/some/file</a>', + '<a 
rel="nofollow" href="ftp://ftp.mozilla.org/some/file">' + 'ftp://ftp.mozilla.org/some/file</a>'), linkify('ftp://ftp.mozilla.org/some/file')) def test_link_query(): - eq_('<a href="http://xx.com/?test=win" rel="nofollow">' + in_(('<a href="http://xx.com/?test=win" rel="nofollow">' 'http://xx.com/?test=win</a>', + '<a rel="nofollow" href="http://xx.com/?test=win">' + 'http://xx.com/?test=win</a>'), linkify('http://xx.com/?test=win')) - eq_('<a href="http://xx.com/?test=win" rel="nofollow">' + in_(('<a href="http://xx.com/?test=win" rel="nofollow">' 'xx.com/?test=win</a>', + '<a rel="nofollow" href="http://xx.com/?test=win">' + 'xx.com/?test=win</a>'), linkify('xx.com/?test=win')) - eq_('<a href="http://xx.com?test=win" rel="nofollow">' + in_(('<a href="http://xx.com?test=win" rel="nofollow">' 'xx.com?test=win</a>', + '<a rel="nofollow" href="http://xx.com?test=win">' + 'xx.com?test=win</a>'), linkify('xx.com?test=win')) def test_link_fragment(): - eq_('<a href="http://xx.com/path#frag" rel="nofollow">' - 'http://xx.com/path#frag</a>', + in_(('<a href="http://xx.com/path#frag" rel="nofollow">' + 'http://xx.com/path#frag</a>', + '<a rel="nofollow" href="http://xx.com/path#frag">' + 'http://xx.com/path#frag</a>'), linkify('http://xx.com/path#frag')) def test_link_entities(): - eq_('<a href="http://xx.com/?a=1&b=2" rel="nofollow">' + in_(('<a href="http://xx.com/?a=1&b=2" rel="nofollow">' 'http://xx.com/?a=1&b=2</a>', + '<a rel="nofollow" href="http://xx.com/?a=1&b=2">' + 'http://xx.com/?a=1&b=2</a>'), linkify('http://xx.com/?a=1&b=2')) @@ -262,9 +307,12 @@ def test_escaped_html(): def test_link_http_complete(): - eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d' + in_(('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d' '&e#f" rel="nofollow">' 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f</a>', + '<a rel="nofollow" href="https://user:pass@ftp.mozilla.org/x/' + 'y.exe?a=b&c=d&e#f">' + 
'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f</a>'), linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) @@ -282,8 +330,10 @@ def test_javascript_url(): def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.""" - eq_('All your{"<a href="http://xx.yy.com/grover.png" ' - 'rel="nofollow">xx.yy.com/grover.png</a>"}base are', + in_(('All your{"<a href="http://xx.yy.com/grover.png" ' + 'rel="nofollow">xx.yy.com/grover.png</a>"}base are', + 'All your{"<a rel="nofollow" href="http://xx.yy.com/grover.png"' + '>xx.yy.com/grover.png</a>"}base are'), linkify('All your{"xx.yy.com/grover.png"}base are')) @@ -291,17 +341,23 @@ def test_skip_pre(): """Skip linkification in <pre> tags.""" simple = 'http://xx.com <pre>http://xx.com</pre>' linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> ' + '<pre>http://xx.com</pre>', + '<a rel="nofollow" href="http://xx.com">http://xx.com</a> ' '<pre>http://xx.com</pre>') all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> ' '<pre><a href="http://xx.com" rel="nofollow">http://xx.com' + '</a></pre>', + '<a rel="nofollow" href="http://xx.com">http://xx.com</a> ' + '<pre><a rel="nofollow" href="http://xx.com">http://xx.com' '</a></pre>') - eq_(linked, linkify(simple, skip_pre=True)) - eq_(all_linked, linkify(simple)) + in_(linked, linkify(simple, skip_pre=True)) + in_(all_linked, linkify(simple)) already_linked = '<pre><a href="http://xx.com">xx</a></pre>' - nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>' - eq_(nofollowed, linkify(already_linked)) - eq_(nofollowed, linkify(already_linked, skip_pre=True)) + nofollowed = ('<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>', + '<pre><a rel="nofollow" href="http://xx.com">xx</a></pre>') + in_(nofollowed, linkify(already_linked)) + in_(nofollowed, linkify(already_linked, skip_pre=True)) def test_libgl(): @@ -311,11 +367,13 @@ def test_libgl(): def test_end_of_sentence(): 
"""example.com. should match.""" - out = u'<a href="http://%s" rel="nofollow">%s</a>%s' - in_ = u'%s%s' + outs = ('<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}', + '<a rel="nofollow" href="http://{0!s}">{0!s}</a>{1!s}') + intxt = '{0!s}{1!s}' def check(u, p): - eq_(out % (u, u, p), linkify(in_ % (u, p))) + in_([out.format(u, p) for out in outs], + linkify(intxt.format(u, p))) tests = ( ('example.com', '.'), @@ -330,49 +388,50 @@ def test_end_of_sentence(): def test_end_of_clause(): """example.com/foo, shouldn't include the ,""" - eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar', + in_(('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar', + '<a rel="nofollow" href="http://ex.com/foo">ex.com/foo</a>, bar'), linkify('ex.com/foo, bar')) def test_sarcasm(): """Jokes should crash.<sarcasm/>""" - dirty = u'Yeah right <sarcasm/>' - clean = u'Yeah right <sarcasm/>' + dirty = 'Yeah right <sarcasm/>' + clean = 'Yeah right <sarcasm/>' eq_(clean, linkify(dirty)) def test_wrapping_parentheses(): """URLs wrapped in parantheses should not include them.""" - out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s' + outs = ('{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}', + '{0!s}<a rel="nofollow" href="http://{1!s}">{2!s}</a>{3!s}') tests = ( - ('(example.com)', out % ('(', 'example.com', 'example.com', ')')), - ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', out % ('(', 'example.com/foo', - 'example.com/foo', ')')), - ('(((example.com/))))', out % ('(((', 'example.com/)', - 'example.com/)', ')))')), - ('example.com/))', out % ('', 'example.com/))', - 'example.com/))', '')), + ('(example.com)', ('(', 'example.com', 'example.com', ')')), + ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), + ('(example.com/foo)', ('(', 'example.com/foo', + 'example.com/foo', ')')), + ('(((example.com/))))', ('(((', 'example.com/)', + 'example.com/)', ')))')), + ('example.com/))', ('', 
'example.com/))', 'example.com/))', '')), ('http://en.wikipedia.org/wiki/Test_(assessment)', - out % ('', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), + ('', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), ('(http://en.wikipedia.org/wiki/Test_(assessment))', - out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), + ('(', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), ('((http://en.wikipedia.org/wiki/Test_(assessment))', - out % ('((', 'en.wikipedia.org/wiki/Test_(assessment', - 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), + ('((', 'en.wikipedia.org/wiki/Test_(assessment', + 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), ('(http://en.wikipedia.org/wiki/Test_(assessment)))', - out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))', - 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), + ('(', 'en.wikipedia.org/wiki/Test_(assessment))', + 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), ('(http://en.wikipedia.org/wiki/)Test_(assessment', - out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment', - 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), + ('(', 'en.wikipedia.org/wiki/)Test_(assessment', + 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), ) def check(test, expected_output): - eq_(expected_output, linkify(test)) + in_([o.format(*expected_output) for o in outs], linkify(test)) for test, expected_output in tests: yield check, test, expected_output @@ -389,7 +448,9 @@ def test_ports(): ) def check(test, output): - eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output), + outs = ('<a href="{0}" rel="nofollow">{0}</a>{1}', + '<a rel="nofollow" href="{0}">{0}</a>{1}') + in_([out.format(*output) for out in outs], linkify(test)) for test, output in tests: @@ -406,8 +467,9 @@ def 
test_tokenizer(): def test_ignore_bad_protocols(): eq_('foohttp://bar', linkify('foohttp://bar')) - eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>', - linkify('foohttp://exampl.com')) + in_(('fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>', + 'fohttp://<a rel="nofollow" href="http://exampl.com">exampl.com</a>'), + linkify('fohttp://exampl.com')) def test_max_recursion_depth(): @@ -420,21 +482,28 @@ def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" output = ('<a href="http://example.com" rel="nofollow">' 'http://example.com</a> <a href="mailto:person@example.com">' + 'person@example.com</a>', + '<a rel="nofollow" href="http://example.com">' + 'http://example.com</a> <a href="mailto:person@example.com">' 'person@example.com</a>') - eq_(output, linkify('http://example.com person@example.com', + in_(output, linkify('http://example.com person@example.com', parse_email=True)) def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">' + 'HTTP://EXAMPLE.COM</a>', + '<a rel="nofollow" href="HTTP://EXAMPLE.COM">' 'HTTP://EXAMPLE.COM</a>') - eq_(expect, linkify('HTTP://EXAMPLE.COM')) + in_(expect, linkify('HTTP://EXAMPLE.COM')) def test_elements_inside_links(): - eq_(u'<a href="#" rel="nofollow">hello<br></a>', + in_(('<a href="#" rel="nofollow">hello<br></a>', + '<a rel="nofollow" href="#">hello<br></a>'), linkify('<a href="#">hello<br></a>')) - eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>', + in_(('<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>', + '<a rel="nofollow" href="#"><strong>bold</strong> hello<br></a>'), linkify('<a href="#"><strong>bold</strong> hello<br></a>')) diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py index 6c2b33f..6adab59 100644 --- a/bleach/tests/test_security.py +++ b/bleach/tests/test_security.py @@ 
-25,10 +25,10 @@ def test_invalid_attr(): clean('<a onclick="evil" href="test">test</a>')) eq_('<img src="test">', clean('<img onclick="evil" src="test" />', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) eq_('<img src="test">', clean('<img href="invalid" src="test" />', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) def test_unquoted_attr(): @@ -57,7 +57,7 @@ def test_invalid_filter_attr(): eq_('<img src="http://example.com/">', clean('<img onclick="evil" src="http://example.com/" />', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />', tags=IMG, attributes=IMG_ATTR)) @@ -91,9 +91,9 @@ def test_nasty(): """Nested, broken up, multiple tags, are still foiled!""" test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</' '<script></script>script<del></del>>') - expect = (u'<scr<script></script>ipt type="text/javascript"' - u'>alert("foo");</script>script<del></del>' - u'>') + expect = ('<scr<script></script>ipt type="text/javascript"' + '>alert("foo");</script>script<del></del>' + '>') eq_(expect, clean(test)) diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py index 67123cc..796924d 100644 --- a/bleach/tests/test_unicode.py +++ b/bleach/tests/test_unicode.py @@ -1,54 +1,59 @@ # -*- coding: utf-8 -*- - +from __future__ import unicode_literals from nose.tools import eq_ from bleach import clean, linkify +from bleach.tests.tools import in_ def test_japanese_safe_simple(): - eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル')) - eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル')) + eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル')) + eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル')) def test_japanese_strip(): - eq_(u'<em>ヘルプとチュートリアル</em>', - clean(u'<em>ヘルプとチュートリアル</em>')) - eq_(u'<span>ヘルプとチュートリアル</span>', - clean(u'<span>ヘルプとチュートリアル</span>')) + eq_('<em>ヘルプとチュートリアル</em>', + clean('<em>ヘルプとチュートリアル</em>')) + eq_('<span>ヘルプとチュートリアル</span>', + 
clean('<span>ヘルプとチュートリアル</span>')) def test_russian_simple(): - eq_(u'Домашняя', clean(u'Домашняя')) - eq_(u'Домашняя', linkify(u'Домашняя')) + eq_('Домашняя', clean('Домашняя')) + eq_('Домашняя', linkify('Домашняя')) def test_mixed(): - eq_(u'Домашняяヘルプとチュートリアル', - clean(u'Домашняяヘルプとチュートリアル')) + eq_('Домашняяヘルプとチュートリアル', + clean('Домашняяヘルプとチュートリアル')) def test_mixed_linkify(): - eq_(u'Домашняя <a href="http://example.com" rel="nofollow">' - u'http://example.com</a> ヘルプとチュートリアル', - linkify(u'Домашняя http://example.com ヘルプとチュートリアル')) + in_(('Домашняя <a href="http://example.com" rel="nofollow">' + 'http://example.com</a> ヘルプとチュートリアル', + 'Домашняя <a rel="nofollow" href="http://example.com">' + 'http://example.com</a> ヘルプとチュートリアル'), + linkify('Домашняя http://example.com ヘルプとチュートリアル')) def test_url_utf8(): """Allow UTF8 characters in URLs themselves.""" - out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>' + outs = ('<a href="{0!s}" rel="nofollow">{0!s}</a>', + '<a rel="nofollow" href="{0!s}">{0!s}</a>') + + out = lambda url: [x.format(url) for x in outs] tests = ( - ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}), - ('http://éxámplé.com/íàñá/', - out % {'url': u'http://éxámplé.com/íàñá/'}), + ('http://éxámplé.com/', out('http://éxámplé.com/')), + ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')), ('http://éxámplé.com/íàñá/?foo=bar', - out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}), + out('http://éxámplé.com/íàñá/?foo=bar')), ('http://éxámplé.com/íàñá/?fóo=bár', - out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}), + out('http://éxámplé.com/íàñá/?fóo=bár')), ) def check(test, expected_output): - eq_(expected_output, linkify(test)) + in_(expected_output, linkify(test)) for test, expected_output in tests: yield check, test, expected_output diff --git a/bleach/tests/tools.py b/bleach/tests/tools.py new file mode 100644 index 0000000..87f926c --- /dev/null +++ b/bleach/tests/tools.py @@ -0,0 +1,7 @@ + + +def in_(l, a, msg=None): + 
"""Shorthand for 'assert a in l, "%r not in %r" % (a, l) + """ + if not a in l: + raise AssertionError(msg or "%r not in %r" % (a, l)) diff --git a/docs/clean.rst b/docs/clean.rst index a31dc89..2fb888b 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -85,7 +85,7 @@ allowed but no values will be. For example, to allow users to set the color and font-weight of text:: attrs = { - '*': 'style' + '*': ['style'] } tags = ['p', 'em', 'strong'] styles = ['color', 'font-weight'] diff --git a/docs/conf.py b/docs/conf.py index a63aedf..96b2fc8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,9 +48,9 @@ copyright = u'2012, James Socol' # built documents. # # The short X.Y version. -version = '1.2' +version = '1.3' # The full version, including alpha/beta/rc tags. -release = '1.2.0' +release = '1.3.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 0929e53..0439786 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,14 +16,9 @@ regular-expression-based sanitizers. Bleach's ``linkify`` function is highly configurable and can be used to find, edit, and filter links most other auto-linkers can't. -The version of bleach on GitHub_ is the always the most up-to-date and the +The version of bleach on GitHub_ is always the most up-to-date and the ``master`` branch should always work. -.. warn:: - - Bleach is currently incompatible with html5lib 1.0b and any versions below - 0.9.5. - Installing Bleach ================= @@ -56,7 +51,6 @@ Contents: goals - Indices and tables ================== @@ -64,6 +58,6 @@ Indices and tables * :ref:`modindex` * :ref:`search` -.. _html5lib: http://code.google.com/p/html5lib/ +.. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/jsocol/bleach -.. _PyPI: http://pypi.python.org/pypi/bleach +.. 
_PyPI: https://pypi.python.org/pypi/bleach diff --git a/requirements.txt b/requirements.txt index 1500a14..d6e9357 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -# These are the requirements to run the test suite. -nose==1.3.0 -html5lib==0.9.5 +six +html5lib>=0.999 +# Requirements to run the test suite: +nose +flake8 @@ -2,26 +2,35 @@ from setuptools import setup, find_packages setup( name='bleach', - version='1.2.2', + version='1.4', description='An easy whitelist-based HTML-sanitizing tool.', long_description=open('README.rst').read(), author='James Socol', - author_email='james@mozilla.com', + author_email='me@jamessocol.com', url='http://github.com/jsocol/bleach', - license='BSD', + license='Apache Software License', packages=find_packages(), include_package_data=True, package_data={'': ['README.rst']}, zip_safe=False, - install_requires=['html5lib==0.95'], + install_requires=[ + 'six', + 'html5lib>=0.999', + ], classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', 'Environment :: Web Environment :: Mozilla', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) @@ -0,0 +1,12 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. 
+ +[tox] +envlist = py26, py27, py32, py33, pypy + +[testenv] +commands = nosetests {posargs:-v} +deps = + nose |