From 6cff86ce6de27fbd4f9fc07716fb1205b14ffae4 Mon Sep 17 00:00:00 2001 From: Per Andersson Date: Thu, 30 Jan 2014 01:25:11 +0100 Subject: Imported Upstream version 1.4 --- CHANGES | 20 ++++ CONTRIBUTORS | 5 +- LICENSE | 35 ++---- README.rst | 2 +- bleach/__init__.py | 227 ++++++++++++++++++++++++--------------- bleach/callbacks.py | 7 +- bleach/encoding.py | 60 ++++++----- bleach/sanitizer.py | 24 ++--- bleach/tests/test_basics.py | 65 ++++++++---- bleach/tests/test_css.py | 13 +-- bleach/tests/test_links.py | 241 +++++++++++++++++++++++++++--------------- bleach/tests/test_security.py | 12 +-- bleach/tests/test_unicode.py | 47 ++++---- bleach/tests/tools.py | 7 ++ docs/clean.rst | 2 +- docs/conf.py | 4 +- docs/index.rst | 12 +-- requirements.txt | 8 +- setup.py | 21 ++-- tox.ini | 12 +++ 20 files changed, 514 insertions(+), 310 deletions(-) create mode 100644 bleach/tests/tools.py create mode 100644 tox.ini diff --git a/CHANGES b/CHANGES index d9bad9c..1def1a2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,26 @@ Bleach Changes ============== +Version 1.4 +----------- + +- Update linkify to use etree type Treeewalker instead of simpletree. +- Updated html5lib to version >= 0.999. +- Update all code to be compatible with Python 3 and 2 using six. +- Switch to Apache License. + + +Version 1.3 +----------- + +- Used by Python 3-only fork. + + +Version 1.2.2 +------------- + +- Pin html5lib to version 0.95 for now due to major API break. + Version 1.2.1 ------------- diff --git a/CONTRIBUTORS b/CONTRIBUTORS index f014916..c2d052a 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -3,7 +3,7 @@ within and without the Mozilla Corporation and Foundation. Lead Developer: -- James Socol +- James Socol Contributors: @@ -23,3 +23,6 @@ Patches: - Anton Kovalyov - Mark Paschal - Alex Ehlke +- Marc DM +- mdxs +- Marc Abramowitz diff --git a/LICENSE b/LICENSE index b2df30c..f7afaef 100644 --- a/LICENSE +++ b/LICENSE @@ -1,28 +1,13 @@ -Copyright (c) 2010, Mozilla Foundation -All rights reserved. +Copyright (c) 2014, Mozilla Foundation -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of bleach nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.rst b/README.rst index 093edc1..5e52cae 100644 --- a/README.rst +++ b/README.rst @@ -60,7 +60,7 @@ Then install it by running:: $ python setup.py install -.. _html5lib: http://code.google.com/p/html5lib/ +.. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/jsocol/bleach .. _ReadTheDocs: http://bleach.readthedocs.org/ .. _PyPI: http://pypi.python.org/pypi/bleach diff --git a/bleach/__init__.py b/bleach/__init__.py index af75d0f..b110972 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,6 +1,8 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals import logging import re -import sys import html5lib from html5lib.sanitizer import HTMLSanitizer @@ -11,8 +13,8 @@ from .encoding import force_unicode from .sanitizer import BleachSanitizer -VERSION = (1, 2, 1) -__version__ = '1.2.1' +VERSION = (1, 4, 0) +__version__ = '1.4' __all__ = ['clean', 'linkify'] @@ -61,12 +63,12 @@ TLDS.reverse() url_re = re.compile( r"""\(* # Match any opening parentheses. - \b(?"]*)? + \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)), + """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$') email_re = re.compile( r"""(? tag replaced by the text within it + adj = replace_nodes(tree, _text, node, + current_child) + current_child -= 1 + # pull back current_child by 1 to scan the + # new nodes again. + else: text = force_unicode(attrs.pop('_text')) - node.attributes = attrs - for n in reversed(node.childNodes): - node.removeChild(n) + for attr_key, attr_val in attrs.items(): + node.set(attr_key, attr_val) + + for n in reversed(list(node)): + node.remove(n) text = parser.parseFragment(text) - for n in text.childNodes: - node.appendChild(n) - node._seen = True - else: - replace_nodes(tree, _text, node) - elif skip_pre and node.name == 'pre': - linkify_nodes(node, False) - elif not getattr(node, '_seen', False): - linkify_nodes(node) - current += 1 + node.text = text.text + for n in text: + node.append(n) + _seen.add(node) + + elif current_child >= 0: + if node.tag == ETREE_TAG('pre') and skip_pre: + linkify_nodes(node, False) + elif not (node in _seen): + linkify_nodes(node, True) + + current_child += 1 def email_repl(match): addr = match.group(0).replace('"', '"') link = { '_text': addr, - 'href': 'mailto:%s' % addr, + 'href': 'mailto:{0!s}'.format(addr), } link = apply_callbacks(link, True) @@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _href = link.pop('href') _text = link.pop('_text') - repl = '%s' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) - return repl % (_href, attribs, _text) + repl = '{2!s}' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) + return repl.format(_href, attribs, _text) def link_repl(match): url = match.group(0) open_brackets = close_brackets = 0 if url.startswith('('): - url, open_brackets, close_brackets = ( - strip_wrapping_parentheses(url) - ) - end = u'' + _wrapping = strip_wrapping_parentheses(url) + url, open_brackets, close_brackets = _wrapping + end = '' m = re.search(punct_re, url) if m: end = m.group(0) @@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, if re.search(proto_re, url): href = url else: - href = u''.join([u'http://', url]) + href = ''.join(['http://', url]) link = { '_text': url, @@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _text = link.pop('_text') _href = link.pop('href') - repl = u'%s%s%s%s' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) + repl = '{0!s}{3!s}{4!s}{5!s}' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl % ('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) + return repl.format('(' * open_brackets, + _href, attribs, _text, end, + ')' * close_brackets) try: linkify_nodes(forest) - except (RECURSION_EXCEPTION), e: + except RuntimeError as e: # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: %r' % e) + log.exception('Probable recursion error: {0!r}'.format(e)) return _render(forest) def _render(tree): """Try rendering as HTML, then XML, then give up.""" - try: - return force_unicode(_serialize(tree)) - except AssertionError: # The treewalker throws this sometimes. - return force_unicode(tree.toxml()) + return force_unicode(_serialize(tree)) def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('simpletree') + walker = html5lib.treewalkers.getTreeWalker('etree') stream = walker(domtree) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) diff --git a/bleach/callbacks.py b/bleach/callbacks.py index cc4682d..227f089 100644 --- a/bleach/callbacks.py +++ b/bleach/callbacks.py @@ -1,10 +1,15 @@ """A set of basic callbacks for bleach.linkify.""" +from __future__ import unicode_literals def nofollow(attrs, new=False): if attrs['href'].startswith('mailto:'): return attrs - attrs['rel'] = 'nofollow' + rel = [x for x in attrs.get('rel', '').split(' ') if x] + if not 'nofollow' in [x.lower() for x in rel]: + rel.append('nofollow') + attrs['rel'] = ' '.join(rel) + return attrs diff --git a/bleach/encoding.py b/bleach/encoding.py index b9a989d..707adaa 100644 --- a/bleach/encoding.py +++ b/bleach/encoding.py @@ -1,6 +1,7 @@ import datetime from decimal import Decimal import types +import six def is_protected_type(obj): @@ -10,45 +11,52 @@ def is_protected_type(obj): force_unicode(strings_only=True). """ return isinstance(obj, ( - types.NoneType, - int, long, - datetime.datetime, datetime.date, datetime.time, - float, Decimal) + six.integer_types + + (types.NoneType, + datetime.datetime, datetime.date, datetime.time, + float, Decimal)) ) def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): """ - Similar to smart_unicode, except that lazy instances are resolved to + Similar to smart_text, except that lazy instances are resolved to strings, rather than kept as lazy objects. If strings_only is True, don't convert (some) non-string-like objects. """ + # Handle the common case first, saves 30-40% when s is an instance of + # six.text_type. This function gets called often in that setting. + if isinstance(s, six.text_type): + return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: - try: - s = unicode(str(s), encoding, errors) - except UnicodeEncodeError: - if not isinstance(s, Exception): - raise - # If we get to here, the caller has passed in an Exception - # subclass populated with non-ASCII data without special - # handling to display as a string. We need to handle this - # without raising a further exception. We do an - # approximation to what the Exception's standard str() - # output should be. - s = ' '.join([force_unicode(arg, encoding, strings_only, - errors) for arg in s]) - elif not isinstance(s, unicode): - # Note: We use .decode() here, instead of unicode(s, encoding, - # errors), so that if s is a SafeString, it ends up being a - # SafeUnicode at the end. + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + # Note: We use .decode() here, instead of six.text_type(s, + # encoding, errors), so that if s is a SafeBytes, it ends up being + # a SafeText at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: - raise UnicodeDecodeError(*e.args) + except UnicodeDecodeError as e: + if not isinstance(s, Exception): + raise UnicodeDecodeError(*e.args) + else: + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII bytestring data without a + # working unicode method. Try to handle this without raising a + # further exception by individually forcing the exception args + # to unicode. + s = ' '.join([force_unicode(arg, encoding, strings_only, + errors) for arg in s]) return s diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 4640012..88246f8 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import re from xml.sax.saxutils import escape, unescape @@ -14,8 +15,6 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): """Mixin to replace sanitize_token() and sanitize_css().""" allowed_svg_properties = [] - # TODO: When the next html5lib version comes out, nuke this. - attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster'] def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping. @@ -30,7 +29,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): """ if (getattr(self, 'wildcard_attributes', None) is None and - isinstance(self.allowed_attributes, dict)): + isinstance(self.allowed_attributes, dict)): self.wildcard_attributes = self.allowed_attributes.get('*', []) if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], @@ -56,7 +55,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): unescape(attrs[attr])).lower() # Remove replacement characters from unescaped # characters. - val_unescaped = val_unescaped.replace(u"\ufffd", "") + val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols)): @@ -67,8 +66,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): ' ', unescape(attrs[attr])) if (token['name'] in self.svg_allow_local_href and - 'xlink:href' in attrs and - re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): + 'xlink:href' in attrs and + re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): del attrs['xlink:href'] if 'style' in attrs: attrs['style'] = self.sanitize_css(attrs['style']) @@ -79,13 +78,14 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): pass else: if token['type'] == tokenTypes['EndTag']: - token['data'] = '' % token['name'] + token['data'] = ''.format(token['name']) elif token['data']: - attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in + attr = ' {0!s}="{1!s}"' + attrs = ''.join([attr.format(k, escape(v)) for k, v in token['data']]) - token['data'] = '<%s%s>' % (token['name'], attrs) + token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) else: - token['data'] = '<%s>' % token['name'] + token['data'] = '<{0!s}>'.format(token['name']) if token['selfClosing']: token['data'] = token['data'][:-1] + '/>' token['type'] = tokenTypes['Characters'] @@ -112,8 +112,8 @@ class BleachSanitizerMixin(HTMLSanitizerMixin): # TODO: Make sure this does what it's meant to - I *think* it wants to # validate style attribute contents. parts = style.split(';') - gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*""" - """|"[\s\w]+"|\([\d,%\.\s]+\))*$""") + gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'""" + """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""") for part in parts: if not gauntlet.match(part): return '' diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py index 9eca687..822407f 100644 --- a/bleach/tests/test_basics.py +++ b/bleach/tests/test_basics.py @@ -1,7 +1,9 @@ +import six import html5lib from nose.tools import eq_ import bleach +from bleach.tests.tools import in_ def test_empty(): @@ -9,7 +11,12 @@ def test_empty(): def test_nbsp(): - eq_(u'\xa0test string\xa0', bleach.clean(' test string ')) + if six.PY3: + expected = '\xa0test string\xa0' + else: + expected = six.u('\\xa0test string\\xa0') + + eq_(expected, bleach.clean(' test string ')) def test_comments_only(): @@ -18,8 +25,8 @@ def test_comments_only(): eq_('', bleach.clean(comment)) eq_('', bleach.clean(open_comment)) eq_(comment, bleach.clean(comment, strip_comments=False)) - eq_('%s-->' % open_comment, bleach.clean(open_comment, - strip_comments=False)) + eq_('{0!s}-->'.format(open_comment), bleach.clean(open_comment, + strip_comments=False)) def test_with_comments(): @@ -55,9 +62,11 @@ def test_function_arguments(): def test_named_arguments(): ATTRS = {'a': ['rel', 'href']} - s = u'xx.com' - eq_('xx.com', bleach.clean(s)) - eq_(s, bleach.clean(s, attributes=ATTRS)) + s = ('xx.com', + 'xx.com') + + eq_('xx.com', bleach.clean(s[0])) + in_(s, bleach.clean(s[0], attributes=ATTRS)) def test_disallowed_html(): @@ -81,19 +90,19 @@ def test_bare_entities(): def test_escaped_entities(): - s = u'<em>strong</em>' + s = '<em>strong</em>' eq_(s, bleach.clean(s)) def test_serializer(): - s = u'
' + s = '
' eq_(s, bleach.clean(s, tags=['table'])) - eq_(u'test
', bleach.linkify(u'test
')) - eq_(u'

test

', bleach.clean(u'

test

', tags=['p'])) + eq_('test
', bleach.linkify('test
')) + eq_('

test

', bleach.clean('

test

', tags=['p'])) def test_no_href_links(): - s = u'x' + s = 'x' eq_(s, bleach.linkify(s)) @@ -112,7 +121,7 @@ def test_stripping(): bleach.clean('a test with html tags', strip=True)) eq_('a test with html tags', bleach.clean('a test with ' - 'html tags', strip=True)) + 'html tags', strip=True)) s = '

link text

' eq_('

link text

', bleach.clean(s, tags=['p'], strip=True)) @@ -138,7 +147,7 @@ def test_allowed_styles(): def test_idempotent(): """Make sure that applying the filter twice doesn't change anything.""" - dirty = u'invalid & < extra http://link.com' + dirty = 'invalid & < extra http://link.com' clean = bleach.clean(dirty) eq_(clean, bleach.clean(clean)) @@ -147,10 +156,23 @@ def test_idempotent(): eq_(linked, bleach.linkify(linked)) +def test_rel_already_there(): + """Make sure rel attribute is updated not replaced""" + linked = ('Click ' + 'here.') + link_good = (('Click ' + 'here.'), + ('Click ' + 'here.')) + + in_(link_good, bleach.linkify(linked)) + in_(link_good, bleach.linkify(link_good[0])) + + def test_lowercase_html(): """We should output lowercase HTML.""" - dirty = u'BAR' - clean = u'BAR' + dirty = 'BAR' + clean = 'BAR' eq_(clean, bleach.clean(dirty, attributes=['class'])) @@ -160,14 +182,15 @@ def test_wildcard_attributes(): 'img': ['src'], } TAG = ['img', 'em'] - dirty = (u'both can have ' - u'') - clean = u'both can have ' - eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) + dirty = ('both can have ' + '') + clean = ('both can have ', + 'both can have ') + in_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR)) def test_sarcasm(): """Jokes should crash.""" - dirty = u'Yeah right ' - clean = u'Yeah right <sarcasm/>' + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' eq_(clean, bleach.clean(dirty)) diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py index 588c8ce..b40596f 100644 --- a/bleach/tests/test_css.py +++ b/bleach/tests/test_css.py @@ -29,14 +29,14 @@ def test_allowed_css(): ('font-family: "Arial";', 'font-family: "Arial";', ['font-family']), ) - p_single = '

bar

' - p_double = "

bar

" + p_single = '

bar

' + p_double = "

bar

" def check(i, o, s): if '"' in i: - eq_(p_double % o, clean(p_double % i, styles=s)) + eq_(p_double.format(o), clean(p_double.format(i), styles=s)) else: - eq_(p_single % o, clean(p_single % i, styles=s)) + eq_(p_single.format(o), clean(p_single.format(i), styles=s)) for i, o, s in tests: yield check, i, o, s @@ -70,12 +70,13 @@ def test_style_hang(): """font: normal normal normal 100%/normal 'Courier New', """ """'Andale Mono', monospace; background-position: initial """ """initial; background-repeat: initial initial;""") - html = '

Hello world

' % style + html = '

Hello world

'.format(style) styles = [ 'border', 'float', 'overflow', 'min-height', 'vertical-align', 'white-space', 'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right', - 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right', + 'padding', 'padding-left', 'padding-top', 'padding-bottom', + 'padding-right', 'background', 'background-color', 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py index ac593c4..abf889d 100644 --- a/bleach/tests/test_links.py +++ b/bleach/tests/test_links.py @@ -1,18 +1,20 @@ -import urllib +try: + from urllib.parse import quote_plus +except ImportError: + from urllib import quote_plus from html5lib.tokenizer import HTMLTokenizer from nose.tools import eq_ from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC - - +from bleach.tests.tools import in_ def test_url_re(): def no_match(s): match = url_re.search(s) if match: - assert not match, 'matched %s' % s[slice(*match.span())] + assert not match, 'matched {0!s}'.format(s[slice(*match.span())]) yield no_match, 'just what i am looking for...it' @@ -21,36 +23,48 @@ def test_empty(): def test_simple_link(): - eq_('a http://example.com' + in_(('a http://example.com' ' link', + 'a http://example.com' + ' link'), linkify('a http://example.com link')) - eq_('a https://example.com' + in_(('a https://example.com' ' link', + 'a https://example.com' + ' link'), linkify('a https://example.com link')) - eq_('an example.com link', - linkify('an example.com link')) + in_(('a example.com link', + 'a example.com link'), + linkify('a example.com link')) def test_trailing_slash(): - eq_('http://example.com/', - linkify('http://example.com/')) - eq_('' - 'http://example.com/foo/', - linkify('http://example.com/foo/')) - eq_('' - 'http://example.com/foo/bar/', - linkify('http://example.com/foo/bar/')) + in_(('http://examp.com/', + 'http://examp.com/'), + linkify('http://examp.com/')) + in_(('' + 'http://example.com/foo/', + '' + 'http://example.com/foo/'), + linkify('http://example.com/foo/')) + in_(('' + 'http://example.com/foo/bar/', + '' + 'http://example.com/foo/bar/'), + linkify('http://example.com/foo/bar/')) def test_mangle_link(): """We can muck with the href attribute of the link.""" def filter_url(attrs, new=False): - attrs['href'] = (u'http://bouncer/?u=%s' % - urllib.quote_plus(attrs['href'])) + quoted = quote_plus(attrs['href']) + attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs - eq_('' - 'http://example.com', + in_(('' + 'http://example.com', + '' + 'http://example.com'), linkify('http://example.com', DC + [filter_url])) @@ -76,13 +90,19 @@ def test_email_link(): 'james@example.com.au mailto', True, 'aussie james@example.com.au mailto'), # This is kind of a pathological case. I guess we do our best here. - ('email to ' - 'james@example.com', True, - 'email to james@example.com'), + (('email to ' + 'james@example.com', + 'email to ' + 'james@example.com'), + True, + 'email to james@example.com'), ) def _check(o, p, i): - eq_(o, linkify(i, parse_email=p)) + if isinstance(o, (list, tuple)): + in_(o, linkify(i, parse_email=p)) + else: + eq_(o, linkify(i, parse_email=p)) for (o, p, i) in tests: yield _check, o, p, i @@ -151,7 +171,8 @@ def test_set_attrs(): attrs['rev'] = 'canonical' return attrs - eq_('ex.mp', + in_(('ex.mp', + 'ex.mp'), linkify('ex.mp', [set_attr])) @@ -179,15 +200,19 @@ def test_stop_email(): def test_tlds(): - eq_('example.com', + in_(('example.com', + 'example.com'), linkify('example.com')) - eq_('example.co.uk', + in_(('example.co.uk', + 'example.co.uk'), linkify('example.co.uk')) - eq_('example.edu', + in_(('example.edu', + 'example.edu'), linkify('example.edu')) eq_('example.xxx', linkify('example.xxx')) eq_(' brie', linkify(' brie')) - eq_('bit.ly/fun', + in_(('bit.ly/fun', + 'bit.ly/fun'), linkify('bit.ly/fun')) @@ -197,61 +222,81 @@ def test_escaping(): def test_nofollow_off(): eq_('example.com', - linkify(u'example.com', [])) + linkify('example.com', [])) def test_link_in_html(): - eq_('http://yy.com', + in_(('http://yy.com', + 'http://yy.com'), linkify('http://yy.com')) - eq_('http://xx.com' - '', + + in_(('http://xx.com' + '', + 'http://xx.com' + ''), linkify('http://xx.com')) def test_links_https(): - eq_('https://yy.com', + in_(('https://yy.com', + 'https://yy.com'), linkify('https://yy.com')) def test_add_rel_nofollow(): """Verify that rel="nofollow" is added to an existing link""" - eq_('http://yy.com', + in_(('http://yy.com', + 'http://yy.com'), linkify('http://yy.com')) def test_url_with_path(): - eq_('' - 'http://example.com/path/to/file', + in_(('' + 'http://example.com/path/to/file', + '' + 'http://example.com/path/to/file'), linkify('http://example.com/path/to/file')) def test_link_ftp(): - eq_('' - 'ftp://ftp.mozilla.org/some/file', + in_(('' + 'ftp://ftp.mozilla.org/some/file', + '' + 'ftp://ftp.mozilla.org/some/file'), linkify('ftp://ftp.mozilla.org/some/file')) def test_link_query(): - eq_('' + in_(('' 'http://xx.com/?test=win', + '' + 'http://xx.com/?test=win'), linkify('http://xx.com/?test=win')) - eq_('' + in_(('' 'xx.com/?test=win', + '' + 'xx.com/?test=win'), linkify('xx.com/?test=win')) - eq_('' + in_(('' 'xx.com?test=win', + '' + 'xx.com?test=win'), linkify('xx.com?test=win')) def test_link_fragment(): - eq_('' - 'http://xx.com/path#frag', + in_(('' + 'http://xx.com/path#frag', + '' + 'http://xx.com/path#frag'), linkify('http://xx.com/path#frag')) def test_link_entities(): - eq_('' + in_(('' 'http://xx.com/?a=1&b=2', + '' + 'http://xx.com/?a=1&b=2'), linkify('http://xx.com/?a=1&b=2')) @@ -262,9 +307,12 @@ def test_escaped_html(): def test_link_http_complete(): - eq_('' 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f', + '' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'), linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) @@ -282,8 +330,10 @@ def test_javascript_url(): def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.""" - eq_('All your{"xx.yy.com/grover.png"}base are', + in_(('All your{"xx.yy.com/grover.png"}base are', + 'All your{"xx.yy.com/grover.png"}base are'), linkify('All your{"xx.yy.com/grover.png"}base are')) @@ -291,17 +341,23 @@ def test_skip_pre(): """Skip linkification in
 tags."""
     simple = 'http://xx.com 
http://xx.com
' linked = ('http://xx.com ' + '
http://xx.com
', + 'http://xx.com ' '
http://xx.com
') all_linked = ('http://xx.com ' '
http://xx.com'
+                  '
', + 'http://xx.com ' + '
http://xx.com'
                   '
') - eq_(linked, linkify(simple, skip_pre=True)) - eq_(all_linked, linkify(simple)) + in_(linked, linkify(simple, skip_pre=True)) + in_(all_linked, linkify(simple)) already_linked = '
xx
' - nofollowed = '
xx
' - eq_(nofollowed, linkify(already_linked)) - eq_(nofollowed, linkify(already_linked, skip_pre=True)) + nofollowed = ('
xx
', + '
xx
') + in_(nofollowed, linkify(already_linked)) + in_(nofollowed, linkify(already_linked, skip_pre=True)) def test_libgl(): @@ -311,11 +367,13 @@ def test_libgl(): def test_end_of_sentence(): """example.com. should match.""" - out = u'%s%s' - in_ = u'%s%s' + outs = ('{0!s}{1!s}', + '{0!s}{1!s}') + intxt = '{0!s}{1!s}' def check(u, p): - eq_(out % (u, u, p), linkify(in_ % (u, p))) + in_([out.format(u, p) for out in outs], + linkify(intxt.format(u, p))) tests = ( ('example.com', '.'), @@ -330,49 +388,50 @@ def test_end_of_sentence(): def test_end_of_clause(): """example.com/foo, shouldn't include the ,""" - eq_('ex.com/foo, bar', + in_(('ex.com/foo, bar', + 'ex.com/foo, bar'), linkify('ex.com/foo, bar')) def test_sarcasm(): """Jokes should crash.""" - dirty = u'Yeah right ' - clean = u'Yeah right <sarcasm/>' + dirty = 'Yeah right ' + clean = 'Yeah right <sarcasm/>' eq_(clean, linkify(dirty)) def test_wrapping_parentheses(): """URLs wrapped in parantheses should not include them.""" - out = u'%s%s%s' + outs = ('{0!s}{2!s}{3!s}', + '{0!s}{2!s}{3!s}') tests = ( - ('(example.com)', out % ('(', 'example.com', 'example.com', ')')), - ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')), - ('(example.com/foo)', out % ('(', 'example.com/foo', - 'example.com/foo', ')')), - ('(((example.com/))))', out % ('(((', 'example.com/)', - 'example.com/)', ')))')), - ('example.com/))', out % ('', 'example.com/))', - 'example.com/))', '')), + ('(example.com)', ('(', 'example.com', 'example.com', ')')), + ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')), + ('(example.com/foo)', ('(', 'example.com/foo', + 'example.com/foo', ')')), + ('(((example.com/))))', ('(((', 'example.com/)', + 'example.com/)', ')))')), + ('example.com/))', ('', 'example.com/))', 'example.com/))', '')), ('http://en.wikipedia.org/wiki/Test_(assessment)', - out % ('', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), + ('', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), ('(http://en.wikipedia.org/wiki/Test_(assessment))', - out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)', - 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), + ('(', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), ('((http://en.wikipedia.org/wiki/Test_(assessment))', - out % ('((', 'en.wikipedia.org/wiki/Test_(assessment', - 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), + ('((', 'en.wikipedia.org/wiki/Test_(assessment', + 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), ('(http://en.wikipedia.org/wiki/Test_(assessment)))', - out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))', - 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), + ('(', 'en.wikipedia.org/wiki/Test_(assessment))', + 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), ('(http://en.wikipedia.org/wiki/)Test_(assessment', - out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment', - 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), + ('(', 'en.wikipedia.org/wiki/)Test_(assessment', + 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), ) def check(test, expected_output): - eq_(expected_output, linkify(test)) + in_([o.format(*expected_output) for o in outs], linkify(test)) for test, expected_output in tests: yield check, test, expected_output @@ -389,7 +448,9 @@ def test_ports(): ) def check(test, output): - eq_(u'{0}{1}'.format(*output), + outs = ('{0}{1}', + '{0}{1}') + in_([out.format(*output) for out in outs], linkify(test)) for test, output in tests: @@ -406,8 +467,9 @@ def test_tokenizer(): def test_ignore_bad_protocols(): eq_('foohttp://bar', linkify('foohttp://bar')) - eq_('foohttp://exampl.com', - linkify('foohttp://exampl.com')) + in_(('fohttp://exampl.com', + 'fohttp://exampl.com'), + linkify('fohttp://exampl.com')) def test_max_recursion_depth(): @@ -419,22 +481,29 @@ def test_max_recursion_depth(): def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" output = ('' + 'http://example.com ' + 'person@example.com', + '' 'http://example.com ' 'person@example.com') - eq_(output, linkify('http://example.com person@example.com', + in_(output, linkify('http://example.com person@example.com', parse_email=True)) def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" expect = ('' + 'HTTP://EXAMPLE.COM', + '' 'HTTP://EXAMPLE.COM') - eq_(expect, linkify('HTTP://EXAMPLE.COM')) + in_(expect, linkify('HTTP://EXAMPLE.COM')) def test_elements_inside_links(): - eq_(u'hello
', + in_(('hello
', + 'hello
'), linkify('hello
')) - eq_(u'bold hello
', + in_(('bold hello
', + 'bold hello
'), linkify('bold hello
')) diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py index 6c2b33f..6adab59 100644 --- a/bleach/tests/test_security.py +++ b/bleach/tests/test_security.py @@ -25,10 +25,10 @@ def test_invalid_attr(): clean('test')) eq_('', clean('', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) eq_('', clean('', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) def test_unquoted_attr(): @@ -57,7 +57,7 @@ def test_invalid_filter_attr(): eq_('', clean('', - tags=IMG, attributes=IMG_ATTR)) + tags=IMG, attributes=IMG_ATTR)) eq_('', clean('', tags=IMG, attributes=IMG_ATTR)) @@ -91,9 +91,9 @@ def test_nasty(): """Nested, broken up, multiple tags, are still foiled!""" test = ('ipt type="text/javascript">alert("foo");script>') - expect = (u'<scr<script></script>ipt type="text/javascript"' - u'>alert("foo");</script>script<del></del>' - u'>') + expect = ('<scr<script></script>ipt type="text/javascript"' + '>alert("foo");</script>script<del></del>' + '>') eq_(expect, clean(test)) diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py index 67123cc..796924d 100644 --- a/bleach/tests/test_unicode.py +++ b/bleach/tests/test_unicode.py @@ -1,54 +1,59 @@ # -*- coding: utf-8 -*- - +from __future__ import unicode_literals from nose.tools import eq_ from bleach import clean, linkify +from bleach.tests.tools import in_ def test_japanese_safe_simple(): - eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル')) - eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル')) + eq_('ヘルプとチュートリアル', clean('ヘルプとチュートリアル')) + eq_('ヘルプとチュートリアル', linkify('ヘルプとチュートリアル')) def test_japanese_strip(): - eq_(u'ヘルプとチュートリアル', - clean(u'ヘルプとチュートリアル')) - eq_(u'<span>ヘルプとチュートリアル</span>', - clean(u'ヘルプとチュートリアル')) + eq_('ヘルプとチュートリアル', + clean('ヘルプとチュートリアル')) + eq_('<span>ヘルプとチュートリアル</span>', + clean('ヘルプとチュートリアル')) def test_russian_simple(): - eq_(u'Домашняя', clean(u'Домашняя')) - eq_(u'Домашняя', linkify(u'Домашняя')) + eq_('Домашняя', clean('Домашняя')) + eq_('Домашняя', linkify('Домашняя')) def test_mixed(): - eq_(u'Домашняяヘルプとチュートリアル', - clean(u'Домашняяヘルプとチュートリアル')) + eq_('Домашняяヘルプとチュートリアル', + clean('Домашняяヘルプとチュートリアル')) def test_mixed_linkify(): - eq_(u'Домашняя ' - u'http://example.com ヘルプとチュートリアル', - linkify(u'Домашняя http://example.com ヘルプとチュートリアル')) + in_(('Домашняя ' + 'http://example.com ヘルプとチュートリアル', + 'Домашняя ' + 'http://example.com ヘルプとチュートリアル'), + linkify('Домашняя http://example.com ヘルプとチュートリアル')) def test_url_utf8(): """Allow UTF8 characters in URLs themselves.""" - out = u'%(url)s' + outs = ('{0!s}', + '{0!s}') + + out = lambda url: [x.format(url) for x in outs] tests = ( - ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}), - ('http://éxámplé.com/íàñá/', - out % {'url': u'http://éxámplé.com/íàñá/'}), + ('http://éxámplé.com/', out('http://éxámplé.com/')), + ('http://éxámplé.com/íàñá/', out('http://éxámplé.com/íàñá/')), ('http://éxámplé.com/íàñá/?foo=bar', - out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}), + out('http://éxámplé.com/íàñá/?foo=bar')), ('http://éxámplé.com/íàñá/?fóo=bár', - out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}), + out('http://éxámplé.com/íàñá/?fóo=bár')), ) def check(test, expected_output): - eq_(expected_output, linkify(test)) + in_(expected_output, linkify(test)) for test, expected_output in tests: yield check, test, expected_output diff --git a/bleach/tests/tools.py b/bleach/tests/tools.py new file mode 100644 index 0000000..87f926c --- /dev/null +++ b/bleach/tests/tools.py @@ -0,0 +1,7 @@ + + +def in_(l, a, msg=None): + """Shorthand for 'assert a in l, "%r not in %r" % (a, l) + """ + if not a in l: + raise AssertionError(msg or "%r not in %r" % (a, l)) diff --git a/docs/clean.rst b/docs/clean.rst index a31dc89..2fb888b 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -85,7 +85,7 @@ allowed but no values will be. For example, to allow users to set the color and font-weight of text:: attrs = { - '*': 'style' + '*': ['style'] } tags = ['p', 'em', 'strong'] styles = ['color', 'font-weight'] diff --git a/docs/conf.py b/docs/conf.py index a63aedf..96b2fc8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,9 +48,9 @@ copyright = u'2012, James Socol' # built documents. # # The short X.Y version. -version = '1.2' +version = '1.3' # The full version, including alpha/beta/rc tags. -release = '1.2.0' +release = '1.3.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 0929e53..0439786 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,14 +16,9 @@ regular-expression-based sanitizers. Bleach's ``linkify`` function is highly configurable and can be used to find, edit, and filter links most other auto-linkers can't. -The version of bleach on GitHub_ is the always the most up-to-date and the +The version of bleach on GitHub_ is always the most up-to-date and the ``master`` branch should always work. -.. warn:: - - Bleach is currently incompatible with html5lib 1.0b and any versions below - 0.9.5. - Installing Bleach ================= @@ -56,7 +51,6 @@ Contents: goals - Indices and tables ================== @@ -64,6 +58,6 @@ Indices and tables * :ref:`modindex` * :ref:`search` -.. _html5lib: http://code.google.com/p/html5lib/ +.. _html5lib: https://github.com/html5lib/html5lib-python .. _GitHub: https://github.com/jsocol/bleach -.. _PyPI: http://pypi.python.org/pypi/bleach +.. _PyPI: https://pypi.python.org/pypi/bleach diff --git a/requirements.txt b/requirements.txt index 1500a14..d6e9357 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -# These are the requirements to run the test suite. -nose==1.3.0 -html5lib==0.9.5 +six +html5lib>=0.999 +# Requirements to run the test suite: +nose +flake8 diff --git a/setup.py b/setup.py index e48c3f7..6d5cfb4 100644 --- a/setup.py +++ b/setup.py @@ -2,26 +2,35 @@ from setuptools import setup, find_packages setup( name='bleach', - version='1.2.2', + version='1.4', description='An easy whitelist-based HTML-sanitizing tool.', long_description=open('README.rst').read(), author='James Socol', - author_email='james@mozilla.com', + author_email='me@jamessocol.com', url='http://github.com/jsocol/bleach', - license='BSD', + license='Apache Software License', packages=find_packages(), include_package_data=True, package_data={'': ['README.rst']}, zip_safe=False, - install_requires=['html5lib==0.95'], + install_requires=[ + 'six', + 'html5lib>=0.999', + ], classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', 'Environment :: Web Environment :: Mozilla', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', 'Topic :: Software Development :: Libraries :: Python Modules', ] ) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..4d8e5f6 --- /dev/null +++ b/tox.ini @@ -0,0 +1,12 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py26, py27, py32, py33, pypy + +[testenv] +commands = nosetests {posargs:-v} +deps = + nose -- cgit v1.2.3