diff options
-rw-r--r-- | .gitignore | 8 | ||||
-rw-r--r-- | .travis.yml | 6 | ||||
-rw-r--r-- | CONTRIBUTORS | 1 | ||||
-rw-r--r-- | LICENSE | 28 | ||||
-rw-r--r-- | MANIFEST.in | 2 | ||||
-rw-r--r-- | README.rst | 159 | ||||
-rw-r--r-- | bleach/__init__.py | 342 | ||||
-rw-r--r-- | bleach/encoding.py | 54 | ||||
-rw-r--r-- | bleach/sanitizer.py | 143 | ||||
-rw-r--r-- | bleach/tests/__init__.py | 0 | ||||
-rw-r--r-- | bleach/tests/test_basics.py | 170 | ||||
-rw-r--r-- | bleach/tests/test_css.py | 85 | ||||
-rw-r--r-- | bleach/tests/test_delinkify.py | 109 | ||||
-rw-r--r-- | bleach/tests/test_links.py | 312 | ||||
-rw-r--r-- | bleach/tests/test_security.py | 108 | ||||
-rw-r--r-- | bleach/tests/test_unicode.py | 54 | ||||
-rw-r--r-- | requirements.txt | 3 | ||||
-rw-r--r-- | setup.py | 27 |
18 files changed, 1611 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6714ae6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.pyo +*.pyc +pip-log.txt +.coverage +dist +*.egg-info +.noseids +build diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e767f15 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,6 @@ +language: python +python: + - "2.6" + - "2.7" +install: pip install -Ur requirements.txt --use-mirrors +script: nosetests diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..f612983 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +See https://github.com/jsocol/bleach/contributors @@ -0,0 +1,28 @@ +Copyright (c) 2010, Mozilla Foundation +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of bleach nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..9d5d250 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include LICENSE +include README.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..08dfc50 --- /dev/null +++ b/README.rst @@ -0,0 +1,159 @@ +====== +Bleach +====== + +Bleach is an HTML sanitizing library that escapes or strips markup and +attributes based on a white list. Bleach can also linkify text safely, applying +filters that Django's ``urlize`` filter cannot, and optionally setting ``rel`` +attributes, even on links already in the text. + +Bleach is intended for sanitizing text from *untrusted* sources. If you find +yourself jumping through hoops to allow your site administrators to do lots of +things, you're probably outside the use cases. Either trust those users, or +don't. + +Because it relies on html5lib_, Bleach is as good as modern browsers at dealing +with weird, quirky HTML fragments. And *any* of Bleach's methods will fix +unbalanced or mis-nested tags. + +The version on `github <http://github.com/jsocol/bleach>`_ is the most +up-to-date and contains the latest bug fixes. 
+ + +Basic Use +========= + +The simplest way to use Bleach is:: + + >>> import bleach + + >>> bleach.clean('an <script>evil()</script> example') + u'an &lt;script&gt;evil()&lt;/script&gt; example' + + >>> bleach.linkify('an http://example.com url') + u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url' + + >>> bleach.delinkify('a <a href="http://ex.mp">link</a>') + u'a link' + +*NB*: Bleach always returns a ``unicode`` object, whether you give it a +bytestring or a ``unicode`` object, but Bleach does not attempt to detect +incoming character encodings, and will assume UTF-8. If you are using a +different character encoding, you should convert from a bytestring to +``unicode`` before passing the text to Bleach. + + +Customizing Bleach +================== + +``clean()``, ``linkify()`` and ``delinkify()`` can take several optional +keyword arguments to customize their behavior. + + +``clean()`` +----------- + +``bleach.clean()`` is the primary tool in Bleach. It uses html5lib_ to parse a +document fragment into a tree and does the sanitization during tokenizing, +which is incredibly powerful and has several advantages over regular +expression-based sanitization. + +``tags`` + A whitelist of HTML tags. Must be a list. Defaults to + ``bleach.ALLOWED_TAGS``. +``attributes`` + A whitelist of HTML attributes. Either a list, in which case all attributes + are allowed on all elements, or a dict, with tag names as keys and lists of + allowed attributes as values ('*' is a wildcard key to allow an attribute on + any tag). Or it is possible to pass a callable instead of a list that accepts + name and value of attribute and returns True or False. Defaults to + ``bleach.ALLOWED_ATTRIBUTES``. +``styles`` + A whitelist of allowed CSS properties within a ``style`` attribute. (Note + that ``style`` attributes are not allowed by default.) Must be a list. + Defaults to ``[]``. +``strip`` + Strip disallowed HTML instead of escaping it. A boolean. Defaults to + ``False``. 
+``strip_comments`` + Strip HTML comments. A boolean. Defaults to ``True``. + + +``linkify()`` +------------- + +``bleach.linkify()`` finds things that look like URLs or (optionally) email +addresses and turns them into links. It does this smartly, only looking in text +nodes, and never within ``<a>`` tags. + +There are options that affect output, and some of these are also applied to +links already found in the text. These are designed to allow you to set +attributes like ``rel="nofollow"`` or ``target``, or push outgoing links +through a redirection URL, and do this to links already in the text, as well. + +``nofollow`` + Add ``rel="nofollow"`` to non-relative links (both created by ``linkify()`` + and those already present in the text). Defaults to ``True``. +``filter_url`` + A callable through which the ``href`` attribute of links (both created by + ``linkify()`` and already present in the text) will be passed. Must accept a + single argument and return a string. +``filter_text`` + A callable through which the text of links (only those created by + ``linkify()``) will be passed. Must accept a single argument and return a + string. +``skip_pre`` + Do not create new links inside ``<pre>`` sections. Still follows + ``nofollow``. Defaults to ``False``. +``parse_email`` + Linkify email addresses with ``mailto:``. Defaults to ``False``. +``target`` + Set a ``target`` attribute on links. Like ``nofollow``, if ``target`` is not + ``None``, will set the attribute on links already in the text, as well. + Defaults to ``None``. + + +``delinkify()`` +--------------- + +``bleach.delinkify()`` is basically the opposite of ``linkify()``. It strips +links out of text except, optionally, relative links, or links to domains +you've whitelisted. + +``allow_domains`` + Allow links to the domains in this list. Set to ``None`` or an empty list to + disallow all non-relative domains. See below for wildcards. Defaults to + ``None``. +``allow_relative`` + Allow relative links (i.e. 
those with no hostname). Defaults to ``False``. + + +Wildcards +^^^^^^^^^ + +To allow links to a domain and its subdomains, ``allow_domains`` accepts two +types of wildcard arguments in domains: + +``*`` + Allow a single level of subdomain. This can be anywhere in the hostname, even + the TLD. This allows you to, for example, allow links to ``example.*``. + ``*.example.com`` will match both ``foo.example.com`` and ``example.com``. + :: + >>> delinkify('<a href="http://foo.ex.mp">bar</a>', \ + ... allow_domains=['*.ex.*']) + u'<a href="http://foo.ex.mp">bar</a>' + >>> delinkify('<a href="http://ex.mp">bar</a>', allow_domains=['*.ex.mp']) + u'<a href="http://ex.mp">bar</a>' +``**`` + To allow any number of *preceding* subdomains, you can start a hostname with + ``**``. Note that unlike ``*``, ``**`` may only appear once, and only at the + beginning of a hostname. + :: + >>> delinkify('<a href="http://a.b.ex.mp">t</a>', \ + ... allow_domains=['**.ex.mp']) + u'<a href="http://a.b.ex.mp">t</a>' + If ``**`` appears anywhere but the beginning of a hostname, ``delinkify`` + will raise ``bleach.ValidationError`` (which is a ``ValueError`` subclass, + for easy catching). + +.. 
_html5lib: http://code.google.com/p/html5lib/ diff --git a/bleach/__init__.py b/bleach/__init__.py new file mode 100644 index 0000000..bc8e49c --- /dev/null +++ b/bleach/__init__.py @@ -0,0 +1,342 @@ +import itertools +import logging +import re +import sys +import urlparse + +import html5lib +from html5lib.sanitizer import HTMLSanitizer +from html5lib.serializer.htmlserializer import HTMLSerializer + +from encoding import force_unicode +from sanitizer import BleachSanitizer + + +VERSION = (1, 1, 5) +__version__ = '.'.join(map(str, VERSION)) + +__all__ = ['clean', 'linkify'] + +log = logging.getLogger('bleach') + +ALLOWED_TAGS = [ + 'a', + 'abbr', + 'acronym', + 'b', + 'blockquote', + 'code', + 'em', + 'i', + 'li', + 'ol', + 'strong', + 'ul', +] + +ALLOWED_ATTRIBUTES = { + 'a': ['href', 'title'], + 'abbr': ['title'], + 'acronym': ['title'], +} + +ALLOWED_STYLES = [] + +TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az + ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat + cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk + dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg + gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il + im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp + kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk + ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne + net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro + ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so + sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt + tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm + zw""".split() + +TLDS.reverse() + +url_re = re.compile( + r"""\(* # Match any opening parentheses. + \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)? # http:// + ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)? 
+ (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE) + +proto_re = re.compile(r'^[\w-]+:/{0,3}') + +punct_re = re.compile(r'([\.,]+)$') + +email_re = re.compile( + r"""(?<!//) + (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+ + (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom + |^"([\001-\010\013\014\016-\037!#-\[\]-\177] + |\\[\001-011\013\014\016-\177])*" # quoted-string + )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain + """, + re.IGNORECASE | re.MULTILINE | re.VERBOSE) + +NODE_TEXT = 4 # The numeric ID of a text node in simpletree. + +identity = lambda x: x # The identity function. + + +def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, strip=False, strip_comments=True): + """Clean an HTML fragment and return it""" + if not text: + return u'' + + text = force_unicode(text) + if text.startswith(u'<!--'): + text = u' ' + text + + class s(BleachSanitizer): + allowed_elements = tags + allowed_attributes = attributes + allowed_css_properties = styles + strip_disallowed_elements = strip + strip_html_comments = strip_comments + + parser = html5lib.HTMLParser(tokenizer=s) + + return _render(parser.parseFragment(text)).strip() + + +def linkify(text, nofollow=True, target=None, filter_url=identity, + filter_text=identity, skip_pre=False, parse_email=False, + tokenizer=HTMLSanitizer): + """Convert URL-like strings in an HTML fragment to links. + + linkify() converts strings that look like URLs or domain names in a + blob of text that may be an HTML fragment to links, while preserving + (a) links already in the string, (b) urls found in attributes, and + (c) email addresses. + + If the nofollow argument is True (the default) then rel="nofollow" + will be added to links created by linkify() as well as links already + found in the text. 
+ + The target argument will optionally add a target attribute with the + given value to links created by linkify() as well as links already + found in the text. + + linkify() uses up to two filters on each link. For links created by + linkify(), the href attribute is passed through filter_url() + and the text of the link is passed through filter_text(). For links + already found in the document, the href attribute is passed through + filter_url(), but the text is untouched. + """ + text = force_unicode(text) + + if not text: + return u'' + + parser = html5lib.HTMLParser(tokenizer=tokenizer) + + forest = parser.parseFragment(text) + + if nofollow: + rel = u'rel="nofollow"' + else: + rel = u'' + + def replace_nodes(tree, new_frag, node): + new_tree = parser.parseFragment(new_frag) + for n in new_tree.childNodes: + tree.insertBefore(n, node) + tree.removeChild(node) + + def strip_wrapping_parentheses(fragment): + """Strips wrapping parentheses. + + Returns a tuple of the following format:: + + (string stripped from wrapping parentheses, + count of stripped opening parentheses, + count of stripped closing parentheses) + """ + opening_parentheses = closing_parentheses = 0 + # Count consecutive opening parentheses + # at the beginning of the fragment (string). + for char in fragment: + if char == '(': + opening_parentheses += 1 + else: + break + + if opening_parentheses: + newer_frag = '' + # Cut the consecutive opening brackets from the fragment. + fragment = fragment[opening_parentheses:] + # Reverse the fragment for easier detection of parentheses + # inside the URL. + reverse_fragment = fragment[::-1] + skip = False + for char in reverse_fragment: + # Remove the closing parentheses if it has a matching + # opening parentheses (they are balanced). + if (char == ')' and + closing_parentheses < opening_parentheses and + not skip): + closing_parentheses += 1 + continue + # Do not remove ')' from the URL itself. 
+ elif char != ')': + skip = True + newer_frag += char + fragment = newer_frag[::-1] + + return fragment, opening_parentheses, closing_parentheses + + def linkify_nodes(tree, parse_text=True): + for node in tree.childNodes: + if node.type == NODE_TEXT and parse_text: + new_frag = node.toxml() + if parse_email: + new_frag = re.sub(email_re, email_repl, new_frag) + if new_frag != node.toxml(): + replace_nodes(tree, new_frag, node) + linkify_nodes(tree) + continue + new_frag = re.sub(url_re, link_repl, new_frag) + replace_nodes(tree, new_frag, node) + elif node.name == 'a': + if 'href' in node.attributes: + if nofollow: + node.attributes['rel'] = 'nofollow' + if target is not None: + node.attributes['target'] = target + href = node.attributes['href'] + node.attributes['href'] = filter_url(href) + elif skip_pre and node.name == 'pre': + linkify_nodes(node, False) + else: + linkify_nodes(node) + + def email_repl(match): + repl = u'<a href="mailto:%(mail)s">%(mail)s</a>' + return repl % {'mail': match.group(0).replace('"', '"')} + + def link_repl(match): + url = match.group(0) + open_brackets = close_brackets = 0 + if url.startswith('('): + url, open_brackets, close_brackets = ( + strip_wrapping_parentheses(url) + ) + end = u'' + m = re.search(punct_re, url) + if m: + end = m.group(0) + url = url[0:m.start()] + if re.search(proto_re, url): + href = url + else: + href = u''.join([u'http://', url]) + + repl = u'%s<a href="%s" %s>%s</a>%s%s' + + attribs = [rel] + if target is not None: + attribs.append('target="%s"' % target) + + return repl % ('(' * open_brackets, + filter_url(href), ' '.join(attribs), filter_text(url), + end, ')' * close_brackets) + + linkify_nodes(forest) + + return _render(forest) + + +def delinkify(text, allow_domains=None, allow_relative=False): + """Remove links from text, except those allowed to stay.""" + text = force_unicode(text) + if not text: + return u'' + + parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer) + forest = 
parser.parseFragment(text) + + if allow_domains is None: + allow_domains = [] + elif isinstance(allow_domains, basestring): + allow_domains = [allow_domains] + + def delinkify_nodes(tree): + """Remove <a> tags and replace them with their contents.""" + for node in tree.childNodes: + if node.name == 'a': + if 'href' not in node.attributes: + continue + parts = urlparse.urlparse(node.attributes['href']) + host = parts.hostname + if any(_domain_match(host, d) for d in allow_domains): + continue + if host is None and allow_relative: + continue + # Replace the node with its children. + # You can't nest <a> tags, and html5lib takes care of that + # for us in the tree-building step. + for n in node.childNodes: + tree.insertBefore(n, node) + tree.removeChild(node) + elif node.type != NODE_TEXT: # Don't try to delinkify text. + delinkify_nodes(node) + + delinkify_nodes(forest) + return _render(forest) + + +def _domain_match(test, compare): + test = test.lower() + compare = compare.lower() + if '*' not in compare: + return test == compare + c = compare.split('.')[::-1] + if '**' in c and (c.count('**') > 1 or not compare.startswith('**')): + raise ValidationError( + 'Only 1 ** is allowed, and must start the domain.') + t = test.split('.')[::-1] + z = itertools.izip_longest(c, t) + for c, t in z: + if c == t: + continue + elif c == '*': + continue + elif c == '**': + return True + return False + # Got all the way through and everything matched. 
+ return True + + +class ValidationError(ValueError): + pass + + +def _render(tree): + """Try rendering as HTML, then XML, then give up.""" + try: + return force_unicode(_serialize(tree)) + except Exception, e: + log.error('HTML: %r' % e, exc_info=sys.exc_info()) + try: + return force_unicode(tree.toxml()) + except Exception, e: + log.error('XML: %r' % e, exc_info=sys.exc_info()) + return u'' + + +def _serialize(domtree): + walker = html5lib.treewalkers.getTreeWalker('simpletree') + stream = walker(domtree) + serializer = HTMLSerializer(quote_attr_values=True, + omit_optional_tags=False) + return serializer.render(stream) diff --git a/bleach/encoding.py b/bleach/encoding.py new file mode 100644 index 0000000..b9a989d --- /dev/null +++ b/bleach/encoding.py @@ -0,0 +1,54 @@ +import datetime +from decimal import Decimal +import types + + +def is_protected_type(obj): + """Determine if the object instance is of a protected type. + + Objects of protected types are preserved as-is when passed to + force_unicode(strings_only=True). + """ + return isinstance(obj, ( + types.NoneType, + int, long, + datetime.datetime, datetime.date, datetime.time, + float, Decimal) + ) + + +def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_unicode, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + if strings_only and is_protected_type(s): + return s + try: + if not isinstance(s, basestring,): + if hasattr(s, '__unicode__'): + s = unicode(s) + else: + try: + s = unicode(str(s), encoding, errors) + except UnicodeEncodeError: + if not isinstance(s, Exception): + raise + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII data without special + # handling to display as a string. We need to handle this + # without raising a further exception. 
We do an + # approximation to what the Exception's standard str() + # output should be. + s = ' '.join([force_unicode(arg, encoding, strings_only, + errors) for arg in s]) + elif not isinstance(s, unicode): + # Note: We use .decode() here, instead of unicode(s, encoding, + # errors), so that if s is a SafeString, it ends up being a + # SafeUnicode at the end. + s = s.decode(encoding, errors) + except UnicodeDecodeError, e: + raise UnicodeDecodeError(*e.args) + return s diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py new file mode 100644 index 0000000..677287e --- /dev/null +++ b/bleach/sanitizer.py @@ -0,0 +1,143 @@ +import re +from xml.sax.saxutils import escape, unescape + +from html5lib.constants import tokenTypes +from html5lib.sanitizer import HTMLSanitizerMixin +from html5lib.tokenizer import HTMLTokenizer + + +class BleachSanitizerMixin(HTMLSanitizerMixin): + """Mixin to replace sanitize_token() and sanitize_css().""" + + allowed_svg_properties = [] + # TODO: When the next html5lib version comes out, nuke this. + attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster'] + + def sanitize_token(self, token): + """Sanitize a token either by HTML-encoding or dropping. + + Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be + a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}. + + Here callable is a function with two arguments of attribute name + and value. It should return true of false. + + Also gives the option to strip tags instead of encoding. 
+ + """ + if (getattr(self, 'wildcard_attributes', None) is None and + isinstance(self.allowed_attributes, dict)): + self.wildcard_attributes = self.allowed_attributes.get('*', []) + + if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], + tokenTypes['EmptyTag']): + if token['name'] in self.allowed_elements: + if 'data' in token: + if isinstance(self.allowed_attributes, dict): + allowed_attributes = self.allowed_attributes.get( + token['name'], []) + if not callable(allowed_attributes): + allowed_attributes += self.wildcard_attributes + else: + allowed_attributes = self.allowed_attributes + attrs = dict([(name, val) for name, val in + token['data'][::-1] + if (allowed_attributes(name, val) + if callable(allowed_attributes) + else name in allowed_attributes)]) + for attr in self.attr_val_is_uri: + if not attr in attrs: + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped + # characters. 
+ val_unescaped = val_unescaped.replace(u"\ufffd", "") + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) + and (val_unescaped.split(':')[0] not in + self.allowed_protocols)): + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + if (token['name'] in self.svg_allow_local_href and + 'xlink:href' in attrs and + re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): + del attrs['xlink:href'] + if 'style' in attrs: + attrs['style'] = self.sanitize_css(attrs['style']) + token['data'] = [(name, val) for name, val in + attrs.items()] + return token + elif self.strip_disallowed_elements: + pass + else: + if token['type'] == tokenTypes['EndTag']: + token['data'] = '</%s>' % token['name'] + elif token['data']: + attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in + token['data']]) + token['data'] = '<%s%s>' % (token['name'], attrs) + else: + token['data'] = '<%s>' % token['name'] + if token['selfClosing']: + token['data'] = token['data'][:-1] + '/>' + token['type'] = tokenTypes['Characters'] + del token["name"] + return token + elif token['type'] == tokenTypes['Comment']: + if not self.strip_html_comments: + return token + else: + return token + + def sanitize_css(self, style): + """HTMLSanitizerMixin.sanitize_css replacement. + + HTMLSanitizerMixin.sanitize_css always whitelists background-*, + border-*, margin-*, and padding-*. We only whitelist what's in + the whitelist. + + """ + # disallow urls + style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + + # gauntlet + # TODO: Make sure this does what it's meant to - I *think* it wants to + # validate style attribute contents. 
+ parts = style.split(';') + gauntlet = re.compile("""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*""" + """|"[\s\w]+"|\([\d,%\.\s]+\))*$""") + for part in parts: + if not gauntlet.match(part): + return '' + + if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): + return '' + + clean = [] + for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style): + if not value: + continue + if prop.lower() in self.allowed_css_properties: + clean.append(prop + ': ' + value + ';') + elif prop.lower() in self.allowed_svg_properties: + clean.append(prop + ': ' + value + ';') + + return ' '.join(clean) + + +class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin): + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=True, lowercaseAttrName=True, **kwargs): + HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, + lowercaseElementName, lowercaseAttrName, + **kwargs) + + def __iter__(self): + for token in HTMLTokenizer.__iter__(self): + token = self.sanitize_token(token) + if token: + yield token diff --git a/bleach/tests/__init__.py b/bleach/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/bleach/tests/__init__.py diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py new file mode 100644 index 0000000..60be11d --- /dev/null +++ b/bleach/tests/test_basics.py @@ -0,0 +1,170 @@ +import html5lib +from nose.tools import eq_ + +import bleach + + +def test_empty(): + eq_('', bleach.clean('')) + + +def test_comments_only(): + comment = '<!-- this is a comment -->' + open_comment = '<!-- this is an open comment' + eq_('', bleach.clean(comment)) + eq_('', bleach.clean(open_comment)) + eq_(comment, bleach.clean(comment, strip_comments=False)) + eq_('%s-->' % open_comment, bleach.clean(open_comment, + strip_comments=False)) + + +def test_with_comments(): + html = '<!-- comment -->Just text' + eq_('Just text', bleach.clean(html)) + eq_(html, bleach.clean(html, 
strip_comments=False)) + + +def test_no_html(): + eq_('no html string', bleach.clean('no html string')) + + +def test_allowed_html(): + eq_('an <strong>allowed</strong> tag', + bleach.clean('an <strong>allowed</strong> tag')) + eq_('another <em>good</em> tag', + bleach.clean('another <em>good</em> tag')) + + +def test_bad_html(): + eq_('a <em>fixed tag</em>', + bleach.clean('a <em>fixed tag')) + + +def test_function_arguments(): + TAGS = ['span', 'br'] + ATTRS = {'span': ['style']} + + eq_('a <br><span style="">test</span>', + bleach.clean('a <br/><span style="color:red">test</span>', + tags=TAGS, attributes=ATTRS)) + + +def test_named_arguments(): + ATTRS = {'a': ['rel', 'href']} + s = u'<a href="http://xx.com" rel="alternate">xx.com</a>' + eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s)) + eq_(s, bleach.clean(s, attributes=ATTRS)) + + +def test_disallowed_html(): + eq_('a <script>safe()</script> test', + bleach.clean('a <script>safe()</script> test')) + eq_('a <style>body{}</style> test', + bleach.clean('a <style>body{}</style> test')) + + +def test_bad_href(): + eq_('<em>no link</em>', + bleach.clean('<em href="fail">no link</em>')) + + +def test_bare_entities(): + eq_('an & entity', bleach.clean('an & entity')) + eq_('an < entity', bleach.clean('an < entity')) + eq_('tag < <em>and</em> entity', + bleach.clean('tag < <em>and</em> entity')) + eq_('&', bleach.clean('&')) + + +def test_escaped_entities(): + s = u'<em>strong</em>' + eq_(s, bleach.clean(s)) + + +def test_serializer(): + s = u'<table></table>' + eq_(s, bleach.clean(s, tags=['table'])) + eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>')) + eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p'])) + + +def test_no_href_links(): + s = u'<a name="anchor">x</a>' + eq_(s, bleach.linkify(s)) + eq_(s, bleach.linkify(s, nofollow=False)) + + +def test_weird_strings(): + s = '</3' + eq_(bleach.clean(s), '') + + +def test_xml_render(): + parser = html5lib.HTMLParser() + 
eq_(bleach._render(parser.parseFragment('')), '') + + +def test_stripping(): + eq_('a test <em>with</em> <b>html</b> tags', + bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True)) + eq_('a test <em>with</em> <b>html</b> tags', + bleach.clean('a test <em>with</em> <img src="http://example.com/"> ' + '<b>html</b> tags', strip=True)) + + s = '<p><a href="http://example.com/">link text</a></p>' + eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True)) + s = '<p><span>multiply <span>nested <span>text</span></span></span></p>' + eq_('<p>multiply nested text</p>', bleach.clean(s, tags=['p'], strip=True)) + + s = ('<p><a href="http://example.com/"><img src="http://example.com/">' + '</a></p>') + eq_('<p><a href="http://example.com/"></a></p>', + bleach.clean(s, tags=['p', 'a'], strip=True)) + + +def test_allowed_styles(): + ATTR = ['style'] + STYLE = ['color'] + blank = '<b style=""></b>' + s = '<b style="color: blue;"></b>' + eq_(blank, bleach.clean('<b style="top:0"></b>', attributes=ATTR)) + eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE)) + eq_(s, bleach.clean('<b style="top: 0; color: blue;"></b>', + attributes=ATTR, styles=STYLE)) + + +def test_idempotent(): + """Make sure that applying the filter twice doesn't change anything.""" + dirty = u'<span>invalid & </span> < extra http://link.com<em>' + + clean = bleach.clean(dirty) + eq_(clean, bleach.clean(clean)) + + linked = bleach.linkify(dirty) + eq_(linked, bleach.linkify(linked)) + + +def test_lowercase_html(): + """We should output lowercase HTML.""" + dirty = u'<EM CLASS="FOO">BAR</EM>' + clean = u'<em class="FOO">BAR</em>' + eq_(clean, bleach.clean(dirty, attributes=['class'])) + + +def test_wildcard_attributes(): + ATTR = { + '*': ['id'], + 'img': ['src'], + } + TAG = ['img', 'em'] + dirty = (u'both <em id="foo" style="color: black">can</em> have ' + u'<img id="bar" src="foo"/>') + clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">' + eq_(clean, bleach.clean(dirty, 
tags=TAG, attributes=ATTR)) + + +def test_sarcasm(): + """Jokes should crash.<sarcasm/>""" + dirty = u'Yeah right <sarcasm/>' + clean = u'Yeah right <sarcasm/>' + eq_(clean, bleach.clean(dirty)) diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py new file mode 100644 index 0000000..fdb3f65 --- /dev/null +++ b/bleach/tests/test_css.py @@ -0,0 +1,85 @@ +from functools import partial + +from nose.tools import eq_ + +from bleach import clean + + +clean = partial(clean, tags=['p'], attributes=['style']) + + +def test_allowed_css(): + tests = ( + ('font-family: Arial; color: red; float: left; ' + 'background-color: red;', 'color: red;', ['color']), + ('border: 1px solid blue; color: red; float: left;', 'color: red;', + ['color']), + ('border: 1px solid blue; color: red; float: left;', + 'color: red; float: left;', ['color', 'float']), + ('color: red; float: left; padding: 1em;', 'color: red; float: left;', + ['color', 'float']), + ('color: red; float: left; padding: 1em;', 'color: red;', ['color']), + ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']), + ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']), + ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']), + ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", ['text-overflow']), + ) + + p = '<p style="%s">bar</p>' + + def check(input, output, styles): + eq_(p % output, clean(p % input, styles=styles)) + + for i, o, s in tests: + yield check, i, o, s + + +def test_valid_css(): + """The sanitizer should fix missing CSS values.""" + styles = ['color', 'float'] + eq_('<p style="float: left;">foo</p>', + clean('<p style="float: left; color: ">foo</p>', styles=styles)) + eq_('<p style="">foo</p>', + clean('<p style="color: float: left;">foo</p>', styles=styles)) + + +def test_style_hang(): + """The sanitizer should not hang on any inline styles""" + # TODO: Neaten this up. 
It's copypasta from MDN/Kuma to repro the bug + style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """ + """margin-left: 0px; padding-top: 15px; padding-right: 15px; """ + """padding-bottom: 15px; padding-left: 15px; border-top-width: """ + """1px; border-right-width: 1px; border-bottom-width: 1px; """ + """border-left-width: 1px; border-top-style: dotted; """ + """border-right-style: dotted; border-bottom-style: dotted; """ + """border-left-style: dotted; border-top-color: rgb(203, 200, """ + """185); border-right-color: rgb(203, 200, 185); """ + """border-bottom-color: rgb(203, 200, 185); border-left-color: """ + """rgb(203, 200, 185); background-image: initial; """ + """background-attachment: initial; background-origin: initial; """ + """background-clip: initial; background-color: """ + """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """ + """font: normal normal normal 100%/normal 'Courier New', """ + """'Andale Mono', monospace; background-position: initial """ + """initial; background-repeat: initial initial;""") + html = '<p style="%s">Hello world</p>' % style + styles = [ + 'border', 'float', 'overflow', 'min-height', 'vertical-align', + 'white-space', + 'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right', + 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right', + 'background', + 'background-color', + 'font', 'font-size', 'font-weight', 'text-align', 'text-transform', + ] + + expected = ("""<p style="margin-top: 0px; margin-right: 0px; """ + """margin-bottom: 1.286em; margin-left: 0px; padding-top: """ + """15px; padding-right: 15px; padding-bottom: 15px; """ + """padding-left: 15px; background-color: """ + """rgb(246, 246, 242); font: normal normal normal """ + """100%/normal 'Courier New', 'Andale Mono', monospace;">""" + """Hello world</p>""") + + result = clean(html, styles=styles) + eq_(expected, result) diff --git a/bleach/tests/test_delinkify.py b/bleach/tests/test_delinkify.py 
new file mode 100644 index 0000000..f216d2f --- /dev/null +++ b/bleach/tests/test_delinkify.py @@ -0,0 +1,109 @@ +from nose.tools import eq_ + +import bleach + + +def test_delinkify(): + eq_('test', bleach.delinkify('<a href="http://ex.mp">test</a>')) + eq_('footestbar', + bleach.delinkify('foo<a href="http://ex.mp">test</a>bar')) + + +def test_whitelist(): + html = '<a href="http://ex.mp">test</a>' + eq_(html, bleach.delinkify(html, allow_domains=['ex.mp'])) + eq_('test', bleach.delinkify(html, allow_domains=['ex2.mp'])) + # Allow a single domain as a special case. + eq_(html, bleach.delinkify(html, allow_domains='ex.mp')) + + +def test_nested_a(): + html = '<a href="http://ex.mp">test<a href="http://foo.bar">test</a></a>' + eq_('testtest', bleach.delinkify(html)) + eq_('<a href="http://ex.mp">test</a>test', + bleach.delinkify(html, allow_domains=['ex.mp'])) + + +def test_nested_tag(): + html = '<a href="http://ex.mp">test<span>test</span></a>' + eq_('test<span>test</span>', bleach.delinkify(html)) + + +def test_a_name(): + """Don't screw with non-link <a> tags.""" + html = '<a name="foo">bar</a>' + eq_(html, bleach.delinkify(html)) + + +def test_relative(): + """Relative links are optionally OK.""" + html = 'some <a href="/foo/bar">link</a>' + eq_('some link', bleach.delinkify(html)) + eq_(html, bleach.delinkify(html, allow_relative=True)) + + +def test_protocol_relative(): + """Protocol-relative links aren't relative.""" + html = 'bad <a href="//ex.mp">link</a>' + expect = 'bad link' + eq_(expect, bleach.delinkify(html)) + eq_(expect, bleach.delinkify(html, allow_relative=True)) + eq_(html, bleach.delinkify(html, allow_domains='ex.mp')) + + +def test_domain_match(): + tests = ( + ('ex.mp', 'ex.mp', True), + ('ex.mp', '*.ex.mp', True), + ('test.ex.mp', '*.ex.mp', True), + ('test.ex.mp', 'ex.mp', False), + ('test.test.ex.mp', '*.ex.mp', False), + ('test.test.ex.mp', '**.ex.mp', True), + ('wrong.mp', 'ex.mp', False), + ('wrong.mp', '*.ex.mp', False), + 
('really.wrong.mp', 'ex.mp', False), + ('really.wrong.mp', '*.ex.mp', False), + ('really.very.wrong.mp', '*.ex.mp', False), + ('EX.mp', 'ex.mp', True), # Domains are case-insensitive. + ('ex.mp', 'an.ex.mp', False), + ('ex.mp', '*.an.ex.mp', False), + ('an.ex.am.pl', 'an.*.am.pl', True), + ('a.ex.am.pl', 'an.*.am.pl', False), + ('ex.am.pl', 'an.*.am.pl', False), + ) + + def _check(t, c, v): + eq_(v, bleach._domain_match(t, c)) + + for t, c, v in tests: + yield _check, t, c, v + + +def test_double_star(): + assert bleach._domain_match('ex.mp', '**.ex.mp') + try: + bleach._domain_match('ex.mp', 'an.**.ex.mp') + except bleach.ValidationError: + pass + else: + assert False, '_domain_match should not accept an.**.ex.mp' + + +def test_allow_subdomains(): + domains = ('ex.mp', '*.exa.mp', 'an.exam.pl', '*.my.examp.le') + html = ( + ('<a href="http://an.ex.mp">bad</a>', 'bad'), + ('<a href="http://exa.mp">good</a>', None), + ('<a href="http://an.exa.mp">good</a>', None), + ('<a href="http://an.exam.pl">good</a>', None), + ('<a href="http://another.exam.pl">bad</a>', 'bad'), + ('<a href="http://a.bad.examp.le">bad</a>', 'bad'), + ('<a href="http://a.very.bad.examp.le">bad</a>', 'bad'), + ) + + def _check(html, text): + output = bleach.delinkify(html, allow_domains=domains) + eq_(html if text is None else text, output) + + for t, o in html: + yield _check, t, o diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py new file mode 100644 index 0000000..7caf006 --- /dev/null +++ b/bleach/tests/test_links.py @@ -0,0 +1,312 @@ +import urllib + +from html5lib.tokenizer import HTMLTokenizer +from nose.tools import eq_ + +from bleach import linkify, url_re + + +def filter_url(url): + return u'http://bouncer/?u=%s' % urllib.quote_plus(url) + + +def test_url_re(): + def no_match(s): + match = url_re.search(s) + if match: + assert not match, 'matched %s' % s[slice(*match.span())] + yield no_match, 'just what i am looking for...it' + + +def test_empty(): + eq_('', 
linkify('')) + + +def test_simple_link(): + eq_('a <a href="http://example.com" rel="nofollow">http://example.com' + '</a> link', + linkify('a http://example.com link')) + eq_('a <a href="https://example.com" rel="nofollow">https://example.com' + '</a> link', + linkify('a https://example.com link')) + eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link', + linkify('an example.com link')) + + +def test_trailing_slash(): + eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>', + linkify('http://example.com/')) + eq_('<a href="http://example.com/foo/" rel="nofollow">' + 'http://example.com/foo/</a>', + linkify('http://example.com/foo/')) + eq_('<a href="http://example.com/foo/bar/" rel="nofollow">' + 'http://example.com/foo/bar/</a>', + linkify('http://example.com/foo/bar/')) + + +def test_mangle_link(): + eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">' + 'http://example.com</a>', + linkify('http://example.com', filter_url=filter_url)) + + +def test_email_link(): + eq_('a james@example.com mailto', + linkify('a james@example.com mailto')) + eq_('a james@example.com.au mailto', + linkify('a james@example.com.au mailto')) + eq_('a <a href="mailto:james@example.com" rel="nofollow">' + 'james@example.com</a> mailto', + linkify('a james@example.com mailto', parse_email=True)) + eq_('aussie <a href="mailto:james@example.com.au" rel="nofollow">' + 'james@example.com.au</a> mailto', + linkify('aussie james@example.com.au mailto', parse_email=True)) + eq_('email to <a href="james@example.com" rel="nofollow">' + 'james@example.com</a>', + linkify('email to <a href="james@example.com">' + 'james@example.com</a>', parse_email=True)) + + +def test_email_link_escaping(): + eq_('''<a href='mailto:"james"@example.com' rel="nofollow">''' + '''"james"@example.com</a>''', + linkify('"james"@example.com', parse_email=True)) + eq_('''<a href="mailto:"j'ames"@example.com" rel="nofollow">''' + 
'''"j'ames"@example.com</a>''', + linkify('"j\'ames"@example.com', parse_email=True)) + eq_('''<a href='mailto:"ja>mes"@example.com' rel="nofollow">''' + '''"ja>mes"@example.com</a>''', + linkify('"ja>mes"@example.com', parse_email=True)) + + +def test_tlds(): + eq_('<a href="http://example.com" rel="nofollow">example.com</a>', + linkify('example.com')) + eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>', + linkify('example.co.uk')) + eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>', + linkify('example.edu')) + eq_('example.xxx', linkify('example.xxx')) + eq_(' brie', linkify(' brie')) + eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>', + linkify('bit.ly/fun')) + + +def test_escaping(): + eq_('< unrelated', linkify('< unrelated')) + + +def test_nofollow_off(): + eq_('<a href="http://example.com">example.com</a>', + linkify(u'example.com', nofollow=False)) + + +def test_link_in_html(): + eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>', + linkify('<i>http://yy.com</i>')) + eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>' + '</strong></em>', + linkify('<em><strong>http://xx.com</strong></em>')) + + +def test_links_https(): + eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>', + linkify('https://yy.com')) + + +def test_add_rel_nofollow(): + """Verify that rel="nofollow" is added to an existing link""" + eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>', + linkify('<a href="http://yy.com">http://yy.com</a>')) + + +def test_url_with_path(): + eq_('<a href="http://example.com/path/to/file" rel="nofollow">' + 'http://example.com/path/to/file</a>', + linkify('http://example.com/path/to/file')) + + +def test_link_ftp(): + eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">' + 'ftp://ftp.mozilla.org/some/file</a>', + linkify('ftp://ftp.mozilla.org/some/file')) + + +def test_link_query(): + eq_('<a href="http://xx.com/?test=win" 
rel="nofollow">' + 'http://xx.com/?test=win</a>', + linkify('http://xx.com/?test=win')) + eq_('<a href="http://xx.com/?test=win" rel="nofollow">' + 'xx.com/?test=win</a>', + linkify('xx.com/?test=win')) + eq_('<a href="http://xx.com?test=win" rel="nofollow">' + 'xx.com?test=win</a>', + linkify('xx.com?test=win')) + + +def test_link_fragment(): + eq_('<a href="http://xx.com/path#frag" rel="nofollow">' + 'http://xx.com/path#frag</a>', + linkify('http://xx.com/path#frag')) + + +def test_link_entities(): + eq_('<a href="http://xx.com/?a=1&b=2" rel="nofollow">' + 'http://xx.com/?a=1&b=2</a>', + linkify('http://xx.com/?a=1&b=2')) + + +def test_escaped_html(): + """If I pass in escaped HTML, it should probably come out escaped.""" + s = '<em>strong</em>' + eq_(s, linkify(s)) + + +def test_link_http_complete(): + eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d' + '&e#f" rel="nofollow">' + 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f</a>', + linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) + + +def test_non_url(): + """document.vulnerable should absolutely not be linkified.""" + s = 'document.vulnerable' + eq_(s, linkify(s)) + + +def test_javascript_url(): + """javascript: urls should never be linkified.""" + s = 'javascript:document.vulnerable' + eq_(s, linkify(s)) + + +def test_unsafe_url(): + """Any unsafe char ({}[]<>, etc.) 
in the path should end URL scanning.""" + eq_('All your{"<a href="http://xx.yy.com/grover.png" ' + 'rel="nofollow">xx.yy.com/grover.png</a>"}base are', + linkify('All your{"xx.yy.com/grover.png"}base are')) + + +def test_skip_pre(): + """Skip linkification in <pre> tags.""" + simple = 'http://xx.com <pre>http://xx.com</pre>' + linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> ' + '<pre>http://xx.com</pre>') + all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> ' + '<pre><a href="http://xx.com" rel="nofollow">http://xx.com' + '</a></pre>') + eq_(linked, linkify(simple, skip_pre=True)) + eq_(all_linked, linkify(simple)) + + already_linked = '<pre><a href="http://xx.com">xx</a></pre>' + nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>' + eq_(nofollowed, linkify(already_linked)) + eq_(nofollowed, linkify(already_linked, skip_pre=True)) + + +def test_libgl(): + """libgl.so.1 should not be linkified.""" + eq_('libgl.so.1', linkify('libgl.so.1')) + + +def test_end_of_sentence(): + """example.com. 
should match.""" + out = u'<a href="http://%s" rel="nofollow">%s</a>%s' + in_ = u'%s%s' + + def check(u, p): + eq_(out % (u, u, p), linkify(in_ % (u, p))) + + tests = ( + ('example.com', '.'), + ('example.com', '...'), + ('ex.com/foo', '.'), + ('ex.com/foo', '....'), + ) + + for u, p in tests: + yield check, u, p + + +def test_end_of_clause(): + """example.com/foo, shouldn't include the ,""" + eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar', + linkify('ex.com/foo, bar')) + + +def test_sarcasm(): + """Jokes should crash.<sarcasm/>""" + dirty = u'Yeah right <sarcasm/>' + clean = u'Yeah right <sarcasm/>' + eq_(clean, linkify(dirty)) + + +def test_wrapping_parentheses(): + """URLs wrapped in parantheses should not include them.""" + out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s' + + tests = ( + ('(example.com)', out % ('(', 'example.com', 'example.com', ')')), + ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')), + ('(example.com/foo)', out % ('(', 'example.com/foo', + 'example.com/foo', ')')), + ('(((example.com/))))', out % ('(((', 'example.com/)', + 'example.com/)', ')))')), + ('example.com/))', out % ('', 'example.com/))', + 'example.com/))', '')), + ('http://en.wikipedia.org/wiki/Test_(assessment)', + out % ('', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', '')), + ('(http://en.wikipedia.org/wiki/Test_(assessment))', + out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)', + 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')), + ('((http://en.wikipedia.org/wiki/Test_(assessment))', + out % ('((', 'en.wikipedia.org/wiki/Test_(assessment', + 'http://en.wikipedia.org/wiki/Test_(assessment', '))')), + ('(http://en.wikipedia.org/wiki/Test_(assessment)))', + out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))', + 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')), + ('(http://en.wikipedia.org/wiki/)Test_(assessment', + out % ('(', 
'en.wikipedia.org/wiki/)Test_(assessment', + 'http://en.wikipedia.org/wiki/)Test_(assessment', '')), + ) + + def check(test, expected_output): + eq_(expected_output, linkify(test)) + + for test, expected_output in tests: + yield check, test, expected_output + + +def test_ports(): + """URLs can contain port numbers.""" + tests = ( + ('http://foo.com:8000', ('http://foo.com:8000', '')), + ('http://foo.com:8000/', ('http://foo.com:8000/', '')), + ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')), + ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')), + ('http://foo.com:', ('http://foo.com', ':')), + ) + + def check(test, output): + eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output), + linkify(test)) + + for test, output in tests: + yield check, test, output + + +def test_target(): + eq_('<a href="http://example.com" rel="nofollow" ' + 'target="_blank">example.com</a>', + linkify(u'example.com', target='_blank')) + eq_('<a href="http://example.com" target="_blank">example.com</a>', + linkify(u'example.com', target='_blank', nofollow=False)) + + +def test_tokenizer(): + """Linkify doesn't always have to sanitize.""" + raw = '<em>test<x></x></em>' + eq_('<em>test<x></x></em>', linkify(raw)) + eq_(raw, linkify(raw, tokenizer=HTMLTokenizer)) diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py new file mode 100644 index 0000000..9e9bb7b --- /dev/null +++ b/bleach/tests/test_security.py @@ -0,0 +1,108 @@ +"""More advanced security tests""" + +from nose.tools import eq_ + +from bleach import clean + + +def test_nested_script_tag(): + eq_('<<script>script>evil()<</script>/script>', + clean('<<script>script>evil()<</script>/script>')) + eq_('<<x>script>evil()<</x>/script>', + clean('<<x>script>evil()<</x>/script>')) + + +def test_nested_script_tag_r(): + eq_('<script<script>>evil()</script<>>', + clean('<script<script>>evil()</script</script>>')) + + +def test_invalid_attr(): + IMG = ['img', ] + IMG_ATTR = ['src'] + + eq_('<a 
href="test">test</a>', + clean('<a onclick="evil" href="test">test</a>')) + eq_('<img src="test">', + clean('<img onclick="evil" src="test" />', + tags=IMG, attributes=IMG_ATTR)) + eq_('<img src="test">', + clean('<img href="invalid" src="test" />', + tags=IMG, attributes=IMG_ATTR)) + + +def test_unquoted_attr(): + eq_('<abbr title="mytitle">myabbr</abbr>', + clean('<abbr title=mytitle>myabbr</abbr>')) + + +def test_unquoted_event_handler(): + eq_('<a href="http://xx.com">xx.com</a>', + clean('<a href="http://xx.com" onclick=foo()>xx.com</a>')) + + +def test_invalid_attr_value(): + eq_('<img src="javascript:alert(\'XSS\');">', + clean('<img src="javascript:alert(\'XSS\');">')) + + +def test_invalid_href_attr(): + eq_('<a>xss</a>', + clean('<a href="javascript:alert(\'XSS\')">xss</a>')) + + +def test_invalid_filter_attr(): + IMG = ['img', ] + IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"} + + eq_('<img src="http://example.com/">', + clean('<img onclick="evil" src="http://example.com/" />', + tags=IMG, attributes=IMG_ATTR)) + + eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />', + tags=IMG, attributes=IMG_ATTR)) + + +def test_invalid_tag_char(): + eq_('<script xss="" src="http://xx.com/xss.js"></script>', + clean('<script/xss src="http://xx.com/xss.js"></script>')) + eq_('<script src="http://xx.com/xss.js"></script>', + clean('<script/src="http://xx.com/xss.js"></script>')) + + +def test_unclosed_tag(): + eq_('<script src="http://xx.com/xss.js&lt;b">', + clean('<script src=http://xx.com/xss.js<b>')) + eq_('<script src="http://xx.com/xss.js" <b="">', + clean('<script src="http://xx.com/xss.js"<b>')) + eq_('<script src="http://xx.com/xss.js" <b="">', + clean('<script src="http://xx.com/xss.js" <b>')) + + +def test_strip(): + """Using strip=True shouldn't result in malicious content.""" + s = '<scri<script>pt>alert(1)</scr</script>ipt>' + eq_('pt>alert(1)ipt>', clean(s, strip=True)) + s = 
'<scri<scri<script>pt>pt>alert(1)</script>' + eq_('pt>pt>alert(1)', clean(s, strip=True)) + + +def test_nasty(): + """Nested, broken up, multiple tags, are still foiled!""" + test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</' + '<script></script>script<del></del>>') + expect = (u'<scr<script></script>ipt type="text/javascript"' + u'>alert("foo");</script>script<del></del>' + u'>') + eq_(expect, clean(test)) + + +def test_poster_attribute(): + """Poster attributes should not allow javascript.""" + tags = ['video'] + attrs = {'video': ['poster']} + test = '<video poster="javascript:alert(1)"></video>' + expect = '<video></video>' + eq_(expect, clean(test, tags=tags, attributes=attrs)) + ok = '<video poster="/foo.png"></video>' + eq_(ok, clean(ok, tags=tags, attributes=attrs)) diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py new file mode 100644 index 0000000..67123cc --- /dev/null +++ b/bleach/tests/test_unicode.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +from nose.tools import eq_ + +from bleach import clean, linkify + + +def test_japanese_safe_simple(): + eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル')) + eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル')) + + +def test_japanese_strip(): + eq_(u'<em>ヘルプとチュートリアル</em>', + clean(u'<em>ヘルプとチュートリアル</em>')) + eq_(u'<span>ヘルプとチュートリアル</span>', + clean(u'<span>ヘルプとチュートリアル</span>')) + + +def test_russian_simple(): + eq_(u'Домашняя', clean(u'Домашняя')) + eq_(u'Домашняя', linkify(u'Домашняя')) + + +def test_mixed(): + eq_(u'Домашняяヘルプとチュートリアル', + clean(u'Домашняяヘルプとチュートリアル')) + + +def test_mixed_linkify(): + eq_(u'Домашняя <a href="http://example.com" rel="nofollow">' + u'http://example.com</a> ヘルプとチュートリアル', + linkify(u'Домашняя http://example.com ヘルプとチュートリアル')) + + +def test_url_utf8(): + """Allow UTF8 characters in URLs themselves.""" + out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>' + + tests = ( + ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}), + 
('http://éxámplé.com/íàñá/', + out % {'url': u'http://éxámplé.com/íàñá/'}), + ('http://éxámplé.com/íàñá/?foo=bar', + out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}), + ('http://éxámplé.com/íàñá/?fóo=bár', + out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}), + ) + + def check(test, expected_output): + eq_(expected_output, linkify(test)) + + for test, expected_output in tests: + yield check, test, expected_output diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c525a9e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +# These are the requirements to run the test suite. +nose +html5lib diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..115d811 --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +from setuptools import setup, find_packages + +setup( + name='bleach', + version='1.1.5', + description='An easy whitelist-based HTML-sanitizing tool.', + long_description=open('README.rst').read(), + author='James Socol', + author_email='james@mozilla.com', + url='http://github.com/jsocol/bleach', + license='BSD', + packages=find_packages(), + include_package_data=True, + package_data={'': ['README.rst']}, + zip_safe=False, + install_requires=['html5lib>=0.95'], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Web Environment', + 'Environment :: Web Environment :: Mozilla', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + ] +) |