author     Per Andersson <avtobiff@gmail.com>    2012-09-07 02:45:18 +0200
committer  Per Andersson <avtobiff@gmail.com>    2012-09-07 02:45:18 +0200
commit     38dc3b8f231cf36bcc771001318556d9e84c2889 (patch)
tree       e2507fa7d649f5f505bc3544b961d8614336d4cc
Imported Upstream version 1.1.5 (tag: upstream/1.1.5)
-rw-r--r--  .gitignore                      |    8
-rw-r--r--  .travis.yml                     |    6
-rw-r--r--  CONTRIBUTORS                    |    1
-rw-r--r--  LICENSE                         |   28
-rw-r--r--  MANIFEST.in                     |    2
-rw-r--r--  README.rst                      |  159
-rw-r--r--  bleach/__init__.py              |  342
-rw-r--r--  bleach/encoding.py              |   54
-rw-r--r--  bleach/sanitizer.py             |  143
-rw-r--r--  bleach/tests/__init__.py        |    0
-rw-r--r--  bleach/tests/test_basics.py     |  170
-rw-r--r--  bleach/tests/test_css.py        |   85
-rw-r--r--  bleach/tests/test_delinkify.py  |  109
-rw-r--r--  bleach/tests/test_links.py      |  312
-rw-r--r--  bleach/tests/test_security.py   |  108
-rw-r--r--  bleach/tests/test_unicode.py    |   54
-rw-r--r--  requirements.txt                |    3
-rw-r--r--  setup.py                        |   27
18 files changed, 1611 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6714ae6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.pyo
+*.pyc
+pip-log.txt
+.coverage
+dist
+*.egg-info
+.noseids
+build
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e767f15
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: python
+python:
+ - "2.6"
+ - "2.7"
+install: pip install -Ur requirements.txt --use-mirrors
+script: nosetests
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f612983
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+See https://github.com/jsocol/bleach/contributors
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b2df30c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2010, Mozilla Foundation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. Neither the name of bleach nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9d5d250
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE
+include README.rst
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..08dfc50
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,159 @@
+======
+Bleach
+======
+
+Bleach is an HTML sanitizing library that escapes or strips markup and
+attributes based on a white list. Bleach can also linkify text safely, applying
+filters that Django's ``urlize`` filter cannot, and optionally setting ``rel``
+attributes, even on links already in the text.
+
+Bleach is intended for sanitizing text from *untrusted* sources. If you find
+yourself jumping through hoops to allow your site administrators to do lots of
+things, you're probably outside the use cases. Either trust those users, or
+don't.
+
+Because it relies on html5lib_, Bleach is as good as modern browsers at dealing
+with weird, quirky HTML fragments. And *any* of Bleach's methods will fix
+unbalanced or mis-nested tags.
+
+The version on `github <http://github.com/jsocol/bleach>`_ is the most
+up-to-date and contains the latest bug fixes.
+
+
+Basic Use
+=========
+
+The simplest way to use Bleach is::
+
+ >>> import bleach
+
+ >>> bleach.clean('an <script>evil()</script> example')
+ u'an &lt;script&gt;evil()&lt;/script&gt; example'
+
+ >>> bleach.linkify('an http://example.com url')
+ u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url'
+
+ >>> bleach.delinkify('a <a href="http://ex.mp">link</a>')
+ u'a link'
+
+*NB*: Bleach always returns a ``unicode`` object, whether you give it a
+bytestring or a ``unicode`` object. However, Bleach does not attempt to
+detect the encoding of incoming bytestrings and assumes UTF-8. If you are
+using a different character encoding, decode the bytestring to ``unicode``
+yourself before passing the text to Bleach, as in the sketch below.
+
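+A minimal sketch (the latin-1 byte string is invented for illustration)::
+
+ >>> data = 'caf\xe9' # latin-1 bytes, not UTF-8
+ >>> bleach.clean(data.decode('latin-1'))
+ u'caf\xe9'
+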
+
+Customizing Bleach
+==================
+
+``clean()``, ``linkify()`` and ``delinkify()`` can take several optional
+keyword arguments to customize their behavior.
+
+
+``clean()``
+-----------
+
+``bleach.clean()`` is the primary tool in Bleach. It uses html5lib_ to parse
+a document fragment into a tree and sanitizes it during tokenizing, which is
+far more robust than regular-expression-based sanitization. An example
+combining several of the arguments below follows the list.
+
+``tags``
+ A whitelist of HTML tags. Must be a list. Defaults to
+ ``bleach.ALLOWED_TAGS``.
+``attributes``
+ A whitelist of HTML attributes. Either a list, in which case the listed
+ attributes are allowed on all elements, or a dict with tag names as keys and
+ lists of allowed attributes as values (``'*'`` is a wildcard key that allows
+ an attribute on any tag). Instead of a list it is also possible to pass a
+ callable that accepts an attribute name and value and returns ``True`` or
+ ``False``. Defaults to ``bleach.ALLOWED_ATTRIBUTES``.
+``styles``
+ A whitelist of allowed CSS properties within a ``style`` attribute. (Note
+ that ``style`` attributes are not allowed by default.) Must be a list.
+ Defaults to ``[]``.
+``strip``
+ Strip disallowed HTML instead of escaping it. A boolean. Defaults to
+ ``False``.
+``strip_comments``
+ Strip HTML comments. A boolean. Defaults to ``True``.
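+
+Combining several of these arguments (a sketch mirroring the test suite; the
+``IMG_ATTR`` callable is illustrative)::
+
+ >>> bleach.clean('a <br/><span style="color:red">test</span>',
+ ... tags=['span', 'br'], attributes={'span': ['style']})
+ u'a <br><span style="">test</span>'
+
+ >>> IMG_ATTR = {'img': lambda name, value: name == 'src'}
+ >>> bleach.clean('<img onclick="evil" src="http://example.com/" />',
+ ... tags=['img'], attributes=IMG_ATTR)
+ u'<img src="http://example.com/">'
+
+Note that ``color: red`` is dropped because no ``styles`` whitelist was given.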
+
+
+``linkify()``
+-------------
+
+``bleach.linkify()`` finds things that look like URLs or (optionally) email
+addresses and turns them into links. It does this carefully, only looking in
+text nodes and never within ``<a>`` tags.
+
+Several options affect the output, and some of them are also applied to links
+already found in the text. They are designed to let you set attributes like
+``rel="nofollow"`` or ``target``, or push outgoing links through a
+redirection URL, and to do this to links already in the text as well; see the
+examples after this list.
+
+``nofollow``
+ Add ``rel="nofollow"`` to non-relative links (both created by ``linkify()``
+ and those already present in the text). Defaults to ``True``.
+``filter_url``
+ A callable through which the ``href`` attribute of links (both created by
+ ``linkify()`` and already present in the text) will be passed. Must accept a
+ single argument and return a string.
+``filter_text``
+ A callable through which the text of links (only those created by
+ ``linkify``) will be passed. Must accept a single argument and return a
+ string.
+``skip_pre``
+ Do not create new links inside ``<pre>`` sections. Still follows
+ ``nofollow``. Defaults to ``False``.
+``parse_email``
+ Linkify email addresses with ``mailto:``. Defaults to ``False``.
+``target``
+ Set a ``target`` attribute on links. Like ``nofollow``, if ``target`` is not
+ ``None``, will set the attribute on links already in the text, as well.
+ Defaults to ``None``.
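+
+Two sketches mirroring the test suite (``bouncer`` is a hypothetical
+redirector)::
+
+ >>> bleach.linkify(u'example.com', target='_blank', nofollow=False)
+ u'<a href="http://example.com" target="_blank">example.com</a>'
+
+ >>> import urllib
+ >>> def filter_url(url):
+ ... return u'http://bouncer/?u=%s' % urllib.quote_plus(url)
+ ...
+ >>> bleach.linkify('http://example.com', filter_url=filter_url)
+ u'<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">http://example.com</a>'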
+
+
+``delinkify()``
+---------------
+
+``bleach.delinkify()`` is essentially the opposite of ``linkify()``. It
+strips links out of text except, optionally, relative links and links to
+domains you've whitelisted. An example follows the options below.
+
+``allow_domains``
+ Allow links to the domains in this list. Set to ``None`` or an empty list to
+ disallow all non-relative domains. See below for wildcards. Defaults to
+ ``None``.
+``allow_relative``
+ Allow relative links (i.e. those with no hostname). Defaults to ``False``.
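+
+For example, allowing relative links (mirroring the test suite)::
+
+ >>> bleach.delinkify('some <a href="/foo/bar">link</a>',
+ ... allow_relative=True)
+ u'some <a href="/foo/bar">link</a>'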
+
+
+Wildcards
+^^^^^^^^^
+
+To allow links to a domain and its subdomains, ``allow_domains`` accepts two
+types of wildcard arguments in domains:
+
+``*``
+ Allow a single level of subdomain. This can appear anywhere in the hostname,
+ even the TLD, so you can, for example, allow links to ``example.*``.
+ ``*.example.com`` matches both ``foo.example.com`` and ``example.com``.
+ ::
+ >>> delinkify('<a href="http://foo.ex.mp">bar</a>', \
+ ... allow_domains=['*.ex.*'])
+ u'<a href="http://foo.ex.mp">bar</a>'
+ >>> delinkify('<a href="http://ex.mp">bar</a>', allow_domains=['*.ex.mp'])
+ u'<a href="http://ex.mp">bar</a>'
+``**``
+ To allow any number of *preceding* subdomains, you can start a hostname with
+ ``**``. Note that unlike ``*``, ``**`` may only appear once, and only at the
+ beginning of a hostname.
+ ::
+ >>> delinkify('<a href="http://a.b.ex.mp">t</a>', \
+ ... allow_domains=['**.ex.mp'])
+ u'<a href="http://a.b.ex.mp">t</a>'
+ If ``**`` appears anywhere but the beginning of a hostname, ``delinkify``
+ will raise ``bleach.ValidationError`` (a ``ValueError`` subclass, for easy
+ catching).
+
+.. _html5lib: http://code.google.com/p/html5lib/
diff --git a/bleach/__init__.py b/bleach/__init__.py
new file mode 100644
index 0000000..bc8e49c
--- /dev/null
+++ b/bleach/__init__.py
@@ -0,0 +1,342 @@
+import itertools
+import logging
+import re
+import sys
+import urlparse
+
+import html5lib
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer.htmlserializer import HTMLSerializer
+
+from encoding import force_unicode
+from sanitizer import BleachSanitizer
+
+
+VERSION = (1, 1, 5)
+__version__ = '.'.join(map(str, VERSION))
+
+__all__ = ['clean', 'linkify']
+
+log = logging.getLogger('bleach')
+
+ALLOWED_TAGS = [
+ 'a',
+ 'abbr',
+ 'acronym',
+ 'b',
+ 'blockquote',
+ 'code',
+ 'em',
+ 'i',
+ 'li',
+ 'ol',
+ 'strong',
+ 'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+ 'a': ['href', 'title'],
+ 'abbr': ['title'],
+ 'acronym': ['title'],
+}
+
+ALLOWED_STYLES = []
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+ ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+ cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+ dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+ gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+ im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+ kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+ ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+ net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro
+ ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so
+ sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt
+ tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
+ zw""".split()
+
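+# Reverse the (alphabetized) list so that longer entries are tried before
+# their prefixes in the regex alternation below, e.g. 'com' before 'co'.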
+TLDS.reverse()
+
+url_re = re.compile(
+ r"""\(* # Match any opening parentheses.
+ \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
+ ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
+ (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+ # /path/zz (excluding "unsafe" chars from RFC 1738,
+ # except for # and ~, which happen in practice)
+ """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+
+proto_re = re.compile(r'^[\w-]+:/{0,3}')
+
+punct_re = re.compile(r'([\.,]+)$')
+
+email_re = re.compile(
+ r"""(?<!//)
+ (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
+ (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom
+ |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+ |\\[\001-\011\013\014\016-\177])*" # quoted-string
+ )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain
+ """,
+ re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
+
+identity = lambda x: x # The identity function.
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+ styles=ALLOWED_STYLES, strip=False, strip_comments=True):
+ """Clean an HTML fragment and return it"""
+ if not text:
+ return u''
+
+ text = force_unicode(text)
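+ # html5lib has trouble with fragments that begin with an HTML comment;
+ # pad them with a space, which the .strip() on return removes again.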
+ if text.startswith(u'<!--'):
+ text = u' ' + text
+
+ class s(BleachSanitizer):
+ allowed_elements = tags
+ allowed_attributes = attributes
+ allowed_css_properties = styles
+ strip_disallowed_elements = strip
+ strip_html_comments = strip_comments
+
+ parser = html5lib.HTMLParser(tokenizer=s)
+
+ return _render(parser.parseFragment(text)).strip()
+
+
+def linkify(text, nofollow=True, target=None, filter_url=identity,
+ filter_text=identity, skip_pre=False, parse_email=False,
+ tokenizer=HTMLSanitizer):
+ """Convert URL-like strings in an HTML fragment to links.
+
+ linkify() converts strings that look like URLs or domain names in a
+ blob of text that may be an HTML fragment to links, while preserving
+ (a) links already in the string, (b) urls found in attributes, and
+ (c) email addresses.
+
+ If the nofollow argument is True (the default) then rel="nofollow"
+ will be added to links created by linkify() as well as links already
+ found in the text.
+
+ The target argument will optionally add a target attribute with the
+ given value to links created by linkify() as well as links already
+ found in the text.
+
+ linkify() uses up to two filters on each link. For links created by
+ linkify(), the href attribute is passed through filter_url()
+ and the text of the link is passed through filter_text(). For links
+ already found in the document, the href attribute is passed through
+ filter_url(), but the text is untouched.
+ """
+ text = force_unicode(text)
+
+ if not text:
+ return u''
+
+ parser = html5lib.HTMLParser(tokenizer=tokenizer)
+
+ forest = parser.parseFragment(text)
+
+ if nofollow:
+ rel = u'rel="nofollow"'
+ else:
+ rel = u''
+
+ def replace_nodes(tree, new_frag, node):
+ new_tree = parser.parseFragment(new_frag)
+ for n in new_tree.childNodes:
+ tree.insertBefore(n, node)
+ tree.removeChild(node)
+
+ def strip_wrapping_parentheses(fragment):
+ """Strips wrapping parentheses.
+
+ Returns a tuple of the following format::
+
+ (string stripped from wrapping parentheses,
+ count of stripped opening parentheses,
+ count of stripped closing parentheses)
+ """
+ opening_parentheses = closing_parentheses = 0
+ # Count consecutive opening parentheses
+ # at the beginning of the fragment (string).
+ for char in fragment:
+ if char == '(':
+ opening_parentheses += 1
+ else:
+ break
+
+ if opening_parentheses:
+ newer_frag = ''
+ # Cut the consecutive opening brackets from the fragment.
+ fragment = fragment[opening_parentheses:]
+ # Reverse the fragment for easier detection of parentheses
+ # inside the URL.
+ reverse_fragment = fragment[::-1]
+ skip = False
+ for char in reverse_fragment:
+ # Remove the closing parentheses if it has a matching
+ # opening parentheses (they are balanced).
+ if (char == ')' and
+ closing_parentheses < opening_parentheses and
+ not skip):
+ closing_parentheses += 1
+ continue
+ # Do not remove ')' from the URL itself.
+ elif char != ')':
+ skip = True
+ newer_frag += char
+ fragment = newer_frag[::-1]
+
+ return fragment, opening_parentheses, closing_parentheses
+
+ def linkify_nodes(tree, parse_text=True):
+ for node in tree.childNodes:
+ if node.type == NODE_TEXT and parse_text:
+ new_frag = node.toxml()
+ if parse_email:
+ new_frag = re.sub(email_re, email_repl, new_frag)
+ if new_frag != node.toxml():
+ replace_nodes(tree, new_frag, node)
+ linkify_nodes(tree)
+ continue
+ new_frag = re.sub(url_re, link_repl, new_frag)
+ replace_nodes(tree, new_frag, node)
+ elif node.name == 'a':
+ if 'href' in node.attributes:
+ if nofollow:
+ node.attributes['rel'] = 'nofollow'
+ if target is not None:
+ node.attributes['target'] = target
+ href = node.attributes['href']
+ node.attributes['href'] = filter_url(href)
+ elif skip_pre and node.name == 'pre':
+ linkify_nodes(node, False)
+ else:
+ linkify_nodes(node)
+
+ def email_repl(match):
+ repl = u'<a href="mailto:%(mail)s">%(mail)s</a>'
+ return repl % {'mail': match.group(0).replace('"', '&quot;')}
+
+ def link_repl(match):
+ url = match.group(0)
+ open_brackets = close_brackets = 0
+ if url.startswith('('):
+ url, open_brackets, close_brackets = (
+ strip_wrapping_parentheses(url)
+ )
+ end = u''
+ m = re.search(punct_re, url)
+ if m:
+ end = m.group(0)
+ url = url[0:m.start()]
+ if re.search(proto_re, url):
+ href = url
+ else:
+ href = u''.join([u'http://', url])
+
+ repl = u'%s<a href="%s" %s>%s</a>%s%s'
+
+ attribs = [rel]
+ if target is not None:
+ attribs.append('target="%s"' % target)
+
+ return repl % ('(' * open_brackets,
+ filter_url(href), ' '.join(attribs), filter_text(url),
+ end, ')' * close_brackets)
+
+ linkify_nodes(forest)
+
+ return _render(forest)
+
+
+def delinkify(text, allow_domains=None, allow_relative=False):
+ """Remove links from text, except those allowed to stay."""
+ text = force_unicode(text)
+ if not text:
+ return u''
+
+ parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
+ forest = parser.parseFragment(text)
+
+ if allow_domains is None:
+ allow_domains = []
+ elif isinstance(allow_domains, basestring):
+ allow_domains = [allow_domains]
+
+ def delinkify_nodes(tree):
+ """Remove <a> tags and replace them with their contents."""
+ for node in tree.childNodes:
+ if node.name == 'a':
+ if 'href' not in node.attributes:
+ continue
+ parts = urlparse.urlparse(node.attributes['href'])
+ host = parts.hostname
+ if any(_domain_match(host, d) for d in allow_domains):
+ continue
+ if host is None and allow_relative:
+ continue
+ # Replace the node with its children.
+ # You can't nest <a> tags, and html5lib takes care of that
+ # for us in the tree-building step.
+ for n in node.childNodes:
+ tree.insertBefore(n, node)
+ tree.removeChild(node)
+ elif node.type != NODE_TEXT: # Don't try to delinkify text.
+ delinkify_nodes(node)
+
+ delinkify_nodes(forest)
+ return _render(forest)
+
+
+def _domain_match(test, compare):
+ test = test.lower()
+ compare = compare.lower()
+ if '*' not in compare:
+ return test == compare
+ c = compare.split('.')[::-1]
+ if '**' in c and (c.count('**') > 1 or not compare.startswith('**')):
+ raise ValidationError(
+ 'Only 1 ** is allowed, and must start the domain.')
+ t = test.split('.')[::-1]
+ z = itertools.izip_longest(c, t)
+ for c, t in z:
+ if c == t:
+ continue
+ elif c == '*':
+ continue
+ elif c == '**':
+ return True
+ return False
+ # Got all the way through and everything matched.
+ return True
+
+
+class ValidationError(ValueError):
+ pass
+
+
+def _render(tree):
+ """Try rendering as HTML, then XML, then give up."""
+ try:
+ return force_unicode(_serialize(tree))
+ except Exception, e:
+ log.error('HTML: %r' % e, exc_info=sys.exc_info())
+ try:
+ return force_unicode(tree.toxml())
+ except Exception, e:
+ log.error('XML: %r' % e, exc_info=sys.exc_info())
+ return u''
+
+
+def _serialize(domtree):
+ walker = html5lib.treewalkers.getTreeWalker('simpletree')
+ stream = walker(domtree)
+ serializer = HTMLSerializer(quote_attr_values=True,
+ omit_optional_tags=False)
+ return serializer.render(stream)
diff --git a/bleach/encoding.py b/bleach/encoding.py
new file mode 100644
index 0000000..b9a989d
--- /dev/null
+++ b/bleach/encoding.py
@@ -0,0 +1,54 @@
+import datetime
+from decimal import Decimal
+import types
+
+
+def is_protected_type(obj):
+ """Determine if the object instance is of a protected type.
+
+ Objects of protected types are preserved as-is when passed to
+ force_unicode(strings_only=True).
+ """
+ return isinstance(obj, (
+ types.NoneType,
+ int, long,
+ datetime.datetime, datetime.date, datetime.time,
+ float, Decimal)
+ )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_unicode, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ if strings_only and is_protected_type(s):
+ return s
+ try:
+ if not isinstance(s, basestring):
+ if hasattr(s, '__unicode__'):
+ s = unicode(s)
+ else:
+ try:
+ s = unicode(str(s), encoding, errors)
+ except UnicodeEncodeError:
+ if not isinstance(s, Exception):
+ raise
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII data without special
+ # handling to display as a string. We need to handle this
+ # without raising a further exception. We do an
+ # approximation to what the Exception's standard str()
+ # output should be.
+ s = ' '.join([force_unicode(arg, encoding, strings_only,
+ errors) for arg in s])
+ elif not isinstance(s, unicode):
+ # Note: We use .decode() here, instead of unicode(s, encoding,
+ # errors), so that if s is a SafeString, it ends up being a
+ # SafeUnicode at the end.
+ s = s.decode(encoding, errors)
+ except UnicodeDecodeError, e:
+ raise UnicodeDecodeError(*e.args)
+ return s
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
new file mode 100644
index 0000000..677287e
--- /dev/null
+++ b/bleach/sanitizer.py
@@ -0,0 +1,143 @@
+import re
+from xml.sax.saxutils import escape, unescape
+
+from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
+
+
+class BleachSanitizerMixin(HTMLSanitizerMixin):
+ """Mixin to replace sanitize_token() and sanitize_css()."""
+
+ allowed_svg_properties = []
+ # TODO: When the next html5lib version comes out, nuke this.
+ attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
+
+ def sanitize_token(self, token):
+ """Sanitize a token either by HTML-encoding or dropping.
+
+ Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
+ a dict mapping tag names to attribute lists or callables, e.g.
+ {'tag': ['attribute', 'names'], 'other-tag': callable}.
+
+ Here the callable is a function taking two arguments, the attribute
+ name and value; it should return True or False.
+
+ Also gives the option to strip tags instead of encoding.
+
+ """
+ if (getattr(self, 'wildcard_attributes', None) is None and
+ isinstance(self.allowed_attributes, dict)):
+ self.wildcard_attributes = self.allowed_attributes.get('*', [])
+
+ if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
+ tokenTypes['EmptyTag']):
+ if token['name'] in self.allowed_elements:
+ if 'data' in token:
+ if isinstance(self.allowed_attributes, dict):
+ allowed_attributes = self.allowed_attributes.get(
+ token['name'], [])
+ if not callable(allowed_attributes):
+ # Use + (not +=) so the caller's attribute list is
+ # not mutated in place on every sanitized token.
+ allowed_attributes = (allowed_attributes +
+ self.wildcard_attributes)
+ else:
+ allowed_attributes = self.allowed_attributes
+ attrs = dict([(name, val) for name, val in
+ token['data'][::-1]
+ if (allowed_attributes(name, val)
+ if callable(allowed_attributes)
+ else name in allowed_attributes)])
+ for attr in self.attr_val_is_uri:
+ if attr not in attrs:
+ continue
+ val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+ unescape(attrs[attr])).lower()
+ # Remove replacement characters from unescaped
+ # characters.
+ val_unescaped = val_unescaped.replace(u"\ufffd", "")
+ if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
+ and (val_unescaped.split(':')[0] not in
+ self.allowed_protocols)):
+ del attrs[attr]
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ if (token['name'] in self.svg_allow_local_href and
+ 'xlink:href' in attrs and
+ re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+ del attrs['xlink:href']
+ if 'style' in attrs:
+ attrs['style'] = self.sanitize_css(attrs['style'])
+ token['data'] = [(name, val) for name, val in
+ attrs.items()]
+ return token
+ elif self.strip_disallowed_elements:
+ pass
+ else:
+ if token['type'] == tokenTypes['EndTag']:
+ token['data'] = '</%s>' % token['name']
+ elif token['data']:
+ attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+ token['data']])
+ token['data'] = '<%s%s>' % (token['name'], attrs)
+ else:
+ token['data'] = '<%s>' % token['name']
+ if token['selfClosing']:
+ token['data'] = token['data'][:-1] + '/>'
+ token['type'] = tokenTypes['Characters']
+ del token["name"]
+ return token
+ elif token['type'] == tokenTypes['Comment']:
+ if not self.strip_html_comments:
+ return token
+ else:
+ return token
+
+ def sanitize_css(self, style):
+ """HTMLSanitizerMixin.sanitize_css replacement.
+
+ HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+ border-*, margin-*, and padding-*. We only whitelist what's in
+ the whitelist.
+
+ """
+ # disallow urls
+ style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+ # gauntlet
+ # TODO: Make sure this does what it's meant to - I *think* it wants to
+ # validate style attribute contents.
+ parts = style.split(';')
+ gauntlet = re.compile("""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
+ """|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+ for part in parts:
+ if not gauntlet.match(part):
+ return ''
+
+ if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+ return ''
+
+ clean = []
+ for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+ if not value:
+ continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
+ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+ lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+ lowercaseElementName, lowercaseAttrName,
+ **kwargs)
+
+ def __iter__(self):
+ for token in HTMLTokenizer.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
diff --git a/bleach/tests/__init__.py b/bleach/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/bleach/tests/__init__.py
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
new file mode 100644
index 0000000..60be11d
--- /dev/null
+++ b/bleach/tests/test_basics.py
@@ -0,0 +1,170 @@
+import html5lib
+from nose.tools import eq_
+
+import bleach
+
+
+def test_empty():
+ eq_('', bleach.clean(''))
+
+
+def test_comments_only():
+ comment = '<!-- this is a comment -->'
+ open_comment = '<!-- this is an open comment'
+ eq_('', bleach.clean(comment))
+ eq_('', bleach.clean(open_comment))
+ eq_(comment, bleach.clean(comment, strip_comments=False))
+ eq_('%s-->' % open_comment, bleach.clean(open_comment,
+ strip_comments=False))
+
+
+def test_with_comments():
+ html = '<!-- comment -->Just text'
+ eq_('Just text', bleach.clean(html))
+ eq_(html, bleach.clean(html, strip_comments=False))
+
+
+def test_no_html():
+ eq_('no html string', bleach.clean('no html string'))
+
+
+def test_allowed_html():
+ eq_('an <strong>allowed</strong> tag',
+ bleach.clean('an <strong>allowed</strong> tag'))
+ eq_('another <em>good</em> tag',
+ bleach.clean('another <em>good</em> tag'))
+
+
+def test_bad_html():
+ eq_('a <em>fixed tag</em>',
+ bleach.clean('a <em>fixed tag'))
+
+
+def test_function_arguments():
+ TAGS = ['span', 'br']
+ ATTRS = {'span': ['style']}
+
+ eq_('a <br><span style="">test</span>',
+ bleach.clean('a <br/><span style="color:red">test</span>',
+ tags=TAGS, attributes=ATTRS))
+
+
+def test_named_arguments():
+ ATTRS = {'a': ['rel', 'href']}
+ s = u'<a href="http://xx.com" rel="alternate">xx.com</a>'
+ eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s))
+ eq_(s, bleach.clean(s, attributes=ATTRS))
+
+
+def test_disallowed_html():
+ eq_('a &lt;script&gt;safe()&lt;/script&gt; test',
+ bleach.clean('a <script>safe()</script> test'))
+ eq_('a &lt;style&gt;body{}&lt;/style&gt; test',
+ bleach.clean('a <style>body{}</style> test'))
+
+
+def test_bad_href():
+ eq_('<em>no link</em>',
+ bleach.clean('<em href="fail">no link</em>'))
+
+
+def test_bare_entities():
+ eq_('an &amp; entity', bleach.clean('an & entity'))
+ eq_('an &lt; entity', bleach.clean('an < entity'))
+ eq_('tag &lt; <em>and</em> entity',
+ bleach.clean('tag < <em>and</em> entity'))
+ eq_('&amp;', bleach.clean('&amp;'))
+
+
+def test_escaped_entities():
+ s = u'&lt;em&gt;strong&lt;/em&gt;'
+ eq_(s, bleach.clean(s))
+
+
+def test_serializer():
+ s = u'<table></table>'
+ eq_(s, bleach.clean(s, tags=['table']))
+ eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>'))
+ eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p']))
+
+
+def test_no_href_links():
+ s = u'<a name="anchor">x</a>'
+ eq_(s, bleach.linkify(s))
+ eq_(s, bleach.linkify(s, nofollow=False))
+
+
+def test_weird_strings():
+ s = '</3'
+ eq_(bleach.clean(s), '')
+
+
+def test_xml_render():
+ parser = html5lib.HTMLParser()
+ eq_(bleach._render(parser.parseFragment('')), '')
+
+
+def test_stripping():
+ eq_('a test <em>with</em> <b>html</b> tags',
+ bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True))
+ eq_('a test <em>with</em> <b>html</b> tags',
+ bleach.clean('a test <em>with</em> <img src="http://example.com/"> '
+ '<b>html</b> tags', strip=True))
+
+ s = '<p><a href="http://example.com/">link text</a></p>'
+ eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True))
+ s = '<p><span>multiply <span>nested <span>text</span></span></span></p>'
+ eq_('<p>multiply nested text</p>', bleach.clean(s, tags=['p'], strip=True))
+
+ s = ('<p><a href="http://example.com/"><img src="http://example.com/">'
+ '</a></p>')
+ eq_('<p><a href="http://example.com/"></a></p>',
+ bleach.clean(s, tags=['p', 'a'], strip=True))
+
+
+def test_allowed_styles():
+ ATTR = ['style']
+ STYLE = ['color']
+ blank = '<b style=""></b>'
+ s = '<b style="color: blue;"></b>'
+ eq_(blank, bleach.clean('<b style="top:0"></b>', attributes=ATTR))
+ eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE))
+ eq_(s, bleach.clean('<b style="top: 0; color: blue;"></b>',
+ attributes=ATTR, styles=STYLE))
+
+
+def test_idempotent():
+ """Make sure that applying the filter twice doesn't change anything."""
+ dirty = u'<span>invalid & </span> < extra http://link.com<em>'
+
+ clean = bleach.clean(dirty)
+ eq_(clean, bleach.clean(clean))
+
+ linked = bleach.linkify(dirty)
+ eq_(linked, bleach.linkify(linked))
+
+
+def test_lowercase_html():
+ """We should output lowercase HTML."""
+ dirty = u'<EM CLASS="FOO">BAR</EM>'
+ clean = u'<em class="FOO">BAR</em>'
+ eq_(clean, bleach.clean(dirty, attributes=['class']))
+
+
+def test_wildcard_attributes():
+ ATTR = {
+ '*': ['id'],
+ 'img': ['src'],
+ }
+ TAG = ['img', 'em']
+ dirty = (u'both <em id="foo" style="color: black">can</em> have '
+ u'<img id="bar" src="foo"/>')
+ clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">'
+ eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+
+
+def test_sarcasm():
+ """Jokes should crash.<sarcasm/>"""
+ dirty = u'Yeah right <sarcasm/>'
+ clean = u'Yeah right &lt;sarcasm/&gt;'
+ eq_(clean, bleach.clean(dirty))
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
new file mode 100644
index 0000000..fdb3f65
--- /dev/null
+++ b/bleach/tests/test_css.py
@@ -0,0 +1,85 @@
+from functools import partial
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+clean = partial(clean, tags=['p'], attributes=['style'])
+
+
+def test_allowed_css():
+ tests = (
+ ('font-family: Arial; color: red; float: left; '
+ 'background-color: red;', 'color: red;', ['color']),
+ ('border: 1px solid blue; color: red; float: left;', 'color: red;',
+ ['color']),
+ ('border: 1px solid blue; color: red; float: left;',
+ 'color: red; float: left;', ['color', 'float']),
+ ('color: red; float: left; padding: 1em;', 'color: red; float: left;',
+ ['color', 'float']),
+ ('color: red; float: left; padding: 1em;', 'color: red;', ['color']),
+ ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']),
+ ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']),
+ ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']),
+ ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", ['text-overflow']),
+ )
+
+ p = '<p style="%s">bar</p>'
+
+ def check(input, output, styles):
+ eq_(p % output, clean(p % input, styles=styles))
+
+ for i, o, s in tests:
+ yield check, i, o, s
+
+
+def test_valid_css():
+ """The sanitizer should fix missing CSS values."""
+ styles = ['color', 'float']
+ eq_('<p style="float: left;">foo</p>',
+ clean('<p style="float: left; color: ">foo</p>', styles=styles))
+ eq_('<p style="">foo</p>',
+ clean('<p style="color: float: left;">foo</p>', styles=styles))
+
+
+def test_style_hang():
+ """The sanitizer should not hang on any inline styles"""
+ # TODO: Neaten this up. It's copypasta from MDN/Kuma to repro the bug
+ style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """
+ """margin-left: 0px; padding-top: 15px; padding-right: 15px; """
+ """padding-bottom: 15px; padding-left: 15px; border-top-width: """
+ """1px; border-right-width: 1px; border-bottom-width: 1px; """
+ """border-left-width: 1px; border-top-style: dotted; """
+ """border-right-style: dotted; border-bottom-style: dotted; """
+ """border-left-style: dotted; border-top-color: rgb(203, 200, """
+ """185); border-right-color: rgb(203, 200, 185); """
+ """border-bottom-color: rgb(203, 200, 185); border-left-color: """
+ """rgb(203, 200, 185); background-image: initial; """
+ """background-attachment: initial; background-origin: initial; """
+ """background-clip: initial; background-color: """
+ """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """
+ """font: normal normal normal 100%/normal 'Courier New', """
+ """'Andale Mono', monospace; background-position: initial """
+ """initial; background-repeat: initial initial;""")
+ html = '<p style="%s">Hello world</p>' % style
+ styles = [
+ 'border', 'float', 'overflow', 'min-height', 'vertical-align',
+ 'white-space',
+ 'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
+ 'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right',
+ 'background',
+ 'background-color',
+ 'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
+ ]
+
+ expected = ("""<p style="margin-top: 0px; margin-right: 0px; """
+ """margin-bottom: 1.286em; margin-left: 0px; padding-top: """
+ """15px; padding-right: 15px; padding-bottom: 15px; """
+ """padding-left: 15px; background-color: """
+ """rgb(246, 246, 242); font: normal normal normal """
+ """100%/normal 'Courier New', 'Andale Mono', monospace;">"""
+ """Hello world</p>""")
+
+ result = clean(html, styles=styles)
+ eq_(expected, result)
diff --git a/bleach/tests/test_delinkify.py b/bleach/tests/test_delinkify.py
new file mode 100644
index 0000000..f216d2f
--- /dev/null
+++ b/bleach/tests/test_delinkify.py
@@ -0,0 +1,109 @@
+from nose.tools import eq_
+
+import bleach
+
+
+def test_delinkify():
+ eq_('test', bleach.delinkify('<a href="http://ex.mp">test</a>'))
+ eq_('footestbar',
+ bleach.delinkify('foo<a href="http://ex.mp">test</a>bar'))
+
+
+def test_whitelist():
+ html = '<a href="http://ex.mp">test</a>'
+ eq_(html, bleach.delinkify(html, allow_domains=['ex.mp']))
+ eq_('test', bleach.delinkify(html, allow_domains=['ex2.mp']))
+ # Allow a single domain as a special case.
+ eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_nested_a():
+ html = '<a href="http://ex.mp">test<a href="http://foo.bar">test</a></a>'
+ eq_('testtest', bleach.delinkify(html))
+ eq_('<a href="http://ex.mp">test</a>test',
+ bleach.delinkify(html, allow_domains=['ex.mp']))
+
+
+def test_nested_tag():
+ html = '<a href="http://ex.mp">test<span>test</span></a>'
+ eq_('test<span>test</span>', bleach.delinkify(html))
+
+
+def test_a_name():
+ """Don't screw with non-link <a> tags."""
+ html = '<a name="foo">bar</a>'
+ eq_(html, bleach.delinkify(html))
+
+
+def test_relative():
+ """Relative links are optionally OK."""
+ html = 'some <a href="/foo/bar">link</a>'
+ eq_('some link', bleach.delinkify(html))
+ eq_(html, bleach.delinkify(html, allow_relative=True))
+
+
+def test_protocol_relative():
+ """Protocol-relative links aren't relative."""
+ html = 'bad <a href="//ex.mp">link</a>'
+ expect = 'bad link'
+ eq_(expect, bleach.delinkify(html))
+ eq_(expect, bleach.delinkify(html, allow_relative=True))
+ eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_domain_match():
+ tests = (
+ ('ex.mp', 'ex.mp', True),
+ ('ex.mp', '*.ex.mp', True),
+ ('test.ex.mp', '*.ex.mp', True),
+ ('test.ex.mp', 'ex.mp', False),
+ ('test.test.ex.mp', '*.ex.mp', False),
+ ('test.test.ex.mp', '**.ex.mp', True),
+ ('wrong.mp', 'ex.mp', False),
+ ('wrong.mp', '*.ex.mp', False),
+ ('really.wrong.mp', 'ex.mp', False),
+ ('really.wrong.mp', '*.ex.mp', False),
+ ('really.very.wrong.mp', '*.ex.mp', False),
+ ('EX.mp', 'ex.mp', True), # Domains are case-insensitive.
+ ('ex.mp', 'an.ex.mp', False),
+ ('ex.mp', '*.an.ex.mp', False),
+ ('an.ex.am.pl', 'an.*.am.pl', True),
+ ('a.ex.am.pl', 'an.*.am.pl', False),
+ ('ex.am.pl', 'an.*.am.pl', False),
+ )
+
+ def _check(t, c, v):
+ eq_(v, bleach._domain_match(t, c))
+
+ for t, c, v in tests:
+ yield _check, t, c, v
+
+
+def test_double_star():
+ assert bleach._domain_match('ex.mp', '**.ex.mp')
+ try:
+ bleach._domain_match('ex.mp', 'an.**.ex.mp')
+ except bleach.ValidationError:
+ pass
+ else:
+ assert False, '_domain_match should not accept an.**.ex.mp'
+
+
+def test_allow_subdomains():
+ domains = ('ex.mp', '*.exa.mp', 'an.exam.pl', '*.my.examp.le')
+ html = (
+ ('<a href="http://an.ex.mp">bad</a>', 'bad'),
+ ('<a href="http://exa.mp">good</a>', None),
+ ('<a href="http://an.exa.mp">good</a>', None),
+ ('<a href="http://an.exam.pl">good</a>', None),
+ ('<a href="http://another.exam.pl">bad</a>', 'bad'),
+ ('<a href="http://a.bad.examp.le">bad</a>', 'bad'),
+ ('<a href="http://a.very.bad.examp.le">bad</a>', 'bad'),
+ )
+
+ def _check(html, text):
+ output = bleach.delinkify(html, allow_domains=domains)
+ eq_(html if text is None else text, output)
+
+ for t, o in html:
+ yield _check, t, o
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
new file mode 100644
index 0000000..7caf006
--- /dev/null
+++ b/bleach/tests/test_links.py
@@ -0,0 +1,312 @@
+import urllib
+
+from html5lib.tokenizer import HTMLTokenizer
+from nose.tools import eq_
+
+from bleach import linkify, url_re
+
+
+def filter_url(url):
+ return u'http://bouncer/?u=%s' % urllib.quote_plus(url)
+
+
+def test_url_re():
+ def no_match(s):
+ match = url_re.search(s)
+ if match:
+ assert not match, 'matched %s' % s[slice(*match.span())]
+ yield no_match, 'just what i am looking for...it'
+
+
+def test_empty():
+ eq_('', linkify(''))
+
+
+def test_simple_link():
+ eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
+ '</a> link',
+ linkify('a http://example.com link'))
+ eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
+ '</a> link',
+ linkify('a https://example.com link'))
+ eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
+ linkify('an example.com link'))
+
+
+def test_trailing_slash():
+ eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
+ linkify('http://example.com/'))
+ eq_('<a href="http://example.com/foo/" rel="nofollow">'
+ 'http://example.com/foo/</a>',
+ linkify('http://example.com/foo/'))
+ eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
+ 'http://example.com/foo/bar/</a>',
+ linkify('http://example.com/foo/bar/'))
+
+
+def test_mangle_link():
+ eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
+ 'http://example.com</a>',
+ linkify('http://example.com', filter_url=filter_url))
+
+
+def test_email_link():
+ eq_('a james@example.com mailto',
+ linkify('a james@example.com mailto'))
+ eq_('a james@example.com.au mailto',
+ linkify('a james@example.com.au mailto'))
+ eq_('a <a href="mailto:james@example.com" rel="nofollow">'
+ 'james@example.com</a> mailto',
+ linkify('a james@example.com mailto', parse_email=True))
+ eq_('aussie <a href="mailto:james@example.com.au" rel="nofollow">'
+ 'james@example.com.au</a> mailto',
+ linkify('aussie james@example.com.au mailto', parse_email=True))
+ eq_('email to <a href="james@example.com" rel="nofollow">'
+ 'james@example.com</a>',
+ linkify('email to <a href="james@example.com">'
+ 'james@example.com</a>', parse_email=True))
+
+
+def test_email_link_escaping():
+ eq_('''<a href='mailto:"james"@example.com' rel="nofollow">'''
+ '''"james"@example.com</a>''',
+ linkify('"james"@example.com', parse_email=True))
+ eq_('''<a href="mailto:&quot;j'ames&quot;@example.com" rel="nofollow">'''
+ '''"j'ames"@example.com</a>''',
+ linkify('"j\'ames"@example.com', parse_email=True))
+ eq_('''<a href='mailto:"ja>mes"@example.com' rel="nofollow">'''
+ '''"ja&gt;mes"@example.com</a>''',
+ linkify('"ja>mes"@example.com', parse_email=True))
+
+
+def test_tlds():
+ eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
+ linkify('example.com'))
+ eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+ linkify('example.co.uk'))
+ eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+ linkify('example.edu'))
+ eq_('example.xxx', linkify('example.xxx'))
+ eq_(' brie', linkify(' brie'))
+ eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+ linkify('bit.ly/fun'))
+
+
+def test_escaping():
+ eq_('&lt; unrelated', linkify('< unrelated'))
+
+
+def test_nofollow_off():
+ eq_('<a href="http://example.com">example.com</a>',
+ linkify(u'example.com', nofollow=False))
+
+
+def test_link_in_html():
+ eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+ linkify('<i>http://yy.com</i>'))
+ eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
+ '</strong></em>',
+ linkify('<em><strong>http://xx.com</strong></em>'))
+
+
+def test_links_https():
+ eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+ linkify('https://yy.com'))
+
+
+def test_add_rel_nofollow():
+ """Verify that rel="nofollow" is added to an existing link"""
+ eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+ linkify('<a href="http://yy.com">http://yy.com</a>'))
+
+
+def test_url_with_path():
+ eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
+ 'http://example.com/path/to/file</a>',
+ linkify('http://example.com/path/to/file'))
+
+
+def test_link_ftp():
+ eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
+ 'ftp://ftp.mozilla.org/some/file</a>',
+ linkify('ftp://ftp.mozilla.org/some/file'))
+
+
+def test_link_query():
+ eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ 'http://xx.com/?test=win</a>',
+ linkify('http://xx.com/?test=win'))
+ eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+ 'xx.com/?test=win</a>',
+ linkify('xx.com/?test=win'))
+ eq_('<a href="http://xx.com?test=win" rel="nofollow">'
+ 'xx.com?test=win</a>',
+ linkify('xx.com?test=win'))
+
+
+def test_link_fragment():
+ eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
+ 'http://xx.com/path#frag</a>',
+ linkify('http://xx.com/path#frag'))
+
+
+def test_link_entities():
+ eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
+ 'http://xx.com/?a=1&amp;b=2</a>',
+ linkify('http://xx.com/?a=1&b=2'))
+
+
+def test_escaped_html():
+ """If I pass in escaped HTML, it should probably come out escaped."""
+ s = '&lt;em&gt;strong&lt;/em&gt;'
+ eq_(s, linkify(s))
+
+
+def test_link_http_complete():
+ eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
+ '&amp;e#f" rel="nofollow">'
+ 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
+ linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
+
+
+def test_non_url():
+ """document.vulnerable should absolutely not be linkified."""
+ s = 'document.vulnerable'
+ eq_(s, linkify(s))
+
+
+def test_javascript_url():
+ """javascript: urls should never be linkified."""
+ s = 'javascript:document.vulnerable'
+ eq_(s, linkify(s))
+
+
+def test_unsafe_url():
+ """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
+ eq_('All your{"<a href="http://xx.yy.com/grover.png" '
+ 'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+ linkify('All your{"xx.yy.com/grover.png"}base are'))
+
+
+def test_skip_pre():
+ """Skip linkification in <pre> tags."""
+ simple = 'http://xx.com <pre>http://xx.com</pre>'
+ linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+ '<pre>http://xx.com</pre>')
+ all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+ '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
+ '</a></pre>')
+ eq_(linked, linkify(simple, skip_pre=True))
+ eq_(all_linked, linkify(simple))
+
+ already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
+ nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
+ eq_(nofollowed, linkify(already_linked))
+ eq_(nofollowed, linkify(already_linked, skip_pre=True))
+
+
+def test_libgl():
+ """libgl.so.1 should not be linkified."""
+ eq_('libgl.so.1', linkify('libgl.so.1'))
+
+
+def test_end_of_sentence():
+ """example.com. should match."""
+ out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
+ in_ = u'%s%s'
+
+ def check(u, p):
+ eq_(out % (u, u, p), linkify(in_ % (u, p)))
+
+ tests = (
+ ('example.com', '.'),
+ ('example.com', '...'),
+ ('ex.com/foo', '.'),
+ ('ex.com/foo', '....'),
+ )
+
+ for u, p in tests:
+ yield check, u, p
+
+
+def test_end_of_clause():
+ """example.com/foo, shouldn't include the ,"""
+ eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+ linkify('ex.com/foo, bar'))
+
+
+def test_sarcasm():
+ """Jokes should crash.<sarcasm/>"""
+ dirty = u'Yeah right <sarcasm/>'
+ clean = u'Yeah right &lt;sarcasm/&gt;'
+ eq_(clean, linkify(dirty))
+
+
+def test_wrapping_parentheses():
+ """URLs wrapped in parantheses should not include them."""
+ out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
+
+ tests = (
+ ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
+ ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
+ ('(example.com/foo)', out % ('(', 'example.com/foo',
+ 'example.com/foo', ')')),
+ ('(((example.com/))))', out % ('(((', 'example.com/)',
+ 'example.com/)', ')))')),
+ ('example.com/))', out % ('', 'example.com/))',
+ 'example.com/))', '')),
+ ('http://en.wikipedia.org/wiki/Test_(assessment)',
+ out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
+ ('(http://en.wikipedia.org/wiki/Test_(assessment))',
+ out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
+ 'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
+ ('((http://en.wikipedia.org/wiki/Test_(assessment))',
+ out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
+ 'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
+ ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
+ out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
+ 'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
+ ('(http://en.wikipedia.org/wiki/)Test_(assessment',
+ out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
+ 'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
+ )
+
+ def check(test, expected_output):
+ eq_(expected_output, linkify(test))
+
+ for test, expected_output in tests:
+ yield check, test, expected_output
+
+
+def test_ports():
+ """URLs can contain port numbers."""
+ tests = (
+ ('http://foo.com:8000', ('http://foo.com:8000', '')),
+ ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
+ ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
+ ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
+ ('http://foo.com:', ('http://foo.com', ':')),
+ )
+
+ def check(test, output):
+ eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
+ linkify(test))
+
+ for test, output in tests:
+ yield check, test, output
+
+
+def test_target():
+ eq_('<a href="http://example.com" rel="nofollow" '
+ 'target="_blank">example.com</a>',
+ linkify(u'example.com', target='_blank'))
+ eq_('<a href="http://example.com" target="_blank">example.com</a>',
+ linkify(u'example.com', target='_blank', nofollow=False))
+
+
+def test_tokenizer():
+ """Linkify doesn't always have to sanitize."""
+ raw = '<em>test<x></x></em>'
+ eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
+ eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
new file mode 100644
index 0000000..9e9bb7b
--- /dev/null
+++ b/bleach/tests/test_security.py
@@ -0,0 +1,108 @@
+"""More advanced security tests"""
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+def test_nested_script_tag():
+ eq_('&lt;&lt;script&gt;script&gt;evil()&lt;&lt;/script&gt;/script&gt;',
+ clean('<<script>script>evil()<</script>/script>'))
+ eq_('&lt;&lt;x&gt;script&gt;evil()&lt;&lt;/x&gt;/script&gt;',
+ clean('<<x>script>evil()<</x>/script>'))
+
+
+def test_nested_script_tag_r():
+ eq_('&lt;script&lt;script&gt;&gt;evil()&lt;/script&lt;&gt;&gt;',
+ clean('<script<script>>evil()</script</script>>'))
+
+
+def test_invalid_attr():
+ IMG = ['img', ]
+ IMG_ATTR = ['src']
+
+ eq_('<a href="test">test</a>',
+ clean('<a onclick="evil" href="test">test</a>'))
+ eq_('<img src="test">',
+ clean('<img onclick="evil" src="test" />',
+ tags=IMG, attributes=IMG_ATTR))
+ eq_('<img src="test">',
+ clean('<img href="invalid" src="test" />',
+ tags=IMG, attributes=IMG_ATTR))
+
+
+def test_unquoted_attr():
+ eq_('<abbr title="mytitle">myabbr</abbr>',
+ clean('<abbr title=mytitle>myabbr</abbr>'))
+
+
+def test_unquoted_event_handler():
+ eq_('<a href="http://xx.com">xx.com</a>',
+ clean('<a href="http://xx.com" onclick=foo()>xx.com</a>'))
+
+
+def test_invalid_attr_value():
+ eq_('&lt;img src="javascript:alert(\'XSS\');"&gt;',
+ clean('<img src="javascript:alert(\'XSS\');">'))
+
+
+def test_invalid_href_attr():
+ eq_('<a>xss</a>',
+ clean('<a href="javascript:alert(\'XSS\')">xss</a>'))
+
+
+def test_invalid_filter_attr():
+ IMG = ['img', ]
+ IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"}
+
+ eq_('<img src="http://example.com/">',
+ clean('<img onclick="evil" src="http://example.com/" />',
+ tags=IMG, attributes=IMG_ATTR))
+
+ eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />',
+ tags=IMG, attributes=IMG_ATTR))
+
+
+def test_invalid_tag_char():
+ eq_('&lt;script xss="" src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+ clean('<script/xss src="http://xx.com/xss.js"></script>'))
+ eq_('&lt;script src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+ clean('<script/src="http://xx.com/xss.js"></script>'))
+
+
+def test_unclosed_tag():
+ eq_('&lt;script src="http://xx.com/xss.js&amp;lt;b"&gt;',
+ clean('<script src=http://xx.com/xss.js<b>'))
+ eq_('&lt;script src="http://xx.com/xss.js" &lt;b=""&gt;',
+ clean('<script src="http://xx.com/xss.js"<b>'))
+ eq_('&lt;script src="http://xx.com/xss.js" &lt;b=""&gt;',
+ clean('<script src="http://xx.com/xss.js" <b>'))
+
+
+def test_strip():
+ """Using strip=True shouldn't result in malicious content."""
+ s = '<scri<script>pt>alert(1)</scr</script>ipt>'
+ eq_('pt&gt;alert(1)ipt&gt;', clean(s, strip=True))
+ s = '<scri<scri<script>pt>pt>alert(1)</script>'
+ eq_('pt&gt;pt&gt;alert(1)', clean(s, strip=True))
+
+
+def test_nasty():
+ """Nested, broken up, multiple tags, are still foiled!"""
+ test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</'
+ '<script></script>script<del></del>>')
+ expect = (u'&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
+ u'&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
+ u'&gt;')
+ eq_(expect, clean(test))
+
+
+def test_poster_attribute():
+ """Poster attributes should not allow javascript."""
+ tags = ['video']
+ attrs = {'video': ['poster']}
+ test = '<video poster="javascript:alert(1)"></video>'
+ expect = '<video></video>'
+ eq_(expect, clean(test, tags=tags, attributes=attrs))
+ ok = '<video poster="/foo.png"></video>'
+ eq_(ok, clean(ok, tags=tags, attributes=attrs))
diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py
new file mode 100644
index 0000000..67123cc
--- /dev/null
+++ b/bleach/tests/test_unicode.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+from nose.tools import eq_
+
+from bleach import clean, linkify
+
+
+def test_japanese_safe_simple():
+ eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル'))
+ eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル'))
+
+
+def test_japanese_strip():
+ eq_(u'<em>ヘルプとチュートリアル</em>',
+ clean(u'<em>ヘルプとチュートリアル</em>'))
+ eq_(u'&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
+ clean(u'<span>ヘルプとチュートリアル</span>'))
+
+
+def test_russian_simple():
+ eq_(u'Домашняя', clean(u'Домашняя'))
+ eq_(u'Домашняя', linkify(u'Домашняя'))
+
+
+def test_mixed():
+ eq_(u'Домашняяヘルプとチュートリアル',
+ clean(u'Домашняяヘルプとチュートリアル'))
+
+
+def test_mixed_linkify():
+ eq_(u'Домашняя <a href="http://example.com" rel="nofollow">'
+ u'http://example.com</a> ヘルプとチュートリアル',
+ linkify(u'Домашняя http://example.com ヘルプとチュートリアル'))
+
+
+def test_url_utf8():
+ """Allow UTF8 characters in URLs themselves."""
+ out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>'
+
+ tests = (
+ ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}),
+ ('http://éxámplé.com/íàñá/',
+ out % {'url': u'http://éxámplé.com/íàñá/'}),
+ ('http://éxámplé.com/íàñá/?foo=bar',
+ out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}),
+ ('http://éxámplé.com/íàñá/?fóo=bár',
+ out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}),
+ )
+
+ def check(test, expected_output):
+ eq_(expected_output, linkify(test))
+
+ for test, expected_output in tests:
+ yield check, test, expected_output
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c525a9e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+# These are the requirements to run the test suite.
+nose
+html5lib
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..115d811
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,27 @@
+from setuptools import setup, find_packages
+
+setup(
+ name='bleach',
+ version='1.1.5',
+ description='An easy whitelist-based HTML-sanitizing tool.',
+ long_description=open('README.rst').read(),
+ author='James Socol',
+ author_email='james@mozilla.com',
+ url='http://github.com/jsocol/bleach',
+ license='BSD',
+ packages=find_packages(),
+ include_package_data=True,
+ package_data={'': ['README.rst']},
+ zip_safe=False,
+ install_requires=['html5lib>=0.95'],
+ classifiers=[
+ 'Development Status :: 4 - Beta',
+ 'Environment :: Web Environment',
+ 'Environment :: Web Environment :: Mozilla',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: BSD License',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ]
+)