From 38dc3b8f231cf36bcc771001318556d9e84c2889 Mon Sep 17 00:00:00 2001
From: Per Andersson
Date: Fri, 7 Sep 2012 02:45:18 +0200
Subject: Imported Upstream version 1.1.5

---
 .gitignore                     |   8 +
 .travis.yml                    |   6 +
 CONTRIBUTORS                   |   1 +
 LICENSE                        |  28 ++++
 MANIFEST.in                    |   2 +
 README.rst                     | 159 +++++++++++++++++++
 bleach/__init__.py             | 342 +++++++++++++++++++++++++++++++++++++
 bleach/encoding.py             |  54 +++++++
 bleach/sanitizer.py            | 143 +++++++++++++++++
 bleach/tests/__init__.py       |   0
 bleach/tests/test_basics.py    | 170 ++++++++++++++++++++
 bleach/tests/test_css.py       |  85 ++++++++++
 bleach/tests/test_delinkify.py | 109 +++++++++++++
 bleach/tests/test_links.py     | 312 +++++++++++++++++++++++++++++++++++++
 bleach/tests/test_security.py  | 108 +++++++++++++
 bleach/tests/test_unicode.py   |  54 +++++++
 requirements.txt               |   3 +
 setup.py                       |  27 ++++
 18 files changed, 1611 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .travis.yml
 create mode 100644 CONTRIBUTORS
 create mode 100644 LICENSE
 create mode 100644 MANIFEST.in
 create mode 100644 README.rst
 create mode 100644 bleach/__init__.py
 create mode 100644 bleach/encoding.py
 create mode 100644 bleach/sanitizer.py
 create mode 100644 bleach/tests/__init__.py
 create mode 100644 bleach/tests/test_basics.py
 create mode 100644 bleach/tests/test_css.py
 create mode 100644 bleach/tests/test_delinkify.py
 create mode 100644 bleach/tests/test_links.py
 create mode 100644 bleach/tests/test_security.py
 create mode 100644 bleach/tests/test_unicode.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6714ae6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.pyo
+*.pyc
+pip-log.txt
+.coverage
+dist
+*.egg-info
+.noseids
+build
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e767f15
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: python
+python:
+  - "2.6"
+  - "2.7"
+install: pip install -Ur requirements.txt --use-mirrors
+script: nosetests
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f612983
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+See https://github.com/jsocol/bleach/contributors
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b2df30c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2010, Mozilla Foundation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    3. Neither the name of bleach nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9d5d250
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE
+include README.rst
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..08dfc50
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,159 @@
+======
+Bleach
+======
+
+Bleach is an HTML sanitizing library that escapes or strips markup and
+attributes based on a white list. Bleach can also linkify text safely, applying
+filters that Django's ``urlize`` filter cannot, and optionally setting ``rel``
+attributes, even on links already in the text.
+
+Bleach is intended for sanitizing text from *untrusted* sources. If you find
+yourself jumping through hoops to allow your site administrators to do lots of
+things, you're probably outside the use cases. Either trust those users, or
+don't.
+
+Because it relies on html5lib_, Bleach is as good as modern browsers at dealing
+with weird, quirky HTML fragments. And *any* of Bleach's methods will fix
+unbalanced or mis-nested tags.
+
+The version on `github <http://github.com/jsocol/bleach>`_ is the most
+up-to-date and contains the latest bug fixes.
+
+
+Basic Use
+=========
+
+The simplest way to use Bleach is::
+
+    >>> import bleach
+
+    >>> bleach.clean('an <script>evil()</script> example')
+    u'an &lt;script&gt;evil()&lt;/script&gt; example'
+
+    >>> bleach.linkify('an http://example.com url')
+    u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url'
+
+    >>> bleach.delinkify('a <a href="http://ex.mp">link</a>')
+    u'a link'
+
+*NB*: Bleach always returns a ``unicode`` object, whether you give it a
+bytestring or a ``unicode`` object, but Bleach does not attempt to detect
+incoming character encodings, and will assume UTF-8. If you are using a
+different character encoding, you should convert from a bytestring to
+``unicode`` before passing the text to Bleach.
+
+
+Customizing Bleach
+==================
+
+``clean()``, ``linkify()`` and ``delinkify()`` can take several optional
+keyword arguments to customize their behavior.
+
+
+``clean()``
+-----------
+
+``bleach.clean()`` is the primary tool in Bleach. It uses html5lib_ to parse a
+document fragment into a tree and does the sanitization during tokenizing,
+which is incredibly powerful and has several advantages over regular
+expression-based sanitization.
+
+``tags``
+  A whitelist of HTML tags. Must be a list. Defaults to
+  ``bleach.ALLOWED_TAGS``.
+``attributes``
+  A whitelist of HTML attributes. Either a list, in which case all attributes
+  are allowed on all elements, or a dict, with tag names as keys and lists of
+  allowed attributes as values (``'*'`` is a wildcard key to allow an
+  attribute on any tag). Instead of a list, a dict value may also be a
+  callable that accepts an attribute name and value and returns ``True`` or
+  ``False``. Defaults to ``bleach.ALLOWED_ATTRIBUTES``.
+``styles``
+  A whitelist of allowed CSS properties within a ``style`` attribute. (Note
+  that ``style`` attributes are not allowed by default.) Must be a list.
+  Defaults to ``[]``.
+``strip``
+  Strip disallowed HTML instead of escaping it. A boolean. Defaults to
+  ``False``.
+``strip_comments``
+  Strip HTML comments. A boolean. Defaults to ``True``.
+
+
+``linkify()``
+-------------
+
+``bleach.linkify()`` turns things that look like URLs or (optionally) email
+addresses into links. It does this smartly, only looking in text nodes, and
+never within ``<a>`` tags.
+
+There are options that affect output, and some of these are also applied to
+links already found in the text. These are designed to allow you to set
+attributes like ``rel="nofollow"`` or ``target``, or push outgoing links
+through a redirection URL, and do this to links already in the text, as well.
+
+``nofollow``
+  Add ``rel="nofollow"`` to non-relative links (both created by ``linkify()``
+  and those already present in the text). Defaults to ``True``.
+``filter_url``
+  A callable through which the ``href`` attribute of links (both created by
+  ``linkify()`` and already present in the text) will be passed. Must accept a
+  single argument and return a string.
+``filter_text``
+  A callable through which the text of links (only those created by
+  ``linkify()``) will be passed. Must accept a single argument and return a
+  string.
+``skip_pre``
+  Do not create new links inside ``<pre>`` sections. Still follows
+  ``nofollow``. Defaults to ``False``.
+``parse_email``
+  Linkify email addresses with ``mailto:``. Defaults to ``False``.
+``target``
+  Set a ``target`` attribute on links. Like ``nofollow``, if ``target`` is not
+  ``None``, will set the attribute on links already in the text, as well.
+  Defaults to ``None``.
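+
+These options combine naturally. A sketch (the bouncer URL and ``bounce()``
+helper are hypothetical, shown only to illustrate ``filter_url``; expect
+output along these lines)::
+
+    >>> import urllib
+    >>> def bounce(url):
+    ...     return 'http://bouncer/?u=%s' % urllib.quote_plus(url)
+    >>> bleach.linkify('ex.mp', filter_url=bounce, target='_blank')
+    u'<a href="http://bouncer/?u=http%3A%2F%2Fex.mp" rel="nofollow" target="_blank">ex.mp</a>'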
+
+
+``delinkify()``
+---------------
+
+``bleach.delinkify()`` is basically the opposite of ``linkify()``. It strips
+links out of text except, optionally, relative links, or links to domains
+you've whitelisted.
+
+``allow_domains``
+  Allow links to the domains in this list. Set to ``None`` or an empty list to
+  disallow all non-relative domains. See below for wildcards. Defaults to
+  ``None``.
+``allow_relative``
+  Allow relative links (i.e. those with no hostname). Defaults to ``False``.
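+
+For example (illustrative; exact output follows the rules above)::
+
+    >>> bleach.delinkify('a <a href="/foo">relative</a> link')
+    u'a relative link'
+    >>> bleach.delinkify('a <a href="/foo">relative</a> link',
+    ...                  allow_relative=True)
+    u'a <a href="/foo">relative</a> link'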
+
+
+Wildcards
+^^^^^^^^^
+
+To allow links to a domain and its subdomains, ``allow_domains`` accepts two
+kinds of wildcards in domain names:
+
+``*``
+  Allow a single level of subdomain. This can be anywhere in the hostname, even
+  the TLD. This allows you to, for example, allow links to ``example.*``.
+  ``*.example.com`` will match both ``foo.example.com`` and ``example.com``.
+  ::
+    >>> delinkify('<a href="http://foo.ex.mp">bar</a>', \
+    ... allow_domains=['*.ex.*'])
+    u'<a href="http://foo.ex.mp">bar</a>'
+    >>> delinkify('<a href="http://foo.bar.ex.mp">bar</a>', allow_domains=['*.ex.mp'])
+    u'bar'
+``**``
+  To allow any number of *preceding* subdomains, you can start a hostname with
+  ``**``. Note that unlike ``*``, ``**`` may only appear once, and only at the
+  beginning of a hostname.
+  ::
+    >>> delinkify('<a href="http://a.b.c.ex.mp">t</a>', \
+    ... allow_domains=['**.ex.mp'])
+    u'<a href="http://a.b.c.ex.mp">t</a>'
+  If ``**`` appears anywhere but the beginning of a hostname, ``delinkify``
+  will throw ``bleach.ValidationError`` (which is a ``ValueError`` subclass,
+  for easy catching).
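+
+  For example (illustrative; note ``delinkify`` must actually encounter a
+  link for the check to run)::
+
+    >>> try:
+    ...     delinkify('<a href="http://ex.mp">t</a>',
+    ...               allow_domains=['an.**.ex.mp'])
+    ... except ValueError:
+    ...     print 'bad wildcard'
+    bad wildcard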
+
+.. _html5lib: http://code.google.com/p/html5lib/
diff --git a/bleach/__init__.py b/bleach/__init__.py
new file mode 100644
index 0000000..bc8e49c
--- /dev/null
+++ b/bleach/__init__.py
@@ -0,0 +1,342 @@
+import itertools
+import logging
+import re
+import sys
+import urlparse
+
+import html5lib
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer.htmlserializer import HTMLSerializer
+
+from encoding import force_unicode
+from sanitizer import BleachSanitizer
+
+
+VERSION = (1, 1, 5)
+__version__ = '.'.join(map(str, VERSION))
+
+__all__ = ['clean', 'linkify']
+
+log = logging.getLogger('bleach')
+
+ALLOWED_TAGS = [
+    'a',
+    'abbr',
+    'acronym',
+    'b',
+    'blockquote',
+    'code',
+    'em',
+    'i',
+    'li',
+    'ol',
+    'strong',
+    'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+    'a': ['href', 'title'],
+    'abbr': ['title'],
+    'acronym': ['title'],
+}
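+# A whitelist may also map a tag to a callable taking an attribute name and
+# value; a hypothetical example (not part of the defaults):
+#     {'img': lambda name, value: name == 'src'}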
+
+ALLOWED_STYLES = []
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro
+       ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so
+       sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt
+       tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
+       zw""".split()
+
+TLDS.reverse()
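+# Reversing the alphabetical list puts longer TLDs before their prefixes
+# ('com' before 'co'), so the regex alternation below prefers the longest
+# match.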
+
+url_re = re.compile(
+    r"""\(*  # Match any opening parentheses.
+    \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+        # /path/zz (excluding "unsafe" chars from RFC 1738,
+        # except for # and ~, which happen in practice)
+    """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+
+proto_re = re.compile(r'^[\w-]+:/{0,3}')
+
+punct_re = re.compile(r'([\.,]+)$')
+
+email_re = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:%s))  # domain
+    """ % u'|'.join(TLDS),
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
+
+identity = lambda x: x  # The identity function.
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, strip=False, strip_comments=True):
+    """Clean an HTML fragment and return it."""
+    if not text:
+        return u''
+
+    text = force_unicode(text)
+
+    class s(BleachSanitizer):
+        allowed_elements = tags
+        allowed_attributes = attributes
+        allowed_css_properties = styles
+        strip_disallowed_elements = strip
+        strip_html_comments = strip_comments
+
+    parser = html5lib.HTMLParser(tokenizer=s)
+
+    return _render(parser.parseFragment(text)).strip()
+
+
+def linkify(text, nofollow=True, target=None, filter_url=identity,
+            filter_text=identity, skip_pre=False, parse_email=False,
+            tokenizer=HTMLSanitizer):
+    """Convert URL-like strings in an HTML fragment to links, while
+    preserving links already in the text."""
+    text = force_unicode(text)
+
+    if not text:
+        return u''
+
+    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+
+    forest = parser.parseFragment(text)
+
+    if nofollow:
+        rel = u'rel="nofollow"'
+    else:
+        rel = u''
+
+    def replace_nodes(tree, new_frags, node):
+        new_tokens = parser.parseFragment(new_frags)
+        for n in new_tokens.childNodes:
+            # Prevent us from re-parsing new links as existing links.
+            if n.name == 'a':
+                n._seen = True
+            tree.insertBefore(n, node)
+        tree.removeChild(node)
+
+    def strip_wrapping_parentheses(fragment):
+        """Strips wrapping parentheses.
+
+        Returns a tuple of the following format::
+
+            (string stripped from wrapping parentheses,
+             number of opening parentheses,
+             number of closing parentheses)
+        """
+        opening_parentheses = closing_parentheses = 0
+        # Count consecutive opening parentheses at the start of the fragment.
+        for char in fragment:
+            if char == '(':
+                opening_parentheses += 1
+            else:
+                break
+
+        if opening_parentheses:
+            newer_frag = ''
+            # Cut the consecutive opening brackets from the fragment.
+            fragment = fragment[opening_parentheses:]
+            # Reverse the fragment for easier detection of parentheses
+            # inside the URL.
+            reverse_fragment = fragment[::-1]
+            skip = False
+            for char in reverse_fragment:
+                # Remove the closing parentheses if it has a matching
+                # opening parentheses (they are balanced).
+                if (char == ')' and
+                        closing_parentheses < opening_parentheses and
+                        not skip):
+                    closing_parentheses += 1
+                    continue
+                # Do not remove ')' from the URL itself.
+                elif char != ')':
+                    skip = True
+                newer_frag += char
+            fragment = newer_frag[::-1]
+
+        return fragment, opening_parentheses, closing_parentheses
+
+    def linkify_nodes(tree, parse_text=True):
+        # We may mutate tree.childNodes while looping, so track the length
+        # explicitly and re-check after every replacement.
+        children = len(tree.childNodes)
+        current = 0  # A pointer to the "current" node.
+        while current < children:
+            node = tree.childNodes[current]
+            if node.type == NODE_TEXT and parse_text:
+                text = node.toxml()
+                # Linkify email addresses first, if asked to.
+                if parse_email:
+                    new_text = re.sub(email_re, email_repl, text)
+                    if new_text != text:
+                        replace_nodes(tree, new_text, node)
+                        children = len(tree.childNodes)
+                        continue
+                new_text = re.sub(url_re, link_repl, text)
+                if new_text != text:
+                    replace_nodes(tree, new_text, node)
+                    children = len(tree.childNodes)
+                    continue
+            elif node.name == 'a' and not getattr(node, '_seen', False):
+                # Apply nofollow/target/filter_url to links already in
+                # the text.
+                if 'href' in node.attributes:
+                    if nofollow:
+                        node.attributes['rel'] = 'nofollow'
+                    if target is not None:
+                        node.attributes['target'] = target
+                    href = node.attributes['href']
+                    node.attributes['href'] = filter_url(href)
+                node._seen = True
+            elif skip_pre and node.name == 'pre':
+                linkify_nodes(node, False)
+            elif not getattr(node, '_seen', False):
+                linkify_nodes(node)
+            current += 1
+
+    def email_repl(match):
+        repl = u'<a href="mailto:%(mail)s">%(mail)s</a>'
+        return repl % {'mail': match.group(0).replace('"', '&quot;')}
+
+    def link_repl(match):
+        url = match.group(0)
+        open_brackets = close_brackets = 0
+        if url.startswith('('):
+            url, open_brackets, close_brackets = (
+                    strip_wrapping_parentheses(url)
+            )
+        end = u''
+        m = re.search(punct_re, url)
+        if m:
+            end = m.group(0)
+            url = url[0:m.start()]
+        if re.search(proto_re, url):
+            href = url
+        else:
+            href = u''.join([u'http://', url])
+
+        repl = u'%s<a href="%s" %s>%s</a>%s%s'
+
+        attribs = [rel]
+        if target is not None:
+            attribs.append('target="%s"' % target)
+
+        return repl % ('(' * open_brackets,
+                       filter_url(href), ' '.join(attribs), filter_text(url),
+                       end, ')' * close_brackets)
+
+    linkify_nodes(forest)
+
+    return _render(forest)
+
+
+def delinkify(text, allow_domains=None, allow_relative=False):
+    """Remove links from text, except those allowed to stay."""
+    text = force_unicode(text)
+    if not text:
+        return u''
+
+    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
+    forest = parser.parseFragment(text)
+
+    if allow_domains is None:
+        allow_domains = []
+    elif isinstance(allow_domains, basestring):
+        allow_domains = [allow_domains]
+
+    def delinkify_nodes(tree):
+        """Remove  tags and replace them with their contents."""
+        for node in tree.childNodes:
+            if node.name == 'a':
+                if 'href' not in node.attributes:
+                    continue
+                parts = urlparse.urlparse(node.attributes['href'])
+                host = parts.hostname
+                if any(_domain_match(host, d) for d in allow_domains):
+                    continue
+                if host is None and allow_relative:
+                    continue
+                # Replace the node with its children.
+                # You can't nest <a> tags, and html5lib takes care of that
+                # for us in the tree-building step.
+                for n in node.childNodes:
+                    tree.insertBefore(n, node)
+                tree.removeChild(node)
+            elif node.type != NODE_TEXT: # Don't try to delinkify text.
+                delinkify_nodes(node)
+
+    delinkify_nodes(forest)
+    return _render(forest)
+
+
+def _domain_match(test, compare):
+    test = test.lower()
+    compare = compare.lower()
+    if '*' not in compare:
+        return test == compare
+    c = compare.split('.')[::-1]
+    if '**' in c and (c.count('**') > 1 or not compare.startswith('**')):
+        raise ValidationError(
+            'Only 1 ** is allowed, and must start the domain.')
+    t = test.split('.')[::-1]
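+    # Compare label by label from the TLD down; e.g. 'foo.ex.mp' vs.
+    # '*.ex.mp' yields the pairs ('mp', 'mp'), ('ex', 'ex'), ('*', 'foo').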
+    z = itertools.izip_longest(c, t)
+    for c, t in z:
+        if c == t:
+            continue
+        elif c == '*':
+            continue
+        elif c == '**':
+            return True
+        return False
+    # Got all the way through and everything matched.
+    return True
+
+
+class ValidationError(ValueError):
+    pass
+
+
+def _render(tree):
+    """Try rendering as HTML, then XML, then give up."""
+    try:
+        return force_unicode(_serialize(tree))
+    except Exception, e:
+        log.error('HTML: %r' % e, exc_info=sys.exc_info())
+        try:
+            return force_unicode(tree.toxml())
+        except Exception, e:
+            log.error('XML: %r' % e, exc_info=sys.exc_info())
+            return u''
+
+
+def _serialize(domtree):
+    walker = html5lib.treewalkers.getTreeWalker('simpletree')
+    stream = walker(domtree)
+    serializer = HTMLSerializer(quote_attr_values=True,
+                                omit_optional_tags=False)
+    return serializer.render(stream)
diff --git a/bleach/encoding.py b/bleach/encoding.py
new file mode 100644
index 0000000..b9a989d
--- /dev/null
+++ b/bleach/encoding.py
@@ -0,0 +1,54 @@
+import datetime
+from decimal import Decimal
+import types
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_unicode(strings_only=True).
+    """
+    return isinstance(obj, (
+        types.NoneType,
+        int, long,
+        datetime.datetime, datetime.date, datetime.time,
+        float, Decimal)
+    )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_unicode, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, basestring,):
+            if hasattr(s, '__unicode__'):
+                s = unicode(s)
+            else:
+                try:
+                    s = unicode(str(s), encoding, errors)
+                except UnicodeEncodeError:
+                    if not isinstance(s, Exception):
+                        raise
+                    # If we get to here, the caller has passed in an Exception
+                    # subclass populated with non-ASCII data without special
+                    # handling to display as a string. We need to handle this
+                    # without raising a further exception. We do an
+                    # approximation to what the Exception's standard str()
+                    # output should be.
+                    s = ' '.join([force_unicode(arg, encoding, strings_only,
+                            errors) for arg in s])
+        elif not isinstance(s, unicode):
+            # Note: We use .decode() here, instead of unicode(s, encoding,
+            # errors), so that if s is a SafeString, it ends up being a
+            # SafeUnicode at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError, e:
+        raise UnicodeDecodeError(*e.args)
+    return s
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
new file mode 100644
index 0000000..677287e
--- /dev/null
+++ b/bleach/sanitizer.py
@@ -0,0 +1,143 @@
+import re
+from xml.sax.saxutils import escape, unescape
+
+from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
+
+
+class BleachSanitizerMixin(HTMLSanitizerMixin):
+    """Mixin to replace sanitize_token() and sanitize_css()."""
+
+    allowed_svg_properties = []
+    # TODO: When the next html5lib version comes out, nuke this.
+    attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
+
+    def sanitize_token(self, token):
+        """Sanitize a token either by HTML-encoding or dropping.
+
+        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
+        a dict that maps each tag to a list of allowed attributes or to a
+        callable, e.g. {'tag': ['attribute', 'list']} or {'tag': callable}.
+
+        Here the callable is a function that takes an attribute name and
+        value as its two arguments, and returns True or False.
+
+        Also gives the option to strip tags instead of encoding.
+
+        """
+        if (getattr(self, 'wildcard_attributes', None) is None and
+            isinstance(self.allowed_attributes, dict)):
+            self.wildcard_attributes = self.allowed_attributes.get('*', [])
+
+        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
+                             tokenTypes['EmptyTag']):
+            if token['name'] in self.allowed_elements:
+                if 'data' in token:
+                    if isinstance(self.allowed_attributes, dict):
+                        allowed_attributes = self.allowed_attributes.get(
+                            token['name'], [])
+                        if not callable(allowed_attributes):
+                            allowed_attributes += self.wildcard_attributes
+                    else:
+                        allowed_attributes = self.allowed_attributes
+                    attrs = dict([(name, val) for name, val in
+                                  token['data'][::-1]
+                                  if (allowed_attributes(name, val)
+                                      if callable(allowed_attributes)
+                                      else name in allowed_attributes)])
+                    for attr in self.attr_val_is_uri:
+                        if not attr in attrs:
+                            continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                               unescape(attrs[attr])).lower()
+                        # Remove replacement characters from unescaped
+                        # characters.
+                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
+                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
+                            and (val_unescaped.split(':')[0] not in
+                                 self.allowed_protocols)):
+                            del attrs[attr]
+                    for attr in self.svg_attr_val_allows_ref:
+                        if attr in attrs:
+                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                                 ' ',
+                                                 unescape(attrs[attr]))
+                    if (token['name'] in self.svg_allow_local_href and
+                        'xlink:href' in attrs and
+                        re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+                        del attrs['xlink:href']
+                    if 'style' in attrs:
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token['data'] = [(name, val) for name, val in
+                                     attrs.items()]
+                return token
+            elif self.strip_disallowed_elements:
+                pass
+            else:
+                if token['type'] == tokenTypes['EndTag']:
+                    token['data'] = '</%s>' % token['name']
+                elif token['data']:
+                    attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+                                    token['data']])
+                    token['data'] = '<%s%s>' % (token['name'], attrs)
+                else:
+                    token['data'] = '<%s>' % token['name']
+                if token['selfClosing']:
+                    token['data'] = token['data'][:-1] + '/>'
+                token['type'] = tokenTypes['Characters']
+                del token["name"]
+                return token
+        elif token['type'] == tokenTypes['Comment']:
+            if not self.strip_html_comments:
+                return token
+        else:
+            return token
+
+    def sanitize_css(self, style):
+        """HTMLSanitizerMixin.sanitize_css replacement.
+
+        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+        border-*, margin-*, and padding-*. We only whitelist what's in
+        the whitelist.
+
+        """
+        # disallow urls
+        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+        # TODO: Make sure this does what it's meant to - I *think* it wants to
+        # validate style attribute contents.
+        parts = style.split(';')
+        gauntlet = re.compile("""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
+                              """|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+        for part in parts:
+            if not gauntlet.match(part):
+                return ''
+
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
+    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+                               lowercaseElementName, lowercaseAttrName,
+                               **kwargs)
+
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
diff --git a/bleach/tests/__init__.py b/bleach/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
new file mode 100644
index 0000000..60be11d
--- /dev/null
+++ b/bleach/tests/test_basics.py
@@ -0,0 +1,170 @@
+import html5lib
+from nose.tools import eq_
+
+import bleach
+
+
+def test_empty():
+    eq_('', bleach.clean(''))
+
+
+def test_comments_only():
+    comment = '<!-- this is a comment -->'
+    open_comment = '<!-- this is an open comment'
+    eq_('', bleach.clean(comment))
+    eq_('', bleach.clean(open_comment))
+    eq_(comment, bleach.clean(comment, strip_comments=False))
+    eq_('%s-->' % open_comment, bleach.clean(open_comment,
+                                             strip_comments=False))
+
+
+def test_with_comments():
+    html = '<!-- comment -->Just text'
+    eq_('Just text', bleach.clean(html))
+    eq_(html, bleach.clean(html, strip_comments=False))
+
+
+def test_no_html():
+    eq_('no html string', bleach.clean('no html string'))
+
+
+def test_allowed_html():
+    eq_('an <strong>allowed</strong> tag',
+        bleach.clean('an <strong>allowed</strong> tag'))
+    eq_('another <em>good</em> tag',
+        bleach.clean('another <em>good</em> tag'))
+
+
+def test_bad_html():
+    eq_('a <em>fixed tag</em>',
+        bleach.clean('a <em>fixed tag'))
+
+
+def test_function_arguments():
+    TAGS = ['span', 'br']
+    ATTRS = {'span': ['style']}
+
+    eq_('a <br><span style="">test</span>',
+        bleach.clean('a <br/><span style="color:red">test</span>',
+                     tags=TAGS, attributes=ATTRS))
+
+
+def test_named_arguments():
+    ATTRS = {'a': ['rel', 'href']}
+    s = u'<a href="http://xx.com" rel="alternate">xx.com</a>'
+    eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s))
+    eq_(s, bleach.clean(s, attributes=ATTRS))
+
+
+def test_disallowed_html():
+    eq_('a &lt;script&gt;safe()&lt;/script&gt; test',
+        bleach.clean('a <script>safe()</script> test'))
+    eq_('a &lt;style&gt;body{}&lt;/style&gt; test',
+        bleach.clean('a <style>body{}</style> test'))
+
+
+def test_bad_href():
+    eq_('<em>no link</em>',
+        bleach.clean('<em href="fail">no link</em>'))
+
+
+def test_bare_entities():
+    eq_('an &amp; entity', bleach.clean('an & entity'))
+    eq_('an &lt; entity', bleach.clean('an < entity'))
+    eq_('tag &lt; <em>and</em> entity',
+        bleach.clean('tag < <em>and</em> entity'))
+    eq_('&amp;', bleach.clean('&amp;'))
+
+
+def test_escaped_entities():
+    s = u'&lt;em&gt;strong&lt;/em&gt;'
+    eq_(s, bleach.clean(s))
+
+
+def test_serializer():
+    s = u'<table></table>'
+    eq_(s, bleach.clean(s, tags=['table']))
+    eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>'))
+    eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p']))
+
+
+def test_no_href_links():
+    s = u'<a name="anchor">x</a>'
+    eq_(s, bleach.linkify(s))
+    eq_(s, bleach.linkify(s, nofollow=False))
+
+
+def test_weird_strings():
+    s = '</3'
+    eq_(bleach.clean(s), '')
+
+
+def test_stripping():
+    eq_('a test <em>with</em> <b>html</b> tags',
+        bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True))
+    eq_('a test with html tags',
+        bleach.clean('a test <span>with</span> '
+                     '<span>html</span> tags', strip=True))
+
+    s = '<p><a href="http://example.com/">link text</a></p>'
+    eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True))
+    s = '<p><span>multiply <span>nested <span>text</span></span></span></p>'
+    eq_('<p>multiply nested text</p>', bleach.clean(s, tags=['p'], strip=True))
+
+    s = ('<p><a href="http://example.com/">'
+         '<img src="http://example.com/"></a></p>')
+    eq_('<p><a href="http://example.com/"></a></p>',
+        bleach.clean(s, tags=['p', 'a'], strip=True))
+
+
+def test_allowed_styles():
+    ATTR = ['style']
+    STYLE = ['color']
+    blank = '<b style=""></b>'
+    s = '<b style="color: blue;"></b>'
+    eq_(blank, bleach.clean('<b style="top: 0"></b>', attributes=ATTR))
+    eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE))
+    eq_(s, bleach.clean('<b style="top: 0; color: blue;"></b>',
+                        attributes=ATTR, styles=STYLE))
+
+
+def test_idempotent():
+    """Make sure that applying the filter twice doesn't change anything."""
+    dirty = u'<span>invalid & < extra http://link.com</span>'
+
+    clean = bleach.clean(dirty)
+    eq_(clean, bleach.clean(clean))
+
+    linked = bleach.linkify(dirty)
+    eq_(linked, bleach.linkify(linked))
+
+
+def test_lowercase_html():
+    """We should output lowercase HTML."""
+    dirty = u'<EM CLASS="FOO">BAR</EM>'
+    clean = u'<em class="FOO">BAR</em>'
+    eq_(clean, bleach.clean(dirty, attributes=['class']))
+
+
+def test_wildcard_attributes():
+    ATTR = {
+        '*': ['id'],
+        'img': ['src'],
+    }
+    TAG = ['img', 'em']
+    dirty = (u'both <em id="foo" style="color: black">can</em> have '
+             u'<img id="bar" src="foo"/>')
+    clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">'
+    eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+
+
+def test_sarcasm():
+    """Jokes should crash.<sarcasm/>"""
+    dirty = u'Yeah right <sarcasm/>'
+    clean = u'Yeah right &lt;sarcasm/&gt;'
+    eq_(clean, bleach.clean(dirty))
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
new file mode 100644
index 0000000..fdb3f65
--- /dev/null
+++ b/bleach/tests/test_css.py
@@ -0,0 +1,85 @@
+from functools import partial
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+clean = partial(clean, tags=['p'], attributes=['style'])
+
+
+def test_allowed_css():
+    tests = (
+        ('font-family: Arial; color: red; float: left; '
+         'background-color: red;', 'color: red;', ['color']),
+        ('border: 1px solid blue; color: red; float: left;', 'color: red;',
+         ['color']),
+        ('border: 1px solid blue; color: red; float: left;',
+         'color: red; float: left;', ['color', 'float']),
+        ('color: red; float: left; padding: 1em;', 'color: red; float: left;',
+         ['color', 'float']),
+        ('color: red; float: left; padding: 1em;', 'color: red;', ['color']),
+        ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']),
+        ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']),
+        ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']),
+        ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;",
+         ['text-overflow']),
+    )
+
+    p = '<p style="%s">bar</p>'
+
+    def check(input, output, styles):
+        eq_(p % output, clean(p % input, styles=styles))
+
+    for i, o, s in tests:
+        yield check, i, o, s
+
+
+def test_valid_css():
+    """The sanitizer should fix missing CSS values."""
+    styles = ['color', 'float']
+    eq_('<p style="float: left;">foo</p>',
+        clean('<p style="float: left; color: ">foo</p>', styles=styles))
+    eq_('<p style="">foo</p>',
+        clean('<p style="color: float: left;">foo</p>', styles=styles))
+
+
+def test_style_hang():
+    """The sanitizer should not hang on any inline styles"""
+    # TODO: Neaten this up. It's copypasta from MDN/Kuma to repro the bug
+    style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """
+             """margin-left: 0px; padding-top: 15px; padding-right: 15px; """
+             """padding-bottom: 15px; padding-left: 15px; border-top-width: """
+             """1px; border-right-width: 1px; border-bottom-width: 1px; """
+             """border-left-width: 1px; border-top-style: dotted; """
+             """border-right-style: dotted; border-bottom-style: dotted; """
+             """border-left-style: dotted; border-top-color: rgb(203, 200, """
+             """185); border-right-color: rgb(203, 200, 185); """
+             """border-bottom-color: rgb(203, 200, 185); border-left-color: """
+             """rgb(203, 200, 185); background-image: initial; """
+             """background-attachment: initial; background-origin: initial; """
+             """background-clip: initial; background-color: """
+             """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """
+             """font: normal normal normal 100%/normal 'Courier New', """
+             """'Andale Mono', monospace; background-position: initial """
+             """initial; background-repeat: initial initial;""")
+    html = '<p style="%s">Hello world</p>' % style
+    styles = [
+        'border', 'float', 'overflow', 'min-height', 'vertical-align',
+        'white-space',
+        'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
+        'padding', 'padding-left', 'padding-top', 'padding-bottom',
+        'padding-right',
+        'background',
+        'background-color',
+        'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
+    ]
+
+    expected = ("""<p style="margin-top: 0px; margin-right: 0px; """
+                """margin-bottom: 1.286em; margin-left: 0px; """
+                """padding-top: 15px; padding-right: 15px; """
+                """padding-bottom: 15px; padding-left: 15px; """
+                """background-color: rgb(246, 246, 242); """
+                """font: normal normal normal 100%/normal 'Courier New', """
+                """'Andale Mono', monospace;">Hello world</p>""")
+
+    result = clean(html, styles=styles)
+    eq_(expected, result)
diff --git a/bleach/tests/test_delinkify.py b/bleach/tests/test_delinkify.py
new file mode 100644
index 0000000..f216d2f
--- /dev/null
+++ b/bleach/tests/test_delinkify.py
@@ -0,0 +1,109 @@
+from nose.tools import eq_
+
+import bleach
+
+
+def test_delinkify():
+    eq_('test', bleach.delinkify('<a href="http://ex.mp">test</a>'))
+    eq_('footestbar',
+        bleach.delinkify('foo<a href="http://ex.mp">test</a>bar'))
+
+
+def test_whitelist():
+    html = '<a href="http://ex.mp">test</a>'
+    eq_(html, bleach.delinkify(html, allow_domains=['ex.mp']))
+    eq_('test', bleach.delinkify(html, allow_domains=['ex2.mp']))
+    # Allow a single domain as a special case.
+    eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_nested_a():
+    html = ('<a href="http://ex.mp">test'
+            '<a href="http://example.com">test</a></a>')
+    eq_('testtest', bleach.delinkify(html))
+    eq_('<a href="http://ex.mp">test</a>test',
+        bleach.delinkify(html, allow_domains=['ex.mp']))
+
+
+def test_nested_tag():
+    html = '<a href="http://ex.mp">test<em>test</em></a>'
+    eq_('test<em>test</em>', bleach.delinkify(html))
+
+
+def test_a_name():
+    """Don't screw with non-link <a> tags."""
+    html = '<a name="foo">bar</a>'
+    eq_(html, bleach.delinkify(html))
+
+
+def test_relative():
+    """Relative links are optionally OK."""
+    html = 'some <a href="/foo/bar">link</a>'
+    eq_('some link', bleach.delinkify(html))
+    eq_(html, bleach.delinkify(html, allow_relative=True))
+
+
+def test_protocol_relative():
+    """Protocol-relative links aren't relative."""
+    html = 'bad <a href="//ex.mp">link</a>'
+    expect = 'bad link'
+    eq_(expect, bleach.delinkify(html))
+    eq_(expect, bleach.delinkify(html, allow_relative=True))
+    eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_domain_match():
+    tests = (
+        ('ex.mp', 'ex.mp', True),
+        ('ex.mp', '*.ex.mp', True),
+        ('test.ex.mp', '*.ex.mp', True),
+        ('test.ex.mp', 'ex.mp', False),
+        ('test.test.ex.mp', '*.ex.mp', False),
+        ('test.test.ex.mp', '**.ex.mp', True),
+        ('wrong.mp', 'ex.mp', False),
+        ('wrong.mp', '*.ex.mp', False),
+        ('really.wrong.mp', 'ex.mp', False),
+        ('really.wrong.mp', '*.ex.mp', False),
+        ('really.very.wrong.mp', '*.ex.mp', False),
+        ('EX.mp', 'ex.mp', True),  # Domains are case-insensitive.
+        ('ex.mp', 'an.ex.mp', False),
+        ('ex.mp', '*.an.ex.mp', False),
+        ('an.ex.am.pl', 'an.*.am.pl', True),
+        ('a.ex.am.pl', 'an.*.am.pl', False),
+        ('ex.am.pl', 'an.*.am.pl', False),
+    )
+
+    def _check(t, c, v):
+        eq_(v, bleach._domain_match(t, c))
+
+    for t, c, v in tests:
+        yield _check, t, c, v
+
+
+def test_double_star():
+    assert bleach._domain_match('ex.mp', '**.ex.mp')
+    try:
+        bleach._domain_match('ex.mp', 'an.**.ex.mp')
+    except bleach.ValidationError:
+        pass
+    else:
+        assert False, '_domain_match should not accept an.**.ex.mp'
+
+
+def test_allow_subdomains():
+    domains = ('ex.mp', '*.exa.mp', 'an.exam.pl', '*.my.examp.le')
+    html = (
+        ('<a href="http://an.ex.mp">bad</a>', 'bad'),
+        ('<a href="http://ex.mp">good</a>', None),
+        ('<a href="http://foo.exa.mp">good</a>', None),
+        ('<a href="http://an.exam.pl">good</a>', None),
+        ('<a href="http://foo.an.exam.pl">bad</a>', 'bad'),
+        ('<a href="http://examp.le">bad</a>', 'bad'),
+        ('<a href="http://foo.bar.my.examp.le">bad</a>', 'bad'),
+    )
+
+    def _check(html, text):
+        output = bleach.delinkify(html, allow_domains=domains)
+        eq_(html if text is None else text, output)
+
+    for t, o in html:
+        yield _check, t, o
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
new file mode 100644
index 0000000..7caf006
--- /dev/null
+++ b/bleach/tests/test_links.py
@@ -0,0 +1,312 @@
+import urllib
+
+from html5lib.tokenizer import HTMLTokenizer
+from nose.tools import eq_
+
+from bleach import linkify, url_re
+
+
+def filter_url(url):
+    return u'http://bouncer/?u=%s' % urllib.quote_plus(url)
+
+
+def test_url_re():
+    def no_match(s):
+        match = url_re.search(s)
+        if match:
+            assert not match, 'matched %s' % s[slice(*match.span())]
+    yield no_match, 'just what i am looking for...it'
+
+
+def test_empty():
+    eq_('', linkify(''))
+
+
+def test_simple_link():
+    eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
+        '</a> link',
+        linkify('a http://example.com link'))
+    eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
+        '</a> link',
+        linkify('a https://example.com link'))
+    eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
+        linkify('an example.com link'))
+
+
+def test_trailing_slash():
+    eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
+        linkify('http://example.com/'))
+    eq_('<a href="http://example.com/foo/" rel="nofollow">'
+        'http://example.com/foo/</a>',
+        linkify('http://example.com/foo/'))
+    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
+        'http://example.com/foo/bar/</a>',
+        linkify('http://example.com/foo/bar/'))
+
+
+def test_mangle_link():
+    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
+        'http://example.com</a>',
+        linkify('http://example.com', filter_url=filter_url))
+
+
+def test_email_link():
+    eq_('a james@example.com mailto',
+        linkify('a james@example.com mailto'))
+    eq_('a james@example.com.au mailto',
+        linkify('a james@example.com.au mailto'))
+    eq_('a <a href="mailto:james@example.com">james@example.com</a> mailto',
+        linkify('a james@example.com mailto', parse_email=True))
+    eq_('aussie <a href="mailto:james@example.com.au">'
+        'james@example.com.au</a> mailto',
+        linkify('aussie james@example.com.au mailto', parse_email=True))
+    eq_('email to <a href="james@example.com" rel="nofollow">'
+        'james@example.com</a>',
+        linkify('email to <a href="james@example.com">'
+                'james@example.com</a>', parse_email=True))
+
+
+def test_email_link_escaping():
+    eq_('''<a href="mailto:&quot;james&quot;@example.com">'''
+        '''"james"@example.com</a>''',
+        linkify('"james"@example.com', parse_email=True))
+    eq_('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
+        '''"j'ames"@example.com</a>''',
+        linkify('"j\'ames"@example.com', parse_email=True))
+    eq_('''<a href="mailto:&quot;ja&gt;mes&quot;@example.com">'''
+        '''"ja&gt;mes"@example.com</a>''',
+        linkify('"ja>mes"@example.com', parse_email=True))
+
+
+def test_tlds():
+    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
+        linkify('example.com'))
+    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+        linkify('example.co.uk'))
+    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+        linkify('example.edu'))
+    eq_('example.xxx', linkify('example.xxx'))
+    eq_(' brie', linkify(' brie'))
+    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+        linkify('bit.ly/fun'))
+
+
+def test_escaping():
+    eq_('&lt; unrelated', linkify('< unrelated'))
+
+
+def test_nofollow_off():
+    eq_('<a href="http://example.com">example.com</a>',
+        linkify(u'example.com', nofollow=False))
+
+
+def test_link_in_html():
+    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+        linkify('<i>http://yy.com</i>'))
+    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
+        '</strong></em>',
+        linkify('<em><strong>http://xx.com</strong></em>'))
+
+
+def test_links_https():
+    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+        linkify('https://yy.com'))
+
+
+def test_add_rel_nofollow():
+    """Verify that rel="nofollow" is added to an existing link"""
+    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+        linkify('<a href="http://yy.com">http://yy.com</a>'))
+
+
+def test_url_with_path():
+    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
+        'http://example.com/path/to/file</a>',
+        linkify('http://example.com/path/to/file'))
+
+
+def test_link_ftp():
+    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
+        'ftp://ftp.mozilla.org/some/file</a>',
+        linkify('ftp://ftp.mozilla.org/some/file'))
+
+
+def test_link_query():
+    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+        'http://xx.com/?test=win</a>',
+        linkify('http://xx.com/?test=win'))
+    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+        'xx.com/?test=win</a>',
+        linkify('xx.com/?test=win'))
+    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
+        'xx.com?test=win</a>',
+        linkify('xx.com?test=win'))
+
+
+def test_link_fragment():
+    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
+        'http://xx.com/path#frag</a>',
+        linkify('http://xx.com/path#frag'))
+
+
+def test_link_entities():
+    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
+        'http://xx.com/?a=1&amp;b=2</a>',
+        linkify('http://xx.com/?a=1&b=2'))
+
+
+def test_escaped_html():
+    """If I pass in escaped HTML, it should probably come out escaped."""
+    s = '&lt;em&gt;strong&lt;/em&gt;'
+    eq_(s, linkify(s))
+
+
+def test_link_http_complete():
+    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f" rel="nofollow">'
+        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
+        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
+
+
+def test_non_url():
+    """document.vulnerable should absolutely not be linkified."""
+    s = 'document.vulnerable'
+    eq_(s, linkify(s))
+
+
+def test_javascript_url():
+    """javascript: urls should never be linkified."""
+    s = 'javascript:document.vulnerable'
+    eq_(s, linkify(s))
+
+
+def test_unsafe_url():
+    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
+    eq_('All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">'
+        'xx.yy.com/grover.png</a>"}base are',
+        linkify('All your{"xx.yy.com/grover.png"}base are'))
+
+
+def test_skip_pre():
+    """Skip linkification in <pre> tags."""
+    simple = 'http://xx.com <pre>http://xx.com</pre>'
+    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+              '<pre>http://xx.com</pre>')
+    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
+                  '</a></pre>')
+    eq_(linked, linkify(simple, skip_pre=True))
+    eq_(all_linked, linkify(simple))
+
+    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
+    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
+    eq_(nofollowed, linkify(already_linked))
+    eq_(nofollowed, linkify(already_linked, skip_pre=True))
+
+
+def test_libgl():
+    """libgl.so.1 should not be linkified."""
+    eq_('libgl.so.1', linkify('libgl.so.1'))
+
+
+def test_end_of_sentence():
+    """example.com. should match."""
+    out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
+    in_ = u'%s%s'
+
+    def check(u, p):
+        eq_(out % (u, u, p), linkify(in_ % (u, p)))
+
+    tests = (
+        ('example.com', '.'),
+        ('example.com', '...'),
+        ('ex.com/foo', '.'),
+        ('ex.com/foo', '....'),
+    )
+
+    for u, p in tests:
+        yield check, u, p
+
+
+def test_end_of_clause():
+    """example.com/foo, shouldn't include the ,"""
+    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+        linkify('ex.com/foo, bar'))
+
+
+def test_sarcasm():
+    """Jokes should crash.<sarcasm/>"""
+    dirty = u'Yeah right <sarcasm/>'
+    clean = u'Yeah right &lt;sarcasm/&gt;'
+    eq_(clean, linkify(dirty))
+
+
+def test_wrapping_parentheses():
+    """URLs wrapped in parantheses should not include them."""
+    out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
+
+    tests = (
+        ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
+        ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
+        ('(example.com/foo)', out % ('(', 'example.com/foo',
+                                     'example.com/foo', ')')),
+        ('(((example.com/))))', out % ('(((', 'example.com/)',
+                                       'example.com/)', ')))')),
+        ('example.com/))', out % ('', 'example.com/))',
+                                  'example.com/))', '')),
+        ('http://en.wikipedia.org/wiki/Test_(assessment)',
+         out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
+                'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
+        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
+         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
+                'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
+        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
+         out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
+                'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
+        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
+         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
+                'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
+        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
+         out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
+                'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
+    )
+
+    def check(test, expected_output):
+        eq_(expected_output, linkify(test))
+
+    for test, expected_output in tests:
+        yield check, test, expected_output
+
+
+def test_ports():
+    """URLs can contain port numbers."""
+    tests = (
+        ('http://foo.com:8000', ('http://foo.com:8000', '')),
+        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
+        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
+        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
+        ('http://foo.com:', ('http://foo.com', ':')),
+    )
+
+    def check(test, output):
+        eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
+            linkify(test))
+
+    for test, output in tests:
+        yield check, test, output
+
+
+def test_target():
+    eq_('<a href="http://example.com" rel="nofollow" target="_blank">'
+        'example.com</a>',
+        linkify(u'example.com', target='_blank'))
+    eq_('<a href="http://example.com" target="_blank">example.com</a>',
+        linkify(u'example.com', target='_blank', nofollow=False))
+
+
+def test_tokenizer():
+    """Linkify doesn't always have to sanitize."""
+    raw = 'test<x></x>'
+    eq_('test&lt;x&gt;&lt;/x&gt;', linkify(raw))
+    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
new file mode 100644
index 0000000..9e9bb7b
--- /dev/null
+++ b/bleach/tests/test_security.py
@@ -0,0 +1,108 @@
+"""More advanced security tests"""
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+def test_nested_script_tag():
+    eq_('&lt;&lt;script&gt;script&gt;evil()&lt;&lt;/script&gt;/script&gt;',
+        clean('<<script>script>evil()<</script>/script>'))
+    eq_('&lt;&lt;x&gt;script&gt;evil()&lt;&lt;/x&gt;/script&gt;',
+        clean('<<x>script>evil()<</x>/script>'))
+
+
+def test_nested_script_tag_r():
+    eq_('&lt;script&lt;script&gt;&gt;evil()&lt;/script&lt;&gt;&gt;',
+        clean('<script<script>>evil()</script<>>'))
+
+
+def test_invalid_attr():
+    IMG = ['img', ]
+    IMG_ATTR = ['src']
+
+    eq_('<a href="test">test</a>',
+        clean('<a onclick="evil" href="test">test</a>'))
+    eq_('<img src="test">',
+        clean('<img onclick="evil" src="test">',
+              tags=IMG, attributes=IMG_ATTR))
+    eq_('<img src="test">',
+        clean('<img href="invalid" src="test">',
+              tags=IMG, attributes=IMG_ATTR))
+
+
+def test_unquoted_attr():
+    eq_('<abbr title="mytitle">myabbr</abbr>',
+        clean('<abbr title=mytitle>myabbr</abbr>'))
+
+
+def test_unquoted_event_handler():
+    eq_('<a href="http://xx.com">xx.com</a>',
+        clean('<a href="http://xx.com" onclick=alert(1)>xx.com</a>'))
+
+
+def test_invalid_attr_value():
+    eq_('&lt;img src="javascript:alert(\'XSS\');"&gt;',
+        clean('<img src="javascript:alert(\'XSS\');">'))
+
+
+def test_invalid_href_attr():
+    eq_('<a>xss</a>',
+        clean('<a href="javascript:alert(\'XSS\')">xss</a>'))
+
+
+def test_invalid_filter_attr():
+    IMG = ['img', ]
+    IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"}
+
+    eq_('<img src="http://example.com/">',
+        clean('<img onclick="evil" src="http://example.com/">',
+              tags=IMG, attributes=IMG_ATTR))
+
+    eq_('<img>', clean('<img onclick="evil" src="http://example.net/">',
+                       tags=IMG, attributes=IMG_ATTR))
+
+
+def test_invalid_tag_char():
+    eq_('&lt;script xss="" src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+        clean('<script/xss src="http://xx.com/xss.js"></script>'))
+    eq_('&lt;script src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+        clean('<script/src="http://xx.com/xss.js"></script>'))
+
+
+def test_unclosed_tag():
+    eq_('&lt;script src="http://xx.com/xss.js&lt;b"&gt;',
+        clean('<script src="http://xx.com/xss.js<b>'))
+
+
+def test_strip():
+    """Using strip=True shouldn't leave behind executable markup."""
+    s = '<scri<script>pt>alert(1)</scr</script>ipt>'
+    eq_('pt&gt;alert(1)ipt&gt;', clean(s, strip=True))
+    s = '<scri<scri<script>pt>pt>alert(1)</script>'
+    eq_('pt&gt;pt&gt;alert(1)', clean(s, strip=True))
+
+
+def test_nasty():
+    """Nested, broken up, multiple tags, are still foiled!"""
+    test = ('<scr<script></script>ipt type="text/javascript">alert("foo");'
+            '</script>script<del></del>>')
+    expect = (u'&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
+              u'&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
+              u'&gt;')
+    eq_(expect, clean(test))
+
+
+def test_poster_attribute():
+    """Poster attributes should not allow javascript."""
+    tags = ['video']
+    attrs = {'video': ['poster']}
+    test = '<video poster="javascript:alert(1)"></video>'
+    expect = '<video></video>'
+    eq_(expect, clean(test, tags=tags, attributes=attrs))
+    ok = '<video poster="/foo.png"></video>'
+    eq_(ok, clean(ok, tags=tags, attributes=attrs))
diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py
new file mode 100644
index 0000000..67123cc
--- /dev/null
+++ b/bleach/tests/test_unicode.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+from nose.tools import eq_
+
+from bleach import clean, linkify
+
+
+def test_japanese_safe_simple():
+    eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル'))
+    eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル'))
+
+
+def test_japanese_strip():
+    eq_(u'<em>ヘルプとチュートリアル</em>',
+        clean(u'<em>ヘルプとチュートリアル</em>'))
+    eq_(u'&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
+        clean(u'<span>ヘルプとチュートリアル</span>'))
+
+
+def test_russian_simple():
+    eq_(u'Домашняя', clean(u'Домашняя'))
+    eq_(u'Домашняя', linkify(u'Домашняя'))
+
+
+def test_mixed():
+    eq_(u'Домашняяヘルプとチュートリアル',
+        clean(u'Домашняяヘルプとチュートリアル'))
+
+
+def test_mixed_linkify():
+    eq_(u'Домашняя <a href="http://example.com" rel="nofollow">'
+        u'http://example.com</a> ヘルプとチュートリアル',
+        linkify(u'Домашняя http://example.com ヘルプとチュートリアル'))
+
+
+def test_url_utf8():
+    """Allow UTF8 characters in URLs themselves."""
+    out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>'
+
+    tests = (
+        ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}),
+        ('http://éxámplé.com/íàñá/',
+         out % {'url': u'http://éxámplé.com/íàñá/'}),
+        ('http://éxámplé.com/íàñá/?foo=bar',
+         out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}),
+        ('http://éxámplé.com/íàñá/?fóo=bár',
+         out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}),
+    )
+
+    def check(test, expected_output):
+        eq_(expected_output, linkify(test))
+
+    for test, expected_output in tests:
+        yield check, test, expected_output
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c525a9e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+# These are the requirements to run the test suite.
+nose
+html5lib
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..115d811
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,27 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='bleach',
+    version='1.1.5',
+    description='An easy whitelist-based HTML-sanitizing tool.',
+    long_description=open('README.rst').read(),
+    author='James Socol',
+    author_email='james@mozilla.com',
+    url='http://github.com/jsocol/bleach',
+    license='BSD',
+    packages=find_packages(),
+    include_package_data=True,
+    package_data={'': ['README.rst']},
+    zip_safe=False,
+    install_requires=['html5lib>=0.95'],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Environment :: Web Environment',
+        'Environment :: Web Environment :: Mozilla',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ]
+)
-- 
cgit v1.2.3