18 files changed, 1611 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6714ae6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.pyo
+*.pyc
+pip-log.txt
+.coverage
+dist
+*.egg-info
+.noseids
+build
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e767f15
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: python
+python:
+ - "2.6"
+ - "2.7"
+install: pip install -Ur requirements.txt --use-mirrors
+script: nosetests
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f612983
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+See https://github.com/jsocol/bleach/contributors
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b2df30c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2010, Mozilla Foundation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    3. Neither the name of bleach nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9d5d250
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE
+include README.rst
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..08dfc50
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,159 @@
+======
+Bleach
+======
+
+Bleach is an HTML sanitizing library that escapes or strips markup and
+attributes based on a white list. Bleach can also linkify text safely, applying
+filters that Django's ``urlize`` filter cannot, and optionally setting ``rel``
+attributes, even on links already in the text.
+
+Bleach is intended for sanitizing text from *untrusted* sources. If you find
+yourself jumping through hoops to allow your site administrators to do lots of
+things, you're probably outside the use cases. Either trust those users, or
+don't.
+
+Because it relies on html5lib_, Bleach is as good as modern browsers at dealing
+with weird, quirky HTML fragments. And *any* of Bleach's methods will fix
+unbalanced or mis-nested tags.
+
+The version on `github <http://github.com/jsocol/bleach>`_ is the most
+up-to-date and contains the latest bug fixes.
+
+
+Basic Use
+=========
+
+The simplest way to use Bleach is::
+
+    >>> import bleach
+
+    >>> bleach.clean('an <script>evil()</script> example')
+    u'an &lt;script&gt;evil()&lt;/script&gt; example'
+
+    >>> bleach.linkify('an http://example.com url')
+    u'an <a href="http://example.com" rel="nofollow">http://example.com</a> url'
+
+    >>> bleach.delinkify('a <a href="http://ex.mp">link</a>')
+    u'a link'
+
+*NB*: Bleach always returns a ``unicode`` object, whether you give it a
+bytestring or a ``unicode`` object, but Bleach does not attempt to detect
+incoming character encodings, and will assume UTF-8. If you are using a
+different character encoding, you should convert from a bytestring to
+``unicode`` before passing the text to Bleach.
+
+
+Customizing Bleach
+==================
+
+``clean()``, ``linkify()`` and ``delinkify()`` can take several optional
+keyword arguments to customize their behavior.
+
+
+``clean()``
+-----------
+
+``bleach.clean()`` is the primary tool in Bleach. It uses html5lib_ to parse a
+document fragment into a tree and does the sanitization during tokenizing,
+which is incredibly powerful and has several advantages over regular
+expression-based sanitization.
+
+``tags``
+  A whitelist of HTML tags. Must be a list. Defaults to
+  ``bleach.ALLOWED_TAGS``.
+``attributes``
+  A whitelist of HTML attributes. Either a list, in which case all attributes
+  are allowed on all elements, or a dict, with tag names as keys and lists of
+  allowed attributes as values ('*' is a wildcard key to allow an attribute on
+  any tag). Instead of a list, you may pass a callable that accepts the name
+  and value of an attribute and returns True or False. Defaults to
+  ``bleach.ALLOWED_ATTRIBUTES``.
+``styles``
+  A whitelist of allowed CSS properties within a ``style`` attribute. (Note
+  that ``style`` attributes are not allowed by default.) Must be a list.
+  Defaults to ``[]``.
+``strip``
+  Strip disallowed HTML instead of escaping it. A boolean. Defaults to
+  ``False``.
+``strip_comments``
+  Strip HTML comments. A boolean. Defaults to ``True``.
+
+
+``linkify()``
+-------------
+
+``bleach.linkify()`` finds things that look like URLs or (optionally) email
+addresses and turns them into links. It does this smartly, only looking in text
+nodes, and never within ``<a>`` tags.
+
+There are options that affect output, and some of these are also applied to
+links already found in the text. These are designed to allow you to set
+attributes like ``rel="nofollow"`` or ``target``, or push outgoing links
+through a redirection URL, and do this to links already in the text, as well.
+
+``nofollow``
+  Add ``rel="nofollow"`` to non-relative links (both created by ``linkify()``
+  and those already present in the text). Defaults to ``True``.
+``filter_url``
+  A callable through which the ``href`` attribute of links (both created by
+  ``linkify()`` and already present in the text) will be passed. Must accept a
+  single argument and return a string.
+``filter_text``
+  A callable through which the text of links (only those created by
+  ``linkify``) will be passed. Must accept a single argument and return a
+  string.
+``skip_pre``
+  Do not create new links inside ``<pre>`` sections. Still follows
+  ``nofollow``. Defaults to ``False``.
+``parse_email``
+  Linkify email addresses with ``mailto:``. Defaults to ``False``.
+``target``
+  Set a ``target`` attribute on links. Like ``nofollow``, if ``target`` is not
+  ``None``, will set the attribute on links already in the text, as well.
+  Defaults to ``None``.
+
+
+``delinkify()``
+---------------
+
+``bleach.delinkify()`` is basically the opposite of ``linkify()``. It strips
+links out of text except, optionally, relative links, or links to domains
+you've whitelisted.
+
+``allow_domains``
+  Allow links to the domains in this list. Set to ``None`` or an empty list to
+  disallow all non-relative domains. See below for wildcards. Defaults to
+  ``None``.
+``allow_relative``
+  Allow relative links (i.e. those with no hostname). Defaults to ``False``.
+
+
+Wildcards
+^^^^^^^^^
+
+To allow links to a domain and its subdomains, ``allow_domains`` accepts two
+types of wildcard arguments in domains:
+
+``*``
+  Allow a single level of subdomain. This can be anywhere in the hostname, even
+  the TLD. This allows you to, for example, allow links to ``example.*``.
+  ``*.example.com`` will match both ``foo.example.com`` and ``example.com``.
+  ::
+    >>> delinkify('<a href="http://foo.ex.mp">bar</a>', \
+    ... allow_domains=['*.ex.*'])
+    u'<a href="http://foo.ex.mp">bar</a>'
+    >>> delinkify('<a href="http://ex.mp">bar</a>', allow_domains=['*.ex.mp'])
+    u'<a href="http://ex.mp">bar</a>'
+``**``
+  To allow any number of *preceding* subdomains, you can start a hostname with
+  ``**``. Note that unlike ``*``, ``**`` may only appear once, and only at the
+  beginning of a hostname.
+  ::
+    >>> delinkify('<a href="http://a.b.ex.mp">t</a>', \
+    ... allow_domains=['**.ex.mp'])
+    u'<a href="http://a.b.ex.mp">t</a>'
+  If ``**`` appears anywhere but the beginning of a hostname, ``delinkify``
+  will throw ``bleach.ValidationError`` (which is a ``ValueError`` subclass,
+  for easy catching).
+
+.. _html5lib: http://code.google.com/p/html5lib/
diff --git a/bleach/__init__.py b/bleach/__init__.py
new file mode 100644
index 0000000..bc8e49c
--- /dev/null
+++ b/bleach/__init__.py
@@ -0,0 +1,342 @@
+import itertools
+import logging
+import re
+import sys
+import urlparse
+
+import html5lib
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer.htmlserializer import HTMLSerializer
+
+from encoding import force_unicode
+from sanitizer import BleachSanitizer
+
+
+VERSION = (1, 1, 5)
+__version__ = '.'.join(map(str, VERSION))
+
+__all__ = ['clean', 'linkify', 'delinkify']
+
+log = logging.getLogger('bleach')
+
+ALLOWED_TAGS = [
+    'a',
+    'abbr',
+    'acronym',
+    'b',
+    'blockquote',
+    'code',
+    'em',
+    'i',
+    'li',
+    'ol',
+    'strong',
+    'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+    'a': ['href', 'title'],
+    'abbr': ['title'],
+    'acronym': ['title'],
+}
+
+ALLOWED_STYLES = []
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro
+       ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so
+       sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt
+       tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
+       zw""".split()
+
+TLDS.reverse()
+
+url_re = re.compile(
+    r"""\(*  # Match any opening parentheses.
+    \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
+        # /path/zz (excluding "unsafe" chars from RFC 1738,
+        # except for # and ~, which happen in practice)
+    """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+
+proto_re = re.compile(r'^[\w-]+:/{0,3}')
+
+punct_re = re.compile(r'([\.,]+)$')
+
+email_re = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
+        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
+    """,
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
+
+identity = lambda x: x  # The identity function.
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, strip=False, strip_comments=True):
+    """Clean an HTML fragment and return it"""
+    if not text:
+        return u''
+
+    text = force_unicode(text)
+    if text.startswith(u'<!--'):
+        text = u' ' + text
+
+    class s(BleachSanitizer):
+        allowed_elements = tags
+        allowed_attributes = attributes
+        allowed_css_properties = styles
+        strip_disallowed_elements = strip
+        strip_html_comments = strip_comments
+
+    parser = html5lib.HTMLParser(tokenizer=s)
+
+    return _render(parser.parseFragment(text)).strip()
+
+
+def linkify(text, nofollow=True, target=None, filter_url=identity,
+            filter_text=identity, skip_pre=False, parse_email=False,
+            tokenizer=HTMLSanitizer):
+    """Convert URL-like strings in an HTML fragment to links.
+
+    linkify() converts strings that look like URLs or domain names in a
+    blob of text that may be an HTML fragment to links, while preserving
+    (a) links already in the string, (b) urls found in attributes, and
+    (c) email addresses.
+
+    If the nofollow argument is True (the default) then rel="nofollow"
+    will be added to links created by linkify() as well as links already
+    found in the text.
+
+    The target argument will optionally add a target attribute with the
+    given value to links created by linkify() as well as links already
+    found in the text.
+
+    linkify() uses up to two filters on each link. For links created by
+    linkify(), the href attribute is passed through filter_url()
+    and the text of the link is passed through filter_text(). For links
+    already found in the document, the href attribute is passed through
+    filter_url(), but the text is untouched.
+    """
+    text = force_unicode(text)
+
+    if not text:
+        return u''
+
+    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+
+    forest = parser.parseFragment(text)
+
+    if nofollow:
+        rel = u'rel="nofollow"'
+    else:
+        rel = u''
+
+    def replace_nodes(tree, new_frag, node):
+        new_tree = parser.parseFragment(new_frag)
+        for n in new_tree.childNodes:
+            tree.insertBefore(n, node)
+        tree.removeChild(node)
+
+    def strip_wrapping_parentheses(fragment):
+        """Strips wrapping parentheses.
+
+        Returns a tuple of the following format::
+
+            (string stripped from wrapping parentheses,
+             count of stripped opening parentheses,
+             count of stripped closing parentheses)
+        """
+        opening_parentheses = closing_parentheses = 0
+        # Count consecutive opening parentheses
+        # at the beginning of the fragment (string).
+        for char in fragment:
+            if char == '(':
+                opening_parentheses += 1
+            else:
+                break
+
+        if opening_parentheses:
+            newer_frag = ''
+            # Cut the consecutive opening brackets from the fragment.
+            fragment = fragment[opening_parentheses:]
+            # Reverse the fragment for easier detection of parentheses
+            # inside the URL.
+            reverse_fragment = fragment[::-1]
+            skip = False
+            for char in reverse_fragment:
+                # Remove the closing parentheses if it has a matching
+                # opening parentheses (they are balanced).
+                if (char == ')' and
+                        closing_parentheses < opening_parentheses and
+                        not skip):
+                    closing_parentheses += 1
+                    continue
+                # Do not remove ')' from the URL itself.
+                elif char != ')':
+                    skip = True
+                newer_frag += char
+            fragment = newer_frag[::-1]
+
+        return fragment, opening_parentheses, closing_parentheses
+
+    def linkify_nodes(tree, parse_text=True):
+        for node in tree.childNodes:
+            if node.type == NODE_TEXT and parse_text:
+                new_frag = node.toxml()
+                if parse_email:
+                    new_frag = re.sub(email_re, email_repl, new_frag)
+                    if new_frag != node.toxml():
+                        replace_nodes(tree, new_frag, node)
+                        linkify_nodes(tree)
+                        continue
+                new_frag = re.sub(url_re, link_repl, new_frag)
+                replace_nodes(tree, new_frag, node)
+            elif node.name == 'a':
+                if 'href' in node.attributes:
+                    if nofollow:
+                        node.attributes['rel'] = 'nofollow'
+                    if target is not None:
+                        node.attributes['target'] = target
+                    href = node.attributes['href']
+                    node.attributes['href'] = filter_url(href)
+            elif skip_pre and node.name == 'pre':
+                linkify_nodes(node, False)
+            else:
+                linkify_nodes(node)
+
+    def email_repl(match):
+        repl = u'<a href="mailto:%(mail)s">%(mail)s</a>'
+        return repl % {'mail': match.group(0).replace('"', '&quot;')}
+
+    def link_repl(match):
+        url = match.group(0)
+        open_brackets = close_brackets = 0
+        if url.startswith('('):
+            url, open_brackets, close_brackets = (
+                    strip_wrapping_parentheses(url)
+            )
+        end = u''
+        m = re.search(punct_re, url)
+        if m:
+            end = m.group(0)
+            url = url[0:m.start()]
+        if re.search(proto_re, url):
+            href = url
+        else:
+            href = u''.join([u'http://', url])
+
+        repl = u'%s<a href="%s" %s>%s</a>%s%s'
+
+        attribs = [rel]
+        if target is not None:
+            attribs.append('target="%s"' % target)
+
+        return repl % ('(' * open_brackets,
+                       filter_url(href), ' '.join(attribs), filter_text(url),
+                       end, ')' * close_brackets)
+
+    linkify_nodes(forest)
+
+    return _render(forest)
+
+
+def delinkify(text, allow_domains=None, allow_relative=False):
+    """Remove links from text, except those allowed to stay."""
+    text = force_unicode(text)
+    if not text:
+        return u''
+
+    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
+    forest = parser.parseFragment(text)
+
+    if allow_domains is None:
+        allow_domains = []
+    elif isinstance(allow_domains, basestring):
+        allow_domains = [allow_domains]
+
+    def delinkify_nodes(tree):
+        """Remove <a> tags and replace them with their contents."""
+        # Iterate over a snapshot: the loop body edits tree.childNodes.
+        for node in list(tree.childNodes):
+            if node.name == 'a':
+                if 'href' not in node.attributes:
+                    continue
+                parts = urlparse.urlparse(node.attributes['href'])
+                host = parts.hostname
+                if any(_domain_match(host, d) for d in allow_domains):
+                    continue
+                if host is None and allow_relative:
+                    continue
+                # Replace the node with its children. html5lib never nests
+                # <a> tags, so the moved children contain no further links.
+                for n in node.childNodes:
+                    tree.insertBefore(n, node)
+                tree.removeChild(node)
+            elif node.type != NODE_TEXT: # Don't try to delinkify text.
+                delinkify_nodes(node)
+
+    delinkify_nodes(forest)
+    return _render(forest)
+
+
+def _domain_match(test, compare):
+    """Return True if hostname ``test`` matches the pattern ``compare``."""
+    if test is None:  # Relative links have no hostname; never a match.
+        return False
+    test = test.lower()
+    compare = compare.lower()
+    if '*' not in compare:
+        return test == compare
+    c = compare.split('.')[::-1]
+    if '**' in c and (c.count('**') > 1 or not compare.startswith('**')):
+        raise ValidationError(
+            'Only 1 ** is allowed, and must start the domain.')
+    t = test.split('.')[::-1]
+    for want, got in itertools.izip_longest(c, t):
+        if want == got or want == '*':
+            continue
+        elif want == '**':
+            return True
+        return False
+    # Got all the way through and everything matched.
+    return True
+
+
+class ValidationError(ValueError):
+    pass
+
+
+def _render(tree):
+    """Serialize ``tree`` as HTML, falling back to XML, then to u''."""
+    try:
+        return force_unicode(_serialize(tree))
+    except Exception as html_err:
+        log.error('HTML: %r' % html_err, exc_info=sys.exc_info())
+        try:
+            return force_unicode(tree.toxml())
+        except Exception as xml_err:
+            log.error('XML: %r' % xml_err, exc_info=sys.exc_info())
+            return u''
+
+
+def _serialize(domtree):
+    walker = html5lib.treewalkers.getTreeWalker('simpletree')
+    stream = walker(domtree)
+    serializer = HTMLSerializer(quote_attr_values=True,
+                                omit_optional_tags=False)
+    return serializer.render(stream)
diff --git a/bleach/encoding.py b/bleach/encoding.py
new file mode 100644
index 0000000..b9a989d
--- /dev/null
+++ b/bleach/encoding.py
@@ -0,0 +1,54 @@
+import datetime
+from decimal import Decimal
+import types
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_unicode(strings_only=True).
+    """
+    return isinstance(obj, (
+        types.NoneType,
+        int, long,
+        datetime.datetime, datetime.date, datetime.time,
+        float, Decimal)
+    )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Coerce ``s`` to a ``unicode`` object, decoding bytestrings with
+    ``encoding`` (UTF-8 by default) and ``errors`` as the error handler.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, basestring,):
+            if hasattr(s, '__unicode__'):
+                s = unicode(s)
+            else:
+                try:
+                    s = unicode(str(s), encoding, errors)
+                except UnicodeEncodeError:
+                    if not isinstance(s, Exception):
+                        raise
+                    # If we get to here, the caller has passed in an Exception
+                    # subclass populated with non-ASCII data without special
+                    # handling to display as a string. We need to handle this
+                    # without raising a further exception. We do an
+                    # approximation to what the Exception's standard str()
+                    # output should be.
+                    s = ' '.join([force_unicode(arg, encoding, strings_only,
+                            errors) for arg in s])
+        elif not isinstance(s, unicode):
+            # Note: We use .decode() here, instead of unicode(s, encoding,
+            # errors), so that if s is a SafeString, it ends up being a
+            # SafeUnicode at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError, e:
+        raise UnicodeDecodeError(*e.args)
+    return s
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
new file mode 100644
index 0000000..677287e
--- /dev/null
+++ b/bleach/sanitizer.py
@@ -0,0 +1,143 @@
+import re
+from xml.sax.saxutils import escape, unescape
+
+from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
+
+
+class BleachSanitizerMixin(HTMLSanitizerMixin):
+    """Mixin to replace sanitize_token() and sanitize_css()."""
+
+    allowed_svg_properties = []
+    # TODO: When the next html5lib version comes out, nuke this.
+    attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
+
+    def sanitize_token(self, token):
+        """Sanitize a token either by HTML-encoding or dropping.
+
+        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
+        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
+
+        Here callable is a function with two arguments of attribute name
+        and value. It should return true or false.
+
+        Also gives the option to strip tags instead of encoding.
+
+        """
+        if (getattr(self, 'wildcard_attributes', None) is None and
+            isinstance(self.allowed_attributes, dict)):
+            self.wildcard_attributes = self.allowed_attributes.get('*', [])
+
+        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
+                             tokenTypes['EmptyTag']):
+            if token['name'] in self.allowed_elements:
+                if 'data' in token:
+                    if isinstance(self.allowed_attributes, dict):
+                        allowed_attributes = self.allowed_attributes.get(
+                            token['name'], [])
+                        if not callable(allowed_attributes):
+                            # Concatenate; += would mutate the caller's list.
+                            allowed_attributes = (allowed_attributes +
+                                                  self.wildcard_attributes)
+                    else:
+                        allowed_attributes = self.allowed_attributes
+                    attrs = dict([(name, val) for name, val in
+                                  token['data'][::-1]
+                                  if (allowed_attributes(name, val)
+                                      if callable(allowed_attributes)
+                                      else name in allowed_attributes)])
+                    for attr in self.attr_val_is_uri:
+                        if not attr in attrs:
+                            continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                               unescape(attrs[attr])).lower()
+                        # Drop replacement characters left by unescaping.
+                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
+                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
+                            and (val_unescaped.split(':')[0] not in
+                                 self.allowed_protocols)):
+                            del attrs[attr]
+                    for attr in self.svg_attr_val_allows_ref:
+                        if attr in attrs:
+                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                                 ' ',
+                                                 unescape(attrs[attr]))
+                    if (token['name'] in self.svg_allow_local_href and
+                        'xlink:href' in attrs and
+                        re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+                        del attrs['xlink:href']
+                    if 'style' in attrs:
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token['data'] = list(attrs.items())
+                return token
+            elif self.strip_disallowed_elements:
+                pass
+            else:
+                if token['type'] == tokenTypes['EndTag']:
+                    token['data'] = '</%s>' % token['name']
+                elif token['data']:
+                    attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+                                    token['data']])
+                    token['data'] = '<%s%s>' % (token['name'], attrs)
+                else:
+                    token['data'] = '<%s>' % token['name']
+                if token['selfClosing']:
+                    token['data'] = token['data'][:-1] + '/>'
+                token['type'] = tokenTypes['Characters']
+                del token["name"]
+                return token
+        elif token['type'] == tokenTypes['Comment']:
+            if not self.strip_html_comments:
+                return token
+        else:
+            return token
+
+    def sanitize_css(self, style):
+        """HTMLSanitizerMixin.sanitize_css replacement.
+
+        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+        border-*, margin-*, and padding-*. We only whitelist what's in
+        the whitelist.
+
+        """
+        # disallow urls
+        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+        # TODO: Make sure this does what it's meant to - I *think* it wants to
+        # validate style attribute contents.
+        parts = style.split(';')
+        gauntlet = re.compile("""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
+                              """|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+        for part in parts:
+            if not gauntlet.match(part):
+                return ''
+
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
+    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+                               lowercaseElementName, lowercaseAttrName,
+                               **kwargs)
+
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
diff --git a/bleach/tests/__init__.py b/bleach/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/bleach/tests/__init__.py
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
new file mode 100644
index 0000000..60be11d
--- /dev/null
+++ b/bleach/tests/test_basics.py
@@ -0,0 +1,170 @@
+import html5lib
+from nose.tools import eq_
+
+import bleach
+
+
+def test_empty():
+    eq_('', bleach.clean(''))
+
+
+def test_comments_only():
+    comment = '<!-- this is a comment -->'
+    open_comment = '<!-- this is an open comment'
+    eq_('', bleach.clean(comment))
+    eq_('', bleach.clean(open_comment))
+    eq_(comment, bleach.clean(comment, strip_comments=False))
+    eq_('%s-->' % open_comment, bleach.clean(open_comment,
+                                             strip_comments=False))
+
+
+def test_with_comments():
+    html = '<!-- comment -->Just text'
+    eq_('Just text', bleach.clean(html))
+    eq_(html, bleach.clean(html, strip_comments=False))
+
+
+def test_no_html():
+    eq_('no html string', bleach.clean('no html string'))
+
+
+def test_allowed_html():
+    eq_('an <strong>allowed</strong> tag',
+        bleach.clean('an <strong>allowed</strong> tag'))
+    eq_('another <em>good</em> tag',
+        bleach.clean('another <em>good</em> tag'))
+
+
+def test_bad_html():
+    eq_('a <em>fixed tag</em>',
+        bleach.clean('a <em>fixed tag'))
+
+
+def test_function_arguments():
+    TAGS = ['span', 'br']
+    ATTRS = {'span': ['style']}
+
+    eq_('a <br><span style="">test</span>',
+        bleach.clean('a <br/><span style="color:red">test</span>',
+                     tags=TAGS, attributes=ATTRS))
+
+
+def test_named_arguments():
+    ATTRS = {'a': ['rel', 'href']}
+    s = u'<a href="http://xx.com" rel="alternate">xx.com</a>'
+    eq_('<a href="http://xx.com">xx.com</a>', bleach.clean(s))
+    eq_(s, bleach.clean(s, attributes=ATTRS))
+
+
+def test_disallowed_html():
+    eq_('a &lt;script&gt;safe()&lt;/script&gt; test',
+        bleach.clean('a <script>safe()</script> test'))
+    eq_('a &lt;style&gt;body{}&lt;/style&gt; test',
+        bleach.clean('a <style>body{}</style> test'))
+
+
+def test_bad_href():
+    eq_('<em>no link</em>',
+        bleach.clean('<em href="fail">no link</em>'))
+
+
+def test_bare_entities():
+    eq_('an &amp; entity', bleach.clean('an & entity'))
+    eq_('an &lt; entity', bleach.clean('an < entity'))
+    eq_('tag &lt; <em>and</em> entity',
+        bleach.clean('tag < <em>and</em> entity'))
+    eq_('&amp;', bleach.clean('&amp;'))
+
+
+def test_escaped_entities():
+    s = u'&lt;em&gt;strong&lt;/em&gt;'
+    eq_(s, bleach.clean(s))
+
+
+def test_serializer():
+    s = u'<table></table>'
+    eq_(s, bleach.clean(s, tags=['table']))
+    eq_(u'test<table></table>', bleach.linkify(u'<table>test</table>'))
+    eq_(u'<p>test</p>', bleach.clean(u'<p>test</p>', tags=['p']))
+
+
+def test_no_href_links():
+    s = u'<a name="anchor">x</a>'
+    eq_(s, bleach.linkify(s))
+    eq_(s, bleach.linkify(s, nofollow=False))
+
+
+def test_weird_strings():
+    s = '</3'
+    eq_(bleach.clean(s), '')
+
+
+def test_xml_render():
+    parser = html5lib.HTMLParser()
+    eq_(bleach._render(parser.parseFragment('')), '')
+
+
+def test_stripping():
+    eq_('a test <em>with</em> <b>html</b> tags',
+        bleach.clean('a test <em>with</em> <b>html</b> tags', strip=True))
+    eq_('a test <em>with</em>  <b>html</b> tags',
+        bleach.clean('a test <em>with</em> <img src="http://example.com/"> '
+                '<b>html</b> tags', strip=True))
+
+    s = '<p><a href="http://example.com/">link text</a></p>'
+    eq_('<p>link text</p>', bleach.clean(s, tags=['p'], strip=True))
+    s = '<p><span>multiply <span>nested <span>text</span></span></span></p>'
+    eq_('<p>multiply nested text</p>', bleach.clean(s, tags=['p'], strip=True))
+
+    s = ('<p><a href="http://example.com/"><img src="http://example.com/">'
+         '</a></p>')
+    eq_('<p><a href="http://example.com/"></a></p>',
+        bleach.clean(s, tags=['p', 'a'], strip=True))
+
+
+def test_allowed_styles():
+    ATTR = ['style']
+    STYLE = ['color']
+    blank = '<b style=""></b>'
+    s = '<b style="color: blue;"></b>'
+    eq_(blank, bleach.clean('<b style="top:0"></b>', attributes=ATTR))
+    eq_(s, bleach.clean(s, attributes=ATTR, styles=STYLE))
+    eq_(s, bleach.clean('<b style="top: 0; color: blue;"></b>',
+                        attributes=ATTR, styles=STYLE))
+
+
+def test_idempotent():
+    """Make sure that applying the filter twice doesn't change anything."""
+    dirty = u'<span>invalid & </span> < extra http://link.com<em>'
+
+    clean = bleach.clean(dirty)
+    eq_(clean, bleach.clean(clean))
+
+    linked = bleach.linkify(dirty)
+    eq_(linked, bleach.linkify(linked))
+
+
+def test_lowercase_html():
+    """We should output lowercase HTML."""
+    dirty = u'<EM CLASS="FOO">BAR</EM>'
+    clean = u'<em class="FOO">BAR</em>'
+    eq_(clean, bleach.clean(dirty, attributes=['class']))
+
+
+def test_wildcard_attributes():
+    ATTR = {
+        '*': ['id'],
+        'img': ['src'],
+    }
+    TAG = ['img', 'em']
+    dirty = (u'both <em id="foo" style="color: black">can</em> have '
+             u'<img id="bar" src="foo"/>')
+    clean = u'both <em id="foo">can</em> have <img id="bar" src="foo">'
+    eq_(clean, bleach.clean(dirty, tags=TAG, attributes=ATTR))
+
+
+def test_sarcasm():
+    """Jokes should crash.<sarcasm/>"""
+    dirty = u'Yeah right <sarcasm/>'
+    clean = u'Yeah right &lt;sarcasm/&gt;'
+    eq_(clean, bleach.clean(dirty))
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
new file mode 100644
index 0000000..fdb3f65
--- /dev/null
+++ b/bleach/tests/test_css.py
@@ -0,0 +1,85 @@
+from functools import partial
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+clean = partial(clean, tags=['p'], attributes=['style'])
+
+
+def test_allowed_css():
+    # Each case is (dirty style, expected clean style, allowed properties).
+    cases = (
+        ('font-family: Arial; color: red; float: left; '
+         'background-color: red;', 'color: red;', ['color']),
+        ('border: 1px solid blue; color: red; float: left;', 'color: red;',
+         ['color']),
+        ('border: 1px solid blue; color: red; float: left;',
+         'color: red; float: left;', ['color', 'float']),
+        ('color: red; float: left; padding: 1em;', 'color: red; float: left;',
+         ['color', 'float']),
+        ('color: red; float: left; padding: 1em;', 'color: red;', ['color']),
+        ('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']),
+        ('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']),
+        ('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']),
+        ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", ['text-overflow']),
+    )
+    template = '<p style="%s">bar</p>'
+
+    def check(dirty, expected, styles):
+        eq_(template % expected, clean(template % dirty, styles=styles))
+
+    for dirty, expected, styles in cases:
+        yield check, dirty, expected, styles
+
+
+def test_valid_css():
+    """The sanitizer should fix missing CSS values."""
+    styles = ['color', 'float']
+    eq_('<p style="float: left;">foo</p>',
+        clean('<p style="float: left; color: ">foo</p>', styles=styles))
+    eq_('<p style="">foo</p>',
+        clean('<p style="color: float: left;">foo</p>', styles=styles))
+
+
+def test_style_hang():
+    """The sanitizer should not hang on any inline styles"""
+    # TODO: Neaten this up. It's copypasta from MDN/Kuma to repro the bug
+    style = ("""margin-top: 0px; margin-right: 0px; margin-bottom: 1.286em; """
+             """margin-left: 0px; padding-top: 15px; padding-right: 15px; """
+             """padding-bottom: 15px; padding-left: 15px; border-top-width: """
+             """1px; border-right-width: 1px; border-bottom-width: 1px; """
+             """border-left-width: 1px; border-top-style: dotted; """
+             """border-right-style: dotted; border-bottom-style: dotted; """
+             """border-left-style: dotted; border-top-color: rgb(203, 200, """
+             """185); border-right-color: rgb(203, 200, 185); """
+             """border-bottom-color: rgb(203, 200, 185); border-left-color: """
+             """rgb(203, 200, 185); background-image: initial; """
+             """background-attachment: initial; background-origin: initial; """
+             """background-clip: initial; background-color: """
+             """rgb(246, 246, 242); overflow-x: auto; overflow-y: auto; """
+             """font: normal normal normal 100%/normal 'Courier New', """
+             """'Andale Mono', monospace; background-position: initial """
+             """initial; background-repeat: initial initial;""")
+    html = '<p style="%s">Hello world</p>' % style
+    styles = [
+        'border', 'float', 'overflow', 'min-height', 'vertical-align',
+        'white-space',
+        'margin', 'margin-left', 'margin-top', 'margin-bottom', 'margin-right',
+        'padding', 'padding-left', 'padding-top', 'padding-bottom', 'padding-right',
+        'background',
+        'background-color',
+        'font', 'font-size', 'font-weight', 'text-align', 'text-transform',
+    ]
+
+    expected = ("""<p style="margin-top: 0px; margin-right: 0px; """
+                """margin-bottom: 1.286em; margin-left: 0px; padding-top: """
+                """15px; padding-right: 15px; padding-bottom: 15px; """
+                """padding-left: 15px; background-color: """
+                """rgb(246, 246, 242); font: normal normal normal """
+                """100%/normal 'Courier New', 'Andale Mono', monospace;">"""
+                """Hello world</p>""")
+
+    result = clean(html, styles=styles)
+    eq_(expected, result)
diff --git a/bleach/tests/test_delinkify.py b/bleach/tests/test_delinkify.py
new file mode 100644
index 0000000..f216d2f
--- /dev/null
+++ b/bleach/tests/test_delinkify.py
@@ -0,0 +1,109 @@
+from nose.tools import eq_
+
+import bleach
+
+
+def test_delinkify():
+    eq_('test', bleach.delinkify('<a href="http://ex.mp">test</a>'))
+    eq_('footestbar',
+        bleach.delinkify('foo<a href="http://ex.mp">test</a>bar'))
+
+
+def test_whitelist():
+    html = '<a href="http://ex.mp">test</a>'
+    eq_(html, bleach.delinkify(html, allow_domains=['ex.mp']))
+    eq_('test', bleach.delinkify(html, allow_domains=['ex2.mp']))
+    # Allow a single domain as a special case.
+    eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_nested_a():
+    html = '<a href="http://ex.mp">test<a href="http://foo.bar">test</a></a>'
+    eq_('testtest', bleach.delinkify(html))
+    eq_('<a href="http://ex.mp">test</a>test',
+        bleach.delinkify(html, allow_domains=['ex.mp']))
+
+
+def test_nested_tag():
+    html = '<a href="http://ex.mp">test<span>test</span></a>'
+    eq_('test<span>test</span>', bleach.delinkify(html))
+
+
+def test_a_name():
+    """Don't screw with non-link <a> tags."""
+    html = '<a name="foo">bar</a>'
+    eq_(html, bleach.delinkify(html))
+
+
+def test_relative():
+    """Relative links are optionally OK."""
+    html = 'some <a href="/foo/bar">link</a>'
+    eq_('some link', bleach.delinkify(html))
+    eq_(html, bleach.delinkify(html, allow_relative=True))
+
+
+def test_protocol_relative():
+    """Protocol-relative links aren't relative."""
+    html = 'bad <a href="//ex.mp">link</a>'
+    expect = 'bad link'
+    eq_(expect, bleach.delinkify(html))
+    eq_(expect, bleach.delinkify(html, allow_relative=True))
+    eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
+
+
+def test_domain_match():
+    tests = (
+        ('ex.mp', 'ex.mp', True),
+        ('ex.mp', '*.ex.mp', True),
+        ('test.ex.mp', '*.ex.mp', True),
+        ('test.ex.mp', 'ex.mp', False),
+        ('test.test.ex.mp', '*.ex.mp', False),
+        ('test.test.ex.mp', '**.ex.mp', True),
+        ('wrong.mp', 'ex.mp', False),
+        ('wrong.mp', '*.ex.mp', False),
+        ('really.wrong.mp', 'ex.mp', False),
+        ('really.wrong.mp', '*.ex.mp', False),
+        ('really.very.wrong.mp', '*.ex.mp', False),
+        ('EX.mp', 'ex.mp', True),  # Domains are case-insensitive.
+        ('ex.mp', 'an.ex.mp', False),
+        ('ex.mp', '*.an.ex.mp', False),
+        ('an.ex.am.pl', 'an.*.am.pl', True),
+        ('a.ex.am.pl', 'an.*.am.pl', False),
+        ('ex.am.pl', 'an.*.am.pl', False),
+    )
+
+    def _check(t, c, v):
+        eq_(v, bleach._domain_match(t, c))
+
+    for t, c, v in tests:
+        yield _check, t, c, v
+
+
+def test_double_star():
+    """'**.ex.mp' matches 'ex.mp'; 'an.**.ex.mp' must raise ValidationError."""
+    assert bleach._domain_match('ex.mp', '**.ex.mp')
+    try:
+        bleach._domain_match('ex.mp', 'an.**.ex.mp')
+    except bleach.ValidationError:
+        return
+    assert False, '_domain_match should not accept an.**.ex.mp'
+
+
+def test_allow_subdomains():
+    domains = ('ex.mp', '*.exa.mp', 'an.exam.pl', '*.my.examp.le')
+    cases = (  # (markup, expected text); None means "returned unchanged"
+        ('<a href="http://an.ex.mp">bad</a>', 'bad'),
+        ('<a href="http://exa.mp">good</a>', None),
+        ('<a href="http://an.exa.mp">good</a>', None),
+        ('<a href="http://an.exam.pl">good</a>', None),
+        ('<a href="http://another.exam.pl">bad</a>', 'bad'),
+        ('<a href="http://a.bad.examp.le">bad</a>', 'bad'),
+        ('<a href="http://a.very.bad.examp.le">bad</a>', 'bad'),
+    )
+
+    def _check(markup, stripped):
+        expected = markup if stripped is None else stripped
+        eq_(expected, bleach.delinkify(markup, allow_domains=domains))
+
+    for markup, stripped in cases:
+        yield _check, markup, stripped
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
new file mode 100644
index 0000000..7caf006
--- /dev/null
+++ b/bleach/tests/test_links.py
@@ -0,0 +1,312 @@
+import urllib
+
+from html5lib.tokenizer import HTMLTokenizer
+from nose.tools import eq_
+
+from bleach import linkify, url_re
+
+
+def filter_url(url):
+    return u'http://bouncer/?u=%s' % urllib.quote_plus(url)
+
+
+def test_url_re():
+    def no_match(s):
+        found = url_re.search(s)
+        # The message is only evaluated on failure, when found is not None.
+        assert not found, 'matched %s' % found.group()
+    yield no_match, 'just what i am looking for...it'
+
+
+def test_empty():
+    eq_('', linkify(''))
+
+
+def test_simple_link():
+    eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
+        '</a> link',
+        linkify('a http://example.com link'))
+    eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
+        '</a> link',
+        linkify('a https://example.com link'))
+    eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
+        linkify('an example.com link'))
+
+
+def test_trailing_slash():
+    eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
+       linkify('http://example.com/'))
+    eq_('<a href="http://example.com/foo/" rel="nofollow">'
+        'http://example.com/foo/</a>',
+       linkify('http://example.com/foo/'))
+    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
+        'http://example.com/foo/bar/</a>',
+       linkify('http://example.com/foo/bar/'))
+
+
+def test_mangle_link():
+    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
+        'http://example.com</a>',
+        linkify('http://example.com', filter_url=filter_url))
+
+
+def test_email_link():
+    eq_('a james@example.com mailto',
+        linkify('a james@example.com mailto'))
+    eq_('a james@example.com.au mailto',
+        linkify('a james@example.com.au mailto'))
+    eq_('a <a href="mailto:james@example.com" rel="nofollow">'
+        'james@example.com</a> mailto',
+        linkify('a james@example.com mailto', parse_email=True))
+    eq_('aussie <a href="mailto:james@example.com.au" rel="nofollow">'
+        'james@example.com.au</a> mailto',
+        linkify('aussie james@example.com.au mailto', parse_email=True))
+    eq_('email to <a href="james@example.com" rel="nofollow">'
+        'james@example.com</a>',
+        linkify('email to <a href="james@example.com">'
+        'james@example.com</a>', parse_email=True))
+
+
+def test_email_link_escaping():
+    eq_('''<a href='mailto:"james"@example.com' rel="nofollow">'''
+        '''"james"@example.com</a>''',
+        linkify('"james"@example.com', parse_email=True))
+    eq_('''<a href="mailto:&quot;j'ames&quot;@example.com" rel="nofollow">'''
+        '''"j'ames"@example.com</a>''',
+        linkify('"j\'ames"@example.com', parse_email=True))
+    eq_('''<a href='mailto:"ja>mes"@example.com' rel="nofollow">'''
+        '''"ja&gt;mes"@example.com</a>''',
+        linkify('"ja>mes"@example.com', parse_email=True))
+
+
+def test_tlds():
+    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
+        linkify('example.com'))
+    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
+        linkify('example.co.uk'))
+    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
+        linkify('example.edu'))
+    eq_('example.xxx', linkify('example.xxx'))
+    eq_(' brie', linkify(' brie'))
+    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
+        linkify('bit.ly/fun'))
+
+
+def test_escaping():
+    eq_('&lt; unrelated', linkify('< unrelated'))
+
+
+def test_nofollow_off():
+    eq_('<a href="http://example.com">example.com</a>',
+        linkify(u'example.com', nofollow=False))
+
+
+def test_link_in_html():
+    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
+        linkify('<i>http://yy.com</i>'))
+    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
+        '</strong></em>',
+        linkify('<em><strong>http://xx.com</strong></em>'))
+
+
+def test_links_https():
+    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
+        linkify('https://yy.com'))
+
+
+def test_add_rel_nofollow():
+    """Verify that rel="nofollow" is added to an existing link"""
+    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
+        linkify('<a href="http://yy.com">http://yy.com</a>'))
+
+
+def test_url_with_path():
+    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
+        'http://example.com/path/to/file</a>',
+        linkify('http://example.com/path/to/file'))
+
+
+def test_link_ftp():
+    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
+        'ftp://ftp.mozilla.org/some/file</a>',
+        linkify('ftp://ftp.mozilla.org/some/file'))
+
+
+def test_link_query():
+    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+        'http://xx.com/?test=win</a>',
+        linkify('http://xx.com/?test=win'))
+    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
+        'xx.com/?test=win</a>',
+        linkify('xx.com/?test=win'))
+    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
+        'xx.com?test=win</a>',
+        linkify('xx.com?test=win'))
+
+
+def test_link_fragment():
+    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
+        'http://xx.com/path#frag</a>',
+        linkify('http://xx.com/path#frag'))
+
+
+def test_link_entities():
+    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
+        'http://xx.com/?a=1&amp;b=2</a>',
+        linkify('http://xx.com/?a=1&b=2'))
+
+
+def test_escaped_html():
+    """If I pass in escaped HTML, it should probably come out escaped."""
+    s = '&lt;em&gt;strong&lt;/em&gt;'
+    eq_(s, linkify(s))
+
+
+def test_link_http_complete():
+    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
+        '&amp;e#f" rel="nofollow">'
+        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
+        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
+
+
+def test_non_url():
+    """document.vulnerable should absolutely not be linkified."""
+    s = 'document.vulnerable'
+    eq_(s, linkify(s))
+
+
+def test_javascript_url():
+    """javascript: urls should never be linkified."""
+    s = 'javascript:document.vulnerable'
+    eq_(s, linkify(s))
+
+
+def test_unsafe_url():
+    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
+    eq_('All your{"<a href="http://xx.yy.com/grover.png" '
+                     'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
+        linkify('All your{"xx.yy.com/grover.png"}base are'))
+
+
+def test_skip_pre():
+    """Skip linkification in <pre> tags."""
+    simple = 'http://xx.com <pre>http://xx.com</pre>'
+    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+              '<pre>http://xx.com</pre>')
+    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
+                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
+                  '</a></pre>')
+    eq_(linked, linkify(simple, skip_pre=True))
+    eq_(all_linked, linkify(simple))
+
+    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
+    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
+    eq_(nofollowed, linkify(already_linked))
+    eq_(nofollowed, linkify(already_linked, skip_pre=True))
+
+
+def test_libgl():
+    """libgl.so.1 should not be linkified."""
+    eq_('libgl.so.1', linkify('libgl.so.1'))
+
+
+def test_end_of_sentence():
+    """Trailing periods after a URL stay outside the generated link."""
+    expected = u'<a href="http://%s" rel="nofollow">%s</a>%s'
+    source = u'%s%s'
+
+    cases = (
+        ('example.com', '.'),
+        ('example.com', '...'),
+        ('ex.com/foo', '.'),
+        ('ex.com/foo', '....'),
+    )
+
+    def check(u, p):
+        eq_(expected % (u, u, p), linkify(source % (u, p)))
+
+    for url, dots in cases:
+        yield check, url, dots
+
+
+def test_end_of_clause():
+    """example.com/foo, shouldn't include the ,"""
+    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
+        linkify('ex.com/foo, bar'))
+
+
+def test_sarcasm():
+    """Jokes should crash.<sarcasm/>"""
+    dirty = u'Yeah right <sarcasm/>'
+    clean = u'Yeah right &lt;sarcasm/&gt;'
+    eq_(clean, linkify(dirty))
+
+
+def test_wrapping_parentheses():
+    """URLs wrapped in parentheses should not include them."""
+    out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
+
+    tests = (  # (input text, expected linkify() output)
+        ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
+        ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
+        ('(example.com/foo)', out % ('(', 'example.com/foo',
+                                     'example.com/foo', ')')),
+        ('(((example.com/))))', out % ('(((', 'example.com/)',
+                                       'example.com/)', ')))')),
+        ('example.com/))', out % ('', 'example.com/))',
+                                  'example.com/))', '')),
+        ('http://en.wikipedia.org/wiki/Test_(assessment)',
+            out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
+                   'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
+        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
+            out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
+                   'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
+        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
+            out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
+                   'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
+        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
+            out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
+                   'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
+        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
+            out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
+                   'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
+    )
+
+    def check(test, expected_output):
+        eq_(expected_output, linkify(test))
+
+    for test, expected_output in tests:
+        yield check, test, expected_output
+
+
+def test_ports():
+    """URLs can contain port numbers."""
+    tests = (
+        ('http://foo.com:8000', ('http://foo.com:8000', '')),
+        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
+        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
+        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
+        ('http://foo.com:', ('http://foo.com', ':')),
+    )
+
+    def check(test, output):
+        eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
+            linkify(test))
+
+    for test, output in tests:
+        yield check, test, output
+
+
+def test_target():
+    eq_('<a href="http://example.com" rel="nofollow" '
+        'target="_blank">example.com</a>',
+        linkify(u'example.com', target='_blank'))
+    eq_('<a href="http://example.com" target="_blank">example.com</a>',
+        linkify(u'example.com', target='_blank', nofollow=False))
+
+
+def test_tokenizer():
+    """Linkify doesn't always have to sanitize."""
+    raw = '<em>test<x></x></em>'
+    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
+    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
new file mode 100644
index 0000000..9e9bb7b
--- /dev/null
+++ b/bleach/tests/test_security.py
@@ -0,0 +1,108 @@
+"""More advanced security tests"""
+
+from nose.tools import eq_
+
+from bleach import clean
+
+
+def test_nested_script_tag():
+    eq_('&lt;&lt;script&gt;script&gt;evil()&lt;&lt;/script&gt;/script&gt;',
+        clean('<<script>script>evil()<</script>/script>'))
+    eq_('&lt;&lt;x&gt;script&gt;evil()&lt;&lt;/x&gt;/script&gt;',
+        clean('<<x>script>evil()<</x>/script>'))
+
+
+def test_nested_script_tag_r():
+    eq_('&lt;script&lt;script&gt;&gt;evil()&lt;/script&lt;&gt;&gt;',
+        clean('<script<script>>evil()</script</script>>'))
+
+
+def test_invalid_attr():
+    IMG = ['img', ]
+    IMG_ATTR = ['src']
+
+    eq_('<a href="test">test</a>',
+        clean('<a onclick="evil" href="test">test</a>'))
+    eq_('<img src="test">',
+        clean('<img onclick="evil" src="test" />',
+                tags=IMG, attributes=IMG_ATTR))
+    eq_('<img src="test">',
+        clean('<img href="invalid" src="test" />',
+                tags=IMG, attributes=IMG_ATTR))
+
+
+def test_unquoted_attr():
+    eq_('<abbr title="mytitle">myabbr</abbr>',
+        clean('<abbr title=mytitle>myabbr</abbr>'))
+
+
+def test_unquoted_event_handler():
+    eq_('<a href="http://xx.com">xx.com</a>',
+        clean('<a href="http://xx.com" onclick=foo()>xx.com</a>'))
+
+
+def test_invalid_attr_value():
+    eq_('&lt;img src="javascript:alert(\'XSS\');"&gt;',
+        clean('<img src="javascript:alert(\'XSS\');">'))
+
+
+def test_invalid_href_attr():
+    eq_('<a>xss</a>',
+        clean('<a href="javascript:alert(\'XSS\')">xss</a>'))
+
+
+def test_invalid_filter_attr():
+    IMG = ['img', ]
+    IMG_ATTR = {'img': lambda n, v: n == 'src' and v == "http://example.com/"}
+
+    eq_('<img src="http://example.com/">',
+        clean('<img onclick="evil" src="http://example.com/" />',
+                tags=IMG, attributes=IMG_ATTR))
+
+    eq_('<img>', clean('<img onclick="evil" src="http://badhost.com/" />',
+                       tags=IMG, attributes=IMG_ATTR))
+
+
+def test_invalid_tag_char():
+    eq_('&lt;script xss="" src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+        clean('<script/xss src="http://xx.com/xss.js"></script>'))
+    eq_('&lt;script src="http://xx.com/xss.js"&gt;&lt;/script&gt;',
+        clean('<script/src="http://xx.com/xss.js"></script>'))
+
+
+def test_unclosed_tag():
+    eq_('&lt;script src="http://xx.com/xss.js&amp;lt;b"&gt;',
+        clean('<script src=http://xx.com/xss.js<b>'))
+    eq_('&lt;script src="http://xx.com/xss.js" &lt;b=""&gt;',
+        clean('<script src="http://xx.com/xss.js"<b>'))
+    eq_('&lt;script src="http://xx.com/xss.js" &lt;b=""&gt;',
+        clean('<script src="http://xx.com/xss.js" <b>'))
+
+
+def test_strip():
+    """Using strip=True shouldn't result in malicious content."""
+    s = '<scri<script>pt>alert(1)</scr</script>ipt>'
+    eq_('pt&gt;alert(1)ipt&gt;', clean(s, strip=True))
+    s = '<scri<scri<script>pt>pt>alert(1)</script>'
+    eq_('pt&gt;pt&gt;alert(1)', clean(s, strip=True))
+
+
+def test_nasty():
+    """Nested, broken up, multiple tags, are still foiled!"""
+    test = ('<scr<script></script>ipt type="text/javascript">alert("foo");</'
+            '<script></script>script<del></del>>')
+    expect = (u'&lt;scr&lt;script&gt;&lt;/script&gt;ipt type="text/javascript"'
+              u'&gt;alert("foo");&lt;/script&gt;script&lt;del&gt;&lt;/del&gt;'
+              u'&gt;')
+    eq_(expect, clean(test))
+
+
+def test_poster_attribute():
+    """Poster attributes should not allow javascript."""
+    tags = ['video']
+    attrs = {'video': ['poster']}
+    test = '<video poster="javascript:alert(1)"></video>'
+    expect = '<video></video>'
+    eq_(expect, clean(test, tags=tags, attributes=attrs))
+    ok = '<video poster="/foo.png"></video>'
+    eq_(ok, clean(ok, tags=tags, attributes=attrs))
diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py
new file mode 100644
index 0000000..67123cc
--- /dev/null
+++ b/bleach/tests/test_unicode.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+from nose.tools import eq_
+
+from bleach import clean, linkify
+
+
+def test_japanese_safe_simple():
+    eq_(u'ヘルプとチュートリアル', clean(u'ヘルプとチュートリアル'))
+    eq_(u'ヘルプとチュートリアル', linkify(u'ヘルプとチュートリアル'))
+
+
+def test_japanese_strip():
+    eq_(u'<em>ヘルプとチュートリアル</em>',
+        clean(u'<em>ヘルプとチュートリアル</em>'))
+    eq_(u'&lt;span&gt;ヘルプとチュートリアル&lt;/span&gt;',
+        clean(u'<span>ヘルプとチュートリアル</span>'))
+
+
+def test_russian_simple():
+    eq_(u'Домашняя', clean(u'Домашняя'))
+    eq_(u'Домашняя', linkify(u'Домашняя'))
+
+
+def test_mixed():
+    eq_(u'Домашняяヘルプとチュートリアル',
+        clean(u'Домашняяヘルプとチュートリアル'))
+
+
+def test_mixed_linkify():
+    eq_(u'Домашняя <a href="http://example.com" rel="nofollow">'
+        u'http://example.com</a> ヘルプとチュートリアル',
+        linkify(u'Домашняя http://example.com ヘルプとチュートリアル'))
+
+
+def test_url_utf8():
+    """Allow UTF8 characters in URLs themselves."""
+    out = u'<a href="%(url)s" rel="nofollow">%(url)s</a>'
+
+    tests = (
+        ('http://éxámplé.com/', out % {'url': u'http://éxámplé.com/'}),
+        ('http://éxámplé.com/íàñá/',
+                out % {'url': u'http://éxámplé.com/íàñá/'}),
+        ('http://éxámplé.com/íàñá/?foo=bar',
+            out % {'url': u'http://éxámplé.com/íàñá/?foo=bar'}),
+        ('http://éxámplé.com/íàñá/?fóo=bár',
+            out % {'url': u'http://éxámplé.com/íàñá/?fóo=bár'}),
+    )
+
+    def check(test, expected_output):
+        eq_(expected_output, linkify(test))
+
+    for test, expected_output in tests:
+        yield check, test, expected_output
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c525a9e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+# These are the requirements to run the test suite.
+nose
+html5lib
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..115d811
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,32 @@
+from setuptools import setup, find_packages
+
+# Read the long description inside a context manager so the file handle is
+# closed as soon as the text has been read instead of being left open.
+with open('README.rst') as readme:
+    long_description = readme.read()
+
+setup(
+    name='bleach',
+    version='1.1.5',
+    description='An easy whitelist-based HTML-sanitizing tool.',
+    long_description=long_description,
+    author='James Socol',
+    author_email='james@mozilla.com',
+    url='http://github.com/jsocol/bleach',
+    license='BSD',
+    packages=find_packages(),
+    include_package_data=True,
+    package_data={'': ['README.rst']},
+    zip_safe=False,
+    install_requires=['html5lib>=0.95'],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Environment :: Web Environment',
+        'Environment :: Web Environment :: Mozilla',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ]
+)