author    Per Andersson <avtobiff@gmail.com>  2013-06-09 19:45:54 +0200
committer Per Andersson <avtobiff@gmail.com>  2013-06-09 19:45:54 +0200
commit    fac84c6d90e0875e6c1b10c5ef02d577ee008af4 (patch)
tree      4080efdb87c814d5dc409e9e87aa449f4b273ff3 /bleach
parent    38dc3b8f231cf36bcc771001318556d9e84c2889 (diff)
Imported Upstream version 1.2.2 (tag: upstream/1.2.2)
Diffstat (limited to 'bleach')
-rw-r--r--  bleach/__init__.py              230
-rw-r--r--  bleach/callbacks.py              15
-rw-r--r--  bleach/sanitizer.py               6
-rw-r--r--  bleach/tests/test_basics.py       5
-rw-r--r--  bleach/tests/test_css.py         16
-rw-r--r--  bleach/tests/test_delinkify.py  109
-rw-r--r--  bleach/tests/test_links.py      200
-rw-r--r--  bleach/tests/test_security.py     4
8 files changed, 306 insertions(+), 279 deletions(-)
diff --git a/bleach/__init__.py b/bleach/__init__.py
index bc8e49c..af75d0f 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,19 +1,18 @@
-import itertools
import logging
import re
import sys
-import urlparse
import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer.htmlserializer import HTMLSerializer
-from encoding import force_unicode
-from sanitizer import BleachSanitizer
+from . import callbacks as linkify_callbacks
+from .encoding import force_unicode
+from .sanitizer import BleachSanitizer
-VERSION = (1, 1, 5)
-__version__ = '.'.join(map(str, VERSION))
+VERSION = (1, 2, 1)
+__version__ = '1.2.1'
__all__ = ['clean', 'linkify']
@@ -56,18 +55,21 @@ TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
zw""".split()
+PROTOCOLS = HTMLSanitizer.acceptable_protocols
+
TLDS.reverse()
url_re = re.compile(
r"""\(* # Match any opening parentheses.
- \b(?<![@.])(?:\w[\w-]*:/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
+ \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http://
([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
(?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+ """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+ re.IGNORECASE | re.VERBOSE | re.UNICODE)
-proto_re = re.compile(r'^[\w-]+:/{0,3}')
+proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
punct_re = re.compile(r'([\.,]+)$')
@@ -83,7 +85,10 @@ email_re = re.compile(
NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
-identity = lambda x: x # The identity function.
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+PY_26 = (sys.version_info < (2, 7))
+RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
@@ -93,8 +98,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
return u''
text = force_unicode(text)
- if text.startswith(u'<!--'):
- text = u' ' + text
class s(BleachSanitizer):
allowed_elements = tags
@@ -105,32 +108,17 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
parser = html5lib.HTMLParser(tokenizer=s)
- return _render(parser.parseFragment(text)).strip()
+ return _render(parser.parseFragment(text))
-def linkify(text, nofollow=True, target=None, filter_url=identity,
- filter_text=identity, skip_pre=False, parse_email=False,
- tokenizer=HTMLSanitizer):
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
+ parse_email=False, tokenizer=HTMLSanitizer):
"""Convert URL-like strings in an HTML fragment to links.
linkify() converts strings that look like URLs or domain names in a
blob of text that may be an HTML fragment to links, while preserving
(a) links already in the string, (b) urls found in attributes, and
(c) email addresses.
-
- If the nofollow argument is True (the default) then rel="nofollow"
- will be added to links created by linkify() as well as links already
- found in the text.
-
- The target argument will optionally add a target attribute with the
- given value to links created by linkify() as well as links already
- found in the text.
-
- linkify() uses up to two filters on each link. For links created by
- linkify(), the href attribute is passed through filter_url()
- and the text of the link is passed through filter_text(). For links
- already found in the document, the href attribute is passed through
- filter_url(), but the text is untouched.
"""
text = force_unicode(text)
@@ -141,16 +129,16 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
forest = parser.parseFragment(text)
- if nofollow:
- rel = u'rel="nofollow"'
- else:
- rel = u''
-
def replace_nodes(tree, new_frag, node):
new_tree = parser.parseFragment(new_frag)
for n in new_tree.childNodes:
+            # Prevent us from re-parsing new links as existing links.
+ if n.name == 'a':
+ n._seen = True
tree.insertBefore(n, node)
tree.removeChild(node)
+            # Return the net change in the number of child nodes.
+ return len(new_tree.childNodes) - 1
def strip_wrapping_parentheses(fragment):
"""Strips wrapping parentheses.
@@ -194,34 +182,77 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
return fragment, opening_parentheses, closing_parentheses
+ def apply_callbacks(attrs, new):
+ for cb in callbacks:
+ attrs = cb(attrs, new)
+ if attrs is None:
+ return None
+ return attrs
+
def linkify_nodes(tree, parse_text=True):
- for node in tree.childNodes:
+ # I know this isn't Pythonic, but we're sometimes mutating
+ # tree.childNodes, which ends up breaking the loop and causing us to
+ # reparse code.
+ children = len(tree.childNodes)
+ current = 0 # A pointer to the "current" node.
+ while current < children:
+ node = tree.childNodes[current]
if node.type == NODE_TEXT and parse_text:
- new_frag = node.toxml()
+ new_frag = _render(node)
+ # Look for email addresses?
if parse_email:
new_frag = re.sub(email_re, email_repl, new_frag)
- if new_frag != node.toxml():
- replace_nodes(tree, new_frag, node)
+ if new_frag != _render(node):
+ adj = replace_nodes(tree, new_frag, node)
+ children += adj
+ current += adj
linkify_nodes(tree)
continue
new_frag = re.sub(url_re, link_repl, new_frag)
- replace_nodes(tree, new_frag, node)
- elif node.name == 'a':
+ if new_frag != _render(node):
+ adj = replace_nodes(tree, new_frag, node)
+ children += adj
+ current += adj
+ elif node.name == 'a' and not getattr(node, '_seen', False):
if 'href' in node.attributes:
- if nofollow:
- node.attributes['rel'] = 'nofollow'
- if target is not None:
- node.attributes['target'] = target
- href = node.attributes['href']
- node.attributes['href'] = filter_url(href)
+ attrs = node.attributes
+ _text = attrs['_text'] = ''.join(c.toxml() for
+ c in node.childNodes)
+ attrs = apply_callbacks(attrs, False)
+ if attrs is not None:
+ text = force_unicode(attrs.pop('_text'))
+ node.attributes = attrs
+ for n in reversed(node.childNodes):
+ node.removeChild(n)
+ text = parser.parseFragment(text)
+ for n in text.childNodes:
+ node.appendChild(n)
+ node._seen = True
+ else:
+ replace_nodes(tree, _text, node)
elif skip_pre and node.name == 'pre':
linkify_nodes(node, False)
- else:
+ elif not getattr(node, '_seen', False):
linkify_nodes(node)
+ current += 1
def email_repl(match):
- repl = u'<a href="mailto:%(mail)s">%(mail)s</a>'
- return repl % {'mail': match.group(0).replace('"', '&quot;')}
+ addr = match.group(0).replace('"', '&quot;')
+ link = {
+ '_text': addr,
+ 'href': 'mailto:%s' % addr,
+ }
+ link = apply_callbacks(link, True)
+
+ if link is None:
+ return addr
+
+ _href = link.pop('href')
+ _text = link.pop('_text')
+
+ repl = '<a href="%s" %s>%s</a>'
+ attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
+ return repl % (_href, attribs, _text)
def link_repl(match):
url = match.group(0)
@@ -240,98 +271,41 @@ def linkify(text, nofollow=True, target=None, filter_url=identity,
else:
href = u''.join([u'http://', url])
- repl = u'%s<a href="%s" %s>%s</a>%s%s'
-
- attribs = [rel]
- if target is not None:
- attribs.append('target="%s"' % target)
-
- return repl % ('(' * open_brackets,
- filter_url(href), ' '.join(attribs), filter_text(url),
- end, ')' * close_brackets)
+ link = {
+ '_text': url,
+ 'href': href,
+ }
- linkify_nodes(forest)
+ link = apply_callbacks(link, True)
- return _render(forest)
+ if link is None:
+ return url
+ _text = link.pop('_text')
+ _href = link.pop('href')
-def delinkify(text, allow_domains=None, allow_relative=False):
- """Remove links from text, except those allowed to stay."""
- text = force_unicode(text)
- if not text:
- return u''
+ repl = u'%s<a href="%s" %s>%s</a>%s%s'
+ attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items())
- parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
- forest = parser.parseFragment(text)
+ return repl % ('(' * open_brackets,
+ _href, attribs, _text, end,
+ ')' * close_brackets)
- if allow_domains is None:
- allow_domains = []
- elif isinstance(allow_domains, basestring):
- allow_domains = [allow_domains]
+ try:
+ linkify_nodes(forest)
+ except (RECURSION_EXCEPTION), e:
+ # If we hit the max recursion depth, just return what we've got.
+ log.exception('Probable recursion error: %r' % e)
- def delinkify_nodes(tree):
- """Remove <a> tags and replace them with their contents."""
- for node in tree.childNodes:
- if node.name == 'a':
- if 'href' not in node.attributes:
- continue
- parts = urlparse.urlparse(node.attributes['href'])
- host = parts.hostname
- if any(_domain_match(host, d) for d in allow_domains):
- continue
- if host is None and allow_relative:
- continue
- # Replace the node with its children.
- # You can't nest <a> tags, and html5lib takes care of that
- # for us in the tree-building step.
- for n in node.childNodes:
- tree.insertBefore(n, node)
- tree.removeChild(node)
- elif node.type != NODE_TEXT: # Don't try to delinkify text.
- delinkify_nodes(node)
-
- delinkify_nodes(forest)
return _render(forest)
-def _domain_match(test, compare):
- test = test.lower()
- compare = compare.lower()
- if '*' not in compare:
- return test == compare
- c = compare.split('.')[::-1]
- if '**' in c and (c.count('**') > 1 or not compare.startswith('**')):
- raise ValidationError(
- 'Only 1 ** is allowed, and must start the domain.')
- t = test.split('.')[::-1]
- z = itertools.izip_longest(c, t)
- for c, t in z:
- if c == t:
- continue
- elif c == '*':
- continue
- elif c == '**':
- return True
- return False
- # Got all the way through and everything matched.
- return True
-
-
-class ValidationError(ValueError):
- pass
-
-
def _render(tree):
"""Try rendering as HTML, then XML, then give up."""
try:
return force_unicode(_serialize(tree))
- except Exception, e:
- log.error('HTML: %r' % e, exc_info=sys.exc_info())
- try:
- return force_unicode(tree.toxml())
- except Exception, e:
- log.error('XML: %r' % e, exc_info=sys.exc_info())
- return u''
+ except AssertionError: # The treewalker throws this sometimes.
+ return force_unicode(tree.toxml())
def _serialize(domtree):
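
The hunks above replace linkify()'s nofollow/target/filter_url/filter_text keyword arguments with a single callbacks list: each callback receives the link's attribute dict (including the '_text' pseudo-attribute holding the link text) plus a `new` flag, and may edit the dict or return None to drop the link. A minimal sketch of the new API, assuming bleach 1.2.x on Python 2; `shorten_text` is a hypothetical callback, not part of the library:

    import bleach

    def shorten_text(attrs, new=False):
        # '_text' holds the link's inner text; truncate long labels.
        if len(attrs['_text']) > 20:
            attrs['_text'] = attrs['_text'][:20] + u'...'
        return attrs

    # Callbacks run in order; DEFAULT_CALLBACKS adds rel="nofollow".
    print bleach.linkify(u'docs at http://example.com/a/very/long/path',
                         callbacks=bleach.DEFAULT_CALLBACKS + [shorten_text])
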
diff --git a/bleach/callbacks.py b/bleach/callbacks.py
new file mode 100644
index 0000000..cc4682d
--- /dev/null
+++ b/bleach/callbacks.py
@@ -0,0 +1,15 @@
+"""A set of basic callbacks for bleach.linkify."""
+
+
+def nofollow(attrs, new=False):
+ if attrs['href'].startswith('mailto:'):
+ return attrs
+ attrs['rel'] = 'nofollow'
+ return attrs
+
+
+def target_blank(attrs, new=False):
+ if attrs['href'].startswith('mailto:'):
+ return attrs
+ attrs['target'] = '_blank'
+ return attrs
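
A short usage sketch for the two bundled callbacks (assuming bleach 1.2.x); both skip mailto: links, per the guards above:

    from bleach import linkify
    from bleach.callbacks import nofollow, target_blank

    # Adds rel="nofollow" and target="_blank" to the generated link
    # (the serializer does not guarantee attribute order).
    print linkify(u'visit ex.mp', callbacks=[nofollow, target_blank])
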
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 677287e..4640012 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -6,6 +6,10 @@ from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer
+PROTOS = HTMLSanitizerMixin.acceptable_protocols
+PROTOS.remove('feed')
+
+
class BleachSanitizerMixin(HTMLSanitizerMixin):
"""Mixin to replace sanitize_token() and sanitize_css()."""
@@ -108,7 +112,7 @@ class BleachSanitizerMixin(HTMLSanitizerMixin):
# TODO: Make sure this does what it's meant to - I *think* it wants to
# validate style attribute contents.
parts = style.split(';')
- gauntlet = re.compile("""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
+ gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
"""|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
for part in parts:
if not gauntlet.match(part):
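
Two behavioral changes land in sanitizer.py: 'feed' is removed from the acceptable protocols, and the gauntlet regex now admits double-quoted CSS values. A sketch of the latter, assuming bleach 1.2.x (it mirrors the new test_css cases below):

    from bleach import clean

    # The double-quoted font name now survives the style sanitizer
    # instead of being stripped from the attribute.
    html = "<p style='font-family: \"Arial\"; color: red;'>x</p>"
    print clean(html, tags=['p'], attributes=['style'],
                styles=['font-family', 'color'])
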
diff --git a/bleach/tests/test_basics.py b/bleach/tests/test_basics.py
index 60be11d..9eca687 100644
--- a/bleach/tests/test_basics.py
+++ b/bleach/tests/test_basics.py
@@ -8,6 +8,10 @@ def test_empty():
eq_('', bleach.clean(''))
+def test_nbsp():
+ eq_(u'\xa0test string\xa0', bleach.clean('&nbsp;test string&nbsp;'))
+
+
def test_comments_only():
comment = '<!-- this is a comment -->'
open_comment = '<!-- this is an open comment'
@@ -91,7 +95,6 @@ def test_serializer():
def test_no_href_links():
s = u'<a name="anchor">x</a>'
eq_(s, bleach.linkify(s))
- eq_(s, bleach.linkify(s, nofollow=False))
def test_weird_strings():
diff --git a/bleach/tests/test_css.py b/bleach/tests/test_css.py
index fdb3f65..588c8ce 100644
--- a/bleach/tests/test_css.py
+++ b/bleach/tests/test_css.py
@@ -22,13 +22,21 @@ def test_allowed_css():
('cursor: -moz-grab;', 'cursor: -moz-grab;', ['cursor']),
('color: hsl(30,100%,50%);', 'color: hsl(30,100%,50%);', ['color']),
('color: rgba(255,0,0,0.4);', 'color: rgba(255,0,0,0.4);', ['color']),
- ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;", ['text-overflow']),
+ ("text-overflow: ',' ellipsis;", "text-overflow: ',' ellipsis;",
+ ['text-overflow']),
+ ('text-overflow: "," ellipsis;', 'text-overflow: "," ellipsis;',
+ ['text-overflow']),
+ ('font-family: "Arial";', 'font-family: "Arial";', ['font-family']),
)
- p = '<p style="%s">bar</p>'
+ p_single = '<p style="%s">bar</p>'
+ p_double = "<p style='%s'>bar</p>"
- def check(input, output, styles):
- eq_(p % output, clean(p % input, styles=styles))
+ def check(i, o, s):
+ if '"' in i:
+ eq_(p_double % o, clean(p_double % i, styles=s))
+ else:
+ eq_(p_single % o, clean(p_single % i, styles=s))
for i, o, s in tests:
yield check, i, o, s
diff --git a/bleach/tests/test_delinkify.py b/bleach/tests/test_delinkify.py
deleted file mode 100644
index f216d2f..0000000
--- a/bleach/tests/test_delinkify.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from nose.tools import eq_
-
-import bleach
-
-
-def test_delinkify():
- eq_('test', bleach.delinkify('<a href="http://ex.mp">test</a>'))
- eq_('footestbar',
- bleach.delinkify('foo<a href="http://ex.mp">test</a>bar'))
-
-
-def test_whitelist():
- html = '<a href="http://ex.mp">test</a>'
- eq_(html, bleach.delinkify(html, allow_domains=['ex.mp']))
- eq_('test', bleach.delinkify(html, allow_domains=['ex2.mp']))
- # Allow a single domain as a special case.
- eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
-
-
-def test_nested_a():
- html = '<a href="http://ex.mp">test<a href="http://foo.bar">test</a></a>'
- eq_('testtest', bleach.delinkify(html))
- eq_('<a href="http://ex.mp">test</a>test',
- bleach.delinkify(html, allow_domains=['ex.mp']))
-
-
-def test_nested_tag():
- html = '<a href="http://ex.mp">test<span>test</span></a>'
- eq_('test<span>test</span>', bleach.delinkify(html))
-
-
-def test_a_name():
- """Don't screw with non-link <a> tags."""
- html = '<a name="foo">bar</a>'
- eq_(html, bleach.delinkify(html))
-
-
-def test_relative():
- """Relative links are optionally OK."""
- html = 'some <a href="/foo/bar">link</a>'
- eq_('some link', bleach.delinkify(html))
- eq_(html, bleach.delinkify(html, allow_relative=True))
-
-
-def test_protocol_relative():
- """Protocol-relative links aren't relative."""
- html = 'bad <a href="//ex.mp">link</a>'
- expect = 'bad link'
- eq_(expect, bleach.delinkify(html))
- eq_(expect, bleach.delinkify(html, allow_relative=True))
- eq_(html, bleach.delinkify(html, allow_domains='ex.mp'))
-
-
-def test_domain_match():
- tests = (
- ('ex.mp', 'ex.mp', True),
- ('ex.mp', '*.ex.mp', True),
- ('test.ex.mp', '*.ex.mp', True),
- ('test.ex.mp', 'ex.mp', False),
- ('test.test.ex.mp', '*.ex.mp', False),
- ('test.test.ex.mp', '**.ex.mp', True),
- ('wrong.mp', 'ex.mp', False),
- ('wrong.mp', '*.ex.mp', False),
- ('really.wrong.mp', 'ex.mp', False),
- ('really.wrong.mp', '*.ex.mp', False),
- ('really.very.wrong.mp', '*.ex.mp', False),
- ('EX.mp', 'ex.mp', True), # Domains are case-insensitive.
- ('ex.mp', 'an.ex.mp', False),
- ('ex.mp', '*.an.ex.mp', False),
- ('an.ex.am.pl', 'an.*.am.pl', True),
- ('a.ex.am.pl', 'an.*.am.pl', False),
- ('ex.am.pl', 'an.*.am.pl', False),
- )
-
- def _check(t, c, v):
- eq_(v, bleach._domain_match(t, c))
-
- for t, c, v in tests:
- yield _check, t, c, v
-
-
-def test_double_star():
- assert bleach._domain_match('ex.mp', '**.ex.mp')
- try:
- bleach._domain_match('ex.mp', 'an.**.ex.mp')
- except bleach.ValidationError:
- pass
- else:
- assert False, '_domain_match should not accept an.**.ex.mp'
-
-
-def test_allow_subdomains():
- domains = ('ex.mp', '*.exa.mp', 'an.exam.pl', '*.my.examp.le')
- html = (
- ('<a href="http://an.ex.mp">bad</a>', 'bad'),
- ('<a href="http://exa.mp">good</a>', None),
- ('<a href="http://an.exa.mp">good</a>', None),
- ('<a href="http://an.exam.pl">good</a>', None),
- ('<a href="http://another.exam.pl">bad</a>', 'bad'),
- ('<a href="http://a.bad.examp.le">bad</a>', 'bad'),
- ('<a href="http://a.very.bad.examp.le">bad</a>', 'bad'),
- )
-
- def _check(html, text):
- output = bleach.delinkify(html, allow_domains=domains)
- eq_(html if text is None else text, output)
-
- for t, o in html:
- yield _check, t, o
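
delinkify() and its domain-matching helpers are deleted outright rather than ported. The callbacks API covers the common case: returning None from a callback applied to an existing link strips the <a> tag and keeps its contents (test_prevent_links below exercises this). A hedged sketch of a whitelist-style replacement; `allow_ex_mp` is a hypothetical callback:

    import urlparse

    from bleach import linkify

    def allow_ex_mp(attrs, new=False):
        # Returning None removes the link, leaving its text behind.
        host = urlparse.urlparse(attrs['href']).hostname
        if host != 'ex.mp':
            return None
        return attrs

    # Expected output: u'gone'
    print linkify(u'<a href="http://evil.test/x">gone</a>',
                  callbacks=[allow_ex_mp])
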
diff --git a/bleach/tests/test_links.py b/bleach/tests/test_links.py
index 7caf006..ac593c4 100644
--- a/bleach/tests/test_links.py
+++ b/bleach/tests/test_links.py
@@ -3,11 +3,9 @@ import urllib
from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_
-from bleach import linkify, url_re
+from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
-def filter_url(url):
- return u'http://bouncer/?u=%s' % urllib.quote_plus(url)
def test_url_re():
@@ -45,38 +43,139 @@ def test_trailing_slash():
def test_mangle_link():
+ """We can muck with the href attribute of the link."""
+ def filter_url(attrs, new=False):
+ attrs['href'] = (u'http://bouncer/?u=%s' %
+ urllib.quote_plus(attrs['href']))
+ return attrs
+
eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
'http://example.com</a>',
- linkify('http://example.com', filter_url=filter_url))
+ linkify('http://example.com', DC + [filter_url]))
+
+
+def test_mangle_text():
+ """We can muck with the inner text of a link."""
+
+ def ft(attrs, new=False):
+ attrs['_text'] = 'bar'
+ return attrs
+
+ eq_('<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>',
+ linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', [ft]))
def test_email_link():
- eq_('a james@example.com mailto',
- linkify('a james@example.com mailto'))
- eq_('a james@example.com.au mailto',
- linkify('a james@example.com.au mailto'))
- eq_('a <a href="mailto:james@example.com" rel="nofollow">'
- 'james@example.com</a> mailto',
- linkify('a james@example.com mailto', parse_email=True))
- eq_('aussie <a href="mailto:james@example.com.au" rel="nofollow">'
- 'james@example.com.au</a> mailto',
- linkify('aussie james@example.com.au mailto', parse_email=True))
- eq_('email to <a href="james@example.com" rel="nofollow">'
- 'james@example.com</a>',
- linkify('email to <a href="james@example.com">'
- 'james@example.com</a>', parse_email=True))
+ tests = (
+ ('a james@example.com mailto', False, 'a james@example.com mailto'),
+ ('a james@example.com.au mailto', False,
+ 'a james@example.com.au mailto'),
+ ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
+ True, 'a james@example.com mailto'),
+ ('aussie <a href="mailto:james@example.com.au">'
+ 'james@example.com.au</a> mailto', True,
+ 'aussie james@example.com.au mailto'),
+ # This is kind of a pathological case. I guess we do our best here.
+ ('email to <a href="james@example.com" rel="nofollow">'
+ 'james@example.com</a>', True,
+ 'email to <a href="james@example.com">james@example.com</a>'),
+ )
+
+ def _check(o, p, i):
+ eq_(o, linkify(i, parse_email=p))
+
+ for (o, p, i) in tests:
+ yield _check, o, p, i
def test_email_link_escaping():
- eq_('''<a href='mailto:"james"@example.com' rel="nofollow">'''
- '''"james"@example.com</a>''',
- linkify('"james"@example.com', parse_email=True))
- eq_('''<a href="mailto:&quot;j'ames&quot;@example.com" rel="nofollow">'''
- '''"j'ames"@example.com</a>''',
- linkify('"j\'ames"@example.com', parse_email=True))
- eq_('''<a href='mailto:"ja>mes"@example.com' rel="nofollow">'''
- '''"ja&gt;mes"@example.com</a>''',
- linkify('"ja>mes"@example.com', parse_email=True))
+ tests = (
+ ('''<a href='mailto:"james"@example.com'>'''
+ '''"james"@example.com</a>''',
+ '"james"@example.com'),
+ ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
+ '''"j'ames"@example.com</a>''',
+ '"j\'ames"@example.com'),
+ ('''<a href='mailto:"ja>mes"@example.com'>'''
+ '''"ja&gt;mes"@example.com</a>''',
+ '"ja>mes"@example.com'),
+ )
+
+ def _check(o, i):
+ eq_(o, linkify(i, parse_email=True))
+
+ for (o, i) in tests:
+ yield _check, o, i
+
+
+def test_prevent_links():
+ """Returning None from any callback should remove links or prevent them
+ from being created."""
+
+ def no_new_links(attrs, new=False):
+ if new:
+ return None
+ return attrs
+
+ def no_old_links(attrs, new=False):
+ if not new:
+ return None
+ return attrs
+
+ def noop(attrs, new=False):
+ return attrs
+
+ in_text = 'a ex.mp <a href="http://example.com">example</a>'
+ out_text = 'a <a href="http://ex.mp">ex.mp</a> example'
+ tests = (
+ ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
+ '<a href="http://example.com">example</a>'), 'noop'),
+ ([no_new_links, noop], in_text, 'no new, noop'),
+ ([noop, no_new_links], in_text, 'noop, no new'),
+ ([no_old_links, noop], out_text, 'no old, noop'),
+ ([noop, no_old_links], out_text, 'noop, no old'),
+ ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
+ )
+
+ def _check(cb, o, msg):
+ eq_(o, linkify(in_text, cb), msg)
+
+ for (cb, o, msg) in tests:
+ yield _check, cb, o, msg
+
+
+def test_set_attrs():
+ """We can set random attributes on links."""
+
+ def set_attr(attrs, new=False):
+ attrs['rev'] = 'canonical'
+ return attrs
+
+ eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
+ linkify('ex.mp', [set_attr]))
+
+
+def test_only_proto_links():
+ """Only create links if there's a protocol."""
+ def only_proto(attrs, new=False):
+ if new and not attrs['_text'].startswith(('http:', 'https:')):
+ return None
+ return attrs
+
+ in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
+ out_text = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
+ '<a href="/foo">bar</a>')
+ eq_(out_text, linkify(in_text, [only_proto]))
+
+
+def test_stop_email():
+ """Returning None should prevent a link from being created."""
+ def no_email(attrs, new=False):
+ if attrs['href'].startswith('mailto:'):
+ return None
+ return attrs
+ text = 'do not link james@example.com'
+ eq_(text, linkify(text, parse_email=True, callbacks=[no_email]))
def test_tlds():
@@ -98,7 +197,7 @@ def test_escaping():
def test_nofollow_off():
eq_('<a href="http://example.com">example.com</a>',
- linkify(u'example.com', nofollow=False))
+ linkify(u'example.com', []))
def test_link_in_html():
@@ -297,16 +396,45 @@ def test_ports():
yield check, test, output
-def test_target():
- eq_('<a href="http://example.com" rel="nofollow" '
- 'target="_blank">example.com</a>',
- linkify(u'example.com', target='_blank'))
- eq_('<a href="http://example.com" target="_blank">example.com</a>',
- linkify(u'example.com', target='_blank', nofollow=False))
-
-
def test_tokenizer():
"""Linkify doesn't always have to sanitize."""
raw = '<em>test<x></x></em>'
eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))
+
+
+def test_ignore_bad_protocols():
+ eq_('foohttp://bar',
+ linkify('foohttp://bar'))
+ eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
+ linkify('foohttp://exampl.com'))
+
+
+def test_max_recursion_depth():
+ """If we hit the max recursion depth, just return the string."""
+ test = '<em>' * 2000 + 'foo' + '</em>' * 2000
+ eq_(test, linkify(test))
+
+
+def test_link_emails_and_urls():
+ """parse_email=True shouldn't prevent URLs from getting linkified."""
+ output = ('<a href="http://example.com" rel="nofollow">'
+ 'http://example.com</a> <a href="mailto:person@example.com">'
+ 'person@example.com</a>')
+ eq_(output, linkify('http://example.com person@example.com',
+ parse_email=True))
+
+
+def test_links_case_insensitive():
+ """Protocols and domain names are case insensitive."""
+ expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
+ 'HTTP://EXAMPLE.COM</a>')
+ eq_(expect, linkify('HTTP://EXAMPLE.COM'))
+
+
+def test_elements_inside_links():
+ eq_(u'<a href="#" rel="nofollow">hello<br></a>',
+ linkify('<a href="#">hello<br></a>'))
+
+ eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
+ linkify('<a href="#"><strong>bold</strong> hello<br></a>'))
diff --git a/bleach/tests/test_security.py b/bleach/tests/test_security.py
index 9e9bb7b..6c2b33f 100644
--- a/bleach/tests/test_security.py
+++ b/bleach/tests/test_security.py
@@ -106,3 +106,7 @@ def test_poster_attribute():
eq_(expect, clean(test, tags=tags, attributes=attrs))
ok = '<video poster="/foo.png"></video>'
eq_(ok, clean(ok, tags=tags, attributes=attrs))
+
+
+def test_feed_protocol():
+ eq_('<a>foo</a>', clean('<a href="feed:file:///tmp/foo">foo</a>'))
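
This test pins down the sanitizer change: with 'feed' gone from the protocol whitelist, clean() drops the entire href while keeping the element and its text. The same check as a standalone sketch, assuming bleach 1.2.x:

    from bleach import clean

    # feed: is no longer an acceptable protocol, so the href is
    # stripped; the tag and its text remain.
    print clean('<a href="feed:file:///tmp/foo">foo</a>')
    # -> '<a>foo</a>'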