try:
from urllib.parse import quote_plus
except ImportError:
from urllib import quote_plus
from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_
from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
def test_url_re():
    """Generator test: strings that url_re must NOT match."""
    def no_match(text):
        hit = url_re.search(text)
        if hit:
            start, end = hit.span()
            assert not hit, 'matched {0!s}'.format(text[start:end])

    yield no_match, 'just what i am looking for...it'
def test_empty():
    """Linkifying the empty string yields the empty string."""
    result = linkify('')
    eq_('', result)
def test_simple_link():
    """Bare URLs and bare domains become <a> links with rel="nofollow".

    NOTE(review): the expected-output literals had their HTML markup
    stripped by a tag-eating pass; restored from bleach's linkify output.
    """
    eq_('a <a href="http://example.com" rel="nofollow">'
        'http://example.com</a> link',
        linkify('a http://example.com link'))
    eq_('a <a href="https://example.com" rel="nofollow">'
        'https://example.com</a> link',
        linkify('a https://example.com link'))
    eq_('a <a href="http://example.com" rel="nofollow">example.com</a> link',
        linkify('a example.com link'))
def test_trailing_slash():
    """Trailing slashes stay part of the linkified URL.

    NOTE(review): expected `<a>` markup restored; the literals had been
    stripped of HTML.
    """
    eq_('<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>',
        linkify('http://examp.com/'))
    eq_('<a href="http://example.com/foo/" rel="nofollow">'
        'http://example.com/foo/</a>',
        linkify('http://example.com/foo/'))
    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
        'http://example.com/foo/bar/</a>',
        linkify('http://example.com/foo/bar/'))
def test_mangle_link():
    """We can muck with the href attribute of the link.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    def filter_url(attrs, new=False):
        # Rewrite every href to go through a bouncer/redirector URL.
        quoted = quote_plus(attrs['href'])
        attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted)
        return attrs

    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
        'http://example.com</a>',
        linkify('http://example.com', DC + [filter_url]))
def test_mangle_text():
    """We can muck with the inner text of a link.

    NOTE(review): both input and expected literals lost their `<a>` tags
    to a stripping pass; restored from bleach's test suite.
    """
    def ft(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    eq_('<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>',
        linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', [ft]))
def test_email_link():
    """parse_email=True turns email addresses into mailto: links.

    NOTE(review): the expected literals were corrupted (HTML stripped,
    one string left unterminated); reconstructed from bleach's test
    suite — verify against upstream.
    """
    tests = (
        ('a james@example.com mailto', False, 'a james@example.com mailto'),
        ('a james@example.com.au mailto', False,
         'a james@example.com.au mailto'),
        ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
         True, 'a james@example.com mailto'),
        ('aussie <a href="mailto:james@example.com.au">'
         'james@example.com.au</a> mailto', True,
         'aussie james@example.com.au mailto'),
        # This is kind of a pathological case. I guess we do our best here.
        ('email to <a href="james@example.com" rel="nofollow">'
         'james@example.com</a>',
         True,
         'email to <a href="james@example.com">james@example.com</a>'),
        ('<br><a href="mailto:jinkyun@example.com">'
         'jinkyun@example.com</a>',
         True,
         '<br>jinkyun@example.com'),
    )

    def _check(o, p, i):
        eq_(o, linkify(i, parse_email=p))

    for (o, p, i) in tests:
        yield _check, o, p, i
def test_email_link_escaping():
    """Quotes and special chars in email local-parts are escaped properly.

    NOTE(review): the expected literals were corrupted (the `<a>` markup
    was stripped and entities decoded); reconstructed from bleach's test
    suite — verify against upstream.
    """
    tests = (
        ('''<a href='mailto:"james"@example.com'>'''
         '''"james"@example.com</a>''',
         '"james"@example.com'),
        ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
         '''"j'ames"@example.com</a>''',
         '"j\'ames"@example.com'),
        ('''<a href='mailto:"ja>mes"@example.com'>'''
         '''"ja&gt;mes"@example.com</a>''',
         '"ja>mes"@example.com'),
    )

    def _check(o, i):
        eq_(o, linkify(i, parse_email=True))

    for (o, i) in tests:
        yield _check, o, i
def test_prevent_links():
    """Returning None from any callback should remove links or prevent them
    from being created.

    NOTE(review): in_text/out_text and the expected strings lost their
    `<a>` markup to a stripping pass; restored from bleach's test suite.
    """
    def no_new_links(attrs, new=False):
        if new:
            return None
        return attrs

    def no_old_links(attrs, new=False):
        if not new:
            return None
        return attrs

    def noop(attrs, new=False):
        return attrs

    in_text = 'a ex.mp <a href="http://example.com">example</a>'
    out_text = 'a <a href="http://ex.mp">ex.mp</a> example'
    tests = (
        ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
                  '<a href="http://example.com">example</a>'), 'noop'),
        ([no_new_links, noop], in_text, 'no new, noop'),
        ([noop, no_new_links], in_text, 'noop, no new'),
        ([no_old_links, noop], out_text, 'no old, noop'),
        ([noop, no_old_links], out_text, 'noop, no old'),
        ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
    )

    def _check(cb, o, msg):
        eq_(o, linkify(in_text, cb), msg)

    for (cb, o, msg) in tests:
        yield _check, cb, o, msg
def test_set_attrs():
    """We can set random attributes on links.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    def set_attr(attrs, new=False):
        attrs['rev'] = 'canonical'
        return attrs

    eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
        linkify('ex.mp', [set_attr]))
def test_only_proto_links():
    """Only create links if there's a protocol.

    NOTE(review): in_text/out_text lost their `<a>` markup to a
    stripping pass; restored from bleach's test suite.
    """
    def only_proto(attrs, new=False):
        # Reject new links whose visible text lacks an http(s) scheme.
        if new and not attrs['_text'].startswith(('http:', 'https:')):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    out_text = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
                '<a href="/foo">bar</a>')
    eq_(out_text, linkify(in_text, [only_proto]))
def test_stop_email():
    """Returning None should prevent a link from being created."""
    def no_email(attrs, new=False):
        # Drop any link whose target is a mailto: URL.
        return None if attrs['href'].startswith('mailto:') else attrs

    text = 'do not link james@example.com'
    eq_(text, linkify(text, parse_email=True, callbacks=[no_email]))
def test_tlds():
    """Known TLDs linkify; unknown ones (example.yyy) do not.

    NOTE(review): expected `<a>` markup restored; the literals had been
    stripped of HTML. The ' brie' case carries its original (markup-free)
    text unchanged.
    """
    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
        linkify('example.com'))
    eq_('<a href="http://example.co" rel="nofollow">example.co</a>',
        linkify('example.co'))
    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
        linkify('example.co.uk'))
    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
        linkify('example.edu'))
    eq_('<a href="http://example.xxx" rel="nofollow">example.xxx</a>',
        linkify('example.xxx'))
    eq_('example.yyy', linkify('example.yyy'))
    eq_(' brie', linkify(' brie'))
    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
        linkify('bit.ly/fun'))
def test_escaping():
    """A lone '<' entity passes through unchanged.

    NOTE(review): the literals had their entities decoded by a corrupting
    pass; as written the test could not pass (linkify escapes a raw '<').
    Restored the original '&lt;' form.
    """
    eq_('&lt; unrelated', linkify('&lt; unrelated'))
def test_nofollow_off():
    """With an empty callback list, no rel="nofollow" is added.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('<a href="http://example.com">example.com</a>',
        linkify('example.com', []))
def test_link_in_html():
    """URLs inside other HTML elements are still linkified.

    NOTE(review): surrounding `<i>`/`<em>`/`<strong>` markup restored;
    the literals had been stripped of HTML.
    """
    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
        linkify('<i>http://yy.com</i>'))
    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
        '</strong></em>',
        linkify('<em><strong>http://xx.com</strong></em>'))
def test_links_https():
    """https URLs are linkified like http ones.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
        linkify('https://yy.com'))
def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link.

    NOTE(review): both input and expected literals lost their `<a>` tags
    to a stripping pass; restored from bleach's test suite.
    """
    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
        linkify('<a href="http://yy.com">http://yy.com</a>'))
def test_url_with_path():
    """URL paths are included in the link.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>',
        linkify('http://example.com/path/to/file'))
def test_link_ftp():
    """ftp:// URLs are linkified.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>',
        linkify('ftp://ftp.mozilla.org/some/file'))
def test_link_query():
    """Query strings are kept in the link, with or without a scheme/slash.

    NOTE(review): expected `<a>` markup restored; the literals had been
    stripped of HTML.
    """
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'http://xx.com/?test=win</a>',
        linkify('http://xx.com/?test=win'))
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'xx.com/?test=win</a>',
        linkify('xx.com/?test=win'))
    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
        'xx.com?test=win</a>',
        linkify('xx.com?test=win'))
def test_link_fragment():
    """Fragments (#frag) are kept in the link.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
        'http://xx.com/path#frag</a>',
        linkify('http://xx.com/path#frag'))
def test_link_entities():
    """Ampersands in query strings are entity-escaped in the output.

    NOTE(review): the expected literal had both its `<a>` markup stripped
    and its '&amp;' entities decoded; restored from bleach's test suite.
    """
    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
        'http://xx.com/?a=1&amp;b=2</a>',
        linkify('http://xx.com/?a=1&b=2'))
def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped.

    NOTE(review): the literal had its entities decoded by a corrupting
    pass; as written the round-trip could not hold. Restored the
    '&lt;em&gt;' form.
    """
    s = '&lt;em&gt;strong&lt;/em&gt;'
    eq_(s, linkify(s))
def test_link_http_complete():
    """A URL with auth, port-less host, path, query and fragment links whole.

    NOTE(review): expected `<a>` markup and '&amp;' entities restored;
    the literal had been stripped/decoded.
    """
    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f"'
        ' rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))
def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    text = 'document.vulnerable'
    eq_(text, linkify(text))
def test_javascript_url():
    """javascript: urls should never be linkified."""
    text = 'javascript:document.vulnerable'
    eq_(text, linkify(text))
def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.

    NOTE(review): expected `<a>` markup restored; the literal had been
    stripped of HTML.
    """
    eq_('All your{"<a href="http://xx.yy.com/grover.png" '
        'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
        linkify('All your{"xx.yy.com/grover.png"}base are'))
# NOTE(review): the functions below were recovered from a region where the
# source had been collapsed onto a few physical lines and every HTML tag
# stripped from the string literals (leaving invalid, unterminated strings).
# Code and expected markup reconstructed from bleach's test suite — verify
# each expected string against upstream before relying on these tests.


def test_skip_pre():
    """Skip linkification in <pre> tags when skip_pre=True."""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
                  '</a></pre>')
    eq_(linked, linkify(simple, skip_pre=True))
    eq_(all_linked, linkify(simple))

    # Existing links inside <pre> still get rel="nofollow" either way.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    eq_(nofollowed, linkify(already_linked))
    eq_(nofollowed, linkify(already_linked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    eq_('libgl.so.1', linkify('libgl.so.1'))


def test_end_of_sentence():
    """example.com. should match (trailing punctuation excluded)."""
    out = '<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}'
    intxt = '{0!s}{1!s}'

    def check(u, p):
        eq_(out.format(u, p), linkify(intxt.format(u, p)))

    tests = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for u, p in tests:
        yield check, u, p


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
        linkify('ex.com/foo, bar'))


def test_sarcasm():
    """Jokes should crash.<sarcasm/>"""
    dirty = 'Yeah right <sarcasm/>'
    clean = 'Yeah right &lt;sarcasm/&gt;'
    eq_(clean, linkify(dirty))


def test_wrapping_parentheses():
    """URLs wrapped in parantheses should not include them."""
    out = '{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}'
    tests = (
        ('(example.com)', ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)', ('(', 'example.com/foo',
                               'example.com/foo', ')')),
        ('(((example.com/))))', ('(((', 'example.com/)',
                                 'example.com/)', ')))')),
        ('example.com/))', ('', 'example.com/))', 'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         ('', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         ('((', 'en.wikipedia.org/wiki/Test_(assessment',
          'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
          'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
          'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        eq_(out.format(*expected_output), linkify(test))

    for test, expected_output in tests:
        yield check, test, expected_output


def test_parentheses_with_removing():
    """A removing callback leaves parenthesized text untouched."""
    expect = '(test.py)'
    eq_(expect, linkify(expect, callbacks=[lambda *a: None]))


def test_ports():
    """URLs can contain port numbers."""
    tests = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        out = '<a href="{0}" rel="nofollow">{0}</a>{1}'
        eq_(out.format(*output), linkify(test))

    for test, output in tests:
        yield check, test, output


def test_tokenizer():
    """Linkify doesn't always have to sanitize."""
    raw = '<em>test<x></x></em>'
    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))


def test_ignore_bad_protocols():
    """Text glued to a scheme is not treated as a URL."""
    eq_('foohttp://bar', linkify('foohttp://bar'))
    eq_('fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
        linkify('fohttp://exampl.com'))


def test_max_recursion_depth():
    """If we hit the max recursion depth, just return the string."""
    test = '<em>' * 2000 + 'foo' + '</em>' * 2000
    eq_(test, linkify(test))


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    output = ('<a href="http://example.com" rel="nofollow">'
              'http://example.com</a> <a href="mailto:person@example.com">'
              'person@example.com</a>')
    eq_(output, linkify('http://example.com person@example.com',
                        parse_email=True))


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
              'HTTP://EXAMPLE.COM</a>')
    eq_(expect, linkify('HTTP://EXAMPLE.COM'))


def test_elements_inside_links():
    """Child elements of an existing <a> are preserved."""
    eq_('<a href="#" rel="nofollow">hello<br></a>',
        linkify('<a href="#">hello<br></a>'))

    eq_('<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
        linkify('<a href="#"><strong>bold</strong> hello<br></a>'))


def test_remove_first_childlink():
    """A removing callback drops a link that is an element's first child."""
    expect = '<p>something</p>'
    callbacks = [lambda *a: None]
    eq_(expect,
        linkify('<p><a href="/foo">something</a></p>', callbacks=callbacks))