"""Tests for bleach.linkify.

NOTE(review): this file had been flattened and its expected-output strings
lost their HTML markup (tags stripped, entities decoded). The assertions
below restore the `<a ...>` markup implied by the surviving link text and
by linkify's documented behavior (rel="nofollow" by default, callbacks,
parse_email, skip_pre, tokenizer). Confirm against a passing checkout.
"""
import urllib

from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_

from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC


def test_url_re():
    # url_re must not treat "...it" (ellipsis + a TLD-looking word) as a URL.
    def no_match(s):
        match = url_re.search(s)
        if match:
            assert not match, 'matched %s' % s[slice(*match.span())]
    yield no_match, 'just what i am looking for...it'


def test_empty():
    eq_('', linkify(''))


def test_simple_link():
    eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
        '</a> link',
        linkify('a http://example.com link'))
    eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
        '</a> link',
        linkify('a https://example.com link'))
    eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
        linkify('an example.com link'))


def test_trailing_slash():
    eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
        linkify('http://example.com/'))
    eq_('<a href="http://example.com/foo/" rel="nofollow">'
        'http://example.com/foo/</a>',
        linkify('http://example.com/foo/'))
    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
        'http://example.com/foo/bar/</a>',
        linkify('http://example.com/foo/bar/'))


def test_mangle_link():
    """We can muck with the href attribute of the link."""
    def filter_url(attrs, new=False):
        attrs['href'] = (u'http://bouncer/?u=%s' %
                         urllib.quote_plus(attrs['href']))
        return attrs

    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
        'http://example.com</a>',
        linkify('http://example.com', DC + [filter_url]))


def test_mangle_text():
    """We can muck with the inner text of a link."""

    def ft(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    eq_('<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>',
        linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', [ft]))


def test_email_link():
    tests = (
        ('a james@example.com mailto', False, 'a james@example.com mailto'),
        ('a james@example.com.au mailto', False,
         'a james@example.com.au mailto'),
        ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
         True, 'a james@example.com mailto'),
        ('aussie <a href="mailto:james@example.com.au">'
         'james@example.com.au</a> mailto', True,
         'aussie james@example.com.au mailto'),
        # This is kind of a pathological case. I guess we do our best here.
        ('email to <a href="james@example.com" rel="nofollow">'
         'james@example.com</a>', True,
         'email to <a href="james@example.com">james@example.com</a>'),
    )

    def _check(o, p, i):
        eq_(o, linkify(i, parse_email=p))

    for (o, p, i) in tests:
        yield _check, o, p, i


def test_email_link_escaping():
    tests = (
        ('''<a href='mailto:"james"@example.com'>'''
         '''"james"@example.com</a>''', '"james"@example.com'),
        ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
         '''"j'ames"@example.com</a>''', '"j\'ames"@example.com'),
        ('''<a href='mailto:"ja>mes"@example.com'>'''
         '''"ja&gt;mes"@example.com</a>''', '"ja>mes"@example.com'),
    )

    def _check(o, i):
        eq_(o, linkify(i, parse_email=True))

    for (o, i) in tests:
        yield _check, o, i


def test_prevent_links():
    """Returning None from any callback should remove links or prevent them
    from being created."""

    def no_new_links(attrs, new=False):
        if new:
            return None
        return attrs

    def no_old_links(attrs, new=False):
        if not new:
            return None
        return attrs

    def noop(attrs, new=False):
        return attrs

    in_text = 'a ex.mp <a href="http://example.com">example</a>'
    out_text = 'a <a href="http://ex.mp">ex.mp</a> example'
    tests = (
        # No callback: new links considered, existing links preserved.
        ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
                  '<a href="http://example.com">example</a>'), 'noop'),
        # Prevent new links from being created.
        ([no_new_links, noop], in_text, 'no new, noop'),
        ([noop, no_new_links], in_text, 'noop, no new'),
        # Remove existing links.
        ([no_old_links, noop], out_text, 'no old, noop'),
        ([noop, no_old_links], out_text, 'noop, no old'),
        # Remove everything.
        ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
    )

    def _check(cb, o, msg):
        eq_(o, linkify(in_text, cb), msg)

    for (cb, o, msg) in tests:
        yield _check, cb, o, msg


def test_set_attrs():
    """We can set random attributes on links."""

    def set_attr(attrs, new=False):
        attrs['rev'] = 'canonical'
        return attrs

    eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
        linkify('ex.mp', [set_attr]))


def test_only_proto_links():
    """Only create links if there's a protocol."""

    def only_proto(attrs, new=False):
        if new and not attrs['_text'].startswith(('http:', 'https:')):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    out_text = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
                '<a href="/foo">bar</a>')
    eq_(out_text, linkify(in_text, [only_proto]))


def test_stop_email():
    """Returning None should prevent a link from being created."""

    def no_email(attrs, new=False):
        if attrs['href'].startswith('mailto:'):
            return None
        return attrs
    text = 'do not link james@example.com'
    eq_(text, linkify(text, parse_email=True, callbacks=[no_email]))


def test_tlds():
    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
        linkify('example.com'))
    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
        linkify('example.co.uk'))
    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
        linkify('example.edu'))
    # .xxx is not in the recognized TLD list, so no link is created.
    eq_('example.xxx', linkify('example.xxx'))
    eq_(' brie', linkify(' brie'))
    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
        linkify('bit.ly/fun'))


def test_escaping():
    eq_('&lt; unrelated', linkify('&lt; unrelated'))


def test_nofollow_off():
    # An empty callback list means no nofollow callback either.
    eq_('<a href="http://example.com">example.com</a>',
        linkify(u'example.com', []))


def test_link_in_html():
    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
        linkify('<i>http://yy.com</i>'))
    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
        '</strong></em>',
        linkify('<em><strong>http://xx.com</strong></em>'))


def test_links_https():
    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
        linkify('https://yy.com'))


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
        linkify('<a href="http://yy.com">http://yy.com</a>'))


def test_url_with_path():
    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>',
        linkify('http://example.com/path/to/file'))


def test_link_ftp():
    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>',
        linkify('ftp://ftp.mozilla.org/some/file'))


def test_link_query():
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'http://xx.com/?test=win</a>',
        linkify('http://xx.com/?test=win'))
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'xx.com/?test=win</a>',
        linkify('xx.com/?test=win'))
    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
        'xx.com?test=win</a>',
        linkify('xx.com?test=win'))


def test_link_fragment():
    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
        'http://xx.com/path#frag</a>',
        linkify('http://xx.com/path#frag'))


def test_link_entities():
    # The & in the query string is escaped in the output.
    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
        'http://xx.com/?a=1&amp;b=2</a>',
        linkify('http://xx.com/?a=1&b=2'))


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = '&lt;em&gt;strong&lt;/em&gt;'
    eq_(s, linkify(s))


def test_link_http_complete():
    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f"'
        ' rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    eq_(s, linkify(s))


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    eq_(s, linkify(s))


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    eq_('All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">'
        'xx.yy.com/grover.png</a>"}base are',
        linkify('All your{"xx.yy.com/grover.png"}base are'))


def test_skip_pre():
    """Skip linkification in <pre> tags."""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
                  '</a></pre>')
    eq_(linked, linkify(simple, skip_pre=True))
    eq_(all_linked, linkify(simple))

    # Existing links inside <pre> still get the nofollow callback applied,
    # with or without skip_pre.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    eq_(nofollowed, linkify(already_linked))
    eq_(nofollowed, linkify(already_linked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    eq_('libgl.so.1', linkify('libgl.so.1'))


def test_end_of_sentence():
    """example.com. should match."""
    out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
    in_ = u'%s%s'

    def check(u, p):
        eq_(out % (u, u, p), linkify(in_ % (u, p)))

    tests = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for u, p in tests:
        yield check, u, p


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
        linkify('ex.com/foo, bar'))


def test_sarcasm():
    """Jokes should crash.<sarcasm/>"""
    dirty = u'Yeah right <sarcasm/>'
    clean = u'Yeah right &lt;sarcasm/&gt;'
    eq_(clean, linkify(dirty))


def test_wrapping_parentheses():
    """URLs wrapped in parantheses should not include them."""
    out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
    tests = (
        ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)', out % ('(', 'example.com/foo',
                                     'example.com/foo', ')')),
        ('(((example.com/))))', out % ('(((', 'example.com/)',
                                       'example.com/)', ')))')),
        ('example.com/))', out % ('', 'example.com/))',
                                  'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
                'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
                'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
                'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        eq_(expected_output, linkify(test))

    for test, expected_output in tests:
        yield check, test, expected_output


def test_ports():
    """URLs can contain port numbers."""
    tests = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        # A non-numeric "port" ends the URL.
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
            linkify(test))

    for test, output in tests:
        yield check, test, output


def test_tokenizer():
    """Linkify doesn't always have to sanitize."""
    raw = '<em>test<x></x></em>'
    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))


def test_ignore_bad_protocols():
    eq_('foohttp://bar',
        linkify('foohttp://bar'))
    eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
        linkify('foohttp://exampl.com'))


def test_max_recursion_depth():
    """If we hit the max recursion depth, just return the string."""
    test = '<em>' * 2000 + 'foo' + '</em>' * 2000
    eq_(test, linkify(test))


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    output = ('<a href="http://example.com" rel="nofollow">'
              'http://example.com</a> <a href="mailto:person@example.com">'
              'person@example.com</a>')
    eq_(output, linkify('http://example.com person@example.com',
                        parse_email=True))


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
              'HTTP://EXAMPLE.COM</a>')
    eq_(expect, linkify('HTTP://EXAMPLE.COM'))


def test_elements_inside_links():
    eq_(u'<a href="#" rel="nofollow">hello<br></a>',
        linkify('<a href="#">hello<br></a>'))

    eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
        linkify('<a href="#"><strong>bold</strong> hello<br></a>'))