"""Tests for ``bleach.linkify``.

nose-style tests: plain functions use ``eq_``/``in_`` assertions; functions
that ``yield`` are nose test generators (each yielded tuple is one test case).

``in_(expected_tuple, actual)`` asserts that ``actual`` is one of the strings
in ``expected_tuple`` — needed because html5lib may serialize attributes in
either order, so each expectation is given twice: once with ``href`` first
and once with ``rel`` first.

NOTE(review): this file was recovered from a copy whose HTML markup had been
stripped out of the string literals; the ``<a ...>`` expectations below were
reconstructed from linkify's output convention — confirm against upstream.
"""
try:
    # Python 3
    from urllib.parse import quote_plus
except ImportError:
    # Python 2
    from urllib import quote_plus

from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_

from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
from bleach.tests.tools import in_


def test_url_re():
    """The URL regex must not match non-URL text like ``for...it``."""
    def no_match(s):
        match = url_re.search(s)
        if match:
            assert not match, 'matched {0!s}'.format(s[slice(*match.span())])

    yield no_match, 'just what i am looking for...it'


def test_empty():
    eq_('', linkify(''))


def test_simple_link():
    in_(('a <a href="http://example.com" rel="nofollow">http://example.com'
         '</a> link',
         'a <a rel="nofollow" href="http://example.com">http://example.com'
         '</a> link'),
        linkify('a http://example.com link'))
    in_(('a <a href="https://example.com" rel="nofollow">https://example.com'
         '</a> link',
         'a <a rel="nofollow" href="https://example.com">https://example.com'
         '</a> link'),
        linkify('a https://example.com link'))
    # Bare domains get an implicit http:// in the href.
    in_(('a <a href="http://example.com" rel="nofollow">example.com</a> link',
         'a <a rel="nofollow" href="http://example.com">example.com</a> link'),
        linkify('a example.com link'))


def test_trailing_slash():
    in_(('<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>',
         '<a rel="nofollow" href="http://examp.com/">http://examp.com/</a>'),
        linkify('http://examp.com/'))
    in_(('<a href="http://example.com/foo/" rel="nofollow">'
         'http://example.com/foo/</a>',
         '<a rel="nofollow" href="http://example.com/foo/">'
         'http://example.com/foo/</a>'),
        linkify('http://example.com/foo/'))
    in_(('<a href="http://example.com/foo/bar/" rel="nofollow">'
         'http://example.com/foo/bar/</a>',
         '<a rel="nofollow" href="http://example.com/foo/bar/">'
         'http://example.com/foo/bar/</a>'),
        linkify('http://example.com/foo/bar/'))


def test_mangle_link():
    """We can muck with the href attribute of the link."""
    def filter_url(attrs, new=False):
        quoted = quote_plus(attrs['href'])
        attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted)
        return attrs

    in_(('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
         'http://example.com</a>',
         '<a rel="nofollow" href="http://bouncer/?u=http%3A%2F%2Fexample.com">'
         'http://example.com</a>'),
        linkify('http://example.com', DC + [filter_url]))


def test_mangle_text():
    """We can muck with the inner text of a link."""
    def ft(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    eq_('<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>',
        linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', [ft]))


def test_email_link():
    tests = (
        # parse_email=False leaves addresses alone.
        ('a james@example.com mailto', False, 'a james@example.com mailto'),
        ('a james@example.com.au mailto', False,
         'a james@example.com.au mailto'),
        # parse_email=True wraps them in mailto: links.
        ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
         True, 'a james@example.com mailto'),
        ('aussie <a href="mailto:james@example.com.au">'
         'james@example.com.au</a> mailto', True,
         'aussie james@example.com.au mailto'),
        # This is kind of a pathological case. I guess we do our best here.
        (('email to <a href="james@example.com" rel="nofollow">'
          'james@example.com</a>',
          'email to <a rel="nofollow" href="james@example.com">'
          'james@example.com</a>'),
         True,
         'email to <a href="james@example.com">james@example.com</a>'),
    )

    def _check(o, p, i):
        if isinstance(o, (list, tuple)):
            in_(o, linkify(i, parse_email=p))
        else:
            eq_(o, linkify(i, parse_email=p))

    for (o, p, i) in tests:
        yield _check, o, p, i


def test_email_link_escaping():
    tests = (
        # Quotes in the local part force single-quoted href serialization.
        ('''<a href='mailto:"james"@example.com'>'''
         '''"james"@example.com</a>''',
         '"james"@example.com'),
        ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
         '''"j'ames"@example.com</a>''',
         '"j\'ames"@example.com'),
        ('''<a href='mailto:"ja>mes"@example.com'>'''
         '''"ja&gt;mes"@example.com</a>''',
         '"ja>mes"@example.com'),
    )

    def _check(o, i):
        eq_(o, linkify(i, parse_email=True))

    for (o, i) in tests:
        yield _check, o, i


def test_prevent_links():
    """Returning None from any callback should remove links or prevent them
    from being created."""
    def no_new_links(attrs, new=False):
        if new:
            return None
        return attrs

    def no_old_links(attrs, new=False):
        if not new:
            return None
        return attrs

    def noop(attrs, new=False):
        return attrs

    in_text = 'a ex.mp <a href="http://example.com">example</a>'
    out_text = 'a <a href="http://ex.mp">ex.mp</a> example'

    tests = (
        # No callback vetoes anything: new link created, old link kept.
        ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
                  '<a href="http://example.com">example</a>'), 'noop'),
        # Vetoing new links leaves the input unchanged.
        ([no_new_links, noop], in_text, 'no new, noop'),
        ([noop, no_new_links], in_text, 'noop, no new'),
        # Vetoing old links strips the existing <a> but linkifies ex.mp.
        ([no_old_links, noop], out_text, 'no old, noop'),
        ([noop, no_old_links], out_text, 'noop, no old'),
        # Vetoing both yields plain text.
        ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
    )

    def _check(cb, o, msg):
        eq_(o, linkify(in_text, cb), msg)

    for (cb, o, msg) in tests:
        yield _check, cb, o, msg


def test_set_attrs():
    """We can set random attributes on links."""
    def set_attr(attrs, new=False):
        attrs['rev'] = 'canonical'
        return attrs

    in_(('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
         '<a rev="canonical" href="http://ex.mp">ex.mp</a>'),
        linkify('ex.mp', [set_attr]))


def test_only_proto_links():
    """Only create links if there's a protocol."""
    def only_proto(attrs, new=False):
        if new and not attrs['_text'].startswith(('http:', 'https:')):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    out_text = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
                '<a href="/foo">bar</a>')
    eq_(out_text, linkify(in_text, [only_proto]))


def test_stop_email():
    """Returning None should prevent a link from being created."""
    def no_email(attrs, new=False):
        if attrs['href'].startswith('mailto:'):
            return None
        return attrs

    text = 'do not link james@example.com'
    eq_(text, linkify(text, parse_email=True, callbacks=[no_email]))


def test_tlds():
    in_(('<a href="http://example.com" rel="nofollow">example.com</a>',
         '<a rel="nofollow" href="http://example.com">example.com</a>'),
        linkify('example.com'))
    in_(('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
         '<a rel="nofollow" href="http://example.co.uk">example.co.uk</a>'),
        linkify('example.co.uk'))
    in_(('<a href="http://example.edu" rel="nofollow">example.edu</a>',
         '<a rel="nofollow" href="http://example.edu">example.edu</a>'),
        linkify('example.edu'))
    # Unknown TLDs are not linkified.
    eq_('example.xxx', linkify('example.xxx'))
    eq_(' brie', linkify(' brie'))
    in_(('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
         '<a rel="nofollow" href="http://bit.ly/fun">bit.ly/fun</a>'),
        linkify('bit.ly/fun'))


def test_escaping():
    eq_('&lt; unrelated', linkify('< unrelated'))


def test_nofollow_off():
    # An empty callback list means no nofollow is added.
    eq_('<a href="http://example.com">example.com</a>',
        linkify('example.com', []))


def test_link_in_html():
    in_(('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
         '<i><a rel="nofollow" href="http://yy.com">http://yy.com</a></i>'),
        linkify('<i>http://yy.com</i>'))
    in_(('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com'
         '</a></strong></em>',
         '<em><strong><a rel="nofollow" href="http://xx.com">http://xx.com'
         '</a></strong></em>'),
        linkify('<em><strong>http://xx.com</strong></em>'))


def test_links_https():
    in_(('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
         '<a rel="nofollow" href="https://yy.com">https://yy.com</a>'),
        linkify('https://yy.com'))


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    in_(('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
         '<a rel="nofollow" href="http://yy.com">http://yy.com</a>'),
        linkify('<a href="http://yy.com">http://yy.com</a>'))


def test_url_with_path():
    in_(('<a href="http://example.com/path/to/file" rel="nofollow">'
         'http://example.com/path/to/file</a>',
         '<a rel="nofollow" href="http://example.com/path/to/file">'
         'http://example.com/path/to/file</a>'),
        linkify('http://example.com/path/to/file'))


def test_link_ftp():
    in_(('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
         'ftp://ftp.mozilla.org/some/file</a>',
         '<a rel="nofollow" href="ftp://ftp.mozilla.org/some/file">'
         'ftp://ftp.mozilla.org/some/file</a>'),
        linkify('ftp://ftp.mozilla.org/some/file'))


def test_link_query():
    in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
         'http://xx.com/?test=win</a>',
         '<a rel="nofollow" href="http://xx.com/?test=win">'
         'http://xx.com/?test=win</a>'),
        linkify('http://xx.com/?test=win'))
    in_(('<a href="http://xx.com/?test=win" rel="nofollow">'
         'xx.com/?test=win</a>',
         '<a rel="nofollow" href="http://xx.com/?test=win">'
         'xx.com/?test=win</a>'),
        linkify('xx.com/?test=win'))
    in_(('<a href="http://xx.com?test=win" rel="nofollow">'
         'xx.com?test=win</a>',
         '<a rel="nofollow" href="http://xx.com?test=win">'
         'xx.com?test=win</a>'),
        linkify('xx.com?test=win'))


def test_link_fragment():
    in_(('<a href="http://xx.com/path#frag" rel="nofollow">'
         'http://xx.com/path#frag</a>',
         '<a rel="nofollow" href="http://xx.com/path#frag">'
         'http://xx.com/path#frag</a>'),
        linkify('http://xx.com/path#frag'))


def test_link_entities():
    in_(('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
         'http://xx.com/?a=1&amp;b=2</a>',
         '<a rel="nofollow" href="http://xx.com/?a=1&amp;b=2">'
         'http://xx.com/?a=1&amp;b=2</a>'),
        linkify('http://xx.com/?a=1&b=2'))


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = '&lt;em&gt;strong&lt;/em&gt;'
    eq_(s, linkify(s))


def test_link_http_complete():
    in_(('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f"'
         ' rel="nofollow">'
         'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
         '<a rel="nofollow" '
         'href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f">'
         'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>'),
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    eq_(s, linkify(s))


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    eq_(s, linkify(s))


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    in_(('All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">'
         'xx.yy.com/grover.png</a>"}base are',
         'All your{"<a rel="nofollow" href="http://xx.yy.com/grover.png">'
         'xx.yy.com/grover.png</a>"}base are'),
        linkify('All your{"xx.yy.com/grover.png"}base are'))


def test_skip_pre():
    """Skip linkification in <pre> tags."""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>',
              '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">'
                  'http://xx.com</a></pre>',
                  '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
                  '<pre><a rel="nofollow" href="http://xx.com">'
                  'http://xx.com</a></pre>')
    in_(linked, linkify(simple, skip_pre=True))
    in_(all_linked, linkify(simple))

    # Links already inside <pre> still get rel="nofollow" either way.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = ('<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>',
                  '<pre><a rel="nofollow" href="http://xx.com">xx</a></pre>')
    in_(nofollowed, linkify(already_linked))
    in_(nofollowed, linkify(already_linked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    eq_('libgl.so.1', linkify('libgl.so.1'))


def test_end_of_sentence():
    """example.com. should match."""
    outs = ('<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}',
            '<a rel="nofollow" href="http://{0!s}">{0!s}</a>{1!s}')
    intxt = '{0!s}{1!s}'

    def check(u, p):
        in_([out.format(u, p) for out in outs],
            linkify(intxt.format(u, p)))

    tests = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for u, p in tests:
        yield check, u, p


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    in_(('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
         '<a rel="nofollow" href="http://ex.com/foo">ex.com/foo</a>, bar'),
        linkify('ex.com/foo, bar'))


def test_sarcasm():
    """Jokes should crash.<sarcasm/>"""
    dirty = 'Yeah right <sarcasm/>'
    clean = 'Yeah right &lt;sarcasm/&gt;'
    eq_(clean, linkify(dirty))


def test_wrapping_parentheses():
    """URLs wrapped in parantheses should not include them."""
    outs = ('{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}',
            '{0!s}<a rel="nofollow" href="http://{1!s}">{2!s}</a>{3!s}')

    # (input, (leading text, href (sans scheme), link text, trailing text))
    tests = (
        ('(example.com)', ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)', ('(', 'example.com/foo',
                               'example.com/foo', ')')),
        ('(((example.com/))))', ('(((', 'example.com/)',
                                 'example.com/)', ')))')),
        ('example.com/))', ('', 'example.com/))', 'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         ('', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         ('((', 'en.wikipedia.org/wiki/Test_(assessment',
          'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
          'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
          'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        in_([o.format(*expected_output) for o in outs],
            linkify(test))

    for test, expected_output in tests:
        yield check, test, expected_output


def test_ports():
    """URLs can contain port numbers."""
    tests = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        # A non-numeric "port" ends the URL.
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        # A trailing bare colon is not part of the URL.
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        outs = ('<a href="{0}" rel="nofollow">{0}</a>{1}',
                '<a rel="nofollow" href="{0}">{0}</a>{1}')
        in_([out.format(*output) for out in outs],
            linkify(test))

    for test, output in tests:
        yield check, test, output


def test_tokenizer():
    """Linkify doesn't always have to sanitize."""
    raw = '<em>test<x></x></em>'
    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
    # The plain HTMLTokenizer leaves unknown tags alone.
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))


def test_ignore_bad_protocols():
    eq_('foohttp://bar', linkify('foohttp://bar'))
    in_(('fohttp://<a href="http://exampl.com" rel="nofollow">'
         'exampl.com</a>',
         'fohttp://<a rel="nofollow" href="http://exampl.com">'
         'exampl.com</a>'),
        linkify('fohttp://exampl.com'))


def test_max_recursion_depth():
    """If we hit the max recursion depth, just return the string."""
    test = '<em>' * 2000 + 'foo' + '</em>' * 2000
    eq_(test, linkify(test))


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    output = ('<a href="http://example.com" rel="nofollow">'
              'http://example.com</a> <a href="mailto:person@example.com">'
              'person@example.com</a>',
              '<a rel="nofollow" href="http://example.com">'
              'http://example.com</a> <a href="mailto:person@example.com">'
              'person@example.com</a>')
    in_(output, linkify('http://example.com person@example.com',
                        parse_email=True))


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
              'HTTP://EXAMPLE.COM</a>',
              '<a rel="nofollow" href="HTTP://EXAMPLE.COM">'
              'HTTP://EXAMPLE.COM</a>')
    in_(expect, linkify('HTTP://EXAMPLE.COM'))


def test_elements_inside_links():
    in_(('<a href="#" rel="nofollow">hello<br></a>',
         '<a rel="nofollow" href="#">hello<br></a>'),
        linkify('<a href="#">hello<br></a>'))

    in_(('<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
         '<a rel="nofollow" href="#"><strong>bold</strong> hello<br></a>'),
        linkify('<a href="#"><strong>bold</strong> hello<br></a>'))