"""Tests for ``bleach.linkify`` and the ``url_re`` pattern it relies on."""
# NOTE(review): this copy of the module had its line structure collapsed and
# the HTML tags/entities inside its string literals stripped.  The layout and
# the markup below were reconstructed from the surviving text (format-string
# arity, literal split points, docstrings, keyword arguments).  Confirm the
# exact serialisation (attribute order, quote style) against the bleach
# version under test before relying on these expectations.
import urllib

from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_

from bleach import linkify, url_re


def filter_url(url):
    """Rewrite *url* so that it goes through the ``bouncer`` redirector."""
    return u'http://bouncer/?u=%s' % urllib.quote_plus(url)


def test_url_re():
    """``url_re`` must not match ordinary prose containing dots."""
    def no_match(s):
        match = url_re.search(s)
        if match:
            # Report the offending substring when the pattern matches.
            assert not match, 'matched %s' % s[slice(*match.span())]
    yield no_match, 'just what i am looking for...it'


def test_empty():
    """An empty string comes back unchanged."""
    eq_('', linkify(''))


def test_simple_link():
    """Bare URLs and bare domains inside a sentence are wrapped in links."""
    eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
        '</a> link',
        linkify('a http://example.com link'))
    eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
        '</a> link',
        linkify('a https://example.com link'))
    eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
        linkify('an example.com link'))


def test_trailing_slash():
    """A trailing slash stays part of the linked URL."""
    eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
        linkify('http://example.com/'))
    eq_('<a href="http://example.com/foo/" rel="nofollow">'
        'http://example.com/foo/</a>',
        linkify('http://example.com/foo/'))
    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
        'http://example.com/foo/bar/</a>',
        linkify('http://example.com/foo/bar/'))


def test_mangle_link():
    """``filter_url`` rewrites the href while the link text is kept."""
    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
        'http://example.com</a>',
        linkify('http://example.com', filter_url=filter_url))


def test_email_link():
    """Addresses are only linked when ``parse_email=True`` is passed."""
    eq_('a james@example.com mailto',
        linkify('a james@example.com mailto'))
    eq_('a james@example.com.au mailto',
        linkify('a james@example.com.au mailto'))
    eq_('a <a href="mailto:james@example.com" rel="nofollow">'
        'james@example.com</a> mailto',
        linkify('a james@example.com mailto', parse_email=True))
    eq_('aussie <a href="mailto:james@example.com.au" rel="nofollow">'
        'james@example.com.au</a> mailto',
        linkify('aussie james@example.com.au mailto', parse_email=True))
    # An address that is already inside a link must not be linked again.
    eq_('email to <a href="james@example.com" rel="nofollow">'
        'james@example.com</a>',
        linkify('email to <a href="james@example.com">'
                'james@example.com</a>', parse_email=True))


def test_email_link_escaping():
    """Quotes and ``>`` in the local part are escaped in the output."""
    eq_('''<a href='mailto:"james"@example.com' rel="nofollow">'''
        '''"james"@example.com</a>''',
        linkify('"james"@example.com', parse_email=True))
    eq_('''<a href="mailto:&quot;j'ames&quot;@example.com" rel="nofollow">'''
        '''"j'ames"@example.com</a>''',
        linkify('"j\'ames"@example.com', parse_email=True))
    eq_('''<a href='mailto:"ja>mes"@example.com' rel="nofollow">'''
        '''"ja&gt;mes"@example.com</a>''',
        linkify('"ja>mes"@example.com', parse_email=True))


def test_tlds():
    """Bare domains are linked only when they end in a recognised TLD."""
    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
        linkify('example.com'))
    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
        linkify('example.co.uk'))
    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
        linkify('example.edu'))
    eq_('<a href="http://example.xxx" rel="nofollow">example.xxx</a>',
        linkify('example.xxx'))
    eq_(' brie', linkify(' brie'))
    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
        linkify('bit.ly/fun'))


def test_escaping():
    """A stray ``<`` is escaped."""
    eq_('&lt; unrelated', linkify('< unrelated'))


def test_nofollow_off():
    """``nofollow=False`` omits the ``rel`` attribute."""
    eq_('<a href="http://example.com">example.com</a>',
        linkify(u'example.com', nofollow=False))


def test_link_in_html():
    """URLs nested inside other elements are linked in place."""
    # NOTE(review): the wrapping tags were lost from this copy; any inline
    # elements exercise the same path -- confirm against upstream.
    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
        linkify('<i>http://yy.com</i>'))
    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
        '</strong></em>',
        linkify('<em><strong>http://xx.com</strong></em>'))


def test_links_https():
    """``https`` URLs are linked with their scheme preserved."""
    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
        linkify('https://yy.com'))


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
        linkify('<a href="http://yy.com">http://yy.com</a>'))


def test_url_with_path():
    """The path is part of the linked URL."""
    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>',
        linkify('http://example.com/path/to/file'))


def test_link_ftp():
    """``ftp`` URLs are linked."""
    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>',
        linkify('ftp://ftp.mozilla.org/some/file'))


def test_link_query():
    """Query strings are included, with or without scheme or path slash."""
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'http://xx.com/?test=win</a>',
        linkify('http://xx.com/?test=win'))
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'xx.com/?test=win</a>',
        linkify('xx.com/?test=win'))
    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
        'xx.com?test=win</a>',
        linkify('xx.com?test=win'))


def test_link_fragment():
    """Fragments are included in the linked URL."""
    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
        'http://xx.com/path#frag</a>',
        linkify('http://xx.com/path#frag'))


def test_link_entities():
    """``&`` in a URL is entity-escaped in both href and link text."""
    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
        'http://xx.com/?a=1&amp;b=2</a>',
        linkify('http://xx.com/?a=1&b=2'))


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = '&lt;em&gt;strong&lt;/em&gt;'
    eq_(s, linkify(s))


def test_link_http_complete():
    """A URL with credentials, path, query and fragment is linked whole."""
    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d'
        '&amp;e#f" rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    eq_(s, linkify(s))


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    eq_(s, linkify(s))


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    eq_('All your{"<a href="http://xx.yy.com/grover.png" '
        'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
        linkify('All your{"xx.yy.com/grover.png"}base are'))


def test_skip_pre():
    """Skip linkification in <pre> tags."""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
                  '</a></pre>')
    eq_(linked, linkify(simple, skip_pre=True))
    eq_(all_linked, linkify(simple))

    # Existing links still gain rel="nofollow", with or without skip_pre.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    eq_(nofollowed, linkify(already_linked))
    eq_(nofollowed, linkify(already_linked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    eq_('libgl.so.1', linkify('libgl.so.1'))


def test_end_of_sentence():
    """example.com. should match."""
    out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
    in_ = u'%s%s'

    def check(u, p):
        # The trailing punctuation ``p`` must stay outside the link.
        eq_(out % (u, u, p), linkify(in_ % (u, p)))

    tests = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for u, p in tests:
        yield check, u, p


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
        linkify('ex.com/foo, bar'))


def test_sarcasm():
    """Jokes should crash."""
    dirty = u'Yeah right <sarcasm/>'
    clean = u'Yeah right &lt;sarcasm/&gt;'
    eq_(clean, linkify(dirty))


def test_wrapping_parentheses():
    """URLs wrapped in parentheses should not include them."""
    # Placeholders: text before the link, href without scheme, link text,
    # text after the link.
    out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'

    tests = (
        ('(example.com)',
         out % ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)',
         out % ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)',
         out % ('(', 'example.com/foo', 'example.com/foo', ')')),
        ('(((example.com/))))',
         out % ('(((', 'example.com/)', 'example.com/)', ')))')),
        ('example.com/))',
         out % ('', 'example.com/))', 'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
                'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
                'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
                'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        eq_(expected_output, linkify(test))

    for test, expected_output in tests:
        yield check, test, expected_output


def test_ports():
    """URLs can contain port numbers."""
    # Each case: input, (text expected inside the link, text left after it).
    tests = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
            linkify(test))

    for test, output in tests:
        yield check, test, output


def test_target():
    """``target`` is added to generated links, with or without nofollow."""
    eq_('<a href="http://example.com" rel="nofollow" '
        'target="_blank">example.com</a>',
        linkify(u'example.com', target='_blank'))
    eq_('<a href="http://example.com" target="_blank">example.com</a>',
        linkify(u'example.com', target='_blank', nofollow=False))


def test_tokenizer():
    """Linkify doesn't always have to sanitize."""
    raw = '<em>test<x></x></em>'
    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
    # With the plain html5lib tokenizer the unknown tag passes through.
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))