"""Tests for bleach.linkify.

NOTE(review): this file had been flattened and its expected-output strings
lost their HTML markup (tags stripped, entities decoded). The assertions
below restore the `<a ...>` markup implied by the surviving link text and
by linkify's documented behavior (rel="nofollow" by default, callbacks,
parse_email, skip_pre, tokenizer). Confirm against a passing checkout.
"""
import urllib

from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_

from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC


def test_url_re():
    # url_re must not treat "...it" (ellipsis + a TLD-looking word) as a URL.
    def no_match(s):
        match = url_re.search(s)
        if match:
            assert not match, 'matched %s' % s[slice(*match.span())]
    yield no_match, 'just what i am looking for...it'


def test_empty():
    eq_('', linkify(''))


def test_simple_link():
    eq_('a <a href="http://example.com" rel="nofollow">http://example.com'
        '</a> link',
        linkify('a http://example.com link'))
    eq_('a <a href="https://example.com" rel="nofollow">https://example.com'
        '</a> link',
        linkify('a https://example.com link'))
    eq_('an <a href="http://example.com" rel="nofollow">example.com</a> link',
        linkify('an example.com link'))


def test_trailing_slash():
    eq_('<a href="http://example.com/" rel="nofollow">http://example.com/</a>',
        linkify('http://example.com/'))
    eq_('<a href="http://example.com/foo/" rel="nofollow">'
        'http://example.com/foo/</a>',
        linkify('http://example.com/foo/'))
    eq_('<a href="http://example.com/foo/bar/" rel="nofollow">'
        'http://example.com/foo/bar/</a>',
        linkify('http://example.com/foo/bar/'))


def test_mangle_link():
    """We can muck with the href attribute of the link."""
    def filter_url(attrs, new=False):
        attrs['href'] = (u'http://bouncer/?u=%s' %
                         urllib.quote_plus(attrs['href']))
        return attrs

    eq_('<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
        'http://example.com</a>',
        linkify('http://example.com', DC + [filter_url]))


def test_mangle_text():
    """We can muck with the inner text of a link."""

    def ft(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    eq_('<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>',
        linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', [ft]))


def test_email_link():
    tests = (
        ('a james@example.com mailto', False, 'a james@example.com mailto'),
        ('a james@example.com.au mailto', False,
         'a james@example.com.au mailto'),
        ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
         True, 'a james@example.com mailto'),
        ('aussie <a href="mailto:james@example.com.au">'
         'james@example.com.au</a> mailto', True,
         'aussie james@example.com.au mailto'),
        # This is kind of a pathological case. I guess we do our best here.
        ('email to <a href="james@example.com" rel="nofollow">'
         'james@example.com</a>', True,
         'email to <a href="james@example.com">james@example.com</a>'),
    )

    def _check(o, p, i):
        eq_(o, linkify(i, parse_email=p))

    for (o, p, i) in tests:
        yield _check, o, p, i


def test_email_link_escaping():
    tests = (
        ('''<a href='mailto:"james"@example.com'>'''
         '''"james"@example.com</a>''', '"james"@example.com'),
        ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
         '''"j'ames"@example.com</a>''', '"j\'ames"@example.com'),
        ('''<a href='mailto:"ja>mes"@example.com'>'''
         '''"ja&gt;mes"@example.com</a>''', '"ja>mes"@example.com'),
    )

    def _check(o, i):
        eq_(o, linkify(i, parse_email=True))

    for (o, i) in tests:
        yield _check, o, i


def test_prevent_links():
    """Returning None from any callback should remove links or prevent them
    from being created."""

    def no_new_links(attrs, new=False):
        if new:
            return None
        return attrs

    def no_old_links(attrs, new=False):
        if not new:
            return None
        return attrs

    def noop(attrs, new=False):
        return attrs

    in_text = 'a ex.mp <a href="http://example.com">example</a>'
    out_text = 'a <a href="http://ex.mp">ex.mp</a> example'
    tests = (
        # No callback: new links considered, existing links preserved.
        ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
                  '<a href="http://example.com">example</a>'), 'noop'),
        # Prevent new links from being created.
        ([no_new_links, noop], in_text, 'no new, noop'),
        ([noop, no_new_links], in_text, 'noop, no new'),
        # Remove existing links.
        ([no_old_links, noop], out_text, 'no old, noop'),
        ([noop, no_old_links], out_text, 'noop, no old'),
        # Remove everything.
        ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
    )

    def _check(cb, o, msg):
        eq_(o, linkify(in_text, cb), msg)

    for (cb, o, msg) in tests:
        yield _check, cb, o, msg


def test_set_attrs():
    """We can set random attributes on links."""

    def set_attr(attrs, new=False):
        attrs['rev'] = 'canonical'
        return attrs

    eq_('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
        linkify('ex.mp', [set_attr]))


def test_only_proto_links():
    """Only create links if there's a protocol."""

    def only_proto(attrs, new=False):
        if new and not attrs['_text'].startswith(('http:', 'https:')):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    out_text = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
                '<a href="/foo">bar</a>')
    eq_(out_text, linkify(in_text, [only_proto]))


def test_stop_email():
    """Returning None should prevent a link from being created."""

    def no_email(attrs, new=False):
        if attrs['href'].startswith('mailto:'):
            return None
        return attrs
    text = 'do not link james@example.com'
    eq_(text, linkify(text, parse_email=True, callbacks=[no_email]))


def test_tlds():
    eq_('<a href="http://example.com" rel="nofollow">example.com</a>',
        linkify('example.com'))
    eq_('<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
        linkify('example.co.uk'))
    eq_('<a href="http://example.edu" rel="nofollow">example.edu</a>',
        linkify('example.edu'))
    # .xxx is not in the recognized TLD list, so no link is created.
    eq_('example.xxx', linkify('example.xxx'))
    eq_(' brie', linkify(' brie'))
    eq_('<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>',
        linkify('bit.ly/fun'))


def test_escaping():
    eq_('&lt; unrelated', linkify('&lt; unrelated'))


def test_nofollow_off():
    # An empty callback list means no nofollow callback either.
    eq_('<a href="http://example.com">example.com</a>',
        linkify(u'example.com', []))


def test_link_in_html():
    eq_('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
        linkify('<i>http://yy.com</i>'))
    eq_('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a>'
        '</strong></em>',
        linkify('<em><strong>http://xx.com</strong></em>'))


def test_links_https():
    eq_('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
        linkify('https://yy.com'))


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    eq_('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
        linkify('<a href="http://yy.com">http://yy.com</a>'))


def test_url_with_path():
    eq_('<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>',
        linkify('http://example.com/path/to/file'))


def test_link_ftp():
    eq_('<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>',
        linkify('ftp://ftp.mozilla.org/some/file'))


def test_link_query():
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'http://xx.com/?test=win</a>',
        linkify('http://xx.com/?test=win'))
    eq_('<a href="http://xx.com/?test=win" rel="nofollow">'
        'xx.com/?test=win</a>',
        linkify('xx.com/?test=win'))
    eq_('<a href="http://xx.com?test=win" rel="nofollow">'
        'xx.com?test=win</a>',
        linkify('xx.com?test=win'))


def test_link_fragment():
    eq_('<a href="http://xx.com/path#frag" rel="nofollow">'
        'http://xx.com/path#frag</a>',
        linkify('http://xx.com/path#frag'))


def test_link_entities():
    # The & in the query string is escaped in the output.
    eq_('<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
        'http://xx.com/?a=1&amp;b=2</a>',
        linkify('http://xx.com/?a=1&b=2'))


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = '&lt;em&gt;strong&lt;/em&gt;'
    eq_(s, linkify(s))


def test_link_http_complete():
    eq_('<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f"'
        ' rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>',
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'))


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    eq_(s, linkify(s))


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    eq_(s, linkify(s))


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    eq_('All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">'
        'xx.yy.com/grover.png</a>"}base are',
        linkify('All your{"xx.yy.com/grover.png"}base are'))


def test_skip_pre():
    """Skip linkification in <pre> tags."""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
                  '</a></pre>')
    eq_(linked, linkify(simple, skip_pre=True))
    eq_(all_linked, linkify(simple))

    # Existing links inside <pre> still get the nofollow callback applied,
    # with or without skip_pre.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    eq_(nofollowed, linkify(already_linked))
    eq_(nofollowed, linkify(already_linked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    eq_('libgl.so.1', linkify('libgl.so.1'))


def test_end_of_sentence():
    """example.com. should match."""
    out = u'<a href="http://%s" rel="nofollow">%s</a>%s'
    in_ = u'%s%s'

    def check(u, p):
        eq_(out % (u, u, p), linkify(in_ % (u, p)))

    tests = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for u, p in tests:
        yield check, u, p


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    eq_('<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
        linkify('ex.com/foo, bar'))


def test_sarcasm():
    """Jokes should crash.<sarcasm/>"""
    dirty = u'Yeah right <sarcasm/>'
    clean = u'Yeah right &lt;sarcasm/&gt;'
    eq_(clean, linkify(dirty))


def test_wrapping_parentheses():
    """URLs wrapped in parantheses should not include them."""
    out = u'%s<a href="http://%s" rel="nofollow">%s</a>%s'
    tests = (
        ('(example.com)', out % ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)', out % ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)', out % ('(', 'example.com/foo',
                                     'example.com/foo', ')')),
        ('(((example.com/))))', out % ('(((', 'example.com/)',
                                       'example.com/)', ')))')),
        ('example.com/))', out % ('', 'example.com/))',
                                  'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         out % ('', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
                'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         out % ('((', 'en.wikipedia.org/wiki/Test_(assessment',
                'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         out % ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
                'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         out % ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
                'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        eq_(expected_output, linkify(test))

    for test, expected_output in tests:
        yield check, test, expected_output


def test_ports():
    """URLs can contain port numbers."""
    tests = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        # A non-numeric "port" ends the URL.
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        eq_(u'<a href="{0}" rel="nofollow">{0}</a>{1}'.format(*output),
            linkify(test))

    for test, output in tests:
        yield check, test, output


def test_tokenizer():
    """Linkify doesn't always have to sanitize."""
    raw = '<em>test<x></x></em>'
    eq_('<em>test&lt;x&gt;&lt;/x&gt;</em>', linkify(raw))
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))


def test_ignore_bad_protocols():
    eq_('foohttp://bar',
        linkify('foohttp://bar'))
    eq_('foohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
        linkify('foohttp://exampl.com'))


def test_max_recursion_depth():
    """If we hit the max recursion depth, just return the string."""
    test = '<em>' * 2000 + 'foo' + '</em>' * 2000
    eq_(test, linkify(test))


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    output = ('<a href="http://example.com" rel="nofollow">'
              'http://example.com</a> <a href="mailto:person@example.com">'
              'person@example.com</a>')
    eq_(output, linkify('http://example.com person@example.com',
                        parse_email=True))


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = ('<a href="HTTP://EXAMPLE.COM" rel="nofollow">'
              'HTTP://EXAMPLE.COM</a>')
    eq_(expect, linkify('HTTP://EXAMPLE.COM'))


def test_elements_inside_links():
    eq_(u'<a href="#" rel="nofollow">hello<br></a>',
        linkify('<a href="#">hello<br></a>'))

    eq_(u'<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
        linkify('<a href="#"><strong>bold</strong> hello<br></a>'))