From 11b8160e584470439c8c0b3ab51012c9300f6788 Mon Sep 17 00:00:00 2001 From: Christopher Baines Date: Sun, 6 Dec 2015 13:31:51 +0000 Subject: Imported Upstream version 1.4.2 --- .gitignore | 9 ++ .travis.yml | 14 ++++ CHANGES | 18 +++- CONTRIBUTING.rst | 7 ++ README.rst | 19 ++++- bleach/__init__.py | 36 ++++---- bleach/callbacks.py | 2 +- bleach/sanitizer.py | 2 +- bleach/tests/test_links.py | 193 +++++++++++++++++-------------------------- bleach/tests/test_unicode.py | 6 +- bleach/tests/tools.py | 2 +- docs/conf.py | 6 +- docs/goals.rst | 3 + requirements.txt | 1 + setup.cfg | 2 + setup.py | 25 ++++-- tox.ini | 4 +- 17 files changed, 198 insertions(+), 151 deletions(-) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CONTRIBUTING.rst create mode 100644 setup.cfg diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96e22b0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.pyo +*.pyc +pip-log.txt +.coverage +dist +*.egg-info +.noseids +build +.tox diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..193f70a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +sudo: false +language: python +python: + - "2.6" + - "2.7" + - "3.2" + - "3.3" + - "3.4" + - "pypy" +install: + - "pip install -r requirements.txt" +script: + - nosetests + - flake8 bleach/ diff --git a/CHANGES b/CHANGES index 1def1a2..00ed505 100644 --- a/CHANGES +++ b/CHANGES @@ -1,10 +1,26 @@ Bleach Changes ============== +Version 1.4.2 +------------- + +- Fix hang in linkify with parse_email=True. #124 +- Fix crash in linkify when removing a link that is a first-child. #136 +- Updated TLDs. +- Don't remove exterior brackets when linkifying. #146 + + +Version 1.4.1 +------------- + +- Consistent order of attributes in output. +- Python 3.4. + + Version 1.4 ----------- -- Update linkify to use etree type Treeewalker instead of simpletree. +- Update linkify to use etree type Treewalker instead of simpletree. - Updated html5lib to version >= 0.999. - Update all code to be compatible with Python 3 and 2 using six. - Switch to Apache License. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..015ceb8 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,7 @@ +Reporting Security Issues +========================= + +If you believe you have found an exploit in a patched version of Bleach, +master or the latest released version on PyPI, **please do not post it +in a GitHub issue**. Please contact me privately, at +`me+bleach@jamessocol.com `. diff --git a/README.rst b/README.rst index 5e52cae..2dc7420 100644 --- a/README.rst +++ b/README.rst @@ -2,6 +2,12 @@ Bleach ====== +.. image:: https://travis-ci.org/jsocol/bleach.png?branch=master + :target: https://travis-ci.org/jsocol/bleach + +.. image:: https://badge.fury.io/py/Bleach.svg + :target: http://badge.fury.io/py/Bleach + Bleach is an HTML sanitizing library that escapes or strips markup and attributes based on a white list. Bleach can also linkify text safely, applying filters that Django's ``urlize`` filter cannot, and optionally setting ``rel`` @@ -20,10 +26,21 @@ The version on GitHub_ is the most up-to-date and contains the latest bug fixes. You can find full documentation on `ReadTheDocs`_. +Reporting Security Issues +========================= + +If you believe you have found an exploit in a patched version of Bleach, +master or the latest released version on PyPI, **please do not post it +in a GitHub issue**. Please contact me privately, at +`me+bleach@jamessocol.com `. + + Basic Use ========= -The simplest way to use Bleach is:: +The simplest way to use Bleach is: + +.. code-block:: python >>> import bleach diff --git a/bleach/__init__.py b/bleach/__init__.py index b110972..1d8caa2 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -13,8 +13,8 @@ from .encoding import force_unicode from .sanitizer import BleachSanitizer -VERSION = (1, 4, 0) -__version__ = '1.4' +VERSION = (1, 4, 2) +__version__ = '.'.join([str(n) for n in VERSION]) __all__ = ['clean', 'linkify'] @@ -51,16 +51,17 @@ TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne - net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro - ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so - sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt - tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm - zw""".split() - -PROTOCOLS = HTMLSanitizer.acceptable_protocols + net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post + pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl + sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to + tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws + xn xxx ye yt yu za zm zw""".split() +# Make sure that .com doesn't get matched by .co first TLDS.reverse() +PROTOCOLS = HTMLSanitizer.acceptable_protocols + url_re = re.compile( r"""\(* # Match any opening parentheses. \b(?http://example.com' + eq_('a http://example.com' ' link', - 'a http://example.com' - ' link'), linkify('a http://example.com link')) - in_(('a https://example.com' + eq_('a https://example.com' ' link', - 'a https://example.com' - ' link'), linkify('a https://example.com link')) - in_(('a example.com link', - 'a example.com link'), + eq_('a example.com link', linkify('a example.com link')) def test_trailing_slash(): - in_(('http://examp.com/', - 'http://examp.com/'), + eq_('http://examp.com/', linkify('http://examp.com/')) - in_(('' - 'http://example.com/foo/', - '' - 'http://example.com/foo/'), + eq_('' + 'http://example.com/foo/', linkify('http://example.com/foo/')) - in_(('' - 'http://example.com/foo/bar/', - '' - 'http://example.com/foo/bar/'), + eq_('' + 'http://example.com/foo/bar/', linkify('http://example.com/foo/bar/')) @@ -61,10 +50,8 @@ def test_mangle_link(): attrs['href'] = 'http://bouncer/?u={0!s}'.format(quoted) return attrs - in_(('' - 'http://example.com', - '' - 'http://example.com'), + eq_('' + 'http://example.com', linkify('http://example.com', DC + [filter_url])) @@ -90,19 +77,18 @@ def test_email_link(): 'james@example.com.au mailto', True, 'aussie james@example.com.au mailto'), # This is kind of a pathological case. I guess we do our best here. - (('email to ' - 'james@example.com', - 'email to ' - 'james@example.com'), + ('email to ' + 'james@example.com', True, 'email to james@example.com'), + ('
' + 'jinkyun@example.com', + True, + '
jinkyun@example.com'), ) def _check(o, p, i): - if isinstance(o, (list, tuple)): - in_(o, linkify(i, parse_email=p)) - else: - eq_(o, linkify(i, parse_email=p)) + eq_(o, linkify(i, parse_email=p)) for (o, p, i) in tests: yield _check, o, p, i @@ -171,8 +157,7 @@ def test_set_attrs(): attrs['rev'] = 'canonical' return attrs - in_(('ex.mp', - 'ex.mp'), + eq_('ex.mp', linkify('ex.mp', [set_attr])) @@ -200,19 +185,19 @@ def test_stop_email(): def test_tlds(): - in_(('example.com', - 'example.com'), + eq_('example.com', linkify('example.com')) - in_(('example.co.uk', - 'example.co.uk'), + eq_('example.co', + linkify('example.co')) + eq_('example.co.uk', linkify('example.co.uk')) - in_(('example.edu', - 'example.edu'), + eq_('example.edu', linkify('example.edu')) - eq_('example.xxx', linkify('example.xxx')) + eq_('example.xxx', + linkify('example.xxx')) + eq_('example.yyy', linkify('example.yyy')) eq_(' brie', linkify(' brie')) - in_(('bit.ly/fun', - 'bit.ly/fun'), + eq_('bit.ly/fun', linkify('bit.ly/fun')) @@ -226,77 +211,58 @@ def test_nofollow_off(): def test_link_in_html(): - in_(('http://yy.com', - 'http://yy.com'), + eq_('http://yy.com', linkify('http://yy.com')) - in_(('http://xx.com' - '', - 'http://xx.com' - ''), + eq_('http://xx.com' + '', linkify('http://xx.com')) def test_links_https(): - in_(('https://yy.com', - 'https://yy.com'), + eq_('https://yy.com', linkify('https://yy.com')) def test_add_rel_nofollow(): """Verify that rel="nofollow" is added to an existing link""" - in_(('http://yy.com', - 'http://yy.com'), + eq_('http://yy.com', linkify('http://yy.com')) def test_url_with_path(): - in_(('' - 'http://example.com/path/to/file', - '' - 'http://example.com/path/to/file'), + eq_('' + 'http://example.com/path/to/file', linkify('http://example.com/path/to/file')) def test_link_ftp(): - in_(('' - 'ftp://ftp.mozilla.org/some/file', - '' - 'ftp://ftp.mozilla.org/some/file'), + eq_('' + 'ftp://ftp.mozilla.org/some/file', linkify('ftp://ftp.mozilla.org/some/file')) def test_link_query(): - in_(('' + eq_('' 'http://xx.com/?test=win', - '' - 'http://xx.com/?test=win'), linkify('http://xx.com/?test=win')) - in_(('' + eq_('' 'xx.com/?test=win', - '' - 'xx.com/?test=win'), linkify('xx.com/?test=win')) - in_(('' + eq_('' 'xx.com?test=win', - '' - 'xx.com?test=win'), linkify('xx.com?test=win')) def test_link_fragment(): - in_(('' - 'http://xx.com/path#frag', - '' - 'http://xx.com/path#frag'), + eq_('' + 'http://xx.com/path#frag', linkify('http://xx.com/path#frag')) def test_link_entities(): - in_(('' + eq_('' 'http://xx.com/?a=1&b=2', - '' - 'http://xx.com/?a=1&b=2'), linkify('http://xx.com/?a=1&b=2')) @@ -307,12 +273,9 @@ def test_escaped_html(): def test_link_http_complete(): - in_(('' 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f', - '' - 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'), linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f')) @@ -330,10 +293,8 @@ def test_javascript_url(): def test_unsafe_url(): """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning.""" - in_(('All your{"xx.yy.com/grover.png"}base are', - 'All your{"xx.yy.com/grover.png"}base are'), + eq_('All your{"xx.yy.com/grover.png"}base are', linkify('All your{"xx.yy.com/grover.png"}base are')) @@ -341,23 +302,17 @@ def test_skip_pre(): """Skip linkification in
 tags."""
     simple = 'http://xx.com 
http://xx.com
' linked = ('http://xx.com ' - '
http://xx.com
', - 'http://xx.com ' '
http://xx.com
') all_linked = ('http://xx.com ' '
http://xx.com'
-                  '
', - 'http://xx.com ' - '
http://xx.com'
                   '
') - in_(linked, linkify(simple, skip_pre=True)) - in_(all_linked, linkify(simple)) + eq_(linked, linkify(simple, skip_pre=True)) + eq_(all_linked, linkify(simple)) already_linked = '
xx
' - nofollowed = ('
xx
', - '
xx
') - in_(nofollowed, linkify(already_linked)) - in_(nofollowed, linkify(already_linked, skip_pre=True)) + nofollowed = '
xx
' + eq_(nofollowed, linkify(already_linked)) + eq_(nofollowed, linkify(already_linked, skip_pre=True)) def test_libgl(): @@ -367,12 +322,11 @@ def test_libgl(): def test_end_of_sentence(): """example.com. should match.""" - outs = ('{0!s}{1!s}', - '{0!s}{1!s}') + out = '{0!s}{1!s}' intxt = '{0!s}{1!s}' def check(u, p): - in_([out.format(u, p) for out in outs], + eq_(out.format(u, p), linkify(intxt.format(u, p))) tests = ( @@ -388,8 +342,7 @@ def test_end_of_sentence(): def test_end_of_clause(): """example.com/foo, shouldn't include the ,""" - in_(('ex.com/foo, bar', - 'ex.com/foo, bar'), + eq_('ex.com/foo, bar', linkify('ex.com/foo, bar')) @@ -402,8 +355,7 @@ def test_sarcasm(): def test_wrapping_parentheses(): """URLs wrapped in parantheses should not include them.""" - outs = ('{0!s}{2!s}{3!s}', - '{0!s}{2!s}{3!s}') + out = '{0!s}{2!s}{3!s}' tests = ( ('(example.com)', ('(', 'example.com', 'example.com', ')')), @@ -431,12 +383,17 @@ def test_wrapping_parentheses(): ) def check(test, expected_output): - in_([o.format(*expected_output) for o in outs], linkify(test)) + eq_(out.format(*expected_output), linkify(test)) for test, expected_output in tests: yield check, test, expected_output +def test_parentheses_with_removing(): + expect = '(test.py)' + eq_(expect, linkify(expect, callbacks=[lambda *a: None])) + + def test_ports(): """URLs can contain port numbers.""" tests = ( @@ -448,9 +405,8 @@ def test_ports(): ) def check(test, output): - outs = ('{0}{1}', - '{0}{1}') - in_([out.format(*output) for out in outs], + out = '{0}{1}' + eq_(out.format(*output), linkify(test)) for test, output in tests: @@ -467,8 +423,7 @@ def test_tokenizer(): def test_ignore_bad_protocols(): eq_('foohttp://bar', linkify('foohttp://bar')) - in_(('fohttp://exampl.com', - 'fohttp://exampl.com'), + eq_('fohttp://exampl.com', linkify('fohttp://exampl.com')) @@ -481,29 +436,29 @@ def test_max_recursion_depth(): def test_link_emails_and_urls(): """parse_email=True shouldn't prevent URLs from getting linkified.""" output = ('' - 'http://example.com ' - 'person@example.com', - '' 'http://example.com ' 'person@example.com') - in_(output, linkify('http://example.com person@example.com', + eq_(output, linkify('http://example.com person@example.com', parse_email=True)) def test_links_case_insensitive(): """Protocols and domain names are case insensitive.""" expect = ('' - 'HTTP://EXAMPLE.COM', - '' 'HTTP://EXAMPLE.COM') - in_(expect, linkify('HTTP://EXAMPLE.COM')) + eq_(expect, linkify('HTTP://EXAMPLE.COM')) def test_elements_inside_links(): - in_(('hello
', - 'hello
'), + eq_('hello
', linkify('hello
')) - in_(('bold hello
', - 'bold hello
'), + eq_('bold hello
', linkify('bold hello
')) + + +def test_remove_first_childlink(): + expect = '

something

' + callbacks = [lambda *a: None] + eq_(expect, + linkify('

something

', callbacks=callbacks)) diff --git a/bleach/tests/test_unicode.py b/bleach/tests/test_unicode.py index 796924d..723df5f 100644 --- a/bleach/tests/test_unicode.py +++ b/bleach/tests/test_unicode.py @@ -30,9 +30,9 @@ def test_mixed(): def test_mixed_linkify(): in_(('Домашняя ' - 'http://example.com ヘルプとチュートリアル', - 'Домашняя ' - 'http://example.com ヘルプとチュートリアル'), + 'http://example.com ヘルプとチュートリアル', + 'Домашняя ' + 'http://example.com ヘルプとチュートリアル'), linkify('Домашняя http://example.com ヘルプとチュートリアル')) diff --git a/bleach/tests/tools.py b/bleach/tests/tools.py index 87f926c..3ae047e 100644 --- a/bleach/tests/tools.py +++ b/bleach/tests/tools.py @@ -3,5 +3,5 @@ def in_(l, a, msg=None): """Shorthand for 'assert a in l, "%r not in %r" % (a, l) """ - if not a in l: + if a not in l: raise AssertionError(msg or "%r not in %r" % (a, l)) diff --git a/docs/conf.py b/docs/conf.py index 96b2fc8..78bee32 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,16 +41,16 @@ master_doc = 'index' # General information about the project. project = u'Bleach' -copyright = u'2012, James Socol' +copyright = u'2012-2104, James Socol' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.3' +version = '1.4' # The full version, including alpha/beta/rc tags. -release = '1.3.1' +release = '1.4.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/goals.rst b/docs/goals.rst index 5477f9c..d62d54b 100644 --- a/docs/goals.rst +++ b/docs/goals.rst @@ -66,6 +66,9 @@ non-goal use cases include: you have to allow so many tags that a blacklist approach (e.g. forbidding ``