From fac84c6d90e0875e6c1b10c5ef02d577ee008af4 Mon Sep 17 00:00:00 2001 From: Per Andersson Date: Sun, 9 Jun 2013 19:45:54 +0200 Subject: Imported Upstream version 1.2.2 --- .gitignore | 8 -- .travis.yml | 6 - CHANGES | 87 +++++++++++++++ CONTRIBUTORS | 26 ++++- README.rst | 141 ++++-------------------- bleach/__init__.py | 230 +++++++++++++++++---------------------- bleach/callbacks.py | 15 +++ bleach/sanitizer.py | 6 +- bleach/tests/test_basics.py | 5 +- bleach/tests/test_css.py | 16 ++- bleach/tests/test_delinkify.py | 109 ------------------- bleach/tests/test_links.py | 200 ++++++++++++++++++++++++++++------ bleach/tests/test_security.py | 4 + docs/Makefile | 153 ++++++++++++++++++++++++++ docs/clean.rst | 122 +++++++++++++++++++++ docs/conf.py | 242 +++++++++++++++++++++++++++++++++++++++++ docs/goals.rst | 76 +++++++++++++ docs/index.rst | 69 ++++++++++++ docs/linkify.rst | 212 ++++++++++++++++++++++++++++++++++++ docs/make.bat | 190 ++++++++++++++++++++++++++++++++ requirements.txt | 4 +- setup.py | 4 +- 22 files changed, 1510 insertions(+), 415 deletions(-) delete mode 100644 .gitignore delete mode 100644 .travis.yml create mode 100644 CHANGES create mode 100644 bleach/callbacks.py delete mode 100644 bleach/tests/test_delinkify.py create mode 100644 docs/Makefile create mode 100644 docs/clean.rst create mode 100644 docs/conf.py create mode 100644 docs/goals.rst create mode 100644 docs/index.rst create mode 100644 docs/linkify.rst create mode 100644 docs/make.bat diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 6714ae6..0000000 --- a/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -*.pyo -*.pyc -pip-log.txt -.coverage -dist -*.egg-info -.noseids -build diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e767f15..0000000 --- a/.travis.yml +++ /dev/null @@ -1,6 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" -install: pip install -Ur requirements.txt --use-mirrors -script: nosetests diff --git a/CHANGES b/CHANGES new file 
mode 100644 index 0000000..d9bad9c --- /dev/null +++ b/CHANGES @@ -0,0 +1,87 @@ +Bleach Changes +============== + +Version 1.2.1 +------------- + +- clean() no longer considers "feed:" an acceptable protocol due to + inconsistencies in browser behavior. + + +Version 1.2 +----------- + +- linkify() has changed considerably. Many keyword arguments have been + replaced with a single callbacks list. Please see the documentation + for more information. +- Bleach will no longer consider unacceptable protocols when linkifying. +- linkify() now takes a tokenizer argument that allows it to skip + sanitization. +- delinkify() is gone. +- Removed exception handling from _render. clean() and linkify() may now + throw. +- linkify() correctly ignores case for protocols and domain names. +- linkify() correctly handles markup within an tag. + + +Version 1.1.3 +------------- + +- Fix parsing bare URLs when parse_email=True. + + +Version 1.1.2 +------------- + +- Fix hang in style attribute sanitizer. (#61) +- Allow '/' in style attribute values. + + +Version 1.1.1 +------------- + +- Fix tokenizer for html5lib 0.9.5. + + +Version 1.1.0 +------------- + +- linkify() now understands port numbers. (#38) +- Documented character encoding behavior. (#41) +- Add an optional target argument to linkify(). +- Add delinkify() method. (#45) +- Support subdomain whitelist for delinkify(). (#47, #48) + + +Version 1.0.4 +------------- + +- Switch to SemVer git tags. +- Make linkify() smarter about trailing punctuation. (#30) +- Pass exc_info to logger during rendering issues. +- Add wildcard key for attributes. (#19) +- Make linkify() use the HTMLSanitizer tokenizer. (#36) +- Fix URLs wrapped in parentheses. (#23) +- Make linkify() UTF-8 safe. (#33) + + +Version 1.0.3 +------------- + +- linkify() works with 3rd level domains. (#24) +- clean() supports vendor prefixes in style values. (#31, #32) +- Fix linkify() email escaping. 
+ + +Version 1.0.2 +------------- + +- linkify() supports email addresses. +- clean() supports callables in attributes filter. + + +Version 1.0.1 +------------- + +- linkify() doesn't drop trailing slashes. (#21) +- linkify() won't linkify 'libgl.so.1'. (#22) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index f612983..f014916 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1 +1,25 @@ -See https://github.com/jsocol/bleach/contributors +Bleach is written and maintained by James Socol and various contributors +within and without the Mozilla Corporation and Foundation. + +Lead Developer: + +- James Socol + +Contributors: + +- Jeff Balogh +- Ricky Rosario +- Chris Beaven +- Luis Nell + +Patches: + +- Les Orchard +- Paul Craciunoiu +- Sébastien Fievet +- TimothyFitz +- Adrian "ThiefMaster" +- Adam Lofts +- Anton Kovalyov +- Mark Paschal +- Alex Ehlke diff --git a/README.rst b/README.rst index 08dfc50..093edc1 100644 --- a/README.rst +++ b/README.rst @@ -16,8 +16,8 @@ Because it relies on html5lib_, Bleach is as good as modern browsers at dealing with weird, quirky HTML fragments. And *any* of Bleach's methods will fix unbalanced or mis-nested tags. -The version on `github `_ is the most -up-to-date and contains the latest bug fixes. +The version on GitHub_ is the most up-to-date and contains the latest bug +fixes. You can find full documentation on `ReadTheDocs`_. Basic Use @@ -33,9 +33,6 @@ The simplest way to use Bleach is:: >>> bleach.linkify('an http://example.com url') u'an http://example.com url - >>> bleach.delinkify('a link') - u'a link' - *NB*: Bleach always returns a ``unicode`` object, whether you give it a bytestring or a ``unicode`` object, but Bleach does not attempt to detect incoming character encodings, and will assume UTF-8. If you are using a @@ -43,117 +40,27 @@ different character encoding, you should convert from a bytestring to ``unicode`` before passing the text to Bleach. 
-Customizing Bleach -================== - -``clean()``, ``linkify()`` and ``delinkify()`` can take several optional -keyword arguments to customize their behavior. - - -``clean()`` ------------ - -``bleach.clean()`` is the primary tool in Bleach. It uses html5lib_ to parse a -document fragment into a tree and does the sanitization during tokenizing, -which is incredibly powerful and has several advantages over regular -expression-based sanitization. - -``tags`` - A whitelist of HTML tags. Must be a list. Defaults to - ``bleach.ALLOWED_TAGS``. -``attributes`` - A whitelist of HTML attributes. Either a list, in which case all attributes - are allowed on all elements, or a dict, with tag names as keys and lists of - allowed attributes as values ('*' is a wildcard key to allow an attribute on - any tag). Or it is possible to pass a callable instead of a list that accepts - name and value of attribute and returns True of False. Defaults to - ``bleach.ALLOWED_ATTRIBUTES``. -``styles`` - A whitelist of allowed CSS properties within a ``style`` attribute. (Note - that ``style`` attributes are not allowed by default.) Must be a list. - Defaults to ``[]``. -``strip`` - Strip disallowed HTML instead of escaping it. A boolean. Defaults to - ``False``. -``strip_comments`` - Strip HTML comments. A boolean. Defaults to ``True``. - - -``linkify()`` -------------- - -``bleach.linkify()`` turns things that look like URLs or (optionally) email -addresses and turns them into links. It does this smartly, only looking in text -nodes, and never within ```` tags. - -There are options that affect output, and some of these are also applied to -links already found in the text. These are designed to allow you to set -attributes like ``rel="nofollow"`` or ``target``, or push outgoing links -through a redirection URL, and do this to links already in the text, as well. 
- -``nofollow`` - Add ``rel="nofollow"`` to non-relative links (both created by ``linkify()`` - and those already present in the text). Defaults to ``True``. -``filter_url`` - A callable through which the ``href`` attribute of links (both created by - ``linkify()`` and already present in the text) will be passed. Must accept a - single argument and return a string. -``filter_text`` - A callable through which the text of links (only those created by - ``linkify``) will be passed. Must accept a single argument and return a - string. -``skip_pre`` - Do not create new links inside ``
`` sections. Still follows
-  ``nofollow``. Defaults to ``False``.
-``parse_email``
-  Linkify email addresses with ``mailto:``. Defaults to ``False``.
-``target``
-  Set a ``target`` attribute on links. Like ``nofollow``, if ``target`` is not
-  ``None``, will set the attribute on links already in the text, as well.
-  Defaults to ``None``.
-
-
-``delinkify()``
----------------
-
-``bleach.delinkify()`` is basically the opposite of ``linkify()``. It strips
-links out of text except, optionally, relative links, or links to domains
-you've whitelisted.
-
-``allow_domains``
-  Allow links to the domains in this list. Set to ``None`` or an empty list to
-  disallow all non-relative domains. See below for wildcards. Defaults to
-  ``None``.
-``allow_relative``
-  Allow relative links (i.e. those with no hostname). Defaults to ``False``.
-
-
-Wildcards
-^^^^^^^^^
-
-To allow links to a domain and its subdomains, ``allow_domains`` accepts two
-types of wildcard arguments in domains:
-
-``*``
-  Allow a single level of subdomain. This can be anywhere in the hostname, even
-  the TLD. This allows you to, for example, allow links to ``example.*``.
-  ``*.example.com`` will match both ``foo.example.com`` and ``example.com``.
-  ::
-    >>> delinkify('bar', \
-    ... allow_domains=['*.ex.*'])
-    u'bar'
-    >>> delinkify('bar', allow_domains=['*.ex.mp'])
-    u'bar
-``**``
-  To allow any number of *preceding* subdomains, you can start a hostname with
-  ``**``. Note that unlike ``*``, ``**`` may only appear once, and only at the
-  beginning of a hostname.
-  ::
-    >>> delinkify('t', \
-    ... allow_domains=['**.ex.mp'])
-    u't'
-  If ``**`` appears anywhere but the beginning of a hostname, ``delinkify``
-  will throw ``bleach.ValidationError`` (which is a ``ValueError`` subclass,
-  for easy catching).
+Installation
+------------
+
+Bleach is available on PyPI_, so you can install it with ``pip``::
+
+    $ pip install bleach
+
+Or with ``easy_install``::
+
+    $ easy_install bleach
+
+Or by cloning the repo from GitHub_::
+
+    $ git clone git://github.com/jsocol/bleach.git
+
+Then install it by running::
+
+    $ python setup.py install
+
 
 .. _html5lib: http://code.google.com/p/html5lib/
+.. _GitHub: https://github.com/jsocol/bleach
+.. _ReadTheDocs: http://bleach.readthedocs.org/
+.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/bleach/__init__.py b/bleach/__init__.py
index bc8e49c..af75d0f 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -1,19 +1,18 @@
-import itertools
 import logging
 import re
 import sys
-import urlparse
 
 import html5lib
 from html5lib.sanitizer import HTMLSanitizer
 from html5lib.serializer.htmlserializer import HTMLSerializer
 
-from encoding import force_unicode
-from sanitizer import BleachSanitizer
+from . import callbacks as linkify_callbacks
+from .encoding import force_unicode
+from .sanitizer import BleachSanitizer
 
 
-VERSION = (1, 1, 5)
-__version__ = '.'.join(map(str, VERSION))
+VERSION = (1, 2, 1)
+__version__ = '1.2.1'
 
 __all__ = ['clean', 'linkify']
 
@@ -56,18 +55,21 @@ TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
        tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn ye yt yu za zm
        zw""".split()
 
+PROTOCOLS = HTMLSanitizer.acceptable_protocols
+
 TLDS.reverse()
 
 url_re = re.compile(
     r"""\(*  # Match any opening parentheses.
-    \b(?"]*)?
         # /path/zz (excluding "unsafe" chars from RFC 1738,
         # except for # and ~, which happen in practice)
-    """ % u'|'.join(TLDS), re.VERBOSE | re.UNICODE)
+    """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)),
+    re.IGNORECASE | re.VERBOSE | re.UNICODE)
 
-proto_re = re.compile(r'^[\w-]+:/{0,3}')
+proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
 
 punct_re = re.compile(r'([\.,]+)$')
 
@@ -83,7 +85,10 @@ email_re = re.compile(
 
 NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
 
-identity = lambda x: x  # The identity function.
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+PY_26 = (sys.version_info < (2, 7))
+RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError
 
 
 def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
@@ -93,8 +98,6 @@ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
         return u''
 
     text = force_unicode(text)
-    if text.startswith(u''
     open_comment = ' html'
+
+    >>> bleach.clean(html)
+    u'my html'
+
+    >>> bleach.clean(html, strip_comments=False)
+    u'my<!-- commented --> html'
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..a63aedf
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+#
+# Bleach documentation build configuration file, created by
+# sphinx-quickstart on Fri May 11 21:11:39 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Bleach'
+copyright = u'2012, James Socol'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.2'
+# The full version, including alpha/beta/rc tags.
+release = '1.2.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Bleachdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'Bleach.tex', u'Bleach Documentation',
+   u'James Socol', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'bleach', u'Bleach Documentation',
+     [u'James Socol'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'Bleach', u'Bleach Documentation',
+   u'James Socol', 'Bleach', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/docs/goals.rst b/docs/goals.rst
new file mode 100644
index 0000000..5477f9c
--- /dev/null
+++ b/docs/goals.rst
@@ -0,0 +1,76 @@
+===============
+Goals of Bleach
+===============
+
+This document lists the goals and non-goals of Bleach. My hope is that by
+focusing on these goals and explicitly listing the non-goals, the project will
+evolve in a stronger direction.
+
+
+Goals
+=====
+
+
+Whitelisting
+------------
+
+Bleach should always take a whitelist-based approach to allowing any kind of
+content or markup. Blacklisting is error-prone and not future proof.
+
+For example, you should have to opt-in to allowing the ``onclick`` attribute,
+not blacklist all the other ``on*`` attributes. Future versions of HTML may add
+new event handlers, like ``ontouch``, that old blacklists would not prevent.
+
+
+Sanitizing Input
+----------------
+
+The primary goal of Bleach is to sanitize user input that is allowed to contain
+*some* HTML as markup and is to be included in the content of a larger page.
+Examples might include:
+
+* User comments on a blog.
+
+* "Bio" sections of a user profile.
+
+* Descriptions of a product or application.
+
+These examples, and others, are traditionally prone to security issues like XSS
+or other script injection, or annoying issues like unclosed tags and invalid
+markup. Bleach will take a proactive, whitelist-only approach to allowing HTML
+content, and will use the HTML5 parsing algorithm to handle invalid markup.
+
+See the :ref:`chapter on clean() <clean-chapter>` for more info.
+
+
+Safely Creating Links
+---------------------
+
+The secondary goal of Bleach is to provide a mechanism for finding or altering
+links (``<a>`` tags with ``href`` attributes, or things that look like URLs or
+email addresses) in text.
+
+While Bleach itself will always operate on a whitelist-based security model,
+the :ref:`linkify() method <linkify-chapter>` is flexible enough to allow the
+creation, alteration, and removal of links based on an extremely wide range of
+use cases.
+
+
+Non-Goals
+=========
+
+Bleach is designed to work with fragments of HTML by untrusted users. Some
+non-goal use cases include:
+
+* **Sanitizing complete HTML documents.** Once you're creating whole documents,
+  you have to allow so many tags that a blacklist approach (e.g. forbidding
+  ``