aboutsummaryrefslogtreecommitdiff
path: root/docs
diff options
context:
space:
mode:
Diffstat (limited to 'docs')
-rw-r--r--docs/Makefile153
-rw-r--r--docs/clean.rst122
-rw-r--r--docs/conf.py242
-rw-r--r--docs/goals.rst76
-rw-r--r--docs/index.rst69
-rw-r--r--docs/linkify.rst212
-rw-r--r--docs/make.bat190
7 files changed, 1064 insertions, 0 deletions
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..81ad9f9
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+# Every target in this file names a command rather than a file it produces,
+# so each one is declared phony; otherwise a stray file or directory with a
+# matching name (e.g. "html" or "info") would make the target look up to date.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+# Remove everything under the build directory.  Refuse to run when BUILDDIR
+# is empty (e.g. `make clean BUILDDIR=`), because the glob below would then
+# expand to /* and the recipe would try to wipe the filesystem root.
+clean:
+ $(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run rm -rf /*))
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+# Build the Qt help project, then print the follow-up commands for generating
+# and viewing the collection file.  The quotes around the tool name are
+# escaped so the shell prints them instead of treating them as delimiters.
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Bleach.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Bleach.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/Bleach"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Bleach"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+# Build the Texinfo sources, then run the Makefile found in the texinfo
+# output directory to produce Info files.  The sub-make is invoked through
+# $(MAKE), as in the latexpdf target, so that flags such as -j and -n are
+# passed down to it.
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ $(MAKE) -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+# Check every external link and point the user at the report.  echo already
+# joins its two arguments with a space, so the first string must not end in
+# one, or the message is printed with a double space.
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output" \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+# Run the doctest builder and point the user at the results file.  echo
+# already joins its two arguments with a space, so the first string must not
+# end in one, or the message is printed with a double space.
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the" \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/docs/clean.rst b/docs/clean.rst
new file mode 100644
index 0000000..a31dc89
--- /dev/null
+++ b/docs/clean.rst
@@ -0,0 +1,122 @@
+.. _clean-chapter:
+.. highlightlang:: python
+
+==================
+``bleach.clean()``
+==================
+
+``clean()`` is Bleach's HTML sanitization method::
+
+ def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+ styles=ALLOWED_STYLES, strip=False, strip_comments=True):
+ """Clean an HTML fragment and return it."""
+
+Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing
+algorithm and sanitize any disallowed tags or attributes. This algorithm also
+takes care of things like unclosed and (some) misnested tags.
+
+.. note::
+ You may pass in a ``string`` or a ``unicode`` object, but Bleach will
+ always return ``unicode``.
+
+
+Tag Whitelist
+=============
+
+The ``tags`` kwarg is a whitelist of allowed HTML tags. It should be a list,
+tuple, or other iterable. Any other HTML tags will be escaped or stripped from
+the text. Its default value is a relatively conservative list found in
+``bleach.ALLOWED_TAGS``.
+
+
+Attribute Whitelist
+===================
+
+The ``attributes`` kwarg is a whitelist of attributes. It can be a list, in
+which case the attributes are allowed for any tag, or a dictionary, in which
+case the keys are tag names (or a wildcard: ``*`` for all tags) and the values
+are lists of allowed attributes.
+
+For example::
+
+ attrs = {
+ '*': ['class'],
+ 'a': ['href', 'rel'],
+ 'img': ['src', 'alt'],
+ }
+
+In this case, ``class`` is allowed on any allowed element (from the ``tags``
+argument), ``<a>`` tags are allowed to have ``href`` and ``rel`` attributes,
+and so on.
+
+The default value is also a conservative dict found in
+``bleach.ALLOWED_ATTRIBUTES``.
+
+
+Callable Filters
+----------------
+
+You can also use a callable (instead of a list) in the ``attributes`` kwarg. If
+the callable returns ``True``, the attribute is allowed. Otherwise, it is
+stripped. For example::
+
+ from urlparse import urlparse
+
+ def filter_src(name, value):
+ if name in ('alt', 'height', 'width'):
+ return True
+ if name == 'src':
+ p = urlparse(value)
+ return (not p.netloc) or p.netloc == 'mydomain.com'
+ return False
+
+ attrs = {
+ 'img': filter_src,
+ }
+
+
+Styles Whitelist
+================
+
+If you allow the ``style`` attribute, you will also need to whitelist styles
+users are allowed to set, for example ``color`` and ``background-color``.
+
+The default value is an empty list, i.e., the ``style`` attribute will be
+allowed but no values will be.
+
+For example, to allow users to set the color and font-weight of text::
+
+ attrs = {
+ '*': ['style']
+ }
+ tags = ['p', 'em', 'strong']
+ styles = ['color', 'font-weight']
+ cleaned_text = bleach.clean(text, tags, attrs, styles)
+
+
+Stripping Markup
+================
+
+By default, Bleach *escapes* disallowed or invalid markup. For example::
+
+ >>> bleach.clean('<span>is not allowed</span>')
+ u'&lt;span&gt;is not allowed&lt;/span&gt;'
+
+If you would rather Bleach stripped this markup entirely, you can pass
+``strip=True``::
+
+ >>> bleach.clean('<span>is not allowed</span>', strip=True)
+ u'is not allowed'
+
+
+Stripping Comments
+==================
+
+By default, Bleach will strip out HTML comments. To disable this behavior, set
+``strip_comments=False``::
+
+ >>> html = 'my<!-- commented --> html'
+
+ >>> bleach.clean(html)
+ u'my html'
+
+ >>> bleach.clean(html, strip_comments=False)
+ u'my<!-- commented --> html'
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..a63aedf
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+#
+# Bleach documentation build configuration file, created by
+# sphinx-quickstart on Fri May 11 21:11:39 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Bleach'
+copyright = u'2012, James Socol'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.2'
+# The full version, including alpha/beta/rc tags.
+release = '1.2.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Bleachdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'Bleach.tex', u'Bleach Documentation',
+ u'James Socol', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'bleach', u'Bleach Documentation',
+ [u'James Socol'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'Bleach', u'Bleach Documentation',
+ u'James Socol', 'Bleach', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/docs/goals.rst b/docs/goals.rst
new file mode 100644
index 0000000..5477f9c
--- /dev/null
+++ b/docs/goals.rst
@@ -0,0 +1,76 @@
+===============
+Goals of Bleach
+===============
+
+This document lists the goals and non-goals of Bleach. My hope is that by
+focusing on these goals and explicitly listing the non-goals, the project will
+evolve in a stronger direction.
+
+
+Goals
+=====
+
+
+Whitelisting
+------------
+
+Bleach should always take a whitelist-based approach to allowing any kind of
+content or markup. Blacklisting is error-prone and not future proof.
+
+For example, you should have to opt-in to allowing the ``onclick`` attribute,
+not blacklist all the other ``on*`` attributes. Future versions of HTML may add
+new event handlers, like ``ontouch``, that old blacklists would not prevent.
+
+
+Sanitizing Input
+----------------
+
+The primary goal of Bleach is to sanitize user input that is allowed to contain
+*some* HTML as markup and is to be included in the content of a larger page.
+Examples might include:
+
+* User comments on a blog.
+
+* "Bio" sections of a user profile.
+
+* Descriptions of a product or application.
+
+These examples, and others, are traditionally prone to security issues like XSS
+or other script injection, or annoying issues like unclosed tags and invalid
+markup. Bleach will take a proactive, whitelist-only approach to allowing HTML
+content, and will use the HTML5 parsing algorithm to handle invalid markup.
+
+See the :ref:`chapter on clean() <clean-chapter>` for more info.
+
+
+Safely Creating Links
+---------------------
+
+The secondary goal of Bleach is to provide a mechanism for finding or altering
+links (``<a>`` tags with ``href`` attributes, or things that look like URLs or
+email addresses) in text.
+
+While Bleach itself will always operate on a whitelist-based security model,
+the :ref:`linkify() method <linkify-chapter>` is flexible enough to allow the
+creation, alteration, and removal of links based on an extremely wide range of
+use cases.
+
+
+Non-Goals
+=========
+
+Bleach is designed to work with fragments of HTML by untrusted users. Some
+non-goal use cases include:
+
+* **Sanitizing complete HTML documents.** Once you're creating whole documents,
+ you have to allow so many tags that a blacklist approach (e.g. forbidding
+ ``<script>`` or ``<object>``) may be more appropriate.
+
+* **Cleaning up after trusted users.** Bleach is powerful but it is not fast.
+ If you trust your users, trust them and don't rely on Bleach to clean up
+ their mess.
+
+* **Allowing arbitrary styling.** There are a number of interesting CSS
+ properties that can do dangerous things, like Opera's ``-o-link``. Painful as
+ it is, if you want your users to be able to change nearly anything in a
+ ``style`` attribute, you should have to opt into this.
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..0929e53
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,69 @@
+.. Bleach documentation master file, created by
+ sphinx-quickstart on Fri May 11 21:11:39 2012.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to Bleach's documentation!
+==================================
+
+Bleach is a whitelist-based HTML sanitization and text linkification library.
+It is designed to take untrusted user input with *some* HTML.
+
+Because Bleach uses html5lib_ to parse document fragments the same way browsers
+do, it is extremely resilient to unknown attacks, much more so than
+regular-expression-based sanitizers.
+
+Bleach's ``linkify`` function is highly configurable and can be used to find,
+edit, and filter links most other auto-linkers can't.
+
+The version of Bleach on GitHub_ is always the most up-to-date and the
+``master`` branch should always work.
+
+.. warning::
+
+ Bleach is currently incompatible with html5lib 1.0b and any versions below
+ 0.9.5.
+
+
+Installing Bleach
+=================
+
+Bleach is available on PyPI_, so you can install it with ``pip``::
+
+ $ pip install bleach
+
+Or with ``easy_install``::
+
+ $ easy_install bleach
+
+Or by cloning the repo from GitHub_::
+
+ $ git clone git://github.com/jsocol/bleach.git
+
+Then install it by running::
+
+ $ python setup.py install
+
+
+Contents:
+=========
+
+.. toctree::
+ :maxdepth: 2
+
+ clean
+ linkify
+ goals
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
+.. _html5lib: http://code.google.com/p/html5lib/
+.. _GitHub: https://github.com/jsocol/bleach
+.. _PyPI: http://pypi.python.org/pypi/bleach
diff --git a/docs/linkify.rst b/docs/linkify.rst
new file mode 100644
index 0000000..42de69c
--- /dev/null
+++ b/docs/linkify.rst
@@ -0,0 +1,212 @@
+.. _linkify-chapter:
+.. highlightlang:: python
+
+====================
+``bleach.linkify()``
+====================
+
+``linkify()`` searches text for links, URLs, and email addresses and lets you
+control how and when those links are rendered::
+
+ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
+ parse_email=False, tokenizer=HTMLSanitizer):
+ """Convert URL-like strings in an HTML fragment to links."""
+
+``linkify()`` works by building a document tree, so it's guaranteed never to do
+weird things to URLs in attribute values, can modify the value of attributes on
+``<a>`` tags, and can even do things like skip ``<pre>`` sections.
+
+By default, ``linkify()`` will perform some sanitization, only allowing a set
+of "safe" tags. Because it uses the HTML5 parsing algorithm, it will always
+handle things like unclosed tags.
+
+.. note::
+ You may pass a ``string`` or ``unicode`` object, but Bleach will always
+ return ``unicode``.
+
+
+Callbacks
+=========
+
+The second argument to ``linkify()`` is a list or other iterable of callback
+functions. These callbacks can modify links that exist and links that are being
+created, or remove them completely.
+
+Each callback will get the following arguments::
+
+ def my_callback(attrs, new=False):
+
+The ``attrs`` argument is a dict of attributes of the ``<a>`` tag. The ``new``
+argument is a boolean indicating if the link is new (e.g. an email address or
+URL found in the text) or already existed (e.g. an ``<a>`` tag found in the
+text). The ``attrs`` dict also contains a ``_text`` key, which is the innerText
+of the ``<a>`` tag.
+
+The callback must return a dict of attributes (including ``_text``) or
+``None``. The new dict of attributes will be passed to the next callback in the
+list. If any callback returns ``None``, a new link will not be created and the
+original text will be left in place; an existing link will be removed and its
+innerText left in place.
+
+The default value is simply to add ``rel="nofollow"``. See ``bleach.callbacks``
+for some included callback functions.
+
+
+Setting Attributes
+------------------
+
+For example, to set ``rel="nofollow"`` on all links found in the text, a simple
+(and included) callback might be::
+
+ def set_nofollow(attrs, new=False):
+ attrs['rel'] = 'nofollow'
+ return attrs
+
+This would overwrite the value of the ``rel`` attribute if it was set.
+
+You could also make external links open in a new tab, or set a class::
+
+ from urlparse import urlparse
+
+ def set_target(attrs, new=False):
+ p = urlparse(attrs['href'])
+ if p.netloc not in ['my-domain.com', 'other-domain.com']:
+ attrs['target'] = '_blank'
+ attrs['class'] = 'external'
+ else:
+ attrs.pop('target', None)
+ return attrs
+
+
+Removing Attributes
+-------------------
+
+You can easily remove attributes you don't want to allow, even on existing
+links (``<a>`` tags) in the text. (See also :ref:`clean() <clean-chapter>` for
+sanitizing attributes.)
+
+::
+
+ def allowed_attributes(attrs, new=False):
+ """Only allow href, target, rel and title."""
+ allowed = ['href', 'target', 'rel', 'title']
+ return dict((k, v) for k, v in attrs.items() if k in allowed)
+
+Or you could remove a specific attribute, if it exists::
+
+ def remove_title1(attrs, new=False):
+ attrs.pop('title', None)
+ return attrs
+
+ def remove_title2(attrs, new=False):
+ if 'title' in attrs:
+ del attrs['title']
+ return attrs
+
+
+Altering Attributes
+-------------------
+
+You can alter and overwrite attributes, including the link text, via the
+``_text`` key, to, for example, pass outgoing links through a warning page, or
+limit the length of text inside an ``<a>`` tag.
+
+::
+
+ def shorten_url(attrs, new=False):
+ """Shorten overly-long URLs in the text."""
+ if not new: # Only looking at newly-created links.
+ return attrs
+ # _text will be the same as the URL for new links.
+ text = attrs['_text']
+ if len(text) > 25:
+ attrs['_text'] = text[0:22] + '...'
+ return attrs
+
+::
+
+ from urllib import quote
+ from urlparse import urlparse
+
+ def outgoing_bouncer(attrs, new=False):
+ """Send outgoing links through a bouncer."""
+ p = urlparse(attrs['href'])
+ if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']:
+ bouncer = 'http://outgoing.my-domain.com/?destination=%s'
+ attrs['href'] = bouncer % quote(attrs['href'])
+ return attrs
+
+
+Preventing Links
+----------------
+
+A slightly more complex example is inspired by Crate_, where strings like
+``models.py`` are often found, and linkified. ``.py`` is the ccTLD for
+Paraguay, so ``example.py`` may be a legitimate URL, but in the case of a site
+dedicated to Python packages, odds are it is not. In this case, Crate_ could
+write the following callback::
+
+ def dont_linkify_python(attrs, new=False):
+ if not new: # This is an existing <a> tag, leave it be.
+ return attrs
+
+ # If the TLD is '.py', make sure it starts with http: or https:
+ href = attrs['href']
+ if href.endswith('.py') and not href.startswith(('http:', 'https:')):
+ # This looks like a Python file, not a URL. Don't make a link.
+ return None
+
+ # Everything checks out, keep going to the next callback.
+ return attrs
+
+
+Removing Links
+--------------
+
+If you want to remove certain links, even if they are written in the text with
+``<a>`` tags, you can still return ``None``::
+
+ def remove_mailto(attrs, new=False):
+ """Remove any mailto: links."""
+ if attrs['href'].startswith('mailto:'):
+ return None
+ return attrs
+
+
+``skip_pre``
+============
+
+``<pre>`` tags are often special, literal sections. If you don't want to create
+any new links within a ``<pre>`` section, pass ``skip_pre=True``.
+
+.. note::
+ Though new links will not be created, existing links created with ``<a>``
+ tags will still be passed through all the callbacks.
+
+
+``parse_email``
+===============
+
+By default, ``linkify()`` does not create ``mailto:`` links for email
+addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links
+will go through exactly the same set of callbacks as all other links, whether
+they are newly created or already in the text, so be careful when writing
+callbacks that may need to behave differently if the protocol is ``mailto:``.
+
+
+``tokenizer``
+=============
+
+``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by
+default. This has the effect of scrubbing some tags and attributes. To use a
+more lenient, or totally different, tokenizer, you can specify the tokenizer
+class here. (See the implementation of :ref:`clean() <clean-chapter>` for an
+example of building a custom tokenizer.)
+
+::
+
+ from html5lib.tokenizer import HTMLTokenizer
+ linked_text = linkify(text, tokenizer=HTMLTokenizer)
+
+
+.. _Crate: https://crate.io/
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..84c919b
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,190 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+REM Build the Qt help project, then print the qcollectiongenerator and
+REM assistant commands to run next. The collection file passed to assistant
+REM is Bleach.qhc, matching the name printed by the Makefile qthelp target.
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Bleach.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Bleach.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end