diff options
Diffstat (limited to 'docs')
-rw-r--r-- | docs/Makefile | 153 | ||||
-rw-r--r-- | docs/clean.rst | 122 | ||||
-rw-r--r-- | docs/conf.py | 242 | ||||
-rw-r--r-- | docs/goals.rst | 76 | ||||
-rw-r--r-- | docs/index.rst | 69 | ||||
-rw-r--r-- | docs/linkify.rst | 212 | ||||
-rw-r--r-- | docs/make.bat | 190 |
7 files changed, 1064 insertions, 0 deletions
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..81ad9f9 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + 
@echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Bleach.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Bleach.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Bleach" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Bleach" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/clean.rst b/docs/clean.rst new file mode 100644 index 0000000..a31dc89 --- /dev/null +++ b/docs/clean.rst @@ -0,0 +1,122 @@ +.. _clean-chapter: +.. 
highlightlang:: python + +================== +``bleach.clean()`` +================== + +``clean()`` is Bleach's HTML sanitization method:: + + def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + styles=ALLOWED_STYLES, strip=False, strip_comments=True): + """Clean an HTML fragment and return it.""" + +Given a fragment of HTML, Bleach will parse it according to the HTML5 parsing +algorithm and sanitize any disallowed tags or attributes. This algorithm also +takes care of things like unclosed and (some) misnested tags. + +.. note:: + You may pass in a ``string`` or a ``unicode`` object, but Bleach will + always return ``unicode``. + + +Tag Whitelist +============= + +The ``tags`` kwarg is a whitelist of allowed HTML tags. It should be a list, +tuple, or other iterable. Any other HTML tags will be escaped or stripped from +the text. Its default value is a relatively conservative list found in +``bleach.ALLOWED_TAGS``. + + +Attribute Whitelist +=================== + +The ``attributes`` kwarg is a whitelist of attributes. It can be a list, in +which case the attributes are allowed for any tag, or a dictionary, in which +case the keys are tag names (or a wildcard: ``*`` for all tags) and the values +are lists of allowed attributes. + +For example:: + + attrs = { + '*': ['class'], + 'a': ['href', 'rel'], + 'img': ['src', 'alt'], + } + +In this case, ``class`` is allowed on any allowed element (from the ``tags`` +argument), ``<a>`` tags are allowed to have ``href`` and ``rel`` attributes, +and so on. + +The default value is also a conservative dict found in +``bleach.ALLOWED_ATTRIBUTES``. + + +Callable Filters +---------------- + +You can also use a callable (instead of a list) in the ``attributes`` kwarg. If +the callable returns ``True``, the attribute is allowed. Otherwise, it is +stripped. 
For example:: + + def filter_src(name, value): + if name in ('alt', 'height', 'width'): + return True + if name == 'src': + p = urlparse(value) + return (not p.netloc) or p.netloc == 'mydomain.com' + return False + + attrs = { + 'img': filter_src, + } + + +Styles Whitelist +================ + +If you allow the ``style`` attribute, you will also need to whitelist styles +users are allowed to set, for example ``color`` and ``background-color``. + +The default value is an empty list, i.e., the ``style`` attribute will be +allowed but no values will be. + +For example, to allow users to set the color and font-weight of text:: + + attrs = { + '*': ['style'] + } + tags = ['p', 'em', 'strong'] + styles = ['color', 'font-weight'] + cleaned_text = bleach.clean(text, tags, attrs, styles) + + +Stripping Markup +================ + +By default, Bleach *escapes* disallowed or invalid markup. For example:: + + >>> bleach.clean('<span>is not allowed</span>') + u'&lt;span&gt;is not allowed&lt;/span&gt;' + +If you would rather Bleach stripped this markup entirely, you can pass +``strip=True``:: + + >>> bleach.clean('<span>is not allowed</span>', strip=True) + u'is not allowed' + + +Stripping Comments +================== + +By default, Bleach will strip out HTML comments. To disable this behavior, set +``strip_comments=False``:: + + >>> html = 'my<!-- commented --> html' + + >>> bleach.clean(html) + u'my html' + + >>> bleach.clean(html, strip_comments=False) + u'my<!-- commented --> html' diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a63aedf --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# Bleach documentation build configuration file, created by +# sphinx-quickstart on Fri May 11 21:11:39 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Bleach' +copyright = u'2012, James Socol' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.2' +# The full version, including alpha/beta/rc tags. +release = '1.2.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Bleachdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'Bleach.tex', u'Bleach Documentation', + u'James Socol', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'bleach', u'Bleach Documentation', + [u'James Socol'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Bleach', u'Bleach Documentation', + u'James Socol', 'Bleach', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. 
+#texinfo_show_urls = 'footnote' diff --git a/docs/goals.rst b/docs/goals.rst new file mode 100644 index 0000000..5477f9c --- /dev/null +++ b/docs/goals.rst @@ -0,0 +1,76 @@ +=============== +Goals of Bleach +=============== + +This document lists the goals and non-goals of Bleach. My hope is that by +focusing on these goals and explicitly listing the non-goals, the project will +evolve in a stronger direction. + + +Goals +===== + + +Whitelisting +------------ + +Bleach should always take a whitelist-based approach to allowing any kind of +content or markup. Blacklisting is error-prone and not future proof. + +For example, you should have to opt-in to allowing the ``onclick`` attribute, +not blacklist all the other ``on*`` attributes. Future versions of HTML may add +new event handlers, like ``ontouch``, that old blacklists would not prevent. + + +Sanitizing Input +---------------- + +The primary goal of Bleach is to sanitize user input that is allowed to contain +*some* HTML as markup and is to be included in the content of a larger page. +Examples might include: + +* User comments on a blog. + +* "Bio" sections of a user profile. + +* Descriptions of a product or application. + +These examples, and others, are traditionally prone to security issues like XSS +or other script injection, or annoying issues like unclosed tags and invalid +markup. Bleach will take a proactive, whitelist-only approach to allowing HTML +content, and will use the HTML5 parsing algorithm to handle invalid markup. + +See the :ref:`chapter on clean() <clean-chapter>` for more info. + + +Safely Creating Links +--------------------- + +The secondary goal of Bleach is to provide a mechanism for finding or altering +links (``<a>`` tags with ``href`` attributes, or things that look like URLs or +email addresses) in text. 
+ +While Bleach itself will always operate on a whitelist-based security model, +the :ref:`linkify() method <linkify-chapter>` is flexible enough to allow the +creation, alteration, and removal of links based on an extremely wide range of +use cases. + + +Non-Goals +========= + +Bleach is designed to work with fragments of HTML by untrusted users. Some +non-goal use cases include: + +* **Sanitizing complete HTML documents.** Once you're creating whole documents, + you have to allow so many tags that a blacklist approach (e.g. forbidding + ``<script>`` or ``<object>``) may be more appropriate. + +* **Cleaning up after trusted users.** Bleach is powerful but it is not fast. + If you trust your users, trust them and don't rely on Bleach to clean up + their mess. + +* **Allowing arbitrary styling.** There are a number of interesting CSS + properties that can do dangerous things, like Opera's ``-o-link``. Painful as + it is, if you want your users to be able to change nearly anything in a + ``style`` attribute, you should have to opt into this. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..0929e53 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,69 @@ +.. Bleach documentation master file, created by + sphinx-quickstart on Fri May 11 21:11:39 2012. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Bleach's documentation! +================================== + +Bleach is a whitelist-based HTML sanitization and text linkification library. +It is designed to take untrusted user input with *some* HTML. + +Because Bleach uses html5lib_ to parse document fragments the same way browsers +do, it is extremely resilient to unknown attacks, much more so than +regular-expression-based sanitizers. + +Bleach's ``linkify`` function is highly configurable and can be used to find, +edit, and filter links most other auto-linkers can't. 
+ +The version of bleach on GitHub_ is always the most up-to-date and the +``master`` branch should always work. + +.. warning:: + + Bleach is currently incompatible with html5lib 1.0b and any versions below + 0.9.5. + + +Installing Bleach +================= + +Bleach is available on PyPI_, so you can install it with ``pip``:: + + $ pip install bleach + +Or with ``easy_install``:: + + $ easy_install bleach + +Or by cloning the repo from GitHub_:: + + $ git clone git://github.com/jsocol/bleach.git + +Then install it by running:: + + $ python setup.py install + + +Contents: +========= + +.. toctree:: + :maxdepth: 2 + + clean + linkify + goals + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +.. _html5lib: http://code.google.com/p/html5lib/ +.. _GitHub: https://github.com/jsocol/bleach +.. _PyPI: http://pypi.python.org/pypi/bleach diff --git a/docs/linkify.rst b/docs/linkify.rst new file mode 100644 index 0000000..42de69c --- /dev/null +++ b/docs/linkify.rst @@ -0,0 +1,212 @@ +.. _linkify-chapter: +.. highlightlang:: python + +==================== +``bleach.linkify()`` +==================== + +``linkify()`` searches text for links, URLs, and email addresses and lets you +control how and when those links are rendered:: + + def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, + parse_email=False, tokenizer=HTMLSanitizer): + """Convert URL-like strings in an HTML fragment to links.""" + +``linkify()`` works by building a document tree, so it's guaranteed never to do +weird things to URLs in attribute values, can modify the value of attributes on +``<a>`` tags, and can even do things like skip ``<pre>`` sections. + +By default, ``linkify()`` will perform some sanitization, only allowing a set +of "safe" tags. Because it uses the HTML5 parsing algorithm, it will always +handle things like unclosed tags. + +.. note:: + You may pass a ``string`` or ``unicode`` object, but Bleach will always + return ``unicode``. 
+ + +Callbacks +========= + +The second argument to ``linkify()`` is a list or other iterable of callback +functions. These callbacks can modify links that exist and links that are being +created, or remove them completely. + +Each callback will get the following arguments:: + + def my_callback(attrs, new=False): + +The ``attrs`` argument is a dict of attributes of the ``<a>`` tag. The ``new`` +argument is a boolean indicating if the link is new (e.g. an email address or +URL found in the text) or already existed (e.g. an ``<a>`` tag found in the +text). The ``attrs`` dict also contains a ``_text`` key, which is the innerText +of the ``<a>`` tag. + +The callback must return a dict of attributes (including ``_text``) or +``None``. The new dict of attributes will be passed to the next callback in the +list. If any callback returns ``None``, the link will not be created and the +original text left in place, or will be removed, and its original innerText +left in place. + +The default value is simply to add ``rel="nofollow"``. See ``bleach.callbacks`` +for some included callback functions. + + +Setting Attributes +------------------ + +For example, to set ``rel="nofollow"`` on all links found in the text, a simple +(and included) callback might be:: + + def set_nofollow(attrs, new=False): + attrs['rel'] = 'nofollow' + return attrs + +This would overwrite the value of the ``rel`` attribute if it was set. + +You could also make external links open in a new tab, or set a class:: + + from urlparse import urlparse + + def set_target(attrs, new=False): + p = urlparse(attrs['href']) + if p.netloc not in ['my-domain.com', 'other-domain.com']: + attrs['target'] = '_blank' + attrs['class'] = 'external' + else: + attrs.pop('target', None) + return attrs + + +Removing Attributes +------------------- + +You can easily remove attributes you don't want to allow, even on existing +links (``<a>`` tags) in the text. (See also :ref:`clean() <clean-chapter>` for +sanitizing attributes.) 
+ +:: + + def allowed_attributes(attrs, new=False): + """Only allow href, target, rel and title.""" + allowed = ['href', 'target', 'rel', 'title'] + return dict((k, v) for k, v in attrs.items() if k in allowed) + +Or you could remove a specific attribute, if it exists:: + + def remove_title1(attrs, new=False): + attrs.pop('title', None) + return attrs + + def remove_title2(attrs, new=False): + if 'title' in attrs: + del attrs['title'] + return attrs + + +Altering Attributes +------------------- + +You can alter and overwrite attributes, including the link text, via the +``_text`` key, to, for example, pass outgoing links through a warning page, or +limit the length of text inside an ``<a>`` tag. + +:: + + def shorten_url(attrs, new=False): + """Shorten overly-long URLs in the text.""" + if not new: # Only looking at newly-created links. + return attrs + # _text will be the same as the URL for new links. + text = attrs['_text'] + if len(text) > 25: + attrs['_text'] = text[0:22] + '...' + return attrs + +:: + + from urllib2 import quote + from urlparse import urlparse + + def outgoing_bouncer(attrs, new=False): + """Send outgoing links through a bouncer.""" + p = urlparse(attrs['href']) + if p.netloc not in ['my-domain.com', 'www.my-domain.com', '']: + bouncer = 'http://outgoing.my-domain.com/?destination=%s' + attrs['href'] = bouncer % quote(attrs['href']) + return attrs + + +Preventing Links +---------------- + +A slightly more complex example is inspired by Crate_, where strings like +``models.py`` are often found, and linkified. ``.py`` is the ccTLD for +Paraguay, so ``example.py`` may be a legitimate URL, but in the case of a site +dedicated to Python packages, odds are it is not. In this case, Crate_ could +write the following callback:: + + def dont_linkify_python(attrs, new=False): + if not new: # This is an existing <a> tag, leave it be. 
+ return attrs + + # If the TLD is '.py', make sure it starts with http: or https: + href = attrs['href'] + if href.endswith('.py') and not href.startswith(('http:', 'https:')): + # This looks like a Python file, not a URL. Don't make a link. + return None + + # Everything checks out, keep going to the next callback. + return attrs + + +Removing Links +-------------- + +If you want to remove certain links, even if they are written in the text with +``<a>`` tags, you can still return ``None``:: + + def remove_mailto(attrs, new=False): + """Remove any mailto: links.""" + if attrs['href'].startswith('mailto:'): + return None + return attrs + + +``skip_pre`` +============ + +``<pre>`` tags are often special, literal sections. If you don't want to create +any new links within a ``<pre>`` section, pass ``skip_pre=True``. + +.. note:: + Though new links will not be created, existing links created with ``<a>`` + tags will still be passed through all the callbacks. + + +``parse_email`` +=============== + +By default, ``linkify()`` does not create ``mailto:`` links for email +addresses, but if you pass ``parse_email=True``, it will. ``mailto:`` links +will go through exactly the same set of callbacks as all other links, whether +they are newly created or already in the text, so be careful when writing +callbacks that may need to behave differently if the protocol is ``mailto:``. + + +``tokenizer`` +============= + +``linkify()`` uses the ``html5lib.sanitizer.HTMLSanitizer`` tokenizer by +default. This has the effect of scrubbing some tags and attributes. To use a +more lenient, or totally different, tokenizer, you can specify the tokenizer +class here. (See the implementation of :ref:`clean() <clean-chapter>` for an +example of building a custom tokenizer.) + +:: + + from html5lib.tokenizer import HTMLTokenizer + linked_text = linkify(text, tokenizer=HTMLTokenizer) + + +.. 
_Crate: https://crate.io/ diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..84c919b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^<target^>` where ^<target^> is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Bleach.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Bleach.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
+ goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end |