diff options
Diffstat (limited to 'bleach/__init__.py')
-rw-r--r-- | bleach/__init__.py | 227 |
1 files changed, 144 insertions, 83 deletions
diff --git a/bleach/__init__.py b/bleach/__init__.py index af75d0f..b110972 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,6 +1,8 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals import logging import re -import sys import html5lib from html5lib.sanitizer import HTMLSanitizer @@ -11,8 +13,8 @@ from .encoding import force_unicode from .sanitizer import BleachSanitizer -VERSION = (1, 2, 1) -__version__ = '1.2.1' +VERSION = (1, 4, 0) +__version__ = '1.4' __all__ = ['clean', 'linkify'] @@ -61,12 +63,12 @@ TLDS.reverse() url_re = re.compile( r"""\(* # Match any opening parentheses. - \b(?<![@.])(?:(?:%s):/{0,3}(?:(?:\w+:)?\w+@)?)? # http:// - ([\w-]+\.)+(?:%s)(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)? - (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)? + \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http:// + ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)? + (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """ % (u'|'.join(PROTOCOLS), u'|'.join(TLDS)), + """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), re.IGNORECASE | re.VERBOSE | re.UNICODE) proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) @@ -75,8 +77,8 @@ punct_re = re.compile(r'([\.,]+)$') email_re = re.compile( r"""(?<!//) - (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+ - (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)* # dot-atom + (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+ + (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom |^"([\001-\010\013\014\016-\037!#-\[\]-\177] |\\[\001-011\013\014\016-\177])*" # quoted-string )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain @@ -85,17 +87,18 @@ email_re = re.compile( NODE_TEXT = 4 # The numeric ID of a text node in simpletree. -DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] +ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x]) +# a simple routine that returns the tag name with the namespace prefix +# as returned by etree's Element.tag attribute -PY_26 = (sys.version_info < (2, 7)) -RECURSION_EXCEPTION = RuntimeError if not PY_26 else AttributeError +DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, strip=False, strip_comments=True): """Clean an HTML fragment and return it""" if not text: - return u'' + return '' text = force_unicode(text) @@ -123,22 +126,38 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, text = force_unicode(text) if not text: - return u'' + return '' parser = html5lib.HTMLParser(tokenizer=tokenizer) forest = parser.parseFragment(text) + _seen = set([]) - def replace_nodes(tree, new_frag, node): + def replace_nodes(tree, new_frag, node, index=0): + """ + Doesn't really replace nodes, but inserts the nodes contained in + new_frag into the treee at position index and returns the number + of nodes inserted. + If node is passed in, it is removed from the tree + """ + count = 0 new_tree = parser.parseFragment(new_frag) - for n in new_tree.childNodes: - # Prevent us from re-parsing links new links as existing links. - if n.name == 'a': - n._seen = True - tree.insertBefore(n, node) - tree.removeChild(node) - # Return the number of new nodes. - return len(new_tree.childNodes) - 1 + # capture any non-tag text at the start of the fragment + if new_tree.text: + if index == 0: + tree.text += new_tree.text + else: + tree[index-1].tail += new_tree.text + # the put in the tagged elements into the old tree + for n in new_tree: + if n.tag == ETREE_TAG('a'): + _seen.add(n) + tree.insert(index+count, n) + count += 1 + # if we got a node to remove... + if node is not None: + tree.remove(node) + return count def strip_wrapping_parentheses(fragment): """Strips wrapping parentheses. @@ -189,58 +208,102 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, return None return attrs + def _render_inner(node): + out = ['' if node.text is None else node.text] + for subnode in node: + out.append(_render(subnode)) + if subnode.tail: + out.append(subnode.tail) + return ''.join(out) + def linkify_nodes(tree, parse_text=True): - # I know this isn't Pythonic, but we're sometimes mutating - # tree.childNodes, which ends up breaking the loop and causing us to - # reparse code. - children = len(tree.childNodes) - current = 0 # A pointer to the "current" node. - while current < children: - node = tree.childNodes[current] - if node.type == NODE_TEXT and parse_text: - new_frag = _render(node) - # Look for email addresses? - if parse_email: - new_frag = re.sub(email_re, email_repl, new_frag) - if new_frag != _render(node): - adj = replace_nodes(tree, new_frag, node) + children = len(tree) + current_child = -1 + # start at -1 to process the parent first + while current_child < len(tree): + if current_child < 0: + node = tree + if parse_text and node.text: + new_txt = old_txt = node.text + if parse_email: + new_txt = re.sub(email_re, email_repl, node.text) + if new_txt and new_txt != node.text: + node.text = '' + adj = replace_nodes(tree, new_txt, None, 0) + children += adj + current_child += adj + linkify_nodes(tree, True) + continue + + new_txt = re.sub(url_re, link_repl, new_txt) + if new_txt != old_txt: + node.text = '' + adj = replace_nodes(tree, new_txt, None, 0) children += adj - current += adj - linkify_nodes(tree) + current_child += adj continue - new_frag = re.sub(url_re, link_repl, new_frag) - if new_frag != _render(node): - adj = replace_nodes(tree, new_frag, node) + else: + node = tree[current_child] + + if parse_text and node.tail: + new_tail = old_tail = node.tail + if parse_email: + new_tail = re.sub(email_re, email_repl, new_tail) + if new_tail != node.tail: + node.tail = '' + adj = replace_nodes(tree, new_tail, None, + current_child+1) + #insert the new nodes made from my tail into + # the tree right after me. current_child+1 + children += adj + + new_tail = re.sub(url_re, link_repl, new_tail) + if new_tail != old_tail: + node.tail = '' + adj = replace_nodes(tree, new_tail, None, current_child+1) children += adj - current += adj - elif node.name == 'a' and not getattr(node, '_seen', False): - if 'href' in node.attributes: - attrs = node.attributes - _text = attrs['_text'] = ''.join(c.toxml() for - c in node.childNodes) + + if node.tag == ETREE_TAG('a') and not (node in _seen): + if not node.get('href', None) is None: + attrs = dict(node.items()) + + _text = attrs['_text'] = _render_inner(node) + attrs = apply_callbacks(attrs, False) - if attrs is not None: + + if attrs is None: + # <a> tag replaced by the text within it + adj = replace_nodes(tree, _text, node, + current_child) + current_child -= 1 + # pull back current_child by 1 to scan the + # new nodes again. + else: text = force_unicode(attrs.pop('_text')) - node.attributes = attrs - for n in reversed(node.childNodes): - node.removeChild(n) + for attr_key, attr_val in attrs.items(): + node.set(attr_key, attr_val) + + for n in reversed(list(node)): + node.remove(n) text = parser.parseFragment(text) - for n in text.childNodes: - node.appendChild(n) - node._seen = True - else: - replace_nodes(tree, _text, node) - elif skip_pre and node.name == 'pre': - linkify_nodes(node, False) - elif not getattr(node, '_seen', False): - linkify_nodes(node) - current += 1 + node.text = text.text + for n in text: + node.append(n) + _seen.add(node) + + elif current_child >= 0: + if node.tag == ETREE_TAG('pre') and skip_pre: + linkify_nodes(node, False) + elif not (node in _seen): + linkify_nodes(node, True) + + current_child += 1 def email_repl(match): addr = match.group(0).replace('"', '"') link = { '_text': addr, - 'href': 'mailto:%s' % addr, + 'href': 'mailto:{0!s}'.format(addr), } link = apply_callbacks(link, True) @@ -250,18 +313,18 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _href = link.pop('href') _text = link.pop('_text') - repl = '<a href="%s" %s>%s</a>' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) - return repl % (_href, attribs, _text) + repl = '<a href="{0!s}" {1!s}>{2!s}</a>' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) + return repl.format(_href, attribs, _text) def link_repl(match): url = match.group(0) open_brackets = close_brackets = 0 if url.startswith('('): - url, open_brackets, close_brackets = ( - strip_wrapping_parentheses(url) - ) - end = u'' + _wrapping = strip_wrapping_parentheses(url) + url, open_brackets, close_brackets = _wrapping + end = '' m = re.search(punct_re, url) if m: end = m.group(0) @@ -269,7 +332,7 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, if re.search(proto_re, url): href = url else: - href = u''.join([u'http://', url]) + href = ''.join(['http://', url]) link = { '_text': url, @@ -284,32 +347,30 @@ def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False, _text = link.pop('_text') _href = link.pop('href') - repl = u'%s<a href="%s" %s>%s</a>%s%s' - attribs = ' '.join('%s="%s"' % (k, v) for k, v in link.items()) + repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}' + attr = '{0!s}="{1!s}"' + attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl % ('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) + return repl.format('(' * open_brackets, + _href, attribs, _text, end, + ')' * close_brackets) try: linkify_nodes(forest) - except (RECURSION_EXCEPTION), e: + except RuntimeError as e: # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: %r' % e) + log.exception('Probable recursion error: {0!r}'.format(e)) return _render(forest) def _render(tree): """Try rendering as HTML, then XML, then give up.""" - try: - return force_unicode(_serialize(tree)) - except AssertionError: # The treewalker throws this sometimes. - return force_unicode(tree.toxml()) + return force_unicode(_serialize(tree)) def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('simpletree') + walker = html5lib.treewalkers.getTreeWalker('etree') stream = walker(domtree) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) |