diff options
Diffstat (limited to 'bleach/sanitizer.py')
-rw-r--r-- | bleach/sanitizer.py | 143 |
1 file changed, 143 insertions, 0 deletions
import re
from xml.sax.saxutils import escape, unescape

from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer


class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    allowed_svg_properties = []
    # TODO: When the next html5lib version comes out, nuke this.
    attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name
        and value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

        :arg token: an html5lib token dict (has a 'type' key; tag tokens
            also carry 'name' and usually 'data').
        :returns: the (possibly rewritten) token, or None to drop it.

        """
        # Cache the wildcard ('*') attribute list once per instance.
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    # Resolve the attribute whitelist for this tag.
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            # BUG FIX: build a NEW list via concatenation.
                            # The previous `allowed_attributes += ...` was an
                            # in-place extend of the list object stored in
                            # self.allowed_attributes, so the wildcard
                            # attributes accumulated into the per-tag
                            # whitelist on every call.
                            allowed_attributes = (
                                allowed_attributes + self.wildcard_attributes)
                    else:
                        allowed_attributes = self.allowed_attributes
                    # Iterate attributes in reverse so that, on duplicates,
                    # the FIRST occurrence wins in the resulting dict.
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])
                    # Drop URI-valued attributes whose scheme is not in the
                    # protocol whitelist.
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        # Strip control chars/whitespace that browsers
                        # ignore, so e.g. "java\0script:" can't slip by.
                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                               unescape(attrs[attr])).lower()
                        # Remove replacement characters from unescaped
                        # characters.
                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                                and (val_unescaped.split(':')[0] not in
                                     self.allowed_protocols)):
                            del attrs[attr]
                    # Blank out non-local url(...) references in SVG attrs.
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    # Only local fragment (#...) xlink:href values survive.
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                        del attrs['xlink:href']
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                    return token
                # NOTE(review): an allowed tag token without a 'data' key
                # falls through and is dropped (implicit None). html5lib tag
                # tokens normally carry 'data' -- confirm this is intended.
            elif self.strip_disallowed_elements:
                # Strip mode: silently drop the disallowed tag.
                pass
            else:
                # Encode mode: re-serialize the tag as escaped character
                # data so it renders as visible text.
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</%s>' % token['name']
                elif token['data']:
                    attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
                                     token['data']])
                    token['data'] = '<%s%s>' % (token['name'], attrs)
                else:
                    token['data'] = '<%s>' % token['name']
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token["name"]
                return token
        elif token['type'] == tokenTypes['Comment']:
            # Comments are dropped unless explicitly kept.
            if not self.strip_html_comments:
                return token
        else:
            # Characters, SpaceCharacters, etc. pass through untouched.
            return token

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist what's in
        the whitelist.

        :arg style: the raw value of a style attribute.
        :returns: the cleaned declarations joined by spaces, or '' when the
            value fails validation.

        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        # TODO: Make sure this does what it's meant to - I *think* it wants
        # to validate style attribute contents.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
            r"""|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        # Require "prop: value;" shaped declarations overall.
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            # CSS and SVG property whitelists are a single membership test;
            # the two original branches appended identically.
            if (prop.lower() in self.allowed_css_properties or
                    prop.lower() in self.allowed_svg_properties):
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)


class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    """Tokenizer that runs every emitted token through sanitize_token()."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        # Yield only the tokens that survive sanitization (None = dropped).
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token