Diffstat (limited to 'bleach/sanitizer.py')
 bleach/sanitizer.py | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+), 0 deletions(-)
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
new file mode 100644
index 0000000..677287e
--- /dev/null
+++ b/bleach/sanitizer.py
@@ -0,0 +1,143 @@
+import re
+from xml.sax.saxutils import escape, unescape
+
+from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
+
+
+class BleachSanitizerMixin(HTMLSanitizerMixin):
+ """Mixin to replace sanitize_token() and sanitize_css()."""
+
+ allowed_svg_properties = []
+ # TODO: When the next html5lib version comes out, nuke this.
+ attr_val_is_uri = HTMLSanitizerMixin.attr_val_is_uri + ['poster']
+
+ def sanitize_token(self, token):
+ """Sanitize a token either by HTML-encoding or dropping.
+
+        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can
+        be a dict mapping tag names to lists of allowed attribute names,
+        or to callables.
+
+        A callable is invoked with two arguments, the attribute name and
+        its value, and should return True to keep the attribute or False
+        to drop it.
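+
+        For example (illustrative only; ``allow_https_src`` is a
+        hypothetical filter, not part of bleach)::
+
+            def allow_https_src(name, value):
+                return name == 'src' and value.startswith('https:')
+
+            allowed_attributes = {'img': allow_https_src, 'a': ['href']}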
+
+ Also gives the option to strip tags instead of encoding.
+
+ """
+ if (getattr(self, 'wildcard_attributes', None) is None and
+ isinstance(self.allowed_attributes, dict)):
+ self.wildcard_attributes = self.allowed_attributes.get('*', [])
+
+ if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
+ tokenTypes['EmptyTag']):
+ if token['name'] in self.allowed_elements:
+ if 'data' in token:
+ if isinstance(self.allowed_attributes, dict):
+ allowed_attributes = self.allowed_attributes.get(
+ token['name'], [])
+ if not callable(allowed_attributes):
+ allowed_attributes += self.wildcard_attributes
+ else:
+ allowed_attributes = self.allowed_attributes
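+                    # Keep only allowed attributes. Iterating in reverse
+                    # means that, for duplicate attribute names, the first
+                    # occurrence in the source wins when building the dict.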
+ attrs = dict([(name, val) for name, val in
+ token['data'][::-1]
+ if (allowed_attributes(name, val)
+ if callable(allowed_attributes)
+ else name in allowed_attributes)])
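+                    # Drop URI-valued attributes whose scheme is not in
+                    # allowed_protocols.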
+ for attr in self.attr_val_is_uri:
+                        if attr not in attrs:
+ continue
+                        val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+",
+                                               '',
+                                               unescape(attrs[attr])).lower()
+ # Remove replacement characters from unescaped
+ # characters.
+ val_unescaped = val_unescaped.replace(u"\ufffd", "")
+ if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
+ and (val_unescaped.split(':')[0] not in
+ self.allowed_protocols)):
+ del attrs[attr]
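+                    # Neutralize url(...) references in SVG attributes that
+                    # may contain them.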
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
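+                    # Drop non-local (non-fragment) xlink:href values on
+                    # SVG elements that only allow local references.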
+ if (token['name'] in self.svg_allow_local_href and
+ 'xlink:href' in attrs and
+ re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+ del attrs['xlink:href']
+ if 'style' in attrs:
+ attrs['style'] = self.sanitize_css(attrs['style'])
+ token['data'] = [(name, val) for name, val in
+ attrs.items()]
+ return token
+ elif self.strip_disallowed_elements:
+ pass
+ else:
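+                # Disallowed element, not stripping: re-serialize the tag
+                # and emit it as escaped character data.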
+ if token['type'] == tokenTypes['EndTag']:
+ token['data'] = '</%s>' % token['name']
+ elif token['data']:
+ attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in
+ token['data']])
+ token['data'] = '<%s%s>' % (token['name'], attrs)
+ else:
+ token['data'] = '<%s>' % token['name']
+ if token['selfClosing']:
+ token['data'] = token['data'][:-1] + '/>'
+ token['type'] = tokenTypes['Characters']
+ del token["name"]
+ return token
+ elif token['type'] == tokenTypes['Comment']:
+ if not self.strip_html_comments:
+ return token
+ else:
+ return token
+
+ def sanitize_css(self, style):
+ """HTMLSanitizerMixin.sanitize_css replacement.
+
+        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+        border-*, margin-*, and padding-* properties. This version allows
+        only properties explicitly listed in allowed_css_properties or
+        allowed_svg_properties.
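+
+        For example (illustrative; assumes 'color' is in
+        allowed_css_properties and 'position' is not)::
+
+            >>> self.sanitize_css('color: red; position: fixed')
+            'color: red;'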
+
+ """
+ # disallow urls
+        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+ # gauntlet
+ # TODO: Make sure this does what it's meant to - I *think* it wants to
+ # validate style attribute contents.
+ parts = style.split(';')
+        gauntlet = re.compile(r"""^([-/:,#%.'\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*"""
+                              r"""|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+ for part in parts:
+ if not gauntlet.match(part):
+ return ''
+
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+ return ''
+
+ clean = []
+        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
+ if not value:
+ continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
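+    """HTMLTokenizer that sanitizes each token it emits."""
+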
+ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+ lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+ lowercaseElementName, lowercaseAttrName,
+ **kwargs)
+
+ def __iter__(self):
+ for token in HTMLTokenizer.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
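+
+
+# Illustrative usage sketch (the subclass and whitelists below are
+# hypothetical; in bleach this class is normally configured by
+# bleach.clean() and handed to html5lib.HTMLParser as its tokenizer):
+#
+#     class MySanitizer(BleachSanitizer):
+#         allowed_elements = ['a', 'abbr']
+#         allowed_attributes = {'a': ['href'], '*': ['title']}
+#         strip_disallowed_elements = True
+#         strip_html_comments = True
+#
+#     parser = html5lib.HTMLParser(tokenizer=MySanitizer)
+#     fragment = parser.parseFragment(u'<a href="/">hi</a><script>x</script>')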