from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer
# html5lib's default protocol whitelist includes 'feed'; bleach does not
# allow it. Mutates html5lib's list in place (intentional, so every
# consumer of acceptable_protocols sees the change). Guarded so that an
# html5lib version without 'feed' doesn't raise ValueError at import time.
PROTOS = HTMLSanitizerMixin.acceptable_protocols
if 'feed' in PROTOS:
    PROTOS.remove('feed')
class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    # CSS properties accepted when they come from SVG content; empty by
    # default, expected to be overridden by subclasses/instances.
    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name
        and value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

        Returns the (possibly mutated) token, or None to drop it.
        """
        # Cache the wildcard ('*') attribute list once per instance so we
        # don't re-fetch it for every token.
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    # Resolve the per-tag whitelist: either a list of
                    # attribute names or a callable(name, value) predicate.
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            allowed_attributes += self.wildcard_attributes
                    else:
                        allowed_attributes = self.allowed_attributes

                    # Iterate reversed so that, when an attribute name is
                    # duplicated, the first occurrence in the markup wins
                    # (dict() keeps the last pair it sees).
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])

                    # Drop URI-valued attributes whose scheme is not
                    # whitelisted.
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        # Strip backtick, control and non-breaking-space
                        # characters that browsers ignore when parsing the
                        # scheme (raw string: \000-\040 etc. are octal
                        # escapes interpreted by the re engine).
                        val_unescaped = re.sub(
                            r"[`\000-\040\177-\240\s]+", '',
                            unescape(attrs[attr])).lower()
                        # Remove replacement characters from unescaped
                        # characters.
                        val_unescaped = val_unescaped.replace("\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:',
                                     val_unescaped) and
                                (val_unescaped.split(':')[0] not in
                                 self.allowed_protocols)):
                            del attrs[attr]

                    # Blank out non-local url(...) references in SVG
                    # attributes; only same-document (#fragment) refs stay.
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            attrs[attr] = re.sub(
                                r'url\s*\(\s*[^#\s][^)]+?\)', ' ',
                                unescape(attrs[attr]))

                    # Tags that may only reference same-document targets
                    # lose any xlink:href that doesn't start with '#'.
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*',
                                      attrs['xlink:href'])):
                        del attrs['xlink:href']

                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])

                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                return token

            elif self.strip_disallowed_elements:
                # Strip mode: drop the disallowed tag entirely by falling
                # through and returning None.
                pass
            else:
                # Encode mode: re-emit the disallowed tag as escaped
                # character data so it renders as visible text.
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</{0!s}>'.format(token['name'])
                elif token['data']:
                    attr_fmt = ' {0!s}="{1!s}"'
                    # Renamed from 'attrs' to avoid shadowing the dict
                    # used in the allowed-element branch.
                    attr_text = ''.join([attr_fmt.format(k, escape(v))
                                         for k, v in token['data']])
                    token['data'] = '<{0!s}{1!s}>'.format(token['name'],
                                                          attr_text)
                else:
                    token['data'] = '<{0!s}>'.format(token['name'])
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token["name"]
                return token

        elif token['type'] == tokenTypes['Comment']:
            # Comments pass through only when comment stripping is off;
            # otherwise None is returned and the comment is dropped.
            if not self.strip_html_comments:
                return token
        else:
            # Everything else (characters, whitespace, doctype, ...) is
            # passed through untouched.
            return token

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist what's in
        the whitelist.

        Returns the cleaned declaration string, or '' if the value fails
        validation.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet: every ';'-separated chunk must consist solely of
        # benign characters, quoted words, or parenthesized numeric lists;
        # any suspicious chunk rejects the whole style attribute.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
            r"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        # The whole value must look like 'prop: value; prop: value' pairs.
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            prop_lower = prop.lower()
            # Keep only properties on the CSS or SVG whitelist.
            if (prop_lower in self.allowed_css_properties or
                    prop_lower in self.allowed_svg_properties):
                clean.append(prop + ': ' + value + ';')
        return ' '.join(clean)
class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    """Tokenizer that filters every emitted token through sanitize_token()."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        # Construction is delegated to HTMLTokenizer unchanged; the mixin
        # contributes no state of its own.
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        # Yield only the tokens that survive sanitization; sanitize_token
        # returns None for tokens that must be dropped.
        for raw_token in HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if cleaned:
                yield cleaned