Skip to content

Commit 1d3434f

Browse files
Marc DMgsnedders
Marc DM
authored andcommitted
Fix #63: treewalker fails to handle bytes data from outwith html5lib
We do, ourselves, ensure everything we put in the tree is unicode under Python 2; users, however, may not be so careful and assign an attribute using a bare string, for example. We should handle this gracefully under Python 2 by coercing gently into unicode, failing in the normal way if we are unable to decode the bytes. I (Geoffrey, the committer) am not entirely convinced this is the right way to test this; on the other hand, a thorough testsuite for the issues previously in the treewalker would be large, especially considering the large number of treewalkers we support.
1 parent 84d8a74 commit 1d3434f

File tree

2 files changed

+104
-33
lines changed

2 files changed

+104
-33
lines changed

html5lib/tests/test_treewalkers.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,54 @@ def test_treewalker():
310310
"document")]
311311
errors = errors.split("\n")
312312
yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls
313+
314+
315+
def set_attribute_on_first_child(docfrag, name, value, treeName):
316+
"""naively sets an attribute on the first child of the document
317+
fragment passed in"""
318+
setter = {'ElementTree': lambda d: d[0].set,
319+
'DOM': lambda d: d.firstChild.setAttribute}
320+
setter['cElementTree'] = setter['ElementTree']
321+
try:
322+
setter.get(treeName, setter['DOM'])(docfrag)(name, value)
323+
except AttributeError:
324+
setter['ElementTree'](docfrag)(name, value)
325+
326+
327+
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
328+
"""tests what happens when we add attributes to the intext"""
329+
treeName, treeClass = tree
330+
parser = html5parser.HTMLParser(tree=treeClass["builder"])
331+
document = parser.parseFragment(intext)
332+
for nom, val in attrs_to_add:
333+
set_attribute_on_first_child(document, nom, val, treeName)
334+
335+
document = treeClass.get("adapter", lambda x: x)(document)
336+
output = convertTokens(treeClass["walker"](document))
337+
output = attrlist.sub(sortattrs, output)
338+
if not output in expected:
339+
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
340+
341+
342+
def test_treewalker_six_mix():
343+
"""Str/Unicode mix. If str attrs added to tree"""
344+
345+
# On Python 2.x string literals are of type str. Unless, like this
346+
# file, the programmer imports unicode_literals from __future__.
347+
# In that case, string literals become objects of type unicode.
348+
349+
# This test simulates a Py2 user, modifying attributes on a document
350+
# fragment but not using the u'' syntax nor importing unicode_literals
351+
sm_tests = [
352+
('<a href="http://example.com">Example</a>',
353+
[(str('class'), str('test123'))],
354+
'<a>\n class="test123"\n href="http://example.com"\n "Example"'),
355+
356+
('<link href="http://example.com/cow">',
357+
[(str('rel'), str('alternate'))],
358+
'<link>\n href="http://example.com/cow"\n rel="alternate"\n "Example"')
359+
]
360+
361+
for tree in treeTypes.items():
362+
for intext, attrs, expected in sm_tests:
363+
yield runTreewalkerEditTest, intext, expected, attrs, tree

html5lib/treewalkers/_base.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import text_type
2+
from six import text_type, string_types
33

44
import gettext
55
_ = gettext.gettext
@@ -8,6 +8,24 @@
88
spaceCharacters = "".join(spaceCharacters)
99

1010

11+
def to_text(s, blank_if_none=True):
12+
"""Wrapper around six.text_type to convert None to empty string"""
13+
if s is None:
14+
if blank_if_none:
15+
return ""
16+
else:
17+
return None
18+
elif isinstance(s, text_type):
19+
return s
20+
else:
21+
return text_type(s)
22+
23+
24+
def is_text_or_none(string):
25+
"""Wrapper around isinstance(string_types) or is None"""
26+
return string is None or isinstance(string, string_types)
27+
28+
1129
class TreeWalker(object):
1230
def __init__(self, tree):
1331
self.tree = tree
@@ -19,45 +37,47 @@ def error(self, msg):
1937
return {"type": "SerializeError", "data": msg}
2038

2139
def emptyTag(self, namespace, name, attrs, hasChildren=False):
22-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
23-
assert isinstance(name, text_type), type(name)
24-
assert all((namespace is None or isinstance(namespace, text_type)) and
25-
isinstance(name, text_type) and
26-
isinstance(value, text_type)
40+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
41+
assert isinstance(name, string_types), type(name)
42+
assert all((namespace is None or isinstance(namespace, string_types)) and
43+
isinstance(name, string_types) and
44+
isinstance(value, string_types)
2745
for (namespace, name), value in attrs.items())
2846

29-
yield {"type": "EmptyTag", "name": name,
30-
"namespace": namespace,
47+
yield {"type": "EmptyTag", "name": to_text(name, False),
48+
"namespace": to_text(namespace),
3149
"data": attrs}
3250
if hasChildren:
3351
yield self.error(_("Void element has children"))
3452

3553
def startTag(self, namespace, name, attrs):
36-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
37-
assert isinstance(name, text_type), type(name)
38-
assert all((namespace is None or isinstance(namespace, text_type)) and
39-
isinstance(name, text_type) and
40-
isinstance(value, text_type)
54+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
55+
assert isinstance(name, string_types), type(name)
56+
assert all((namespace is None or isinstance(namespace, string_types)) and
57+
isinstance(name, string_types) and
58+
isinstance(value, string_types)
4159
for (namespace, name), value in attrs.items())
4260

4361
return {"type": "StartTag",
44-
"name": name,
45-
"namespace": namespace,
46-
"data": attrs}
62+
"name": text_type(name),
63+
"namespace": to_text(namespace),
64+
"data": dict(((to_text(namespace, False), to_text(name)),
65+
to_text(value, False))
66+
for (namespace, name), value in attrs.items())}
4767

4868
def endTag(self, namespace, name):
49-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
50-
assert isinstance(name, text_type), type(namespace)
69+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
70+
assert isinstance(name, string_types), type(namespace)
5171

5272
return {"type": "EndTag",
53-
"name": name,
54-
"namespace": namespace,
73+
"name": to_text(name, False),
74+
"namespace": to_text(namespace),
5575
"data": {}}
5676

5777
def text(self, data):
58-
assert isinstance(data, text_type), type(data)
78+
assert isinstance(data, string_types), type(data)
5979

60-
data = data
80+
data = to_text(data)
6181
middle = data.lstrip(spaceCharacters)
6282
left = data[:len(data) - len(middle)]
6383
if left:
@@ -71,25 +91,25 @@ def text(self, data):
7191
yield {"type": "SpaceCharacters", "data": right}
7292

7393
def comment(self, data):
74-
assert isinstance(data, text_type), type(data)
94+
assert isinstance(data, string_types), type(data)
7595

76-
return {"type": "Comment", "data": data}
96+
return {"type": "Comment", "data": text_type(data)}
7797

7898
def doctype(self, name, publicId=None, systemId=None, correct=True):
79-
assert name is None or isinstance(name, text_type), type(name)
80-
assert publicId is None or isinstance(publicId, text_type), type(publicId)
81-
assert systemId is None or isinstance(systemId, text_type), type(systemId)
99+
assert is_text_or_none(name), type(name)
100+
assert is_text_or_none(publicId), type(publicId)
101+
assert is_text_or_none(systemId), type(systemId)
82102

83103
return {"type": "Doctype",
84-
"name": name if name is not None else "",
85-
"publicId": publicId,
86-
"systemId": systemId,
87-
"correct": correct}
104+
"name": to_text(name),
105+
"publicId": to_text(publicId),
106+
"systemId": to_text(systemId),
107+
"correct": to_text(correct)}
88108

89109
def entity(self, name):
90-
assert isinstance(name, text_type), type(name)
110+
assert isinstance(name, string_types), type(name)
91111

92-
return {"type": "Entity", "name": name}
112+
return {"type": "Entity", "name": text_type(name)}
93113

94114
def unknown(self, nodeType):
95115
return self.error(_("Unknown node type: ") + nodeType)

0 commit comments

Comments
 (0)