Skip to content

Commit 0c8d7bc

Browse files
committed
Fix #6: dom2sax crash by replacing dom2sax with a generic to_sax
This moves the functionality to a new treeadapters module (to which the adapters from test_treewalker.py will be moved later) and removes the previous dom2sax function.
1 parent f61e328 commit 0c8d7bc

File tree

6 files changed

+69
-78
lines changed

6 files changed

+69
-78
lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Change Log
66

77
Released on XXX, 2013
88

9+
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
10+
``treeadapters.sax.to_sax`` which is generic and supports any
11+
treewalker; it also resolves all known bugs with ``dom2sax``.
12+
913

1014
1.0b1
1115
~~~~~

html5lib/constants.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,24 @@
433433
(namespaces["mathml"], "mtext")
434434
))
435435

436+
# Qualified attribute names as written in markup, each mapped to the
# (prefix, local name, namespace URI) triple that the parser's
# adjustForeignAttributes step uses as the replacement.
adjustForeignAttributes = {
    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
    "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
    "xlink:href": ("xlink", "href", namespaces["xlink"]),
    "xlink:role": ("xlink", "role", namespaces["xlink"]),
    "xlink:show": ("xlink", "show", namespaces["xlink"]),
    "xlink:title": ("xlink", "title", namespaces["xlink"]),
    "xlink:type": ("xlink", "type", namespaces["xlink"]),
    "xml:base": ("xml", "base", namespaces["xml"]),
    "xml:lang": ("xml", "lang", namespaces["xml"]),
    "xml:space": ("xml", "space", namespaces["xml"]),
    "xmlns": (None, "xmlns", namespaces["xmlns"]),
    "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}

# Reverse lookup of the table above: (namespace URI, local name) -> the
# qualified name. The prefix is not part of the key.
unadjustForeignAttributes = dict(
    ((ns, local), qname)
    for qname, (prefix, local, ns) in adjustForeignAttributes.items())
453+
436454
spaceCharacters = frozenset((
437455
"\t",
438456
"\n",

html5lib/html5parser.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .constants import cdataElements, rcdataElements
1818
from .constants import tokenTypes, ReparseException, namespaces
1919
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
20+
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
2021

2122

2223
def parse(doc, treebuilder="etree", encoding=None,
@@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
333334
del token["data"][originalName]
334335

335336
def adjustForeignAttributes(self, token):
336-
replacements = {
337-
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
338-
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
339-
"xlink:href": ("xlink", "href", namespaces["xlink"]),
340-
"xlink:role": ("xlink", "role", namespaces["xlink"]),
341-
"xlink:show": ("xlink", "show", namespaces["xlink"]),
342-
"xlink:title": ("xlink", "title", namespaces["xlink"]),
343-
"xlink:type": ("xlink", "type", namespaces["xlink"]),
344-
"xml:base": ("xml", "base", namespaces["xml"]),
345-
"xml:lang": ("xml", "lang", namespaces["xml"]),
346-
"xml:space": ("xml", "space", namespaces["xml"]),
347-
"xmlns": (None, "xmlns", namespaces["xmlns"]),
348-
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
349-
}
337+
replacements = adjustForeignAttributesMap
350338

351339
for originalName in token["data"].keys():
352340
if originalName in replacements:

html5lib/treeadapters/__init__.py

Whitespace-only changes.

html5lib/treeadapters/sax.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
3+
from xml.sax.xmlreader import AttributesNSImpl
4+
5+
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
6+
7+
# Every prefix that occurs in the foreign-attribute table, mapped to its
# namespace URI. Entries whose prefix is None (the bare "xmlns" attribute)
# contribute nothing.
prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is None:
        continue
    prefix_mapping[prefix] = namespace
11+
12+
13+
def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    The handler receives ``startDocument``, one ``startPrefixMapping`` per
    entry of the module-level ``prefix_mapping``, the events produced from
    the tokens of ``walker``, the matching ``endPrefixMapping`` calls and
    finally ``endDocument``.

    :arg walker: iterable of token dicts. Every token has a ``"type"`` key;
        tag tokens are read for ``"namespace"`` and ``"name"`` (start and
        empty tags also for ``"data"``), character tokens for ``"data"``.
    :arg handler: object providing the content handler methods called
        below (``startElementNS``, ``endElementNS``, ``characters``, ...).
    :raises AssertionError: if a token has an unrecognised ``"type"``.
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        token_type = token["type"]
        if token_type in ("Doctype", "Comment"):
            # Neither token kind is reported to the handler.
            continue

        if token_type in ("StartTag", "EmptyTag"):
            name = token["name"]
            ns_name = (token["namespace"], name)
            # NOTE(review): the qname table handed over here is the shared
            # foreign-attribute table, not one built for this element, so
            # it has no entries for ordinary attributes -- confirm that
            # handlers do not rely on per-attribute qname lookups.
            attrs = AttributesNSImpl(token["data"],
                                     unadjustForeignAttributes)
            handler.startElementNS(ns_name, name, attrs)
            if token_type == "EmptyTag":
                # An empty tag has no separate EndTag token; close it here.
                handler.endElementNS(ns_name, name)
        elif token_type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif token_type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        else:
            # Raised explicitly (instead of ``assert False``) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Unknown token type")

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()

html5lib/treebuilders/dom.py

Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
22

33

4-
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
4+
from xml.dom import minidom, Node
55
import weakref
66

77
from . import _base
@@ -220,69 +220,6 @@ def serializeElement(element, indent=0):
220220

221221
return "\n".join(rv)
222222

223-
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
224-
if node.nodeType == Node.ELEMENT_NODE:
225-
if not nsmap:
226-
handler.startElement(node.nodeName, node.attributes)
227-
for child in node.childNodes:
228-
dom2sax(child, handler, nsmap)
229-
handler.endElement(node.nodeName)
230-
else:
231-
attributes = dict(node.attributes.itemsNS())
232-
233-
# gather namespace declarations
234-
prefixes = []
235-
for attrname in list(node.attributes.keys()):
236-
attr = node.getAttributeNode(attrname)
237-
if (attr.namespaceURI == XMLNS_NAMESPACE or
238-
(attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
239-
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
240-
handler.startPrefixMapping(prefix, attr.nodeValue)
241-
prefixes.append(prefix)
242-
nsmap = nsmap.copy()
243-
nsmap[prefix] = attr.nodeValue
244-
del attributes[(attr.namespaceURI, attr.nodeName)]
245-
246-
# apply namespace declarations
247-
for attrname in list(node.attributes.keys()):
248-
attr = node.getAttributeNode(attrname)
249-
if attr.namespaceURI is None and ':' in attr.nodeName:
250-
prefix = attr.nodeName.split(':')[0]
251-
if prefix in nsmap:
252-
del attributes[(attr.namespaceURI, attr.nodeName)]
253-
attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue
254-
255-
# SAX events
256-
ns = node.namespaceURI or nsmap.get(None, None)
257-
handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
258-
for child in node.childNodes:
259-
dom2sax(child, handler, nsmap)
260-
handler.endElementNS((ns, node.nodeName), node.nodeName)
261-
for prefix in prefixes:
262-
handler.endPrefixMapping(prefix)
263-
264-
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
265-
handler.characters(node.nodeValue)
266-
267-
elif node.nodeType == Node.DOCUMENT_NODE:
268-
handler.startDocument()
269-
for child in node.childNodes:
270-
dom2sax(child, handler, nsmap)
271-
handler.endDocument()
272-
273-
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
274-
for child in node.childNodes:
275-
dom2sax(child, handler, nsmap)
276-
277-
else:
278-
# ATTRIBUTE_NODE
279-
# ENTITY_NODE
280-
# PROCESSING_INSTRUCTION_NODE
281-
# COMMENT_NODE
282-
# DOCUMENT_TYPE_NODE
283-
# NOTATION_NODE
284-
pass
285-
286223
return locals()
287224

288225

0 commit comments

Comments
 (0)