Skip to content

Commit 72f2169

Browse files
committed
Fix #51: DataLossWarning hiding real exceptions in parser.
Previously we stopped parser tests as soon as a DataLossWarning was raised; we now let the parser run to completion before checking for raised warnings. The old behaviour hid several cases where real exceptions were raised after a DataLossWarning. This commit reveals these real exceptions in the testsuite and fixes the current failures. Similar fixes are needed for the other tests.
1 parent 19e1b9b commit 72f2169

File tree

3 files changed

+58
-27
lines changed

3 files changed

+58
-27
lines changed

html5lib/ihatexml.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ def escapeRegexp(string):
179179

180180
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
181181

182+
# Simpler things
183+
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
184+
182185

183186
class InfosetFilter(object):
184187
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
@@ -188,7 +191,8 @@ def __init__(self, replaceChars=None,
188191
dropXmlnsAttrNs=False,
189192
preventDoubleDashComments=False,
190193
preventDashAtCommentEnd=False,
191-
replaceFormFeedCharacters=True):
194+
replaceFormFeedCharacters=True,
195+
preventSingleQuotePubid=False):
192196

193197
self.dropXmlnsLocalName = dropXmlnsLocalName
194198
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@@ -198,6 +202,8 @@ def __init__(self, replaceChars=None,
198202

199203
self.replaceFormFeedCharacters = replaceFormFeedCharacters
200204

205+
self.preventSingleQuotePubid = preventSingleQuotePubid
206+
201207
self.replaceCache = {}
202208

203209
def coerceAttribute(self, name, namespace=None):
@@ -229,6 +235,17 @@ def coerceCharacters(self, data):
229235
# Other non-xml characters
230236
return data
231237

238+
def coercePubid(self, data):
239+
dataOutput = data
240+
for char in nonPubidCharRegexp.findall(data):
241+
warnings.warn("Coercing non-XML pubid", DataLossWarning)
242+
replacement = self.getReplacementCharacter(char)
243+
dataOutput = dataOutput.replace(char, replacement)
244+
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
245+
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
246+
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
247+
return dataOutput
248+
232249
def toXmlName(self, name):
233250
nameFirst = name[0]
234251
nameRest = name[1:]
@@ -260,7 +277,7 @@ def fromXmlName(self, name):
260277
return name
261278

262279
def escapeChar(self, char):
263-
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
280+
replacement = "U%05X" % ord(char)
264281
self.replaceCache[char] = replacement
265282
return replacement
266283

html5lib/tests/test_parser.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,25 @@ def convertTreeDump(data):
2727

2828
def runParserTest(innerHTML, input, expected, errors, treeClass,
2929
namespaceHTMLElements):
30-
warnings.resetwarnings()
31-
warnings.simplefilter("error")
32-
# XXX - move this out into the setup function
33-
# concatenate all consecutive character tokens into a single token
34-
try:
30+
with warnings.catch_warnings(record=True) as w:
31+
warnings.simplefilter("always")
3532
p = html5parser.HTMLParser(tree=treeClass,
3633
namespaceHTMLElements=namespaceHTMLElements)
37-
except constants.DataLossWarning:
38-
return
3934

40-
try:
41-
if innerHTML:
42-
document = p.parseFragment(input, innerHTML)
43-
else:
44-
try:
35+
try:
36+
if innerHTML:
37+
document = p.parseFragment(input, innerHTML)
38+
else:
4539
document = p.parse(input)
46-
except constants.DataLossWarning:
47-
return
48-
except:
49-
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
50-
"\nTraceback:", traceback.format_exc()])
51-
assert False, errorMsg
40+
except:
41+
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
42+
"\nTraceback:", traceback.format_exc()])
43+
assert False, errorMsg
44+
45+
otherW = [x for x in w if not issubclass(x.category, constants.DataLossWarning)]
46+
assert len(otherW) == 0, [(x.category, x.message) for x in otherW]
47+
if len(w):
48+
return
5249

5350
output = convertTreeDump(p.tree.testSerializer(document))
5451

html5lib/treebuilders/etree_lxml.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -291,11 +291,16 @@ def insertDoctype(self, token):
291291
publicId = token["publicId"]
292292
systemId = token["systemId"]
293293

294-
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
295-
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
294+
if not name:
295+
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
296+
self.doctype = None
297+
else:
298+
coercedName = self.infosetFilter.coerceElement(name)
299+
if coercedName != name:
300+
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
296301

297-
doctype = self.doctypeClass(name, publicId, systemId)
298-
self.doctype = doctype
302+
doctype = self.doctypeClass(coercedName, publicId, systemId)
303+
self.doctype = doctype
299304

300305
def insertCommentInitial(self, data, parent=None):
301306
self.initial_comments.append(data)
@@ -313,12 +318,24 @@ def insertRoot(self, token):
313318
# Therefore we need to use the built-in parser to create our initial
314319
# tree, after which we can add elements like normal
315320
docStr = ""
316-
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
321+
if self.doctype:
322+
assert self.doctype.name
317323
docStr += "<!DOCTYPE %s" % self.doctype.name
318324
if (self.doctype.publicId is not None or
319325
self.doctype.systemId is not None):
320-
docStr += ' PUBLIC "%s" "%s"' % (self.doctype.publicId or "",
321-
self.doctype.systemId or "")
326+
docStr += (' PUBLIC "%s" ' %
327+
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
328+
if self.doctype.systemId:
329+
sysid = self.doctype.systemId
330+
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
331+
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
332+
sysid = sysid.replace("'", 'U00027')
333+
if sysid.find("'") >= 0:
334+
docStr += '"%s"' % sysid
335+
else:
336+
docStr += "'%s'" % sysid
337+
else:
338+
docStr += "''"
322339
docStr += ">"
323340
if self.doctype.name != token["name"]:
324341
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)

0 commit comments

Comments
 (0)