Skip to content

Commit 7d9166a

Browse files
author
James Graham
committed
Add phase transition logging support
1 parent 90a9f73 commit 7d9166a

File tree

2 files changed

+67
-15
lines changed

2 files changed

+67
-15
lines changed

html5lib/html5parser.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
2828
return False
2929

3030
import sys
31+
import types
3132

3233
import inputstream
3334
import tokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
3738
from treebuilders import simpletree
3839

3940
import utils
41+
import constants
4042
from constants import spaceCharacters, asciiUpper2Lower
4143
from constants import scopingElements, formattingElements, specialElements
4244
from constants import headingElements, tableInsertModeElements
4345
from constants import cdataElements, rcdataElements, voidElements
4446
from constants import tokenTypes, ReparseException, namespaces
4547

48+
# Debug switch for phase-transition logging. The Phase class body tests this
# flag and, when it is true, installs a metaclass that wraps every Phase
# method with the `log` decorator.
# NOTE(review): the test sits in the class body, so it presumably runs only
# once, when the class is created; rebinding this name after import would then
# not change an already-built Phase class -- confirm before relying on it.
debug_log = True
49+
4650
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    The tree builder is looked up by name through
    ``treebuilders.getTreeBuilder``; ``encoding`` is forwarded to
    ``HTMLParser.parse`` and ``namespaceHTMLElements`` to the ``HTMLParser``
    constructor.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    return HTMLParser(
        builder, namespaceHTMLElements=namespaceHTMLElements
    ).parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
5560
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
5661
return p.parseFragment(doc, container=container, encoding=encoding)
5762

63+
def method_decorator_metaclass(function):
    """Return a metaclass that applies ``function`` to every plain method.

    When a class is created with the returned metaclass, each entry of its
    class body that is a plain Python function (``types.FunctionType``) is
    replaced by ``function(entry)``. Every other attribute (data,
    ``staticmethod``/``classmethod`` objects, properties, ...) is passed
    through untouched. Only the class's own namespace is processed;
    attributes inherited from base classes are not re-wrapped.

    :arg function: a decorator taking a function and returning its replacement
    :returns: a ``type`` subclass usable as a metaclass
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Collect into a fresh namespace instead of rebinding keys of
            # classDict while iterating over it; items() works on both
            # Python 2 and Python 3.
            decorated = {}
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                decorated[attributeName] = attribute
            return type.__new__(meta, classname, bases, decorated)
    return Decorated
73+
5874
class HTMLParser(object):
5975
"""HTML parser. Generates a tree structure from a stream of (possibly
6076
malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129145
self.tree.reset()
130146
self.firstStartTag = False
131147
self.errors = []
148+
self.log = [] #only used with debug mode
132149
# "quirks" / "limited quirks" / "no quirks"
133150
self.compatMode = "no quirks"
134151

@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420437

421438
self.phase = self.phases["text"]
422439

440+
def log(function):
    """Logger that records which phase processes each token.

    Decorator for methods of objects that expose ``self.parser``. For every
    call other than ``__init__`` that receives at least one positional
    argument, the first positional argument is treated as the token and a
    tuple is appended to ``self.parser.log`` before the wrapped method runs:

        (tokenizer state function name,
         class name of the parser's current phase,
         class name of the object whose method was called,
         name of the wrapped method,
         info dict)

    ``info`` holds the token type's name under ``"type"`` and, for token
    types listed in ``constants.tagTokenTypes``, the token's ``"name"``.
    Tokens passed by keyword are not logged.

    :arg function: the method to wrap
    :returns: the wrapping function, carrying ``function``'s name and
        docstring
    """
    # Local import so the block does not depend on a file-level import.
    import functools

    # Reverse mapping: numeric token type -> readable name.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    @functools.wraps(function)
    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and args:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Show the argument that could not be interpreted as a
                # token, then let the original error propagate.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)
    return wrapped
464+
423465
class Phase(object):
424466
"""Base class for helper object that implements each phase of processing
425467
"""
@@ -434,6 +476,9 @@ class Phase(object):
434476
# * EndTag
435477
# - endTag* methods
436478

479+
if debug_log:
480+
__metaclass__ = method_decorator_metaclass(log)
481+
437482
def __init__(self, parser, tree):
438483
self.parser = parser
439484
self.tree = tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
10081053
self.parser.parseError(u"unexpected-start-tag", {"name": "form"})
10091054
else:
10101055
if self.tree.elementInScope("p"):
1011-
self.endTagP("p")
1056+
self.endTagP(impliedTagToken("p"))
10121057
self.tree.insertElement(token)
10131058
self.tree.formPointer = self.tree.openElements[-1]
10141059

@@ -1831,7 +1876,7 @@ def processEOF(self):
18311876
return
18321877
else:
18331878
ignoreEndTag = self.ignoreEndTagColgroup()
1834-
self.endTagColgroup("colgroup")
1879+
self.endTagColgroup(impliedTagToken("colgroup"))
18351880
if not ignoreEndTag:
18361881
self.parser.phase.processEOF()
18371882

@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
18471892

18481893
def startTagOther(self, token):
18491894
ignoreEndTag = self.ignoreEndTagColgroup()
1850-
self.endTagColgroup("colgroup")
1895+
self.endTagColgroup(impliedTagToken("colgroup"))
18511896
if not ignoreEndTag:
18521897
self.parser.phase.processStartTag(token)
18531898

@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
18651910

18661911
def endTagOther(self, token):
18671912
ignoreEndTag = self.ignoreEndTagColgroup()
1868-
self.endTagColgroup("colgroup")
1913+
self.endTagColgroup(impliedTagToken("colgroup"))
18691914
if not ignoreEndTag:
18701915
self.parser.phase.processEndTag(token)
18711916

@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
20162061

20172062
def startTagTableOther(self, token):
20182063
ignoreEndTag = self.ignoreEndTagTr()
2019-
self.endTagTr("tr")
2064+
self.endTagTr(impliedTagToken("tr"))
20202065
# XXX how are we sure it's always ignored in the innerHTML case?
20212066
if not ignoreEndTag:
20222067
self.parser.phase.processStartTag(token)
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
20362081

20372082
def endTagTable(self, token):
20382083
ignoreEndTag = self.ignoreEndTagTr()
2039-
self.endTagTr("tr")
2084+
self.endTagTr(impliedTagToken("tr"))
20402085
# Reprocess the current tag if the tr end tag was not ignored
20412086
# XXX how are we sure it's always ignored in the innerHTML case?
20422087
if not ignoreEndTag:
20432088
self.parser.phase.processEndTag(token)
20442089

20452090
def endTagTableRowGroup(self, token):
20462091
if self.tree.elementInScope(token["name"], variant="table"):
2047-
self.endTagTr("tr")
2092+
self.endTagTr(impliedTagToken("tr"))
20482093
self.parser.phase.processEndTag(token)
20492094
else:
20502095
# innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
21872232

21882233
def startTagSelect(self, token):
21892234
self.parser.parseError("unexpected-select-in-select")
2190-
self.endTagSelect("select")
2235+
self.endTagSelect(impliedTagToken("select"))
21912236

21922237
def startTagInput(self, token):
21932238
self.parser.parseError("unexpected-input-in-select")
21942239
if self.tree.elementInScope("select", variant="table"):
2195-
self.endTagSelect("select")
2240+
self.endTagSelect(impliedTagToken("select"))
21962241
self.parser.phase.processStartTag(token)
21972242

21982243
def startTagOther(self, token):

parse.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
#!/usr/bin/env python
22
"""usage: %prog [options] filename
33
4-
Parse a document to a simpletree tree, with optional profiling
4+
Parse a document to a tree, with optional profiling
55
"""
6-
#RELEASE move ./examples/
76

87
import sys
98
import os
109
from optparse import OptionParser
1110

12-
#RELEASE remove
13-
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
14-
#END RELEASE
1511
from html5lib import html5parser, sanitizer
1612
from html5lib.tokenizer import HTMLTokenizer
1713
from html5lib import treebuilders, serializer, treewalkers
@@ -52,6 +48,8 @@ def parse():
5248
else:
5349
tokenizer = HTMLTokenizer
5450

51+
if opts.log:
52+
html5parser.debug_log = True
5553

5654
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
5755

@@ -87,10 +85,16 @@ def parse():
8785
def printOutput(parser, document, opts):
8886
if opts.encoding:
8987
print "Encoding:", parser.tokenizer.stream.charEncoding
88+
89+
if opts.log:
90+
for item in parser.log:
91+
print item
92+
9093
if opts.xml:
9194
sys.stdout.write(document.toxml("utf-8"))
9295
elif opts.tree:
93-
if not hasattr(document,'__getitem__'): document = [document]
96+
if not hasattr(document,'__getitem__'):
97+
document = [document]
9498
for fragment in document:
9599
print parser.tree.testSerializer(fragment).encode("utf-8")
96100
elif opts.hilite:
@@ -199,6 +203,9 @@ def getOptParser():
199203
parser.add_option("", "--sanitize", action="store_true", default=False,
200204
dest="sanitize", help="sanitize")
201205

206+
parser.add_option("-l", "--log", action="store_true", default=False,
207+
dest="log", help="log state transitions")
208+
202209
return parser
203210

204211
if __name__ == "__main__":

0 commit comments

Comments
 (0)