@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
28
28
return False
29
29
30
30
import sys
31
+ import types
31
32
32
33
import inputstream
33
34
import tokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
37
38
from treebuilders import simpletree
38
39
39
40
import utils
41
+ import constants
40
42
from constants import spaceCharacters , asciiUpper2Lower
41
43
from constants import scopingElements , formattingElements , specialElements
42
44
from constants import headingElements , tableInsertModeElements
43
45
from constants import cdataElements , rcdataElements , voidElements
44
46
from constants import tokenTypes , ReparseException , namespaces
45
47
48
+ debug_log = True
49
+
46
50
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    The tree builder named by ``treebuilder`` is looked up through
    ``treebuilders.getTreeBuilder`` and handed to a new ``HTMLParser``
    together with ``namespaceHTMLElements``.  ``doc`` and ``encoding`` are
    passed on to that parser's ``parse`` method, whose result is returned.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
55
60
p = HTMLParser (tb , namespaceHTMLElements = namespaceHTMLElements )
56
61
return p .parseFragment (doc , container = container , encoding = encoding )
57
62
63
def method_decorator_metaclass(function):
    """Return a metaclass that decorates the functions of classes it creates.

    ``function`` is a decorator: it is called once with each plain function
    found in the class body and its return value is stored under the same
    attribute name.  Attributes that are not plain functions (constants,
    ``staticmethod``/``classmethod`` objects, properties, ...) are copied
    through untouched.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Collect the (possibly decorated) attributes in a fresh
            # namespace so the mapping being iterated is never modified and
            # the caller's dictionary is left as it was passed in.
            decorated = {}
            # items() rather than iteritems(): available on both Python 2
            # and Python 3 dictionaries.
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                decorated[attributeName] = attribute
            return type.__new__(meta, classname, bases, decorated)
    return Decorated
73
+
58
74
class HTMLParser (object ):
59
75
"""HTML parser. Generates a tree structure from a stream of (possibly
60
76
malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129
145
self .tree .reset ()
130
146
self .firstStartTag = False
131
147
self .errors = []
148
+ self .log = [] #only used with debug mode
132
149
# "quirks" / "limited quirks" / "no quirks"
133
150
self .compatMode = "no quirks"
134
151
@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420
437
421
438
self .phase = self .phases ["text" ]
422
439
440
def log(function):
    """Logger that records which phase processes each token.

    Decorator for methods of objects that expose ``self.parser``.  Whenever
    the wrapped method is called with at least one positional argument (and
    is not ``__init__``), the first argument is treated as a token and a
    tuple is appended to ``self.parser.log`` before the call is forwarded:

        (tokenizer state name, current parser phase class name,
         class name of ``self``, wrapped function name, info)

    ``info`` is a dict holding the token's type name under ``"type"`` (looked
    up by inverting ``constants.tokenTypes``) and, when the token type is in
    ``constants.tagTokenTypes``, the token's name under ``"name"``.

    The wrapped method's return value is always returned unchanged.  If the
    first argument cannot be read as a token, it is printed and the original
    exception propagates.
    """
    # Reverse mapping: numeric token type -> readable name.  Built once per
    # decorated function, not once per call.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and args:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Debugging aid: show the argument that is not a usable
                # token, then let the original error propagate.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        # Logged or not, the call is always forwarded to the real method.
        return function(self, *args, **kwargs)

    # Keep the original identity visible on the wrapper so introspection and
    # tracebacks still show the real method name and docstring.
    wrapped.__name__ = function.__name__
    wrapped.__doc__ = function.__doc__
    return wrapped
464
+
423
465
class Phase (object ):
424
466
"""Base class for helper object that implements each phase of processing
425
467
"""
@@ -434,6 +476,9 @@ class Phase(object):
434
476
# * EndTag
435
477
# - endTag* methods
436
478
479
+ if debug_log :
480
+ __metaclass__ = method_decorator_metaclass (log )
481
+
437
482
def __init__ (self , parser , tree ):
438
483
self .parser = parser
439
484
self .tree = tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
1008
1053
self .parser .parseError (u"unexpected-start-tag" , {"name" : "form" })
1009
1054
else :
1010
1055
if self .tree .elementInScope ("p" ):
1011
- self .endTagP ("p" )
1056
+ self .endTagP (impliedTagToken ( "p" ) )
1012
1057
self .tree .insertElement (token )
1013
1058
self .tree .formPointer = self .tree .openElements [- 1 ]
1014
1059
@@ -1831,7 +1876,7 @@ def processEOF(self):
1831
1876
return
1832
1877
else :
1833
1878
ignoreEndTag = self .ignoreEndTagColgroup ()
1834
- self .endTagColgroup ("colgroup" )
1879
+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
1835
1880
if not ignoreEndTag :
1836
1881
self .parser .phase .processEOF ()
1837
1882
@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
1847
1892
1848
1893
def startTagOther (self , token ):
1849
1894
ignoreEndTag = self .ignoreEndTagColgroup ()
1850
- self .endTagColgroup ("colgroup" )
1895
+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
1851
1896
if not ignoreEndTag :
1852
1897
self .parser .phase .processStartTag (token )
1853
1898
@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
1865
1910
1866
1911
def endTagOther (self , token ):
1867
1912
ignoreEndTag = self .ignoreEndTagColgroup ()
1868
- self .endTagColgroup ("colgroup" )
1913
+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
1869
1914
if not ignoreEndTag :
1870
1915
self .parser .phase .processEndTag (token )
1871
1916
@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
2016
2061
2017
2062
def startTagTableOther (self , token ):
2018
2063
ignoreEndTag = self .ignoreEndTagTr ()
2019
- self .endTagTr ("tr" )
2064
+ self .endTagTr (impliedTagToken ( "tr" ) )
2020
2065
# XXX how are we sure it's always ignored in the innerHTML case?
2021
2066
if not ignoreEndTag :
2022
2067
self .parser .phase .processStartTag (token )
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
2036
2081
2037
2082
def endTagTable (self , token ):
2038
2083
ignoreEndTag = self .ignoreEndTagTr ()
2039
- self .endTagTr ("tr" )
2084
+ self .endTagTr (impliedTagToken ( "tr" ) )
2040
2085
# Reprocess the current tag if the tr end tag was not ignored
2041
2086
# XXX how are we sure it's always ignored in the innerHTML case?
2042
2087
if not ignoreEndTag :
2043
2088
self .parser .phase .processEndTag (token )
2044
2089
2045
2090
def endTagTableRowGroup (self , token ):
2046
2091
if self .tree .elementInScope (token ["name" ], variant = "table" ):
2047
- self .endTagTr ("tr" )
2092
+ self .endTagTr (impliedTagToken ( "tr" ) )
2048
2093
self .parser .phase .processEndTag (token )
2049
2094
else :
2050
2095
# innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
2187
2232
2188
2233
def startTagSelect (self , token ):
2189
2234
self .parser .parseError ("unexpected-select-in-select" )
2190
- self .endTagSelect ("select" )
2235
+ self .endTagSelect (impliedTagToken ( "select" ) )
2191
2236
2192
2237
def startTagInput (self , token ):
2193
2238
self .parser .parseError ("unexpected-input-in-select" )
2194
2239
if self .tree .elementInScope ("select" , variant = "table" ):
2195
- self .endTagSelect ("select" )
2240
+ self .endTagSelect (impliedTagToken ( "select" ) )
2196
2241
self .parser .phase .processStartTag (token )
2197
2242
2198
2243
def startTagOther (self , token ):
0 commit comments