Skip to content

Commit f37d906

Browse files
author
Mark Pilgrim
committed
added support for validating base href attribute
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40989
1 parent 8a4c71b commit f37d906

File tree

2 files changed

+108
-3
lines changed

2 files changed

+108
-3
lines changed

src/html5lib/filters/rfc3987.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# adapted from feedvalidator, original copyright license is
2+
#
3+
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
import re
24+
25+
iana_schemes = [  # http://www.iana.org/assignments/uri-schemes.html
    "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
    "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
    "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
    "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
    "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
    "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
    "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
]

# Loose URI reference: optional scheme, up to two slashes, then any number
# of URI characters (so relative references and the empty string match).
rfc2396_re = re.compile(r"([a-zA-Z][0-9a-zA-Z+\-\.]*:)?/{0,2}"
                        r"[0-9a-zA-Z;/?:@&=+$\.\-_!~*'()%,#]*$")
# Fully qualified URI: the scheme is mandatory and at least one URI
# character must follow it.
rfc2396_full_re = re.compile(r"[a-zA-Z][0-9a-zA-Z+\-\.]*:(//)?"
                             r"[0-9a-zA-Z;/?:@&=+$\.\-_!~*'()%,#]+$")
urn_re = re.compile(r"^[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,\.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+$")
tag_re = re.compile(r"^tag:([a-z0-9\-\._]+?@)?[a-z0-9\.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/\?:@&=+$\.\-_!~*'\(\)%,]*(#[0-9a-zA-Z;/\?:@&=+$\.\-_!~*'\(\)%,]*)?$")

# Compiled once at import time instead of on every isValidURI() call.
_urichars_re = re.compile(r"[0-9a-zA-Z;/?:@&=+$\.\-_!~*'()%,#]")
_http_or_ftp_re = re.compile(r'^\w+://[^/].*')


def isValidURI(value, uriPattern=rfc2396_re):
    """Check whether ``value`` is an acceptable URI.

    value      -- the text to validate
    uriPattern -- compiled regular expression describing the general URI
                  shape; defaults to the loose ``rfc2396_re``

    Returns a ``(isValid, errorCode)`` tuple.  ``errorCode`` is ``""`` when
    the value is valid, otherwise one of ``"invalid-tag-uri"``,
    ``"invalid-urn"``, ``"invalid-uri-char"``, ``"uri-not-iri"``,
    ``"invalid-uri"``, ``"invalid-http-or-ftp-uri"`` or
    ``"unregistered-scheme"``.
    """
    # Everything before the first colon; for a value without a colon this
    # is the whole value, which is why the last branch re-checks for ':'.
    scheme = value.split(':')[0].lower()
    if scheme == 'tag':
        if not tag_re.match(value):
            return False, "invalid-tag-uri"
    elif scheme == "urn":
        if not urn_re.match(value):
            return False, "invalid-urn"
    elif not uriPattern.match(value):
        for c in value:
            if ord(c) < 128 and not _urichars_re.match(c):
                return False, "invalid-uri-char"
        # Every ASCII character is legal, so only non-ASCII characters (or
        # the overall shape) are at fault.  If the IDNA form of the value
        # matches, the value is an IRI rather than a URI.  The encoded
        # bytes are decoded back to text so the text pattern can match
        # them on Python 3 as well as Python 2.
        try:
            asciiForm = value.encode('idna').decode('ascii')
        except UnicodeError:
            # Not IDNA-encodable (e.g. an over-long label); report the
            # generic error below.
            asciiForm = None
        if asciiForm is not None and uriPattern.match(asciiForm):
            return False, "uri-not-iri"
        return False, "invalid-uri"
    elif scheme in ['http', 'ftp']:
        if not _http_or_ftp_re.match(value):
            return False, "invalid-http-or-ftp-uri"
    elif value.find(':') >= 0 and scheme.isalpha() and scheme not in iana_schemes:
        return False, "unregistered-scheme"
    return True, ""


def isValidIRI(value):
    """Check whether ``value`` is an acceptable IRI.

    A non-empty value is converted to its IDNA (ASCII) form on a
    best-effort basis and then validated with ``isValidURI``; if the
    conversion fails the value is validated unchanged.

    Returns the same ``(isValid, errorCode)`` tuple as ``isValidURI``.
    """
    try:
        if value:
            # Decode back to text: isValidURI() splits and matches with
            # text patterns, which fail on raw bytes under Python 3.
            value = value.encode('idna').decode('ascii')
    except UnicodeError:
        pass
    return isValidURI(value)


def isValidFullyQualifiedURI(value):
    """Check ``value`` as a URI that must include a scheme.

    Delegates to ``isValidURI`` with the stricter ``rfc2396_full_re``
    pattern and returns its ``(isValid, errorCode)`` tuple.
    """
    return isValidURI(value, rfc2396_full_re)
78+

src/html5lib/filters/validator.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from sets import ImmutableSet as frozenset
2121
import _base
2222
import iso639codes
23+
import rfc3987
2324
from html5lib.constants import E, spaceCharacters, digits
2425
from html5lib import tokenizer
2526
import gettext
@@ -339,7 +340,31 @@ def checkStartTagUnknownAttributes(self, token):
339340
# Attribute validation helpers
340341
##########################################################################
341342

342-
def checkIDValue(self, token, tagName, attrName, attrValue):
343+
def checkURI(self, token, tagName, attrName, attrValue):
    """Yield ParseError tokens if attrValue is not a valid URI.

    attrValue is checked with rfc3987.isValidURI.  When it is rejected,
    two error tokens are yielded: first one carrying the specific error
    code returned by the check, then a generic "invalid-attribute-value".
    Nothing is yielded for a valid value.  ``token`` is accepted for
    signature compatibility but is not used here.
    """
    isValid, errorCode = rfc3987.isValidURI(attrValue)
    if not isValid:
        # Specific reason first, then the generic attribute-level error.
        for code in (errorCode, "invalid-attribute-value"):
            yield {"type": "ParseError",
                   "data": code,
                   "datavars": {"tagName": tagName,
                                "attributeName": attrName}}
354+
355+
def checkIRI(self, token, tagName, attrName, attrValue):
    """Yield ParseError tokens if attrValue is not a valid IRI.

    attrValue is checked with rfc3987.isValidIRI.  When it is rejected,
    two error tokens are yielded: first one carrying the specific error
    code returned by the check, then a generic "invalid-attribute-value".
    Nothing is yielded for a valid value.  ``token`` is accepted for
    signature compatibility but is not used here.
    """
    isValid, errorCode = rfc3987.isValidIRI(attrValue)
    if not isValid:
        # Specific reason first, then the generic attribute-level error.
        for code in (errorCode, "invalid-attribute-value"):
            yield {"type": "ParseError",
                   "data": code,
                   "datavars": {"tagName": tagName,
                                "attributeName": attrName}}
366+
367+
def checkID(self, token, tagName, attrName, attrValue):
343368
if not attrValue:
344369
yield {"type": "ParseError",
345370
"data": "attribute-value-can-not-be-blank",
@@ -509,7 +534,7 @@ def validateAttributeValueLang(self, token, tagName, attrName, attrValue):
509534
"attributeValue": attrValue}}
510535

511536
def validateAttributeValueContextmenu(self, token, tagName, attrName, attrValue):
    """Validate a contextmenu attribute value as an ID reference.

    Re-yields every error token produced by self.checkID for the value,
    then appends the token to self.thingsThatPointToAnID.
    """
    for t in self.checkID(token, tagName, attrName, attrValue) or []:
        yield t
    # Recorded even when the value had errors; the append runs once the
    # generator has been fully consumed.
    self.thingsThatPointToAnID.append(token)
514539

515540
def validateAttributeValueId(self, token, tagName, attrName, attrValue):
@@ -518,7 +543,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
518543
# later check 1) whether an ID is duplicated, and 2) whether all the
519544
# things that point to something else by ID (like <label for> or
520545
# <span contextmenu>) point to an ID that actually exists somewhere.
521-
for t in self.checkIDValue(token, tagName, attrName, attrValue) or []: yield t
546+
for t in self.checkID(token, tagName, attrName, attrValue) or []: yield t
522547
if not attrValue: return
523548
if attrValue in self.IDsWeHaveKnownAndLoved:
524549
yield {"type": "ParseError",
@@ -548,7 +573,9 @@ def validateAttributeValueBaseHref(self, token, tagName, attrName, attrValue):
548573
# XXX
549574
pass
550575

576+
# Attribute validators that simply reuse a generic checker.  Rebinding
# validateAttributeValueBaseHref here supersedes the placeholder stub
# defined just above.
validateAttributeValueBaseHref = checkIRI
validateAttributeValueBaseTarget = checkBrowsingContext
validateAttributeValueLinkHref = checkIRI
552579

553580
##########################################################################
554581
# Whole document validation (IDs, etc.)

0 commit comments

Comments
 (0)