Skip to content

Commit 66c4f3f

Browse files
maxking authored and warsaw committed
bpo-21315: Fix parsing of encoded words with missing leading ws. (#13425)
* bpo-21315: Fix parsing of encoded words with missing leading ws. Because of the missing leading whitespace, an encoded word would get parsed as an unstructured token. This patch fixes that by looking for encoded words when splitting tokens on whitespace. Missing trailing whitespace after an encoded word now registers a defect instead. Original patch suggestion by David R. Murray on bpo-21315.
1 parent 142566c commit 66c4f3f

File tree

4 files changed

+49
-3
lines changed

4 files changed

+49
-3
lines changed

Lib/email/_header_value_parser.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@
9696
def quote_string(value):
9797
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
9898

99+
# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
100+
rfc2047_matcher = re.compile(r'''
101+
=\? # literal =?
102+
[^?]* # charset
103+
\? # literal ?
104+
[qQbB] # literal 'q' or 'b', case insensitive
105+
\? # literal ?
106+
.*? # encoded word
107+
\?= # literal ?=
108+
''', re.VERBOSE | re.MULTILINE)
109+
110+
99111
#
100112
# TokenList and its subclasses
101113
#
@@ -1052,6 +1064,10 @@ def get_encoded_word(value):
10521064
_validate_xtext(vtext)
10531065
ew.append(vtext)
10541066
text = ''.join(remainder)
1067+
# Encoded words should be followed by a WS
1068+
if value and value[0] not in WSP:
1069+
ew.defects.append(errors.InvalidHeaderDefect(
1070+
"missing trailing whitespace after encoded-word"))
10551071
return ew, value
10561072

10571073
def get_unstructured(value):
@@ -1104,6 +1120,11 @@ def get_unstructured(value):
11041120
unstructured.append(token)
11051121
continue
11061122
tok, *remainder = _wsp_splitter(value, 1)
1123+
# Split in the middle of an atom if there is an RFC 2047 encoded word
1124+
# which does not have WSP on both sides. The defect will be registered
1125+
# the next time through the loop.
1126+
if rfc2047_matcher.search(tok):
1127+
tok, *remainder = value.partition('=?')
11071128
vtext = ValueTerminal(tok, 'vtext')
11081129
_validate_xtext(vtext)
11091130
unstructured.append(vtext)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
118118
'=?us-ascii?q?first?==?utf-8?q?second?=',
119119
'first',
120120
'first',
121-
[],
121+
[errors.InvalidHeaderDefect],
122122
'=?utf-8?q?second?=')
123123

124124
def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
361361
'=?utf-8?q?foo?==?utf-8?q?bar?=',
362362
'foobar',
363363
'foobar',
364+
[errors.InvalidHeaderDefect,
365+
errors.InvalidHeaderDefect],
366+
'')
367+
368+
def test_get_unstructured_ew_without_leading_whitespace(self):
369+
self._test_get_x(
370+
self._get_unst,
371+
'nowhitespace=?utf-8?q?somevalue?=',
372+
'nowhitespacesomevalue',
373+
'nowhitespacesomevalue',
374+
[errors.InvalidHeaderDefect],
375+
'')
376+
377+
def test_get_unstructured_ew_without_trailing_whitespace(self):
378+
self._test_get_x(
379+
self._get_unst,
380+
'=?utf-8?q?somevalue?=nowhitespace',
381+
'somevaluenowhitespace',
382+
'somevaluenowhitespace',
364383
[errors.InvalidHeaderDefect],
365384
'')
366385

@@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
546565
'"=?utf-8?Q?not_really_valid?="',
547566
'"not really valid"',
548567
'not really valid',
549-
[errors.InvalidHeaderDefect],
568+
[errors.InvalidHeaderDefect,
569+
errors.InvalidHeaderDefect],
550570
'')
551571

552572
# get_comment

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
11801180

11811181
'rfc2047_atom_in_quoted_string_is_decoded':
11821182
('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
1183-
[errors.InvalidHeaderDefect],
1183+
[errors.InvalidHeaderDefect,
1184+
errors.InvalidHeaderDefect],
11841185
'Éric <foo@example.com>',
11851186
'Éric',
11861187
'foo@example.com',
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Email headers containing RFC 2047 encoded words are now parsed despite missing
2+
leading whitespace, and a defect is registered. Also, missing trailing whitespace
3+
after encoded words is now registered as a defect.
4+

0 commit comments

Comments
 (0)