Skip to content

gh-93376: Allow override of mbox From matching #131415

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
18 changes: 17 additions & 1 deletion Doc/library/mailbox.rst
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
^^^^^^^^^^^^^^^^^^^^^^


.. class:: mbox(path, factory=None, create=True)
.. class:: mbox(path, factory=None, create=True, from_matcher=None)

A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory*
is a callable object that accepts a file-like message representation (which
Expand All @@ -575,6 +575,22 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
messages in an mbox mailbox are stored in a single file with the beginning of
each message indicated by a line whose first five characters are "From ".

The parameter *from_matcher* can be used to override this default by providing
a callable that takes the line (a :class:`bytes` object) as its sole argument
and returns a true value if the line marks the start of a message.
The default matcher is ``lambda line: line.startswith(b'From ')``.
A stricter matcher might be:
``lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line)``.

One alternate matcher is included:

- ``'full'``: this matches the syntax ``From <sender> <asctime>[ info]``.
  The ``asctime`` field must match the standard syntax, i.e. the fixed-length (24-character) string
  ``(Mon|...|Sun) (Jan|...|Dec) ( |d)d hh:mm:ss yyyy``.
  The day-of-month field can have a leading space instead of a leading ``0``.
  The month and day-of-week fields are always in English.

A custom matcher function can be useful when the body text contains
unquoted "From " lines. In such cases, it may help to check that the year (and month)
have the values expected for the mbox; any other "From " lines are likely to be unquoted body text.

Several variations of the mbox format exist to address perceived shortcomings in
the original. In the interest of compatibility, :class:`!mbox` implements the
original format, which is sometimes referred to as :dfn:`mboxo`. This means that
Expand Down
14 changes: 5 additions & 9 deletions Lib/graphlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,17 +90,13 @@ def prepare(self):
still be used to obtain as many nodes as possible until cycles block more
progress. After a call to this function, the graph cannot be modified and
therefore no more nodes can be added using "add".

Raise ValueError if nodes have already been passed out of the sorter.

"""
if self._npassedout > 0:
raise ValueError("cannot prepare() after starting sort")
if self._ready_nodes is not None:
raise ValueError("cannot prepare() more than once")

if self._ready_nodes is None:
self._ready_nodes = [
i.node for i in self._node2info.values() if i.npredecessors == 0
]
self._ready_nodes = [
i.node for i in self._node2info.values() if i.npredecessors == 0
]
# ready_nodes is set before we look for cycles on purpose:
# if the user wants to catch the CycleError, that's fine,
# they can continue using the instance to grab as many
Expand Down
20 changes: 18 additions & 2 deletions Lib/mailbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,15 +895,31 @@ def _install_message(self, message):
class mbox(_mboxMMDF):
"""A classic mbox mailbox."""

# This is the full syntax, i.e. From sender asctime[ moreinfo]
DAY_RE = b' (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
MON_RE = b' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
DTY_RE = b' [ 123]\\d \\d\\d:\\d\\d:\\d\\d \\d{4}' # day, time, year
FULL_RE = b'From \\S+' + DAY_RE + MON_RE + DTY_RE + b'( .+)?' + linesep + b'\\Z'
# we capture the optional moreinfo group so we can check for lines that end in the date

_mangle_from_ = True

# All messages must end in a newline character, and
# _post_message_hooks outputs an empty line between messages.
_append_newline = True

def __init__(self, path, factory=None, create=True):
def __init__(self, path, factory=None, create=True, from_matcher=None):
    """Initialize an mbox mailbox.

    *from_matcher* decides which lines start a new message.  The chosen
    matcher is stored as ``self._from_matcher`` and its result is tested
    for truth.  Accepted values:

    - ``None`` (default): any line starting with ``b'From '``.
    - ``'full'``: the line must match ``FULL_RE``, i.e.
      ``From sender asctime[ moreinfo]``.
    - any callable taking the line as its sole argument.

    Raise TypeError if *from_matcher* is none of the above.
    """
    self._message_factory = mboxMessage
    if from_matcher is None:
        # Original behaviour: every "From " line starts a message.
        self._from_matcher = lambda line: line.startswith(b'From ')
    elif from_matcher == 'full':
        import re
        # Compile once and keep the pattern's bound match method, so the
        # per-line call does not go back through the re module.
        self._from_matcher = re.compile(self.FULL_RE).match
    elif callable(from_matcher):
        self._from_matcher = from_matcher
    else:
        # Fail here rather than with an opaque "object is not callable"
        # error the first time the matcher is used.
        raise TypeError(
            "from_matcher must be None, 'full' or a callable, not %r"
            % (from_matcher,))
    _mboxMMDF.__init__(self, path, factory, create)

def _post_message_hook(self, f):
Expand All @@ -918,7 +934,7 @@ def _generate_toc(self):
while True:
line_pos = self._file.tell()
line = self._file.readline()
if line.startswith(b'From '):
if self._from_matcher(line):
if len(stops) < len(starts):
if last_was_empty:
stops.append(line_pos - len(linesep))
Expand Down
7 changes: 7 additions & 0 deletions Lib/test/test_email/data/mailbox_01.mbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
From MAILER-DAEMON Sun Aug 7 11:40:37 2022 extra info
From: foo
Subject: unquoted From in body; extra info on From line

Hello

From time to time
7 changes: 7 additions & 0 deletions Lib/test/test_email/data/mailbox_02.mbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
From MAILER-DAEMON Sun Aug 7 11:40:37 20220 extra info
From: foo
Subject: unquoted From in body; invalid extra info on From line

Hello

From time to time
14 changes: 1 addition & 13 deletions Lib/test/test_graphlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,21 +140,9 @@ def test_calls_before_prepare(self):
def test_prepare_multiple_times(self):
ts = graphlib.TopologicalSorter()
ts.prepare()
ts.prepare()

def test_prepare_after_pass_out(self):
ts = graphlib.TopologicalSorter({'a': 'bc'})
ts.prepare()
self.assertEqual(set(ts.get_ready()), {'b', 'c'})
with self.assertRaisesRegex(ValueError, r"cannot prepare\(\) after starting sort"):
with self.assertRaisesRegex(ValueError, r"cannot prepare\(\) more than once"):
ts.prepare()

def test_prepare_cycleerror_each_time(self):
ts = graphlib.TopologicalSorter({'a': 'b', 'b': 'a'})
for attempt in range(1, 4):
with self.assertRaises(graphlib.CycleError, msg=f"{attempt=}"):
ts.prepare()

def test_invalid_nodes_in_done(self):
ts = graphlib.TopologicalSorter()
ts.add(1, 2, 3, 4)
Expand Down
67 changes: 67 additions & 0 deletions Lib/test/test_mailbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,6 +1310,73 @@ def test_message_separator(self):
data = f.read()
self.assertEndsWith(data, '0\n\n')

# Test reading an mbox file with un-prefixed From in body text
# currently generates 2 messages
def _test_read_mbox(self, matcher=0, count=2):
    """Write an mbox with an unquoted body "From " line and re-read it.

    *matcher* is forwarded as ``from_matcher`` unless it is left at its
    default of 0, in which case the argument is omitted altogether.
    *count* is the number of messages the re-opened mailbox must report.
    """
    # One real message ...
    self._box.add('From: foo\n\nHello\n')
    # ... followed by raw body text that looks like a second separator.
    self._box._file.write(b'From time to time\n')
    self._box.close()
    # Only pass from_matcher when the caller actually supplied one.
    extra = {} if matcher == 0 else {'from_matcher': matcher}
    self._box = mailbox.mbox(self._path, create=False, **extra)
    found = self._box.keys()
    self.assertEqual(len(found), count)

def test_read_mbox_omitted(self):
self._test_read_mbox()

def test_read_mbox_none(self):
self._test_read_mbox(None)

def test_read_mbox_default(self):
self._test_read_mbox(lambda line: re.match(b'From ', line))

def test_read_mbox_full1(self):
self._test_read_mbox('full', count=1)

def test_read_mbox_regex1(self):
import re
# stricter matching should only find one message
self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line), count=1)

def test_read_mbox_regex2(self):
import re
# invalid, so don't find any messages
self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0)

class TestMboxFromFile(unittest.TestCase):
# test class without default setUp/tearDown which we don't want

def setUp(self):
self._box = None
self._path = None

def tearDown(self):
if self._box is not None:
self._box.close()
# Don't delete it!

def checkmbox(self, name, matcher, count):
self._path = os.path.join(os.path.dirname(__file__), 'test_email', 'data', name)
self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher)
self.assertEqual(len(self._box.keys()), count)

# default matcher finds two messages as there are 2 From lines
def test_read_mbox_None_01(self):
self.checkmbox('mailbox_01.mbox', None, 2)

def test_read_mbox_None_02(self):
self.checkmbox('mailbox_02.mbox', None, 2)

def test_read_mbox_full_01(self):
self.checkmbox('mailbox_01.mbox', 'full', 1)

def test_read_mbox_full_02(self):
self.checkmbox('mailbox_02.mbox', 'full', 0) # From line has extra non-space chars after YYYY

class TestMMDF(_TestMboxMMDF, unittest.TestCase):

Expand Down
1 change: 0 additions & 1 deletion Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -1484,7 +1484,6 @@ Michael Pomraning
Martin Pool
Iustin Pop
Claudiu Popa
Daniel Pope
Nick Pope
John Popplewell
Matheus Vieira Portela
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add the *from_matcher* parameter to :class:`mailbox.mbox`.
This allows the user to override the default matcher (which only looks for a leading "From ") with a
more specific matcher that is less likely to match unquoted "From " lines in body text.
Loading