From ad87c8f16bde7c2b6940389a6faad1c640cf93cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Germ=C3=A1n=20Poo-Caama=C3=B1o?= Date: Wed, 12 Feb 2014 17:02:15 -0800 Subject: [PATCH] Attempt to parse multiple messages in mbox and fix #10 --- pymlstats/strictmbox.py | 41 ++++++++++++++++++++++++++++++-- pymlstats/tests/test_analyzer.py | 38 ++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/pymlstats/strictmbox.py b/pymlstats/strictmbox.py index 56b9a30..7880a92 100644 --- a/pymlstats/strictmbox.py +++ b/pymlstats/strictmbox.py @@ -46,17 +46,54 @@ def __init__(self, path, factory=None, create=True): def _generate_toc(self): """Generate key-to-(start, stop) table of contents.""" starts, stops = [], [] + last_was_from = False + last_was_empty = False self._file.seek(0) while True: line_pos = self._file.tell() line = self._file.readline() if line.startswith('From ') and self._strict_isrealfromline(line): + # There is a new message, but the line before was also + # a new message. We assume that the previous one was + # not a new message, but text with the same pattern. + if last_was_from: + starts.pop() + stops.pop() if len(stops) < len(starts): - stops.append(line_pos - len(os.linesep)) + if last_was_empty: + stops.append(line_pos - len(os.linesep)) + elif not last_was_from: + stops.append(line_pos) + else: + stops.append(line_pos - len(os.linesep)) starts.append(line_pos) + last_was_from = True + last_was_empty = False elif line == '': - stops.append(line_pos) + if last_was_empty: + stops.append(line_pos - len(os.linesep)) + else: + stops.append(line_pos) + last_was_from = False break + elif line == os.linesep: + if last_was_from: + starts.pop() + stops.pop() + last_was_from = False + last_was_empty = True + else: + # If this is a new message and has an empty line right + # after, then the message does not have headers. 
+ # In such a case, it is not a new message but text with + # a similar pattern (false positive for a new message) + if last_was_from and len(line.strip()) == 0: + starts.pop() + stops.pop() + + last_was_from = False + last_was_empty = False + self._toc = dict(enumerate(zip(starts, stops))) self._next_key = len(self._toc) self._file_length = self._file.tell() diff --git a/pymlstats/tests/test_analyzer.py b/pymlstats/tests/test_analyzer.py index da69633..dfd33ac 100644 --- a/pymlstats/tests/test_analyzer.py +++ b/pymlstats/tests/test_analyzer.py @@ -34,7 +34,7 @@ def get_analyzer(self, path, **kwargs): def check_single_message(self, expected, messages): for key, value in expected.items(): - output = u"{}:\n" \ + output = u"\n{}:\n" \ u"\tExpected: '{}'\n" \ u"\tObtained: '{}'".format(key, value, messages[0][key]) self.assertEqual(value, messages[0][key], output) @@ -45,7 +45,7 @@ def test_single_message_no_encoding(self): expected = { 'body': u'Hi!\n\nA message in English, with a signature ' u'with a different encoding.\n\nregards, G?ran' - u'\n\n\n\n', + u'\n\n\n', 'content-type': None, 'date': '2010-12-01 14:26:40', 'date_tz': '3600', @@ -113,7 +113,7 @@ def test_single_message_with_8_bit_encoding(self): u'> desktop-devel-list mailing list\n' u'> desktop-devel-list@gnome.org\n' u'> http://mail.gnome.org/mailman/listinfo/desktop-devel-list' - u'\n\n', + u'\n', 'content-type': u'text/plain; charset=utf-8', 'date': '2008-03-17 11:19:29', 'date_tz': '3600', @@ -180,3 +180,35 @@ def test_single_message_with_7_bit_encoding(self): self.assertEqual(1, len(messages), '# of messages') self.check_single_message(expected, messages) self.assertEqual(0, non_parsed, 'non_parsed') + + def test_single_message_tricky(self): + '''Multiple From's that are not new messages''' + maa = self.get_analyzer('mlstats-2007.mbox') + messages, non_parsed = maa.get_messages() + expected = { + 'body': + u'Vaya, olvid? los archivos que testifican todo esto. Ah? van.' 
+ u'\n\n' + u'> Libresoft-tools-devel mailing list\n' + u'> Libresoft-tools-devel at lists.morfeo-project.org\n' + u'-------------- next part --------------\n' + u'From erkko.anttila at nokia.com Mon Aug 1 12:51:16 2005\n' + u'From florian.boor at kernelconcepts.de Mon Aug 1 13:12:02 2005' + u'\nFrom czr770 at iohazard.tts.fi Mon Aug 1 14:43:49 2005' + u'\n', + 'content-type': None, + 'date': '2007-02-14 19:46:10', + 'date_tz': '0', + 'in-reply-to': u'<200702142044.30890.jgascon@gsyc.escet.urjc.es>', + 'list-id': None, + 'message-id': u'<200702142047.46199.jgascon@gsyc.escet.urjc.es>', + 'received': None, + 'references': u'<200702142044.30890.jgascon@gsyc.escet.urjc.es>', + 'from': [(u'Jorge Gascon Perez', u'jgascon@gsyc.escet.urjc.es')], + 'to': None, + 'cc': None + } + + self.check_single_message(expected, messages) + self.assertEqual(2, len(messages), '# of messages') + self.assertEqual(0, non_parsed, 'non_parsed')