Skip to content

Commit

Permalink
Attempt to parse multiple messages in mbox and fix MetricsGrimoire#10
Browse files Browse the repository at this point in the history
  • Loading branch information
gpoo committed May 20, 2014
1 parent 1921f7c commit ad87c8f
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 5 deletions.
41 changes: 39 additions & 2 deletions pymlstats/strictmbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,54 @@ def __init__(self, path, factory=None, create=True):
def _generate_toc(self):
"""Generate key-to-(start, stop) table of contents."""
starts, stops = [], []
last_was_from = False
last_was_empty = False
self._file.seek(0)
while True:
line_pos = self._file.tell()
line = self._file.readline()
if line.startswith('From ') and self._strict_isrealfromline(line):
# A new message seems to start here, but the previous line was also
# taken as the start of a message. We assume the previous one was
# not a new message, but body text matching the same pattern.
if last_was_from:
starts.pop()
stops.pop()
if len(stops) < len(starts):
stops.append(line_pos - len(os.linesep))
if last_was_empty:
stops.append(line_pos - len(os.linesep))
elif not last_was_from:
stops.append(line_pos)
else:
stops.append(line_pos - len(os.linesep))
starts.append(line_pos)
last_was_from = True
last_was_empty = False
elif line == '':
stops.append(line_pos)
if last_was_empty:
stops.append(line_pos - len(os.linesep))
else:
stops.append(line_pos)
last_was_from = False
break
elif line == os.linesep:
if last_was_from:
starts.pop()
stops.pop()
last_was_from = False
last_was_empty = True
else:
# If the previous line looked like the start of a new message and
# it is followed right away by an empty line, then the message
# has no headers. In that case it is not a new message but body
# text with a similar pattern (a false positive for a new message).
if last_was_from and len(line.strip()) == 0:
starts.pop()
stops.pop()

last_was_from = False
last_was_empty = False

self._toc = dict(enumerate(zip(starts, stops)))
self._next_key = len(self._toc)
self._file_length = self._file.tell()
Expand Down
38 changes: 35 additions & 3 deletions pymlstats/tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def get_analyzer(self, path, **kwargs):

def check_single_message(self, expected, messages):
for key, value in expected.items():
output = u"{}:\n" \
output = u"\n{}:\n" \
u"\tExpected: '{}'\n" \
u"\tObtained: '{}'".format(key, value, messages[0][key])
self.assertEqual(value, messages[0][key], output)
Expand All @@ -45,7 +45,7 @@ def test_single_message_no_encoding(self):
expected = {
'body': u'Hi!\n\nA message in English, with a signature '
u'with a different encoding.\n\nregards, G?ran'
u'\n\n\n\n',
u'\n\n\n',
'content-type': None,
'date': '2010-12-01 14:26:40',
'date_tz': '3600',
Expand Down Expand Up @@ -113,7 +113,7 @@ def test_single_message_with_8_bit_encoding(self):
u'> desktop-devel-list mailing list\n'
u'> [email protected]\n'
u'> http://mail.gnome.org/mailman/listinfo/desktop-devel-list'
u'\n\n',
u'\n',
'content-type': u'text/plain; charset=utf-8',
'date': '2008-03-17 11:19:29',
'date_tz': '3600',
Expand Down Expand Up @@ -180,3 +180,35 @@ def test_single_message_with_7_bit_encoding(self):
self.assertEqual(1, len(messages), '# of messages')
self.check_single_message(expected, messages)
self.assertEqual(0, non_parsed, 'non_parsed')

def test_single_message_tricky(self):
    """Check an mbox whose body has 'From ' lines that are not new messages.

    Parses 'mlstats-2007.mbox' and verifies that exactly two messages are
    found, that the first one matches ``expected`` field by field, and
    that no message was left unparsed.
    """
    maa = self.get_analyzer('mlstats-2007.mbox')
    messages, non_parsed = maa.get_messages()
    # NOTE(review): the '[email protected]' values below look like
    # placeholders left by e-mail obfuscation -- confirm them against the
    # fixture before relying on these comparisons.
    expected = {
        'body':
            u'Vaya, olvid? los archivos que testifican todo esto. Ah? van.'
            u'\n\n'
            u'> Libresoft-tools-devel mailing list\n'
            u'> Libresoft-tools-devel at lists.morfeo-project.org\n'
            u'-------------- next part --------------\n'
            u'From erkko.anttila at nokia.com Mon Aug 1 12:51:16 2005\n'
            u'From florian.boor at kernelconcepts.de Mon Aug 1 13:12:02 2005'
            u'\nFrom czr770 at iohazard.tts.fi Mon Aug 1 14:43:49 2005'
            u'\n',
        'content-type': None,
        'date': '2007-02-14 19:46:10',
        'date_tz': '0',
        'in-reply-to': u'<[email protected]>',
        'list-id': None,
        'message-id': u'<[email protected]>',
        'received': None,
        'references': u'<[email protected]>',
        'from': [(u'Jorge Gascon Perez', u'[email protected]')],
        'to': None,
        'cc': None
    }

    # Assert the count first: check_single_message indexes messages[0], so
    # an empty result must fail here with a clear message instead of
    # raising IndexError inside the helper.
    self.assertEqual(2, len(messages), '# of messages')
    self.check_single_message(expected, messages)
    self.assertEqual(0, non_parsed, 'non_parsed')

0 comments on commit ad87c8f

Please sign in to comment.