Add docstrings to every function and class #73

29 changes: 28 additions & 1 deletion benchmark/dictionary.py
@@ -6,11 +6,26 @@


class VulgarTongue(Spec):
"""
Benchmark spec for the VulgarTongue dictionary corpus.

Attributes:
name (str): The name of the dictionary.
filename (str): The filename of the dictionary file.
headline_field (str): The field name for the headline.
"""

name = "dictionary"
filename = "dcvgr10.txt.gz"
headline_field = "head"

def documents(self):
"""
Generator function that yields documents from the dictionary file.

Yields:
dict: A dictionary representing a document with 'head' and 'body' fields.
"""
path = os.path.join(self.options.dir, self.filename)
f = gzip.GzipFile(path)

@@ -28,14 +43,26 @@ def documents(self):
yield {"head": head, "body": head + body}

def whoosh_schema(self):
ana = analysis.StemmingAnalyzer()
"""
Returns the Whoosh schema for the VulgarTongue dictionary.

Returns:
Schema: The Whoosh schema for the dictionary.
"""
ana = analysis.stemming_analyzer()

schema = fields.Schema(
head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True)
)
return schema

def zcatalog_setup(self, cat):
"""
Sets up the ZCatalog indexes for the VulgarTongue dictionary.

Args:
cat (ZCatalog): The ZCatalog instance.
"""
from zcatalog import indexes # type: ignore @UnresolvedImport

cat["head"] = indexes.FieldIndex(field_name="head")
103 changes: 97 additions & 6 deletions benchmark/enron.py
@@ -14,10 +14,12 @@
from whoosh.support.bench import Bench, Spec
from whoosh.util import now

# Benchmark class


class Enron(Spec):
"""
Benchmark spec that downloads, caches, and processes the Enron email archive.
"""

name = "enron"

enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
@@ -40,17 +42,32 @@ class Enron(Spec):

cachefile = None

# Functions for downloading and then reading the email archive and caching
# the messages in an easier-to-digest format

def download_archive(self, archive):
"""
Downloads the Enron email archive from the specified URL and saves it to the given file path.

Args:
archive (str): The file path to save the downloaded archive.

"""
print(f"Downloading Enron email archive to {archive}...")
t = now()
urlretrieve(self.enron_archive_url, archive)
print(f"Downloaded in {now() - t} seconds")

@staticmethod
def get_texts(archive):
"""
Generator function that yields the text content of each email in the given archive.

Args:
archive (str): The file path of the archive.

Yields:
str: The text content of each email.
"""
archive = tarfile.open(archive, "r:gz")
while True:
entry = next(archive)
@@ -64,6 +81,16 @@ def get_texts(archive):

@staticmethod
def get_messages(archive, headers=True):
"""
Generator function that yields the parsed messages from the given email archive.

Args:
archive (str): The file path of the archive.
headers (bool, optional): Whether to include message headers. Defaults to True.

Yields:
dict: The dictionary representation of each message.
"""
header_to_field = Enron.header_to_field
for text in Enron.get_texts(archive):
message = message_from_string(text)
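A quick, hypothetical usage of the generator documented above (not part of the diff): the archive path is a placeholder, the `benchmark.enron` import assumes the benchmark directory is importable as a package, and the available dict keys depend on `Enron.header_to_field`, which is defined outside this hunk.

```python
# Hypothetical usage; the archive path is a placeholder and the "subject"
# key is assumed from the header_to_field mapping.
from benchmark.enron import Enron

for msg in Enron.get_messages("enron_mail_082109.tar.gz"):
    print(msg.get("subject", "<no subject>"))
```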
@@ -83,6 +110,16 @@ def get_messages(archive, headers=True):
yield d

def cache_messages(self, archive, cache):
"""
Caches the messages from the given email archive into a pickle file.

Args:
archive (str): The file path of the archive.
cache (str): The file path to save the cached messages.

Raises:
FileNotFoundError: If the archive file does not exist.
"""
print(f"Caching messages in {cache}...")

if not os.path.exists(archive):
@@ -100,6 +137,9 @@ def cache_messages(self, archive, cache):
print(f"Cached messages in {now() - t} seconds")

def setup(self):
"""
Sets up the Enron email archive by downloading it if necessary and caching the messages.
"""
archive = os.path.abspath(
os.path.join(self.options.dir, self.enron_archive_filename)
)
@@ -116,6 +156,15 @@ def setup(self):
print("Cache is OK")

def documents(self):
"""
Generator function that yields the cached messages from the pickle file.

Yields:
dict: The dictionary representation of each message.

Raises:
FileNotFoundError: If the message cache does not exist.
"""
if not os.path.exists(self.cache_filename):
raise FileNotFoundError("Message cache does not exist, use --setup")
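Because the read loop itself is elided in the next hunk, here is a hedged sketch of the pickle-stream pattern that `cache_messages()` and `documents()` appear to implement: each message dict is appended to a single file with `pickle.dump` and read back with repeated `pickle.load` calls until EOF. The helper names are illustrative, not the benchmark's own.

```python
# Illustrative pickle-stream cache (helper names are hypothetical).
import pickle


def write_cache(messages, cache_path):
    """Append each message dict to one growing pickle file."""
    with open(cache_path, "wb") as f:
        for msg in messages:
            pickle.dump(msg, f, protocol=pickle.HIGHEST_PROTOCOL)


def read_cache(cache_path):
    """Yield message dicts back until the end of the file is reached."""
    with open(cache_path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return
```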

@@ -130,7 +179,13 @@ def documents(self):
f.close()

def whoosh_schema(self):
ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
"""
Returns the Whoosh schema for indexing the Enron email archive.

Returns:
whoosh.fields.Schema: The schema for indexing the emails.
"""
ana = analysis.stemming_analyzer(maxsize=40, cachesize=None)
storebody = self.options.storebody
schema = fields.Schema(
body=fields.TEXT(analyzer=ana, stored=storebody),
@@ -145,6 +200,15 @@ def whoosh_schema(self):
return schema

def xappy_indexer_connection(self, path):
"""
Creates and returns an Xapian indexer connection for indexing the Enron email archive.

Args:
path (str): The path to the Xapian index.

Returns:
xappy.IndexerConnection: The Xapian indexer connection.
"""
conn = xappy.IndexerConnection(path)
conn.add_field_action("body", xappy.FieldActions.INDEX_FREETEXT, language="en")
if self.options.storebody:
@@ -164,6 +228,12 @@ def xappy_indexer_connection(self, path):
return conn

def zcatalog_setup(self, cat):
"""
Sets up the ZCatalog indexes for indexing the Enron email archive.

Args:
cat (zcatalog.catalog.Catalog): The ZCatalog catalog.
"""
from zcatalog import indexes # type: ignore

for name in ("date", "frm"):
@@ -172,12 +242,27 @@ def zcatalog_setup(self, cat):
cat[name] = indexes.TextIndex(field_name=name)

def process_document_whoosh(self, d):
"""
Prepares a document for Whoosh indexing: records the current cache file position and, when --storebody is set, stores a zlib-compressed copy of the main field via the _stored_ override.

Args:
d (dict): The document to process.
"""
d["filepos"] = self.filepos
if self.options.storebody:
mf = self.main_field
d[f"_stored_{mf}"] = compress(d[mf], 9)

def process_result_whoosh(self, d):
"""
Post-processes a Whoosh search hit, decompressing the stored main field back into readable text when present.

Args:
d (dict): The search result.

Returns:
dict: The processed search result.
"""
mf = self.main_field
if mf in d:
d.fields()[mf] = decompress(d[mf])
@@ -191,6 +276,12 @@ def process_result_whoosh(self, d):
return d

def process_document_xapian(self, d):
"""
Prepares a document for Xapian indexing by joining the ordered fields into the single main field.

Args:
d (dict): The document to process.
"""
d[self.main_field] = " ".join([d.get(name, "") for name in self.field_order])

