Add docstrings to every function and class #73

29 changes: 28 additions & 1 deletion benchmark/dictionary.py
@@ -6,11 +6,26 @@


class VulgarTongue(Spec):
"""
Benchmark spec for the VulgarTongue dictionary corpus.

Attributes:
name (str): The name of the dictionary.
filename (str): The filename of the dictionary file.
headline_field (str): The field name for the headline.
"""

name = "dictionary"
filename = "dcvgr10.txt.gz"
headline_field = "head"

def documents(self):
"""
Generator function that yields documents from the dictionary file.

Yields:
dict: A dictionary representing a document with 'head' and 'body' fields.
"""
path = os.path.join(self.options.dir, self.filename)
f = gzip.GzipFile(path)

@@ -28,14 +43,26 @@ def documents(self):
yield {"head": head, "body": head + body}

def whoosh_schema(self):
ana = analysis.StemmingAnalyzer()
"""
Returns the Whoosh schema for the VulgarTongue dictionary.

Returns:
Schema: The Whoosh schema for the dictionary.
"""
ana = analysis.stemming_analyzer()

schema = fields.Schema(
head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True)
)
return schema

def zcatalog_setup(self, cat):
"""
Sets up the ZCatalog indexes for the VulgarTongue dictionary.

Args:
cat (ZCatalog): The ZCatalog instance.
"""
from zcatalog import indexes # type: ignore @UnresolvedImport

cat["head"] = indexes.FieldIndex(field_name="head")
103 changes: 97 additions & 6 deletions benchmark/enron.py
@@ -14,10 +14,12 @@
from whoosh.support.bench import Bench, Spec
from whoosh.util import now

# Benchmark class


class Enron(Spec):
"""
Benchmark spec that downloads, caches, and processes the Enron email archive.
"""

name = "enron"

enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
@@ -40,17 +42,32 @@ class Enron(Spec):

cachefile = None

# Functions for downloading and then reading the email archive and caching
# the messages in an easier-to-digest format

def download_archive(self, archive):
"""
Downloads the Enron email archive from the specified URL and saves it to the given file path.

Args:
archive (str): The file path to save the downloaded archive.

"""
print(f"Downloading Enron email archive to {archive}...")
t = now()
urlretrieve(self.enron_archive_url, archive)
print(f"Downloaded in {now() - t} seconds")

@staticmethod
def get_texts(archive):
"""
Generator function that yields the text content of each email in the given archive.

Args:
archive (str): The file path of the archive.

Yields:
str: The text content of each email.
"""
archive = tarfile.open(archive, "r:gz")
while True:
entry = next(archive)
@@ -64,6 +81,16 @@ def get_texts(archive):

@staticmethod
def get_messages(archive, headers=True):
"""
Generator function that yields the parsed messages from the given email archive.

Args:
archive (str): The file path of the archive.
headers (bool, optional): Whether to include message headers. Defaults to True.

Yields:
dict: The dictionary representation of each message.
"""
header_to_field = Enron.header_to_field
for text in Enron.get_texts(archive):
message = message_from_string(text)
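A quick, hypothetical usage of the generator documented above (not part of the diff): the archive path is a placeholder, the `benchmark.enron` import assumes the benchmark directory is importable as a package, and the available dict keys depend on `Enron.header_to_field`, which is defined outside this hunk.

```python
# Hypothetical usage; the archive path is a placeholder and the "subject"
# key is assumed from the header_to_field mapping.
from benchmark.enron import Enron

for msg in Enron.get_messages("enron_mail_082109.tar.gz"):
    print(msg.get("subject", "<no subject>"))
```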
@@ -83,6 +110,16 @@ def get_messages(archive, headers=True):
yield d

def cache_messages(self, archive, cache):
"""
Caches the messages from the given email archive into a pickle file.

Args:
archive (str): The file path of the archive.
cache (str): The file path to save the cached messages.

Raises:
FileNotFoundError: If the archive file does not exist.
"""
print(f"Caching messages in {cache}...")

if not os.path.exists(archive):
@@ -100,6 +137,9 @@ def cache_messages(self, archive, cache):
print(f"Cached messages in {now() - t} seconds")

def setup(self):
"""
Sets up the Enron email archive by downloading it if necessary and caching the messages.
"""
archive = os.path.abspath(
os.path.join(self.options.dir, self.enron_archive_filename)
)
@@ -116,6 +156,15 @@ def setup(self):
print("Cache is OK")

def documents(self):
"""
Generator function that yields the cached messages from the pickle file.

Yields:
dict: The dictionary representation of each message.

Raises:
FileNotFoundError: If the message cache does not exist.
"""
if not os.path.exists(self.cache_filename):
raise FileNotFoundError("Message cache does not exist, use --setup")
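Because the read loop itself is elided in the next hunk, here is a hedged sketch of the pickle-stream pattern that `cache_messages()` and `documents()` appear to implement: each message dict is appended to a single file with `pickle.dump` and read back with repeated `pickle.load` calls until EOF. The helper names are illustrative, not the benchmark's own.

```python
# Illustrative pickle-stream cache (helper names are hypothetical).
import pickle


def write_cache(messages, cache_path):
    """Append each message dict to one growing pickle file."""
    with open(cache_path, "wb") as f:
        for msg in messages:
            pickle.dump(msg, f, protocol=pickle.HIGHEST_PROTOCOL)


def read_cache(cache_path):
    """Yield message dicts back until the end of the file is reached."""
    with open(cache_path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return
```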

@@ -130,7 +179,13 @@ def documents(self):
f.close()

def whoosh_schema(self):
ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
"""
Returns the Whoosh schema for indexing the Enron email archive.

Returns:
whoosh.fields.Schema: The schema for indexing the emails.
"""
ana = analysis.stemming_analyzer(maxsize=40, cachesize=None)
storebody = self.options.storebody
schema = fields.Schema(
body=fields.TEXT(analyzer=ana, stored=storebody),
@@ -145,6 +200,15 @@ def whoosh_schema(self):
return schema

def xappy_indexer_connection(self, path):
"""
Creates and returns an Xapian indexer connection for indexing the Enron email archive.

Args:
path (str): The path to the Xapian index.

Returns:
xappy.IndexerConnection: The Xapian indexer connection.
"""
conn = xappy.IndexerConnection(path)
conn.add_field_action("body", xappy.FieldActions.INDEX_FREETEXT, language="en")
if self.options.storebody:
@@ -164,6 +228,12 @@ def xappy_indexer_connection(self, path):
return conn

def zcatalog_setup(self, cat):
"""
Sets up the ZCatalog indexes for indexing the Enron email archive.

Args:
cat (zcatalog.catalog.Catalog): The ZCatalog catalog.
"""
from zcatalog import indexes # type: ignore

for name in ("date", "frm"):
@@ -172,12 +242,27 @@ def zcatalog_setup(self, cat):
cat[name] = indexes.TextIndex(field_name=name)

def process_document_whoosh(self, d):
"""
Prepares a document for Whoosh indexing: records the current cache file position and, when --storebody is set, stores a zlib-compressed copy of the main field via the _stored_ override.

Args:
d (dict): The document to process.
"""
d["filepos"] = self.filepos
if self.options.storebody:
mf = self.main_field
d[f"_stored_{mf}"] = compress(d[mf], 9)

def process_result_whoosh(self, d):
"""
Post-processes a Whoosh search hit, decompressing the stored main field back into readable text when present.

Args:
d (dict): The search result.

Returns:
dict: The processed search result.
"""
mf = self.main_field
if mf in d:
d.fields()[mf] = decompress(d[mf])
@@ -191,6 +276,12 @@ def process_result_whoosh(self, d):
return d

def process_document_xapian(self, d):
"""
Prepares a document for Xapian indexing by joining the ordered fields into the single main field.

Args:
d (dict): The document to process.
"""
d[self.main_field] = " ".join([d.get(name, "") for name in self.field_order])

