From 198fe3824bb46c4913ea92358ac9ecf1c8fbcd7f Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Sat, 10 Feb 2024 16:10:10 -0700 Subject: [PATCH 1/4] Add docstrings to every function and class providing information on how it works and how it should be used. --- benchmark/dictionary.py | 29 +- benchmark/enron.py | 103 +- benchmark/marc21.py | 306 +++- benchmark/reuters.py | 36 +- docs/source/analysis.rst | 6 +- docs/source/api/analysis.rst | 20 +- docs/source/batch.rst | 2 +- docs/source/highlight.rst | 16 +- docs/source/ngrams.rst | 2 +- docs/source/recipes.rst | 6 +- docs/source/releases/1_0.rst | 6 +- docs/source/releases/2_0.rst | 2 +- docs/source/schema.rst | 8 +- docs/source/spelling.rst | 2 +- docs/source/stemming.rst | 12 +- scripts/make_checkpoint.py | 24 +- scripts/read_checkpoint.py | 15 +- src/whoosh/analysis/__init__.py | 22 +- src/whoosh/analysis/acore.py | 104 +- src/whoosh/analysis/analyzers.py | 330 +++- src/whoosh/analysis/filters.py | 535 ++++-- src/whoosh/analysis/intraword.py | 138 +- src/whoosh/analysis/morph.py | 425 ++++- src/whoosh/analysis/ngrams.py | 160 +- src/whoosh/analysis/tokenizers.py | 240 ++- src/whoosh/automata/fsa.py | 1461 +++++++++++++++- src/whoosh/automata/fst.py | 1435 ++++++++++++++-- src/whoosh/automata/glob.py | 34 + src/whoosh/automata/lev.py | 12 + src/whoosh/automata/reg.py | 111 ++ src/whoosh/codec/__init__.py | 15 + src/whoosh/codec/base.py | 1630 +++++++++++++++++- src/whoosh/codec/memory.py | 688 +++++++- src/whoosh/codec/plaintext.py | 754 +++++++- src/whoosh/codec/whoosh2.py | 2656 ++++++++++++++++++++++++++++- src/whoosh/codec/whoosh3.py | 1361 ++++++++++++++- src/whoosh/fields.py | 86 +- src/whoosh/filedb/compound.py | 427 ++++- src/whoosh/filedb/fileindex.py | 487 +++++- src/whoosh/filedb/filepostings.py | 448 ++++- src/whoosh/filedb/filereading.py | 489 +++++- src/whoosh/filedb/filestore.py | 1213 ++++++++++++- src/whoosh/filedb/filetables.py | 1410 ++++++++++++++- src/whoosh/filedb/filewriting.py | 315 +++- src/whoosh/filedb/gae.py | 417 +++++ src/whoosh/filedb/misc.py | 35 + src/whoosh/filedb/pools.py | 590 +++++-- src/whoosh/filedb/structfile.py | 1354 ++++++++++++++- src/whoosh/highlight.py | 24 +- src/whoosh/legacy.py | 4 +- src/whoosh/qparser/dateparse.py | 2 + src/whoosh/query/terms.py | 2 +- src/whoosh/support/base85.py | 82 +- src/whoosh/support/bench.py | 847 ++++++++- src/whoosh/support/bitstream.py | 44 +- src/whoosh/support/bitvector.py | 453 ++--- src/whoosh/support/charset.py | 28 +- src/whoosh/support/pyparsing.py | 229 ++- src/whoosh/util/__init__.py | 127 +- tests/test_analysis.py | 12 +- tests/test_classify.py | 4 +- tests/test_codecs.py | 8 +- tests/test_fields.py | 4 +- tests/test_highlighting.py | 16 +- tests/test_indexing.py | 8 +- tests/test_parse_plugins.py | 8 +- tests/test_parsing.py | 8 +- tests/test_postings.py | 2 +- tests/test_results.py | 4 +- tests/test_searching.py | 4 +- tests/test_spans.py | 10 +- tests/test_spelling.py | 6 +- tests/test_writing.py | 4 +- 73 files changed, 20366 insertions(+), 1551 deletions(-) diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py index d5855f4a..5cacf071 100644 --- a/benchmark/dictionary.py +++ b/benchmark/dictionary.py @@ -6,11 +6,26 @@ class VulgarTongue(Spec): + """ + A class representing a VulgarTongue dictionary. + + Attributes: + name (str): The name of the dictionary. + filename (str): The filename of the dictionary file. + headline_field (str): The field name for the headline. 
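+
+    Example (illustrative; the benchmark specs in this package are normally
+    driven through the ``Bench`` harness from ``whoosh.support.bench``):
+        Bench().run(VulgarTongue)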
+ """ + name = "dictionary" filename = "dcvgr10.txt.gz" headline_field = "head" def documents(self): + """ + Generator function that yields documents from the dictionary file. + + Yields: + dict: A dictionary representing a document with 'head' and 'body' fields. + """ path = os.path.join(self.options.dir, self.filename) f = gzip.GzipFile(path) @@ -28,7 +43,13 @@ def documents(self): yield {"head": head, "body": head + body} def whoosh_schema(self): - ana = analysis.StemmingAnalyzer() + """ + Returns the Whoosh schema for the VulgarTongue dictionary. + + Returns: + Schema: The Whoosh schema for the dictionary. + """ + ana = analysis.stemming_analyzer() schema = fields.Schema( head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True) @@ -36,6 +57,12 @@ def whoosh_schema(self): return schema def zcatalog_setup(self, cat): + """ + Sets up the ZCatalog indexes for the VulgarTongue dictionary. + + Args: + cat (ZCatalog): The ZCatalog instance. + """ from zcatalog import indexes # type: ignore @UnresolvedImport cat["head"] = indexes.FieldIndex(field_name="head") diff --git a/benchmark/enron.py b/benchmark/enron.py index 38504221..b3167dbb 100644 --- a/benchmark/enron.py +++ b/benchmark/enron.py @@ -14,10 +14,12 @@ from whoosh.support.bench import Bench, Spec from whoosh.util import now -# Benchmark class - class Enron(Spec): + """ + The Enron class provides functionality for downloading, caching, and processing the Enron email archive. + """ + name = "enron" enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz" @@ -40,10 +42,16 @@ class Enron(Spec): cachefile = None - # Functions for downloading and then reading the email archive and caching - # the messages in an easier-to-digest format - def download_archive(self, archive): + """ + Downloads the Enron email archive from the specified URL and saves it to the given file path. + + Args: + archive (str): The file path to save the downloaded archive. + + Raises: + FileNotFoundError: If the archive file does not exist. + """ print(f"Downloading Enron email archive to {archive}...") t = now() urlretrieve(self.enron_archive_url, archive) @@ -51,6 +59,15 @@ def download_archive(self, archive): @staticmethod def get_texts(archive): + """ + Generator function that yields the text content of each email in the given archive. + + Args: + archive (str): The file path of the archive. + + Yields: + str: The text content of each email. + """ archive = tarfile.open(archive, "r:gz") while True: entry = next(archive) @@ -64,6 +81,16 @@ def get_texts(archive): @staticmethod def get_messages(archive, headers=True): + """ + Generator function that yields the parsed messages from the given email archive. + + Args: + archive (str): The file path of the archive. + headers (bool, optional): Whether to include message headers. Defaults to True. + + Yields: + dict: The dictionary representation of each message. + """ header_to_field = Enron.header_to_field for text in Enron.get_texts(archive): message = message_from_string(text) @@ -83,6 +110,16 @@ def get_messages(archive, headers=True): yield d def cache_messages(self, archive, cache): + """ + Caches the messages from the given email archive into a pickle file. + + Args: + archive (str): The file path of the archive. + cache (str): The file path to save the cached messages. + + Raises: + FileNotFoundError: If the archive file does not exist. 
+ """ print(f"Caching messages in {cache}...") if not os.path.exists(archive): @@ -100,6 +137,9 @@ def cache_messages(self, archive, cache): print(f"Cached messages in {now() - t} seconds") def setup(self): + """ + Sets up the Enron email archive by downloading it if necessary and caching the messages. + """ archive = os.path.abspath( os.path.join(self.options.dir, self.enron_archive_filename) ) @@ -116,6 +156,15 @@ def setup(self): print("Cache is OK") def documents(self): + """ + Generator function that yields the cached messages from the pickle file. + + Yields: + dict: The dictionary representation of each message. + + Raises: + FileNotFoundError: If the message cache does not exist. + """ if not os.path.exists(self.cache_filename): raise FileNotFoundError("Message cache does not exist, use --setup") @@ -130,7 +179,13 @@ def documents(self): f.close() def whoosh_schema(self): - ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None) + """ + Returns the Whoosh schema for indexing the Enron email archive. + + Returns: + whoosh.fields.Schema: The schema for indexing the emails. + """ + ana = analysis.stemming_analyzer(maxsize=40, cachesize=None) storebody = self.options.storebody schema = fields.Schema( body=fields.TEXT(analyzer=ana, stored=storebody), @@ -145,6 +200,15 @@ def whoosh_schema(self): return schema def xappy_indexer_connection(self, path): + """ + Creates and returns an Xapian indexer connection for indexing the Enron email archive. + + Args: + path (str): The path to the Xapian index. + + Returns: + xappy.IndexerConnection: The Xapian indexer connection. + """ conn = xappy.IndexerConnection(path) conn.add_field_action("body", xappy.FieldActions.INDEX_FREETEXT, language="en") if self.options.storebody: @@ -164,6 +228,12 @@ def xappy_indexer_connection(self, path): return conn def zcatalog_setup(self, cat): + """ + Sets up the ZCatalog indexes for indexing the Enron email archive. + + Args: + cat (zcatalog.catalog.Catalog): The ZCatalog catalog. + """ from zcatalog import indexes # type: ignore for name in ("date", "frm"): @@ -172,12 +242,27 @@ def zcatalog_setup(self, cat): cat[name] = indexes.TextIndex(field_name=name) def process_document_whoosh(self, d): + """ + Processes a document for indexing with Whoosh. + + Args: + d (dict): The document to process. + """ d["filepos"] = self.filepos if self.options.storebody: mf = self.main_field d[f"_stored_{mf}"] = compress(d[mf], 9) def process_result_whoosh(self, d): + """ + Processes a search result from Whoosh. + + Args: + d (dict): The search result. + + Returns: + dict: The processed search result. + """ mf = self.main_field if mf in d: d.fields()[mf] = decompress(d[mf]) @@ -191,6 +276,12 @@ def process_result_whoosh(self, d): return d def process_document_xapian(self, d): + """ + Processes a document for indexing with Xapian. + + Args: + d (dict): The document to process. + """ d[self.main_field] = " ".join([d.get(name, "") for name in self.field_order]) diff --git a/benchmark/marc21.py b/benchmark/marc21.py index 07fde36f..dbbe3ba0 100644 --- a/benchmark/marc21.py +++ b/benchmark/marc21.py @@ -1,9 +1,9 @@ import fnmatch import logging -import os.path +import os import re -from whoosh import analysis, fields, index, qparser, query, scoring +from whoosh import analysis, fields, index, qparser, scoring from whoosh.util import now log = logging.getLogger(__name__) @@ -21,6 +21,20 @@ def read_file(dbfile, tags=None): + """ + Reads records from a database file. 
+ + Args: + dbfile (file): The file object representing the database file. + tags (list, optional): A list of tags to filter the records. Defaults to None. + + Yields: + tuple: A tuple containing the parsed record and its position in the file. + + Raises: + ValueError: If the length of the record is invalid. + + """ while True: pos = dbfile.tell() first5 = dbfile.read(5) @@ -34,6 +48,23 @@ def read_file(dbfile, tags=None): def read_record(filename, pos, tags=None): + """ + Read a MARC21 record from a file. + + Args: + filename (str): The path to the MARC21 file. + pos (int): The position in the file where the record starts. + tags (list[str], optional): A list of tags to include in the parsed record. + If None, all tags will be included. Defaults to None. + + Returns: + dict: A dictionary representing the parsed MARC21 record. + + Raises: + FileNotFoundError: If the specified file does not exist. + ValueError: If the specified position is invalid. + + """ f = open(filename, "rb") f.seek(pos) first5 = f.read(5) @@ -43,6 +74,32 @@ def read_record(filename, pos, tags=None): def parse_record(data, tags=None): + """ + Parse a MARC21 record from the given data. + + Args: + data (str): The MARC21 record data. + tags (list[str], optional): List of tags to include in the parsed result. If not provided, all tags will be included. + + Returns: + dict: A dictionary representing the parsed MARC21 record, where the keys are the tags and the values are the corresponding data. + + Raises: + AssertionError: If the length of the leader is not equal to LEADER_LEN. + AssertionError: If the dataoffset is not greater than 0. + AssertionError: If the dataoffset is not less than the length of the data. + AssertionError: If the difference between dirend and dirstart is not divisible by DIRECTORY_ENTRY_LEN. + + Example: + data = "..." + tags = ["245", "260"] + result = parse_record(data, tags) + # Returns: + # { + # "245": ["Title"], + # "260": ["Publisher"] + # } + """ leader = data[:LEADER_LEN] assert len(leader) == LEADER_LEN @@ -83,6 +140,16 @@ def parse_record(data, tags=None): def subfield(vs, code): + """ + Extracts the value of a subfield from a list of subfields. + + Parameters: + - vs (list): The list of subfields to search in. + - code (str): The code of the subfield to extract. + + Returns: + - str or None: The value of the subfield if found, None otherwise. + """ for v in vs: if v.startswith(code): return v[1:] @@ -90,14 +157,56 @@ def subfield(vs, code): def joinsubfields(vs): + """ + Joins the subfields of a MARC21 record. + + This function takes a list of subfields and joins them into a single string, + excluding any subfields starting with "6". + + Args: + vs (list): A list of subfields. + + Returns: + str: The joined subfields as a single string. + + Example: + >>> subfields = ['a', 'b', 'c', '6d', 'e'] + >>> joinsubfields(subfields) + 'a b c e' + """ return " ".join(v[1:] for v in vs if v and v[0] != "6") def getfields(d, *tags): + """ + Retrieve the values from a dictionary `d` for the given `tags`. + + Args: + d (dict): The dictionary to retrieve values from. + tags (str): Variable number of tags to retrieve values for. + + Returns: + generator: A generator that yields the values for the given tags. + + Example: + >>> d = {'tag1': 'value1', 'tag2': 'value2', 'tag3': 'value3'} + >>> fields = getfields(d, 'tag1', 'tag3') + >>> list(fields) + ['value1', 'value3'] + """ return (d[tag] for tag in tags if tag in d) def title(d): + """ + Extracts the title from a MARC21 record dictionary. 
+ + Args: + d (dict): The MARC21 record dictionary. + + Returns: + str: The extracted title, or None if no title is found. + """ title = None if "245" in d: svs = d["245"] @@ -110,6 +219,24 @@ def title(d): def isbn(d): + """ + Extracts the ISBN number from the MARC21 record. + + Parameters: + - d (dict): The MARC21 record dictionary. + + Returns: + - str: The extracted ISBN number without hyphens. + + Example: + >>> record = { + ... "020": { + ... "a": "978-0132350884" + ... } + ... } + >>> isbn(record) + '9780132350884' + """ if "020" in d: num = subfield(d["020"], "a") if num: @@ -119,6 +246,18 @@ def isbn(d): def author(d): + """ + Returns the author information from the given dictionary. + + Parameters: + - d (dict): The dictionary containing the MARC21 record. + + Returns: + - str: The author information. + + Raises: + - KeyError: If the dictionary does not contain any author fields (100, 110, or 111). + """ if "100" in d: return joinsubfields(d["100"]) elif "110" in d: @@ -128,6 +267,27 @@ def author(d): def uniform_title(d): + """ + Returns the uniform title from the MARC21 record dictionary. + + Parameters: + - d (dict): The MARC21 record dictionary. + + Returns: + - str: The uniform title. + + Raises: + - None. + + Examples: + >>> record = {"130": ["Uniform Title"]} + >>> uniform_title(record) + 'Uniform Title' + + >>> record = {"240": ["Uniform Title"]} + >>> uniform_title(record) + 'Uniform Title' + """ if "130" in d: return joinsubfields(d["130"]) elif "240" in d: @@ -140,35 +300,139 @@ def uniform_title(d): def subjects(d): + """ + Returns a string containing the joined subfields of the given document's subject fields. + + Parameters: + - d: The document to extract subject fields from. + + Returns: + A string containing the joined subfields of the subject fields. + """ return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields)) def physical(d): + """ + Returns the physical description of a MARC21 record. + + Parameters: + - d (dict): The MARC21 record dictionary. + + Returns: + - str: The physical description of the record. + """ return joinsubfields(d["300"]) def location(d): + """ + Returns the location of a record in the MARC21 format. + + Parameters: + - d (dict): The MARC21 record dictionary. + + Returns: + - str: The location of the record. + """ return joinsubfields(d["852"]) def publisher(d): + """ + Extracts the publisher information from the MARC21 record. + + Args: + d (dict): The MARC21 record dictionary. + + Returns: + str: The publisher information, or None if not found. + """ if "260" in d: return subfield(d["260"], "b") def pubyear(d): + """ + Extracts the publication year from a MARC21 record. + + Args: + d (dict): The MARC21 record dictionary. + + Returns: + str: The publication year, or None if not found. + """ if "260" in d: return subfield(d["260"], "c") def uni(v): + """ + Converts a byte string to a Unicode string. + + Parameters: + v (bytes): The byte string to be converted. + + Returns: + str: The converted Unicode string. + + Raises: + None + + Examples: + >>> uni(b'hello') + 'hello' + >>> uni(None) + '' + """ return "" if v is None else v.decode("utf-8", "replace") # Indexing and searching +def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*.mrc"): + """ + Create an index for MARC21 records. + Args: + basedir (str): The base directory containing the MARC21 files. + ixdir (str): The directory to store the index. + procs (int, optional): The number of processors to use for indexing. Defaults to 4. 
+ limitmb (int, optional): The memory limit per processor in megabytes. Defaults to 128. + multisegment (bool, optional): Whether to use multisegment indexing. Defaults to True. + glob (str, optional): The file pattern to match for indexing. Defaults to "*.mrc". -def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*.mrc"): + Returns: + None + + Raises: + OSError: If the specified `ixdir` directory does not exist and cannot be created. + + Notes: + This function creates an index for MARC21 records using the Whoosh library. It takes the base directory + containing the MARC21 files (`basedir`), the directory to store the index (`ixdir`), and optional parameters + for configuring the indexing process. + + The `procs` parameter specifies the number of processors to use for indexing. By default, it is set to 4. + + The `limitmb` parameter sets the memory limit per processor in megabytes. The default value is 128. + + The `multisegment` parameter determines whether to use multisegment indexing. If set to True (default), the + index will be split into multiple segments for better performance. + + The `glob` parameter specifies the file pattern to match for indexing. By default, it is set to "*.mrc". + + If the specified `ixdir` directory does not exist, it will be created before creating the index. + + The function uses a multi-lingual stop words list for text analysis and defines a schema for the index + containing fields for title, author, subject, file, and position. + + The MARC fields to extract are specified in the `mfields` set. + + The function prints the indexing configuration and starts the indexing process. It creates the index in the + specified `ixdir` directory and uses the Whoosh writer to add documents to the index. + + After indexing is complete, the function returns None. + """ if not os.path.exists(ixdir): os.mkdir(ixdir) @@ -177,7 +441,7 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*. "de la der und le die et en al no von di du da " "del zur ein".split() ) # Schema - ana = analysis.StemmingAnalyzer(stoplist=stoplist) + ana = analysis.stemming_analyzer(stoplist=stoplist) schema = fields.Schema( title=fields.TEXT(analyzer=ana), author=fields.TEXT(phrase=False), @@ -220,6 +484,22 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*. def print_record(no, basedir, filename, pos): + """ + Print the record information. + + Args: + no (int): The record number. + basedir (str): The base directory. + filename (str): The name of the file. + pos (int): The position of the record. + + Returns: + None + + Raises: + None + + """ path = os.path.join(basedir, filename) record = read_record(path, pos) print("% 5d. %s" % (no + 1, title(record))) @@ -232,6 +512,24 @@ def print_record(no, basedir, filename, pos): def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True): + """ + Perform a search on the index using the given query string. + + Args: + qstring (str): The query string to search for. + ixdir (str): The directory path where the index is located. + basedir (str): The base directory path. + limit (int, optional): The maximum number of results to return. Defaults to None. + optimize (bool, optional): Whether to optimize the search. Defaults to True. + scores (bool, optional): Whether to include scores in the search results. Defaults to True. 
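+
+        Example (illustrative; the paths are hypothetical, and the query string
+        is parsed against the "title" field by default):
+            search("whale", "/path/to/index", "/path/to/marc_files", limit=10)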
+ + Returns: + None + + Raises: + None + + """ ix = index.open_dir(ixdir) qp = qparser.QueryParser("title", ix.schema) q = qp.parse(qstring) diff --git a/benchmark/reuters.py b/benchmark/reuters.py index dde05363..ba0b1ff7 100644 --- a/benchmark/reuters.py +++ b/benchmark/reuters.py @@ -7,14 +7,30 @@ class Reuters(Spec): + """ + The Reuters class represents a benchmark for the Reuters dataset. + + Attributes: + name (str): The name of the benchmark. + filename (str): The name of the file containing the dataset. + main_field (str): The main field in the dataset. + headline_text (str): The field representing the headline text in the dataset. + """ + name = "reuters" filename = "reuters21578.txt.gz" main_field = "text" headline_text = "headline" def whoosh_schema(self): - # ana = analysis.StemmingAnalyzer() - ana = analysis.StandardAnalyzer() + """ + Returns the schema for the Whoosh index. + + Returns: + Schema: The schema for the Whoosh index. + """ + # ana = analysis.stemming_analyzer() + ana = analysis.standard_analyzer() schema = fields.Schema( id=fields.ID(stored=True), headline=fields.STORED, @@ -23,6 +39,12 @@ def whoosh_schema(self): return schema def zcatalog_setup(self, cat): + """ + Sets up the ZCatalog index. + + Args: + cat (ZCatalog): The ZCatalog instance to set up. + """ from zcatalog import indexes # type: ignore @UnresolvedImport cat["id"] = indexes.FieldIndex(field_name="id") @@ -30,12 +52,18 @@ def zcatalog_setup(self, cat): cat["body"] = indexes.TextIndex(field_name="text") def documents(self): + """ + Generates documents from the dataset. + + Yields: + dict: A document from the dataset. + """ path = os.path.join(self.options.dir, self.filename) f = gzip.GzipFile(path) for line in f: - id, text = line.decode("latin1").split("\t") - yield {"id": id, "text": text, "headline": text[:70]} + id_var, text = line.decode("latin1").split("\t") + yield {"id": id_var, "text": text, "headline": text[:70]} if __name__ == "__main__": diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index ebbb72a9..ca6457a8 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -85,7 +85,7 @@ Using analyzers When you create a field in a schema, you can specify your analyzer as a keyword argument to the field object:: - schema = Schema(content=TEXT(analyzer=StemmingAnalyzer())) + schema = Schema(content=TEXT(analyzer=stemming_analyzer())) Advanced Analysis @@ -276,8 +276,8 @@ be removed from the stream or left in. :: - >>> from whoosh.analysis import StandardAnalyzer - >>> analyzer = StandardAnalyzer() + >>> from whoosh.analysis import standard_analyzer + >>> analyzer = standard_analyzer() >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")] [(u'test', False)] >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)] diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst index 81805618..b8da7b7a 100644 --- a/docs/source/api/analysis.rst +++ b/docs/source/api/analysis.rst @@ -7,16 +7,16 @@ Analyzers ========= -.. autofunction:: IDAnalyzer -.. autofunction:: KeywordAnalyzer -.. autofunction:: RegexAnalyzer -.. autofunction:: SimpleAnalyzer -.. autofunction:: StandardAnalyzer -.. autofunction:: StemmingAnalyzer -.. autofunction:: FancyAnalyzer -.. autofunction:: NgramAnalyzer -.. autofunction:: NgramWordAnalyzer -.. autofunction:: LanguageAnalyzer +.. autofunction:: id_analyzer +.. autofunction:: keyword_analyzer +.. autofunction:: regex_analyzer +.. autofunction:: simple_analyzer +.. autofunction:: standard_analyzer +.. 
autofunction:: stemming_analyzer +.. autofunction:: fancy_analyzer +.. autofunction:: ngram_analyzer +.. autofunction:: ngram_word_analyzer +.. autofunction:: language_analyzer Tokenizers diff --git a/docs/source/batch.rst b/docs/source/batch.rst index b8a741f0..6f749d7f 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -13,7 +13,7 @@ of documents at once (batch indexing). The following settings and alternate workflows can make batch indexing faster. -StemmingAnalyzer cache +stemming_analyzer cache ====================== The stemming analyzer by default uses a least-recently-used (LRU) cache to limit diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst index bc266c8c..6313263e 100644 --- a/docs/source/highlight.rst +++ b/docs/source/highlight.rst @@ -15,8 +15,8 @@ The highlighting system works as a pipeline, with four component types. * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order - they appear in the document (FIRST) or show higher-scoring fragments first - (SCORE) + they appear in the document (first) or show higher-scoring fragments first + (score) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. @@ -199,19 +199,19 @@ fragments with lower values appear before fragments with higher values). The ``highlight`` module has the following order functions. -``FIRST`` (the default) +``first`` (the default) Show fragments in the order they appear in the document. -``SCORE`` +``score`` Show highest scoring fragments first. -The ``highlight`` module also includes ``LONGER`` (longer fragments first) and -``SHORTER`` (shorter fragments first), but they probably aren't as generally +The ``highlight`` module also includes ``longer`` (longer fragments first) and +``shorter`` (shorter fragments first), but they probably aren't as generally useful. To use a different order:: - results.order = highlight.SCORE + results.order = highlight.score Formatter @@ -371,7 +371,7 @@ an analyzer:: from whoosh.highlight import highlight excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3, - scorer=BasicFragmentScorer, minscore=1, order=FIRST) + scorer=BasicFragmentScorer, minscore=1, order=first) ``text`` The original text of the document. diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst index 56bfe22f..1eeac448 100644 --- a/docs/source/ngrams.rst +++ b/docs/source/ngrams.rst @@ -33,7 +33,7 @@ separation. :: - >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) + >>> my_analyzer = standard_analyzer() | NgramFilter(minsize=2, maxsize=4) >>> [token.text for token in my_analyzer(u"rendering shaders")] [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst index 29c9571b..8ef1d21e 100644 --- a/docs/source/recipes.rst +++ b/docs/source/recipes.rst @@ -61,17 +61,17 @@ Find every document iTunes-style search-as-you-type ------------------------------- -Use the :class:` whoosh.analysis.NgramWordAnalyzer` as the analyzer for the +Use the :class:` whoosh.analysis.ngram_word_analyzer` as the analyzer for the field you want to search as the user types. 
You can save space in the index by turning off positions in the field using ``phrase=False``, since phrase searching on N-gram fields usually doesn't make much sense:: # For example, to search the "title" field as the user types - analyzer = analysis.NgramWordAnalyzer() + analyzer = analysis.ngram_word_analyzer() title_field = fields.TEXT(analyzer=analyzer, phrase=False) schema = fields.Schema(title=title_field) -See the documentation for the :class:`~ whoosh.analysis.NgramWordAnalyzer` class +See the documentation for the :class:`~ whoosh.analysis.ngram_word_analyzer` class for information on the available options. diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst index 08887f53..7b66e063 100644 --- a/docs/source/releases/1_0.rst +++ b/docs/source/releases/1_0.rst @@ -77,14 +77,14 @@ analyzer as the inverted index. Alternatively, you can pass a Format subclass and Whoosh will instantiate it for you. For example, to store term vectors using the same settings as the inverted -index (Positions format and StandardAnalyzer):: +index (Positions format and standard_analyzer):: from whoosh.fields import Schema, TEXT schema = Schema(content=TEXT(vector=True)) To store term vectors that use the same analyzer as the inverted index -(StandardAnalyzer by default) but only store term frequency:: +(standard_analyzer by default) but only store term frequency:: from whoosh.formats import Frequency @@ -351,7 +351,7 @@ Fixed bug where files could be deleted before a reader could open them in threaded situations. New :class:` whoosh.analysis.NgramFilter` filter, -:class:` whoosh.analysis.NgramWordAnalyzer` analyzer, and +:class:` whoosh.analysis.ngram_word_analyzer` analyzer, and :class:` whoosh.fields.NGRAMWORDS` field type allow producing n-grams from tokenized text. diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst index 20569387..158feaa2 100644 --- a/docs/source/releases/2_0.rst +++ b/docs/source/releases/2_0.rst @@ -46,7 +46,7 @@ Whoosh 2.5 * Whoosh now includes pure-Python implementations of the Snowball stemmers and stop word lists for various languages adapted from NLTK. These are available - through the :class:` whoosh.analysis.LanguageAnalyzer` analyzer or through the + through the :class:` whoosh.analysis.language_analyzer` analyzer or through the ``lang=`` keyword argument to the :class:`~ whoosh.fields.TEXT` field. diff --git a/docs/source/schema.rst b/docs/source/schema.rst index 58da2fc7..fbe8a91d 100644 --- a/docs/source/schema.rst +++ b/docs/source/schema.rst @@ -31,9 +31,9 @@ Whoosh provides some useful predefined field types: This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. - ``TEXT`` fields use :class:`~ whoosh.analysis.StandardAnalyzer` by default. To specify a different + ``TEXT`` fields use :class:`~ whoosh.analysis.standard_analyzer` by default. To specify a different analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. - ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. + ``TEXT(analyzer=analysis.stemming_analyzer())``. See :doc:`analysis`. By default, ``TEXT`` fields store position information for each indexed term, to allow you to search for phrases. 
If you don't need to be able to search for @@ -104,12 +104,12 @@ Creating a Schema To create a schema:: from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED - from whoosh.analysis import StemmingAnalyzer + from whoosh.analysis import stemming_analyzer schema = Schema(from_addr=ID(stored=True), to_addr=ID(stored=True), subject=TEXT(stored=True), - body=TEXT(analyzer=StemmingAnalyzer()), + body=TEXT(analyzer=stemming_analyzer()), tags=KEYWORD) If you aren't specifying any constructor keyword arguments to one of the diff --git a/docs/source/spelling.rst b/docs/source/spelling.rst index 36fbb777..84ecbf87 100644 --- a/docs/source/spelling.rst +++ b/docs/source/spelling.rst @@ -37,7 +37,7 @@ However, if you have an analyzer that modifies the indexed words (such as stemming), you can add ``spelling=True`` to a field to have it store separate unmodified versions of the terms for spelling suggestions:: - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema(text=TEXT(analyzer=ana, spelling=True)) You can then use the :meth:` whoosh.searching.Searcher.corrector` method diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst index 0d30b569..7dab76be 100644 --- a/docs/source/stemming.rst +++ b/docs/source/stemming.rst @@ -50,13 +50,13 @@ analyzer chain. >>> [token.text for token in stemmer(stream)] [u"fundament", u"willow"] -The :func:` whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that +The :func:` whoosh.analysis.stemming_analyzer` is a pre-packaged analyzer that combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: from whoosh import fields - from whoosh.analysis import StemmingAnalyzer + from whoosh.analysis import stemming_analyzer - stem_ana = StemmingAnalyzer() + stem_ana = stemming_analyzer() schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), content=TEXT(analyzer=stem_ana)) @@ -170,12 +170,12 @@ text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to ``u'cafe', u'resume', ...``. This is usually the method you'll want to use unless you need to use a charset to tokenize terms:: - from whoosh.analysis import CharsetFilter, StemmingAnalyzer + from whoosh.analysis import CharsetFilter, stemming_analyzer from whoosh import fields from whoosh.support.charset import accent_map # For example, to add an accent-folding filter to a stemming analyzer: - my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) + my_analyzer = stemming_analyzer() | CharsetFilter(accent_map) # To use this analyzer in your schema: my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) @@ -197,7 +197,7 @@ required by ``CharsetTokenizer`` and ``CharsetFilter``:: from whoosh.analysis import CharsetFilter from whoosh.support.charset import default_charset, charset_table_to_dict charmap = charset_table_to_dict(default_charset) - my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) + my_analyzer = stemming_analyzer() | CharsetFilter(charmap) (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) diff --git a/scripts/make_checkpoint.py b/scripts/make_checkpoint.py index 9c1b818b..da900fb9 100644 --- a/scripts/make_checkpoint.py +++ b/scripts/make_checkpoint.py @@ -1,8 +1,28 @@ #!python -# Make a "checkpoint" index, capturing the index format created by a certain -# version of Whoosh +""" +This script creates a "checkpoint" index using the Whoosh library. 
The checkpoint index captures the index format created by a certain version of Whoosh. +Usage: make_checkpoint.py + +Parameters: + (str): The directory where the checkpoint index will be created. + +The script generates a checkpoint index with the following fields: +- path: A unique identifier for each document. +- num: An integer field stored in the index. +- frac: A float field stored in the index. +- dt: A datetime field stored in the index. +- tag: A keyword field. +- title: A text field stored in the index. +- ngrams: A field for generating n-grams from the title. + +The script creates a directory if it doesn't exist and initializes the index with the specified schema. It then adds documents to the index with randomly generated data. The number of documents and the data for each document are determined by the loop iterations. + +Finally, the script deletes specific documents from the index and prints the total number of documents in the index. + +Note: The Whoosh library must be installed in order to run this script. +""" import os.path import random diff --git a/scripts/read_checkpoint.py b/scripts/read_checkpoint.py index d8a9d77c..7f5d2fa8 100644 --- a/scripts/read_checkpoint.py +++ b/scripts/read_checkpoint.py @@ -1,7 +1,20 @@ #!python -# Read a "checkpoint" index, to check backwards compatibility +""" +This script reads a "checkpoint" index to check for backwards compatibility. +The script takes a directory path as a command-line argument and reads the checkpoint index located in that directory. +It then performs various checks on the index to ensure its integrity and compatibility. + +Usage: read_checkpoint.py + +Args: + (str): The directory path where the checkpoint index is located. + +Example: + $ python read_checkpoint.py /path/to/index + +""" import sys diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py index 0c116bf6..d76c3c6e 100644 --- a/src/whoosh/analysis/__init__.py +++ b/src/whoosh/analysis/__init__.py @@ -45,7 +45,7 @@ generator. * Analyzers are convenience functions/classes that "package up" a tokenizer and - zero or more filters into a single unit. For example, the StandardAnalyzer + zero or more filters into a single unit. For example, the standard_analyzer combines a RegexTokenizer, LowercaseFilter, and StopFilter. Every analyzer is a callable that takes a string and returns a token @@ -69,14 +69,14 @@ ) from whoosh.analysis.analyzers import ( Analyzer, - FancyAnalyzer, - IDAnalyzer, - KeywordAnalyzer, - LanguageAnalyzer, - RegexAnalyzer, - SimpleAnalyzer, - StandardAnalyzer, - StemmingAnalyzer, + fancy_analyzer, + id_analyzer, + keyword_analyzer, + language_analyzer, + regex_analyzer, + simple_analyzer, + standard_analyzer, + stemming_analyzer, ) from whoosh.analysis.filters import ( STOP_WORDS, @@ -103,10 +103,10 @@ ) from whoosh.analysis.morph import DoubleMetaphoneFilter, PyStemmerFilter, StemFilter from whoosh.analysis.ngrams import ( - NgramAnalyzer, NgramFilter, NgramTokenizer, - NgramWordAnalyzer, + ngram_analyzer, + ngram_word_analyzer, ) from whoosh.analysis.tokenizers import ( CharsetTokenizer, diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py index 318f1129..f7ac3949 100644 --- a/src/whoosh/analysis/acore.py +++ b/src/whoosh/analysis/acore.py @@ -29,6 +29,16 @@ class CompositionError(Exception): + """ + Exception raised when there is an error in the composition of analysis components. 
+ + This exception is raised when there is an error in the composition of analysis components, + such as when incompatible components are combined together. + + Attributes: + message -- explanation of the error + """ + pass @@ -36,7 +46,15 @@ class CompositionError(Exception): def unstopped(tokenstream): - """Removes tokens from a token stream where token.stopped = True.""" + """ + Removes tokens from a token stream where token.stopped = True. + + Parameters: + - tokenstream (generator): A generator of tokens. + + Returns: + - generator: A generator of tokens where token.stopped = False. + """ return (t for t in tokenstream if not t.stopped) @@ -48,8 +66,26 @@ def entoken( with the attributes filled in with reasonable values (for example, if ``positions`` or ``chars`` is True, the function assumes each token was separated by one space). - """ + Args: + textstream (Iterable[str]): A sequence of unicode strings. + positions (bool, optional): Whether to include position information in the Token objects. Defaults to False. + chars (bool, optional): Whether to include character information in the Token objects. Defaults to False. + start_pos (int, optional): The starting position for the Token objects. Defaults to 0. + start_char (int, optional): The starting character position for the Token objects. Defaults to 0. + **kwargs: Additional keyword arguments to be passed to the Token objects. + + Yields: + Token: A Token object with the attributes filled in based on the input parameters. + + Examples: + >>> textstream = ["Hello", "world"] + >>> for token in entoken(textstream, positions=True, chars=True): + ... print(token.text, token.pos, token.startchar, token.endchar) + ... + Hello 0 0 5 + world 1 5 10 + """ pos = start_pos char = start_char t = Token(positions=positions, chars=chars, **kwargs) @@ -70,8 +106,6 @@ def entoken( # Token object - - class Token: """ Represents a "token" (usually a word) extracted from the source text being @@ -105,14 +139,18 @@ def __init__( self, positions=False, chars=False, removestops=True, mode="", **kwargs ): """ + Initializes a Token object. + :param positions: Whether tokens should have the token position in the 'pos' attribute. :param chars: Whether tokens should have character offsets in the 'startchar' and 'endchar' attributes. - :param removestops: whether to remove stop words from the stream (if + :param removestops: Whether to remove stop words from the stream (if the tokens pass through a stop filter). - :param mode: contains a string describing the purpose for which the + :param mode: Contains a string describing the purpose for which the analyzer is being called, i.e. 'index' or 'query'. + :param kwargs: Additional keyword arguments to be stored as attributes + of the Token object. """ self.positions = positions @@ -124,10 +162,22 @@ def __init__( self.__dict__.update(kwargs) def __repr__(self): + """ + Returns a string representation of the Token object. + + :return: A string representation of the Token object. + """ + parms = ", ".join(f"{name}={value!r}" for name, value in self.__dict__.items()) return f"{self.__class__.__name__}({parms})" def copy(self): + """ + Creates a copy of the Token object. + + :return: A copy of the Token object. + """ + # This is faster than using the copy module return Token(**self.__dict__) @@ -136,9 +186,37 @@ def copy(self): class Composable: + """ + A base class for composable objects in the analysis pipeline. + + Composable objects can be combined using the '|' operator to create composite analyzers. 
+ + Attributes: + is_morph (bool): Indicates whether the composable object has morphological analysis. + + Methods: + __or__(self, other): Combines the current composable object with another composable object to create a composite analyzer. + __repr__(self): Returns a string representation of the composable object. + has_morph(self): Checks if the composable object has morphological analysis. + + """ + is_morph = False def __or__(self, other): + """ + Combines the current composable object with another composable object to create a composite analyzer. + + Args: + other (Composable): The composable object to be combined with. + + Returns: + CompositeAnalyzer: The composite analyzer created by combining the two composable objects. + + Raises: + TypeError: If the 'other' object is not an instance of Composable. + + """ from whoosh.analysis.analyzers import CompositeAnalyzer if not isinstance(other, Composable): @@ -146,6 +224,13 @@ def __or__(self, other): return CompositeAnalyzer(self, other) def __repr__(self): + """ + Returns a string representation of the composable object. + + Returns: + str: The string representation of the composable object. + + """ attrs = "" if self.__dict__: attrs = ", ".join( @@ -154,4 +239,11 @@ def __repr__(self): return self.__class__.__name__ + f"({attrs})" def has_morph(self): + """ + Checks if the composable object has morphological analysis. + + Returns: + bool: True if the composable object has morphological analysis, False otherwise. + + """ return self.is_morph diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py index 236733d9..45deb597 100644 --- a/src/whoosh/analysis/analyzers.py +++ b/src/whoosh/analysis/analyzers.py @@ -39,16 +39,31 @@ ) from whoosh.lang.porter import stem + # Analyzers +class Analyzer(Composable): + """Abstract base class for analyzers. + An analyzer is responsible for processing text data and producing a stream of tokens. + Subclasses of Analyzer should implement the __call__ method to define the tokenization process. -class Analyzer(Composable): - """Abstract base class for analyzers.""" + Attributes: + None + + Methods: + __repr__: Returns a string representation of the analyzer. + __eq__: Checks if two analyzers are equal. + __call__: Processes the input value and returns a stream of tokens. + clean: Cleans up any resources used by the analyzer. + + """ def __repr__(self): + """Returns a string representation of the analyzer.""" return f"{self.__class__.__name__}()" def __eq__(self, other): + """Checks if two analyzers are equal.""" return ( other and self.__class__ is other.__class__ @@ -56,15 +71,65 @@ def __eq__(self, other): ) def __call__(self, value, **kwargs): + """Processes the input value and returns a stream of tokens. + + Args: + value (str): The input value to be analyzed. + **kwargs: Additional keyword arguments that may be required by specific analyzers. + + Returns: + generator: A generator that yields the tokens produced by the analyzer. + + Raises: + NotImplementedError: If the __call__ method is not implemented by a subclass. + + """ raise NotImplementedError def clean(self): - # This method is intentionally left empty. + """Cleans up any resources used by the analyzer. + + This method is intentionally left empty. + + Args: + None + + Returns: + None + + """ pass class CompositeAnalyzer(Analyzer): + """ + A composite analyzer that combines multiple analyzers and tokenizers into a single analyzer. + + Args: + *composables: Variable number of analyzers and tokenizers to be combined. 
+ + Raises: + CompositionError: If more than one tokenizer is provided at the start of the analyzer. + + Example: + analyzer = CompositeAnalyzer(standard_analyzer(), LowercaseFilter()) + tokens = analyzer("Hello World") + for token in tokens: + print(token) + + """ + def __init__(self, *composables): + """ + Initializes the CompositeAnalyzer. + + Args: + *composables: Variable number of analyzers and tokenizers to be combined. + + Raises: + CompositionError: If more than one tokenizer is provided at the start of the analyzer. + + """ self.items = [] for comp in composables: @@ -73,9 +138,6 @@ def __init__(self, *composables): else: self.items.append(comp) - # Tokenizers must start a chain, and then only filters after that - # (because analyzers take a string and return a generator of tokens, - # and filters take and return generators of tokens) for item in self.items[1:]: if isinstance(item, Tokenizer): raise CompositionError( @@ -83,65 +145,132 @@ def __init__(self, *composables): ) def __repr__(self): + """ + Returns a string representation of the CompositeAnalyzer. + + Returns: + str: String representation of the CompositeAnalyzer. + + """ return "{}({})".format( self.__class__.__name__, ", ".join(repr(item) for item in self.items), ) def __call__(self, value, no_morph=False, **kwargs): + """ + Applies the composite analyzer to the given value and returns a generator of tokens. + + Args: + value (str): The input value to be analyzed. + no_morph (bool, optional): Flag to skip morphological analysis. Defaults to False. + **kwargs: Additional keyword arguments to be passed to the analyzers and tokenizers. + + Returns: + generator: A generator of tokens. + + """ items = self.items - # Start with tokenizer gen = items[0](value, **kwargs) - # Run filters for item in items[1:]: if not (no_morph and hasattr(item, "is_morph") and item.is_morph): gen = item(gen) return gen def __getitem__(self, item): + """ + Returns the item at the specified index. + + Args: + item (int): The index of the item to retrieve. + + Returns: + object: The item at the specified index. + + """ return self.items.__getitem__(item) def __len__(self): + """ + Returns the number of items in the CompositeAnalyzer. + + Returns: + int: The number of items in the CompositeAnalyzer. + + """ return len(self.items) def __eq__(self, other): + """ + Checks if the CompositeAnalyzer is equal to another object. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the CompositeAnalyzer is equal to the other object, False otherwise. + + """ return other and self.__class__ is other.__class__ and self.items == other.items def clean(self): + """ + Cleans up any resources used by the CompositeAnalyzer. + + """ for item in self.items: if hasattr(item, "clean"): item.clean() def has_morph(self): + """ + Checks if the CompositeAnalyzer has any morphological analysis. + + Returns: + bool: True if the CompositeAnalyzer has morphological analysis, False otherwise. + + """ return any(item.is_morph for item in self.items) # Functions that return composed analyzers -def IDAnalyzer(lowercase=False): - """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if - desired. +def id_analyzer(lowercase=False): """ + Returns an analyzer that tokenizes input text into individual tokens using the IDTokenizer. + If lowercase is set to True, it also applies the LowercaseFilter to convert tokens to lowercase. + + Parameters: + - lowercase (bool): Whether to convert tokens to lowercase. Default is False. 
+ + Returns: + - tokenizer (Analyzer): The configured analyzer. + Deprecated: This function is deprecated. It is recommended to use IDTokenizer directly, with a LowercaseFilter if desired. + """ tokenizer = IDTokenizer() if lowercase: tokenizer = tokenizer | LowercaseFilter() return tokenizer -def KeywordAnalyzer(lowercase=False, commas=False): - """Parses whitespace- or comma-separated tokens. +def keyword_analyzer(lowercase=False, commas=False): + """ + Parses whitespace- or comma-separated tokens. + + This analyzer is used to parse whitespace- or comma-separated tokens from a given text. + It can be configured to lowercase the tokens and treat items separated by commas instead of whitespace. - >>> ana = KeywordAnalyzer() + Example usage: + >>> ana = keyword_analyzer() >>> [token.text for token in ana("Hello there, this is a TEST")] ["Hello", "there,", "this", "is", "a", "TEST"] - :param lowercase: whether to lowercase the tokens. - :param commas: if True, items are separated by commas rather than - whitespace. + :param lowercase: A boolean indicating whether to lowercase the tokens. Default is False. + :param commas: A boolean indicating whether items are separated by commas instead of whitespace. Default is False. + :return: A tokenizer object that can be used to tokenize the input text. """ - if commas: tokenizer = CommaSeparatedTokenizer() else: @@ -151,34 +280,51 @@ def KeywordAnalyzer(lowercase=False, commas=False): return tokenizer -def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False): - """Deprecated, just use a RegexTokenizer directly.""" +def regex_analyzer(expression=r"\w+(\.?\w+)*", gaps=False): + r""" + Deprecated, just use a RegexTokenizer directly. + + Args: + expression (str, optional): The regular expression pattern to match. Defaults to r"\w+(\.?\w+)*". + gaps (bool, optional): Whether to split on gaps (non-matching substrings) or matches. Defaults to False. + + Returns: + RegexTokenizer: A tokenizer that tokenizes text using a regular expression pattern. + """ return RegexTokenizer(expression=expression, gaps=gaps) -def SimpleAnalyzer(expression=default_pattern, gaps=False): - """Composes a RegexTokenizer with a LowercaseFilter. +def simple_analyzer(expression=default_pattern, gaps=False): + """ + Composes a RegexTokenizer with a LowercaseFilter. + + This function creates an analyzer that tokenizes text using a regular expression pattern and converts the tokens to lowercase. - >>> ana = SimpleAnalyzer() + Example usage: + >>> ana = simple_analyzer() >>> [token.text for token in ana("Hello there, this is a TEST")] ["hello", "there", "this", "is", "a", "test"] - :param expression: The regular expression pattern to use to extract tokens. - :param gaps: If True, the tokenizer *splits* on the expression, rather - than matching on the expression. + :param expression: The regular expression pattern to use for token extraction. Defaults to `default_pattern`. + :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. Defaults to False. + :return: An analyzer object that tokenizes text using the specified regular expression pattern and converts the tokens to lowercase. """ - return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter() -def StandardAnalyzer( +def standard_analyzer( expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False ): """Composes a RegexTokenizer with a LowercaseFilter and optional StopFilter. 
- >>> ana = StandardAnalyzer() + This analyzer is used to tokenize and filter text into a stream of tokens. + It applies a regular expression pattern to extract tokens, converts them to lowercase, + and optionally removes stop words. + + Example usage: + >>> ana = standard_analyzer() >>> [token.text for token in ana("Testing is testing and testing")] ["testing", "testing", "testing"] @@ -186,11 +332,11 @@ def StandardAnalyzer( :param stoplist: A list of stop words. Set this to None to disable the stop word filter. :param minsize: Words smaller than this are removed from the stream. - :param maxsize: Words longer that this are removed from the stream. + :param maxsize: Words longer than this are removed from the stream. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. + :return: A chain of tokenizers and filters that can be used to analyze text. """ - ret = RegexTokenizer(expression=expression, gaps=gaps) chain = ret | LowercaseFilter() if stoplist is not None: @@ -198,7 +344,7 @@ def StandardAnalyzer( return chain -def StemmingAnalyzer( +def stemming_analyzer( expression=default_pattern, stoplist=STOP_WORDS, minsize=2, @@ -208,26 +354,49 @@ def StemmingAnalyzer( ignore=None, cachesize=50000, ): - """Composes a RegexTokenizer with a lower case filter, an optional stop + r""" + Composes a RegexTokenizer with a lower case filter, an optional stop filter, and a stemming filter. - >>> ana = StemmingAnalyzer() - >>> [token.text for token in ana("Testing is testing and testing")] + Args: + expression (str, optional): The regular expression pattern to use to extract tokens. + stoplist (list, optional): A list of stop words. Set this to None to disable the stop word filter. + minsize (int, optional): Words smaller than this are removed from the stream. + maxsize (int, optional): Words longer that this are removed from the stream. + gaps (bool, optional): If True, the tokenizer *splits* on the expression, rather than matching on the expression. + stemfn (function, optional): The stemming function to use. Defaults to the `stem` function. + ignore (set, optional): A set of words to not stem. + cachesize (int, optional): The maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use. Use None for no cache, or -1 for an unbounded cache. + + Returns: + Analyzer: The composed analyzer. + + Examples: + >>> ana = stemming_analyzer() + >>> [token.text for token in ana("Testing is testing and testing")] + ["test", "test", "test"] + + This function composes an analyzer that tokenizes text using a regular expression pattern, + converts tokens to lowercase, applies an optional stop word filter, and performs stemming + on the tokens. + + The `expression` parameter specifies the regular expression pattern to use for token extraction. + The `stoplist` parameter is a list of stop words to be filtered out. If set to None, the stop word + filter is disabled. The `minsize` and `maxsize` parameters control the minimum and maximum word + lengths to keep in the token stream. The `gaps` parameter determines whether the tokenizer splits + on the expression or matches on it. + + The `stemfn` parameter specifies the stemming function to use. By default, it uses the `stem` function. + The `ignore` parameter is a set of words that should not be stemmed. The `cachesize` parameter sets + the maximum number of stemmed words to cache, improving performance at the cost of memory usage. 
+ + The function returns the composed analyzer, which can be used to process text and extract tokens. + + Example usage: + >>> analyzer = stemming_analyzer(expression=r'\w+', stoplist=['is', 'and'], minsize=3) + >>> [token.text for token in analyzer("Testing is testing and testing")] ["test", "test", "test"] - - :param expression: The regular expression pattern to use to extract tokens. - :param stoplist: A list of stop words. Set this to None to disable - the stop word filter. - :param minsize: Words smaller than this are removed from the stream. - :param maxsize: Words longer that this are removed from the stream. - :param gaps: If True, the tokenizer *splits* on the expression, rather - than matching on the expression. - :param ignore: a set of words to not stem. - :param cachesize: the maximum number of stemmed words to cache. The larger - this number, the faster stemming will be but the more memory it will - use. Use None for no cache, or -1 for an unbounded cache. """ - ret = RegexTokenizer(expression=expression, gaps=gaps) chain = ret | LowercaseFilter() if stoplist is not None: @@ -235,7 +404,7 @@ def StemmingAnalyzer( return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize) -def FancyAnalyzer( +def fancy_analyzer( expression=r"\s+", stoplist=STOP_WORDS, minsize=2, @@ -245,22 +414,36 @@ def FancyAnalyzer( mergewords=False, mergenums=False, ): - """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and - StopFilter. + """ + Composes a fancy_analyzer with a RegexTokenizer, IntraWordFilter, LowercaseFilter, and StopFilter. + + This analyzer tokenizes text using a regular expression pattern, applies intra-word filtering, + converts tokens to lowercase, and removes stop words. - >>> ana = FancyAnalyzer() + Example usage: + >>> ana = fancy_analyzer() >>> [token.text for token in ana("Should I call getInt or get_real?")] ["should", "call", "getInt", "get", "int", "get_real", "get", "real"] - :param expression: The regular expression pattern to use to extract tokens. - :param stoplist: A list of stop words. Set this to None to disable - the stop word filter. - :param minsize: Words smaller than this are removed from the stream. - :param maxsize: Words longer that this are removed from the stream. - :param gaps: If True, the tokenizer *splits* on the expression, rather - than matching on the expression. + :param expression: The regular expression pattern to use for token extraction. + :type expression: str, optional + :param stoplist: A list of stop words. Set this to None to disable the stop word filter. + :type stoplist: list or None, optional + :param minsize: Words smaller than this are removed from the token stream. + :type minsize: int, optional + :param gaps: If True, the tokenizer splits on the expression, rather than matching on the expression. + :type gaps: bool, optional + :param splitwords: If True, intra-word filtering splits words. + :type splitwords: bool, optional + :param splitnums: If True, intra-word filtering splits numbers. + :type splitnums: bool, optional + :param mergewords: If True, intra-word filtering merges words. + :type mergewords: bool, optional + :param mergenums: If True, intra-word filtering merges numbers. + :type mergenums: bool, optional + :return: A composed analyzer. 
+ :rtype: Analyzer """ - return ( RegexTokenizer(expression=expression, gaps=gaps) | IntraWordFilter( @@ -274,27 +457,24 @@ def FancyAnalyzer( ) -def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000): - """Configures a simple analyzer for the given language, with a - LowercaseFilter, StopFilter, and StemFilter. +def language_analyzer(lang, expression=default_pattern, gaps=False, cachesize=50000): + """ + Configures a simple analyzer for the given language, with a LowercaseFilter, StopFilter, and StemFilter. + + :param lang: The language code for the analyzer. The list of available languages is in `whoosh.lang.languages`. + :param expression: The regular expression pattern to use to extract tokens. + :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. + :param cachesize: The maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use. + :return: The configured analyzer chain. - >>> ana = LanguageAnalyzer("es") + Example usage: + >>> ana = language_analyzer("es") >>> [token.text for token in ana("Por el mar corren las liebres")] ['mar', 'corr', 'liebr'] The list of available languages is in `whoosh.lang.languages`. - You can use :func:`whoosh.lang.has_stemmer` and - :func:`whoosh.lang.has_stopwords` to check if a given language has a - stemming function and/or stop word list available. - - :param expression: The regular expression pattern to use to extract tokens. - :param gaps: If True, the tokenizer *splits* on the expression, rather - than matching on the expression. - :param cachesize: the maximum number of stemmed words to cache. The larger - this number, the faster stemming will be but the more memory it will - use. + You can use `whoosh.lang.has_stemmer` and `whoosh.lang.has_stopwords` to check if a given language has a stemming function and/or stop word list available. """ - from whoosh.lang import NoStemmer, NoStopWords # Make the start of the chain diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py index 1fabefa8..bbf649cc 100644 --- a/src/whoosh/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -103,6 +103,15 @@ class Filter(Composable): """ def __eq__(self, other): + """ + Compare this object with another object for equality. + + Args: + other: The object to compare with. + + Returns: + bool: True if the objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -110,9 +119,27 @@ def __eq__(self, other): ) def __ne__(self, other): + """ + Check if the current object is not equal to another object. + + Parameters: + - other: The object to compare with. + + Returns: + - bool: True if the objects are not equal, False otherwise. + """ return self != other def __call__(self, tokens): + """ + Applies the filter to the given list of tokens. + + Args: + tokens (list): The list of tokens to be filtered. + + Returns: + list: The filtered list of tokens. + """ raise NotImplementedError @@ -120,18 +147,35 @@ class PassFilter(Filter): """An identity filter: passes the tokens through untouched.""" def __call__(self, tokens): + """ + Apply the pass filter to the given tokens. + + Parameters: + tokens (list): The list of tokens to be filtered. + + Returns: + list: The filtered list of tokens, which is the same as the input list. + """ return tokens class LoggingFilter(Filter): """Prints the contents of every filter that passes through as a debug log entry. 
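To actually see the records that `LoggingFilter` emits, the standard library logging machinery has to be configured for the "whoosh.analysis" logger; a small usage sketch (the chain here is arbitrary):

```python
# Usage sketch: watching tokens flow through an analyzer chain via LoggingFilter.
import logging

from whoosh.analysis import RegexTokenizer, LowercaseFilter
from whoosh.analysis.filters import LoggingFilter

logging.basicConfig(level=logging.DEBUG)  # LoggingFilter writes DEBUG records

ana = RegexTokenizer() | LowercaseFilter() | LoggingFilter()
_ = [t.text for t in ana("Hello There")]
# Each token's repr() is logged on the "whoosh.analysis" logger as it passes through.
```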
+ + This filter is used to log the contents of each token that passes through it. It can be helpful for debugging purposes or for monitoring the tokenization process. + + Args: + logger (Logger, optional): The logger to use for logging the token contents. If not provided, the "whoosh.analysis" logger is used. + """ def __init__(self, logger=None): """ - :param target: the logger to use. If omitted, the "whoosh.analysis" - logger is used. + Initializes a new instance of the LoggingFilter class. + + Args: + logger (Logger, optional): The logger to use. If omitted, the "whoosh.analysis" logger is used. """ if logger is None: @@ -141,6 +185,17 @@ def __init__(self, logger=None): self.logger = logger def __call__(self, tokens): + """ + Applies the filter to the given tokens. + + Args: + tokens (iterable): The tokens to filter. + + Yields: + Token: The filtered tokens. + + """ + logger = self.logger for t in tokens: logger.debug(repr(t)) @@ -150,6 +205,22 @@ def __call__(self, tokens): class MultiFilter(Filter): """Chooses one of two or more sub-filters based on the 'mode' attribute of the token stream. + + This class is used to apply different filters to a token stream based on + the value of the 'mode' attribute of each token. It allows you to associate + different filters with different 'mode' attribute values and apply the + appropriate filter to each token. + + Attributes: + default_filter (Filter): The default filter to use when no matching + 'mode' attribute is found. Defaults to PassFilter(). + filters (dict): A dictionary that maps 'mode' attribute values to + instantiated filters. + + Example: + >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) + >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) + >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) """ default_filter = PassFilter() @@ -158,16 +229,25 @@ def __init__(self, **kwargs): """Use keyword arguments to associate mode attribute values with instantiated filters. - >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) - >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) - >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) + Args: + **kwargs: Keyword arguments where the key is the 'mode' attribute + value and the value is the instantiated filter. - This class expects that the value of the mode attribute is consistent - among all tokens in a token stream. + Note: + This class expects that the value of the mode attribute is consistent + among all tokens in a token stream. """ self.filters = kwargs def __eq__(self, other): + """Check if two MultiFilter instances are equal. + + Args: + other (MultiFilter): The other MultiFilter instance to compare. + + Returns: + bool: True if the two MultiFilter instances are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -175,6 +255,17 @@ def __eq__(self, other): ) def __call__(self, tokens): + """Apply the appropriate filter to each token in the token stream. + + Args: + tokens (iterable): An iterable of tokens. + + Returns: + iterable: An iterable of filtered tokens. + + Note: + Only the first token is used to determine the appropriate filter to apply. + """ # Only selects on the first token t = next(tokens) selected_filter = self.filters.get(t.mode, self.default_filter) @@ -184,9 +275,12 @@ def __call__(self, tokens): class TeeFilter(Filter): r"""Interleaves the results of two or more filters (or filter chains). 
- NOTE: because it needs to create copies of each token for each sub-filter, - this filter is quite slow. + This filter takes the output of multiple filters or filter chains and interleaves them together. + It is useful when you want to apply different transformations to the same input and combine the results. + NOTE: This filter can be slow because it needs to create copies of each token for each sub-filter. + + Usage: >>> target = "ALFA BRAVO CHARLIE" >>> # In one branch, we'll lower-case the tokens >>> f1 = LowercaseFilter() @@ -207,14 +301,41 @@ class TeeFilter(Filter): """ def __init__(self, *filters): + """ + Initialize the TeeFilter with the provided filters. + + Args: + *filters: Variable number of filters or filter chains to be interleaved. + + Raises: + ValueError: If less than two filters are provided. + """ if len(filters) < 2: raise ValueError("TeeFilter requires two or more filters") self.filters = filters def __eq__(self, other): + """ + Check if two TeeFilter instances are equal. + + Args: + other: Another TeeFilter instance. + + Returns: + bool: True if the two instances are equal, False otherwise. + """ return self.__class__ is other.__class__ and self.filters == other.fitlers def __call__(self, tokens): + """ + Apply the TeeFilter to the input tokens. + + Args: + tokens: The input tokens to be filtered. + + Yields: + Token: The interleaved tokens from the filters. + """ from itertools import tee count = len(self.filters) @@ -239,36 +360,119 @@ def __call__(self, tokens): class ReverseTextFilter(Filter): """Reverses the text of each token. - >>> ana = RegexTokenizer() | ReverseTextFilter() - >>> [token.text for token in ana("hello there")] - ["olleh", "ereht"] + This filter takes a stream of tokens and reverses the text of each token. + It can be used as part of an analysis pipeline to modify the text of tokens. + + Example: + >>> ana = RegexTokenizer() | ReverseTextFilter() + >>> [token.text for token in ana("hello there")] + ["olleh", "ereht"] + """ def __call__(self, tokens): + """Apply the reverse text transformation to each token. + + Args: + tokens (iterable): A stream of tokens. + + Yields: + Token: A token with the reversed text. + + """ for t in tokens: t.text = t.text[::-1] yield t class LowercaseFilter(Filter): - """Uses unicode.lower() to lowercase token text. + """A filter that uses unicode.lower() to lowercase token text. + + This filter converts the text of each token to lowercase using the unicode.lower() method. + It is commonly used in text analysis pipelines to normalize the case of tokens. + + Example: + >>> rext = RegexTokenizer() + >>> stream = rext("This is a TEST") + >>> [token.text for token in LowercaseFilter(stream)] + ["this", "is", "a", "test"] + + Usage: + 1. Create an instance of the LowercaseFilter class. + 2. Pass a stream of tokens to the instance using the __call__ method. + 3. Iterate over the filtered tokens to access the lowercase text. + + Note: + The LowercaseFilter modifies the text of each token in-place. It does not create new tokens. - >>> rext = RegexTokenizer() - >>> stream = rext("This is a TEST") - >>> [token.text for token in LowercaseFilter(stream)] - ["this", "is", "a", "test"] """ def __call__(self, tokens): + """Applies the lowercase transformation to each token in the stream. + + Args: + tokens (iterable): A stream of tokens. + + Yields: + Token: A token with its text converted to lowercase. 
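The modify-in-place-and-yield pattern shown in `LowercaseFilter.__call__` is the template for writing new filters; a minimal sketch, with a hypothetical filter name used only for illustration:

```python
# Minimal custom filter sketch following the same pattern as LowercaseFilter:
# iterate over incoming tokens, modify each one in place, and yield it.
# UppercaseFilter is a hypothetical name, not part of the library.
from whoosh.analysis import Filter, RegexTokenizer

class UppercaseFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.upper()  # modify the token in place
            yield t

ana = RegexTokenizer() | UppercaseFilter()
print([t.text for t in ana("hello there")])
# ['HELLO', 'THERE']
```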
+ + """ for t in tokens: t.text = t.text.lower() yield t class StripFilter(Filter): - """Calls unicode.strip() on the token text.""" + """Calls unicode.strip() on the token text. + + This filter is used to remove leading and trailing whitespace from the token text. + It is typically used in text analysis pipelines to clean up the tokenized text. + + Example usage: + ------------- + from whoosh.analysis import Token, Tokenizer, TokenFilter + + class MyTokenizer(Tokenizer): + def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, + start_pos=0, start_char=0, mode='', **kwargs): + # Tokenize the value + tokens = self.tokenizer(value, positions=positions, chars=chars, + keeporiginal=keeporiginal, removestops=removestops, + start_pos=start_pos, start_char=start_char, mode=mode, **kwargs) + + # Apply the StripFilter to remove leading and trailing whitespace + tokens = StripFilter()(tokens) + + return tokens + + # Create an instance of MyTokenizer + tokenizer = MyTokenizer() + + # Tokenize a text + text = " Hello, World! " + tokens = tokenizer(text) + + # Print the tokens + for token in tokens: + print(token.text) + + Output: + ------- + Hello, + World! + + """ def __call__(self, tokens): + """Applies the strip() method to the token text. + + Args: + tokens (iterable of whoosh.analysis.Token): The input tokens. + + Yields: + whoosh.analysis.Token: The modified tokens with leading and trailing whitespace removed. + + """ for t in tokens: t.text = t.text.strip() yield t @@ -280,33 +484,58 @@ class StopFilter(Filter): Make sure you precede this filter with a :class:`LowercaseFilter`. - >>> stopper = RegexTokenizer() | StopFilter() - >>> [token.text for token in stopper(u"this is a test")] - ["test"] - >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") - >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] - ["lapiz", "mesa"] - - The list of available languages is in `whoosh.lang.languages`. - You can use :func:`whoosh.lang.has_stopwords` to check if a given language - has a stop word list available. + Args: + stoplist (collection, optional): A collection of words to remove from the stream. + This is converted to a frozenset. The default is a list of + common English stop words. + minsize (int, optional): The minimum length of token texts. Tokens with + text smaller than this will be stopped. The default is 2. + maxsize (int, optional): The maximum length of token texts. Tokens with text + larger than this will be stopped. Use None to allow any length. + renumber (bool, optional): Change the 'pos' attribute of unstopped tokens + to reflect their position with the stopped words removed. + lang (str, optional): Automatically get a list of stop words for the given + language. + + Attributes: + stops (frozenset): The set of stop words. + min (int): The minimum length of token texts. + max (int): The maximum length of token texts. + renumber (bool): Indicates whether the 'pos' attribute of unstopped tokens + should be changed to reflect their position with the stopped words removed. + + Examples: + >>> stopper = RegexTokenizer() | StopFilter() + >>> [token.text for token in stopper(u"this is a test")] + ["test"] + >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") + >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] + ["lapiz", "mesa"] + + Note: + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stopwords` to check if a given language + has a stop word list available. 
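A short usage sketch for the parameters described above, extending the default stop list (`STOP_WORDS` is the module-level default referenced in the signature):

```python
# Usage sketch: extending the default stop list while keeping the size limits.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.analysis.filters import STOP_WORDS

my_stops = frozenset(STOP_WORDS) | {"whoosh"}
ana = RegexTokenizer() | LowercaseFilter() | StopFilter(stoplist=my_stops, minsize=2)

print([t.text for t in ana("This is the Whoosh stop filter")])
# ['stop', 'filter']  (assuming "this", "is" and "the" are in the default list)
```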
""" def __init__( self, stoplist=STOP_WORDS, minsize=2, maxsize=None, renumber=True, lang=None ): """ - :param stoplist: A collection of words to remove from the stream. - This is converted to a frozenset. The default is a list of - common English stop words. - :param minsize: The minimum length of token texts. Tokens with - text smaller than this will be stopped. The default is 2. - :param maxsize: The maximum length of token texts. Tokens with text - larger than this will be stopped. Use None to allow any length. - :param renumber: Change the 'pos' attribute of unstopped tokens - to reflect their position with the stopped words removed. - :param lang: Automatically get a list of stop words for the given - language + Initialize the StopFilter. + + Args: + stoplist (collection, optional): A collection of words to remove from the stream. + This is converted to a frozenset. The default is a list of + common English stop words. + minsize (int, optional): The minimum length of token texts. Tokens with + text smaller than this will be stopped. The default is 2. + maxsize (int, optional): The maximum length of token texts. Tokens with text + larger than this will be stopped. Use None to allow any length. + renumber (bool, optional): Change the 'pos' attribute of unstopped tokens + to reflect their position with the stopped words removed. + lang (str, optional): Automatically get a list of stop words for the given + language """ stops = set() @@ -323,6 +552,15 @@ def __init__( self.renumber = renumber def __eq__(self, other): + """ + Compare the StopFilter with another object for equality. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -332,6 +570,15 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Apply the StopFilter to the tokens. + + Args: + tokens (iterable): The input tokens. + + Yields: + Token: The filtered tokens. + """ stoplist = self.stops minsize = self.min maxsize = self.max @@ -363,45 +610,65 @@ def __call__(self, tokens): class CharsetFilter(Filter): - """Translates the text of tokens by calling unicode.translate() using the + """ + Translates the text of tokens by calling unicode.translate() using the supplied character mapping object. This is useful for case and accent folding. - The ``whoosh.support.charset`` module has a useful map for accent folding. + The `whoosh.support.charset` module has a useful map for accent folding. + + Example usage: + + ```python + from whoosh.support.charset import accent_map + from whoosh.analysis import RegexTokenizer - >>> from whoosh.support.charset import accent_map - >>> retokenizer = RegexTokenizer() - >>> chfilter = CharsetFilter(accent_map) - >>> [t.text for t in chfilter(retokenizer(u'café'))] - [u'cafe'] + retokenizer = RegexTokenizer() + chfilter = CharsetFilter(accent_map) + tokens = chfilter(retokenizer(u'café')) + [t.text for t in tokens] + # Output: [u'cafe'] + ``` Another way to get a character mapping object is to convert a Sphinx - charset table file using - :func:`whoosh.support.charset.charset_table_to_dict`. + charset table file using `whoosh.support.charset.charset_table_to_dict`. 
+ + Example usage: + + ```python + from whoosh.support.charset import charset_table_to_dict, default_charset + from whoosh.analysis import RegexTokenizer - >>> from whoosh.support.charset import charset_table_to_dict - >>> from whoosh.support.charset import default_charset - >>> retokenizer = RegexTokenizer() - >>> charmap = charset_table_to_dict(default_charset) - >>> chfilter = CharsetFilter(charmap) - >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))] - [u'strase'] + retokenizer = RegexTokenizer() + charmap = charset_table_to_dict(default_charset) + chfilter = CharsetFilter(charmap) + tokens = chfilter(retokenizer(u'Stra\\xdfe')) + [t.text for t in tokens] + # Output: [u'strase'] + ``` The Sphinx charset table format is described at - http://www.sphinxsearch.com/docs/current.html#conf-charset-table. + https://www.sphinxsearch.com/docs/current.html#conf-charset-table. """ __inittypes__ = {"charmap": dict} def __init__(self, charmap): """ - :param charmap: a dictionary mapping from integer character numbers to + Initializes a CharsetFilter object. + + :param charmap: A dictionary mapping from integer character numbers to unicode characters, as required by the unicode.translate() method. """ - self.charmap = charmap def __eq__(self, other): + """ + Checks if two CharsetFilter objects are equal. + + :param other: The other CharsetFilter object to compare. + :return: True if the two objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -409,6 +676,12 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Applies the CharsetFilter to a sequence of tokens. + + :param tokens: An iterable sequence of tokens. + :return: A generator that yields the transformed tokens. + """ assert hasattr(tokens, "__iter__") charmap = self.charmap for t in tokens: @@ -423,37 +696,61 @@ class DelimitedAttributeFilter(Filter): The defaults are set up to use the ``^`` character as a delimiter and store the value after the ``^`` as the boost for the token. - >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") - >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() - >>> for t in ana(u("image render^2 file^0.5")) - ... print("%r %f" % (t.text, t.boost)) - 'image' 1.0 - 'render' 2.0 - 'file' 0.5 - - Note that you need to make sure your tokenizer includes the delimiter and - data as part of the token! + Args: + delimiter (str): A string that, when present in a token's text, separates + the actual text from the "data" payload. + attribute (str): The name of the attribute in which to store the data on + the token. + default (Any): The value to use for the attribute for tokens that don't have + delimited data. + type (type): The type of the data, for example ``str`` or ``float``. This is + used to convert the string value of the data before storing it in the + attribute. + + Example: + >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") + >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() + >>> for t in ana(u("image render^2 file^0.5")): + ... print("%r %f" % (t.text, t.boost)) + 'image' 1.0 + 'render' 2.0 + 'file' 0.5 + + Note: + You need to make sure your tokenizer includes the delimiter and data as part + of the token! """ def __init__(self, delimiter="^", attribute="boost", default=1.0, type=float): """ - :param delimiter: a string that, when present in a token's text, - separates the actual text from the "data" payload. 
- :param attribute: the name of the attribute in which to store the - data on the token. - :param default: the value to use for the attribute for tokens that - don't have delimited data. - :param type: the type of the data, for example ``str`` or ``float``. - This is used to convert the string value of the data before - storing it in the attribute. + Initialize the DelimitedAttributeFilter. + + Args: + delimiter (str): A string that, when present in a token's text, separates + the actual text from the "data" payload. + attribute (str): The name of the attribute in which to store the data on + the token. + default (Any): The value to use for the attribute for tokens that don't have + delimited data. + type (type): The type of the data, for example ``str`` or ``float``. This is + used to convert the string value of the data before storing it in the + attribute. """ - self.delim = delimiter self.attr = attribute self.default = default self.type = type def __eq__(self, other): + """ + Compare the DelimitedAttributeFilter with another object for equality. + + Args: + other (Any): The object to compare with. + + Returns: + bool: True if the objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -463,6 +760,15 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Apply the DelimitedAttributeFilter to a sequence of tokens. + + Args: + tokens (Iterable[Token]): The sequence of tokens to filter. + + Yields: + Token: The filtered tokens. + """ delim = self.delim attr = self.attr default = self.default @@ -485,33 +791,59 @@ def __call__(self, tokens): class SubstitutionFilter(Filter): """Performs a regular expression substitution on the token text. - This is especially useful for removing text from tokens, for example - hyphens:: - - ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") + This filter applies a regular expression substitution to the text of each token. + It is particularly useful for removing or replacing specific patterns of text within tokens. + The filter utilizes the `re.sub()` method to perform the substitution. + + Example usage: + -------------- + # Create an analyzer that removes hyphens from tokens + tokenizer = RegexTokenizer(r"\\S+") + substitution_filter = SubstitutionFilter("-", "") + analyzer = tokenizer | substitution_filter + + Parameters: + ----------- + pattern : str or Pattern + A pattern string or compiled regular expression object describing the text to replace. + replacement : str + The substitution text. + + Methods: + -------- + __call__(tokens) + Applies the substitution filter to the given tokens. - Because it has the full power of the re.sub() method behind it, this filter - can perform some fairly complex transformations. For example, to take - tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', - 'f=e'``:: - - # Analyzer that swaps the text on either side of an equal sign - rt = RegexTokenizer(r"\\S+") - sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") - ana = rt | sf """ def __init__(self, pattern, replacement): """ - :param pattern: a pattern string or compiled regular expression object - describing the text to replace. - :param replacement: the substitution text. + Initializes a SubstitutionFilter object. + + Parameters: + ----------- + pattern : str or Pattern + A pattern string or compiled regular expression object describing the text to replace. + replacement : str + The substitution text. 
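Because the filter has the full power of `re.sub()` behind it, more complex rewrites are possible, for example swapping the text on either side of an equals sign; the regular expression below is written for this note, not taken from the library:

```python
# Sketch: swapping the text on either side of an equals sign with re.sub semantics.
from whoosh.analysis import RegexTokenizer, SubstitutionFilter

ana = RegexTokenizer(r"\S+") | SubstitutionFilter(r"(\S+)=(\S+)", r"\2=\1")
print([t.text for t in ana("a=b c=d e=f")])
# ['b=a', 'd=c', 'f=e']
```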
""" - self.pattern = rcompile(pattern) self.replacement = replacement def __eq__(self, other): + """ + Checks if two SubstitutionFilter objects are equal. + + Parameters: + ----------- + other : SubstitutionFilter + The other SubstitutionFilter object to compare. + + Returns: + -------- + bool + True if the two SubstitutionFilter objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -520,6 +852,19 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Applies the substitution filter to the given tokens. + + Parameters: + ----------- + tokens : iterable + An iterable of Token objects. + + Yields: + ------- + Token + The modified Token objects after applying the substitution filter. + """ pattern = self.pattern replacement = self.replacement diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py index ae22e58b..9eea4c78 100644 --- a/src/whoosh/analysis/intraword.py +++ b/src/whoosh/analysis/intraword.py @@ -43,27 +43,49 @@ class CompoundWordFilter(Filter): The ``keep_compound`` argument lets you decide whether to keep the compound word in the token stream along with the word segments. - >>> cwf = CompoundWordFilter(wordset, keep_compound=True) - >>> analyzer = RegexTokenizer(r"\S+") | cwf - >>> [t.text for t in analyzer("I do not like greeneggs and ham") - ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] - >>> cwf.keep_compound = False - >>> [t.text for t in analyzer("I do not like greeneggs and ham") - ["I", "do", "not", "like", "green", "eggs", "and", "ham"] + Args: + wordset (object): An object with a ``__contains__`` method, such as a + set, containing strings to look for inside the tokens. + keep_compound (bool, optional): If True (the default), the original compound + token will be retained in the stream before the subwords. + + Example: + >>> cwf = CompoundWordFilter(wordset, keep_compound=True) + >>> analyzer = RegexTokenizer(r"\S+") | cwf + >>> [t.text for t in analyzer("I do not like greeneggs and ham")] + ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] + >>> cwf.keep_compound = False + >>> [t.text for t in analyzer("I do not like greeneggs and ham")] + ["I", "do", "not", "like", "green", "eggs", "and", "ham"] """ def __init__(self, wordset, keep_compound=True): """ - :param wordset: an object with a ``__contains__`` method, such as a - set, containing strings to look for inside the tokens. - :param keep_compound: if True (the default), the original compound - token will be retained in the stream before the subwords. + Initialize the CompoundWordFilter. + + Args: + wordset (object): An object with a ``__contains__`` method, such as a + set, containing strings to look for inside the tokens. + keep_compound (bool, optional): If True (the default), the original compound + token will be retained in the stream before the subwords. """ self.wordset = wordset self.keep_compound = keep_compound def subwords(self, s, memo): + """ + Recursively break a compound word into its individual parts. + + Args: + s (str): The compound word to be broken down. + memo (dict): A dictionary to store previously computed subwords. + + Returns: + list or None: A list of subwords if the compound word can be broken down, + None otherwise. + """ + if s in self.wordset: return [s] if s in memo: @@ -82,6 +104,16 @@ def subwords(self, s, memo): return None def __call__(self, tokens): + """ + Apply the CompoundWordFilter to a stream of tokens. 
+ + Args: + tokens (iterable): The input stream of tokens. + + Yields: + Token: The modified tokens after applying the filter. + """ + keep_compound = self.keep_compound memo = {} subwords = self.subwords @@ -98,27 +130,45 @@ def __call__(self, tokens): class BiWordFilter(Filter): - """Merges adjacent tokens into "bi-word" tokens, so that for example:: + """Merges adjacent tokens into "bi-word" tokens. - "the", "sign", "of", "four" + This filter merges adjacent tokens into "bi-word" tokens. For example, the tokens + "the", "sign", "of", "four" would be transformed into "the-sign", "sign-of", "of-four". - becomes:: + Bi-word tokens can be used to create fields for pseudo-phrase searching. If all the + terms in a query match the document, it probably contains the phrase. Using bi-word + tokens can make the searching faster than actually doing a phrase search on individual + word terms. - "the-sign", "sign-of", "of-four" + The `BiWordFilter` is much faster than using the otherwise equivalent `ShingleFilter(2)`. - This can be used to create fields for pseudo-phrase searching, where if - all the terms match the document probably contains the phrase, but the - searching is faster than actually doing a phrase search on individual word - terms. + Args: + sep (str): The separator to use when merging adjacent tokens. Default is "-". - The ``BiWordFilter`` is much faster than using the otherwise equivalent - ``ShingleFilter(2)``. """ def __init__(self, sep="-"): + """ + Initializes the IntrawordFilter with the specified separator character. + + Args: + sep (str): The separator character used to split words. Defaults to "-". + """ self.sep = sep def __call__(self, tokens): + """Merges adjacent tokens into bi-word tokens. + + This method takes a stream of tokens and merges adjacent tokens into "bi-word" tokens. + It yields the bi-word tokens as it iterates through the input token stream. + + Args: + tokens (iterable): The input token stream. + + Yields: + Token: The bi-word tokens. + + """ sep = self.sep prev_text = None prev_startchar = None @@ -333,6 +383,15 @@ def __init__( self.mergenums = mergenums def __eq__(self, other): + """ + Check if this object is equal to another object. + + Args: + other: The object to compare with. + + Returns: + bool: True if the objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -340,6 +399,16 @@ def __eq__(self, other): ) def _split(self, string): + """ + Splits the given string into indexable substrings based on the specified boundaries. + + Args: + string (str): The input string to be split. + + Yields: + tuple: A tuple containing the start and end indices of each indexable substring. + + """ bound = self.boundary # Yields (startchar, endchar) pairs for each indexable substring in @@ -391,6 +460,21 @@ def _split(self, string): yield (part_start, part_end) def _merge(self, parts): + """ + Merges consecutive parts in the given list based on their type (alpha or digit). + + Args: + parts (list): The list of parts to be merged. Each part is a tuple of the form (text, pos, startchar, endchar). + + Returns: + None. The original list of parts is modified in-place. 
+ + Example: + parts = [('hello', 0, 0, 4), ('world', 1, 6, 10), ('123', 2, 12, 14)] + _merge(parts) + print(parts) + # Output: [('helloworld', 0, 0, 10), ('123', 2, 12, 14)] + """ mergewords = self.mergewords mergenums = self.mergenums @@ -449,6 +533,18 @@ def insert_item(buf, at, newpos): insert_item(buf, len(parts), pos) def __call__(self, tokens): + """ + Applies the intraword filter to the given tokens. + + This filter renumbers tokens as it expands them. It splits tokens on delimiters, word and/or number boundaries, + and merges consecutive runs of all-letters and/or all-numbers if the options are set. + + Parameters: + - tokens (list): The list of tokens to be processed. + + Returns: + - generator: A generator that yields the processed tokens. + """ mergewords = self.mergewords mergenums = self.mergenums diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py index addbfad6..796eb28f 100644 --- a/src/whoosh/analysis/morph.py +++ b/src/whoosh/analysis/morph.py @@ -37,50 +37,81 @@ class StemFilter(Filter): root word (for example, "rendering", "renders", "rendered", etc.) to a single word in the index. - >>> stemmer = RegexTokenizer() | StemFilter() - >>> [token.text for token in stemmer("fundamentally willows")] - ["fundament", "willow"] + Args: + stemfn (object): The function to use for stemming. Default is the Porter stemming algorithm for English. + lang (str): If not None, overrides the stemfn with a language stemmer from the `whoosh.lang.snowball` package. + ignore (list): A set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. + cachesize (int): The maximum number of words to cache. Use -1 for an unbounded cache, or None for no caching. + + Attributes: + is_morph (bool): Indicates if the filter is a morphological filter. + + Methods: + __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): Initializes the StemFilter object. + __getstate__(self): Returns the state of the object for pickling. + __setstate__(self, state): Sets the state of the object after unpickling. + clear(self): Clears the stem function and sets it based on the provided parameters. + cache_info(self): Returns information about the cache used by the stem function. + __eq__(self, other): Compares two StemFilter objects for equality. + __call__(self, tokens): Applies stemming to the tokens. + + Examples: + stemmer = RegexTokenizer() | StemFilter() + [token.text for token in stemmer("fundamentally willows")] + Output: ["fundament", "willow"] + + stemfilter = StemFilter(stem_function) + stemfilter = StemFilter(lang="ru") + """ - You can pass your own stemming function to the StemFilter. The default - is the Porter stemming algorithm for English. + __inittypes__ = {"stemfn": object, "ignore": list} + is_morph = True - >>> stemfilter = StemFilter(stem_function) + def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): + """ + Initializes the StemFilter object. - You can also use one of the Snowball stemming functions by passing the - `lang` keyword argument. + Args: + stemfn (object): The function to use for stemming. Default is the Porter stemming algorithm for English. + lang (str): If not None, overrides the stemfn with a language stemmer from the `whoosh.lang.snowball` package. + ignore (list): A set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. + cachesize (int): The maximum number of words to cache. 
Use -1 for an unbounded cache, or None for no caching. - >>> stemfilter = StemFilter(lang="ru") + Raises: + TypeError: If the `stemfn` argument is not callable. + ValueError: If the `cachesize` argument is not a positive integer or None. - The list of available languages is in `whoosh.lang.languages`. - You can use :func:`whoosh.lang.has_stemmer` to check if a given language has - a stemming function available. + Notes: + The StemFilter object is used to apply stemming to tokens during the analysis process. Stemming is the process of reducing words to their base or root form, which can help improve search accuracy by treating different forms of the same word as equivalent. - By default, this class wraps an LRU cache around the stemming function. The - ``cachesize`` keyword argument sets the size of the cache. To make the - cache unbounded (the class caches every input), use ``cachesize=-1``. To - disable caching, use ``cachesize=None``. + The `stemfn` argument specifies the function to use for stemming. By default, the Porter stemming algorithm for English is used. You can provide your own custom stemming function if desired. - If you compile and install the py-stemmer library, the - :class:`PyStemmerFilter` provides slightly easier access to the language - stemmers in that library. - """ + The `lang` argument allows you to override the `stemfn` with a language stemmer from the `whoosh.lang.snowball` package. If `lang` is not None, the stemmer for the specified language will be used instead of the `stemfn`. - __inittypes__ = {"stemfn": object, "ignore": list} + The `ignore` argument is a set/list of words that should not be stemmed. If you omit this argument, all tokens will be stemmed. The `ignore` set/list is converted into a frozenset for efficient lookup. - is_morph = True + The `cachesize` argument specifies the maximum number of words to cache. Caching can improve performance by avoiding redundant stemming operations. Use -1 for an unbounded cache, or None for no caching. - def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): - """ - :param stemfn: the function to use for stemming. - :param lang: if not None, overrides the stemfn with a language stemmer - from the ``whoosh.lang.snowball`` package. - :param ignore: a set/list of words that should not be stemmed. This is - converted into a frozenset. If you omit this argument, all tokens - are stemmed. - :param cachesize: the maximum number of words to cache. Use ``-1`` for - an unbounded cache, or ``None`` for no caching. - """ + Example: + # Initialize StemFilter with default settings + stem_filter = StemFilter() + + # Initialize StemFilter with custom stemming function + def custom_stemmer(word): + # custom stemming logic + return stemmed_word + + stem_filter = StemFilter(stemfn=custom_stemmer) + + # Initialize StemFilter with language stemmer + stem_filter = StemFilter(lang='english') + + # Initialize StemFilter with ignored words + stem_filter = StemFilter(ignore=['apple', 'banana', 'orange']) + # Initialize StemFilter with caching disabled + stem_filter = StemFilter(cachesize=None) + """ self.stemfn = stemfn self.lang = lang self.ignore = frozenset() if ignore is None else frozenset(ignore) @@ -89,13 +120,77 @@ def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): self.clear() def __getstate__(self): + """ + Get the state of the object for pickling. + + This method is called by the pickle module when pickling an object. 
+ It returns a dictionary representing the state of the object, excluding + the '_stem' attribute. + + Returns: + dict: The state of the object without the '_stem' attribute. + + Example: + >>> obj = MyObject() + >>> state = obj.__getstate__() + >>> print(state) + {'attr1': value1, 'attr2': value2, ...} + + Note: + This method is automatically called by the pickle module and should + not be called directly by user code. + """ # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): - # Check for old instances of StemFilter class, which didn't have a - # cachesize attribute and pickled the cache attribute + """ + Set the state of the object during unpickling. + + This method is called by the pickle module when unpickling an object. + It sets the state of the object based on the provided state dictionary. + + Parameters: + - state (dict): The state dictionary containing the object's attributes. + + Notes: + - This method is primarily used for backward compatibility with older versions + of the StemFilter class. + - It checks for old instances of StemFilter class and updates the state + accordingly. + - If the 'cachesize' attribute is not present in the state dictionary, it + sets the 'cachesize' attribute to a default value of 50000. + - If the 'ignores' attribute is present in the state dictionary, it sets the + 'ignore' attribute to the value of 'ignores'. + - If the 'ignore' attribute is not present in the state dictionary, it sets + the 'ignore' attribute to an empty frozenset. + - If the 'lang' attribute is not present in the state dictionary, it sets the + 'lang' attribute to None. + - If the 'cache' attribute is present in the state dictionary, it removes the + 'cache' attribute from the state dictionary. + + Returns: + - None + + Example: + >>> state = { + ... 'cachesize': 10000, + ... 'ignores': {'word1', 'word2'}, + ... 'lang': 'en', + ... 'cache': {}, + ... } + >>> obj = StemFilter() + >>> obj.__setstate__(state) + >>> obj.cachesize + 10000 + >>> obj.ignore + {'word1', 'word2'} + >>> obj.lang + 'en' + >>> 'cache' in obj.__dict__ + False + """ if "cachesize" not in state: self.cachesize = 50000 if "ignores" in state: @@ -108,10 +203,28 @@ def __setstate__(self, state): del state["cache"] self.__dict__.update(state) - # Set the _stem attribute self.clear() def clear(self): + """ + Clears the stem function and sets it based on the provided parameters. + + This method clears the current stem function and sets it based on the provided parameters. + If the language is specified, it retrieves the stemmer function for that language from the 'whoosh.lang' module. + Otherwise, it uses the stem function that was previously set. + + If the 'cachesize' parameter is an integer and not equal to 0, it creates a cache for the stem function. + If 'cachesize' is a negative integer, an unbound cache is created using the stem function. + If 'cachesize' is a positive integer greater than 1, an LFU (Least Frequently Used) cache is created with the specified size. + + If 'cachesize' is not an integer or equal to 0, no cache is created and the stem function is used directly. + + Note: The stem function is responsible for transforming words into their base or root form. 
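A small sketch of inspecting the cache, assuming the wrapped stem function exposes the statistics described in `cache_info` below:

```python
# Sketch: checking the stemming cache after running some text through a chain.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StemFilter

sf = StemFilter(cachesize=50000)
ana = RegexTokenizer() | LowercaseFilter() | sf

_ = [t.text for t in ana("rendering renders rendered rendering")]
print(sf.cache_info())  # hit/miss statistics, or None when caching is disabled
```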
+ + Usage: + morph = MorphAnalyzer() + morph.clear() + """ if self.lang: from whoosh.lang import stemmer_for_language @@ -128,16 +241,67 @@ def clear(self): self._stem = stemfn def cache_info(self): + """ + Returns information about the cache used by the stem function. + + The cache_info method provides information about the cache used by the stem function. + It returns an object that contains details such as the number of cache hits, misses, + and the current size of the cache. + + Returns: + cache_info (object): An object containing information about the cache used by the stem function. + The object has the following attributes: + - hits (int): The number of cache hits. + - misses (int): The number of cache misses. + - maxsize (int): The maximum size of the cache. + - currsize (int): The current size of the cache. + + Returns None if caching is disabled. + """ if self.cachesize <= 1: return None return self._stem.cache_info() def __eq__(self, other): + """ + Compares two StemFilter objects for equality. + + This method compares the current StemFilter object with another StemFilter object + to determine if they are equal. Two StemFilter objects are considered equal if they + are of the same class and have the same stem function. + + Args: + other (StemFilter): The other StemFilter object to compare. + + Returns: + bool: True if the two StemFilter objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ and self.stemfn == other.stemfn ) def __call__(self, tokens): + """ + Applies stemming to the tokens. + + This method applies stemming to the given tokens using the specified stemmer. + It iterates over the tokens, checks if the token is not stopped, and if the token's text + is not in the ignore list. If the conditions are met, the token's text is stemmed using + the stemmer's stem function. + + Args: + tokens (iterable): The tokens to apply stemming to. + + Yields: + Token: The stemmed tokens. + + Example: + >>> stemmer = Stemmer() + >>> tokens = [Token("running"), Token("jumps"), Token("jumping")] + >>> stemmed_tokens = stemmer(tokens) + >>> list(stemmed_tokens) + [Token("run"), Token("jump"), Token("jump")] + """ stemfn = self._stem ignore = self.ignore @@ -154,19 +318,45 @@ class PyStemmerFilter(StemFilter): third-party library. You must have the py-stemmer library installed to use this filter. - >>> PyStemmerFilter("spanish") + Args: + lang (str, optional): A string identifying the stemming algorithm to use. + You can get a list of available algorithms by using the `algorithms()` + method. The identification strings are directly from the py-stemmer library. + Defaults to "english". + ignore (set or list, optional): A set or list of words that should not be stemmed. + If provided, these words will be excluded from the stemming process. + Defaults to None. + cachesize (int, optional): The maximum number of words to cache. Defaults to 10000. + + Attributes: + lang (str): The language identifier for the stemming algorithm. + ignore (frozenset): The set of words to be ignored during stemming. + cachesize (int): The maximum number of words to cache. + _stem (function): The stemmer function used for stemming. + + Methods: + algorithms(): Returns a list of stemming algorithms provided by the py-stemmer library. + cache_info(): Returns information about the cache (not implemented). + __getstate__(): Returns the state of the object for pickling (excluding _stem attribute). + __setstate__(): Sets the state of the object after unpickling. 
+ + Example: + >>> filter = PyStemmerFilter("spanish") """ def __init__(self, lang="english", ignore=None, cachesize=10000): """ - :param lang: a string identifying the stemming algorithm to use. You - can get a list of available algorithms by with the - :meth:`PyStemmerFilter.algorithms` method. The identification - strings are directly from the py-stemmer library. - :param ignore: a set/list of words that should not be stemmed. This is - converted into a frozenset. If you omit this argument, all tokens - are stemmed. - :param cachesize: the maximum number of words to cache. + Initialize the PyStemmerFilter. + + Args: + lang (str, optional): A string identifying the stemming algorithm to use. + You can get a list of available algorithms by using the `algorithms()` + method. The identification strings are directly from the py-stemmer library. + Defaults to "english". + ignore (set or list, optional): A set or list of words that should not be stemmed. + If provided, these words will be excluded from the stemming process. + Defaults to None. + cachesize (int, optional): The maximum number of words to cache. Defaults to 10000. """ self.lang = lang @@ -175,18 +365,52 @@ def __init__(self, lang="english", ignore=None, cachesize=10000): self._stem = self._get_stemmer_fn() def algorithms(self): - """Returns a list of stemming algorithms provided by the py-stemmer - library. """ + Returns a list of stemming algorithms provided by the py-stemmer library. + + This method uses the py-stemmer library to retrieve a list of available stemming algorithms. + Stemming algorithms are used to reduce words to their base or root form, which can be useful + in natural language processing tasks such as information retrieval, text mining, and language + modeling. + Returns: + list: A list of strings representing the names of available stemming algorithms. + + Example: + >>> analyzer = Analyzer() + >>> algorithms = analyzer.algorithms() + >>> print(algorithms) + ['porter', 'snowball'] + """ import Stemmer # type: ignore @UnresolvedImport return Stemmer.algorithms() def cache_info(self): + """Returns information about the cache. + + This method is not implemented and always returns None. + + Returns: + None: This method does not provide any information about the cache. + """ return None def _get_stemmer_fn(self): + """ + Returns a stemmer function for the specified language. + + This function imports the Stemmer module and initializes a stemmer object + with the specified language. The stemmer object is then configured with + the specified cache size. Finally, the stemWord method of the stemmer + object is returned as the stemmer function. + + Returns: + callable: A stemmer function that takes a word as input and returns its stem. + + Raises: + ImportError: If the Stemmer module cannot be imported. + """ import Stemmer # type: ignore @UnresolvedImport stemmer = Stemmer.Stemmer(self.lang) @@ -194,13 +418,53 @@ def _get_stemmer_fn(self): return stemmer.stemWord def __getstate__(self): - # Can't pickle a dynamic function, so we have to remove the _stem - # attribute from the state + """ + Get the state of the object for pickling. + + This method is called by the pickle module when pickling an object. + It returns a dictionary representing the object's state, excluding the + '_stem' attribute. + + Returns: + dict: A dictionary representing the object's state. + + Note: + The '_stem' attribute is excluded from the state because dynamic + functions cannot be pickled. 
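The same strip-and-rebuild treatment of `_stem` is what makes stemming filters safe to pickle, for example when a schema is stored with an index; a sketch of the round-trip, shown here with the base `StemFilter`:

```python
# Sketch: pickling an analyzer chain that contains a stemming filter.
import pickle

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StemFilter

ana = RegexTokenizer() | LowercaseFilter() | StemFilter()
restored = pickle.loads(pickle.dumps(ana))

print([t.text for t in restored("rendering")])
# ['render']
```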
+ + """ return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): - # Check for old instances of StemFilter class, which didn't have a - # cachesize attribute and pickled the cache attribute + """ + Set the state of the object during unpickling. + + This method is called by the pickle module when unpickling an object. + It is responsible for setting the state of the object based on the + provided `state` dictionary. + + Parameters: + state (dict): The dictionary containing the state of the object. + + Returns: + None + + Raises: + None + + Notes: + - This method is used to handle backward compatibility with old + instances of the `StemFilter` class. + - If the `state` dictionary does not contain the key "cachesize", + the `cachesize` attribute is set to the default value of 10000. + - If the `state` dictionary contains the key "ignores", the `ignore` + attribute is set to the value of "ignores". + - If the `state` dictionary does not contain the key "ignore", the + `ignore` attribute is set to an empty frozenset. + - The "cache" key is removed from the `state` dictionary. + - The `state` dictionary is used to update the object's attributes. + - The `_stem` attribute is set using the `_get_stemmer_fn` method. + """ if "cachesize" not in state: self.cachesize = 10000 if "ignores" in state: @@ -220,26 +484,45 @@ class DoubleMetaphoneFilter(Filter): Metaphone algorithm. This algorithm attempts to encode words in such a way that similar-sounding words reduce to the same code. This may be useful for fields containing the names of people and places, and other uses where - tolerance of spelling differences is desireable. + tolerance of spelling differences is desirable. + + Args: + primary_boost (float, optional): The boost to apply to the token containing the + primary code. Defaults to 1.0. + secondary_boost (float, optional): The boost to apply to the token containing the + secondary code, if any. Defaults to 0.5. + combine (bool, optional): If True, the original unencoded tokens are kept in the + stream, preceding the encoded tokens. Defaults to False. """ is_morph = True def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False): """ - :param primary_boost: the boost to apply to the token containing the - primary code. - :param secondary_boost: the boost to apply to the token containing the - secondary code, if any. - :param combine: if True, the original unencoded tokens are kept in the - stream, preceding the encoded tokens. - """ + Initialize a MorphAnalyzer object. + Args: + primary_boost (float, optional): The boost factor for primary morphological analysis. Defaults to 1.0. + secondary_boost (float, optional): The boost factor for secondary morphological analysis. Defaults to 0.5. + combine (bool, optional): Whether to combine the results of primary and secondary analysis. Defaults to False. + """ self.primary_boost = primary_boost self.secondary_boost = secondary_boost self.combine = combine def __eq__(self, other): + """ + Check if two objects are equal. + + This method compares the current object with another object to determine if they are equal. + The comparison is based on the class type and the primary_boost attribute. + + Parameters: + - other: The object to compare with. + + Returns: + - bool: True if the objects are equal, False otherwise. 
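A usage sketch for the parameters described above, keeping the original tokens alongside the sound-alike codes:

```python
# Usage sketch: original tokens plus Double Metaphone codes in a single stream.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, DoubleMetaphoneFilter

ana = RegexTokenizer() | LowercaseFilter() | DoubleMetaphoneFilter(combine=True)
for t in ana("smith smyth"):
    print(t.text, t.boost)
# With combine=True each original token is kept, followed by its primary code
# (boost 1.0) and, when one exists, its secondary code (boost 0.5).
```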
+ """ return ( other and self.__class__ is other.__class__ @@ -247,6 +530,30 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Applies morphological analysis to a sequence of tokens. + + Args: + tokens (iterable): The input tokens to be analyzed. + + Yields: + Token: The analyzed tokens with modified text and boost. + + Notes: + This method applies morphological analysis to each token in the input sequence. + It uses the double metaphone algorithm to generate primary and secondary forms of the token's text. + The token's text and boost are then modified based on the generated forms and yielded. + + Example: + >>> analyzer = MorphAnalyzer() + >>> tokens = [Token("running", boost=1.0), Token("swimming", boost=0.8)] + >>> analyzed_tokens = list(analyzer(tokens)) + >>> for token in analyzed_tokens: + ... print(token.text, token.boost) + ... + run 1.0 + swim 0.8 + """ primary_boost = self.primary_boost secondary_boost = self.secondary_boost combine = self.combine diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py index a42fc37e..61f0d06d 100644 --- a/src/whoosh/analysis/ngrams.py +++ b/src/whoosh/analysis/ngrams.py @@ -35,32 +35,59 @@ class NgramTokenizer(Tokenizer): """Splits input text into N-grams instead of words. - >>> ngt = NgramTokenizer(4) - >>> [token.text for token in ngt("hi there")] - ["hi t", "i th", " the", "ther", "here"] + This tokenizer splits the input text into N-grams, where an N-gram is a + contiguous sequence of N characters. The N-grams emitted by this tokenizer + may contain whitespace, punctuation, and other characters. If you only want + sub-word N-grams without whitespace, you can combine a RegexTokenizer with + NgramFilter instead. + + Example: + ngt = NgramTokenizer(4) + tokens = [token.text for token in ngt("hi there")] + # tokens = ["hi t", "i th", " the", "ther", "here"] + + Note: + This tokenizer does not use a regular expression to extract words, so + the N-grams emitted by it will contain whitespace, punctuation, etc. + You may want to massage the input or add a custom filter to this + tokenizer's output. + + Args: + minsize (int): The minimum size of the N-grams. + maxsize (int, optional): The maximum size of the N-grams. If not + provided, maxsize will be set to minsize. + + Attributes: + min (int): The minimum size of the N-grams. + max (int): The maximum size of the N-grams. - Note that this tokenizer does NOT use a regular expression to extract - words, so the grams emitted by it will contain whitespace, punctuation, - etc. You may want to massage the input or add a custom filter to this - tokenizer's output. - - Alternatively, if you only want sub-word grams without whitespace, you - could combine a RegexTokenizer with NgramFilter instead. """ __inittypes__ = {"minsize": int, "maxsize": int} def __init__(self, minsize, maxsize=None): """ - :param minsize: The minimum size of the N-grams. - :param maxsize: The maximum size of the N-grams. If you omit - this parameter, maxsize == minsize. - """ + Initialize the NgramTokenizer. + Args: + minsize (int): The minimum size of the N-grams. + maxsize (int, optional): The maximum size of the N-grams. If not + provided, maxsize will be set to minsize. + + """ self.min = minsize self.max = maxsize or minsize def __eq__(self, other): + """ + Check if two ngram objects are equal. + + Args: + other (Ngram): The other ngram object to compare with. + + Returns: + bool: True if the ngram objects are equal, False otherwise. 
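As the `NgramTokenizer` docstring above notes, sub-word grams without whitespace can be produced by combining a word tokenizer with `NgramFilter`; a minimal sketch:

```python
# Sketch: word-bounded N-grams, so no gram spans the space between words.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, NgramFilter

ana = RegexTokenizer() | LowercaseFilter() | NgramFilter(4)
print([t.text for t in ana("hi there")])
# ['ther', 'here']  ("hi" is shorter than the minimum gram size and is dropped)
```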
+ """ if self.__class__ is other.__class__: if self.min == other.min and self.max == other.max: return True @@ -78,6 +105,37 @@ def __call__( mode="", **kwargs, ): + """ + Tokenizes the given value into n-grams. + + Args: + value (str): The input string to be tokenized. + positions (bool, optional): Whether to include position information in the tokens. Defaults to False. + chars (bool, optional): Whether to include character offset information in the tokens. Defaults to False. + keeporiginal (bool, optional): Whether to keep the original token text. Defaults to False. + removestops (bool, optional): Whether to remove stop words from the tokens. Defaults to True. + start_pos (int, optional): The starting position for position information. Defaults to 0. + start_char (int, optional): The starting character offset. Defaults to 0. + mode (str, optional): The tokenization mode. Defaults to "". + + Yields: + Token: The generated tokens. + + Raises: + AssertionError: If the input value is not a string. + + Note: + This method tokenizes the input string into n-grams based on the specified parameters. It generates tokens + by sliding a window of size `self.min` to `self.max` over the input string. The generated tokens can include + position information, character offset information, and original token text depending on the specified + parameters. + + If `mode` is set to "query", the method generates tokens by sliding a window of size `self.max` over the + input string. This is typically used for query tokenization. + + If `mode` is not set to "query", the method generates tokens by sliding a window of size `self.min` to + `self.max` over the input string. This is typically used for indexing tokenization. + """ assert isinstance(value, str), f"{value!r} is not unicode" inlen = len(value) @@ -122,8 +180,6 @@ def __call__( # Filter - - class NgramFilter(Filter): """Splits token text into N-grams. @@ -155,6 +211,15 @@ def __init__(self, minsize, maxsize=None, at=None): self.at = 1 def __eq__(self, other): + """ + Check if two ngrams objects are equal. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the two ngrams objects are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -163,6 +228,29 @@ def __eq__(self, other): ) def __call__(self, tokens): + """ + Tokenizes the given tokens into N-grams. + + Args: + tokens (iterable): The input tokens to be tokenized. + + Yields: + Token: The generated N-gram tokens. + + Raises: + AssertionError: If the input tokens are not iterable. + + Note: + This method tokenizes the input tokens into N-grams based on the specified parameters. It generates N-gram tokens by sliding a window of size `self.min` to `self.max` over the input tokens. + + If the token's text length is less than `self.min`, the token is skipped. + + If the token's mode is set to "query", the method generates N-gram tokens by sliding a window of size `self.max` over the token's text. This is typically used for query tokenization. + + If the token's mode is not set to "query", the method generates N-gram tokens by sliding a window of size `self.min` to `self.max` over the token's text. This is typically used for indexing tokenization. + + The generated N-gram tokens can include position information, character offset information, and original token text depending on the specified parameters. 
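A sketch of the indexing-versus-query behavior described in the note above, on the assumption that query mode emits only max-size grams:

```python
# Sketch: the same filter produces different gram sets at index and query time.
from whoosh.analysis import RegexTokenizer, LowercaseFilter, NgramFilter

ana = RegexTokenizer() | LowercaseFilter() | NgramFilter(2, 4)

index_grams = [t.text for t in ana("search")]                # sizes 2 through 4
query_grams = [t.text for t in ana("search", mode="query")]  # max-size grams only
print(len(index_grams) > len(query_grams))  # True under the assumption above
```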
+ """ assert hasattr(tokens, "__iter__") at = self.at for t in tokens: @@ -233,18 +321,44 @@ def __call__(self, tokens): # Analyzers -def NgramAnalyzer(minsize, maxsize=None): - """Composes an NgramTokenizer and a LowercaseFilter. - - >>> ana = NgramAnalyzer(4) - >>> [token.text for token in ana("hi there")] - ["hi t", "i th", " the", "ther", "here"] +def ngram_analyzer(minsize, maxsize=None): """ + Composes an NgramTokenizer and a LowercaseFilter. + + Args: + minsize (int): The minimum size of the n-grams. + maxsize (int, optional): The maximum size of the n-grams. Defaults to None. + Returns: + Analyzer: An analyzer that tokenizes text into n-grams and applies lowercase filtering. + + Examples: + >>> ana = ngram_analyzer(4) + >>> [token.text for token in ana("hi there")] + ["hi t", "i th", " the", "ther", "here"] + """ return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter() -def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None): +def ngram_word_analyzer(minsize, maxsize=None, tokenizer=None, at=None): + """ + Creates an analyzer that tokenizes text into n-grams. + + Args: + minsize (int): The minimum size of the n-grams. + maxsize (int, optional): The maximum size of the n-grams. Defaults to None. + tokenizer (Tokenizer, optional): The tokenizer to use. Defaults to None. + at (str, optional): The position at which to split the n-grams. Defaults to None. + + Returns: + Analyzer: The n-gram word analyzer. + + Example: + >>> analyzer = ngram_word_analyzer(2, 3) + >>> tokens = analyzer("Hello world") + >>> list(tokens) + ['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld'] + """ if not tokenizer: tokenizer = RegexTokenizer() return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at) diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py index 449576ab..5058d295 100644 --- a/src/whoosh/analysis/tokenizers.py +++ b/src/whoosh/analysis/tokenizers.py @@ -35,9 +35,37 @@ class Tokenizer(Composable): - """Base class for Tokenizers.""" + """Base class for tokenizers. + + Tokenizers are responsible for breaking text into individual tokens. This base class + provides the basic structure and behavior that all tokenizers should follow. + + Subclasses should override the `tokenize` method to implement the tokenization logic. + + Example usage: + tokenizer = Tokenizer() + tokens = tokenizer.tokenize("Hello, world!") + for token in tokens: + print(token) + + Attributes: + None + + Methods: + __eq__(self, other): Compare if two tokenizers are equal. + + """ def __eq__(self, other): + """Compare if two tokenizers are equal. + + Args: + other (object): The other tokenizer object to compare. + + Returns: + bool: True if the tokenizers are equal, False otherwise. + + """ return other and self.__class__ is other.__class__ @@ -45,9 +73,23 @@ class IDTokenizer(Tokenizer): """Yields the entire input string as a single token. For use in indexed but untokenized fields, such as a document's path. - >>> idt = IDTokenizer() - >>> [token.text for token in idt("/a/b 123 alpha")] - ["/a/b 123 alpha"] + Example: + idt = IDTokenizer() + [token.text for token in idt("/a/b 123 alpha")] + Output: ["/a/b 123 alpha"] + + Args: + positions (bool, optional): Whether to store token positions. Defaults to False. + chars (bool, optional): Whether to store token character offsets. Defaults to False. + keeporiginal (bool, optional): Whether to store the original token text. Defaults to False. + removestops (bool, optional): Whether to remove stop words. 
Defaults to True. + start_pos (int, optional): The starting position of the token. Defaults to 0. + start_char (int, optional): The starting character offset of the token. Defaults to 0. + mode (str, optional): The tokenization mode. Defaults to "". + **kwargs: Additional keyword arguments. + + Yields: + Token: The token object containing the token information. """ def __call__( @@ -62,6 +104,27 @@ def __call__( mode="", **kwargs, ): + """ + Tokenizes the given value and yields a Token object. + + Args: + value (str): The input string to be tokenized. + positions (bool, optional): Whether to include position information in the Token object. Defaults to False. + chars (bool, optional): Whether to include character information in the Token object. Defaults to False. + keeporiginal (bool, optional): Whether to store the original value in the Token object. Defaults to False. + removestops (bool, optional): Whether to remove stop words from the Token object. Defaults to True. + start_pos (int, optional): The starting position of the Token object. Defaults to 0. + start_char (int, optional): The starting character position of the Token object. Defaults to 0. + mode (str, optional): The tokenization mode. Defaults to "". + **kwargs: Additional keyword arguments to be passed to the Token object. + + Yields: + Token: A Token object representing a tokenized value. + + Raises: + AssertionError: If the input value is not a string. + + """ assert isinstance(value, str), f"{value!r} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) t.text = value @@ -80,25 +143,47 @@ class RegexTokenizer(Tokenizer): """ Uses a regular expression to extract tokens from text. + Example: >>> rex = RegexTokenizer() >>> [token.text for token in rex("hi there 3.141 big-time under_score")] ["hi", "there", "3.141", "big", "time", "under_score"] - """ - def __init__(self, expression=default_pattern, gaps=False): - """ - :param expression: A regular expression object or string. Each match + Args: + expression (Union[str, Pattern]): A regular expression object or string. Each match of the expression equals a token. Group 0 (the entire matched text) is used as the text of the token. If you require more complicated handling of the expression match, simply write your own tokenizer. - :param gaps: If True, the tokenizer *splits* on the expression, rather + gaps (bool): If True, the tokenizer *splits* on the expression, rather than matching on the expression. + """ + + def __init__(self, expression=default_pattern, gaps=False): + """ + Initialize the RegexTokenizer. + + Args: + expression (Union[str, Pattern]): A regular expression object or string. Each match + of the expression equals a token. Group 0 (the entire matched text) + is used as the text of the token. If you require more complicated + handling of the expression match, simply write your own tokenizer. + gaps (bool): If True, the tokenizer *splits* on the expression, rather + than matching on the expression. """ self.expression = rcompile(expression) self.gaps = gaps def __eq__(self, other): + """ + Compare the RegexTokenizer with another object for equality. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the objects are equal, False otherwise. + """ + if self.__class__ is other.__class__: if self.expression.pattern == other.expression.pattern: return True @@ -118,16 +203,21 @@ def __call__( **kwargs, ): """ - :param value: The unicode string to tokenize. 
- :param positions: Whether to record token positions in the token. - :param chars: Whether to record character offsets in the token. - :param start_pos: The position number of the first token. For example, - if you set start_pos=2, the tokens will be numbered 2,3,4,... - instead of 0,1,2,... - :param start_char: The offset of the first character of the first - token. For example, if you set start_char=2, the text "aaa bbb" - will have chars (2,5),(6,9) instead (0,3),(4,7). - :param tokenize: if True, the text should be tokenized. + Tokenize the input value using the RegexTokenizer. + + Args: + value (str): The unicode string to tokenize. + positions (bool): Whether to record token positions in the token. + chars (bool): Whether to record character offsets in the token. + keeporiginal (bool): Whether to keep the original text of the token. + removestops (bool): Whether to remove stop words from the token. + start_pos (int): The position number of the first token. + start_char (int): The offset of the first character of the first token. + tokenize (bool): If True, the text should be tokenized. + mode (str): The tokenization mode. + + Yields: + Token: The generated tokens. """ assert isinstance(value, str), f"{repr(value)} is not unicode" @@ -225,12 +315,24 @@ class CharsetTokenizer(Tokenizer): def __init__(self, charmap): """ - :param charmap: a mapping from integer character numbers to unicode + Initialize the Tokenizer with a character map. + + :param charmap: A mapping from integer character numbers to Unicode characters, as used by the unicode.translate() method. + :type charmap: dict """ self.charmap = charmap def __eq__(self, other): + """ + Compare this tokenizer with another tokenizer for equality. + + Parameters: + - other: The other tokenizer to compare with. + + Returns: + - True if the tokenizers are equal, False otherwise. + """ return ( other and self.__class__ is other.__class__ @@ -251,16 +353,22 @@ def __call__( **kwargs, ): """ + Tokenizes a given unicode string. + :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. - :param start_pos: The position number of the first token. For example, - if you set start_pos=2, the tokens will be numbered 2,3,4,... - instead of 0,1,2,... - :param start_char: The offset of the first character of the first - token. For example, if you set start_char=2, the text "aaa bbb" - will have chars (2,5),(6,9) instead (0,3),(4,7). - :param tokenize: if True, the text should be tokenized. + :param keeporiginal: Whether to keep the original text in the token. + :param removestops: Whether to remove stop words from the token. + :param start_pos: The position number of the first token. + :param start_char: The offset of the first character of the first token. + :param tokenize: If True, the text should be tokenized. + :param mode: The tokenization mode. + :param kwargs: Additional keyword arguments. + + :return: A generator that yields Token objects. + + :raises AssertionError: If the value is not a unicode string. """ assert isinstance(value, str), f"{value!r} is not unicode" @@ -316,27 +424,48 @@ def __call__( def SpaceSeparatedTokenizer(): - """Returns a RegexTokenizer that splits tokens by whitespace. - - >>> sst = SpaceSeparatedTokenizer() - >>> [token.text for token in sst("hi there big-time, what's up")] - ["hi", "there", "big-time,", "what's", "up"] """ + Returns a RegexTokenizer that splits tokens by whitespace. 
+ + This tokenizer splits input text into tokens based on whitespace characters (spaces, tabs, newlines). + It uses a regular expression pattern to match and extract tokens. + + Example: + sst = SpaceSeparatedTokenizer() + tokens = [token.text for token in sst("hi there big-time, what's up")] + print(tokens) + # Output: ["hi", "there", "big-time,", "what's", "up"] + + Returns: + A RegexTokenizer object that tokenizes input text based on whitespace. + Note: + The regular expression pattern used by this tokenizer is r"[^ \t\r\n]+", + which matches one or more characters that are not whitespace. + + """ return RegexTokenizer(r"[^ \t\r\n]+") def CommaSeparatedTokenizer(): - """Splits tokens by commas. + """ + Tokenizes text by splitting tokens using commas. - Note that the tokenizer calls unicode.strip() on each match of the regular - expression. + This tokenizer splits the input text into tokens by using commas as the delimiter. + It also applies the `StripFilter` to remove leading and trailing whitespace from each token. - >>> cst = CommaSeparatedTokenizer() - >>> [token.text for token in cst("hi there, what's , up")] - ["hi there", "what's", "up"] - """ + Example: + >>> cst = CommaSeparatedTokenizer() + >>> [token.text for token in cst("hi there, what's , up")] + ["hi there", "what's", "up"] + Returns: + A tokenizer object that can be used to tokenize text. + + Note: + The tokenizer relies on the `RegexTokenizer` and `StripFilter` classes from the `whoosh.analysis` module. + + """ from whoosh.analysis.filters import StripFilter return RegexTokenizer(r"[^,]+") | StripFilter() @@ -345,12 +474,45 @@ def CommaSeparatedTokenizer(): class PathTokenizer(Tokenizer): """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens ``["/a", "/a/b", "/a/b/c"]``. + + Args: + expression (str, optional): The regular expression pattern used to tokenize the input string. + Defaults to "[^/]+". + + Attributes: + expr (Pattern): The compiled regular expression pattern. + """ def __init__(self, expression="[^/]+"): + """ + Initialize the Tokenizer with the given regular expression pattern. + + Args: + expression (str, optional): The regular expression pattern used for tokenization. + Defaults to "[^/]+". + + Returns: + None + """ self.expr = rcompile(expression) def __call__(self, value, positions=False, start_pos=0, **kwargs): + """Tokenizes the input string. + + Args: + value (str): The input string to be tokenized. + positions (bool, optional): Whether to include token positions. Defaults to False. + start_pos (int, optional): The starting position for token positions. Defaults to 0. + **kwargs: Additional keyword arguments. + + Yields: + Token: The generated tokens. + + Raises: + AssertionError: If the input value is not a string. + + """ assert isinstance(value, str), f"{value!r} is not unicode" token = Token(positions, **kwargs) pos = start_pos diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py index 024dc8c6..791d7842 100644 --- a/src/whoosh/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -9,10 +9,43 @@ class Marker: + """ + Represents a marker object. + + Markers are used to identify specific points in a program or data structure. + They can be used to mark positions in a Finite State Automaton (FSA) or any + other context where a named reference is needed. + + Attributes: + name (str): The name of the marker. + + Methods: + __repr__(): Returns a string representation of the marker. 
+ + Example: + >>> marker = Marker("start") + >>> marker.name + 'start' + >>> repr(marker) + '' + """ + def __init__(self, name): + """ + Initializes a new Marker object. + + Args: + name (str): The name of the marker. + """ self.name = name def __repr__(self): + """ + Returns a string representation of the marker. + + Returns: + str: A string representation of the marker. + """ return f"<{self.name}>" @@ -21,18 +54,76 @@ def __repr__(self): # Base class +class FSA: + """ + Finite State Automaton (FSA) class. + + This class represents a finite state automaton, which is a mathematical model used to describe + sequential logic circuits and pattern matching algorithms. It consists of states, transitions, + and final states. + + Attributes: + initial (object): The initial state of the automaton. + transitions (dict): A dictionary that maps source states to a dictionary of labels and + destination states. + final_states (set): A set of final states in the automaton. + + Methods: + __len__(): Returns the total number of states in the automaton. + __eq__(other): Checks if two automata are equal. + all_states(): Returns a set of all states in the automaton. + all_labels(): Returns a set of all labels used in the automaton. + get_labels(src): Returns an iterator of labels for a given source state. + generate_all(state=None, sofar=""): Generates all possible strings accepted by the automaton. + start(): Returns the initial state of the automaton. + next_state(state, label): Returns the next state given the current state and a label. + is_final(state): Checks if a given state is a final state. + add_transition(src, label, dest): Adds a transition from a source state to a destination state + with a given label. + add_final_state(state): Adds a final state to the automaton. + to_dfa(): Converts the automaton to a deterministic finite automaton (DFA). + accept(string, debug=False): Checks if a given string is accepted by the automaton. + append(fsa): Appends another automaton to the current automaton. + """ -class FSA: def __init__(self, initial): + """ + Initialize a Finite State Automaton (FSA) with the given initial state. + + Args: + initial: The initial state of the FSA. + + Attributes: + initial (State): The initial state of the FSA. + transitions (dict): A dictionary mapping states to dictionaries of transitions. + Each transition dictionary maps input symbols to destination states. + final_states (set): A set of final states in the FSA. + + """ self.initial = initial self.transitions = {} self.final_states = set() def __len__(self): + """ + Returns the number of states in the finite state automaton. + + :return: The number of states in the automaton. + :rtype: int + """ return len(self.all_states()) def __eq__(self, other): + """ + Check if two Finite State Automata (FSAs) are equal. + + Args: + other (FSA): The other FSA to compare with. + + Returns: + bool: True if the FSAs are equal, False otherwise. + """ if self.initial != other.initial: return False if self.final_states != other.final_states: @@ -42,21 +133,114 @@ def __eq__(self, other): return st == ot def all_states(self): + """ + Returns a set of all states in the automaton. + + This method iterates over the transitions in the automaton and collects all the states + encountered. It returns a set containing all the unique states. + + Returns: + set: A set of all states in the automaton. 
+ + Example: + >>> automaton = FSA() + >>> automaton.add_transition('A', 'B', 'a') + >>> automaton.add_transition('B', 'C', 'b') + >>> automaton.add_transition('C', 'D', 'c') + >>> automaton.all_states() + {'A', 'B', 'C', 'D'} + + """ stateset = set(self.transitions) for trans in self.transitions.values(): stateset.update(trans.values()) return stateset def all_labels(self): + """ + Returns a set of all labels used in the automaton. + + This method iterates over all transitions in the automaton and collects + all unique labels used in those transitions. The labels are returned as + a set. + + Returns: + set: A set of all labels used in the automaton. + + Example: + >>> automaton = FSA() + >>> automaton.add_transition(0, 1, 'a') + >>> automaton.add_transition(1, 2, 'b') + >>> automaton.add_transition(2, 3, 'a') + >>> automaton.all_labels() + {'a', 'b'} + + """ labels = set() for trans in self.transitions.values(): labels.update(trans) return labels def get_labels(self, src): + """ + Returns an iterator of labels for a given source state. + + Args: + src (object): The source state. + + Returns: + iterator: An iterator of labels for the given source state. + + Raises: + None + + Examples: + >>> fsa = FSA() + >>> src_state = State() + >>> fsa.add_transition(src_state, 'a', State()) + >>> fsa.add_transition(src_state, 'b', State()) + >>> labels = fsa.get_labels(src_state) + >>> list(labels) + ['a', 'b'] + + Notes: + - This method returns an iterator of labels for the given source state. + - If the source state has no transitions, an empty iterator will be returned. + """ return iter(self.transitions.get(src, [])) def generate_all(self, state=None, sofar=""): + """ + Generates all possible strings accepted by the automaton. + + Args: + state (object, optional): The current state. Defaults to the initial state. + sofar (str, optional): The string generated so far. Defaults to an empty string. + + Yields: + str: The generated string. + + Returns: + None + + Raises: + None + + Examples: + # Create an automaton + automaton = Automaton() + + # Generate all possible strings + for string in automaton.generate_all(): + print(string) + + Notes: + - This method uses a recursive approach to generate all possible strings accepted by the automaton. + - The `state` parameter represents the current state of the automaton. If not provided, it defaults to the initial state. + - The `sofar` parameter represents the string generated so far. If not provided, it defaults to an empty string. + - The method yields each generated string one by one, allowing for efficient memory usage when dealing with large automata. + + """ state = self.start() if state is None else state if self.is_final(state): yield sofar @@ -65,24 +249,177 @@ def generate_all(self, state=None, sofar=""): yield from self.generate_all(newstate, sofar + label) def start(self): + """ + Returns the initial state of the automaton. + + Returns: + object: + The initial state of the automaton. + + Raises: + None. + + Examples: + >>> automaton = FSA() + >>> initial_state = automaton.start() + """ return self.initial def next_state(self, state, label): + """ + Returns the next state given the current state and a label. + + Args: + state (object): The current state. + The current state of the finite state automaton. + + label (object): The label. + The label representing the transition from the current state to the next state. + + Returns: + object: The next state. + The next state of the finite state automaton based on the current state and label. 
+ + Raises: + NotImplementedError: This method should be implemented in a subclass. + This exception is raised when the `next_state` method is called on the base class + and not overridden in a subclass. + + """ raise NotImplementedError def is_final(self, state): + """ + Checks if a given state is a final state. + + Args: + state (object): The state to check. + + Returns: + bool: True if the state is a final state, False otherwise. + + Raises: + NotImplementedError: This method should be implemented in a subclass. + + Examples: + >>> fsa = FSA() + >>> fsa.is_final(0) + False + >>> fsa.is_final(1) + True + + Notes: + This method should be implemented in a subclass to provide the specific logic for determining + whether a state is a final state or not. By default, it raises a NotImplementedError. + + """ raise NotImplementedError def add_transition(self, src, label, dest): + """ + Adds a transition from a source state to a destination state with a given label. + + Args: + src (object): The source state. + label (object): The label. + dest (object): The destination state. + + Raises: + NotImplementedError: This method should be implemented in a subclass. + + Returns: + None + + Example: + >>> fsa = FSA() + >>> src = State('A') + >>> dest = State('B') + >>> label = 'transition' + >>> fsa.add_transition(src, label, dest) + + """ raise NotImplementedError def add_final_state(self, state): + """ + Adds a final state to the automaton. + + Args: + state (object): The final state to add. + + Raises: + NotImplementedError: This method should be implemented in a subclass. + + Example: + >>> automaton = Automaton() + >>> automaton.add_final_state(5) + + This method should be implemented in a subclass to add a final state to the automaton. + A final state is a state that marks the end of a sequence of transitions in the automaton. + The `state` parameter should be an object representing the final state to be added. + + Note: + This method raises a NotImplementedError to indicate that it should be implemented in a subclass. + + """ raise NotImplementedError def to_dfa(self): + """ + Converts the automaton to a deterministic finite automaton (DFA). + + This method takes the current automaton and converts it into an equivalent + deterministic finite automaton (DFA). The resulting DFA will have the same + language recognition capabilities as the original automaton, but with a + potentially different internal representation. + + Returns: + DFA: The converted DFA. + + Raises: + NotImplementedError: This method should be implemented in a subclass. + + Example: + >>> nfa = NFA() + >>> # Add states, transitions, and final states to the NFA + >>> dfa = nfa.to_dfa() + >>> # Use the converted DFA for further processing + + Note: + The `to_dfa` method should be implemented in a subclass to provide the + conversion logic specific to that automaton type. + + """ raise NotImplementedError def accept(self, string, debug=False): + """ + Checks if a given string is accepted by the automaton. + + Args: + string (str): The string to check. + debug (bool, optional): Whether to print debug information. Defaults to False. + + Returns: + bool: True if the string is accepted, False otherwise. + + Raises: + None + + Examples: + >>> automaton = Automaton() + >>> automaton.accept("abc") + True + >>> automaton.accept("def") + False + + Notes: + This method iterates over each character in the input string and transitions the automaton + to the next state based on the current state and the input label. 
If the automaton reaches + a non-final state or encounters an invalid label, it breaks the loop and returns False. + If the automaton reaches a final state after processing the entire string, it returns True. + + """ state = self.start() for label in string: @@ -96,6 +433,33 @@ def accept(self, string, debug=False): return self.is_final(state) def append(self, fsa): + """ + Appends another automaton to the current automaton. + + Args: + fsa (FSA): The automaton to append. + + Returns: + None + + Raises: + None + + Notes: + This method appends the transitions and final states of the given automaton + to the current automaton. It updates the transitions dictionary by adding + the transitions from the given automaton. It also adds epsilon transitions + from each final state of the current automaton to the initial state of the + given automaton. Finally, it updates the final states of the current automaton + to be the final states of the given automaton. + + Example: + fsa1 = FSA() + fsa2 = FSA() + # ... code to define transitions and final states for fsa1 and fsa2 ... + fsa1.append(fsa2) + # Now fsa1 contains the appended transitions and final states from fsa2. + """ self.transitions.update(fsa.transitions) for state in self.final_states: self.add_transition(state, EPSILON, fsa.initial) @@ -106,12 +470,66 @@ def append(self, fsa): class NFA(FSA): + """ + NFA (Non-Deterministic Finite Automaton) class represents a non-deterministic finite automaton. + It is a subclass of FSA (Finite State Automaton). + + Attributes: + transitions (dict): A dictionary that maps source states to a dictionary of labels and destination states. + final_states (set): A set of final states. + initial: The initial state of the NFA. + + Methods: + dump(stream=sys.stdout): Prints a textual representation of the NFA to the specified stream. + start(): Returns the initial state of the NFA as a frozenset. + add_transition(src, label, dest): Adds a transition from source state to destination state with the specified label. + add_final_state(state): Adds a final state to the NFA. + triples(): Generates all possible triples (source state, label, destination state) in the NFA. + is_final(states): Checks if any of the given states is a final state. + _expand(states): Expands the given set of states by following epsilon transitions. + next_state(states, label): Returns the set of states that can be reached from the given states with the specified label. + get_labels(states): Returns the set of labels that can be reached from the given states. + embed(other): Copies all transitions from another NFA into this NFA. + insert(src, other, dest): Connects the source state to the initial state of another NFA, and the final states of the other NFA to the destination state. + to_dfa(): Converts the NFA to a DFA (Deterministic Finite Automaton). + """ + def __init__(self, initial): + """ + Initializes a Finite State Automaton (FSA) object. + + Parameters: + - initial: The initial state of the FSA. + + Attributes: + - transitions: A dictionary representing the transitions between states. + - final_states: A set containing the final states of the FSA. + - initial: The initial state of the FSA. + """ self.transitions = {} self.final_states = set() self.initial = initial def dump(self, stream=sys.stdout): + """ + Prints a textual representation of the NFA to the specified stream. + + Args: + stream (file): The stream to print the representation to. Defaults to sys.stdout. 
+ + Returns: + None + + Raises: + None + + Example: + nfa = NFA() + nfa.add_transition(0, 'a', 1) + nfa.add_transition(1, 'b', 2) + nfa.add_transition(2, 'c', 3) + nfa.dump() # Prints the NFA representation to sys.stdout + """ starts = self.start() for src in self.transitions: beg = "@" if src in starts else " " @@ -122,24 +540,116 @@ def dump(self, stream=sys.stdout): _ = "||" if self.is_final(dests) else "" def start(self): + """ + Returns the initial state of the NFA as a frozenset. + + This method returns the initial state of the NFA (Non-Deterministic Finite Automaton) + as a frozenset. The initial state is the starting point of the automaton. + + Returns: + frozenset: The initial state of the NFA. + """ return frozenset(self._expand({self.initial})) def add_transition(self, src, label, dest): + """ + Adds a transition from the source state to the destination state with the specified label. + + This method is used to define transitions between states in a finite state automaton. + + Args: + src (object): The source state. + label (object): The label of the transition. + dest (object): The destination state. + + Returns: + None + + Raises: + None + + Example: + >>> fsa = FSA() + >>> fsa.add_transition('state1', 'a', 'state2') + """ self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest) def add_final_state(self, state): + """ + Adds a final state to the NFA. + + Args: + state (object): The final state to add. + + Returns: + None + + Raises: + TypeError: If the state is not a valid object. + + Notes: + This method adds a final state to the NFA (Non-Deterministic Finite Automaton). + A final state is a state that, when reached during the execution of the NFA, + indicates that the input string has been accepted. + + Example: + >>> nfa = NFA() + >>> state = State() + >>> nfa.add_final_state(state) + """ self.final_states.add(state) def triples(self): + """ + Generates all possible triples (source state, label, destination state) in the NFA. + + This method iterates over the transitions of the NFA and yields a tuple for each triple found. + Each triple consists of the source state, the label of the transition, and the destination state. + + Yields: + tuple: A triple (source state, label, destination state). + """ for src, trans in self.transitions.items(): for label, dests in trans.items(): for dest in dests: yield src, label, dest def is_final(self, states): + """ + Checks if any of the given states is a final state. + + Args: + states (set): The set of states to check. + + Returns: + bool: True if any of the states is a final state, False otherwise. + """ return bool(self.final_states.intersection(states)) def _expand(self, states): + """ + Expands the given set of states by following epsilon transitions. + + This method takes a set of states and expands it by following epsilon transitions. + Epsilon transitions are transitions that do not consume any input symbol. + + Args: + states (set): The set of states to expand. + + Returns: + set: The expanded set of states. + + Example: + >>> automaton = FSA() + >>> initial_states = {0} + >>> expanded_states = automaton._expand(initial_states) + >>> print(expanded_states) + {0, 1, 2, 3} + + Note: + This method modifies the input set of states in-place by adding the newly expanded states to it. + If you want to keep the original set of states unchanged, make a copy before calling this method. 
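+
+        Example (a small sketch with integer states; EPSILON is the module-level
+        epsilon label from whoosh.automata.fsa):
+            >>> from whoosh.automata.fsa import NFA, EPSILON
+            >>> nfa = NFA(0)
+            >>> nfa.add_transition(0, EPSILON, 1)
+            >>> nfa.add_transition(1, EPSILON, 2)
+            >>> sorted(nfa._expand({0}))
+            [0, 1, 2]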
+ """ transitions = self.transitions frontier = set(states) while frontier: @@ -151,6 +661,36 @@ def _expand(self, states): return states def next_state(self, states, label): + """ + Returns the set of states that can be reached from the given states with the specified label. + + Args: + states (set): The set of states to start from. + label: The label of the transition. + + Returns: + frozenset: The set of states that can be reached. + + Raises: + None + + Example: + >>> automaton = FSA() + >>> automaton.add_transition(0, 'a', 1) + >>> automaton.add_transition(1, 'b', 2) + >>> automaton.add_transition(2, 'c', 3) + >>> automaton.next_state({0}, 'a') + frozenset({1}) + + This method takes a set of states and a label as input and returns the set of states that can be reached from the given states with the specified label. It considers the transitions defined in the automaton and follows them to determine the reachable states. + + The method first checks if each state in the input set has any outgoing transitions defined. If a transition with the specified label is found, the destination states are added to the result set. Additionally, if there is a transition with the special label 'ANY', the destination states of that transition are also added to the result set. + + The result set is then expanded to include all states reachable from the initial set of states, considering all possible transitions. + + Note: The input states should be a set of valid states in the automaton. The label can be any valid label defined in the automaton's transitions. + + """ transitions = self.transitions dest_states = set() for state in states: @@ -163,6 +703,26 @@ def next_state(self, states, label): return frozenset(self._expand(dest_states)) def get_labels(self, states): + """ + Returns the set of labels that can be reached from the given states. + + Args: + states (set): The set of states. + + Returns: + set: The set of labels. + + Raises: + None. + + Examples: + >>> automaton = FSA() + >>> automaton.add_transition(1, 'a', 2) + >>> automaton.add_transition(2, 'b', 3) + >>> automaton.add_transition(3, 'c', 4) + >>> automaton.get_labels({1, 2, 3}) + {'a', 'b', 'c'} + """ transitions = self.transitions labels = set() for state in states: @@ -171,7 +731,33 @@ def get_labels(self, states): return labels def embed(self, other): - # Copy all transitions from the other NFA into this one + """ + Copies all transitions from another NFA into this NFA. + + Args: + other (NFA): The other NFA to copy transitions from. + + Returns: + None + + Raises: + None + + Notes: + This method copies all transitions from the specified NFA (`other`) into the current NFA. + It updates the transitions of the current NFA by adding the transitions from `other`. + The transitions are copied based on the source state and the label of the transition. + If a transition with the same source state and label already exists in the current NFA, + the destination states are updated by adding the destination states from `other`. + + Example: + nfa1 = NFA() + nfa2 = NFA() + # ... add transitions to nfa1 and nfa2 ... + + nfa1.embed(nfa2) + # Now nfa1 contains all transitions from nfa2. 
+ """ for s, othertrans in other.transitions.items(): trans = self.transitions.setdefault(s, {}) for label, otherdests in othertrans.items(): @@ -179,15 +765,60 @@ def embed(self, other): dests.update(otherdests) def insert(self, src, other, dest): - self.embed(other) + """ + Connects the source state to the initial state of another NFA, and the final states of the other NFA to the destination state. + + Args: + src (State): The source state to connect from. + other (NFA): The other NFA to connect. + dest (State): The destination state to connect to. + + Returns: + None + + Raises: + TypeError: If src or dest are not instances of the State class. + ValueError: If other is not an instance of the NFA class. + + Notes: + This method modifies the current NFA by embedding the other NFA into it. It connects the source state to the initial state of the other NFA, and connects the final states of the other NFA to the destination state. + + Example: + nfa = NFA() + src = State() + dest = State() + other = NFA() + # ... Initialize src, dest, and other with appropriate values ... - # Connect src to the other NFA's initial state, and the other - # NFA's final states to dest + nfa.insert(src, other, dest) + """ + self.embed(other) self.add_transition(src, EPSILON, other.initial) for finalstate in other.final_states: self.add_transition(finalstate, EPSILON, dest) def to_dfa(self): + """ + Converts the NFA to a DFA (Deterministic Finite Automaton). + + This method performs the conversion of a Non-Deterministic Finite Automaton (NFA) to a + Deterministic Finite Automaton (DFA). The resulting DFA is constructed by exploring + the states and transitions of the NFA. + + Returns: + DFA: The converted DFA. + + Notes: + - The NFA must be initialized before calling this method. + - The NFA should have at least one start state. + - The NFA should have at least one final state. + + Example: + nfa = NFA() + # ... code to initialize the NFA ... + dfa = nfa.to_dfa() + # ... code to use the converted DFA ... + """ dfa = DFA(self.start()) frontier = [self.start()] seen = set() @@ -213,7 +844,69 @@ def to_dfa(self): class DFA(FSA): + """ + Deterministic Finite Automaton (DFA) class. + + This class represents a DFA, which is a type of finite state automaton + where each input symbol uniquely determines the next state. DFAs are + commonly used in pattern matching and string searching algorithms. + + Attributes: + initial (object): The initial state of the DFA. + transitions (dict): A dictionary representing the transitions between + states. The keys are the source states, and the values are + dictionaries where the keys are the input labels and the values + are the destination states. + defaults (dict): A dictionary representing the default transitions + for states that do not have a specific transition defined for a + given input label. The keys are the source states, and the values + are the default destination states. + final_states (set): A set containing the final states of the DFA. + outlabels (dict): A dictionary caching the sorted output labels for + each state. + + Methods: + dump(stream=sys.stdout): Prints a textual representation of the DFA + to the specified stream. + start(): Returns the initial state of the DFA. + add_transition(src, label, dest): Adds a transition from the source + state to the destination state with the given input label. + set_default_transition(src, dest): Sets the default transition for + the source state to the specified destination state. 
+ add_final_state(state): Adds the specified state as a final state of + the DFA. + is_final(state): Checks if the specified state is a final state of + the DFA. + next_state(src, label): Returns the next state of the DFA given the + current state and the input label. + next_valid_string(string, asbytes=False): Returns the lexicographically + smallest valid string that can be obtained by following the DFA + from the initial state using the characters in the input string. + find_next_edge(s, label, asbytes): Finds the next edge label for the + specified state and input label. + reachable_from(src, inclusive=True): Returns the set of states that + can be reached from the specified source state. + minimize(): Minimizes the DFA by removing unreachable states and + merging equivalent states. + to_dfa(): Returns a reference to itself (DFA). + + """ + def __init__(self, initial): + """ + Initializes a new instance of the DFA class. + + Args: + initial (object): The initial state of the DFA. + + Attributes: + initial (object): The initial state of the DFA. + transitions (dict): A dictionary mapping state and input symbol pairs to the next state. + defaults (dict): A dictionary mapping states to default next states. + final_states (set): A set of final states. + outlabels (dict): A dictionary mapping states to output labels. + + """ self.initial = initial self.transitions = {} self.defaults = {} @@ -221,6 +914,33 @@ def __init__(self, initial): self.outlabels = {} def dump(self, stream=sys.stdout): + """ + Prints a textual representation of the DFA to the specified stream. + + Args: + stream (file-like object, optional): The stream to print the + representation to. Defaults to sys.stdout. + + Returns: + None + + Raises: + None + + Example: + >>> dfa = DFA() + >>> dfa.add_transition(0, 'a', 1) + >>> dfa.add_transition(1, 'b', 2) + >>> dfa.add_transition(2, 'c', 3) + >>> dfa.dump() # Prints the DFA representation to sys.stdout + @ 0 + a -> 1 + 1 + b -> 2 + 2 + c -> 3|| + + """ for src in sorted(self.transitions): beg = "@" if src == self.initial else " " print(beg, src, file=stream) @@ -230,25 +950,194 @@ def dump(self, stream=sys.stdout): _ = "||" if self.is_final(dest) else "" def start(self): + """ + Returns the initial state of the DFA. + + Returns: + object: The initial state of the DFA. + + """ return self.initial def add_transition(self, src, label, dest): + """ + Adds a transition from the source state to the destination state with + the given input label. + + Args: + src (object): The source state. + label (object): The input label. + dest (object): The destination state. + + Returns: + None + + Raises: + None + + Examples: + >>> fsa = FSA() + >>> fsa.add_transition('A', 'a', 'B') + >>> fsa.add_transition('B', 'b', 'C') + + """ self.transitions.setdefault(src, {})[label] = dest def set_default_transition(self, src, dest): + """ + Sets the default transition for the source state to the specified + destination state. + + Args: + src (object): The source state. + dest (object): The default destination state. + + Returns: + None + + Raises: + None + + Examples: + # Create an instance of the FSA class + fsa = FSA() + + # Set the default transition from state 'A' to state 'B' + fsa.set_default_transition('A', 'B') + + Notes: + - This method allows you to define a default transition for a source state. + - If a specific transition is not defined for a given input in the FSA, + the default transition will be used. 
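+
+        Example (a minimal sketch with integer states; the fallback only applies
+        when no explicit transition matches the input label):
+            >>> from whoosh.automata.fsa import DFA
+            >>> dfa = DFA(0)
+            >>> dfa.add_transition(0, "a", 1)
+            >>> dfa.set_default_transition(0, 0)
+            >>> dfa.next_state(0, "a")
+            1
+            >>> dfa.next_state(0, "z")
+            0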
+ """ self.defaults[src] = dest def add_final_state(self, state): + """ + Adds the specified state as a final state of the DFA. + + Args: + state (object): The final state to add. + + Returns: + None + + Raises: + TypeError: If the state is not of the expected type. + + Notes: + - This method adds a state to the set of final states of the DFA. + - Final states are used to determine whether a given input sequence is accepted by the DFA. + + Example: + >>> dfa = DFA() + >>> dfa.add_final_state(3) + >>> dfa.add_final_state(5) + """ self.final_states.add(state) def is_final(self, state): + """ + Checks if the specified state is a final state of the DFA. + + Args: + state (object): The state to check. + + Returns: + bool: True if the state is a final state, False otherwise. + + Raises: + None + + Examples: + >>> dfa = DFA() + >>> dfa.add_final_state('q1') + >>> dfa.is_final('q1') + True + >>> dfa.is_final('q2') + False + + Notes: + - This method is used to determine if a given state is a final state in a Deterministic Finite Automaton (DFA). + - A final state is a state in which the DFA accepts the input string and terminates. + - The method returns True if the specified state is a final state, and False otherwise. + """ return state in self.final_states def next_state(self, src, label): + """ + Returns the next state of the DFA given the current state and the + input label. + + Args: + src (object): The current state. + label (object): The input label. + + Returns: + object: The next state. + + Raises: + KeyError: If the current state or input label is not found in the DFA. + + Notes: + - If the current state is not found in the DFA transitions, the default + state for that source state will be returned. + - If the input label is not found in the transitions for the current state, + None will be returned. + + Example: + >>> dfa = DFA() + >>> dfa.add_transition('A', 'a', 'B') + >>> dfa.add_transition('B', 'b', 'C') + >>> dfa.next_state('A', 'a') + 'B' + >>> dfa.next_state('B', 'b') + 'C' + >>> dfa.next_state('C', 'c') + None + """ trans = self.transitions.get(src, {}) return trans.get(label, self.defaults.get(src, None)) def next_valid_string(self, string, asbytes=False): + """ + Returns the lexicographically smallest valid string that can be + obtained by following the DFA from the initial state using the + characters in the input string. + + Args: + string (str or bytes): The input string. + asbytes (bool, optional): Specifies whether the input string is + in bytes format. Defaults to False. + + Returns: + str or bytes: The lexicographically smallest valid string, or + None if no valid string can be obtained. + + Raises: + None + + Examples: + >>> fsa = FSA() + >>> fsa.add_transition(0, 'a', 1) + >>> fsa.add_transition(1, 'b', 2) + >>> fsa.add_transition(2, 'c', 3) + >>> fsa.set_final(3) + >>> fsa.next_valid_string('ab') # Returns 'abc' + >>> fsa.next_valid_string('abc') # Returns 'abc' + >>> fsa.next_valid_string('abcd') # Returns None + + Notes: + - The method follows the DFA (Deterministic Finite Automaton) from + the initial state using the characters in the input string. + - It returns the lexicographically smallest valid string that can be + obtained by following the DFA. + - If the input string is already a valid string, it is returned as is. + - If no valid string can be obtained, None is returned. + - The `asbytes` parameter specifies whether the input string is in + bytes format. By default, it is set to False. 
+ + """ state = self.start() stack = [] @@ -280,6 +1169,34 @@ def next_valid_string(self, string, asbytes=False): return None def find_next_edge(self, s, label, asbytes): + """ + Finds the next edge label for the specified state and input label. + + Args: + s (object): The current state. + label (object): The current input label. + asbytes (bool): Specifies whether the labels are in bytes format. + + Returns: + object: The next edge label, or None if no label is found. + + Raises: + None + + Examples: + >>> automaton = FSA() + >>> automaton.find_next_edge(1, 'a', False) + 'b' + + Notes: + - This method is used to find the next edge label for a given state and input label in the automaton. + - The `s` parameter represents the current state in the automaton. + - The `label` parameter represents the current input label. + - The `asbytes` parameter specifies whether the labels are in bytes format. + - If `label` is None, it is set to b"\x00" if `asbytes` is True, or "\0" if `asbytes` is False. + - The method returns the next edge label if found, or None if no label is found. + + """ if label is None: label = b"\x00" if asbytes else "\0" else: @@ -299,6 +1216,27 @@ def find_next_edge(self, s, label, asbytes): return None def reachable_from(self, src, inclusive=True): + """ + Returns the set of states that can be reached from the specified + source state. + + Args: + src (object): The source state. + inclusive (bool, optional): Specifies whether the source state + should be included in the result. Defaults to True. + + Returns: + set: The set of reachable states. + + Example: + >>> automaton = FSA() + >>> automaton.add_state('A') + >>> automaton.add_state('B') + >>> automaton.add_transition('A', 'B') + >>> automaton.reachable_from('A') + {'A', 'B'} + + """ transitions = self.transitions reached = set() @@ -317,6 +1255,24 @@ def reachable_from(self, src, inclusive=True): return reached def minimize(self): + """ + Minimizes the DFA by removing unreachable states and merging equivalent states. + + This method performs the following steps: + 1. Deletes unreachable states from the DFA. + 2. Partitions the remaining states into equivalence sets. + 3. Chooses one representative state from each equivalence set and maps all equivalent states to it. + 4. Applies the mapping to the existing transitions. + 5. Removes dead states - non-final states with no outgoing arcs except to themselves. + + After the minimization process, the DFA will have a reduced number of states while preserving its language. + + Usage: + dfa = DFA(...) + dfa.minimize() + + :return: None + """ transitions = self.transitions initial = self.initial @@ -404,6 +1360,26 @@ def minimize(self): self.final_states = new_finals def to_dfa(self): + """ + Converts the Finite State Automaton (FSA) to a Deterministic Finite Automaton (DFA). + + This method returns a reference to itself, as the conversion from FSA to DFA is an in-place operation. + + Returns: + DFA: A reference to the converted DFA. + + Notes: + - The conversion from FSA to DFA eliminates non-determinism by creating a new DFA with equivalent language acceptance. + - The resulting DFA may have a larger number of states compared to the original FSA. + - The original FSA is not modified during the conversion process. 
+ + Example: + >>> fsa = FSA() + >>> # Add states, transitions, and final states to the FSA + >>> dfa = fsa.to_dfa() + >>> # Use the converted DFA for further operations + + """ return self @@ -411,6 +1387,41 @@ def to_dfa(self): def renumber_dfa(dfa, base=0): + """ + Renumber the states of a DFA (Deterministic Finite Automaton) starting from a given base number. + + Args: + dfa (DFA): The DFA to renumber. + base (int, optional): The base number to start renumbering from. Defaults to 0. + + Returns: + DFA: The renumbered DFA. + + Raises: + None. + + Examples: + >>> dfa = DFA() + >>> dfa.add_state(0) + >>> dfa.add_state(1) + >>> dfa.add_transition(0, 'a', 1) + >>> dfa.add_transition(1, 'b', 0) + >>> dfa.set_initial_state(0) + >>> dfa.add_final_state(1) + >>> renumbered_dfa = renumber_dfa(dfa, base=10) + >>> renumbered_dfa.get_states() + [10, 11] + >>> renumbered_dfa.get_initial_state() + 10 + >>> renumbered_dfa.get_final_states() + [11] + + Note: + This function renumbers the states of a DFA by assigning new numbers to each state, starting from the base number. + It creates a new DFA object with the renumbered states and updates the transitions, final states, and default transitions accordingly. + The mapping between the old states and the new states is stored in a dictionary called 'mapping'. + + """ c = itertools.count(base) mapping = {} @@ -434,6 +1445,30 @@ def remap(state): def u_to_utf8(dfa, base=0): + """ + Converts Unicode labels in a DFA to UTF-8 labels. + + This function takes a DFA (Deterministic Finite Automaton) and converts + its Unicode labels to UTF-8 labels. It modifies the DFA in-place. + + Parameters: + - dfa (DFA): The DFA to convert. + - base (int): The base value for generating new state IDs. Defaults to 0. + + Raises: + - ValueError: If the DFA contains a transition with the label ANY. + + Returns: + - None: The function modifies the DFA in-place. + + Example usage: + ``` + dfa = DFA() + # ... construct the DFA ... + u_to_utf8(dfa) + # ... continue using the modified DFA ... + ``` + """ c = itertools.count(base) transitions = dfa.transitions @@ -443,7 +1478,7 @@ def u_to_utf8(dfa, base=0): if label is EPSILON: continue elif label is ANY: - raise ValueError + raise ValueError("DFA contains a transition with the label ANY") else: assert isinstance(label, str) label8 = label.encode("utf8") @@ -459,17 +1494,41 @@ def u_to_utf8(dfa, base=0): def find_all_matches(dfa, lookup_func, first=unull): """ - Uses lookup_func to find all words within levenshtein distance k of word. + Finds all words within a given Levenshtein distance of a target word. + + This function uses the provided `lookup_func` to find all words within a specified + Levenshtein distance (`k`) of a target word. It iterates through the DFA (Deterministic + Finite Automaton) `dfa` to generate all possible matches. Args: - word: The word to look up - k: Maximum edit distance - lookup_func: A single argument function that returns the first word in the - database that is greater than or equal to the input argument. + dfa (DFA): The DFA representing the search space. + lookup_func (function): A function that takes a word as input and returns the first + word in the database that is greater than or equal to the input word. + first (str): The first word to start the search from. Defaults to `unull`. + Yields: - Every matching word within levenshtein distance k from the database. - """ + str: Every matching word within the specified Levenshtein distance `k` from the database. 
+ + Example: + >>> dfa = DFA() + >>> lookup_func = lambda word: word + >>> matches = find_all_matches(dfa, lookup_func, first="hello") + >>> for match in matches: + ... print(match) + ... + hello + hallo + hullo + helio + ... + + Note: + The `dfa` parameter should be an instance of the DFA class, which represents the search space. + The `lookup_func` parameter should be a function that returns the first word in the database + that is greater than or equal to the input word. This function is used to efficiently search + for matches within the specified Levenshtein distance. + """ match = dfa.next_valid_string(first) while match: key = lookup_func(match) @@ -485,6 +1544,25 @@ def find_all_matches(dfa, lookup_func, first=unull): def reverse_nfa(n): + """ + Reverses the given NFA (Non-deterministic Finite Automaton). + + Args: + n (NFA): The NFA to be reversed. + + Returns: + NFA: The reversed NFA. + + Notes: + This function creates a new NFA by reversing the transitions of the given NFA. + It adds transitions from the destination states to the source states for each + transition in the original NFA. It also adds transitions from the initial state + of the original NFA to the final states of the original NFA. + + Example: + nfa = NFA(...) + reversed_nfa = reverse_nfa(nfa) + """ s = object() nfa = NFA(s) for src, trans in n.transitions.items(): @@ -498,6 +1576,54 @@ def reverse_nfa(n): def product(dfa1, op, dfa2): + """ + Compute the product of two DFAs. + + This function takes two deterministic finite automata (DFAs) represented by `dfa1` and `dfa2`, + and computes their product DFA based on the given binary operator `op`. + + Parameters: + - dfa1 (DFA): The first DFA. + - op (function): The binary operator used to combine the states of `dfa1` and `dfa2`. + - dfa2 (DFA): The second DFA. + + Returns: + - dfa (DFA): The product DFA. + + Algorithm: + 1. Convert `dfa1` and `dfa2` to DFAs if they are not already. + 2. Create the start state of the product DFA as a tuple of the start states of `dfa1` and `dfa2`. + 3. Initialize an empty stack and push the start state onto the stack. + 4. While the stack is not empty: + - Pop a state from the stack. + - Get the transitions of the corresponding states in `dfa1` and `dfa2`. + - For each label that is common to both sets of transitions: + - Compute the next states in `dfa1` and `dfa2` based on the label. + - If the binary operator `op` returns True for the next states, add a transition to the product DFA. + - Push the next state onto the stack. + - If both next states are final states, mark the next state in the product DFA as a final state. + 5. Return the product DFA. + + Note: + - The `op` function should take two boolean arguments and return a boolean value. + - The `DFA` class represents a deterministic finite automaton. + + Example usage: + ``` + dfa1 = DFA(...) + dfa2 = DFA(...) + product_dfa = product(dfa1, my_operator, dfa2) + ``` + + :param dfa1: The first DFA. + :type dfa1: DFA + :param op: The binary operator used to combine the states of `dfa1` and `dfa2`. + :type op: function + :param dfa2: The second DFA. + :type dfa2: DFA + :return: The product DFA. + :rtype: DFA + """ dfa1 = dfa1.to_dfa() dfa2 = dfa2.to_dfa() start = (dfa1.start(), dfa2.start()) @@ -521,22 +1647,107 @@ def product(dfa1, op, dfa2): def intersection(dfa1, dfa2): + """ + Compute the intersection of two deterministic finite automata (DFAs). + + This function takes two DFAs, `dfa1` and `dfa2`, and returns a new DFA that represents the intersection of the two DFAs. 
+ The intersection of two DFAs is a new DFA that accepts only the strings that are accepted by both `dfa1` and `dfa2`. + + Parameters: + - dfa1 (DFA): The first DFA. + - dfa2 (DFA): The second DFA. + + Returns: + - DFA: The DFA representing the intersection of `dfa1` and `dfa2`. + + Example: + >>> dfa1 = DFA(...) + >>> dfa2 = DFA(...) + >>> result = intersection(dfa1, dfa2) + """ + return product(dfa1, operator.and_, dfa2) def union(dfa1, dfa2): + """ + Computes the union of two deterministic finite automata (DFAs). + + Parameters: + - dfa1 (DFA): The first DFA. + - dfa2 (DFA): The second DFA. + + Returns: + - DFA: The DFA resulting from the union of dfa1 and dfa2. + + Raises: + - TypeError: If either dfa1 or dfa2 is not a DFA object. + + Example: + >>> dfa1 = DFA(...) + >>> dfa2 = DFA(...) + >>> result = union(dfa1, dfa2) + """ + return product(dfa1, operator.or_, dfa2) def epsilon_nfa(): + """ + Creates an epsilon-NFA (non-deterministic finite automaton) with a single epsilon transition. + + Returns: + A basic NFA (Nondeterministic Finite Automaton) with a single epsilon transition. + + Notes: + - The epsilon transition allows the automaton to move from one state to another without consuming any input. + - This function is a helper function that creates a basic NFA with only an epsilon transition. + - The resulting NFA can be further modified and combined with other NFAs to build more complex automata. + + Example: + >>> nfa = epsilon_nfa() + >>> nfa + + """ return basic_nfa(EPSILON) def dot_nfa(): + """ + Creates a non-deterministic finite automaton (NFA) that matches any single character. + + Returns: + NFA: A non-deterministic finite automaton that matches any single character. + + Example: + >>> nfa = dot_nfa() + >>> nfa.match('a') + True + >>> nfa.match('b') + True + >>> nfa.match('1') + True + """ return basic_nfa(ANY) def basic_nfa(label): + """ + Creates a basic NFA (Non-Deterministic Finite Automaton) with a single transition. + + Parameters: + label (str): The label of the transition. + + Returns: + NFA: The created NFA. + + Example: + >>> nfa = basic_nfa('a') + >>> nfa.transitions + {: {'a': []}} + >>> nfa.final_states + {} + """ s = object() e = object() nfa = NFA(s) @@ -546,6 +1757,19 @@ def basic_nfa(label): def charset_nfa(labels): + """ + Constructs a non-deterministic finite automaton (NFA) that recognizes a character set. + + Parameters: + - labels (iterable): An iterable of labels representing the characters in the character set. + + Returns: + - NFA: The constructed NFA. + + Example: + >>> labels = ['a', 'b', 'c'] + >>> nfa = charset_nfa(labels) + """ s = object() e = object() nfa = NFA(s) @@ -556,6 +1780,22 @@ def charset_nfa(labels): def string_nfa(string): + """ + Creates a Non-Deterministic Finite Automaton (NFA) that recognizes the given string. + + Parameters: + - string (str): The string to be recognized by the NFA. + + Returns: + - NFA: The NFA object that recognizes the given string. + + Example: + >>> nfa = string_nfa("abc") + >>> nfa.matches("abc") + True + >>> nfa.matches("def") + False + """ s = object() e = object() nfa = NFA(s) @@ -568,6 +1808,22 @@ def string_nfa(string): def choice_nfa(n1, n2): + """ + Creates a non-deterministic finite automaton (NFA) that represents a choice between two NFAs. + + Parameters: + - n1: The first NFA to choose from. + - n2: The second NFA to choose from. + + Returns: + - nfa: The resulting NFA representing the choice between n1 and n2. + + Example: + nfa1 = NFA(...) + nfa2 = NFA(...) 
+ choice = choice_nfa(nfa1, nfa2) + """ + s = object() e = object() nfa = NFA(s) @@ -583,6 +1839,21 @@ def choice_nfa(n1, n2): def concat_nfa(n1, n2): + """ + Concatenates two NFAs (n1 and n2) into a single NFA. + + Parameters: + - n1 (NFA): The first NFA to be concatenated. + - n2 (NFA): The second NFA to be concatenated. + + Returns: + - nfa (NFA): The resulting NFA after concatenation. + + Example: + nfa1 = NFA(...) + nfa2 = NFA(...) + concatenated_nfa = concat_nfa(nfa1, nfa2) + """ s = object() m = object() e = object() @@ -594,28 +1865,78 @@ def concat_nfa(n1, n2): def star_nfa(n): + r""" + Creates a non-deterministic finite automaton (NFA) that represents the Kleene star operation on the given NFA. + + Parameters: + - n (NFA): The input NFA. + + Returns: + - nfa (NFA): The resulting NFA after applying the Kleene star operation. + + Description: + The star_nfa function takes an NFA as input and constructs a new NFA that represents the Kleene star operation on the input NFA. + The resulting NFA accepts any number of repetitions (including zero) of the language accepted by the input NFA. + + The construction of the new NFA involves adding two new states, 's' and 'e', and modifying the transitions of the input NFA. + The new NFA has the following structure: + + -----<----- + / \ + s ---> n ---> e + \ / + ----->----- + + The state 's' is the start state of the new NFA, 'n' is the start state of the input NFA, and 'e' is a new final state. + The new NFA has transitions from 's' to 'n' and from 'e' to 's' to allow for repetitions of the input NFA's language. + The input NFA's final states are also connected to 's' to allow for zero repetitions of the input NFA's language. + + Example usage: + nfa = star_nfa(input_nfa) + """ + s = object() e = object() nfa = NFA(s) - # -----<----- - # / \ - # s ---> n ---> e - # \ / - # ----->----- nfa.insert(s, n, e) nfa.add_transition(s, EPSILON, e) for finalstate in n.final_states: nfa.add_transition(finalstate, EPSILON, s) nfa.add_final_state(e) + return nfa def plus_nfa(n): + """ + Constructs a non-deterministic finite automaton (NFA) that matches one or more occurrences of the given NFA. + + Parameters: + n (NFA): The NFA to be repeated one or more times. + + Returns: + NFA: The NFA that matches one or more occurrences of the given NFA. + + Example: + >>> nfa = plus_nfa(nfa1) + """ return concat_nfa(n, star_nfa(n)) def optional_nfa(n): + """ + Creates a non-deterministic finite automaton (NFA) that matches zero or one occurrence of the given NFA. + + Parameters: + - n: The NFA to match zero or one occurrence of. + + Returns: + - The NFA that matches zero or one occurrence of the given NFA. + + Example: + >>> nfa = optional_nfa(nfa1) + """ return choice_nfa(n, epsilon_nfa()) @@ -623,23 +1944,89 @@ def optional_nfa(n): class DMNode: - def __init__(self, n): + """ + Represents a deterministic finite state automaton (DFSA) node. + + Attributes: + n (int): The node identifier. + arcs (dict): A dictionary of arcs, where the keys are input symbols and the values are the next nodes. + final (bool): Indicates whether the node is a final state. + + Methods: + __init__(self, n: int): Initializes a new instance of the DMNode class. + __repr__(self) -> str: Returns a string representation of the DMNode. + __hash__(self) -> int: Returns the hash value of the DMNode. + tuple(self) -> tuple: Returns a tuple representation of the DMNode. + + """ + + def __init__(self, n: int): + """ + Initializes a new instance of the DMNode class. 
+ + Args: + n (int): The node identifier. + + """ self.n = n self.arcs = {} self.final = False - def __repr__(self): + def __repr__(self) -> str: + """ + Returns a string representation of the DMNode. + + Returns: + str: The string representation of the DMNode. + + """ return f"<{self.n}, {self.tuple()!r}>" - def __hash__(self): + def __hash__(self) -> int: + """ + Returns the hash value of the DMNode. + + Returns: + int: The hash value of the DMNode. + + """ return hash(self.tuple()) - def tuple(self): + def tuple(self) -> tuple: + """ + Returns a tuple representation of the DMNode. + + Returns: + tuple: The tuple representation of the DMNode. + + """ arcs = tuple(sorted(self.arcs.items())) return arcs, self.final def strings_dfa(strings): + """ + Constructs a Deterministic Finite Automaton (DFA) from a list of strings. + + Args: + strings (list): A list of strings to construct the DFA from. + + Returns: + DFA: The constructed DFA. + + Raises: + ValueError: If the strings are not in lexicographical order or if an empty string is encountered. + + Notes: + - The DFA is constructed by iteratively adding strings to the automaton. + - The DFA is built incrementally, reusing common prefixes between strings to optimize space. + - The DFA is represented using DMNode objects, which store the state transitions and accept states. + - The DFA is returned as an instance of the DFA class. + + Example: + strings = ["apple", "banana", "cherry"] + dfa = strings_dfa(strings) + """ dfa = DFA(0) c = itertools.count(1) @@ -679,12 +2066,35 @@ def strings_dfa(strings): def add_suffix(dfa, nodes, last, downto, seen): + """ + Add a suffix to the given DFA. + + This function takes a DFA (Deterministic Finite Automaton) and adds a suffix to it. + The suffix is constructed from a list of nodes, starting from the last node and + going up to the specified downto index. + + Parameters: + - dfa (DFA): The DFA to which the suffix will be added. + - nodes (list): The list of nodes representing the suffix. + - last (list): The list of labels representing the transitions from the last node + to its parent nodes. + - downto (int): The index indicating the last node in the suffix to be added. + - seen (dict): A dictionary that keeps track of already seen nodes. + + Returns: + None + + Notes: + - If a node with the same characteristics (final/nonfinal, same arcs to same destinations) + is already seen, it is replaced with the already seen node. + - If a node is replaced with an already seen one, the parent node's pointer to this node is fixed. + - The node's transitions are added to the DFA. + + """ while len(nodes) > downto: node = nodes.pop() tup = node.tuple() - # If a node just like this one (final/nonfinal, same arcs to same - # destinations) is already seen, replace with it try: this = seen[tup] except KeyError: @@ -693,12 +2103,9 @@ def add_suffix(dfa, nodes, last, downto, seen): dfa.add_final_state(this) seen[tup] = this else: - # If we replaced the node with an already seen one, fix the parent - # node's pointer to this parent = nodes[-1] inlabel = last[len(nodes) - 1] parent.arcs[inlabel] = this - # Add the node's transitions to the DFA for label, dest in node.arcs.items(): dfa.add_transition(this, label, dest) diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py index 0762b9ce..3c5ec4f9 100644 --- a/src/whoosh/automata/fst.py +++ b/src/whoosh/automata/fst.py @@ -59,18 +59,82 @@ def b(s): + """ + Encodes the input string using the Latin-1 encoding. + + Args: + s (str): The string to be encoded. 
+ + Returns: + bytes: The encoded string. + + Raises: + UnicodeEncodeError: If the input string cannot be encoded using the Latin-1 encoding. + + Example: + >>> b("hello") + b'hello' + """ return s.encode("latin-1") def u(s): + """ + Convert the input string to Unicode if it is a byte string. + + Parameters: + s (str or bytes): The input string to be converted. + + Returns: + str: The converted Unicode string. + + Raises: + None. + + Examples: + >>> u(b'hello') + 'hello' + >>> u('world') + 'world' + """ + return s.decode("ascii") if isinstance(s, bytes) else s class FileVersionError(Exception): - pass + """ + Exception raised when there is a mismatch between the version of a file and the expected version. + + This exception is typically raised when a file is being read or processed and its version does not match the expected version. + It can be used to handle version-related errors in file handling operations. + + Attributes: + message (str): Explanation of the error. + """ + + def __init__(self, message): + """ + Initialize a new instance of FileVersionError. + + Args: + message (str): Explanation of the error. + """ + self.message = message + super().__init__(message) class InactiveCursor(Exception): + """ + Exception raised when attempting to use an inactive cursor. + + An inactive cursor is a cursor that has been closed or is no longer valid. + This exception is raised to indicate that an operation cannot be performed + because the cursor is inactive. + + Attributes: + message -- explanation of the error + """ + pass @@ -86,84 +150,231 @@ class InactiveCursor(Exception): class Values: - """Base for classes the describe how to encode and decode FST values.""" + """Base for classes that describe how to encode and decode FST values. + + This class provides a set of methods that define the behavior of FST values. + Subclasses should implement these methods to handle specific types of values. + + Attributes: + None + + Methods: + is_valid(v): Returns True if v is a valid object that can be stored by this class. + common(v1, v2): Returns the "common" part of the two values. + add(prefix, v): Adds the given prefix to the given value. + subtract(v, prefix): Subtracts the "common" part (the prefix) from the given value. + write(dbfile, v): Writes value v to a file. + read(dbfile): Reads a value from the given file. + skip(dbfile): Skips over a value in the given file. + to_bytes(v): Returns a str (Python 2.x) or bytes (Python 3) representation of the given value. + merge(v1, v2): Merges two values. + + """ @staticmethod def is_valid(v): - """Returns True if v is a valid object that can be stored by this - class. + """Returns True if v is a valid object that can be stored by this class. + + Args: + v: The value to check. + + Returns: + bool: True if v is a valid object, False otherwise. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @staticmethod def common(v1, v2): - """Returns the "common" part of the two values, for whatever "common" - means for this class. For example, a string implementation would return - the common shared prefix, for an int implementation it would return - the minimum of the two numbers. + """Returns the "common" part of the two values. + + The definition of "common" depends on the specific subclass implementation. + For example, a string implementation would return the common shared prefix, + while an int implementation would return the minimum of the two numbers. 
If there is no common part, this method should return None. + + Args: + v1: The first value. + v2: The second value. + + Returns: + object: The common part of the two values, or None if there is no common part. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @staticmethod def add(prefix, v): - """Adds the given prefix (the result of a call to common()) to the - given value. + """Adds the given prefix to the given value. + + The prefix is the result of a call to the `common()` method. + + Args: + prefix: The prefix to add. + v: The value to add the prefix to. + + Returns: + object: The value with the prefix added. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @staticmethod def subtract(v, prefix): - """Subtracts the "common" part (the prefix) from the given value.""" + """Subtracts the "common" part (the prefix) from the given value. + + Args: + v: The value to subtract the prefix from. + prefix: The prefix to subtract. + + Returns: + object: The value with the prefix subtracted. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + + """ raise NotImplementedError @staticmethod def write(dbfile, v): - """Writes value v to a file.""" + """Writes value v to a file. + + Args: + dbfile: The file to write the value to. + v: The value to write. + + Returns: + None + + Raises: + NotImplementedError: This method should be implemented by subclasses. + + """ raise NotImplementedError @staticmethod def read(dbfile): - """Reads a value from the given file.""" + """Reads a value from the given file. + + Args: + dbfile: The file to read the value from. + + Returns: + object: The value read from the file. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + + """ raise NotImplementedError @classmethod def skip(cls, dbfile): - """Skips over a value in the given file.""" + """Skips over a value in the given file. + + This method is a convenience method that calls the `read()` method. + + Args: + dbfile: The file to skip the value in. + + Returns: + None + + """ cls.read(dbfile) @staticmethod def to_bytes(v): - """Returns a str (Python 2.x) or bytes (Python 3) representation of - the given value. This is used for calculating node digests, so it - should be unique but fast to calculate, and does not have to be - parseable. + """Returns a str (Python 2.x) or bytes (Python 3) representation of the given value. + + This method is used for calculating node digests. The representation should be + unique but fast to calculate, and does not have to be parseable. + + Args: + v: The value to convert. + + Returns: + str or bytes: The representation of the value. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @staticmethod def merge(v1, v2): + """Merges two values. + + The definition of "merge" depends on the specific subclass implementation. + + Args: + v1: The first value. + v2: The second value. + + Returns: + object: The merged value. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + + """ + raise NotImplementedError class IntValues(Values): - """Stores integer values in an FST.""" + """Stores integer values in an FST. + + This class provides methods for working with integer values in a Finite State Transducer (FST). 
+ It defines operations such as validation, common value calculation, addition, subtraction, and serialization. + + """ @staticmethod def is_valid(v): + """Check if a value is a valid integer for the FST. + + Args: + v (int): The value to check. + + Returns: + bool: True if the value is a valid integer, False otherwise. + + """ return isinstance(v, int) and v >= 0 @staticmethod def common(v1, v2): + """Calculate the common value between two integers. + + Args: + v1 (int): The first integer value. + v2 (int): The second integer value. + + Returns: + int or None: The common value if it exists, None otherwise. + + """ if v1 is None or v2 is None: return None if v1 == v2: @@ -172,6 +383,16 @@ def common(v1, v2): @staticmethod def add(base, v): + """Add an integer value to a base value. + + Args: + base (int or None): The base value. + v (int or None): The value to add. + + Returns: + int or None: The result of the addition. + + """ if base is None: return v if v is None: @@ -180,6 +401,16 @@ def add(base, v): @staticmethod def subtract(v, base): + """Subtract a base value from an integer value. + + Args: + v (int or None): The integer value. + base (int or None): The base value. + + Returns: + int or None: The result of the subtraction. + + """ if v is None: return None if base is None: @@ -188,18 +419,49 @@ def subtract(v, base): @staticmethod def write(dbfile, v): + """Write an integer value to a database file. + + Args: + dbfile (file): The database file to write to. + v (int): The integer value to write. + + """ dbfile.write_uint(v) @staticmethod def read(dbfile): + """Read an integer value from a database file. + + Args: + dbfile (file): The database file to read from. + + Returns: + int: The read integer value. + + """ return dbfile.read_uint() @staticmethod def skip(dbfile): + """Skip a fixed number of bytes in a database file. + + Args: + dbfile (file): The database file to skip bytes in. + + """ dbfile.seek(_INT_SIZE, 1) @staticmethod def to_bytes(v): + """Convert an integer value to bytes. + + Args: + v (int): The integer value to convert. + + Returns: + bytes: The byte representation of the integer value. + + """ return pack_int(v) @@ -208,10 +470,29 @@ class SequenceValues(Values): @staticmethod def is_valid(self, v): + """ + Check if a value is a valid sequence. + + Parameters: + - v (object): The value to check. + + Returns: + - bool: True if the value is a list or tuple, False otherwise. + """ return isinstance(self, (list, tuple)) @staticmethod def common(v1, v2): + """ + Find the common prefix between two sequences. + + Parameters: + - v1 (list or tuple): The first sequence. + - v2 (list or tuple): The second sequence. + + Returns: + - list or tuple or None: The common prefix between v1 and v2, or None if there is no common prefix. + """ if v1 is None or v2 is None: return None @@ -231,6 +512,16 @@ def common(v1, v2): @staticmethod def add(prefix, v): + """ + Concatenate a prefix and a sequence. + + Parameters: + - prefix (list or tuple): The prefix sequence. + - v (list or tuple): The sequence to concatenate. + + Returns: + - list or tuple: The concatenation of prefix and v. + """ if prefix is None: return v if v is None: @@ -239,6 +530,16 @@ def add(prefix, v): @staticmethod def subtract(v, prefix): + """ + Remove a prefix from a sequence. + + Parameters: + - v (list or tuple): The sequence. + - prefix (list or tuple): The prefix to remove. + + Returns: + - list or tuple or None: The sequence with the prefix removed, or None if the prefix is not valid. 
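+
+        Example (illustrative sketch; assumes the prefix was produced by ``common()``):
+            >>> SequenceValues.subtract(["a", "b", "c"], ["a", "b"])
+            ['c']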
+ """ if prefix is None: return v if v is None: @@ -251,67 +552,279 @@ def subtract(v, prefix): @staticmethod def write(dbfile, v): + """ + Write a sequence to a database file. + + Parameters: + - dbfile (file): The database file to write to. + - v (list or tuple): The sequence to write. + """ dbfile.write_pickle(v) @staticmethod def read(dbfile): + """ + Read a sequence from a database file. + + Parameters: + - dbfile (file): The database file to read from. + + Returns: + - list or tuple: The sequence read from the database file. + """ return dbfile.read_pickle() class BytesValues(SequenceValues): - """Stores bytes objects (str in Python 2.x) in an FST.""" + """Stores bytes objects (str in Python 2.x) in an FST. + + This class is used to store bytes objects in a Finite State Transducer (FST). + It provides methods for writing, reading, and skipping bytes objects in a database file. + + Attributes: + None + + Methods: + is_valid: Checks if a given value is a valid bytes object. + write: Writes a bytes object to a database file. + read: Reads a bytes object from a database file. + skip: Skips a bytes object in a database file. + to_bytes: Converts a value to bytes. + + """ @staticmethod def is_valid(v): + """Checks if a given value is a valid bytes object. + + Args: + v (bytes): The value to check. + + Returns: + bool: True if the value is a bytes object, False otherwise. + + """ return isinstance(v, bytes) @staticmethod def write(dbfile, v): + """Writes a bytes object to a database file. + + Args: + dbfile (file): The database file to write to. + v (bytes): The bytes object to write. + + Returns: + None + + """ dbfile.write_int(len(v)) dbfile.write(v) @staticmethod def read(dbfile): + """Reads a bytes object from a database file. + + Args: + dbfile (file): The database file to read from. + + Returns: + bytes: The read bytes object. + + """ length = dbfile.read_int() return dbfile.read(length) @staticmethod def skip(dbfile): + """Skips a bytes object in a database file. + + Args: + dbfile (file): The database file to skip from. + + Returns: + None + + """ length = dbfile.read_int() dbfile.seek(length, 1) @staticmethod def to_bytes(v): + """Converts a value to bytes. + + Args: + v: The value to convert. + + Returns: + bytes: The converted bytes object. + + """ return v class ArrayValues(SequenceValues): - """Stores array.array objects in an FST.""" + """Stores array.array objects in an FST. + + This class is used to store array.array objects in a finite state transducer (FST). + It provides methods for writing, reading, and skipping array.array objects in a database file. + + Args: + typecode (str): The typecode of the array.array objects to be stored. + + Attributes: + typecode (str): The typecode of the array.array objects. + itemsize (int): The size of each item in the array.array objects. + + """ def __init__(self, typecode): + """ + Initialize a new FST object. + + Args: + typecode (str): The typecode of the array used to store the FST. + + Attributes: + typecode (str): The typecode of the array used to store the FST. + itemsize (int): The size of each item in the array. + + Note: + The FST (Finite State Transducer) is a data structure used for efficient string matching and lookup operations. + The typecode specifies the type of elements stored in the FST array, such as 'i' for integers or 'f' for floats. + The itemsize is calculated based on the typecode and represents the size (in bytes) of each element in the array. 
+ """ self.typecode = typecode self.itemsize = array(self.typecode).itemsize def is_valid(self, v): + """ + Check if a value is a valid array.array object. + + Args: + v (Any): The value to be checked. + + Returns: + bool: True if the value is a valid array.array object, False otherwise. + + Raises: + None + + Examples: + >>> a = array.array('i', [1, 2, 3]) + >>> is_valid(a) + True + + >>> b = [1, 2, 3] + >>> is_valid(b) + False + + This method checks if the given value is a valid array.array object. It returns True if the value is a valid array.array object with the same typecode as the current instance, and False otherwise. + """ return isinstance(v, array) and v.typecode == self.typecode @staticmethod def write(dbfile, v): + """Write an array.array object to a database file. + + Args: + dbfile (file): The file object representing the database file. + v (array.array): The array.array object to be written. + + Raises: + TypeError: If `dbfile` is not a file object. + TypeError: If `v` is not an array.array object. + + Notes: + - The `dbfile` should be opened in binary mode. + - The `v` array.array object should contain elements of a single type. + + Example: + >>> import array + >>> v = array.array('i', [1, 2, 3, 4, 5]) + >>> with open('data.db', 'wb') as dbfile: + ... write(dbfile, v) + """ dbfile.write(b(v.typecode)) dbfile.write_int(len(v)) dbfile.write_array(v) def read(self, dbfile): + """Read an array.array object from a database file. + + Args: + dbfile (file): The file object representing the database file. + + Returns: + array.array: The read array.array object. + + Raises: + ValueError: If the file object is not valid or the data cannot be read. + + Notes: + This method reads an array.array object from a database file. The file object + should be opened in binary mode. The method reads the typecode of the array, + the length of the array, and then reads the array data from the file. The + method returns the read array.array object. + + Example: + >>> with open('data.db', 'rb') as file: + ... fst = FST() + ... array_obj = fst.read(file) + ... print(array_obj) + """ typecode = u(dbfile.read(1)) length = dbfile.read_int() return dbfile.read_array(typecode, length) def skip(self, dbfile): + """ + Skip an array.array object in a database file. + + This method is used to skip over an array.array object in a database file. + It reads the length of the array from the file, and then seeks forward in the file + by multiplying the length with the item size. + + Args: + dbfile (file): The file object representing the database file. + + Raises: + ValueError: If the length read from the file is negative. + + Example: + Suppose you have a database file containing an array.array object. + You can use this method to skip over the array.array object in the file. + + >>> with open('database.db', 'rb') as dbfile: + ... skip_array(dbfile) + + """ length = dbfile.read_int() + if length < 0: + raise ValueError(f"Invalid length: {length}") + dbfile.seek(length * self.itemsize, 1) @staticmethod def to_bytes(v): + """Convert an array.array object to bytes. + + Args: + v (array.array): The array.array object to be converted. + + Returns: + bytes: The converted bytes. + + Raises: + TypeError: If the input is not an array.array object. 
+ + Example: + >>> import array + >>> a = array.array('B', [1, 2, 3]) + >>> to_bytes(a) + b'\x01\x02\x03' + """ + return v.tobytes() @@ -319,10 +832,33 @@ class IntListValues(SequenceValues): """Stores lists of positive, increasing integers (that is, lists of integers where each number is >= 0 and each number is greater than or equal to the number that precedes it) in an FST. + + This class provides methods to write and read lists of integers to/from a database file. + + Usage: + To write a list of integers to a database file: + IntListValues.write(dbfile, v) + + To read a list of integers from a database file: + result = IntListValues.read(dbfile) + + To convert a list of integers to bytes: + bytes_data = IntListValues.to_bytes(v) """ @staticmethod def is_valid(v): + """Check if a given value is a valid list of positive, increasing integers. + + This function checks if the given value is a list or tuple of positive, increasing integers. + It returns True if the value is valid, and False otherwise. + + Args: + v (list or tuple): The value to check. + + Returns: + bool: True if the value is a valid list of positive, increasing integers, False otherwise. + """ if isinstance(v, (list, tuple)): if len(v) < 2: return True @@ -334,6 +870,12 @@ def is_valid(v): @staticmethod def write(dbfile, v): + """Write a list of positive, increasing integers to a database file. + + Args: + dbfile: The database file to write to. + v (list or tuple): The list of positive, increasing integers to write. + """ base = 0 dbfile.write_varint(len(v)) for x in v: @@ -344,6 +886,14 @@ def write(dbfile, v): @staticmethod def read(dbfile): + """Read a list of positive, increasing integers from a database file. + + Args: + dbfile: The database file to read from. + + Returns: + list: The list of positive, increasing integers read from the database file. + """ length = dbfile.read_varint() result = [] if length > 0: @@ -355,6 +905,14 @@ def read(dbfile): @staticmethod def to_bytes(v): + """Convert a list of positive, increasing integers to bytes. + + Args: + v (list or tuple): The list of positive, increasing integers to convert. + + Returns: + bytes: The bytes representation of the list of positive, increasing integers. + """ return b(repr(v)) @@ -368,22 +926,48 @@ class Node: """ def __init__(self, owner, address, accept=False): + """ + Initialize a Node object. + + Args: + owner (GraphReader): The owner of the node. + address (int): The address of the node. + accept (bool, optional): Whether the node is an accept state. Defaults to False. + """ self.owner = owner self.address = address self._edges = None self.accept = accept def __iter__(self): + """ + Iterate over the keys of the outgoing edges. + + Returns: + Iterator: An iterator over the keys of the outgoing edges. + """ if not self._edges: self._load() return self._edges.keys() def __contains__(self, key): + """ + Check if the node has an outgoing edge with the given key. + + Args: + key: The key of the outgoing edge. + + Returns: + bool: True if the node has an outgoing edge with the given key, False otherwise. + """ if self._edges is None: self._load() return key in self._edges def _load(self): + """ + Load the outgoing edges of the node. + """ owner = self.owner if self.address is None: d = {} @@ -395,21 +979,52 @@ def _load(self): self._edges = d def keys(self): + """ + Get the keys of the outgoing edges. + + Returns: + list: A list of the keys of the outgoing edges. 
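+
+        Example (illustrative; ``node`` is assumed to be a node obtained from a loaded graph):
+            >>> sorted(node.keys())
+            [b'a', b'b']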
+ """ if self._edges is None: self._load() return self._edges.keys() def all_edges(self): + """ + Get all the outgoing edges. + + Returns: + dict: A dictionary containing all the outgoing edges. + """ if self._edges is None: self._load() return self._edges def edge(self, key): + """ + Get the node reached by following the outgoing edge with the given key. + + Args: + key: The key of the outgoing edge. + + Returns: + Node: The node reached by following the outgoing edge with the given key. + """ if self._edges is None: self._load() return self._edges[key] def flatten(self, sofar=emptybytes): + """ + Flatten the node and yield all the strings that can be formed by concatenating + the keys of the outgoing edges. + + Args: + sofar (bytes, optional): The prefix string formed so far. Defaults to emptybytes. + + Yields: + bytes: The strings that can be formed by concatenating the keys of the outgoing edges. + """ if self.accept: yield sofar for key in sorted(self): @@ -417,38 +1032,102 @@ def flatten(self, sofar=emptybytes): yield from node.flatten(sofar + key) def flatten_strings(self): + """ + Flatten the node and yield all the strings that can be formed by concatenating + the keys of the outgoing edges. + + Yields: + str: The strings that can be formed by concatenating the keys of the outgoing edges. + """ return (utf8decode(k)[0] for k in self.flatten()) class ComboNode(Node): """Base class for nodes that blend the nodes of two different graphs. - Concrete subclasses need to implement the ``edge()`` method and possibly - override the ``accept`` property. + This class serves as a base for nodes that combine the nodes of two different graphs. + Subclasses of ComboNode should implement the `edge()` method and may override the `accept` property. + + Attributes: + a (Node): The first node to be blended. + b (Node): The second node to be blended. """ def __init__(self, a, b): + """Initialize a new ComboNode. + + Args: + a (Node): The first node to be blended. + b (Node): The second node to be blended. + """ self.a = a self.b = b def __repr__(self): + """Return a string representation of the ComboNode. + + Returns: + str: A string representation of the ComboNode. + """ return f"<{self.__class__.__name__} {self.a!r} {self.b!r}>" def __contains__(self, key): + """Check if a key is present in the ComboNode. + + Args: + key: The key to check. + + Returns: + bool: True if the key is present in either `a` or `b`, False otherwise. + """ return key in self.a or key in self.b def __iter__(self): + """Iterate over the keys in the ComboNode. + + Returns: + iter: An iterator over the keys in the ComboNode. + """ return iter(set(self.a) | set(self.b)) @property def accept(self): + """Check if the ComboNode is an accept node. + + Returns: + bool: True if either `a` or `b` is an accept node, False otherwise. + """ return self.a.accept or self.b.accept class UnionNode(ComboNode): """Makes two graphs appear to be the union of the two graphs.""" + def __init__(self, a, b): + """ + Initialize a UnionNode with two graphs. + + Args: + a (Graph): The first graph. + b (Graph): The second graph. + """ + self.a = a + self.b = b + def edge(self, key): + """ + Get the edge for the given key. + + If the key is present in both graphs, returns a UnionNode with the edges from both graphs. + If the key is only present in the first graph, returns the edge from the first graph. + If the key is only present in the second graph, returns the edge from the second graph. + + Args: + key: The key to get the edge for. 
+ + Returns: + UnionNode or Edge: The edge for the given key. + """ a = self.a b = self.b if key in a and key in b: @@ -460,9 +1139,28 @@ def edge(self, key): class IntersectionNode(ComboNode): - """Makes two graphs appear to be the intersection of the two graphs.""" + """Makes two graphs appear to be the intersection of the two graphs. + + This class represents a node in the intersection graph, which is created by taking the intersection of two graphs. + The intersection graph appears as if it contains only the common elements between the two original graphs. + + Attributes: + a (ComboNode): The first graph to be intersected. + b (ComboNode): The second graph to be intersected. + """ def edge(self, key): + """Returns the next node in the intersection graph for the given key. + + Args: + key: The key representing the edge to traverse. + + Returns: + IntersectionNode: The next node in the intersection graph for the given key. + + Raises: + KeyError: If the key is not present in both graphs. + """ a = self.a b = self.b if key in a and key in b: @@ -476,51 +1174,63 @@ class BaseCursor: """Base class for a cursor-type object for navigating an FST/word graph, represented by a :class:`GraphReader` object. - >>> cur = GraphReader(dawgfile).cursor() - >>> for key in cur.follow(): - ... print(repr(key)) - The cursor "rests" on arcs in the FSA/FST graph, rather than nodes. + + Methods: + - is_active(): Returns True if this cursor is still active. + - label(): Returns the label bytes of the current arc. + - prefix(): Returns a sequence of the label bytes for the path from the root to the current arc. + - prefix_bytes(): Returns the label bytes for the path from the root to the current arc as a single joined bytes object. + - prefix_string(): Returns the labels of the path from the root to the current arc as a decoded unicode string. + - peek_key(): Returns a sequence of label bytes representing the next closest key in the graph. + - peek_key_bytes(): Returns the next closest key in the graph as a single bytes object. + - peek_key_string(): Returns the next closest key in the graph as a decoded unicode string. + - stopped(): Returns True if the current arc leads to a stop state. + - value(): Returns the value at the current arc, if reading an FST. + - accept(): Returns True if the current arc leads to an accept state. + - at_last_arc(): Returns True if the current arc is the last outgoing arc from the previous node. + - next_arc(): Moves to the next outgoing arc from the previous node. + - follow(): Follows the current arc. + - switch_to(label): Switches to the sibling arc with the given label bytes. + - skip_to(key): Moves the cursor to the path represented by the given key bytes. + - flatten(): Yields the keys in the graph, starting at the current position. + - flatten_v(): Yields (key, value) tuples in an FST, starting at the current position. + - flatten_strings(): Yields the keys in the graph as decoded unicode strings, starting at the current position. + - find_path(path): Follows the labels in the given path, starting at the current position. """ def is_active(self): """Returns True if this cursor is still active, that is it has not read past the last arc in the graph. """ - raise NotImplementedError def label(self): """Returns the label bytes of the current arc.""" - raise NotImplementedError def prefix(self): """Returns a sequence of the label bytes for the path from the root to the current arc. 
""" - raise NotImplementedError def prefix_bytes(self): """Returns the label bytes for the path from the root to the current arc as a single joined bytes object. """ - return emptybytes.join(self.prefix()) def prefix_string(self): """Returns the labels of the path from the root to the current arc as a decoded unicode string. """ - return utf8decode(self.prefix_bytes())[0] def peek_key(self): """Returns a sequence of label bytes representing the next closest key in the graph. """ - yield from self.prefix() c = self.copy() while not c.stopped(): @@ -529,53 +1239,44 @@ def peek_key(self): def peek_key_bytes(self): """Returns the next closest key in the graph as a single bytes object.""" - return emptybytes.join(self.peek_key()) def peek_key_string(self): """Returns the next closest key in the graph as a decoded unicode string. """ - return utf8decode(self.peek_key_bytes())[0] def stopped(self): """Returns True if the current arc leads to a stop state.""" - raise NotImplementedError def value(self): """Returns the value at the current arc, if reading an FST.""" - raise NotImplementedError def accept(self): """Returns True if the current arc leads to an accept state (the end of a valid key). """ - raise NotImplementedError def at_last_arc(self): """Returns True if the current arc is the last outgoing arc from the previous node. """ - raise NotImplementedError def next_arc(self): """Moves to the next outgoing arc from the previous node.""" - raise NotImplementedError def follow(self): """Follows the current arc.""" - raise NotImplementedError def switch_to(self, label): """Switch to the sibling arc with the given label bytes.""" - _label = self.label _at_last_arc = self.at_last_arc _next_arc = self.next_arc @@ -590,7 +1291,6 @@ def switch_to(self, label): def skip_to(self, key): """Moves the cursor to the path represented by the given key bytes.""" - _accept = self.accept _prefix = self.prefix _next_arc = self.next_arc @@ -607,7 +1307,6 @@ def skip_to(self, key): def flatten(self): """Yields the keys in the graph, starting at the current position.""" - _is_active = self.is_active _accept = self.accept _stopped = self.stopped @@ -629,18 +1328,17 @@ def flatten_v(self): """Yields (key, value) tuples in an FST, starting at the current position. """ - for key in self.flatten(): yield key, self.value() def flatten_strings(self): + """Yields the keys in the graph as decoded unicode strings, starting at the current position.""" return (utf8decode(k)[0] for k in self.flatten()) def find_path(self, path): """Follows the labels in the given path, starting at the current position. """ - path = to_labels(path) _switch_to = self.switch_to _follow = self.follow @@ -650,17 +1348,22 @@ def find_path(self, path): for i, label in enumerate(path): if not first: _follow() - if not _switch_to(label): + if not _switch_to(label) or (_stopped() and i < len(path) - 1): return False - if _stopped(): - if i < len(path) - 1: - return False first = False return True class Cursor(BaseCursor): def __init__(self, graph, root=None, stack=None): + """ + Initializes a Cursor object. + + Args: + graph (Graph): The graph to navigate. + root (int, optional): The root node of the graph. Defaults to None. + stack (list, optional): The stack of arcs. Defaults to None. 
+ """ self.graph = graph self.vtype = graph.vtype self.root = root if root is not None else graph.default_root() @@ -669,43 +1372,87 @@ def __init__(self, graph, root=None, stack=None): else: self.reset() - def _current_attr(self, name): - stack = self.stack - if not stack: - raise InactiveCursor - return getattr(stack[-1], name) - def is_active(self): + """ + Checks if the cursor is active. + + Returns: + bool: True if the cursor is active, False otherwise. + """ return bool(self.stack) def stopped(self): + """ + Checks if the cursor has stopped. + + Returns: + bool: True if the cursor has stopped, False otherwise. + """ return self._current_attr("target") is None def accept(self): + """ + Checks if the cursor is in an accepting state. + + Returns: + bool: True if the cursor is in an accepting state, False otherwise. + """ return self._current_attr("accept") def at_last_arc(self): + """ + Checks if the cursor is at the last arc. + + Returns: + bool: True if the cursor is at the last arc, False otherwise. + """ return self._current_attr("lastarc") def label(self): + """ + Returns the label of the current arc. + + Returns: + object: The label of the current arc. + """ return self._current_attr("label") def reset(self): + """ + Resets the cursor to its initial state. + """ self.stack = [] self.sums = [None] self._push(self.graph.arc_at(self.root)) def copy(self): + """ + Creates a copy of the cursor. + + Returns: + Cursor: A copy of the cursor. + """ return self.__class__(self.graph, self.root, copy.deepcopy(self.stack)) def prefix(self): + """ + Returns the prefix labels of the current stack. + + Yields: + object: The prefix labels of the current stack. + """ stack = self.stack if not stack: raise InactiveCursor return (arc.label for arc in stack) - # Override: more efficient implementation using graph methods directly def peek_key(self): + """ + Returns an iterator over the labels of the current stack. + + Yields: + object: The labels of the current stack. + """ if not self.stack: raise InactiveCursor @@ -717,12 +1464,18 @@ def peek_key(self): yield arc.label def value(self): + """ + Returns the value associated with the current stack. + + Returns: + object: The value associated with the current stack. + """ stack = self.stack if not stack: raise InactiveCursor vtype = self.vtype if not vtype: - raise Exception("No value type") + raise ValueError("No value type") v = self.sums[-1] current = stack[-1] @@ -733,6 +1486,12 @@ def value(self): return v def next_arc(self): + """ + Moves the cursor to the next arc. + + Returns: + Arc: The next arc. + """ stack = self.stack if not stack: raise InactiveCursor @@ -745,14 +1504,25 @@ def next_arc(self): return current def follow(self): + """ + Follows the target arc. + + Returns: + Cursor: The updated cursor. + """ address = self._current_attr("target") if address is None: raise Exception("Can't follow a stop arc") self._push(self.graph.arc_at(address)) return self - # Override: more efficient implementation manipulating the stack def skip_to(self, key): + """ + Skips to the specified key. + + Args: + key (list): The key to skip to. + """ key = to_labels(key) stack = self.stack if not stack: @@ -773,8 +1543,16 @@ def skip_to(self, key): else: _next_arc() - # Override: more efficient implementation using find_arc def switch_to(self, label): + """ + Switches to the specified label. + + Args: + label (object): The label to switch to. + + Returns: + bool: True if the switch was successful, False otherwise. 
+ """ stack = self.stack if not stack: raise InactiveCursor @@ -793,6 +1571,9 @@ def _push(self, arc): self.stack.append(arc) def pop(self): + """ + Pops the top arc from the stack. + """ self.stack.pop() if self.vtype: self.sums.pop() @@ -816,26 +1597,63 @@ def _pop_to_prefix(self, key): class UncompiledNode: - # Represents an "in-memory" node used by the GraphWriter before it is - # written to disk. + """ + Represents an "in-memory" node used by the GraphWriter before it is written to disk. + """ compiled = False def __init__(self, owner): + """ + Initializes a new instance of the UncompiledNode class. + + Parameters: + - owner: The owner of the node. + + Returns: + None + """ self.owner = owner self._digest = None self.clear() def clear(self): + """ + Clears the node by resetting its arcs, value, accept flag, and input count. + + Parameters: + None + + Returns: + None + """ self.arcs = [] self.value = None self.accept = False self.inputcount = 0 def __repr__(self): + """ + Returns a string representation of the node. + + Parameters: + None + + Returns: + str: The string representation of the node. + """ return f"<{[(a.label, a.value) for a in self.arcs]!r}>" def digest(self): + """ + Calculates and returns the digest of the node. + + Parameters: + None + + Returns: + bytes: The digest of the node. + """ if self._digest is None: d = sha1() vtype = self.owner.vtype @@ -853,16 +1671,56 @@ def digest(self): return self._digest def edges(self): + """ + Returns the arcs of the node. + + Parameters: + None + + Returns: + list: The arcs of the node. + """ return self.arcs def last_value(self, label): + """ + Returns the value of the last arc with the specified label. + + Parameters: + - label: The label of the arc. + + Returns: + object: The value of the last arc with the specified label. + """ assert self.arcs[-1].label == label return self.arcs[-1].value def add_arc(self, label, target): + """ + Adds a new arc to the node with the specified label and target. + + Parameters: + - label: The label of the arc. + - target: The target of the arc. + + Returns: + None + """ self.arcs.append(Arc(label, target)) def replace_last(self, label, target, accept, acceptval=None): + """ + Replaces the last arc with the specified label, target, accept flag, and accept value. + + Parameters: + - label: The label of the arc. + - target: The target of the arc. + - accept: The accept flag of the arc. + - acceptval: The accept value of the arc. + + Returns: + None + """ arc = self.arcs[-1] assert arc.label == label, f"{arc.label!r} != {label!r}" arc.target = target @@ -870,16 +1728,45 @@ def replace_last(self, label, target, accept, acceptval=None): arc.acceptval = acceptval def delete_last(self, label, target): + """ + Deletes the last arc with the specified label and target. + + Parameters: + - label: The label of the arc. + - target: The target of the arc. + + Returns: + None + """ arc = self.arcs.pop() assert arc.label == label assert arc.target == target def set_last_value(self, label, value): + """ + Sets the value of the last arc with the specified label. + + Parameters: + - label: The label of the arc. + - value: The value to set. + + Returns: + None + """ arc = self.arcs[-1] assert arc.label == label, f"{arc.label!r}->{label!r}" arc.value = value def prepend_value(self, prefix): + """ + Prepends the specified prefix to the values of all arcs and the node's value. + + Parameters: + - prefix: The prefix to prepend. 
+ + Returns: + None + """ add = self.owner.vtype.add for arc in self.arcs: arc.value = add(prefix, arc.value) @@ -891,8 +1778,21 @@ class Arc: """ Represents a directed arc between two nodes in an FSA/FST graph. - The ``lastarc`` attribute is True if this is the last outgoing arc from the - previous node. + Attributes: + label (bytes): The label bytes for this arc. For a word graph, this will be a character. + target (int): The address of the node at the endpoint of this arc. + value: The inner FST value at the endpoint of this arc. + accept (bool): Whether the endpoint of this arc is an accept state (e.g. the end of a valid word). + acceptval: If the endpoint of this arc is an accept state, the final FST value for that accepted state. + lastarc: True if this is the last outgoing arc from the previous node. + endpos: The end position of the arc. + + Methods: + __init__: Initializes a new instance of the Arc class. + __repr__: Returns a string representation of the Arc object. + __eq__: Compares two Arc objects for equality. + copy: Creates a copy of the Arc object. + """ __slots__ = ("label", "target", "accept", "value", "lastarc", "acceptval", "endpos") @@ -908,14 +1808,16 @@ def __init__( endpos=None, ): """ - :param label: The label bytes for this arc. For a word graph, this will - be a character. - :param target: The address of the node at the endpoint of this arc. - :param value: The inner FST value at the endpoint of this arc. - :param accept: Whether the endpoint of this arc is an accept state - (e.g. the end of a valid word). - :param acceptval: If the endpoint of this arc is an accept state, the - final FST value for that accepted state. + Initializes a new instance of the Arc class. + + Args: + label (bytes, optional): The label bytes for this arc. For a word graph, this will be a character. + target (int, optional): The address of the node at the endpoint of this arc. + value (optional): The inner FST value at the endpoint of this arc. + accept (bool, optional): Whether the endpoint of this arc is an accept state (e.g. the end of a valid word). + acceptval (optional): If the endpoint of this arc is an accept state, the final FST value for that accepted state. + lastarc (optional): True if this is the last outgoing arc from the previous node. + endpos (optional): The end position of the arc. """ self.label = label @@ -927,6 +1829,12 @@ def __init__( self.endpos = endpos def __repr__(self): + """ + Returns a string representation of the Arc object. + + Returns: + str: A string representation of the Arc object. + """ return "<{!r}-{} {}{}>".format( self.label, self.target, @@ -935,6 +1843,15 @@ def __repr__(self): ) def __eq__(self, other): + """ + Compares two Arc objects for equality. + + Args: + other (Arc): The other Arc object to compare. + + Returns: + bool: True if the two Arc objects are equal, False otherwise. + """ if ( isinstance(other, self.__class__) and self.accept == other.accept @@ -947,6 +1864,12 @@ def __eq__(self, other): return False def copy(self): + """ + Creates a copy of the Arc object. + + Returns: + Arc: A copy of the Arc object. + """ # This is faster than using the copy module return Arc( label=self.label, @@ -965,10 +1888,11 @@ def copy(self): class GraphWriter: """Writes an FSA/FST graph to disk. - Call ``insert(key)`` to insert keys into the graph. You must - insert keys in sorted order. Call ``close()`` to finish the graph and close - the file. + The GraphWriter class is used to write an FSA/FST graph to disk. 
It provides + methods for inserting keys into the graph, starting and finishing fields, + and closing the graph. + Usage: >>> gw = GraphWriter(my_file) >>> gw.insert("alfa") >>> gw.insert("bravo") @@ -978,6 +1902,7 @@ class GraphWriter: The graph writer can write separate graphs for multiple fields. Use ``start_field(name)`` and ``finish_field()`` to separate fields. + Usage: >>> gw = GraphWriter(my_file) >>> gw.start_field("content") >>> gw.insert("alfalfa") @@ -987,17 +1912,31 @@ class GraphWriter: >>> gw.insert("artichoke") >>> gw.finish_field() >>> gw.close() + + Attributes: + version (int): The version number of the graph writer. + + Args: + dbfile (file): The file to write the graph to. + vtype (class, optional): A class to use for storing values. Defaults to None. + merge (function, optional): A function that merges two values. Defaults to None. + + Raises: + ValueError: If the field name is equivalent to False. + Exception: If finish_field() is called before start_field(). + """ version = 1 def __init__(self, dbfile, vtype=None, merge=None): """ - :param dbfile: the file to write to. - :param vtype: a :class:`Values` class to use for storing values. This - is only necessary if you will be storing values for the keys. - :param merge: a function that takes two values and returns a single - value. This is called if you insert two identical keys with values. + Initializes a new instance of the GraphWriter class. + + Args: + dbfile (file): The file to write the graph to. + vtype (class, optional): A class to use for storing values. Defaults to None. + merge (function, optional): A function that merges two values. Defaults to None. """ self.dbfile = dbfile @@ -1015,7 +1954,16 @@ def __init__(self, dbfile, vtype=None, merge=None): self._infield = False def start_field(self, fieldname): - """Starts a new graph for the given field.""" + """ + Starts a new graph for the given field. + + Args: + fieldname (str): The name of the field. + + Raises: + ValueError: If the field name is equivalent to False. + Exception: If start_field() is called while already in a field. + """ if not fieldname: raise ValueError("Field name cannot be equivalent to False") @@ -1029,7 +1977,12 @@ def start_field(self, fieldname): self._infield = True def finish_field(self): - """Finishes the graph for the current field.""" + """ + Finishes the graph for the current field. + + Raises: + Exception: If finish_field() is called before start_field(). + """ if not self._infield: raise Exception("Called finish_field before start_field") @@ -1039,7 +1992,9 @@ def finish_field(self): self.fieldname = None def close(self): - """Finishes the current graph and closes the underlying file.""" + """ + Finishes the current graph and closes the underlying file. + """ if self.fieldname is not None: self.finish_field() @@ -1052,12 +2007,17 @@ def close(self): dbfile.close() def insert(self, key, value=None): - """Inserts the given key into the graph. + """ + Inserts the given key into the graph. + + Args: + key (bytes, str): The key to insert into the graph. + value (object, optional): The value to encode in the graph along with the key. Defaults to None. - :param key: a sequence of bytes objects, a bytes object, or a string. - :param value: an optional value to encode in the graph along with the - key. If the writer was not instantiated with a value type, passing - a value here will raise an error. + Raises: + Exception: If insert() is called before starting a field. + KeyError: If the key is null or out of order. 
+ ValueError: If the value is not valid for the value type. """ if not self._infield: @@ -1242,24 +2202,102 @@ def _write_node(self, uncnode): class BaseGraphReader: + """Base class for reading graph data structures.""" + def cursor(self, rootname=None): + """ + Returns a cursor object for traversing the graph. + + Args: + rootname (str, optional): The name of the root node. Defaults to None. + + Returns: + Cursor: A cursor object. + + """ return Cursor(self, self.root(rootname)) def has_root(self, rootname): + """ + Checks if the graph has a root node with the given name. + + Args: + rootname (str): The name of the root node. + + Returns: + bool: True if the root node exists, False otherwise. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ raise NotImplementedError def root(self, rootname=None): + """ + Returns the root node of the graph. + + Args: + rootname (str, optional): The name of the root node. Defaults to None. + + Returns: + Node: The root node. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ raise NotImplementedError # Low level methods def arc_at(self, address, arc): + """ + Retrieves the arc at the given address. + + Args: + address (int): The address of the arc. + arc (Arc): An arc object to store the retrieved arc. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ raise NotImplementedError def iter_arcs(self, address, arc=None): + """ + Iterates over the arcs starting from the given address. + + Args: + address (int): The starting address. + arc (Arc, optional): An arc object to store each iterated arc. Defaults to None. + + Yields: + Arc: The iterated arcs. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ raise NotImplementedError def find_arc(self, address, label, arc=None): + """ + Finds the arc with the given label starting from the given address. + + Args: + address (int): The starting address. + label (str): The label of the arc to find. + arc (Arc, optional): An arc object to store the found arc. Defaults to None. + + Returns: + Arc: The found arc, or None if not found. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ arc = arc or Arc() for arc in self.iter_arcs(address, arc): thislabel = arc.label @@ -1271,12 +2309,44 @@ def find_arc(self, address, label, arc=None): # Convenience methods def list_arcs(self, address): + """ + Returns a list of arcs starting from the given address. + + Args: + address (int): The starting address. + + Returns: + list: A list of arcs. + + """ return [arc.copy() for arc in self.iter_arcs(address)] def arc_dict(self, address): + """ + Returns a dictionary of arcs starting from the given address. + + Args: + address (int): The starting address. + + Returns: + dict: A dictionary of arcs, where the keys are the arc labels. + + """ return {arc.label: arc.copy() for arc in self.iter_arcs(address)} def find_path(self, path, arc=None, address=None): + """ + Finds a path in the graph based on a sequence of labels. + + Args: + path (list): A list of labels representing the path. + arc (Arc, optional): An arc object to store the found arc. Defaults to None. + address (int, optional): The starting address. Defaults to None. + + Returns: + Arc: The arc at the end of the path, or None if the path is not found. 
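+
+        Example (illustrative; ``reader`` is assumed to be an open graph reader
+        and ``b"cat"`` an indexed key):
+            >>> arc = reader.find_path(b"cat")
+            >>> arc.accept
+            True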
+ + """ path = to_labels(path) if arc: @@ -1298,6 +2368,28 @@ def find_path(self, path, arc=None, address=None): class GraphReader(BaseGraphReader): + """ + A class for reading graph data from a database file. + + Args: + dbfile (file-like object): The database file to read from. + rootname (str, optional): The name of the root node. If not provided and there is only one root, it will be used automatically. Defaults to None. + vtype (object, optional): The type of values associated with the arcs. Defaults to None. + filebase (int, optional): The base offset in the file where the graph data starts. Defaults to 0. + + Attributes: + dbfile (file-like object): The database file being read. + vtype (object): The type of values associated with the arcs. + filebase (int): The base offset in the file where the graph data starts. + version (int): The version of the graph data. + roots (dict): A dictionary of root nodes in the graph. + _root (object): The current root node. + + Raises: + FileVersionError: If the database file has an invalid version. + + """ + def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): self.dbfile = dbfile self.vtype = vtype @@ -1320,28 +2412,79 @@ def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): self._root = self.root(rootname) def close(self): - self.dbfile.close() + """ + Close the database file. - # Overrides + """ + self.dbfile.close() def has_root(self, rootname): + """ + Check if a root node with the given name exists in the graph. + + Args: + rootname (str): The name of the root node. + + Returns: + bool: True if the root node exists, False otherwise. + + """ return rootname in self.roots def root(self, rootname=None): + """ + Get the root node of the graph. + + Args: + rootname (str, optional): The name of the root node. If not provided, returns the current root node. + + Returns: + object: The root node. + + """ if rootname is None: return self._root else: return self.roots[rootname] def default_root(self): + """ + Get the default root node of the graph. + + Returns: + object: The default root node. + + """ return self._root def arc_at(self, address, arc=None): + """ + Get the arc at the specified address in the graph. + + Args: + address (int): The address of the arc. + arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. + + Returns: + Arc: The arc at the specified address. + + """ arc = arc or Arc() self.dbfile.seek(address) return self._read_arc(arc) def iter_arcs(self, address, arc=None): + """ + Iterate over the arcs starting from the specified address in the graph. + + Args: + address (int): The address of the first arc. + arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. + + Yields: + Arc: The arcs in the graph. + + """ arc = arc or Arc() _read_arc = self._read_arc @@ -1353,6 +2496,18 @@ def iter_arcs(self, address, arc=None): break def find_arc(self, address, label, arc=None): + """ + Find the arc with the specified label starting from the specified address in the graph. + + Args: + address (int): The address of the first arc. + label (bytes): The label of the arc. + arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. + + Returns: + Arc: The arc with the specified label, or None if not found. 
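+
+        Example (illustrative; ``reader`` is assumed to be an open GraphReader,
+        and the returned value is the matching arc or None):
+            >>> arc = reader.find_arc(reader.default_root(), b"c")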
+ + """ # Overrides the default scanning implementation arc = arc or Arc() @@ -1371,9 +2526,17 @@ def find_arc(self, address, label, arc=None): # search method return BaseGraphReader.find_arc(self, address, label, arc) - # Implementations - def _read_arc(self, toarc=None): + """ + Read an arc from the database file. + + Args: + toarc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. + + Returns: + Arc: The arc read from the database file. + + """ toarc = toarc or Arc() dbfile = self.dbfile flags = dbfile.read_byte() @@ -1386,6 +2549,16 @@ def _read_arc(self, toarc=None): return self._read_arc_data(flags, toarc) def _read_label(self, flags): + """ + Read the label of an arc from the database file. + + Args: + flags (int): The flags indicating the label type. + + Returns: + bytes: The label of the arc. + + """ dbfile = self.dbfile if flags & MULTIBYTE_LABEL: length = dbfile.read_varint() @@ -1395,6 +2568,13 @@ def _read_label(self, flags): return label def _read_fixed_info(self): + """ + Read the fixed size information from the database file. + + Returns: + tuple: A tuple containing the size and count of the fixed size records, or None if not applicable. + + """ dbfile = self.dbfile flags = dbfile.read_byte() @@ -1406,6 +2586,17 @@ def _read_fixed_info(self): return None def _read_arc_data(self, flags, arc): + """ + Read the data of an arc from the database file. + + Args: + flags (int): The flags indicating the arc properties. + arc (Arc): An instance of the Arc class to store the arc data. + + Returns: + Arc: The arc with the data read from the database file. + + """ dbfile = self.dbfile accept = arc.accept = bool(flags & ARC_ACCEPT) arc.lastarc = flags & ARC_LAST @@ -1423,6 +2614,20 @@ def _read_arc_data(self, flags, arc): return arc def _binary_search(self, address, size, count, label, arc): + """ + Perform a binary search to find the arc with the specified label. + + Args: + address (int): The address of the first arc. + size (int): The size of each arc record. + count (int): The number of arcs. + label (bytes): The label of the arc to find. + arc (Arc): An instance of the Arc class to store the arc data. + + Returns: + Arc: The arc with the specified label, or None if not found. + + """ dbfile = self.dbfile _read_label = self._read_label @@ -1446,8 +2651,22 @@ def _binary_search(self, address, size, count, label, arc): def to_labels(key): - """Takes a string and returns a list of bytestrings, suitable for use as + """ + Takes a string and returns a list of bytestrings, suitable for use as a key or path in an FSA/FST graph. + + Args: + key (str or bytes or list or tuple): The input string. + + Returns: + tuple: A tuple of bytestrings representing the input string. + + Raises: + TypeError: If the input contains a non-bytestring. + + Example: + >>> to_labels('hello') + (b'h', b'e', b'l', b'l', b'o') """ # Convert to tuples of bytestrings (must be tuples so they can be hashed) @@ -1472,11 +2691,22 @@ def to_labels(key): def within(graph, text, k=1, prefix=0, address=None): - """Yields a series of keys in the given graph within ``k`` edit distance of + """ + Yields a series of keys in the given graph within ``k`` edit distance of ``text``. If ``prefix`` is greater than 0, all keys must match the first ``prefix`` characters of ``text``. - """ + Args: + graph (Graph): The graph to search within. + text (str): The text to search for. + k (int, optional): The maximum edit distance allowed. Defaults to 1. 
+ prefix (int, optional): The number of characters that must match at the beginning of the keys. Defaults to 0. + address (int, optional): The starting address in the graph. Defaults to None. + + Yields: + str: A key within the specified edit distance of the text. + + """ text = to_labels(text) if address is None: address = graph._root @@ -1557,6 +2787,19 @@ def within(graph, text, k=1, prefix=0, address=None): def dump_graph(graph, address=None, tab=0, out=None): + """ + Dump the graph structure starting from the given address. + + Args: + graph (Graph): The graph object. + address (int, optional): The address to start dumping from. If not provided, the root address of the graph will be used. + tab (int, optional): The number of tabs to indent the output. Defaults to 0. + out (file-like object, optional): The output stream to write the dumped graph. Defaults to sys.stdout. + + Returns: + None + + """ if address is None: address = graph._root if out is None: diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py index c41074c7..a5b31018 100644 --- a/src/whoosh/automata/glob.py +++ b/src/whoosh/automata/glob.py @@ -38,6 +38,25 @@ def parse_glob( pattern, _glob_multi="*", _glob_single="?", _glob_range1="[", _glob_range2="]" ): + """ + Parse a glob pattern and generate tokens representing the pattern. + + Args: + pattern (str): The glob pattern to parse. + _glob_multi (str, optional): The character representing multiple wildcard. Defaults to "*". + _glob_single (str, optional): The character representing single wildcard. Defaults to "?". + _glob_range1 (str, optional): The character representing the start of a character range. Defaults to "[". + _glob_range2 (str, optional): The character representing the end of a character range. Defaults to "]". + + Yields: + tuple: A tuple containing the token type and additional information. + The token types are: + - _STAR: Represents the multiple wildcard. + - _QUEST: Represents the single wildcard. + - _RANGE: Represents a character range. + - _LIT: Represents a literal character. + + """ pos = 0 last = None while pos < len(pattern): @@ -72,6 +91,21 @@ def parse_glob( def glob_automaton(pattern): + """ + Constructs a non-deterministic finite automaton (NFA) from a glob pattern. + + Args: + pattern (str): The glob pattern to convert into an NFA. + + Returns: + NFA: The constructed NFA. + + Raises: + None. + + Examples: + >>> nfa = glob_automaton("*.txt") + """ nfa = NFA(0) i = -1 for i, (op, arg) in enumerate(parse_glob(pattern)): diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py index 08317edd..53437f87 100644 --- a/src/whoosh/automata/lev.py +++ b/src/whoosh/automata/lev.py @@ -2,6 +2,18 @@ def levenshtein_automaton(term, k, prefix=0): + """ + Generate a Levenshtein automaton for a given term and maximum edit distance. + + Args: + term (str): The term to generate the automaton for. + k (int): The maximum edit distance allowed. + prefix (int, optional): The length of the prefix to match exactly. Defaults to 0. + + Returns: + NFA: The generated Levenshtein automaton. + + """ nfa = NFA((0, 0)) if prefix: for i in range(prefix): diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py index 54a5ecf6..b3a032ce 100644 --- a/src/whoosh/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -33,19 +33,59 @@ def parse(pattern): + """ + Parses a regular expression pattern and returns a parsed representation. + + Args: + pattern (str): The regular expression pattern to parse. 
+ + Returns: + list: A list representing the parsed regular expression pattern. + + Example: + >>> parse("ab*c") + ['a', ('b', '*'), 'c'] + """ stack = [] ops = [] class RegexBuilder: + """ + A class for building regular expressions using a simplified NFA representation. + + This class provides methods for constructing various components of a regular expression, + such as epsilon, character, charset, dot, choice, concatenation, star, plus, and question. + + Usage: + rb = RegexBuilder() + nfa = rb.char('a') # Create an NFA for the character 'a' + nfa2 = rb.concat(nfa, rb.char('b')) # Concatenate two NFAs + """ + def __init__(self): + """ + Initialize the RegexBuilder object. + """ self.statenum = 1 def new_state(self): + """ + Generate a new state number. + + Returns: + int: The new state number. + """ self.statenum += 1 return self.statenum def epsilon(self): + """ + Create an NFA for the epsilon transition. + + Returns: + NFA: The NFA representing the epsilon transition. + """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -54,6 +94,15 @@ def epsilon(self): return nfa def char(self, label): + """ + Create an NFA for a single character. + + Args: + label (str): The character label. + + Returns: + NFA: The NFA representing the character. + """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -62,6 +111,15 @@ def char(self, label): return nfa def charset(self, chars): + """ + Create an NFA for a character set. + + Args: + chars (str): The characters in the set. + + Returns: + NFA: The NFA representing the character set. + """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -71,6 +129,12 @@ def charset(self, chars): return e def dot(self): + """ + Create an NFA for the dot (matches any character). + + Returns: + NFA: The NFA representing the dot. + """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -79,6 +143,16 @@ def dot(self): return nfa def choice(self, n1, n2): + """ + Create an NFA for the choice (|) operator. + + Args: + n1 (NFA): The first NFA. + n2 (NFA): The second NFA. + + Returns: + NFA: The NFA representing the choice operator. + """ s = self.new_state() s1 = self.new_state() s2 = self.new_state() @@ -96,6 +170,16 @@ def choice(self, n1, n2): return nfa def concat(self, n1, n2): + """ + Create an NFA for the concatenation operator. + + Args: + n1 (NFA): The first NFA. + n2 (NFA): The second NFA. + + Returns: + NFA: The NFA representing the concatenation operator. + """ s = self.new_state() m = self.new_state() e = self.new_state() @@ -106,6 +190,15 @@ def concat(self, n1, n2): return nfa def star(self, n): + """ + Create an NFA for the Kleene star (*) operator. + + Args: + n (NFA): The NFA to apply the star operator to. + + Returns: + NFA: The NFA representing the star operator. + """ s = self.new_state() m1 = self.new_state() m2 = self.new_state() @@ -120,7 +213,25 @@ def star(self, n): return nfa def plus(self, n): + """ + Create an NFA for the plus (+) operator. + + Args: + n (NFA): The NFA to apply the plus operator to. + + Returns: + NFA: The NFA representing the plus operator. + """ return self.concat(n, self.star(n)) def question(self, n): + """ + Create an NFA for the question mark (?) operator. + + Args: + n (NFA): The NFA to apply the question mark operator to. + + Returns: + NFA: The NFA representing the question mark operator. 
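+
+        Example:
+            A minimal sketch using the builder directly::
+
+                rb = RegexBuilder()
+                nfa = rb.question(rb.char("a"))  # accepts "" and "a"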
+ """ return self.choice(n, self.epsilon()) diff --git a/src/whoosh/codec/__init__.py b/src/whoosh/codec/__init__.py index 70445636..3fc048f5 100644 --- a/src/whoosh/codec/__init__.py +++ b/src/whoosh/codec/__init__.py @@ -27,6 +27,21 @@ def default_codec(*args, **kwargs): + """ + Returns the default codec for Whoosh. + + This function imports and returns the W3Codec class from the whoosh.codec.whoosh3 module. + + Parameters: + *args: positional arguments to be passed to the W3Codec constructor. + **kwargs: keyword arguments to be passed to the W3Codec constructor. + + Returns: + W3Codec: an instance of the W3Codec class. + + Example: + codec = default_codec() + """ from whoosh.codec.whoosh3 import W3Codec return W3Codec(*args, **kwargs) diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py index e360ff52..32ba38a4 100644 --- a/src/whoosh/codec/base.py +++ b/src/whoosh/codec/base.py @@ -42,6 +42,17 @@ class OutOfOrderError(Exception): + """ + Exception raised when encountering out-of-order data during decoding. + + This exception is raised when the codec encounters data that is out of order + during the decoding process. It typically indicates a corruption or + inconsistency in the data being decoded. + + Attributes: + message -- explanation of the error + """ + pass @@ -49,79 +60,314 @@ class OutOfOrderError(Exception): class Codec: - length_stats = True + """ + The base class for defining codecs in Whoosh. + + A codec is responsible for defining how data is stored and retrieved from the index. + It provides implementations for various operations such as per-document value writing, + inverted index writing, postings writing and reading, index readers, and segment and + generation management. - # Per document value writer + Subclasses of Codec should implement the abstract methods to provide the specific + functionality required by the codec. + + Attributes: + length_stats (bool): Indicates whether length statistics should be enabled for the codec. + + """ + + length_stats = True @abstractmethod def per_document_writer(self, storage, segment): - raise NotImplementedError + """ + Returns a per-document value writer for the given storage and segment. - # Inverted index writer + Args: + storage (Storage): The storage object for the index. + segment (Segment): The segment object representing a portion of the index. + + Returns: + PerDocumentWriter: The per-document value writer. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ + + raise NotImplementedError @abstractmethod def field_writer(self, storage, segment): - raise NotImplementedError + """ + Returns an inverted index writer for the given storage and segment. + + Args: + storage (Storage): The storage object for the index. + segment (Segment): The segment object representing a portion of the index. + + Returns: + FieldWriter: The inverted index writer. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. - # Postings + """ + + raise NotImplementedError @abstractmethod def postings_writer(self, dbfile, byteids=False): + """ + Returns a postings writer for the given database file. + + Args: + dbfile (File): The file object representing the database file. + byteids (bool, optional): Indicates whether the postings should be written using byte IDs. + + Returns: + PostingsWriter: The postings writer. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. 
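+
+        Example:
+            A hypothetical usage sketch, assuming ``codec`` is a concrete
+            codec instance (such as the default whoosh3 codec) and
+            ``dbfile`` is an open posting file::
+
+                pw = codec.postings_writer(dbfile)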
+ + """ + raise NotImplementedError @abstractmethod def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): - raise NotImplementedError + """ + Returns a postings reader for the given database file. + + Args: + dbfile (File): The file object representing the database file. + terminfo (TermInfo): The term information object. + format_ (str): The format of the postings. + term (Term, optional): The term to read the postings for. + scorer (Scorer, optional): The scorer object for scoring the postings. + + Returns: + PostingsReader: The postings reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. - # Index readers + """ + + raise NotImplementedError def automata(self, storage, segment): + """ + Returns an automata object for the given storage and segment. + + Args: + storage (Storage): The storage object for the index. + segment (Segment): The segment object representing a portion of the index. + + Returns: + Automata: The automata object. + + """ + _ = storage, segment # Unused arguments return Automata() @abstractmethod def terms_reader(self, storage, segment): + """ + Returns a terms reader for the given storage and segment. + + Args: + storage (Storage): The storage object for the index. + segment (Segment): The segment object representing a portion of the index. + + Returns: + TermsReader: The terms reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ + raise NotImplementedError @abstractmethod def per_document_reader(self, storage, segment): - raise NotImplementedError + """ + Returns a per-document value reader for the given storage and segment. + + Args: + storage (Storage): The storage object for the index. + segment (Segment): The segment object representing a portion of the index. + + Returns: + PerDocumentReader: The per-document value reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. - # Segments and generations + """ + + raise NotImplementedError @abstractmethod def new_segment(self, storage, indexname): + """ + Creates a new segment for the given storage and index name. + + Args: + storage (Storage): The storage object for the index. + indexname (str): The name of the index. + + Returns: + Segment: The new segment. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ + raise NotImplementedError class WrappingCodec(Codec): + """ + A codec that wraps another codec. + + This codec delegates all the method calls to the wrapped codec. + It can be used to extend or modify the behavior of an existing codec. + + Parameters: + - child (Codec): The codec to be wrapped. + + """ + def __init__(self, child): + """ + Initializes a new instance of the WrappingCodec class. + + Parameters: + - child (Codec): The codec to be wrapped. + + """ self._child = child def per_document_writer(self, storage, segment): + """ + Returns a per-document writer for the given storage and segment. + + Parameters: + - storage (Storage): The storage object. + - segment (Segment): The segment object. + + Returns: + - PerDocumentWriter: The per-document writer. + + """ return self._child.per_document_writer(storage, segment) def field_writer(self, storage, segment): + """ + Returns a field writer for the given storage and segment. + + Parameters: + - storage (Storage): The storage object. + - segment (Segment): The segment object. + + Returns: + - FieldWriter: The field writer. 
+ + """ return self._child.field_writer(storage, segment) def postings_writer(self, dbfile, byteids=False): + """ + Returns a postings writer for the given dbfile. + + Parameters: + - dbfile (DBFile): The dbfile object. + - byteids (bool): Whether to use byteids. + + Returns: + - PostingsWriter: The postings writer. + + """ return self._child.postings_writer(dbfile, byteids=byteids) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): + """ + Returns a postings reader for the given dbfile, terminfo, format, term, and scorer. + + Parameters: + - dbfile (DBFile): The dbfile object. + - terminfo (TermInfo): The terminfo object. + - format_ (str): The format. + - term (Term): The term object. + - scorer (Scorer): The scorer object. + + Returns: + - PostingsReader: The postings reader. + + """ return self._child.postings_reader( dbfile, terminfo, format_, term=term, scorer=scorer ) def automata(self, storage, segment): + """ + Returns an automata object for the given storage and segment. + + Parameters: + - storage (Storage): The storage object. + - segment (Segment): The segment object. + + Returns: + - Automata: The automata object. + + """ return self._child.automata(storage, segment) def terms_reader(self, storage, segment): + """ + Returns a terms reader for the given storage and segment. + + Parameters: + - storage (Storage): The storage object. + - segment (Segment): The segment object. + + Returns: + - TermsReader: The terms reader. + + """ return self._child.terms_reader(storage, segment) def per_document_reader(self, storage, segment): + """ + Returns a per-document reader for the given storage and segment. + + Parameters: + - storage (Storage): The storage object. + - segment (Segment): The segment object. + + Returns: + - PerDocumentReader: The per-document reader. + + """ return self._child.per_document_reader(storage, segment) def new_segment(self, storage, indexname): + """ + Returns a new segment for the given storage and indexname. + + Parameters: + - storage (Storage): The storage object. + - indexname (str): The indexname. + + Returns: + - Segment: The new segment. + + """ return self._child.new_segment(storage, indexname) @@ -129,23 +375,125 @@ def new_segment(self, storage, indexname): class PerDocumentWriter: + """ + The PerDocumentWriter class is an abstract base class that defines the interface for writing per-document data + during the indexing process. + + Subclasses of PerDocumentWriter must implement the following methods: + - start_doc(docnum): Called at the beginning of writing a new document. + - add_field(fieldname, fieldobj, value, length): Called to add a field and its value to the document. + - add_column_value(fieldname, columnobj, value): Called to add a column value to the document. + - add_vector_items(fieldname, fieldobj, items): Called to add vector items to the document. + + The PerDocumentWriter class also provides default implementations for the following methods: + - add_vector_matcher(fieldname, fieldobj, vmatcher): Adds vector items to the document using a vector matcher. + - finish_doc(): Called at the end of writing a document. + - close(): Called to close the writer. + + Usage: + 1. Create a subclass of PerDocumentWriter. + 2. Implement the required methods. + 3. Use the subclass to write per-document data during the indexing process. 
+ + Example: + ```python + class MyDocumentWriter(PerDocumentWriter): + def start_doc(self, docnum): + # Implementation goes here + + def add_field(self, fieldname, fieldobj, value, length): + # Implementation goes here + + def add_column_value(self, fieldname, columnobj, value): + # Implementation goes here + + def add_vector_items(self, fieldname, fieldobj, items): + # Implementation goes here + + writer = MyDocumentWriter() + writer.start_doc(1) + writer.add_field("title", fieldobj, "Sample Title", 1) + writer.finish_doc() + writer.close() + ``` + """ + @abstractmethod def start_doc(self, docnum): + """ + Called at the beginning of writing a new document. + + Parameters: + - docnum (int): The document number. + + Raises: + - NotImplementedError: If the method is not implemented by the subclass. + """ raise NotImplementedError @abstractmethod def add_field(self, fieldname, fieldobj, value, length): + """ + Called to add a field and its value to the document. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj: The field object. + - value: The value of the field. + - length (int): The length of the field. + + Raises: + - NotImplementedError: If the method is not implemented by the subclass. + """ raise NotImplementedError @abstractmethod def add_column_value(self, fieldname, columnobj, value): + """ + Called to add a column value to the document. + + Parameters: + - fieldname (str): The name of the field. + - columnobj: The column object. + - value: The value of the column. + + Raises: + - NotImplementedError: If the method is not implemented by the subclass. + """ raise NotImplementedError("Codec does not implement writing columns") @abstractmethod def add_vector_items(self, fieldname, fieldobj, items): + """ + Called to add vector items to the document. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj: The field object. + - items: An iterable of vector items. + + Raises: + - NotImplementedError: If the method is not implemented by the subclass. + """ raise NotImplementedError def add_vector_matcher(self, fieldname, fieldobj, vmatcher): + """ + Adds vector items to the document using a vector matcher. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj: The field object. + - vmatcher: The vector matcher. + + Note: + This method provides a default implementation that reads vector items from the vector matcher + and calls the add_vector_items method. + + Raises: + - NotImplementedError: If the add_vector_items method is not implemented by the subclass. + """ + def readitems(): while vmatcher.is_active(): text = vmatcher.id() @@ -157,20 +505,85 @@ def readitems(): self.add_vector_items(fieldname, fieldobj, readitems()) def finish_doc(self): - # This method is intentionally left empty. + """ + Called at the end of writing a document. + + Note: + This method is intentionally left empty. + + Usage: + Subclasses can override this method to perform any necessary cleanup or finalization steps. + """ pass def close(self): - # This method is intentionally left empty. + """ + Called to close the writer. + + Note: + This method is intentionally left empty. + + Usage: + Subclasses can override this method to perform any necessary cleanup or closing steps. + """ pass class FieldWriter: + """ + The FieldWriter class is responsible for translating a generator of postings into calls to various methods + such as start_field(), start_term(), add(), finish_term(), finish_field(), etc. 
It is used in the process + of writing fields and terms to an index. + + Usage: + 1. Create an instance of FieldWriter. + 2. Implement the abstract methods: start_field(), start_term(), add(), finish_term(). + 3. Optionally, implement the add_spell_word() method if you need to add spelling words. + 4. Use the add_postings() method to process a generator of postings and write them to the index. + 5. Call the close() method to perform any necessary cleanup. + + Example: + ```python + class MyFieldWriter(FieldWriter): + def start_field(self, fieldname, fieldobj): + # Implementation goes here + + def start_term(self, text): + # Implementation goes here + + def add(self, docnum, weight, vbytes, length): + # Implementation goes here + + def finish_term(self): + # Implementation goes here + + def add_spell_word(self, fieldname, text): + # Implementation goes here + + writer = MyFieldWriter() + writer.add_postings(schema, lengths, items) + writer.close() + ``` + + Note: The finish_field() method is intentionally left empty and does not need to be implemented. + """ + def add_postings(self, schema, lengths, items): - # This method translates a generator of (fieldname, btext, docnum, w, v) - # postings into calls to start_field(), start_term(), add(), - # finish_term(), finish_field(), etc. + """ + Translates a generator of (fieldname, btext, docnum, w, v) postings into calls to start_field(), start_term(), + add(), finish_term(), finish_field(), etc. + Parameters: + - schema (Schema): The schema object that defines the fields in the index. + - lengths (Lengths): The lengths object that provides the document field lengths. + - items (generator): A generator of (fieldname, btext, docnum, weight, value) postings. + + Raises: + - OutOfOrderError: If the postings are out of order. + + Returns: + - None + """ start_field = self.start_field start_term = self.start_term add = self.add @@ -187,7 +600,7 @@ def add_postings(self, schema, lengths, items): # The bytes text of the previous posting lasttext = None # The (fieldname, btext) of the previous spelling posting - lastspell = None + # lastspell = None # The field object for the current field fieldobj = None for fieldname, btext, docnum, weight, value in items: @@ -245,128 +658,491 @@ def add_postings(self, schema, lengths, items): @abstractmethod def start_field(self, fieldname, fieldobj): + """ + This method is called when starting to process a new field during indexing or searching. + + Parameters: + - fieldname (str): The name of the field being processed. + - fieldobj: The field object representing the field being processed. + + Raises: + - NotImplementedError: This method should be implemented by subclasses. + + Notes: + - This method is typically used for initializing any necessary resources or state for processing the field. + - Subclasses should override this method to provide their own implementation. + """ raise NotImplementedError @abstractmethod def start_term(self, text): + """ + This method is called to indicate the start of a term during indexing or searching. + + Parameters: + - text (str): The text of the term. + + Raises: + - NotImplementedError: This method should be implemented by subclasses. + + """ raise NotImplementedError @abstractmethod def add(self, docnum, weight, vbytes, length): + """ + Adds a document to the codec. + + Args: + docnum (int): The document number. + weight (float): The weight of the document. + vbytes (bytes): The encoded document data. + length (int): The length of the document in bytes. 
+ + Raises: + NotImplementedError: This method should be implemented by a subclass. + + """ raise NotImplementedError def add_spell_word(self, fieldname, text): + """ + Adds a spell word to the specified field. + + Args: + fieldname (str): The name of the field to add the spell word to. + text (str): The spell word to add. + + Raises: + NotImplementedError: This method is not implemented in the base class. + """ raise NotImplementedError @abstractmethod def finish_term(self): + """ + Finish processing the current term. + + This method is called to finalize the processing of the current term. Subclasses should implement this method + to perform any necessary cleanup or finalization steps for the term. + + Raises: + NotImplementedError: This method is meant to be overridden by subclasses. + """ raise NotImplementedError def finish_field(self): + """ + Finish processing the current field. + + This method is called after all the terms in a field have been processed. + It can be overridden in subclasses to perform any necessary finalization + steps for the field. + + Usage: + codec = BaseCodec() + codec.finish_field() + + """ # This method is intentionally left empty. pass def close(self): + """ + Closes the codec. + + This method is called when the codec needs to be closed. It should release any resources + held by the codec and perform any necessary cleanup. + + Example usage: + codec = MyCodec() + # ... do some operations with the codec ... + codec.close() + """ pass # Postings +class PostingsWriter: + """Abstract base class for writing postings lists to disk. + This class defines the interface for writing postings lists to disk in a specific format. + Subclasses must implement the abstract methods to provide the necessary functionality. + + Attributes: + None + + Methods: + start_postings(format_, terminfo): Start writing a new postings list. + add_posting(id_, weight, vbytes, length=None): Add a posting to the current postings list. + finish_postings(): Finish writing the current postings list. + written(): Check if this object has already written to disk. + + """ -class PostingsWriter: @abstractmethod def start_postings(self, format_, terminfo): + """Start writing a new postings list. + + Args: + format_ (str): The format of the postings list. + terminfo (object): The term information associated with the postings list. + + Returns: + None + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ + raise NotImplementedError @abstractmethod def add_posting(self, id_, weight, vbytes, length=None): + """Add a posting to the current postings list. + + Args: + id_ (int): The identifier of the posting. + weight (float): The weight of the posting. + vbytes (bytes): The encoded bytes of the posting. + length (int, optional): The length of the posting. Defaults to None. + + Returns: + None + + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ + raise NotImplementedError def finish_postings(self): - # This method is intentionally left empty. + """Finish writing the current postings list. + + This method is intentionally left empty. + + Args: + None + + Returns: + None + + """ + pass @abstractmethod def written(self): - """Returns True if this object has already written to disk.""" + """Check if this object has already written to disk. - raise NotImplementedError + Args: + None + Returns: + bool: True if this object has already written to disk, False otherwise. 
-# Reader classes + Raises: + NotImplementedError: This method must be implemented by subclasses. + + """ + + raise NotImplementedError +# Reader classes class FieldCursor: + """A cursor for navigating through a field's data. + + This class provides methods for navigating through a field's data, + such as moving to the first position, finding a specific string, + moving to the next position, and retrieving the current term. + + Usage: + cursor = FieldCursor() + cursor.first() # Move to the first position + cursor.find("example") # Find the position of the string "example" + cursor.next() # Move to the next position + term = cursor.term() # Retrieve the current term + + Note: + This class is meant to be subclassed and the methods should be + implemented according to the specific requirements of the field's + data format. + """ + def first(self): + """Move the cursor to the first position. + + Raises: + NotImplementedError: This method should be implemented by + subclasses. + """ raise NotImplementedError def find(self, string): + """Find the position of a specific string. + + Args: + string (str): The string to find. + + Raises: + NotImplementedError: This method should be implemented by + subclasses. + """ raise NotImplementedError def next(self): + """Move the cursor to the next position. + + Raises: + NotImplementedError: This method should be implemented by + subclasses. + """ raise NotImplementedError def term(self): + """Retrieve the current term. + + Returns: + str: The current term. + + Raises: + NotImplementedError: This method should be implemented by + subclasses. + """ raise NotImplementedError class TermsReader: + """A base class for reading terms and their associated information from an index. + + This class provides methods for retrieving terms, term frequencies, document frequencies, + and creating term matchers for querying the index. + + Subclasses of `TermsReader` should implement the abstract methods to provide the necessary + functionality for reading terms from a specific index format. + + """ + @abstractmethod def __contains__(self, term): + """Check if a term exists in the index. + + Args: + term (str): The term to check. + + Returns: + bool: True if the term exists in the index, False otherwise. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def cursor(self, fieldname, fieldobj): + """Get a cursor for iterating over the terms in a field. + + Args: + fieldname (str): The name of the field. + fieldobj (object): The field object. + + Returns: + object: A cursor object for iterating over the terms in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def terms(self): + """Get a list of all terms in the index. + + Returns: + list: A list of all terms in the index. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def terms_from(self, fieldname, prefix): + """Get a list of terms starting with a given prefix in a specific field. + + Args: + fieldname (str): The name of the field. + prefix (str): The prefix to match. + + Returns: + list: A list of terms starting with the given prefix in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def items(self): + """Get a list of all (fieldname, term) pairs in the index. 
+ + Returns: + list: A list of all (fieldname, term) pairs in the index. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def items_from(self, fieldname, prefix): + """Get a list of (fieldname, term) pairs starting with a given prefix in a specific field. + + Args: + fieldname (str): The name of the field. + prefix (str): The prefix to match. + + Returns: + list: A list of (fieldname, term) pairs starting with the given prefix in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def term_info(self, fieldname, text): + """Get the term information for a specific term in a field. + + Args: + fieldname (str): The name of the field. + text (str): The term to get information for. + + Returns: + object: The term information object for the specified term in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def frequency(self, fieldname, text): + """Get the term frequency for a specific term in a field. + + Args: + fieldname (str): The name of the field. + text (str): The term to get the frequency for. + + Returns: + int: The term frequency for the specified term in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ return self.term_info(fieldname, text).weight() @abstractmethod def doc_frequency(self, fieldname, text): + """Get the document frequency for a specific term in a field. + + Args: + fieldname (str): The name of the field. + text (str): The term to get the document frequency for. + + Returns: + int: The document frequency for the specified term in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ return self.term_info(fieldname, text).doc_frequency() @abstractmethod def matcher(self, fieldname, text, format_, scorer=None): + """Create a term matcher for a specific term in a field. + + Args: + fieldname (str): The name of the field. + text (str): The term to create the matcher for. + format_ (object): The format object for the field. + scorer (object, optional): The scorer object to use for scoring the matches. + + Returns: + object: A term matcher for the specified term in the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError @abstractmethod def indexed_field_names(self): + """Get a list of all field names in the index. + + Returns: + list: A list of all field names in the index. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + + """ raise NotImplementedError def close(self): - # This method is intentionally left empty. + """Close the terms reader. + + This method is intentionally left empty. + + """ pass class Automata: + """ + The Automata class provides methods for working with automata used in string matching operations. + """ + @staticmethod def levenshtein_dfa(uterm, maxdist, prefix=0): + """ + Generates a deterministic finite automaton (DFA) for performing approximate string matching using the Levenshtein distance algorithm. + + Args: + uterm (str): The target term to match against. + maxdist (int): The maximum allowed edit distance between the target term and the matched terms. + prefix (int, optional): The length of the common prefix between the target term and the matched terms. 
Defaults to 0. + + Returns: + DFA: The generated DFA for performing approximate string matching. + """ return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa() @staticmethod def find_matches(dfa, cur): + """ + Finds all matches in a given cursor using a DFA. + + Args: + dfa (DFA): The DFA used for matching. + cur (Cursor): The cursor to search for matches. + + Yields: + str: The matched terms found in the cursor. + """ unull = chr(0) term = cur.text() @@ -385,43 +1161,126 @@ def find_matches(dfa, cur): match = dfa.next_valid_string(term) def terms_within(self, fieldcur, uterm, maxdist, prefix=0): + """ + Finds all terms within a given cursor that are within a specified edit distance of a target term. + + Args: + fieldcur (Cursor): The cursor representing the field to search within. + uterm (str): The target term to match against. + maxdist (int): The maximum allowed edit distance between the target term and the matched terms. + prefix (int, optional): The length of the common prefix between the target term and the matched terms. Defaults to 0. + + Returns: + Generator[str]: A generator that yields the matched terms found within the cursor. + """ dfa = self.levenshtein_dfa(uterm, maxdist, prefix) return self.find_matches(dfa, fieldcur) # Per-doc value reader +class PerDocumentReader: + """ + The PerDocumentReader class represents a base class for reading per-document data in a search index. + This class provides methods for accessing and manipulating per-document data, such as deletions, columns, bitmaps, + lengths, vectors, and stored fields. + + Subclasses of PerDocumentReader should implement the abstract methods to provide the specific functionality + required for a particular codec. + + Usage: + 1. Create an instance of a subclass of PerDocumentReader. + 2. Use the provided methods to access and manipulate per-document data. + + Example: + ``` + reader = MyPerDocumentReader() + count = reader.doc_count() + print(f"Total number of documents: {count}") + ``` + """ -class PerDocumentReader: def close(self): - # This method is intentionally left empty. + """ + Closes the PerDocumentReader and releases any resources associated with it. + + This method should be called when the PerDocumentReader is no longer needed. + """ + pass @abstractmethod def doc_count(self): + """ + Returns the number of documents in the reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def doc_count_all(self): + """ + Returns the total number of documents, including deleted documents, in the reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError # Deletions @abstractmethod def has_deletions(self): + """ + Returns True if the reader has deletions, False otherwise. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def is_deleted(self, docnum): + """ + Returns True if the document with the given docnum is deleted, False otherwise. + + Args: + docnum (int): The document number. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def deleted_docs(self): + """ + Returns a set of document numbers that are deleted. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. 
+ """ + raise NotImplementedError def all_doc_ids(self): """ Returns an iterator of all (undeleted) document IDs in the reader. + + Returns: + An iterator of document IDs. + + Example: + ``` + for doc_id in reader.all_doc_ids(): + print(doc_id) + ``` """ is_deleted = self.is_deleted @@ -430,28 +1289,89 @@ def all_doc_ids(self): ) def iter_docs(self): + """ + Returns an iterator over all (undeleted) documents in the reader. + + Yields: + Tuple[int, dict]: A tuple containing the document number and the stored fields of the document. + + Example: + ``` + for docnum, fields in reader.iter_docs(): + print(f"Document {docnum}: {fields}") + ``` + """ + for docnum in self.all_doc_ids(): yield docnum, self.stored_fields(docnum) # Columns def supports_columns(self): + """ + Returns True if the reader supports columns, False otherwise. + + Returns: + bool: True if the reader supports columns, False otherwise. + """ + return False def has_column(self, fieldname): + """ + Returns True if the reader has a column with the given fieldname, False otherwise. + + Args: + fieldname (str): The name of the column field. + + Returns: + bool: True if the reader has the column, False otherwise. + """ + _ = fieldname # Unused argument return False def list_columns(self): + """ + Returns a list of all column names in the reader. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError # Don't need to override this if supports_columns() returns False def column_reader(self, fieldname, column): + """ + Returns a reader for accessing the values in the specified column. + + Args: + fieldname (str): The name of the column field. + column (str): The name of the column. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError # Bitmaps def field_docs(self, fieldname): + """ + Returns the bitmap of documents that have a value for the specified field. + + Args: + fieldname (str): The name of the field. + + Returns: + Bitmap or None: The bitmap of documents or None if the field does not exist. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + _ = fieldname # Unused argument return None @@ -459,44 +1379,130 @@ def field_docs(self, fieldname): @abstractmethod def doc_field_length(self, docnum, fieldname, default=0): + """ + Returns the length of the specified field in the specified document. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + default (int, optional): The default length to return if the field does not exist. Defaults to 0. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def field_length(self, fieldname): + """ + Returns the total length of the specified field across all documents. + + Args: + fieldname (str): The name of the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def min_field_length(self, fieldname): + """ + Returns the minimum length of the specified field across all documents. + + Args: + fieldname (str): The name of the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError @abstractmethod def max_field_length(self, fieldname): + """ + Returns the maximum length of the specified field across all documents. 
+ + Args: + fieldname (str): The name of the field. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError # Vectors def has_vector(self, docnum, fieldname): + """ + Returns True if the specified document has a vector for the specified field, False otherwise. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + + Returns: + bool: True if the document has a vector, False otherwise. + """ + _ = docnum, fieldname # Unused arguments return False # Don't need to override this if has_vector() always returns False def vector(self, docnum, fieldname, format_): + """ + Returns the vector for the specified document and field. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + format_ (str): The format of the vector. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError # Stored @abstractmethod def stored_fields(self, docnum): + """ + Returns the stored fields of the specified document. + + Args: + docnum (int): The document number. + + Raises: + NotImplementedError: If the method is not implemented by the subclass. + """ + raise NotImplementedError def all_stored_fields(self): + """ + Returns an iterator over the stored fields of all (undeleted) documents in the reader. + + Yields: + dict: The stored fields of a document. + + Example: + ``` + for fields in reader.all_stored_fields(): + print(fields) + ``` + """ + for docnum in self.all_doc_ids(): yield self.stored_fields(docnum) # Segment base class - - class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are @@ -518,37 +1524,84 @@ class Segment: # self.segid def __init__(self, indexname): + """ + Initializes a Segment object. + + :param indexname: The name of the index. + """ self.indexname = indexname self.segid = self._random_id() self.compound = False @classmethod def _random_id(cls, size=16): + """ + Generates a random ID for the segment. + + :param size: The size of the random ID. Default is 16. + :return: The random ID. + """ return random_name(size=size) def __repr__(self): + """ + Returns a string representation of the Segment object. + + :return: The string representation. + """ return f"<{self.__class__.__name__} {self.segment_id()}>" def __eq__(self, other): + """ + Checks if two Segment objects are equal. + + :param other: The other Segment object to compare. + :return: True if the objects are equal, False otherwise. + """ return isinstance(other, type(self)) and self.segment_id() == other.segment_id() def __hash__(self): + """ + Returns the hash value of the Segment object. + + :return: The hash value. + """ return hash(self.segment_id()) def codec(self): + """ + Returns the codec used by the segment. + + :return: The codec used by the segment. + """ raise NotImplementedError def index_name(self): + """ + Returns the name of the index. + + :return: The name of the index. + """ return self.indexname def segment_id(self): + """ + Returns the ID of the segment. + + :return: The ID of the segment. + """ if hasattr(self, "name"): # Old segment class return self.name else: return f"{self.index_name()}_{self.segid}" - def is_compound(self): + def is_compound(self): + """ + Checks if the segment is a compound segment. + + :return: True if the segment is compound, False otherwise. 
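+
+        A hedged usage sketch, assuming ``segment`` is a concrete Segment
+        and ``storage`` is its storage object::
+
+            if segment.is_compound():
+                storage = segment.open_compound_file(storage)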
+ """ if not hasattr(self, "compound"): return False return self.compound @@ -556,31 +1609,54 @@ def is_compound(self): # File convenience methods def make_filename(self, ext): + """ + Creates a filename for the segment with the given extension. + + :param ext: The extension of the filename. + :return: The filename. + """ return f"{self.segment_id()}{ext}" def list_files(self, storage): + """ + Lists the files associated with the segment in the given storage. + + :param storage: The storage object. + :return: A list of file names. + """ prefix = f"{self.segment_id()}." return [name for name in storage.list() if name.startswith(prefix)] def create_file(self, storage, ext, **kwargs): - """Convenience method to create a new file in the given storage named - with this segment's ID and the given extension. Any keyword arguments - are passed to the storage's create_file method. """ + Creates a new file in the given storage with the segment's ID and the given extension. + :param storage: The storage object. + :param ext: The extension of the file. + :param kwargs: Additional keyword arguments passed to the storage's create_file method. + :return: The created file object. + """ fname = self.make_filename(ext) return storage.create_file(fname, **kwargs) def open_file(self, storage, ext, **kwargs): - """Convenience method to open a file in the given storage named with - this segment's ID and the given extension. Any keyword arguments are - passed to the storage's open_file method. """ + Opens a file in the given storage with the segment's ID and the given extension. + :param storage: The storage object. + :param ext: The extension of the file. + :param kwargs: Additional keyword arguments passed to the storage's open_file method. + :return: The opened file object. + """ fname = self.make_filename(ext) return storage.open_file(fname, **kwargs) def create_compound_file(self, storage): + """ + Creates a compound file in the given storage by combining the segment's files. + + :param storage: The storage object. + """ segfiles = self.list_files(storage) assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles) cfile = self.create_file(storage, self.COMPOUND_EXT) @@ -590,6 +1666,12 @@ def create_compound_file(self, storage): self.compound = True def open_compound_file(self, storage): + """ + Opens the compound file associated with the segment in the given storage. + + :param storage: The storage object. + :return: The opened compound file object. + """ name = self.make_filename(self.COMPOUND_EXT) dbfile = storage.open_file(name) return CompoundStorage(dbfile, use_mmap=storage.supports_mmap) @@ -602,24 +1684,28 @@ def doc_count_all(self): Returns the total number of documents, DELETED OR UNDELETED, in this segment. """ - raise NotImplementedError def doc_count(self): """ Returns the number of (undeleted) documents in this segment. """ - return self.doc_count_all() - self.deleted_count() def set_doc_count(self, doccount): + """ + Sets the number of documents in the segment. + + :param doccount: The number of documents. + """ raise NotImplementedError def has_deletions(self): """ - Returns True if any documents in this segment are deleted. - """ + Checks if any documents in this segment are deleted. + :return: True if there are deleted documents, False otherwise. + """ return self.deleted_count() > 0 @abstractmethod @@ -627,106 +1713,308 @@ def deleted_count(self): """ Returns the total number of deleted documents in this segment. 
""" - raise NotImplementedError @abstractmethod def deleted_docs(self): + """ + Returns a list of deleted document numbers in this segment. + """ raise NotImplementedError @abstractmethod def delete_document(self, docnum, delete=True): - """Deletes the given document number. The document is not actually - removed from the index until it is optimized. - - :param docnum: The document number to delete. - :param delete: If False, this undeletes a deleted document. """ + Deletes or undeletes the given document number. + :param docnum: The document number to delete or undelete. + :param delete: If False, undeletes the document. Default is True. + """ raise NotImplementedError @abstractmethod def is_deleted(self, docnum): """ - Returns True if the given document number is deleted. - """ + Checks if the given document number is deleted. + :param docnum: The document number. + :return: True if the document is deleted, False otherwise. + """ raise NotImplementedError def should_assemble(self): + """ + Checks if the segment should be assembled. + + :return: True if the segment should be assembled, False otherwise. + """ return True # Wrapping Segment +class WrappingSegment(Segment): + """ + A segment that wraps another segment. + This class serves as a wrapper around another segment, providing a way to modify or extend its behavior. + + Args: + child (Segment): The segment to be wrapped. + + """ -class WrappingSegment(Segment): def __init__(self, child): self._child = child def codec(self): + """ + Get the codec used by the wrapped segment. + + Returns: + Codec: The codec used by the wrapped segment. + + """ return self._child.codec() def index_name(self): + """ + Get the name of the index associated with the wrapped segment. + + Returns: + str: The name of the index associated with the wrapped segment. + + """ return self._child.index_name() def segment_id(self): + """ + Get the unique identifier of the wrapped segment. + + Returns: + str: The unique identifier of the wrapped segment. + + """ return self._child.segment_id() def is_compound(self): + """ + Check if the wrapped segment is a compound segment. + + Returns: + bool: True if the wrapped segment is a compound segment, False otherwise. + + """ return self._child.is_compound() def should_assemble(self): + """ + Check if the wrapped segment should be assembled. + + Returns: + bool: True if the wrapped segment should be assembled, False otherwise. + + """ return self._child.should_assemble() def make_filename(self, ext): + """ + Generate a filename for the wrapped segment with the given extension. + + Args: + ext (str): The file extension. + + Returns: + str: The generated filename for the wrapped segment. + + """ return self._child.make_filename(ext) def list_files(self, storage): + """ + List all files associated with the wrapped segment in the given storage. + + Args: + storage: The storage object. + + Returns: + list: A list of filenames associated with the wrapped segment. + + """ return self._child.list_files(storage) def create_file(self, storage, ext, **kwargs): + """ + Create a new file for the wrapped segment with the given extension. + + Args: + storage: The storage object. + ext (str): The file extension. + **kwargs: Additional keyword arguments. + + Returns: + File: The created file object. + + """ return self._child.create_file(storage, ext, **kwargs) def open_file(self, storage, ext, **kwargs): + """ + Open an existing file for the wrapped segment with the given extension. + + Args: + storage: The storage object. 
+ ext (str): The file extension. + **kwargs: Additional keyword arguments. + + Returns: + File: The opened file object. + + """ return self._child.open_file(storage, ext, **kwargs) def create_compound_file(self, storage): + """ + Create a compound file for the wrapped segment in the given storage. + + Args: + storage: The storage object. + + Returns: + CompoundFile: The created compound file object. + + """ return self._child.create_compound_file(storage) def open_compound_file(self, storage): + """ + Open a compound file for the wrapped segment in the given storage. + + Args: + storage: The storage object. + + Returns: + CompoundFile: The opened compound file object. + + """ return self._child.open_compound_file(storage) def delete_document(self, docnum, delete=True): + """ + Delete a document from the wrapped segment. + + Args: + docnum (int): The document number. + delete (bool): Whether to mark the document as deleted or not. Default is True. + + Returns: + bool: True if the document was successfully deleted, False otherwise. + + """ return self._child.delete_document(docnum, delete=delete) def has_deletions(self): + """ + Check if the wrapped segment has any deleted documents. + + Returns: + bool: True if the wrapped segment has deleted documents, False otherwise. + + """ return self._child.has_deletions() def deleted_count(self): + """ + Get the number of deleted documents in the wrapped segment. + + Returns: + int: The number of deleted documents. + + """ return self._child.deleted_count() def deleted_docs(self): + """ + Get a list of deleted document numbers in the wrapped segment. + + Returns: + list: A list of deleted document numbers. + + """ return self._child.deleted_docs() def is_deleted(self, docnum): + """ + Check if a document with the given number is deleted in the wrapped segment. + + Args: + docnum (int): The document number. + + Returns: + bool: True if the document is deleted, False otherwise. + + """ return self._child.is_deleted(docnum) def set_doc_count(self, doccount): + """ + Set the total number of documents in the wrapped segment. + + Args: + doccount (int): The total number of documents. + + """ self._child.set_doc_count(doccount) def doc_count(self): + """ + Get the total number of documents in the wrapped segment. + + Returns: + int: The total number of documents. + + """ return self._child.doc_count() def doc_count_all(self): + """ + Get the total number of documents, including deleted ones, in the wrapped segment. + + Returns: + int: The total number of documents. + + """ return self._child.doc_count_all() # Multi per doc reader +class MultiPerDocumentReader(PerDocumentReader): + """ + A reader that combines multiple per-document readers into a single reader. + This class is used to read documents from multiple per-document readers and present them as a single reader. + It provides methods to access document counts, check for deletions, access columns, and retrieve field lengths. + + Parameters: + - readers (list): A list of per-document readers to be combined. + - offset (int): The offset to be applied to the document numbers of each reader. + + Attributes: + - _readers (list): The list of per-document readers. + - _doc_offsets (list): The list of document offsets for each reader. + - _doccount (int): The total number of documents across all readers. + - is_closed (bool): Indicates whether the reader is closed. 
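+
+    Example:
+        A hedged sketch, assuming ``r1`` and ``r2`` are per-document readers
+        opened from two segments of the same index::
+
+            reader = MultiPerDocumentReader([r1, r2])
+            print(reader.doc_count_all())
+            reader.close()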
+ + """ -class MultiPerDocumentReader(PerDocumentReader): def __init__(self, readers, offset=0): + """ + Initializes a MultiPerDocumentReader instance. + + Parameters: + - readers (list): A list of per-document readers to be combined. + - offset (int): The offset to be applied to the document numbers of each reader. + + """ self._readers = readers self._doc_offsets = [] @@ -738,52 +2026,141 @@ def __init__(self, readers, offset=0): self.is_closed = False def close(self): + """ + Closes the reader and releases any resources. + + """ for r in self._readers: r.close() self.is_closed = True def doc_count_all(self): + """ + Returns the total number of documents across all readers. + + Returns: + - int: The total number of documents. + + """ return self._doccount def doc_count(self): + """ + Returns the number of non-deleted documents across all readers. + + Returns: + - int: The number of non-deleted documents. + + """ total = 0 for r in self._readers: total += r.doc_count() return total def _document_reader(self, docnum): + """ + Returns the index of the reader that contains the specified document number. + + Parameters: + - docnum (int): The document number. + + Returns: + - int: The index of the reader. + + """ return max(0, bisect_right(self._doc_offsets, docnum) - 1) def _reader_and_docnum(self, docnum): + """ + Returns the reader index and the document number within the reader for the specified document number. + + Parameters: + - docnum (int): The document number. + + Returns: + - tuple: A tuple containing the reader index and the document number within the reader. + + """ rnum = self._document_reader(docnum) offset = self._doc_offsets[rnum] return rnum, docnum - offset - # Deletions - def has_deletions(self): + """ + Checks if any of the readers have deletions. + + Returns: + - bool: True if any of the readers have deletions, False otherwise. + + """ return any(r.has_deletions() for r in self._readers) def is_deleted(self, docnum): + """ + Checks if the specified document number is deleted. + + Parameters: + - docnum (int): The document number. + + Returns: + - bool: True if the document is deleted, False otherwise. + + """ x, y = self._reader_and_docnum(docnum) return self._readers[x].is_deleted(y) def deleted_docs(self): + """ + Yields the document numbers of all deleted documents across all readers. + + Yields: + - int: The document number of a deleted document. + + """ for r, offset in zip(self._readers, self._doc_offsets): for docnum in r.deleted_docs(): yield docnum + offset def all_doc_ids(self): + """ + Yields all document numbers across all readers. + + Yields: + - int: The document number. + + """ for r, offset in zip(self._readers, self._doc_offsets): for docnum in r.all_doc_ids(): yield docnum + offset - # Columns - def has_column(self, fieldname): + """ + Checks if any of the readers have the specified column. + + Parameters: + - fieldname (str): The name of the column. + + Returns: + - bool: True if any of the readers have the column, False otherwise. + + """ return any(r.has_column(fieldname) for r in self._readers) def column_reader(self, fieldname, column): + """ + Returns a column reader for the specified fieldname and column. + + Parameters: + - fieldname (str): The name of the field. + - column (Column): The column object. + + Returns: + - ColumnReader: The column reader. + + Raises: + - ValueError: If none of the readers have the specified column. 
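+
+        Example:
+            An illustrative call only; col is assumed to be a column object
+            from whoosh.columns that matches the field's type:
+
+                creader = reader.column_reader("title", col)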
+ + """ if not self.has_column(fieldname): raise ValueError(f"No column {fieldname!r}") @@ -801,72 +2178,213 @@ def column_reader(self, fieldname, column): else: return columns.MultiColumnReader(colreaders) - # Lengths - def doc_field_length(self, docnum, fieldname, default=0): + """ + Returns the length of the specified field in the specified document. + + Parameters: + - docnum (int): The document number. + - fieldname (str): The name of the field. + - default (int): The default value to return if the field is not found. + + Returns: + - int: The length of the field in the document. + + """ x, y = self._reader_and_docnum(docnum) return self._readers[x].doc_field_length(y, fieldname, default) def field_length(self, fieldname): + """ + Returns the total length of the specified field across all readers. + + Parameters: + - fieldname (str): The name of the field. + + Returns: + - int: The total length of the field. + + """ total = 0 for r in self._readers: total += r.field_length(fieldname) return total def min_field_length(self): + """ + Returns the minimum field length across all readers. + + Returns: + - int: The minimum field length. + + """ return min(r.min_field_length() for r in self._readers) def max_field_length(self): + """ + Returns the maximum field length across all readers. + + Returns: + - int: The maximum field length. + + """ return max(r.max_field_length() for r in self._readers) # Extended base classes +class PerDocWriterWithColumns(PerDocumentWriter): + """ + A subclass of PerDocumentWriter that supports columns for storing additional data per document. + This class provides methods for adding and retrieving column values for a given fieldname. + + Attributes: + _storage (object): The storage object used for storing the column data. + _segment (object): The segment object representing the current segment. + _docnum (int): The document number. + + Methods: + _has_column(fieldname): Checks if a column with the given fieldname exists. + _create_column(fieldname, column): Creates a new column with the given fieldname and column object. + _get_column(fieldname): Retrieves the column object for the given fieldname. + add_column_value(fieldname, column, value): Adds a value to the column for the given fieldname. + + """ -class PerDocWriterWithColumns(PerDocumentWriter): def __init__(self): PerDocumentWriter.__init__(self) - # Implementations need to set these attributes self._storage = None self._segment = None self._docnum = None @abstractmethod def _has_column(self, fieldname): + """ + Checks if a column with the given fieldname exists. + + Args: + fieldname (str): The name of the field. + + Returns: + bool: True if the column exists, False otherwise. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @abstractmethod def _create_column(self, fieldname, column): + """ + Creates a new column with the given fieldname and column object. + + Args: + fieldname (str): The name of the field. + column (object): The column object. + + Raises: + NotImplementedError: This method should be implemented by subclasses. + """ raise NotImplementedError @abstractmethod def _get_column(self, fieldname): + """ + Retrieves the column object for the given fieldname. + + Args: + fieldname (str): The name of the field. + + Returns: + object: The column object. + + Raises: + NotImplementedError: This method should be implemented by subclasses. 
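+
+        Note:
+            The object returned by implementations is used by
+            add_column_value(), which calls its add(docnum, value) method to
+            record a value for the current document.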
+ """ raise NotImplementedError def add_column_value(self, fieldname, column, value): + """ + Adds a value to the column for the given fieldname. + + If the column does not exist, it will be created. + + Args: + fieldname (str): The name of the field. + column (object): The column object. + value (object): The value to be added to the column. + """ if not self._has_column(fieldname): self._create_column(fieldname, column) self._get_column(fieldname).add(self._docnum, value) # FieldCursor implementations +class EmptyCursor(FieldCursor): + """A cursor implementation that represents an empty cursor. + This cursor is used when there are no matching terms in the index. + It provides methods to navigate through the non-existent terms and + retrieve information about them. + + Note: This class is intended for internal use within the Whoosh library + and should not be instantiated directly by users. + + """ -class EmptyCursor(FieldCursor): def first(self): + """Move the cursor to the first term. + + Returns: + None: Always returns None as there are no terms to move to. + + """ return None def find(self, term): + """Find a specific term in the index. + + Args: + term (str): The term to find. + + Returns: + None: Always returns None as the term does not exist. + + """ return None def next(self): + """Move the cursor to the next term. + + Returns: + None: Always returns None as there are no terms to move to. + + """ return None def text(self): + """Get the text of the current term. + + Returns: + None: Always returns None as there are no terms. + + """ return None def term_info(self): + """Get information about the current term. + + Returns: + None: Always returns None as there are no terms. + + """ return None def is_valid(self): + """Check if the cursor is valid. + + Returns: + bool: Always returns False as the cursor is not valid. + + """ return False diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py index de2f5cf5..fbdc7828 100644 --- a/src/whoosh/codec/memory.py +++ b/src/whoosh/codec/memory.py @@ -36,42 +36,217 @@ class MemWriter(SegmentWriter): + """ + A class for writing segments to memory. + + This class extends the `SegmentWriter` class and provides functionality + for writing segments to memory instead of a file. + + Usage: + writer = MemWriter() + writer.commit() + + Args: + mergetype (str, optional): The type of merge to perform during commit. + Defaults to None. + optimize (bool, optional): Whether to optimize the index during commit. + Defaults to False. + merge (bool, optional): Whether to perform a merge during commit. + Defaults to True. + """ + def commit(self, mergetype=None, optimize=False, merge=True): + """ + Commits the changes made to the segment. + + This method finalizes the segment and performs any necessary + operations, such as merging and optimization. + + Args: + mergetype (str, optional): The type of merge to perform during commit. + Defaults to None. + optimize (bool, optional): Whether to optimize the index during commit. + Defaults to False. + merge (bool, optional): Whether to perform a merge during commit. + Defaults to True. + """ self._finalize_segment() class MemoryCodec(base.Codec): + """ + Codec implementation for in-memory storage. + + This codec provides an in-memory storage solution for the Whoosh library. + It uses a RamStorage object to store the index data. 
+ + Usage: + codec = MemoryCodec() + writer = codec.writer(schema) + reader = codec.reader(schema) + per_doc_writer = codec.per_document_writer(storage, segment) + field_writer = codec.field_writer(storage, segment) + per_doc_reader = codec.per_document_reader(storage, segment) + terms_reader = codec.terms_reader(storage, segment) + new_segment = codec.new_segment(storage, indexname) + """ + def __init__(self): + """ + Initializes a MemoryCodec object. + + This method creates a RamStorage object to be used as the storage for the index data. + It also initializes a MemSegment object. + + Parameters: + None + + Returns: + None + """ from whoosh.filedb.filestore import RamStorage self.storage = RamStorage() self.segment = MemSegment(self, "blah") def writer(self, schema): + """ + Creates a writer object for the index. + + This method creates a MemWriter object for the given schema and returns it. + + Parameters: + - schema (whoosh.fields.Schema): The schema for the index. + + Returns: + - writer (MemWriter): The writer object for the index. + """ ix = self.storage.create_index(schema) return MemWriter(ix, _lk=False, codec=self, docbase=self.segment._doccount) def reader(self, schema): + """ + Creates a reader object for the index. + + This method creates a SegmentReader object for the given schema and returns it. + + Parameters: + - schema (whoosh.fields.Schema): The schema for the index. + + Returns: + - reader (SegmentReader): The reader object for the index. + """ return SegmentReader(self.storage, schema, self.segment, codec=self) def per_document_writer(self, storage, segment): + """ + Creates a per-document writer object. + + This method creates a MemPerDocWriter object for the given storage and segment and returns it. + + Parameters: + - storage (RamStorage): The storage object for the index. + - segment (MemSegment): The segment object for the index. + + Returns: + - per_doc_writer (MemPerDocWriter): The per-document writer object. + """ return MemPerDocWriter(self.storage, self.segment) def field_writer(self, storage, segment): + """ + Creates a field writer object. + + This method creates a MemFieldWriter object for the given storage and segment and returns it. + + Parameters: + - storage (RamStorage): The storage object for the index. + - segment (MemSegment): The segment object for the index. + + Returns: + - field_writer (MemFieldWriter): The field writer object. + """ return MemFieldWriter(self.storage, self.segment) def per_document_reader(self, storage, segment): + """ + Creates a per-document reader object. + + This method creates a MemPerDocReader object for the given storage and segment and returns it. + + Parameters: + - storage (RamStorage): The storage object for the index. + - segment (MemSegment): The segment object for the index. + + Returns: + - per_doc_reader (MemPerDocReader): The per-document reader object. + """ return MemPerDocReader(self.storage, self.segment) def terms_reader(self, storage, segment): + """ + Creates a terms reader object. + + This method creates a MemTermsReader object for the given storage and segment and returns it. + + Parameters: + - storage (RamStorage): The storage object for the index. + - segment (MemSegment): The segment object for the index. + + Returns: + - terms_reader (MemTermsReader): The terms reader object. + """ return MemTermsReader(self.storage, self.segment) def new_segment(self, storage, indexname): + """ + Creates a new segment object. + + This method returns the existing segment object. 
+ + Parameters: + - storage (RamStorage): The storage object for the index. + - indexname (str): The name of the index. + + Returns: + - segment (MemSegment): The segment object. + """ return self.segment class MemPerDocWriter(base.PerDocWriterWithColumns): + """ + A class that writes per-document data to memory. + + This class is responsible for writing per-document data, such as stored fields, field lengths, and vectors, + to memory. It is used by the `MemoryCodec` to store document data in memory. + + Attributes: + _storage (Storage): The storage object used to create files for storing column data. + _segment (Segment): The segment object to which the per-document data is written. + is_closed (bool): Indicates whether the writer has been closed. + _colwriters (dict): A dictionary that maps field names to column writers. + _doccount (int): The total number of documents written. + + Methods: + _has_column(fieldname): Checks if a column with the given field name exists. + _create_column(fieldname, column): Creates a new column for the given field name. + _get_column(fieldname): Retrieves the column writer for the given field name. + start_doc(docnum): Starts writing data for a new document. + add_field(fieldname, fieldobj, value, length): Adds a field value and length to the current document. + add_vector_items(fieldname, fieldobj, items): Adds vector items to the current document. + finish_doc(): Finishes writing data for the current document. + close(): Closes the writer and finishes writing any remaining data. + """ + def __init__(self, storage, segment): + """ + Initializes a new instance of the MemPerDocWriter class. + + Args: + storage (Storage): The storage object used to create files for storing column data. + segment (Segment): The segment object to which the per-document data is written. + """ self._storage = storage self._segment = segment self.is_closed = False @@ -79,16 +254,47 @@ def __init__(self, storage, segment): self._doccount = 0 def _has_column(self, fieldname): + """ + Checks if a column with the given field name exists. + + Args: + fieldname (str): The name of the field. + + Returns: + bool: True if the column exists, False otherwise. + """ return fieldname in self._colwriters def _create_column(self, fieldname, column): + """ + Creates a new column for the given field name. + + Args: + fieldname (str): The name of the field. + column (Column): The column object used to write data to the column file. + """ colfile = self._storage.create_file(f"{fieldname}.c") self._colwriters[fieldname] = (colfile, column.writer(colfile)) def _get_column(self, fieldname): + """ + Retrieves the column writer for the given field name. + + Args: + fieldname (str): The name of the field. + + Returns: + ColumnWriter: The column writer object. + """ return self._colwriters[fieldname][1] def start_doc(self, docnum): + """ + Starts writing data for a new document. + + Args: + docnum (int): The document number. + """ self._doccount += 1 self._docnum = docnum self._stored = {} @@ -96,15 +302,35 @@ def start_doc(self, docnum): self._vectors = {} def add_field(self, fieldname, fieldobj, value, length): + """ + Adds a field value and length to the current document. + + Args: + fieldname (str): The name of the field. + fieldobj (Field): The field object. + value: The field value. + length: The field length. 
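+
+        Note:
+            A value or length of None is skipped; only non-None values are
+            recorded for the current document.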
+ """ if value is not None: self._stored[fieldname] = value if length is not None: self._lengths[fieldname] = length def add_vector_items(self, fieldname, fieldobj, items): + """ + Adds vector items to the current document. + + Args: + fieldname (str): The name of the field. + fieldobj (Field): The field object. + items (list): The vector items. + """ self._vectors[fieldname] = tuple(items) def finish_doc(self): + """ + Finishes writing data for the current document. + """ with self._segment._lock: docnum = self._docnum self._segment._stored[docnum] = self._stored @@ -112,6 +338,9 @@ def finish_doc(self): self._segment._vectors[docnum] = self._vectors def close(self): + """ + Closes the writer and finishes writing any remaining data. + """ colwriters = self._colwriters for fieldname in colwriters: colfile, colwriter = colwriters[fieldname] @@ -121,45 +350,167 @@ def close(self): class MemPerDocReader(base.PerDocumentReader): + """ + A class that provides read access to per-document data stored in memory. + + This class is responsible for reading per-document data from a memory storage + and a specific segment. It provides methods to retrieve information about the + documents, columns, field lengths, vectors, and stored fields. + + Usage: + 1. Create an instance of MemPerDocReader by passing the storage and segment. + 2. Use the various methods to access the desired information. + + Example: + ``` + storage = MemoryStorage() + segment = MemorySegment() + reader = MemPerDocReader(storage, segment) + doc_count = reader.doc_count() + has_deletions = reader.has_deletions() + stored_fields = reader.stored_fields(0) + reader.close() + ``` + + Note: + - The storage object should implement the necessary methods for file operations. + - The segment object should provide access to the per-document data. + + """ + def __init__(self, storage, segment): + """ + Initialize a MemPerDocReader instance. + + Args: + - storage: The storage object that provides file operations. + - segment: The segment object that provides access to the per-document data. + """ self._storage = storage self._segment = segment def doc_count(self): + """ + Get the number of documents in the segment. + + Returns: + - The number of documents in the segment. + """ return self._segment.doc_count() def doc_count_all(self): + """ + Get the total number of documents, including deleted documents. + + Returns: + - The total number of documents. + """ return self._segment.doc_count_all() def has_deletions(self): + """ + Check if the segment has deleted documents. + + Returns: + - True if the segment has deleted documents, False otherwise. + """ return self._segment.has_deletions() def is_deleted(self, docnum): + """ + Check if a document is deleted. + + Args: + - docnum: The document number. + + Returns: + - True if the document is deleted, False otherwise. + """ return self._segment.is_deleted(docnum) def deleted_docs(self): + """ + Get the set of deleted document numbers. + + Returns: + - A set containing the numbers of deleted documents. + """ return self._segment.deleted_docs() def supports_columns(self): + """ + Check if the segment supports columns. + + Returns: + - True if the segment supports columns, False otherwise. + """ return True def has_column(self, fieldname): + """ + Check if a column exists for a given field. + + Args: + - fieldname: The name of the field. + + Returns: + - True if the column exists, False otherwise. 
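+
+        Note:
+            The check is based on whether a "<fieldname>.c" column file
+            exists in the storage.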
+ """ filename = f"{fieldname}.c" return self._storage.file_exists(filename) def column_reader(self, fieldname, column): + """ + Get a reader for a specific column of a field. + + Args: + - fieldname: The name of the field. + - column: The column object. + + Returns: + - A reader for the column. + """ filename = f"{fieldname}.c" colfile = self._storage.open_file(filename) length = self._storage.file_length(filename) return column.reader(colfile, 0, length, self._segment.doc_count_all()) def doc_field_length(self, docnum, fieldname, default=0): + """ + Get the length of a field in a specific document. + + Args: + - docnum: The document number. + - fieldname: The name of the field. + - default: The default value to return if the field is not found. + + Returns: + - The length of the field in the document, or the default value if not found. + """ return self._segment._lengths[docnum].get(fieldname, default) def field_length(self, fieldname): + """ + Get the total length of a field across all documents. + + Args: + - fieldname: The name of the field. + + Returns: + - The total length of the field. + """ return sum(lens.get(fieldname, 0) for lens in self._segment._lengths.values()) def min_field_length(self, fieldname): + """ + Get the minimum length of a field across all documents. + + Args: + - fieldname: The name of the field. + + Returns: + - The minimum length of the field. + """ return min( lens[fieldname] for lens in self._segment._lengths.values() @@ -167,6 +518,15 @@ def min_field_length(self, fieldname): ) def max_field_length(self, fieldname): + """ + Get the maximum length of a field across all documents. + + Args: + - fieldname: The name of the field. + + Returns: + - The maximum length of the field. + """ return max( lens[fieldname] for lens in self._segment._lengths.values() @@ -174,25 +534,102 @@ def max_field_length(self, fieldname): ) def has_vector(self, docnum, fieldname): + """ + Check if a document has a vector for a given field. + + Args: + - docnum: The document number. + - fieldname: The name of the field. + + Returns: + - True if the document has a vector for the field, False otherwise. + """ return ( docnum in self._segment._vectors and fieldname in self._segment._vectors[docnum] ) def vector(self, docnum, fieldname, format_): + """ + Get a vector for a specific document and field. + + Args: + - docnum: The document number. + - fieldname: The name of the field. + - format_: The format of the vector. + + Returns: + - A ListMatcher object representing the vector. + """ items = self._segment._vectors[docnum][fieldname] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_) def stored_fields(self, docnum): + """ + Get the stored fields of a specific document. + + Args: + - docnum: The document number. + + Returns: + - A dictionary containing the stored fields of the document. + """ return self._segment._stored[docnum] def close(self): - # This method is intentionally left empty. + """ + Close the MemPerDocReader. + + This method is intentionally left empty. + """ pass class MemFieldWriter(base.FieldWriter): + """ + The MemFieldWriter class is responsible for writing field data to memory. + + It provides methods for starting and finishing fields, terms, and adding data to the field. + + Attributes: + - _storage: The storage object used for storing the field data. + - _segment: The segment object representing the segment being written to. + - _fieldname: The name of the current field being written. 
+ - _btext: The binary representation of the current term being written. + - is_closed: A flag indicating whether the writer has been closed. + + Methods: + - start_field(fieldname, fieldobj): Starts a new field. + - start_term(btext): Starts a new term within the current field. + - add(docnum, weight, vbytes, length): Adds data to the current term. + - finish_term(): Finishes the current term. + - finish_field(): Finishes the current field. + - close(): Closes the writer. + + Usage: + 1. Create an instance of MemFieldWriter with the storage and segment objects. + 2. Call start_field() to start a new field. + 3. Call start_term() to start a new term within the field. + 4. Call add() to add data to the term. + 5. Call finish_term() to finish the term. + 6. Repeat steps 3-5 for additional terms within the field. + 7. Call finish_field() to finish the field. + 8. Repeat steps 2-7 for additional fields. + 9. Call close() to close the writer. + + Example: + storage = ... + segment = ... + writer = MemFieldWriter(storage, segment) + writer.start_field("title", fieldobj) + writer.start_term(b"hello") + writer.add(1, 0.5, 10, 5) + writer.finish_term() + writer.finish_field() + writer.close() + """ + def __init__(self, storage, segment): self._storage = storage self._segment = segment @@ -201,10 +638,18 @@ def __init__(self, storage, segment): self.is_closed = False def start_field(self, fieldname, fieldobj): + """ + Starts a new field. + + Args: + - fieldname: The name of the field. + - fieldobj: The field object representing the field. + + Raises: + - ValueError: If start_field is called within a field. + """ if self._fieldname is not None: - raise ValueError( - "Called start_field in a field" - ) # Replaced generic Exception with ValueError + raise ValueError("Called start_field in a field") with self._segment._lock: invindex = self._segment._invindex @@ -215,6 +660,15 @@ def start_field(self, fieldname, fieldobj): self._fieldobj = fieldobj def start_term(self, btext): + """ + Starts a new term within the current field. + + Args: + - btext: The binary representation of the term. + + Raises: + - ValueError: If start_term is called within a term. + """ if self._btext is not None: raise ValueError("Called start_term in a term") fieldname = self._fieldname @@ -233,10 +687,31 @@ def start_term(self, btext): self._btext = btext def add(self, docnum, weight, vbytes, length): + """ + Adds data to the current term. + + Args: + - docnum: The document number. + - weight: The weight of the term in the document. + - vbytes: The number of bytes used to store the term's value. + - length: The length of the term. + + Raises: + - ValueError: If add is called outside a term. + """ + if self._btext is None: + raise ValueError("Called add outside a term") + self._postings.append((docnum, weight, vbytes)) self._terminfo.add_posting(docnum, weight, length) def finish_term(self): + """ + Finishes the current term. + + Raises: + - ValueError: If finish_term is called outside a term. + """ if self._btext is None: raise ValueError("Called finish_term outside a term") @@ -245,30 +720,87 @@ def finish_term(self): self._terminfo = None def finish_field(self): + """ + Finishes the current field. + + Raises: + - ValueError: If finish_field is called outside a field. + """ if self._fieldname is None: raise ValueError("Called finish_field outside a field") self._fieldname = None self._fieldobj = None def close(self): + """ + Closes the writer. 
+ """ self.is_closed = True class MemTermsReader(base.TermsReader): + """ + A terms reader implementation for in-memory storage. + + This class provides methods to access and retrieve terms, term information, + and matchers from an in-memory index segment. + + Args: + storage (object): The storage object used for the index. + segment (object): The index segment object. + + Attributes: + _storage (object): The storage object used for the index. + _segment (object): The index segment object. + _invindex (dict): The inverted index of the segment. + + """ + def __init__(self, storage, segment): self._storage = storage self._segment = segment self._invindex = segment._invindex def __contains__(self, term): + """ + Check if a term exists in the segment. + + Args: + term (str): The term to check. + + Returns: + bool: True if the term exists, False otherwise. + + """ return term in self._segment._terminfos def terms(self): + """ + Get an iterator over all terms in the segment. + + Yields: + tuple: A tuple containing the field name and term. + + """ for fieldname in self._invindex: for btext in self._invindex[fieldname]: yield (fieldname, btext) def terms_from(self, fieldname, prefix): + """ + Get an iterator over terms starting with a given prefix in a specific field. + + Args: + fieldname (str): The field name. + prefix (str): The prefix to match. + + Yields: + tuple: A tuple containing the field name and term. + + Raises: + TermNotFound: If the field name is unknown. + + """ if fieldname not in self._invindex: raise TermNotFound(f"Unknown field {fieldname!r}") terms = sorted(self._invindex[fieldname]) @@ -279,23 +811,99 @@ def terms_from(self, fieldname, prefix): yield (fieldname, terms[i]) def term_info(self, fieldname, text): + """ + Get the term information for a specific term in a field. + + Args: + fieldname (str): The field name. + text (str): The term. + + Returns: + object: The term information object. + + """ return self._segment._terminfos[fieldname, text] def matcher(self, fieldname, btext, format_, scorer=None): + """ + Get a matcher for a specific term in a field. + + Args: + fieldname (str): The field name. + btext (bytes): The term as bytes. + format_ (object): The format object. + scorer (object, optional): The scorer object. Defaults to None. + + Returns: + object: The matcher object. + + """ items = self._invindex[fieldname][btext] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_, scorer=scorer) def indexed_field_names(self): + """ + Returns a list of field names that have been indexed. + + This method retrieves the keys from the inverted index dictionary + and returns them as a list. Each key represents a field name that + has been indexed. + + Returns: + list: A list of field names that have been indexed. + """ return self._invindex.keys() def close(self): - # This method is intentionally left empty. + """ + Close the terms reader. + + This method is intentionally left empty. + + """ pass class MemSegment(base.Segment): + """ + In-memory implementation of a segment for the Whoosh search engine. + + This class represents a segment of an index stored in memory. It provides methods for managing + documents, storing and retrieving data, and handling deletions. + + Attributes: + _codec (Codec): The codec used for encoding and decoding data. + _doccount (int): The total number of documents in the segment. + _stored (dict): A dictionary mapping document numbers to stored data. 
+ _lengths (dict): A dictionary mapping document numbers to the length of the stored data. + _vectors (dict): A dictionary mapping document numbers to term vectors. + _invindex (dict): A dictionary mapping terms to inverted index entries. + _terminfos (dict): A dictionary mapping terms to term information. + _lock (Lock): A lock used for thread-safety. + + Methods: + codec(): Returns the codec used by the segment. + set_doc_count(doccount): Sets the total number of documents in the segment. + doc_count(): Returns the number of stored documents. + doc_count_all(): Returns the total number of documents in the segment, including deleted ones. + delete_document(docnum, delete=True): Deletes a document from the segment. + has_deletions(): Checks if the segment has any deleted documents. + is_deleted(docnum): Checks if a document is deleted. + deleted_docs(): Returns an iterator over the document numbers of deleted documents. + should_assemble(): Checks if the segment should be assembled. + + """ + def __init__(self, codec, indexname): + """ + Initializes a new instance of the MemSegment class. + + Args: + codec (Codec): The codec used for encoding and decoding data. + indexname (str): The name of the index. + + """ base.Segment.__init__(self, indexname) self._codec = codec self._doccount = 0 @@ -307,18 +915,57 @@ def __init__(self, codec, indexname): self._lock = Lock() def codec(self): + """ + Returns the codec used by the segment. + + Returns: + Codec: The codec used by the segment. + + """ return self._codec def set_doc_count(self, doccount): + """ + Sets the total number of documents in the segment. + + Args: + doccount (int): The total number of documents. + + """ self._doccount = doccount def doc_count(self): + """ + Returns the number of stored documents. + + Returns: + int: The number of stored documents. + + """ return len(self._stored) def doc_count_all(self): + """ + Returns the total number of documents in the segment, including deleted ones. + + Returns: + int: The total number of documents. + + """ return self._doccount def delete_document(self, docnum, delete=True): + """ + Deletes a document from the segment. + + Args: + docnum (int): The document number. + delete (bool): Whether to permanently delete the document. Default is True. + + Raises: + ValueError: If delete is False, as MemoryCodec does not support undeleting. + + """ if not delete: raise ValueError("MemoryCodec can't undelete") with self._lock: @@ -327,17 +974,48 @@ def delete_document(self, docnum, delete=True): del self._vectors[docnum] def has_deletions(self): + """ + Checks if the segment has any deleted documents. + + Returns: + bool: True if there are deleted documents, False otherwise. + + """ with self._lock: return self._doccount - len(self._stored) def is_deleted(self, docnum): + """ + Checks if a document is deleted. + + Args: + docnum (int): The document number. + + Returns: + bool: True if the document is deleted, False otherwise. + + """ return docnum not in self._stored def deleted_docs(self): + """ + Returns an iterator over the document numbers of deleted documents. + + Yields: + int: The document number of a deleted document. + + """ stored = self._stored for docnum in range(self.doc_count_all()): if docnum not in stored: yield docnum def should_assemble(self): + """ + Checks if the segment should be assembled. + + Returns: + bool: True if the segment should be assembled, False otherwise. 
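+
+        Note:
+            For this in-memory segment the method always returns False,
+            since there are no on-disk files to assemble into a compound
+            file.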
+ + """ return False diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py index 1e58ecb1..b29bd64a 100644 --- a/src/whoosh/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -39,7 +39,31 @@ class LineWriter: + """ + A class for writing lines to a file with specified indentation and command. + + Attributes: + _dbfile (file): The file object to write the lines to. + + Methods: + _print_line(indent, command, **kwargs): Writes a line to the file with the specified indentation, command, and keyword arguments. + """ + def _print_line(self, indent, command, **kwargs): + """ + Writes a line to the file with the specified indentation, command, and keyword arguments. + + Args: + indent (int): The number of indentation levels for the line. + command (str): The command to write. + **kwargs: Additional keyword arguments to include in the line. + + Raises: + TypeError: If a keyword argument value is not of a valid type. + + Returns: + None + """ self._dbfile.write(b" " * indent) self._dbfile.write(command.encode("latin1")) for k, v in kwargs.items(): @@ -52,17 +76,50 @@ def _print_line(self, indent, command, **kwargs): class LineReader: + """A class for reading lines from a file and performing line-based operations.""" + def __init__(self, dbfile): + """ + Initialize a LineReader object. + + Parameters: + - dbfile (file): The file object to read lines from. + """ self._dbfile = dbfile def _reset(self): + """ + Reset the file pointer to the beginning of the file. + """ self._dbfile.seek(0) def _find_line(self, indent, command, **kwargs): + """ + Find the first line that matches the given indent, command, and keyword arguments. + + Parameters: + - indent (int): The indentation level of the line. + - command (str): The command to match. + - kwargs (dict): Keyword arguments to match against the line's arguments. + + Returns: + - tuple: A tuple containing the indent, command, and arguments of the matched line. + """ for largs in self._find_lines(indent, command, **kwargs): return largs def _find_lines(self, indent, command, **kwargs): + """ + Find all lines that match the given indent, command, and keyword arguments. + + Parameters: + - indent (int): The indentation level of the lines. + - command (str): The command to match. + - kwargs (dict): Keyword arguments to match against the lines' arguments. + + Yields: + - tuple: A tuple containing the indent, command, and arguments of each matched line. + """ while True: line = self._dbfile.readline() if not line: @@ -87,6 +144,15 @@ def _find_lines(self, indent, command, **kwargs): return def _parse_line(self, line): + """ + Parse a line and extract the indent, command, and arguments. + + Parameters: + - line (str): The line to parse. + + Returns: + - tuple: A tuple containing the indent, command, and arguments of the line. + """ line = line.decode("latin1") line = line.rstrip() l = len(line) @@ -105,6 +171,18 @@ def _parse_line(self, line): return (indent, command, args) def _find_root(self, command): + """ + Find the root section with the given command. + + Parameters: + - command (str): The command to match. + + Returns: + - tuple: A tuple containing the indent, command, and arguments of the root section. + + Raises: + - ValueError: If no root section with the given command is found. + """ self._reset() c = self._find_line(0, command) if c is None: @@ -112,80 +190,341 @@ def _find_root(self, command): # Codec class - - class PlainTextCodec(base.Codec): + """ + Codec for storing and retrieving plain text documents in Whoosh. 
+ + This codec provides the necessary methods for reading and writing plain text documents + in Whoosh. It is responsible for handling the storage, segmentation, and retrieval of + plain text data. + + Usage: + ------ + codec = PlainTextCodec() + per_doc_writer = codec.per_document_writer(storage, segment) + field_writer = codec.field_writer(storage, segment) + per_doc_reader = codec.per_document_reader(storage, segment) + terms_reader = codec.terms_reader(storage, segment) + segment = codec.new_segment(storage, indexname) + """ + length_stats = False def per_document_writer(self, storage, segment): + """ + Returns a per-document writer for the given storage and segment. + + Parameters: + ----------- + storage : Storage + The storage object used for storing the documents. + segment : Segment + The segment object representing the current segment. + + Returns: + -------- + PlainPerDocWriter + The per-document writer for the given storage and segment. + """ return PlainPerDocWriter(storage, segment) def field_writer(self, storage, segment): + """ + Returns a field writer for the given storage and segment. + + Parameters: + ----------- + storage : Storage + The storage object used for storing the documents. + segment : Segment + The segment object representing the current segment. + + Returns: + -------- + PlainFieldWriter + The field writer for the given storage and segment. + """ return PlainFieldWriter(storage, segment) def per_document_reader(self, storage, segment): + """ + Returns a per-document reader for the given storage and segment. + + Parameters: + ----------- + storage : Storage + The storage object used for retrieving the documents. + segment : Segment + The segment object representing the current segment. + + Returns: + -------- + PlainPerDocReader + The per-document reader for the given storage and segment. + """ return PlainPerDocReader(storage, segment) def terms_reader(self, storage, segment): + """ + Returns a terms reader for the given storage and segment. + + Parameters: + ----------- + storage : Storage + The storage object used for retrieving the terms. + segment : Segment + The segment object representing the current segment. + + Returns: + -------- + PlainTermsReader + The terms reader for the given storage and segment. + """ return PlainTermsReader(storage, segment) def new_segment(self, storage, indexname): + """ + Creates a new segment for the given storage and index name. + + Parameters: + ----------- + storage : Storage + The storage object used for storing the segment. + indexname : str + The name of the index. + + Returns: + -------- + PlainSegment + The new segment for the given storage and index name. + """ return PlainSegment(indexname) class PlainPerDocWriter(base.PerDocumentWriter, LineWriter): + """ + A class that writes per-document data in plain text format. + + This class is responsible for writing per-document data, such as document fields, column values, and vector items, + in a plain text format. It inherits from the `PerDocumentWriter` and `LineWriter` classes. + + Usage: + 1. Create an instance of `PlainPerDocWriter` by providing a storage object and a segment object. + 2. Call the `start_doc` method to indicate the start of a new document. + 3. Call the `add_field` method to add a field to the document. + 4. Call the `add_column_value` method to add a column value to the document. + 5. Call the `add_vector_items` method to add vector items to the document. + 6. Call the `finish_doc` method to indicate the end of the current document. + 7. 
Call the `close` method to close the writer. + + Attributes: + - `_dbfile`: The file object used for writing per-document data. + - `is_closed`: A boolean indicating whether the writer has been closed. + """ + def __init__(self, storage, segment): + """ + Initializes a new instance of the PlainPerDocWriter class. + + Parameters: + - `storage`: The storage object used for creating the per-document data file. + - `segment`: The segment object representing the current segment. + + Returns: + None. + """ self._dbfile = storage.create_file(segment.make_filename(".dcs")) self._print_line(0, "DOCS") self.is_closed = False def start_doc(self, docnum): + """ + Indicates the start of a new document. + + Parameters: + - `docnum`: The document number. + + Returns: + None. + """ self._print_line(1, "DOC", dn=docnum) def add_field(self, fieldname, fieldobj, value, length): + """ + Adds a field to the current document. + + Parameters: + - `fieldname`: The name of the field. + - `fieldobj`: The field object. + - `value`: The value of the field. + - `length`: The length of the field value. + + Returns: + None. + """ if value is not None: value = dumps(value, 2) self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length) def add_column_value(self, fieldname, columnobj, value): + """ + Adds a column value to the current document. + + Parameters: + - `fieldname`: The name of the field. + - `columnobj`: The column object. + - `value`: The value of the column. + + Returns: + None. + """ self._print_line(2, "COLVAL", fn=fieldname, v=value) def add_vector_items(self, fieldname, fieldobj, items): + """ + Adds vector items to the current document. + + Parameters: + - `fieldname`: The name of the field. + - `fieldobj`: The field object. + - `items`: A list of vector items, where each item is a tuple containing the text, weight, and vector bytes. + + Returns: + None. + """ self._print_line(2, "VECTOR", fn=fieldname) for text, weight, vbytes in items: self._print_line(3, "VPOST", t=text, w=weight, v=vbytes) def finish_doc(self): + """ + Indicates the end of the current document. + + Returns: + None. + """ # This method is intentionally left empty. pass def close(self): + """ + Closes the writer. + + Returns: + None. + """ self._dbfile.close() self.is_closed = True class PlainPerDocReader(base.PerDocumentReader, LineReader): + """ + A reader for plain text per-document data in Whoosh index. + + This class provides methods to read per-document data stored in plain text format in a Whoosh index. + It inherits from the `PerDocumentReader` and `LineReader` classes. + + Attributes: + _dbfile (File): The file object representing the per-document data file. + _segment (Segment): The segment object representing the segment containing the per-document data. + is_closed (bool): Indicates whether the reader is closed or not. + + Methods: + doc_count(): Returns the number of documents in the segment. + doc_count_all(): Returns the total number of documents in the segment. + has_deletions(): Returns False, indicating that the segment does not have any deleted documents. + is_deleted(docnum): Returns False, indicating that the specified document is not deleted. + deleted_docs(): Returns an empty frozenset, indicating that there are no deleted documents. + _find_doc(docnum): Internal method to find a document by its number. + _iter_docs(): Internal method to iterate over the document numbers in the segment. + _iter_docfields(fieldname): Internal method to iterate over the lines of a specific field in the document. 
+ _iter_lengths(fieldname): Internal method to iterate over the lengths of a specific field in the document. + doc_field_length(docnum, fieldname, default=0): Returns the length of a specific field in the document. + _column_values(fieldname): Internal method to iterate over the column values of a specific field in the document. + has_column(fieldname): Returns True if the specified field has column values in the document, False otherwise. + column_reader(fieldname, column): Returns a list of column values for a specific field in the document. + field_length(fieldname): Returns the total length of a specific field in the document. + min_field_length(fieldname): Returns the minimum length of a specific field in the document. + max_field_length(fieldname): Returns the maximum length of a specific field in the document. + has_vector(docnum, fieldname): Returns True if the document has a vector for the specified field, False otherwise. + vector(docnum, fieldname, format_): Returns a ListMatcher object representing the vector for the specified field in the document. + _read_stored_fields(): Internal method to read the stored fields of the document. + stored_fields(docnum): Returns a dictionary containing the stored fields of the document. + iter_docs(): Returns an iterator over the document numbers and their stored fields in the segment. + all_stored_fields(): Returns an iterator over the stored fields of all documents in the segment. + close(): Closes the reader and releases any associated resources. + """ + def __init__(self, storage, segment): + """ + Initializes a new instance of the PlainPerDocReader class. + + Args: + storage (Storage): The storage object representing the index storage. + segment (Segment): The segment object representing the segment containing the per-document data. + """ self._dbfile = storage.open_file(segment.make_filename(".dcs")) self._segment = segment self.is_closed = False def doc_count(self): + """ + Returns the number of documents in the segment. + + Returns: + int: The number of documents in the segment. + """ return self._segment.doc_count() def doc_count_all(self): + """ + Returns the total number of documents in the segment. + + Returns: + int: The total number of documents in the segment. + """ return self._segment.doc_count() def has_deletions(self): + """ + Returns False, indicating that the segment does not have any deleted documents. + + Returns: + bool: False, indicating that the segment does not have any deleted documents. + """ return False def is_deleted(self, docnum): + """ + Returns False, indicating that the specified document is not deleted. + + Args: + docnum (int): The document number. + + Returns: + bool: False, indicating that the specified document is not deleted. + """ return False def deleted_docs(self): + """ + Returns an empty frozenset, indicating that there are no deleted documents. + + Returns: + frozenset: An empty frozenset, indicating that there are no deleted documents. + """ return frozenset() def _find_doc(self, docnum): + """ + Internal method to find a document by its number. + + Args: + docnum (int): The document number. + + Returns: + bool: True if the document is found, False otherwise. + """ self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: @@ -198,6 +537,12 @@ def _find_doc(self, docnum): return False def _iter_docs(self): + """ + Internal method to iterate over the document numbers in the segment. + + Yields: + int: The document number. 
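+
+        Note:
+            Document numbers are read from the "DOC" lines found under the
+            "DOCS" root section of the plain-text file.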
+ """ self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: @@ -205,13 +550,42 @@ def _iter_docs(self): c = self._find_line(1, "DOC") def _iter_docfields(self, fieldname): + """ + Internal method to iterate over the lines of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Yields: + dict: A dictionary representing a line of the field in the document. + """ for _ in self._iter_docs(): yield from self._find_lines(2, "DOCFIELD", fn=fieldname) def _iter_lengths(self, fieldname): + """ + Internal method to iterate over the lengths of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Yields: + int: The length of the field in the document. + """ return (c.get("len", 0) for c in self._iter_docfields(fieldname)) def doc_field_length(self, docnum, fieldname, default=0): + """ + Returns the length of a specific field in the document. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + default (int, optional): The default length to return if the field is not found. Defaults to 0. + + Returns: + int: The length of the field in the document, or the default length if the field is not found. + """ for dn in self._iter_docs(): if dn == docnum: c = self._find_line(2, "DOCFIELD", fn=fieldname) @@ -223,6 +597,15 @@ def doc_field_length(self, docnum, fieldname, default=0): return default def _column_values(self, fieldname): + """ + Internal method to iterate over the column values of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Yields: + Any: The column value. + """ for i, docnum in enumerate(self._iter_docs()): if i != docnum: raise ValueError(f"Missing column value for field {fieldname} doc {i}?") @@ -236,28 +619,95 @@ def _column_values(self, fieldname): yield c.get("v") def has_column(self, fieldname): + """ + Returns True if the specified field has column values in the document, False otherwise. + + Args: + fieldname (str): The name of the field. + + Returns: + bool: True if the specified field has column values in the document, False otherwise. + """ for _ in self._column_values(fieldname): return True return False def column_reader(self, fieldname, column): + """ + Returns a list of column values for a specific field in the document. + + Args: + fieldname (str): The name of the field. + column (int): The column number. + + Returns: + list: A list of column values for the specified field in the document. + """ return list(self._column_values(fieldname)) def field_length(self, fieldname): + """ + Returns the total length of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Returns: + int: The total length of the field in the document. + """ return sum(self._iter_lengths(fieldname)) def min_field_length(self, fieldname): + """ + Returns the minimum length of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Returns: + int: The minimum length of the field in the document. + """ return min(self._iter_lengths(fieldname)) def max_field_length(self, fieldname): + """ + Returns the maximum length of a specific field in the document. + + Args: + fieldname (str): The name of the field. + + Returns: + int: The maximum length of the field in the document. + """ return max(self._iter_lengths(fieldname)) def has_vector(self, docnum, fieldname): + """ + Returns True if the document has a vector for the specified field, False otherwise. 
+ + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + + Returns: + bool: True if the document has a vector for the specified field, False otherwise. + """ if self._find_doc(docnum) and self._find_line(2, "VECTOR"): return True return False def vector(self, docnum, fieldname, format_): + """ + Returns a ListMatcher object representing the vector for the specified field in the document. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + format_ (str): The format of the vector. + + Returns: + ListMatcher: A ListMatcher object representing the vector for the specified field in the document. + """ if not self._find_doc(docnum): raise ValueError("Document not found.") if not self._find_line(2, "VECTOR"): @@ -281,6 +731,12 @@ def vector(self, docnum, fieldname, format_): ) def _read_stored_fields(self): + """ + Internal method to read the stored fields of the document. + + Returns: + dict: A dictionary containing the stored fields of the document. + """ sfs = {} c = self._find_line(2, "DOCFIELD") while c is not None: @@ -292,44 +748,128 @@ def _read_stored_fields(self): return sfs def stored_fields(self, docnum): + """ + Returns a dictionary containing the stored fields of the document. + + Args: + docnum (int): The document number. + + Returns: + dict: A dictionary containing the stored fields of the document. + """ if not self._find_doc(docnum): raise ValueError("Document not found.") return self._read_stored_fields() def iter_docs(self): + """ + Returns an iterator over the document numbers and their stored fields in the segment. + + Yields: + tuple: A tuple containing the document number and its stored fields. + """ return enumerate(self.all_stored_fields()) def all_stored_fields(self): + """ + Returns an iterator over the stored fields of all documents in the segment. + + Yields: + dict: A dictionary containing the stored fields of a document. + """ for _ in self._iter_docs(): yield self._read_stored_fields() def close(self): + """ + Closes the reader and releases any associated resources. + """ self._dbfile.close() self.is_closed = True class PlainFieldWriter(base.FieldWriter, LineWriter): + """ + A class that writes field data in plain text format. + + This class is responsible for writing field data to a storage file in plain text format. + It implements the necessary methods to handle field, term, and posting information. + + Attributes: + _dbfile (File): The storage file for the field data. + _fieldobj (Field): The field object being written. + _terminfo (TermInfo): The term information being written. + + Methods: + __init__(self, storage, segment): Initializes a PlainFieldWriter instance. + is_closed(self): Checks if the writer is closed. + start_field(self, fieldname, fieldobj): Starts writing a new field. + start_term(self, btext): Starts writing a new term. + add(self, docnum, weight, vbytes, length): Adds a posting to the current term. + finish_term(self): Finishes writing the current term. + add_spell_word(self, fieldname, text): Adds a spell word to the current field. + close(self): Closes the writer and the storage file. + """ + def __init__(self, storage, segment): + """ + Initializes a PlainFieldWriter instance. + + Args: + storage (Storage): The storage object for the field data. + segment (Segment): The segment object for the field data. 
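+
+        Example:
+            An illustrative sketch of the expected call sequence; storage,
+            segment and fieldobj are assumed to already exist, and the
+            posting values are placeholders:
+
+                writer = PlainFieldWriter(storage, segment)
+                writer.start_field("title", fieldobj)
+                writer.start_term(b"hello")
+                writer.add(0, 1.0, 4, 1)
+                writer.finish_term()
+                writer.close()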
+ """ self._dbfile = storage.create_file(segment.make_filename(".trm")) self._print_line(0, "TERMS") @property def is_closed(self): + """ + Checks if the writer is closed. + + Returns: + bool: True if the writer is closed, False otherwise. + """ return self._dbfile.is_closed def start_field(self, fieldname, fieldobj): + """ + Starts writing a new field. + + Args: + fieldname (str): The name of the field. + fieldobj (Field): The field object. + """ self._fieldobj = fieldobj self._print_line(1, "TERMFIELD", fn=fieldname) def start_term(self, btext): + """ + Starts writing a new term. + + Args: + btext (bytes): The term text in bytes. + """ self._terminfo = TermInfo() self._print_line(2, "BTEXT", t=btext) def add(self, docnum, weight, vbytes, length): + """ + Adds a posting to the current term. + + Args: + docnum (int): The document number. + weight (float): The weight of the posting. + vbytes (int): The number of bytes in the posting. + length (int): The length of the posting. + """ self._terminfo.add_posting(docnum, weight, length) self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes) def finish_term(self): + """ + Finishes writing the current term. + """ ti = self._terminfo self._print_line( 3, @@ -344,24 +884,76 @@ def finish_term(self): ) def add_spell_word(self, fieldname, text): + """ + Adds a spell word to the current field. + + Args: + fieldname (str): The name of the field. + text (str): The spell word text. + """ self._print_line(2, "SPELL", fn=fieldname, t=text) def close(self): + """ + Closes the writer and the storage file. + """ self._dbfile.close() class PlainTermsReader(base.TermsReader, LineReader): + """ + A reader for plain text terms in a Whoosh index. + + This class provides methods to read and retrieve terms, term information, + and perform term matching in a plain text index. + + Parameters: + - storage (Storage): The storage object representing the index. + - segment (Segment): The segment object representing the index segment. + + Attributes: + - _dbfile (File): The file object representing the terms file. + - _segment (Segment): The segment object representing the index segment. + - is_closed (bool): Indicates whether the reader is closed or not. + + """ + def __init__(self, storage, segment): + """ + Initializes a PlainTermsReader object. + + Parameters: + - storage (Storage): The storage object representing the index. + - segment (Segment): The segment object representing the index segment. + + """ self._dbfile = storage.open_file(segment.make_filename(".trm")) self._segment = segment self.is_closed = False def _find_field(self, fieldname): + """ + Finds the field with the given name in the terms file. + + Parameters: + - fieldname (str): The name of the field to find. + + Raises: + - TermNotFound: If the field with the given name is not found. + + """ self._find_root("TERMS") if self._find_line(1, "TERMFIELD", fn=fieldname) is None: raise TermNotFound(f"No field {fieldname!r}") def _iter_fields(self): + """ + Iterates over the field names in the terms file. + + Yields: + - str: The name of each field. + + """ self._find_root() c = self._find_line(1, "TERMFIELD") while c is not None: @@ -369,12 +961,30 @@ def _iter_fields(self): c = self._find_line(1, "TERMFIELD") def _iter_btexts(self): + """ + Iterates over the binary texts in the terms file. + + Yields: + - bytes: The binary text of each term. 
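+
+        Note:
+            Terms are read from the "BTEXT" lines nested under the current
+            "TERMFIELD" section of the terms file.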
+ + """ c = self._find_line(2, "BTEXT") while c is not None: yield c["t"] c = self._find_line(2, "BTEXT") def _find_term(self, fieldname, btext): + """ + Finds a term with the given field name and binary text in the terms file. + + Parameters: + - fieldname (str): The name of the field. + - btext (bytes): The binary text of the term. + + Returns: + - bool: True if the term is found, False otherwise. + + """ self._find_field(fieldname) for t in self._iter_btexts(): if t == btext: @@ -384,22 +994,64 @@ def _find_term(self, fieldname, btext): return False def _find_terminfo(self): + """ + Finds the term information in the terms file. + + Returns: + - TermInfo: The term information. + + """ c = self._find_line(3, "TERMINFO") return TermInfo(**c) def __contains__(self, term): + """ + Checks if a term is present in the terms file. + + Parameters: + - term (tuple): A tuple containing the field name and binary text of the term. + + Returns: + - bool: True if the term is present, False otherwise. + + """ fieldname, btext = term return self._find_term(fieldname, btext) def indexed_field_names(self): + """ + Returns the names of the indexed fields in the terms file. + + Returns: + - Iterator[str]: An iterator over the field names. + + """ return self._iter_fields() def terms(self): + """ + Returns an iterator over all the terms in the terms file. + + Yields: + - tuple: A tuple containing the field name and binary text of each term. + + """ for fieldname in self._iter_fields(): for btext in self._iter_btexts(): yield (fieldname, btext) def terms_from(self, fieldname, prefix): + """ + Returns an iterator over the terms with the given field name and prefix. + + Parameters: + - fieldname (str): The name of the field. + - prefix (bytes): The prefix of the terms. + + Yields: + - tuple: A tuple containing the field name and binary text of each term. + + """ self._find_field(fieldname) for btext in self._iter_btexts(): if btext < prefix: @@ -407,19 +1059,67 @@ def terms_from(self, fieldname, prefix): yield (fieldname, btext) def items(self): + """ + Returns an iterator over the terms and their corresponding term information. + + Yields: + - tuple: A tuple containing the term (field name and binary text) and its term information. + + """ for fieldname, btext in self.terms(): yield (fieldname, btext), self._find_terminfo() def items_from(self, fieldname, prefix): + """ + Returns an iterator over the terms with the given field name and prefix, and their corresponding term information. + + Parameters: + - fieldname (str): The name of the field. + - prefix (bytes): The prefix of the terms. + + Yields: + - tuple: A tuple containing the term (field name and binary text) and its term information. + + """ for fieldname, btext in self.terms_from(fieldname, prefix): yield (fieldname, btext), self._find_terminfo() def term_info(self, fieldname, btext): + """ + Retrieves the term information for the given field name and binary text. + + Parameters: + - fieldname (str): The name of the field. + - btext (bytes): The binary text of the term. + + Returns: + - TermInfo: The term information. + + Raises: + - TermNotFound: If the term is not found. + + """ if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) return self._find_terminfo() def matcher(self, fieldname, btext, format_, scorer=None): + """ + Creates a matcher for the given field name and binary text. + + Parameters: + - fieldname (str): The name of the field. + - btext (bytes): The binary text of the term. 
+ - format_ (int): The format of the matcher. + - scorer (Scorer): The scorer object to use for scoring the matches. + + Returns: + - ListMatcher: The matcher object. + + Raises: + - TermNotFound: If the term is not found. + + """ if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) @@ -436,23 +1136,73 @@ def matcher(self, fieldname, btext, format_, scorer=None): return ListMatcher(ids, weights, values, format_, scorer=scorer) def close(self): + """ + Closes the PlainTermsReader object. + + """ self._dbfile.close() self.is_closed = True class PlainSegment(base.Segment): + """ + Represents a segment in a plain text index. + + This class is responsible for managing a segment in a plain text index. + It keeps track of the document count and provides methods to interact + with the segment. + + Attributes: + _doccount (int): The number of documents in the segment. + """ + def __init__(self, indexname): + """ + Initializes a PlainSegment object. + + Args: + indexname (str): The name of the index. + + """ base.Segment.__init__(self, indexname) self._doccount = 0 def codec(self): + """ + Returns the codec associated with the segment. + + Returns: + PlainTextCodec: The codec associated with the segment. + + """ return PlainTextCodec() def set_doc_count(self, doccount): + """ + Sets the document count for the segment. + + Args: + doccount (int): The number of documents in the segment. + + """ self._doccount = doccount def doc_count(self): + """ + Returns the document count for the segment. + + Returns: + int: The number of documents in the segment. + + """ return self._doccount def should_assemble(self): + """ + Determines whether the segment should be assembled. + + Returns: + bool: True if the segment should be assembled, False otherwise. + + """ return False diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py index 66042554..244357b2 100644 --- a/src/whoosh/codec/whoosh2.py +++ b/src/whoosh/codec/whoosh2.py @@ -35,6 +35,10 @@ from pickle import dumps, loads from struct import Struct +from iniconfig import ParseError + +from whoosh.qparser.dateparse import DateParseError + try: import zlib except ImportError: @@ -74,6 +78,27 @@ def cdb_hash(key): + """ + Calculate the hash value for a given key using the CDB hash algorithm. + + Args: + key (str): The key to calculate the hash value for. + + Returns: + int: The calculated hash value. + + Algorithm: + The CDB hash algorithm is a simple and efficient hash function. + It uses the following steps to calculate the hash value: + 1. Initialize the hash value to 5381. + 2. For each character in the key, update the hash value using the formula: + h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) + 3. Return the final hash value. + + Example: + >>> cdb_hash("hello") + 1934859637 + """ h = 5381 for c in key: h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) @@ -81,10 +106,37 @@ def cdb_hash(key): def md5_hash(key): + """ + Calculate the MD5 hash of the given key and return the hash value as an integer. + + Parameters: + key (str): The key to be hashed. + + Returns: + int: The MD5 hash value of the key as an integer. + + Example: + >>> md5_hash("hello") + 1234567890 + + Note: + This function uses the MD5 algorithm to calculate the hash value of the key. + The resulting hash value is converted to an integer and returned. + """ return int(md5(key).hexdigest(), 16) & 0xFFFFFFFF def crc_hash(key): + """ + Calculates the CRC hash value for the given key. + + Args: + key (bytes): The key to calculate the CRC hash for. 
+ + Returns: + int: The CRC hash value. + + """ return crc32(key) & 0xFFFFFFFF @@ -110,7 +162,49 @@ def crc_hash(key): class HashWriter: + """ + A class for writing hash-based data to a file. + + Parameters: + - dbfile (file-like object): The file-like object to write the hash data to. + - hashtype (int, optional): The type of hashing function to use. Defaults to 2. + + Attributes: + - dbfile (file-like object): The file-like object to write the hash data to. + - hashtype (int): The type of hashing function used. + - extras (dict): Extra data associated with the hash data. + - startoffset (int): The starting offset in the file where the hash data is written. + - header_size (int): The size of the header in bytes. + - hash_func (function): The hashing function used. + - hashes (defaultdict): A dictionary of hashed values. + + Methods: + - add(key, value): Adds a key-value pair to the hash data. + - add_all(items): Adds multiple key-value pairs to the hash data. + - _write_hashes(): Writes the hash data to the file. + - _write_extras(): Writes the extra data to the file. + - _write_directory(): Writes the directory of hash values to the file. + - close(): Closes the file. + + """ + def __init__(self, dbfile, hashtype=2): + """ + Initialize a Whoosh2 codec object. + + Args: + dbfile (file-like object): The file-like object representing the database file. + hashtype (int, optional): The type of hashing function to be used. Defaults to 2. + + Attributes: + dbfile (file-like object): The file-like object representing the database file. + hashtype (int): The type of hashing function used. + extras (dict): A dictionary to store additional data. + startoffset (int): The starting offset in the database file. + header_size (int): The size of the header in bytes. + hash_func (function): The hashing function used. + hashes (defaultdict): A dictionary to store the directory of hashed values. + """ self.dbfile = dbfile self.hashtype = hashtype self.extras = {} @@ -131,6 +225,32 @@ def __init__(self, dbfile, hashtype=2): self.hashes = defaultdict(list) def add(self, key, value): + """ + Adds a key-value pair to the hash data. + + Parameters: + - key (bytes): The key to be hashed. + - value (bytes): The value associated with the key. + + Returns: + None + + Raises: + AssertionError: If the key or value is not of type bytes. + + Notes: + - This method writes the length of the key and value to the database file, followed by the key and value themselves. + - The key is hashed using the hash function specified during initialization. + - The hashed key and the position in the database file where the key-value pair is written are stored in a list for efficient retrieval. + + Usage: + ``` + db = HashDatabase() + key = b'my_key' + value = b'my_value' + db.add(key, value) + ``` + """ assert isinstance(key, bytes) assert isinstance(value, bytes) @@ -144,11 +264,42 @@ def add(self, key, value): self.hashes[h & 255].append((h, pos)) def add_all(self, items): + """ + Adds multiple key-value pairs to the hash data. + + Parameters: + - items (iterable): An iterable of (key, value) pairs. + + Usage: + - To add multiple key-value pairs to the hash data, pass an iterable of (key, value) pairs to the `add_all` method. + + Example: + >>> data = [('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3')] + >>> hash_data.add_all(data) + + """ add = self.add for key, value in items: add(key, value) def _write_hashes(self): + """ + Writes the hash data to the file. 
+ + This method writes the hash data to the file, which is used for efficient + lookup of terms in the index. It generates a directory of positions and + number of slots for each hash value, and then writes the hash table entries + to the file. + + The hash table entries are stored in a list of tuples, where each tuple + contains the hash value and the position of the term in the index file. + + Usage: + _write_hashes() + + Returns: + None + """ dbfile = self.dbfile hashes = self.hashes directory = self.directory = [] @@ -176,6 +327,16 @@ def _write_hashes(self): self.extrasoffset = dbfile.tell() def _write_extras(self): + """ + Writes the extra data to the file. + + This method is responsible for writing the extra data to the file. + It first serializes the extras object using pickle and writes it to the file. + Then, it seeks back to the start offset + 8 and writes the pointer to the extras. + + Note: The extras object must be serializable using pickle. + + """ self.dbfile.write_pickle(self.extras) # Seek back and write the pointer to the extras self.dbfile.flush() @@ -183,6 +344,18 @@ def _write_extras(self): self.dbfile.write_long(self.extrasoffset) def _write_directory(self): + """ + Writes the directory of hash values to the file. + + This method is responsible for writing the directory of hash values to the file. + It seeks back to the header, writes the pointer to the end of the hashes, + and writes the pointers to the hash tables. + + Note: + This method assumes that the file has already been opened and positioned + correctly at the start offset. + + """ dbfile = self.dbfile directory = self.directory @@ -198,6 +371,16 @@ def _write_directory(self): assert dbfile.tell() == self.header_size def close(self): + """ + Closes the file. + + This method is responsible for closing the file and performing any necessary cleanup operations. + It writes the hashes, extras, and directory to the file, and then closes the file object. + + Note: + - After calling this method, the file object should no longer be used. + + """ self._write_hashes() self._write_extras() self._write_directory() @@ -205,7 +388,31 @@ def close(self): class HashReader: + """ + A class for reading and accessing data from a hash-based file format. + + Args: + dbfile (file-like object): The file-like object representing the hash-based file. + startoffset (int, optional): The starting offset in the file. Defaults to 0. + + Raises: + ValueError: If the file header is unknown. + + Attributes: + dbfile (file-like object): The file-like object representing the hash-based file. + startoffset (int): The starting offset in the file. + is_closed (bool): Indicates whether the HashReader is closed or not. + + """ + def __init__(self, dbfile, startoffset=0): + """ + Initialize a Whoosh2 object. + + Args: + dbfile (file-like object): The file-like object representing the Whoosh2 database file. + startoffset (int, optional): The starting offset in the file. Defaults to 0. + """ self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False @@ -236,22 +443,92 @@ def __init__(self, dbfile, startoffset=0): self._read_extras() def _read_extras(self): + """ + Read the extras section of the hash-based file. + + This method reads the extras section of the hash-based file and stores the + data in the `extras` attribute of the object. The extras section contains + additional metadata or auxiliary information associated with the file. + + Raises: + EOFError: If the end of the file is reached before reading the extras. 
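+
+        Note:
+            In practice the ``EOFError`` is caught internally and ``extras``
+            falls back to an empty dict, so callers do not need to handle it.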
+ """ try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): + """ + Close the HashReader. + + This method closes the HashReader and releases any resources held by it. Once closed, + the HashReader cannot be used again. + + Raises: + ValueError: If the HashReader is already closed. + """ if self.is_closed: raise ValueError(f"Tried to close {self} twice") self.dbfile.close() self.is_closed = True def read(self, position, length): + """ + Read data from the hash-based file. + + Args: + position (int): The position in the file to start reading from. + length (int): The number of bytes to read. + + Returns: + bytes: The read data. + + Raises: + OSError: If there is an error reading the file. + + Notes: + This method reads data from the hash-based file at the specified position and with the specified length. + It is used to retrieve data from the file. + """ self.dbfile.seek(position) return self.dbfile.read(length) def _ranges(self, pos=None): + """ + Generate ranges of key-value pairs in the hash-based file. + + Args: + pos (int, optional): The starting position in the file. Defaults to None. + + Yields: + tuple: A tuple containing the key position, key length, data position, and data length. + + Raises: + ValueError: If the starting position is beyond the end of the file. + + Notes: + This method is used to iterate over the key-value pairs stored in the hash-based file. + It generates tuples containing the position and length of the key, as well as the position + and length of the corresponding data. + + The `pos` parameter allows you to specify a starting position in the file. If `pos` is not + provided, the method will start from the beginning of the file. + + The method uses the `read` method to read data from the file. The `read` method should be + implemented by the subclass to read the specified number of bytes from the file at the given + position. + + The method calculates the key position, key length, data position, and data length based on + the lengths stored in the file. It then updates the position to point to the next key-value + pair in the file. + + The method yields each tuple of key-value pair ranges, allowing you to process them one by one. + The caller can iterate over the yielded tuples using a for loop or any other iterable method. + + If the starting position is beyond the end of the file, a `ValueError` is raised. + + """ if pos is None: pos = self.header_size eod = self._start_of_hashes @@ -264,9 +541,30 @@ def _ranges(self, pos=None): yield (keypos, keylen, datapos, datalen) def __iter__(self): + """ + Iterate over the key-value pairs in the hash-based file. + + This method returns an iterator that allows iterating over the key-value pairs + stored in the hash-based file. Each iteration yields a tuple containing the key + and value. + + Returns: + iterator: An iterator over the key-value pairs in the hash-based file. + + Example: + >>> for key, value in hash_file: + ... print(key, value) + """ return iter(self.items()) def items(self): + """ + Iterate over the key-value pairs in the hash-based file. + + Yields: + tuple: A tuple containing the key and value. + + """ read = self.read for keypos, keylen, datapos, datalen in self._ranges(): key = read(keypos, keylen) @@ -274,40 +572,119 @@ def items(self): yield (key, value) def keys(self): + """ + Iterate over the keys in the hash-based file. + + This method returns an iterator that yields the keys stored in the hash-based file. + The keys are returned as bytes. 
+ + Yields: + bytes: The key. + """ read = self.read for keypos, keylen, _, _ in self._ranges(): yield read(keypos, keylen) def values(self): + """ + Iterate over the values in the hash-based file. + + This method returns a generator that iterates over the values stored in the hash-based file. + Each value is read from the file using the `read` method. + + Yields: + bytes: The value. + """ read = self.read for _, _, datapos, datalen in self._ranges(): yield read(datapos, datalen) def __getitem__(self, key): + """ + Get the value associated with the given key. + + Args: + key (bytes): The key to retrieve the value for. + + Returns: + bytes: The value associated with the key. + + Raises: + KeyError: If the key is not found. + """ for data in self.all(key): return data raise KeyError(key) def get(self, key, default=None): + """ + Get the value associated with the given key, or a default value if the key is not found. + + Args: + key (bytes): The key to retrieve the value for. + default (Any, optional): The default value to return if the key is not found. Defaults to None. + + Returns: + bytes: The value associated with the key, or the default value if the key is not found. + """ for data in self.all(key): return data return default def all(self, key): + """ + Get all values associated with the given key. + + Args: + key (bytes): The key to retrieve the values for. + + Yields: + bytes: The values associated with the key. + """ read = self.read for datapos, datalen in self.ranges_for_key(key): yield read(datapos, datalen) def __contains__(self, key): + """ + Check if the given key is present in the hash-based file. + + Args: + key (bytes): The key to check. + + Returns: + bool: True if the key is present, False otherwise. + """ for _ in self.ranges_for_key(key): return True return False def _hashtable_info(self, keyhash): + """ + Get the directory position and number of hash entries for the given key hash. + + Args: + keyhash (int): The hash value of the key. + + Returns: + tuple: A tuple containing the directory position and number of hash entries. + """ # Return (directory_position, number_of_hash_entries) return self.buckets[keyhash & 255] def _key_position(self, key): + """ + Get the position of the given key in the hash-based file. + + Args: + key (bytes): The key to get the position for. + + Returns: + int: The position of the key. + + Raises: + KeyError: If the key is not found. + """ keyhash = self.hash_func(key) hpos, hslots = self._hashtable_info(keyhash) if not hslots: @@ -317,10 +694,28 @@ def _key_position(self, key): return self.dbfile.get_long(slotpos + _INT_SIZE) def _key_at(self, pos): + """ + Get the key at the given position in the hash-based file. + + Args: + pos (int): The position of the key. + + Returns: + bytes: The key. + """ keylen = self.dbfile.get_uint(pos) return self.read(pos + lengths_size, keylen) def ranges_for_key(self, key): + """ + Get the ranges of data associated with the given key. + + Args: + key (bytes): The key to retrieve the ranges for. + + Yields: + tuple: A tuple containing the data position and data length. + """ read = self.read if not isinstance(key, bytes): raise TypeError(f"Key {key} should be bytes") @@ -347,18 +742,57 @@ def ranges_for_key(self, key): yield (pos + lengths_size + keylen, datalen) def range_for_key(self, key): + """ + Get the first range of data associated with the given key. + + Args: + key (bytes): The key to retrieve the range for. + + Returns: + tuple: A tuple containing the data position and data length. 
+ + Raises: + KeyError: If the key is not found. + """ for item in self.ranges_for_key(key): return item raise KeyError(key) class OrderedHashWriter(HashWriter): + """ + A class for writing key-value pairs to a hash-based database file with ordered keys. + + Inherits from HashWriter. + + Usage: + writer = OrderedHashWriter(dbfile) + writer.add(key, value) + writer.commit() + """ + def __init__(self, dbfile): + """ + Initializes an OrderedHashWriter object. + + Parameters: + - dbfile (file): The file object representing the hash-based database file. + """ HashWriter.__init__(self, dbfile) self.index = GrowableArray("H") self.lastkey = emptybytes def add(self, key, value): + """ + Adds a key-value pair to the database. + + Parameters: + - key: The key to be added. + - value: The value associated with the key. + + Raises: + - ValueError: If the keys are not in increasing order. + """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) @@ -366,6 +800,9 @@ def add(self, key, value): self.lastkey = key def _write_extras(self): + """ + Writes additional information about the index to the extras section of the database file. + """ dbfile = self.dbfile # Save information about the index in the extras @@ -382,7 +819,31 @@ def _write_extras(self): class OrderedHashReader(HashReader): + """ + A class for reading ordered hash data from a database file. + + Inherits from HashReader. + + Attributes: + indexbase (int): The base position of the index in the database file. + indexlen (int): The length of the index. + indextype (str): The type of the index. + _ixsize (int): The size of each index entry in bytes. + _ixpos (function): A function for reading index values based on the indextype. + + Methods: + closest_key(key): Returns the closest key to the given key in the hash data. + items_from(key): Yields key-value pairs starting from the given key. + keys_from(key): Yields keys starting from the given key. + """ + def __init__(self, dbfile): + """ + Initializes an OrderedHashReader object. + + Args: + dbfile (file): The database file to read from. + """ HashReader.__init__(self, dbfile) self.indexbase = self.extras["indexbase"] self.indexlen = self.extras["indexlen"] @@ -403,6 +864,15 @@ def __init__(self, dbfile): raise ValueError(f"Unknown index type {indextype}") def _closest_key(self, key): + """ + Finds the closest key to the given key in the hash data. + + Args: + key (bytes): The key to search for. + + Returns: + int or None: The position of the closest key in the hash data, or None if not found. + """ key_at = self._key_at indexbase = self.indexbase ixpos, ixsize = self._ixpos, self._ixsize @@ -424,13 +894,30 @@ def _closest_key(self, key): return ixpos(indexbase + lo * ixsize) def closest_key(self, key): + """ + Returns the closest key to the given key in the hash data. + + Args: + key (bytes): The key to search for. + + Returns: + bytes or None: The closest key to the given key, or None if not found. + """ pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): - # read = self.read + """ + Generates ranges of key-value pairs starting from the given key. + + Args: + key (bytes): The key to start from. + + Yields: + tuple: A tuple containing the key position, key length, data position, and data length. 
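+
+        Note:
+            Iteration begins at the closest on-disk key position and continues
+            to the end of the key-value section; if no position is found for
+            ``key``, nothing is yielded.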
+ """ pos = self._closest_key(key) if pos is None: return @@ -438,11 +925,29 @@ def _ranges_from(self, key): yield from self._ranges(pos=pos) def items_from(self, key): + """ + Yields key-value pairs starting from the given key. + + Args: + key (bytes): The key to start from. + + Yields: + tuple: A tuple containing the key and value. + """ read = self.read for keypos, keylen, datapos, datalen in self._ranges_from(key): yield (read(keypos, keylen), read(datapos, datalen)) def keys_from(self, key): + """ + Yields keys starting from the given key. + + Args: + key (bytes): The key to start from. + + Yields: + bytes: The key. + """ read = self.read for keypos, keylen, _, _ in self._ranges_from(key): yield read(keypos, keylen) @@ -452,6 +957,31 @@ def keys_from(self, key): class W2Codec(base.Codec): + """ + Codec implementation for the Whoosh 2 index format. + + This codec provides the necessary methods for reading and writing + various components of the index, such as term index, term postings, + spelling graph, field lengths, vector index, vector postings, and + stored fields. + + Args: + blocklimit (int): The maximum number of terms to store in a block. + compression (int): The level of compression to apply to the index data. + loadlengths (bool): Whether to load field lengths during reading. + inlinelimit (int): The maximum number of terms to store in a field block. + + Attributes: + TERMS_EXT (str): The file extension for the term index. + POSTS_EXT (str): The file extension for the term postings. + DAWG_EXT (str): The file extension for the spelling graph. + LENGTHS_EXT (str): The file extension for the field lengths. + VECTOR_EXT (str): The file extension for the vector index. + VPOSTS_EXT (str): The file extension for the vector postings. + STORED_EXT (str): The file extension for the stored fields. + + """ + TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings DAWG_EXT = FST_EXT = ".dag" # Spelling graph file @@ -461,19 +991,46 @@ class W2Codec(base.Codec): STORED_EXT = ".sto" # Stored fields file def __init__(self, blocklimit=128, compression=3, loadlengths=False, inlinelimit=1): + """ + Initialize the W2Codec. + + Args: + blocklimit (int): The maximum number of terms to store in a block. + compression (int): The level of compression to apply to the index data. + loadlengths (bool): Whether to load field lengths during reading. + inlinelimit (int): The maximum number of terms to store in a field block. + """ self.blocklimit = blocklimit self.compression = compression self.loadlengths = loadlengths self.inlinelimit = inlinelimit - # Per-document value writer def per_document_writer(self, storage, segment): + """ + Create a per-document value writer. + + Args: + storage: The storage object for the index. + segment: The segment object for the index. + + Returns: + W2PerDocWriter: The per-document value writer. + """ return W2PerDocWriter( storage, segment, blocklimit=self.blocklimit, compression=self.compression ) - # Inverted index writer def field_writer(self, storage, segment): + """ + Create an inverted index writer. + + Args: + storage: The storage object for the index. + segment: The segment object for the index. + + Returns: + W2FieldWriter: The inverted index writer. + """ return W2FieldWriter( storage, segment, @@ -482,26 +1039,65 @@ def field_writer(self, storage, segment): inlinelimit=self.inlinelimit, ) - # Readers - def terms_reader(self, storage, segment): + """ + Create a terms reader. + + Args: + storage: The storage object for the index. 
+ segment: The segment object for the index. + + Returns: + W2TermsReader: The terms reader. + """ tifile = segment.open_file(storage, self.TERMS_EXT) postfile = segment.open_file(storage, self.POSTS_EXT) return W2TermsReader(tifile, postfile) def per_document_reader(self, storage, segment): + """ + Create a per-document reader. + + Args: + storage: The storage object for the index. + segment: The segment object for the index. + + Returns: + W2PerDocReader: The per-document reader. + """ return W2PerDocReader(storage, segment) def graph_reader(self, storage, segment): + """ + Create a graph reader. + + Args: + storage: The storage object for the index. + segment: The segment object for the index. + + Returns: + GraphReader: The graph reader. + + Raises: + NoGraphError: If the spelling graph file is not found. + """ try: dawgfile = segment.open_file(storage, self.DAWG_EXT) except ValueError: raise NoGraphError return GraphReader(dawgfile) - # Segments and generations - def new_segment(self, storage, indexname): + """ + Create a new segment. + + Args: + storage: The storage object for the index. + indexname (str): The name of the index. + + Returns: + W2Segment: The new segment. + """ return W2Segment(indexname) @@ -509,9 +1105,30 @@ def new_segment(self, storage, indexname): class W2PerDocWriter(base.PerDocumentWriter): + """A class for writing per-document data in the Whoosh 2 codec. + + Args: + storage (Storage): The storage object to use for creating files. + segment (Segment): The segment object representing the current segment. + blocklimit (int, optional): The maximum number of vector items to store in a block. Defaults to 128. + compression (int, optional): The compression level to use when writing vector blocks. Defaults to 3. + + Attributes: + storage (Storage): The storage object used for creating files. + segment (Segment): The segment object representing the current segment. + blocklimit (int): The maximum number of vector items to store in a block. + compression (int): The compression level used when writing vector blocks. + doccount (int): The total number of documents written. + is_closed (bool): Indicates whether the writer has been closed. + + Note: + This class is used internally by the Whoosh 2 codec and should not be instantiated directly. + + """ + def __init__(self, storage, segment, blocklimit=128, compression=3): if not isinstance(blocklimit, int): - raise ValueError + raise ValueError("blocklimit must be an integer") self.storage = storage self.segment = segment self.blocklimit = blocklimit @@ -530,30 +1147,62 @@ def __init__(self, storage, segment, blocklimit=128, compression=3): self.vindex = self.vpostfile = None def _make_vector_files(self): + """Create the vector index and vector postings files.""" vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT) self.vindex = VectorWriter(vifile) self.vpostfile = self.segment.create_file(self.storage, W2Codec.VPOSTS_EXT) def start_doc(self, docnum): + """Start writing a new document. + + Args: + docnum (int): The document number. + + """ self.docnum = docnum self.storedfields = {} self.doccount = max(self.doccount, docnum + 1) def add_field(self, fieldname, fieldobj, value, length): + """Add a field to the current document. + + Args: + fieldname (str): The name of the field. + fieldobj (Field): The field object. + value (object): The field value. + length (int): The length of the field value. 
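+
+        Example:
+            Illustrative sketch of the per-document call sequence (in normal
+            use the codec constructs the writer; ``fieldobj`` and the values
+            shown here are assumptions):
+
+            >>> pdw.start_doc(0)
+            >>> pdw.add_field("title", fieldobj, "hello world", 2)
+            >>> pdw.finish_doc()
+            >>> pdw.close()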
+ + """ if length: self.lengths.add(self.docnum, fieldname, length) if value is not None: self.storedfields[fieldname] = value def _new_block(self, vformat): + """Create a new vector block. + + Args: + vformat (Format): The vector format. + + Returns: + W2Block: The new vector block. + + """ postingsize = vformat.posting_size return W2Block(postingsize, stringids=True) def add_vector_items(self, fieldname, fieldobj, items): + """Add vector items to the current document. + + Args: + fieldname (str): The name of the vector field. + fieldobj (Field): The vector field object. + items (list): A list of vector items in the format (text, weight, value_bytes). + + """ if self.vindex is None: self._make_vector_files() - # items = (text, weight, value_bytes) ... postfile = self.vpostfile blocklimit = self.blocklimit block = self._new_block(fieldobj.vector) @@ -589,10 +1238,12 @@ def add_vector_items(self, fieldname, fieldobj, items): self.vindex.add((self.docnum, fieldname), startoffset) def finish_doc(self): + """Finish writing the current document.""" self.stored.add(self.storedfields) self.storedfields = None def close(self): + """Close the writer.""" if self.storedfields is not None: self.stored.add(self.storedfields) self.stored.close() @@ -608,7 +1259,64 @@ def close(self): class W2FieldWriter(base.FieldWriter): + """ + The W2FieldWriter class is responsible for writing field data to the index files in the Whoosh search engine. + + Parameters: + - storage (Storage): The storage object used to store the index files. + - segment (base.Segment): The segment object representing the current segment being written. + - blocklimit (int): The maximum number of documents to store in a single block. + - compression (int): The level of compression to apply to the block data. + - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. + + Attributes: + - storage (Storage): The storage object used to store the index files. + - segment (base.Segment): The segment object representing the current segment being written. + - fieldname (str): The name of the field being written. + - text (str): The text of the current term being written. + - field (Field): The field object being written. + - format (Format): The format object associated with the field. + - spelling (bool): Indicates whether the field has spelling enabled. + - termsindex (TermIndexWriter): The term index writer object. + - postfile (File): The file object for storing the posting data. + - dawg (GraphWriter): The DAWG (Directed Acyclic Word Graph) writer object. + - blocklimit (int): The maximum number of documents to store in a single block. + - compression (int): The level of compression to apply to the block data. + - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. + - block (W2Block): The current block being written. + - terminfo (FileTermInfo): The term info object for the current term. + - _infield (bool): Indicates whether the writer is currently inside a field. + - is_closed (bool): Indicates whether the writer has been closed. + + Methods: + - _make_dawg_files(): Creates the DAWG (Directed Acyclic Word Graph) files if needed. + - _new_block(): Creates a new block object. + - _reset_block(): Resets the current block. + - _write_block(): Writes the current block to the posting file. + - _start_blocklist(): Starts a new block list in the posting file. + - start_field(fieldname, fieldobj): Starts writing a new field. 
+ - start_term(text): Starts writing a new term. + - add(docnum, weight, valuestring, length): Adds a document to the current block. + - add_spell_word(fieldname, text): Adds a spelling word to the DAWG. + - finish_term(): Finishes writing the current term. + - finish_field(): Finishes writing the current field. + - close(): Closes the writer and releases any resources. + """ + def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit=1): + """ + Initializes a new instance of the W2FieldWriter class. + + Parameters: + - storage (Storage): The storage object used to store the index files. + - segment (base.Segment): The segment object representing the current segment being written. + - blocklimit (int): The maximum number of documents to store in a single block. + - compression (int): The level of compression to apply to the block data. + - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. + + Raises: + - AssertionError: If the input parameters are not of the expected types. + """ assert isinstance(storage, Storage) assert isinstance(segment, base.Segment) assert isinstance(blocklimit, int) @@ -640,22 +1348,40 @@ def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit= self.is_closed = False def _make_dawg_files(self): + """ + Creates the DAWG (Directed Acyclic Word Graph) files if needed. + """ dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT) self.dawg = GraphWriter(dawgfile) def _new_block(self): + """ + Creates a new block object. + + Returns: + - W2Block: The new block object. + """ return W2Block(self.format.posting_size) def _reset_block(self): + """ + Resets the current block. + """ self.block = self._new_block() def _write_block(self): + """ + Writes the current block to the posting file. + """ self.terminfo.add_block(self.block) self.block.to_file(self.postfile, compression=self.compression) self._reset_block() self.blockcount += 1 def _start_blocklist(self): + """ + Starts a new block list in the posting file. + """ postfile = self.postfile self._reset_block() @@ -667,6 +1393,16 @@ def _start_blocklist(self): postfile.write_uint(0) def start_field(self, fieldname, fieldobj): + """ + Starts writing a new field. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj (Field): The field object. + + Raises: + - ValueError: If called before finishing the previous field. + """ self.fieldname = fieldname self.field = fieldobj self.format = fieldobj.format @@ -680,6 +1416,15 @@ def start_field(self, fieldname, fieldobj): self._infield = True def start_term(self, text): + """ + Starts writing a new term. + + Parameters: + - text (str): The text of the term. + + Raises: + - ValueError: If called inside a block. + """ if self.block is not None: raise ValueError("Called start_term in a block") self.text = text @@ -691,16 +1436,41 @@ def start_term(self, text): self._start_blocklist() def add(self, docnum, weight, valuestring, length): + """ + Adds a document to the current block. + + Parameters: + - docnum (int): The document number. + - weight (float): The weight of the document. + - valuestring (str): The value string of the document. + - length (int): The length of the document. + + Raises: + - ValueError: If the block size exceeds the block limit, the current block is written to the posting file. 
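+
+        Note:
+        - No exception is raised when a block fills up; once the block grows
+          past ``blocklimit`` it is flushed to the posting file and a new
+          block is started automatically.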
+ """ self.block.add(docnum, weight, valuestring, length) if len(self.block) > self.blocklimit: self._write_block() def add_spell_word(self, fieldname, text): + """ + Adds a spelling word to the DAWG (Directed Acyclic Word Graph). + + Parameters: + - fieldname (str): The name of the field. + - text (str): The spelling word. + """ if self.dawg is None: self._make_dawg_files() self.dawg.insert(text) def finish_term(self): + """ + Finishes writing the current term. + + Raises: + - ValueError: If called when not in a block. + """ block = self.block if block is None: raise ValueError("Called finish_term when not in a block") @@ -733,6 +1503,12 @@ def finish_term(self): self.termsindex.add((self.fieldname, self.text), terminfo) def finish_field(self): + """ + Finishes writing the current field. + + Raises: + - ValueError: If called before starting a field. + """ if not self._infield: raise ValueError("Called finish_field before start_field") self._infield = False @@ -742,6 +1518,9 @@ def finish_field(self): self._dawgfield = False def close(self): + """ + Closes the writer and releases any resources. + """ self.termsindex.close() self.postfile.close() if self.dawg is not None: @@ -753,6 +1532,18 @@ def close(self): class W2LeafMatcher(LeafMatcher): + """ + Represents a leaf matcher for the Whoosh 2 codec. + + Args: + postfile (file-like object): The file-like object containing the posting data. + startoffset (int): The starting offset of the leaf matcher in the postfile. + fmt (CodecFormat): The codec format used for encoding and decoding data. + scorer (Scorer, optional): The scorer used for scoring documents. Defaults to None. + term (Term, optional): The term associated with the leaf matcher. Defaults to None. + stringids (bool, optional): Whether the leaf matcher uses string-based document IDs. Defaults to False. + """ + def __init__( self, postfile, startoffset, fmt, scorer=None, term=None, stringids=False ): @@ -776,24 +1567,54 @@ def __init__( self._next_block() def id(self): + """ + Returns the document ID associated with the current posting. + + Returns: + int: The document ID. + """ return self.block.ids[self.i] def is_active(self): + """ + Checks if the leaf matcher is active. + + Returns: + bool: True if the leaf matcher is active, False otherwise. + """ return self._active def weight(self): + """ + Returns the weight of the current posting. + + Returns: + float: The weight of the posting. + """ weights = self.block.weights if not weights: weights = self.block.read_weights() return weights[self.i] def value(self): + """ + Returns the value of the current posting. + + Returns: + object: The value of the posting. + """ values = self.block.values if values is None: values = self.block.read_values() return values[self.i] def all_ids(self): + """ + Generator that yields all document IDs in the leaf matcher. + + Yields: + int: The document ID. + """ nextoffset = self.baseoffset for _ in range(self.blockcount): block = self._read_block(nextoffset) @@ -802,6 +1623,12 @@ def all_ids(self): yield from ids def next(self): + """ + Moves to the next posting in the leaf matcher. + + Returns: + bool: True if there is a next posting, False otherwise. + """ if self.i == self.block.count - 1: self._next_block() return True @@ -810,6 +1637,15 @@ def next(self): return False def skip_to(self, id): + """ + Skips to the posting with the specified document ID. + + Args: + id (int): The document ID to skip to. + + Raises: + ReadTooFar: If the leaf matcher has been read beyond the target ID. 
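+
+        Note:
+            ``ReadTooFar`` is raised only when the matcher is already
+            exhausted (no longer active) at the time of the call.
+
+        Example:
+            Illustrative sketch of advancing an active matcher ``m`` and then
+            stepping through the remaining postings:
+
+            >>> m.skip_to(42)
+            >>> while m.is_active():
+            ...     print(m.id(), m.weight())
+            ...     m.next()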
+ """ if not self.is_active(): raise ReadTooFar @@ -824,8 +1660,7 @@ def skip_to(self, id): if not self.is_active(): return - # Iterate through the IDs in the block until we find or pass the - # target + # Iterate through the IDs in the block until we find or pass the target ids = self.block.ids i = self.i while ids[i] < id: @@ -836,21 +1671,57 @@ def skip_to(self, id): self.i = i def skip_to_quality(self, minquality): + """ + Skips to the posting with a quality greater than or equal to the specified minimum quality. + + Args: + minquality (float): The minimum quality. + + Returns: + int: The number of blocks skipped. + + Note: + The quality of a posting is determined by the block quality function. + """ bq = self.block_quality if bq() > minquality: return 0 return self._skip_to_block(lambda: bq() <= minquality) def block_min_length(self): + """ + Returns the minimum length of postings in the current block. + + Returns: + int: The minimum length. + """ return self.block.min_length() def block_max_length(self): + """ + Returns the maximum length of postings in the current block. + + Returns: + int: The maximum length. + """ return self.block.max_length() def block_max_weight(self): + """ + Returns the maximum weight of postings in the current block. + + Returns: + float: The maximum weight. + """ return self.block.max_weight() def block_max_wol(self): + """ + Returns the maximum weight of lengths of postings in the current block. + + Returns: + float: The maximum weight of lengths. + """ return self.block.max_wol() def _read_block(self, offset): @@ -901,14 +1772,45 @@ def _skip_to_block(self, targetfn): class TermIndexWriter(HashWriter): + """ + A class for writing term index data to a database file. + + Inherits from HashWriter. + + Attributes: + index (list): A list of positions in the database file where each term is stored. + fieldcounter (int): Counter for assigning field numbers. + fieldmap (dict): Mapping of field names to field numbers. + + Methods: + keycoder(term): Encodes a term into a key for storage in the database file. + valuecoder(terminfo): Encodes a TermInfo object into a string for storage in the database file. + add(key, value): Adds a term and its associated value to the database file. + _write_extras(): Writes additional data (index and fieldmap) to the database file. + """ + def __init__(self, dbfile): + """ + Initializes a TermIndexWriter object. + + Args: + dbfile (file): The database file to write the term index data to. + """ HashWriter.__init__(self, dbfile) self.index = [] self.fieldcounter = 0 self.fieldmap = {} def keycoder(self, term): - # Encode term + """ + Encodes a term into a key for storage in the database file. + + Args: + term (tuple): A tuple containing the field name and the term text. + + Returns: + bytes: The encoded key. + """ fieldmap = self.fieldmap fieldname, text = term @@ -923,14 +1825,33 @@ def keycoder(self, term): return key def valuecoder(self, terminfo): + """ + Encodes a TermInfo object into a string for storage in the database file. + + Args: + terminfo (TermInfo): The TermInfo object to encode. + + Returns: + str: The encoded string. + """ return terminfo.to_string() def add(self, key, value): + """ + Adds a term and its associated value to the database file. + + Args: + key (bytes): The encoded key representing the term. + value (str): The encoded value representing the term information. 
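+
+        Note:
+            The key and value are passed in un-encoded, as a ``(fieldname,
+            text)`` tuple and a ``TermInfo`` object; this method applies
+            ``keycoder()`` and ``valuecoder()`` before delegating to
+            ``HashWriter.add()``.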
+ """ pos = self.dbfile.tell() self.index.append(pos) HashWriter.add(self, self.keycoder(key), self.valuecoder(value)) def _write_extras(self): + """ + Writes additional data (index and fieldmap) to the database file. + """ dbfile = self.dbfile dbfile.write_uint(len(self.index)) for n in self.index: @@ -939,7 +1860,27 @@ def _write_extras(self): class VectorWriter(TermIndexWriter): + """A class for writing vector data to the index. + + This class is responsible for encoding and writing vector data to the index. + It provides methods for encoding keys and values. + + Attributes: + fieldmap (dict): A dictionary mapping field names to field numbers. + fieldcounter (int): A counter for assigning field numbers. + + """ + def keycoder(self, key): + """Encode the key (docnum, fieldname) into a binary representation. + + Args: + key (tuple): A tuple containing the document number and field name. + + Returns: + bytes: The binary representation of the key. + + """ fieldmap = self.fieldmap docnum, fieldname = key @@ -953,6 +1894,15 @@ def keycoder(self, key): return _vectorkey_struct.pack(docnum, fieldnum) def valuecoder(self, offset): + """Encode the offset into a binary representation. + + Args: + offset (int): The offset value. + + Returns: + bytes: The binary representation of the offset. + + """ return pack_long(offset) @@ -960,11 +1910,33 @@ def valuecoder(self, offset): class PostingIndexBase(HashReader): + """ + Base class for a posting index. + + This class provides methods for reading and manipulating a posting index. + + Args: + dbfile (file): The file object representing the database file. + postfile (file): The file object representing the posting file. + + Attributes: + postfile (file): The file object representing the posting file. + length (int): The length of the posting index. + indexbase (int): The base position of the posting index in the database file. + fieldmap (dict): A mapping of field names to field numbers. + names (list): A list of field names in the order of their field numbers. + """ + def __init__(self, dbfile, postfile): HashReader.__init__(self, dbfile) self.postfile = postfile def _read_extras(self): + """ + Read the extra information from the database file. + + This method reads the length, index base, field map, and field names from the database file. + """ dbfile = self.dbfile self.length = dbfile.read_uint() @@ -977,6 +1949,15 @@ def _read_extras(self): self.names[num] = name def _closest_key(self, key): + """ + Find the closest key in the posting index. + + Args: + key (bytes): The key to search for. + + Returns: + int: The position of the closest key in the posting index. + """ dbfile = self.dbfile key_at = self._key_at indexbase = self.indexbase @@ -991,19 +1972,35 @@ def _closest_key(self, key): lo = mid + 1 else: hi = mid - # i = max(0, mid - 1) if lo == self.length: return None return dbfile.get_long(indexbase + lo * _LONG_SIZE) def closest_key(self, key): + """ + Find the closest key in the posting index. + + Args: + key (bytes): The key to search for. + + Returns: + bytes: The closest key in the posting index. + """ pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): - # read = self.read + """ + Generate ranges of key-value pairs starting from the given key. + + Args: + key (bytes): The key to start from. + + Yields: + tuple: A tuple containing the key position, key length, data position, and data length. 
+ """ pos = self._closest_key(key) if pos is None: return @@ -1011,10 +2008,31 @@ def _ranges_from(self, key): yield from self._ranges(pos=pos) def __getitem__(self, key): + """ + Get the value associated with the given key. + + Args: + key: The key to retrieve the value for. + + Returns: + object: The value associated with the key. + + Raises: + KeyError: If the key is not found in the posting index. + """ k = self.keycoder(key) return self.valuedecoder(HashReader.__getitem__(self, k)) def __contains__(self, key): + """ + Check if the given key is present in the posting index. + + Args: + key: The key to check. + + Returns: + bool: True if the key is present, False otherwise. + """ try: codedkey = self.keycoder(key) except KeyError: @@ -1022,27 +2040,77 @@ def __contains__(self, key): return HashReader.__contains__(self, codedkey) def range_for_key(self, key): + """ + Get the range of key-value pairs for the given key. + + Args: + key: The key to get the range for. + + Returns: + tuple: A tuple containing the start position and end position of the range. + """ return HashReader.range_for_key(self, self.keycoder(key)) def get(self, key, default=None): + """ + Get the value associated with the given key. + + Args: + key: The key to retrieve the value for. + default: The default value to return if the key is not found. + + Returns: + object: The value associated with the key, or the default value if the key is not found. + """ k = self.keycoder(key) return self.valuedecoder(HashReader.get(self, k, default)) def keys(self): + """ + Generate the keys in the posting index. + + Yields: + object: The keys in the posting index. + """ kd = self.keydecoder for k in HashReader.keys(self): yield kd(k) def items(self): + """ + Generate the key-value pairs in the posting index. + + Yields: + tuple: A tuple containing the key and value. + """ kd = self.keydecoder vd = self.valuedecoder for key, value in HashReader.items(self): yield (kd(key), vd(value)) def terms_from(self, fieldname, prefix): + """ + Generate the terms in the posting index starting from the given field name and prefix. + + Args: + fieldname: The field name to start from. + prefix: The prefix to match. + + Yields: + object: The terms in the posting index. + """ return self.keys_from((fieldname, prefix)) def keys_from(self, key): + """ + Generate the keys in the posting index starting from the given key. + + Args: + key: The key to start from. + + Yields: + object: The keys in the posting index. + """ key = self.keycoder(key) kd = self.keydecoder read = self.read @@ -1050,6 +2118,16 @@ def keys_from(self, key): yield kd(read(keypos, keylen)) def items_from(self, fieldname, prefix): + """ + Generate the key-value pairs in the posting index starting from the given field name and prefix. + + Args: + fieldname: The field name to start from. + prefix: The prefix to match. + + Yields: + tuple: A tuple containing the key and value. + """ read = self.read key = self.keycoder((fieldname, prefix)) kd = self.keydecoder @@ -1058,27 +2136,98 @@ def items_from(self, fieldname, prefix): yield (kd(read(keypos, keylen)), vd(read(datapos, datalen))) def values(self): + """ + Generate the values in the posting index. + + Yields: + object: The values in the posting index. + """ vd = self.valuedecoder for v in HashReader.values(self): yield vd(v) def close(self): + """ + Close the posting index. + + This method closes the posting index and the associated files. 
+ """ HashReader.close(self) self.postfile.close() class W2TermsReader(PostingIndexBase): - # Implements whoosh.codec.base.TermsReader + """ + A class that implements the TermsReader interface for the Whoosh2 codec. + + This class provides methods for reading terms, retrieving term information, + creating matchers for a given term, encoding and decoding keys, and decoding + values. + + Note: This class does not filter out deleted documents. A higher-level class + is expected to wrap the matcher to eliminate deleted documents. + + Args: + PostingIndexBase: The base class for the terms reader. + + Attributes: + postfile (PostingsFile): The postings file associated with the terms reader. + fieldmap (dict): A dictionary mapping field names to field numbers. + names (list): A list of field names. + dbfile (DatabaseFile): The database file associated with the terms reader. + + Methods: + terms(): Returns the list of terms in the index. + term_info(fieldname, text): Returns the term information for a given field and text. + matcher(fieldname, text, format_, scorer=None): Returns a matcher for a given field and text. + keycoder(key): Encodes a key. + keydecoder(v): Decodes a key. + valuedecoder(v): Decodes a value. + frequency(fieldname, btext): Returns the frequency of a term in a given field. + doc_frequency(fieldname, btext): Returns the document frequency of a term in a given field. + """ def terms(self): + """ + Returns the list of terms in the index. + + Returns: + list: A list of terms in the index. + """ return self.keys() def term_info(self, fieldname, text): + """ + Returns the term information for a given field and text. + + Args: + fieldname (str): The name of the field. + text (str): The text of the term. + + Returns: + TermInfo: The term information for the given field and text. + + Raises: + TermNotFound: If the term is not found in the index. + """ return self[fieldname, text] def matcher(self, fieldname, text, format_, scorer=None): - # Note this does not filter out deleted documents; a higher level is - # expected to wrap this matcher to eliminate deleted docs + """ + Returns a matcher for a given field and text. + + Args: + fieldname (str): The name of the field. + text (str): The text of the term. + format_ (str): The format of the matcher. + scorer (Scorer, optional): The scorer to use for scoring documents. Defaults to None. + + Returns: + Matcher: A matcher for the given field and text. + + Raises: + TermNotFound: If the term is not found in the index. + """ pf = self.postfile term = (fieldname, text) @@ -1098,24 +2247,71 @@ def matcher(self, fieldname, text, format_, scorer=None): return pr def keycoder(self, key): + """ + Encodes a key. + + Args: + key (tuple): The key to encode. + + Returns: + bytes: The encoded key. + """ fieldname, tbytes = key fnum = self.fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def keydecoder(self, v): + """ + Decodes a key. + + Args: + v (bytes): The key to decode. + + Returns: + tuple: The decoded key. + """ assert isinstance(v, bytes) return (self.names[unpack_ushort(v[:2])[0]], v[2:]) def valuedecoder(self, v): + """ + Decodes a value. + + Args: + v (bytes): The value to decode. + + Returns: + FileTermInfo: The decoded value. + """ assert isinstance(v, bytes) return FileTermInfo.from_string(v) def frequency(self, fieldname, btext): + """ + Returns the frequency of a term in a given field. + + Args: + fieldname (str): The name of the field. + btext (bytes): The encoded text of the term. 
+ + Returns: + int: The frequency of the term in the given field. + """ assert isinstance(btext, bytes) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_weight(self.dbfile, datapos) def doc_frequency(self, fieldname, btext): + """ + Returns the document frequency of a term in a given field. + + Args: + fieldname (str): The name of the field. + btext (bytes): The encoded text of the term. + + Returns: + int: The document frequency of the term in the given field. + """ assert isinstance(btext, bytes) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_doc_freq(self.dbfile, datapos) @@ -1126,26 +2322,112 @@ def doc_frequency(self, fieldname, btext): class W2VectorReader(PostingIndexBase): - # Implements whoosh.codec.base.VectorReader + """ + Implements the VectorReader interface for the Whoosh2 codec. + + This class provides methods for reading vector data from the index. + + Attributes: + postfile (file): The file object representing the posting file. + fieldmap (dict): A mapping of field names to field numbers. + names (list): A list of field names. + + """ def matcher(self, docnum, fieldname, format_): + """ + Returns a matcher for the given document number, field name, and format. + + Args: + docnum (int): The document number. + fieldname (str): The field name. + format_ (str): The format of the vector data. + + Returns: + W2LeafMatcher: A matcher object for the given parameters. + + """ pf = self.postfile offset = self[(docnum, fieldname)] pr = W2LeafMatcher(pf, offset, format_, stringids=True) return pr def keycoder(self, key): + """ + Encodes the key into a binary representation. + + Args: + key (tuple): The key to encode, consisting of a document number and a field name. + + Returns: + bytes: The binary representation of the key. + + """ return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]]) def keydecoder(self, v): + """ + Decodes the binary representation of a key. + + Args: + v (bytes): The binary representation of the key. + + Returns: + tuple: The decoded key, consisting of a document number and a field name. + + """ docnum, fieldnum = _vectorkey_struct.unpack(v) return (docnum, self.names[fieldnum]) def valuedecoder(self, v): + """ + Decodes the binary representation of a value. + + Args: + v (bytes): The binary representation of the value. + + Returns: + int: The decoded value. + + """ return unpack_long(v)[0] class W2PerDocReader(base.PerDocumentReader): + """Reader for per-document data in a Whoosh 2 index segment. + + This class provides methods for accessing per-document data such as field lengths, + stored fields, and vectors in a Whoosh 2 index segment. + + Parameters: + - storage (Storage): The storage object for the index. + - segment (Segment): The segment object representing the index segment. + + Attributes: + - _storage (Storage): The storage object for the index. + - _segment (Segment): The segment object representing the index segment. + - _doccount (int): The total number of documents in the segment. + - _lengths (InMemoryLengths): The object for accessing field lengths. + - _stored (StoredFieldReader): The object for accessing stored fields. + - _vectors (W2VectorReader): The object for accessing vectors. + + Methods: + - supports_columns(): Check if the reader supports column storage. + - close(): Close the reader and release any resources. + - doc_count(): Get the number of documents in the segment. + - doc_count_all(): Get the total number of documents in the segment. 
+ - has_deletions(): Check if the segment has deleted documents. + - is_deleted(docnum): Check if a document is deleted. + - deleted_docs(): Get the list of deleted document numbers. + - doc_field_length(docnum, fieldname, default=0): Get the length of a field in a document. + - field_length(fieldname): Get the total length of a field in all documents. + - min_field_length(fieldname): Get the minimum length of a field in all documents. + - max_field_length(fieldname): Get the maximum length of a field in all documents. + - has_vector(docnum, fieldname): Check if a document has a vector for a field. + - vector(docnum, fieldname, format_): Get the vector for a field in a document. + - stored_fields(docnum): Get the stored fields for a document. + """ + def __init__(self, storage, segment): self._storage = storage self._segment = segment @@ -1160,44 +2442,108 @@ def __init__(self, storage, segment): self._vectors = None # Lazy load def supports_columns(self): + """Check if the reader supports column storage. + + Returns: + - bool: True if the reader supports column storage, False otherwise. + """ return False def close(self): + """Close the reader and release any resources.""" self._lengths.close() if self._vectors: self._vectors.close() self._stored.close() def doc_count(self): + """Get the number of documents in the segment. + + Returns: + - int: The number of documents in the segment. + """ return self._segment.doc_count() def doc_count_all(self): + """Get the total number of documents in the segment. + + Returns: + - int: The total number of documents in the segment. + """ return self._doccount def has_deletions(self): + """Check if the segment has deleted documents. + + Returns: + - bool: True if the segment has deleted documents, False otherwise. + """ return self._segment.has_deletions() def is_deleted(self, docnum): + """Check if a document is deleted. + + Parameters: + - docnum (int): The document number. + + Returns: + - bool: True if the document is deleted, False otherwise. + """ return self._segment.is_deleted(docnum) def deleted_docs(self): - return self._segment.deleted_docs() + """Get the list of deleted document numbers. - # Lengths + Returns: + - list[int]: The list of deleted document numbers. + """ + return self._segment.deleted_docs() def doc_field_length(self, docnum, fieldname, default=0): + """Get the length of a field in a document. + + Parameters: + - docnum (int): The document number. + - fieldname (str): The field name. + - default (int, optional): The default length to return if the field is not found. Defaults to 0. + + Returns: + - int: The length of the field in the document, or the default length if the field is not found. + """ return self._lengths.doc_field_length(docnum, fieldname, default) def field_length(self, fieldname): + """Get the total length of a field in all documents. + + Parameters: + - fieldname (str): The field name. + + Returns: + - int: The total length of the field in all documents. + """ return self._lengths.field_length(fieldname) def min_field_length(self, fieldname): + """Get the minimum length of a field in all documents. + + Parameters: + - fieldname (str): The field name. + + Returns: + - int: The minimum length of the field in all documents. + """ return self._lengths.min_field_length(fieldname) def max_field_length(self, fieldname): - return self._lengths.max_field_length(fieldname) + """Get the maximum length of a field in all documents. + + Parameters: + - fieldname (str): The field name. 
- # Vectors + Returns: + - int: The maximum length of the field in all documents. + """ + return self._lengths.max_field_length(fieldname) def _prep_vectors(self): vifile = self._segment.open_file(self._storage, W2Codec.VECTOR_EXT) @@ -1205,6 +2551,15 @@ def _prep_vectors(self): self._vectors = W2VectorReader(vifile, vpostfile) def has_vector(self, docnum, fieldname): + """Check if a document has a vector for a field. + + Parameters: + - docnum (int): The document number. + - fieldname (str): The field name. + + Returns: + - bool: True if the document has a vector for the field, False otherwise. + """ if self._vectors is None: try: self._prep_vectors() @@ -1213,13 +2568,29 @@ def has_vector(self, docnum, fieldname): return (docnum, fieldname) in self._vectors def vector(self, docnum, fieldname, format_): + """Get the vector for a field in a document. + + Parameters: + - docnum (int): The document number. + - fieldname (str): The field name. + - format_ (str): The format of the vector. + + Returns: + - VectorMatcher: The vector matcher object. + """ if self._vectors is None: self._prep_vectors() return self._vectors.matcher(docnum, fieldname, format_) - # Stored - def stored_fields(self, docnum): + """Get the stored fields for a document. + + Parameters: + - docnum (int): The document number. + + Returns: + - dict: The stored fields for the document. + """ return self._stored[docnum] @@ -1227,15 +2598,39 @@ def stored_fields(self, docnum): class ByteLengthsBase: + """ + Base class for storing byte lengths of fields in a document. + + This class provides methods to read and store byte lengths of fields in a document. + It also provides methods to retrieve the total number of documents, the length of a specific field, + and the minimum and maximum lengths of a field. + + Attributes: + magic (bytes): The magic number used to identify the file format. + """ + magic = b"~LN1" def __init__(self): + """ + Initializes a new instance of the ByteLengthsBase class. + """ self.starts = {} self.totals = {} self.minlens = {} self.maxlens = {} def _read_header(self, dbfile, doccount): + """ + Reads the header information from the database file. + + Args: + dbfile (file): The file object representing the database file. + doccount (int): The number of documents saved in the database. + + Raises: + AssertionError: If the magic number or version number is not as expected. + """ first = dbfile.read(4) # Magic assert first == self.magic version = dbfile.read_int() # Version number @@ -1258,31 +2653,113 @@ def _read_header(self, dbfile, doccount): self.starts[fieldname] += eoh def doc_count_all(self): + """ + Returns the total number of documents saved in the database. + + Returns: + int: The total number of documents. + """ return self._count def field_length(self, fieldname): + """ + Returns the total length of a specific field in the database. + + Args: + fieldname (str): The name of the field. + + Returns: + int: The total length of the field. + + Raises: + KeyError: If the field name is not found in the database. + """ return self.totals.get(fieldname, 0) def min_field_length(self, fieldname): + """ + Returns the minimum length of a specific field in the database. + + Args: + fieldname (str): The name of the field. + + Returns: + int: The minimum length of the field. + + Raises: + KeyError: If the field name is not found in the database. + """ return self.minlens.get(fieldname, 0) def max_field_length(self, fieldname): + """ + Returns the maximum length of a specific field in the database. 
+ + Args: + fieldname (str): The name of the field. + + Returns: + int: The maximum length of the field. + + Raises: + KeyError: If the field name is not found in the database. + """ return self.maxlens.get(fieldname, 0) class InMemoryLengths(ByteLengthsBase): def __init__(self): + """ + Initialize the Whoosh2 codec. + + This method initializes the Whoosh2 codec by setting up the necessary data structures. + It inherits from the ByteLengthsBase class and initializes the totals and lengths dictionaries. + The totals dictionary keeps track of the total number of occurrences of each term in the index, + while the lengths dictionary stores the length of each term in bytes. + The _count variable is used to keep track of the number of terms. + + Usage: + codec = Whoosh2() + """ + ByteLengthsBase.__init__(self) self.totals = defaultdict(int) self.lengths = {} self._count = 0 def close(self): + """ + Closes the codec. + + This method is called to release any resources held by the codec. It should be called when the codec is no longer needed. + + """ pass # IO def to_file(self, dbfile, doccount): + """ + Write the index data to a file. + + Args: + dbfile (file): The file object to write the index data to. + doccount (int): The number of documents in the index. + + Raises: + IOError: If there is an error writing to the file. + + Notes: + This method writes the index data to a file in a specific format. + It writes the magic number, format version number, number of documents, + and number of fields to the file. Then, it writes per-field information, + including field name, field length, minimum field length, and maximum field length. + Finally, it writes the byte arrays for each field. + + Example: + >>> with open("index.db", "wb") as dbfile: + ... codec.to_file(dbfile, 1000) + """ self._pad_arrays(doccount) fieldnames = list(self.lengths.keys()) @@ -1305,6 +2782,21 @@ def to_file(self, dbfile, doccount): @classmethod def from_file(cls, dbfile, doccount=None): + """ + Load a Whoosh2 object from a file. + + Args: + cls (class): The class of the object to be loaded. + dbfile (file): The file object to read from. + doccount (int, optional): The number of documents in the object. Defaults to None. + + Returns: + obj: The loaded Whoosh2 object. + + Raises: + None. + + """ obj = cls() obj._read_header(dbfile, doccount) for fieldname, start in obj.starts.items(): @@ -1315,6 +2807,25 @@ def from_file(cls, dbfile, doccount=None): # Get def doc_field_length(self, docnum, fieldname, default=0): + """ + Returns the length of a field in a document. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + default (int, optional): The default length to return if the field is not found. Defaults to 0. + + Returns: + int: The length of the field in the document, or the default length if the field is not found. + + Raises: + None + + Example: + >>> codec = WhooshCodec() + >>> codec.doc_field_length(0, "title") + 10 + """ try: arry = self.lengths[fieldname] except KeyError: @@ -1326,6 +2837,18 @@ def doc_field_length(self, docnum, fieldname, default=0): # Min/max cache setup -- not meant to be called while adding def _minmax(self, fieldname, op, cache): + """ + Returns the minimum or maximum value for a given field, based on the provided operation. + + Args: + fieldname (str): The name of the field. + op (function): The operation to be performed on the field's lengths. + cache (dict): A dictionary used to cache previously computed results. 
+ + Returns: + int: The minimum or maximum value for the field. + + """ if fieldname in cache: return cache[fieldname] else: @@ -1338,20 +2861,71 @@ def _minmax(self, fieldname, op, cache): return result def min_field_length(self, fieldname): + """ + Returns the minimum length allowed for a field. + + Parameters: + - fieldname (str): The name of the field. + + Returns: + - int: The minimum length allowed for the field. + + """ return self._minmax(fieldname, min, self.minlens) def max_field_length(self, fieldname): + """ + Returns the maximum field length for a given field. + + Parameters: + - fieldname (str): The name of the field. + + Returns: + - int: The maximum field length. + + """ return self._minmax(fieldname, max, self.maxlens) # Add def _create_field(self, fieldname, docnum): + """ + Create a new field for the given document number. + + Args: + fieldname (str): The name of the field. + docnum (int): The document number. + + Returns: + None + + Raises: + None + + Notes: + This method is used to create a new field for a document in the index. + It updates the lengths dictionary with the field's length information. + The _count attribute is also updated to reflect the maximum document number. + + """ dc = max(self._count, docnum + 1) self.lengths[fieldname] = array("B", (0 for _ in range(dc))) self._count = dc def _pad_arrays(self, doccount): - # Pad out arrays to full length + """ + Pad out arrays to full length. + + This method is used to ensure that the arrays storing the lengths of fields are + of the same length as the number of documents in the index. If the arrays are + shorter than the desired length, they are padded with zeros. + + Parameters: + - doccount (int): The desired length of the arrays. + + Returns: + None + """ for fieldname in self.lengths.keys(): arry = self.lengths[fieldname] if len(arry) < doccount: @@ -1360,6 +2934,28 @@ def _pad_arrays(self, doccount): self._count = doccount def add(self, docnum, fieldname, length): + """ + Add the length of a field for a specific document. + + Args: + docnum (int): The document number. + fieldname (str): The name of the field. + length (int): The length of the field. + + Returns: + None + + Raises: + None + + Notes: + This method updates the lengths and totals dictionaries to keep track of the field lengths + for each document. If the field does not exist in the lengths dictionary, it will be created. + The length is converted to a byte value using the length_to_byte function. The byte value is + then stored in the lengths dictionary for the specified document and field. The totals + dictionary is also updated to keep track of the total length of each field. + + """ lengths = self.lengths if length: if fieldname not in lengths: @@ -1377,30 +2973,97 @@ def add(self, docnum, fieldname, length): self.totals[fieldname] += length def add_other(self, other): + """ + Adds the lengths and totals from another instance of the Whoosh2 class to the current instance. + + Parameters: + - other (Whoosh2): Another instance of the Whoosh2 class. 
+ + Returns: + None + """ + lengths = self.lengths totals = self.totals doccount = self._count + + # Add missing length arrays for fname in other.lengths: if fname not in lengths: lengths[fname] = array("B") self._pad_arrays(doccount) + # Extend length arrays with values from other instance for fname in other.lengths: lengths[fname].extend(other.lengths[fname]) self._count = doccount + other._count self._pad_arrays(self._count) + # Add totals from other instance for fname in other.totals: totals[fname] += other.totals[fname] class OnDiskLengths(ByteLengthsBase): + """ + A class that represents the on-disk lengths of fields in a Whoosh index. + + This class is responsible for reading and retrieving the lengths of fields + stored on disk. It inherits from the ByteLengthsBase class. + + Parameters: + - dbfile (file-like object): The file-like object representing the on-disk + storage of the field lengths. + - doccount (int, optional): The total number of documents in the index. If + not provided, it will be determined by reading the header of the dbfile. + + Methods: + - doc_field_length(docnum, fieldname, default=0): Retrieves the length of a + field in a specific document. If the field is not found, it returns the + default value. + - close(): Closes the dbfile. + + Example usage: + ``` + dbfile = open("lengths.db", "rb") + lengths = OnDiskLengths(dbfile) + length = lengths.doc_field_length(10, "title") + lengths.close() + ``` + """ + def __init__(self, dbfile, doccount=None): + """ + Initialize a Whoosh2 object. + + Args: + dbfile (str): The path to the Whoosh2 database file. + doccount (int, optional): The number of documents in the database. Defaults to None. + + Raises: + SomeException: An exception that may be raised under certain conditions. + + Returns: + None + """ ByteLengthsBase.__init__(self) self.dbfile = dbfile self._read_header(dbfile, doccount) def doc_field_length(self, docnum, fieldname, default=0): + """ + Retrieves the length of a field in a specific document. + + Parameters: + - docnum (int): The document number. + - fieldname (str): The name of the field. + - default (int, optional): The default value to return if the field is + not found. Default is 0. + + Returns: + - int: The length of the field in the specified document, or the default + value if the field is not found. + """ try: start = self.starts[fieldname] except KeyError: @@ -1408,6 +3071,15 @@ def doc_field_length(self, docnum, fieldname, default=0): return byte_to_length(self.dbfile.get_byte(start + docnum)) def close(self): + """ + Closes the dbfile. + + This method closes the dbfile associated with the codec. It should be called when you are done using the codec to free up system resources. + + Usage: + codec.close() + + """ self.dbfile.close() @@ -1420,7 +3092,34 @@ def close(self): class StoredFieldWriter: + """ + Class for writing stored fields to a database file. + + Args: + dbfile (file): The file object to write the stored fields to. + + Attributes: + dbfile (file): The file object to write the stored fields to. + length (int): The number of stored fields written. + directory (list): A list of pointers to the stored fields in the file. + names (list): A list of field names. + name_map (dict): A mapping of field names to their index in the `names` list. + """ + def __init__(self, dbfile): + """ + Initialize a Whoosh2 object. + + Args: + dbfile (file): The file object representing the database file. + + Attributes: + dbfile (file): The file object representing the database file. 
+ length (int): The length of the database. + directory (list): A list of directory entries. + names (list): A list of names. + name_map (dict): A dictionary mapping names to their corresponding indices. + """ self.dbfile = dbfile self.length = 0 self.directory = [] @@ -1432,6 +3131,12 @@ def __init__(self, dbfile): self.name_map = {} def add(self, vdict): + """ + Adds a dictionary of field values to the stored fields. + + Args: + vdict (dict): A dictionary of field names and their corresponding values. + """ f = self.dbfile names = self.names name_map = self.name_map @@ -1451,11 +3156,20 @@ def add(self, vdict): f.write(vstring) def add_reader(self, sfreader): + """ + Adds stored fields from a reader object. + + Args: + sfreader (object): An object that provides an iterator over dictionaries of field values. + """ add = self.add for vdict in sfreader: add(vdict) def close(self): + """ + Closes the stored field writer and flushes the changes to the file. + """ f = self.dbfile dirpos = f.tell() f.write_pickle(self.names) @@ -1469,7 +3183,40 @@ def close(self): class StoredFieldReader: + """ + Reads stored fields from a database file. + + Args: + dbfile (file-like object): The database file to read from. + + Attributes: + dbfile (file-like object): The database file being read. + length (int): The number of stored fields in the database. + basepos (int): The base position in the database file. + names (list): The list of field names. + directory_offset (int): The offset of the directory in the database file. + + Methods: + close(): Closes the database file. + __iter__(): Iterates over the stored fields and yields a dictionary of field names and values. + __getitem__(num): Retrieves the stored field at the specified index. + + """ + def __init__(self, dbfile): + """ + Initialize a Whoosh2 object. + + Args: + dbfile (file-like object): The file-like object representing the Whoosh2 database file. + + Raises: + ValueError: If the database file is not valid. + + Notes: + This method reads the metadata from the database file and initializes the Whoosh2 object. + + """ self.dbfile = dbfile dbfile.seek(0) @@ -1491,15 +3238,31 @@ def __init__(self, dbfile): self.directory_offset = dbfile.tell() def close(self): + """ + Closes the database file. + + This method closes the database file associated with the current instance of the class. + After calling this method, any further operations on the database file will raise an exception. + + Usage: + codec = WhooshCodec() + codec.close() + + Raises: + Any exceptions raised by the underlying file object's close() method. + """ self.dbfile.close() def __iter__(self): + """ + Iterates over the stored fields and yields a dictionary of field names and values. + """ dbfile = self.dbfile names = self.names lengths = array("I") dbfile.seek(self.directory_offset) - for i in range(self.length): + for _ in range(self.length): dbfile.seek(_LONG_SIZE, 1) lengths.append(dbfile.read_uint()) @@ -1512,6 +3275,20 @@ def __iter__(self): yield vdict def __getitem__(self, num): + """ + Retrieves the stored field at the specified index. + + Args: + num (int): The index of the stored field to retrieve. + + Returns: + dict: A dictionary of field names and values. + + Raises: + IndexError: If the specified index is out of range. + ValueError: If there is an error reading the stored field. 
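+
+        Example (illustrative sketch; ``dbfile`` stands in for an open
+        stored-fields file, and the field values shown are hypothetical):
+
+            >>> reader = StoredFieldReader(dbfile)
+            >>> reader[0]
+            {'title': 'First document', 'id': '1'}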
+ + """ if num > self.length - 1: raise IndexError(f"Tried to get document {num}, file has {self.length}") @@ -1541,14 +3318,17 @@ def __getitem__(self, num): class W2Segment(base.Segment): def __init__(self, indexname, doccount=0, segid=None, deleted=None): """ - :param name: The name of the segment (the Index object computes this - from its name and the generation). + Represents a segment in the Whoosh index. + + :param indexname: The name of the index. + :type indexname: str :param doccount: The maximum document number in the segment. - :param term_count: Total count of all terms in all documents. - :param deleted: A set of deleted document numbers, or None if no - deleted documents exist in this segment. + :type doccount: int + :param segid: The segment ID. If not provided, a random ID will be generated. + :type segid: str, optional + :param deleted: A set of deleted document numbers, or None if no deleted documents exist in this segment. + :type deleted: set, optional """ - assert isinstance(indexname, str) self.indexname = indexname assert isinstance(doccount, int) @@ -1558,39 +3338,98 @@ def __init__(self, indexname, doccount=0, segid=None, deleted=None): self.compound = False def codec(self, **kwargs): + """ + Returns the codec associated with this segment. + + :param kwargs: Additional keyword arguments to pass to the codec constructor. + :return: The codec associated with this segment. + :rtype: W2Codec + """ return W2Codec(**kwargs) def set_doc_count(self, dc): + """ + Sets the document count for this segment. + + :param dc: The document count. + :type dc: int + """ self.doccount = dc def doc_count_all(self): + """ + Returns the total count of all documents in this segment. + + :return: The total count of all documents. + :rtype: int + """ return self.doccount def doc_count(self): + """ + Returns the count of non-deleted documents in this segment. + + :return: The count of non-deleted documents. + :rtype: int + """ return self.doccount - self.deleted_count() def has_deletions(self): + """ + Checks if this segment has any deleted documents. + + :return: True if there are deleted documents, False otherwise. + :rtype: bool + """ return self.deleted is not None and bool(self.deleted) def deleted_count(self): + """ + Returns the count of deleted documents in this segment. + + :return: The count of deleted documents. + :rtype: int + """ if self.deleted is None: return 0 return len(self.deleted) def delete_document(self, docnum, delete=True): + """ + Marks a document as deleted or undeleted. + + :param docnum: The document number. + :type docnum: int + :param delete: True to mark the document as deleted, False to mark it as undeleted. + :type delete: bool, optional + """ if delete: if self.deleted is None: self.deleted = set() self.deleted.add(docnum) elif self.deleted is not None and docnum in self.deleted: - self.deleted.clear(docnum) + self.deleted.remove(docnum) def is_deleted(self, docnum): + """ + Checks if a document is marked as deleted. + + :param docnum: The document number. + :type docnum: int + :return: True if the document is marked as deleted, False otherwise. + :rtype: bool + """ if self.deleted is None: return False return docnum in self.deleted def deleted_docs(self): + """ + Returns an iterator over the deleted document numbers in this segment. + + :return: An iterator over the deleted document numbers. 
+ :rtype: iterator + """ if self.deleted is None: return () else: @@ -1601,6 +3440,18 @@ def deleted_docs(self): class W2Block: + """ + Represents a block of data in the Whoosh index file format. + + Attributes: + magic (bytes): The magic number identifying the block format. + infokeys (tuple): The keys for the block information. + + Args: + postingsize (int): The size of the posting data. + stringids (bool, optional): Whether the block uses string IDs. Defaults to False. + """ + magic = b"Blk3" infokeys = ( @@ -1616,6 +3467,13 @@ class W2Block: ) def __init__(self, postingsize, stringids=False): + """ + Initializes a new instance of the W2Block class. + + Args: + postingsize (int): The size of the posting data. + stringids (bool, optional): Whether the block uses string IDs. Defaults to False. + """ self.postingsize = postingsize self.stringids = stringids self.ids = [] if stringids else array("I") @@ -1627,33 +3485,90 @@ def __init__(self, postingsize, stringids=False): self.maxweight = 0 def __len__(self): + """ + Returns the number of IDs in the block. + + Returns: + int: The number of IDs in the block. + """ return len(self.ids) def __nonzero__(self): + """ + Returns whether the block has any IDs. + + Returns: + bool: True if the block has IDs, False otherwise. + """ return bool(self.ids) def min_id(self): + """ + Returns the minimum ID in the block. + + Returns: + int: The minimum ID in the block. + + Raises: + IndexError: If the block has no IDs. + """ if self.ids: return self.ids[0] else: raise IndexError def max_id(self): + """ + Returns the maximum ID in the block. + + Returns: + int: The maximum ID in the block. + + Raises: + IndexError: If the block has no IDs. + """ if self.ids: return self.ids[-1] else: raise IndexError def min_length(self): + """ + Returns the minimum length of the values in the block. + + Returns: + int: The minimum length of the values in the block. + """ return self.minlength def max_length(self): + """ + Returns the maximum length of the values in the block. + + Returns: + int: The maximum length of the values in the block. + """ return self.maxlength def max_weight(self): + """ + Returns the maximum weight in the block. + + Returns: + float: The maximum weight in the block. + """ return self.maxweight def add(self, id_, weight, valuestring, length=None): + """ + Adds an ID, weight, and value to the block. + + Args: + id_ (int): The ID to add. + weight (float): The weight to add. + valuestring (str): The value string to add. + length (int, optional): The length of the value. Defaults to None. + """ self.ids.append(id_) self.weights.append(weight) if weight > self.maxweight: @@ -1669,6 +3584,13 @@ def add(self, id_, weight, valuestring, length=None): self.maxlength = length def to_file(self, postfile, compression=3): + """ + Writes the block data to a file. + + Args: + postfile (file): The file to write the block data to. + compression (int, optional): The compression level. Defaults to 3. + """ ids = self.ids idcode, idstring = minimize_ids(ids, self.stringids, compression) wtstring = minimize_weights(self.weights, compression) @@ -1699,6 +3621,17 @@ def to_file(self, postfile, compression=3): @classmethod def from_file(cls, postfile, postingsize, stringids=False): + """ + Reads a block from a file. + + Args: + postfile (file): The file to read the block from. + postingsize (int): The size of the posting data. + stringids (bool, optional): Whether the block uses string IDs. Defaults to False. + + Returns: + W2Block: The read block. 
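+
+        Example (illustrative sketch; ``postfile`` is assumed to be an open
+        postings file positioned at the start of a serialized block, and the
+        IDs shown are hypothetical):
+
+            >>> block = W2Block.from_file(postfile, postingsize=0)
+            >>> block.read_ids()
+            array('I', [1, 5, 42])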
+ """ block = cls(postingsize, stringids=stringids) block.postfile = postfile @@ -1715,6 +3648,12 @@ def from_file(cls, postfile, postingsize, stringids=False): return block def read_ids(self): + """ + Reads the IDs from the block. + + Returns: + list: The read IDs. + """ offset = self.dataoffset self.postfile.seek(offset) idstring = self.postfile.read(self.idslen) @@ -1723,6 +3662,12 @@ def read_ids(self): return ids def read_weights(self): + """ + Reads the weights from the block. + + Returns: + list: The read weights. + """ if self.weightslen == 0: weights = [1.0] * self.count else: @@ -1734,6 +3679,12 @@ def read_weights(self): return weights def read_values(self): + """ + Reads the values from the block. + + Returns: + list: The read values. + """ postingsize = self.postingsize if postingsize == 0: values = [None] * self.count @@ -1754,19 +3705,39 @@ def read_values(self): class FileTermInfo(TermInfo): - # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID + """ + Represents term information stored in a file-based index. + + Attributes: + postings: The postings associated with the term. + """ + struct = Struct("!fIBBffII") def __init__(self, *args, **kwargs): + """ + Initializes a new instance of the FileTermInfo class. + + Args: + *args: Variable length arguments. + **kwargs: Keyword arguments. + + Keyword Args: + postings: The postings associated with the term. + """ self.postings = None if "postings" in kwargs: self.postings = kwargs["postings"] del kwargs["postings"] TermInfo.__init__(self, *args, **kwargs) - # filedb specific methods - def add_block(self, block): + """ + Adds a block of postings to the term information. + + Args: + block: The block of postings to add. + """ self._weight += sum(block.weights) self._df += len(block) @@ -1783,6 +3754,12 @@ def add_block(self, block): self._maxid = block.ids[-1] def to_string(self): + """ + Converts the term information to a string representation. + + Returns: + The string representation of the term information. + """ # Encode the lengths as 0-255 values ml = 0 if self._minlength is None else length_to_byte(self._minlength) xl = length_to_byte(self._maxlength) @@ -1815,6 +3792,15 @@ def to_string(self): @classmethod def from_string(cls, s): + """ + Creates a new FileTermInfo instance from a string representation. + + Args: + s: The string representation of the term information. + + Returns: + A new FileTermInfo instance. + """ assert isinstance(s, bytes) if isinstance(s, str): @@ -1860,14 +3846,44 @@ def from_string(cls, s): @classmethod def read_weight(cls, dbfile, datapos): + """ + Reads the weight from the database file. + + Args: + dbfile: The database file. + datapos: The position of the weight in the file. + + Returns: + The weight. + """ return dbfile.get_float(datapos + 1) @classmethod def read_doc_freq(cls, dbfile, datapos): + """ + Reads the document frequency from the database file. + + Args: + dbfile: The database file. + datapos: The position of the document frequency in the file. + + Returns: + The document frequency. + """ return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) @classmethod def read_min_and_max_length(cls, dbfile, datapos): + """ + Reads the minimum and maximum length from the database file. + + Args: + dbfile: The database file. + datapos: The position of the lengths in the file. + + Returns: + A tuple containing the minimum and maximum length. 
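+
+        Example (illustrative; ``dbfile`` and ``datapos`` would normally come
+        from the on-disk terms index, and the lengths shown are hypothetical):
+
+            >>> FileTermInfo.read_min_and_max_length(dbfile, datapos)
+            (1, 12)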
+ """ lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE ml = byte_to_length(dbfile.get_byte(lenpos)) xl = byte_to_length(dbfile.get_byte(lenpos + 1)) @@ -1875,6 +3891,16 @@ def read_min_and_max_length(cls, dbfile, datapos): @classmethod def read_max_weight(cls, dbfile, datapos): + """ + Reads the maximum weight from the database file. + + Args: + dbfile: The database file. + datapos: The position of the maximum weight in the file. + + Returns: + The maximum weight. + """ weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 return dbfile.get_float(weightspos) @@ -1883,6 +3909,27 @@ def read_max_weight(cls, dbfile, datapos): def minimize_ids(arry, stringids, compression=0): + """ + Minimizes the given array of IDs for efficient storage and retrieval. + + Args: + arry (array): The array of IDs to be minimized. + stringids (bool): Indicates whether the IDs are string-based or not. + compression (int, optional): The compression level to apply to the minimized IDs. Defaults to 0. + + Returns: + tuple: A tuple containing the typecode of the minimized IDs and the minimized IDs as a string. + + Raises: + None + + Notes: + - If the IDs are string-based, they will be serialized using the `pickle.dumps` function. + - If the IDs are not string-based, they will be converted to the appropriate typecode based on their maximum value. + - If the typecode of the array needs to be changed, a new array will be created with the updated typecode. + - If the system is big-endian, the byte order of the array will be swapped. + - If compression is enabled, the minimized IDs will be compressed using the zlib library. + """ amax = arry[-1] if stringids: @@ -1906,6 +3953,21 @@ def minimize_ids(arry, stringids, compression=0): def deminimize_ids(typecode, count, string, compression=0): + """ + Deserialize and decompress a string representation of an array of integers. + + Args: + typecode (str): The typecode of the array. + count (int): The number of elements in the array. + string (bytes): The serialized and optionally compressed string representation of the array. + compression (int, optional): The compression level used for the string. Defaults to 0. + + Returns: + array: The deserialized and decompressed array of integers. + + Raises: + TypeError: If the typecode is not a valid array typecode. + """ if compression: string = zlib.decompress(string) if typecode == "": @@ -1919,6 +3981,33 @@ def deminimize_ids(typecode, count, string, compression=0): def minimize_weights(weights, compression=0): + """ + Minimizes the weights array by converting it to a compressed string representation. + + Args: + weights (array-like): The weights array to be minimized. + compression (int, optional): The compression level to be applied. Defaults to 0. + + Returns: + str: The minimized string representation of the weights array. + + Raises: + None + + Examples: + >>> weights = [1.0, 1.0, 1.0] + >>> minimize_weights(weights) + b'' + + >>> weights = [0.5, 0.75, 1.0] + >>> minimize_weights(weights, compression=6) + b'x\x9c\xcbH\xcd\xc9\xc9\x07\x00\x06\xcb\x01' + + Note: + - If all weights in the array are equal to 1.0, an empty string is returned. + - The weights array is expected to be a one-dimensional array-like object. + - The compression level should be an integer between 0 and 9, where 0 means no compression and 9 means maximum compression. 
+ """ if all(w == 1.0 for w in weights): string = b"" else: @@ -1931,6 +4020,25 @@ def minimize_weights(weights, compression=0): def deminimize_weights(count, string, compression=0): + """ + Convert a serialized string representation of weights into an array of floats. + + Args: + count (int): The number of weights to be converted. + string (bytes): The serialized string representation of weights. + compression (int, optional): The compression level used for the serialized string. Defaults to 0. + + Returns: + array.array: An array of floats representing the weights. + + Raises: + None + + Examples: + >>> weights = deminimize_weights(3, b'\x00\x00\x80\x3f\x00\x00\x00\x40\x00\x00\x40\x40') + >>> print(weights) + array('f', [1.0, 2.0, 3.0]) + """ if not string: return array("f", (1.0 for _ in range(count))) if compression: @@ -1943,6 +4051,24 @@ def deminimize_weights(count, string, compression=0): def minimize_values(postingsize, values, compression=0): + """ + Minimizes the values by compressing them and returning the compressed string. + + Args: + postingsize (int): The size of the posting. + values (list): The list of values to be minimized. + compression (int, optional): The compression level. Defaults to 0. + + Returns: + str: The compressed string. + + Raises: + None + + Examples: + >>> minimize_values(10, ['value1', 'value2', 'value3'], 6) + 'compressed_string' + """ if postingsize < 0: string = dumps(values, -1)[2:] elif postingsize == 0: @@ -1955,6 +4081,26 @@ def minimize_values(postingsize, values, compression=0): def deminimize_values(postingsize, count, string, compression=0): + """ + Deminimizes a string into a list of values. + + Args: + postingsize (int): The size of each value in the string. + count (int): The number of values to extract from the string. + string (bytes): The string to deminimize. + compression (int, optional): The compression level of the string. Defaults to 0. + + Returns: + list: The deminimized list of values. + + Raises: + None + + Examples: + >>> string = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' + >>> deminimize_values(2, 8, string) + [b'\x00\x01', b'\x02\x03', b'\x04\x05', b'\x06\x07', b'\x08\t', b'\n\x0b', b'\x0c\r', b'\x0e\x0f'] + """ if compression: string = zlib.decompress(string) @@ -1971,7 +4117,27 @@ def deminimize_values(postingsize, count, string, compression=0): from whoosh.fields import NUMERIC -class OLD_NUMERIC(NUMERIC): +class old_numeric(NUMERIC): + """ + A field type for storing numeric values in the index. + + This field type supports storing integers, floats, and decimals. + The values can be sorted and searched using numeric range queries. + + Parameters: + - type (type): The Python type of the numeric values to be stored. + - stored (bool): Whether the field should be stored in the index. + - unique (bool): Whether the field values should be unique. + - field_boost (float): The boost factor for the field. + - decimal_places (int): The number of decimal places to store for decimal values. + - shift_step (int): The number of bits to shift the values during sorting. + - signed (bool): Whether the values should be treated as signed or unsigned. + + Raises: + - TypeError: If the specified type is not supported by the field. + + """ + NUMERIC_DEFAULTS = { "b": 2**7 - 1, "B": 2**8 - 1, @@ -1995,6 +4161,22 @@ def __init__( shift_step=4, signed=True, ): + """ + Initialize the old_numeric field. + + Args: + - type (type): The Python type of the numeric values to be stored. 
+ - stored (bool): Whether the field should be stored in the index. + - unique (bool): Whether the field values should be unique. + - field_boost (float): The boost factor for the field. + - decimal_places (int): The number of decimal places to store for decimal values. + - shift_step (int): The number of bits to shift the values during sorting. + - signed (bool): Whether the values should be treated as signed or unsigned. + + Raises: + - TypeError: If the specified type is not supported by the field. + + """ from whoosh import analysis, formats self.type = type @@ -2025,15 +4207,32 @@ def __init__( self.shift_step = shift_step self.signed = signed - self.analyzer = analysis.IDAnalyzer() + self.analyzer = analysis.id_analyzer() self.format = formats.Existence(field_boost=field_boost) def __setstate__(self, d): + """ + Set the state of the field. + + Args: + - d (dict): The state dictionary. + + """ self.__dict__.update(d) self.numtype = d["type"] self.bits = 64 def prepare_number(self, x): + """ + Prepare a numeric value for storage in the index. + + Args: + - x: The numeric value to prepare. + + Returns: + - The prepared numeric value. + + """ if x is None or x == emptybytes: return x if self.decimal_places: @@ -2043,6 +4242,16 @@ def prepare_number(self, x): return x def unprepare_number(self, x): + """ + Convert a prepared numeric value back to its original form. + + Args: + - x: The prepared numeric value. + + Returns: + - The original numeric value. + + """ dc = self.decimal_places if dc: s = str(x) @@ -2050,34 +4259,116 @@ def unprepare_number(self, x): return x def to_bytes(self, x, shift=0): + """ + Convert a numeric value to bytes. + + Args: + - x: The numeric value to convert. + - shift (int): The number of bits to shift the value. + + Returns: + - The bytes representation of the numeric value. + + """ if isinstance(x, bytes): return x return utf8encode(self.to_text(x, shift))[0] def from_bytes(self, bs): + """ + Convert bytes to a numeric value. + + Args: + - bs (bytes): The bytes to convert. + + Returns: + - The numeric value. + + """ return self.from_text(utf8decode(bs)[0]) def sortable_to_bytes(self, x, shift=0): + """ + Convert a numeric value to sortable bytes. + + Args: + - x: The numeric value to convert. + - shift (int): The number of bits to shift the value. + + Returns: + - The sortable bytes representation of the numeric value. + + """ if shift: x >>= shift return pack_byte(shift) + self._to_text() def to_text(self, x, shift=0): + """ + Convert a numeric value to text. + + Args: + - x: The numeric value to convert. + - shift (int): The number of bits to shift the value. + + Returns: + - The text representation of the numeric value. + + """ x = self.prepare_number(x) x = self._to_text(x, shift=shift, signed=self.signed) return x def from_text(self, t): + """ + Convert text to a numeric value. + + Args: + - t (str): The text to convert. + + Returns: + - The numeric value. + + """ x = self._from_text(t, signed=self.signed) return self.unprepare_number(x) def process_text(self, text, **kwargs): + """ + Process the text value of the field. + + Args: + - text (str): The text value to process. + + Returns: + - A tuple containing the processed text value. + + """ return (self.to_text(text),) def self_parsing(self): + """ + Check if the field is self-parsing. + + Returns: + - True if the field is self-parsing, False otherwise. + + """ return True def parse_query(self, fieldname, qstring, boost=1.0): + """ + Parse a query string for the field. 
+ + Args: + - fieldname (str): The name of the field. + - qstring (str): The query string to parse. + - boost (float): The boost factor for the query. + + Returns: + - A query object representing the parsed query. + + """ from whoosh import query if qstring == "*": @@ -2092,6 +4383,21 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, text, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): + """ + Parse a range query for the field. + + Args: + - fieldname (str): The name of the field. + - start: The start value of the range. + - end: The end value of the range. + - startexcl (bool): Whether the start value is exclusive. + - endexcl (bool): Whether the end value is exclusive. + - boost (float): The boost factor for the query. + + Returns: + - A query object representing the parsed range query. + + """ from whoosh import query from whoosh.qparser.common import QueryParserError @@ -2109,6 +4415,17 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): ) def sortable_terms(self, ixreader, fieldname): + """ + Generate sortable terms for the field. + + Args: + - ixreader: The index reader object. + - fieldname (str): The name of the field. + + Yields: + - Sortable terms for the field. + + """ for btext in ixreader.lexicon(fieldname): if btext[0:1] != "\x00": # Only yield the full-precision values @@ -2116,11 +4433,47 @@ def sortable_terms(self, ixreader, fieldname): yield btext -class OLD_DATETIME(OLD_NUMERIC): +class old_datetime(old_numeric): + """ + A field type for storing and indexing datetime values. + + This field type stores datetime values as long integers internally, using the `datetime_to_long` function + to convert datetime objects to long integers, and the `long_to_datetime` function to convert long integers + back to datetime objects. + + Parameters: + - stored (bool): Whether the field should be stored in the index. Default is False. + - unique (bool): Whether the field should be unique in the index. Default is False. + + Example usage: + ``` + from whoosh.codec.whoosh2 import old_datetime + + # Create an instance of old_datetime field type + my_datetime_field = old_datetime(stored=True, unique=True) + ``` + + """ + def __init__(self, stored=False, unique=False): - OLD_NUMERIC.__init__(self, type=int, stored=stored, unique=unique, shift_step=8) + old_numeric.__init__(self, type=int, stored=stored, unique=unique, shift_step=8) def to_text(self, x, shift=0): + """ + Convert a datetime value to a string representation. + + Parameters: + - x: The datetime value to convert. + - shift (int): The number of bits to shift the value by. Default is 0. + + Returns: + - str: The string representation of the datetime value. + + Raises: + - ValueError: If the datetime value cannot be converted to a string. + + """ + from datetime import datetime from whoosh.util.times import floor @@ -2137,16 +4490,41 @@ def to_text(self, x, shift=0): except ValueError: raise ValueError(f"DATETIME.to_text can't convert from {x!r}") - x = OLD_NUMERIC.to_text(self, x, shift=shift) + x = old_numeric.to_text(self, x, shift=shift) return x def from_text(self, x): - x = OLD_NUMERIC.from_text(self, x) + """ + Convert a string representation to a datetime value. + + Parameters: + - x (str): The string representation of the datetime value. + + Returns: + - datetime.datetime: The datetime value. 
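+
+        Example (illustrative round-trip through ``to_text``, assuming the
+        default shift of 0, i.e. full precision):
+
+            >>> from datetime import datetime
+            >>> dt_field = old_datetime()
+            >>> dt_field.from_text(dt_field.to_text(datetime(2012, 1, 1)))
+            datetime.datetime(2012, 1, 1, 0, 0)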
+ + """ + + x = old_numeric.from_text(self, x) return long_to_datetime(x) def _parse_datestring(self, qstring): - # This method parses a very simple datetime representation of the form - # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] + """ + Parse a simple datetime representation. + + This method parses a very simple datetime representation of the form YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]]. + + Parameters: + - qstring (str): The datetime string to parse. + + Returns: + - whoosh.util.times.adatetime: The parsed datetime value. + + Raises: + - Exception: If the datetime string is not parseable. + + """ + from whoosh.util.times import adatetime, fix, is_void qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") @@ -2168,10 +4546,23 @@ def _parse_datestring(self, qstring): at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise Exception(f"{qstring!r} is not a parseable date") + raise DateParseError(f"{qstring} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): + """ + Parse a query string into a query object. + + Parameters: + - fieldname (str): The name of the field to parse the query for. + - qstring (str): The query string to parse. + - boost (float): The boost factor for the query. Default is 1.0. + + Returns: + - whoosh.query.Query: The parsed query object. + + """ + from whoosh import query from whoosh.util.times import is_ambiguous @@ -2189,6 +4580,22 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, self.to_text(at), boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): + """ + Parse a range query into a query object. + + Parameters: + - fieldname (str): The name of the field to parse the range query for. + - start (str): The start value of the range query. + - end (str): The end value of the range query. + - startexcl (bool): Whether the start value is exclusive. Default is False. + - endexcl (bool): Whether the end value is exclusive. Default is False. + - boost (float): The boost factor for the query. Default is 1.0. + + Returns: + - whoosh.query.Query: The parsed range query object. + + """ + from whoosh import query if start is None and end is None: @@ -2209,33 +4616,118 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): def int_to_text(x, shift=0, signed=True): + """ + Convert an integer to a sortable text representation. + + Args: + x (int): The integer to be converted. + shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. + signed (bool, optional): Whether the integer is signed or not. Defaults to True. + + Returns: + str: The sortable text representation of the integer. + """ x = to_sortable(int, 32, signed, x) return sortable_int_to_text(x, shift) def text_to_int(text, signed=True): + """ + Convert a text string to an integer representation. + + Args: + text (str): The text string to convert. + signed (bool, optional): Whether the resulting integer should be signed or unsigned. + Defaults to True. + + Returns: + int: The integer representation of the text string. + + """ x = text_to_sortable_int(text) x = from_sortable(int, 32, signed, x) return x def long_to_text(x, shift=0, signed=True): + """ + Convert a long integer to a text representation. + + Args: + x (int): The long integer to be converted. + shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. 
+ signed (bool, optional): Whether the integer is signed or not. Defaults to True. + + Returns: + str: The text representation of the long integer. + + """ x = to_sortable(int, 64, signed, x) return sortable_long_to_text(x, shift) def text_to_long(text, signed=True): + """ + Converts a text string to a long integer. + + Args: + text (str): The text string to convert. + signed (bool, optional): Whether the resulting long integer should be signed. + Defaults to True. + + Returns: + int: The converted long integer. + + Raises: + None + + Examples: + >>> text_to_long("12345") + 12345 + >>> text_to_long("-54321") + -54321 + """ x = text_to_sortable_long(text) x = from_sortable(int, 64, signed, x) return x def float_to_text(x, shift=0, signed=True): + """ + Convert a floating-point number to a sortable text representation. + + Args: + x (float): The floating-point number to be converted. + shift (int, optional): The number of bits to shift the sortable representation. Defaults to 0. + signed (bool, optional): Whether the sortable representation should support negative numbers. Defaults to True. + + Returns: + str: The sortable text representation of the floating-point number. + """ x = to_sortable(float, 32, signed, x) return sortable_long_to_text(x, shift) def text_to_float(text, signed=True): + """ + Converts a text representation of a float to a float value. + + Args: + text (str): The text representation of the float. + signed (bool, optional): Whether the float is signed or not. Defaults to True. + + Returns: + float: The float value represented by the text. + + Raises: + ValueError: If the text cannot be converted to a float. + + Examples: + >>> text_to_float("3.14") + 3.14 + >>> text_to_float("-2.5", signed=True) + -2.5 + """ x = text_to_sortable_long(text) x = from_sortable(float, 32, signed, x) return x @@ -2247,29 +4739,89 @@ def text_to_float(text, signed=True): def sortable_int_to_text(x, shift=0): + """ + Convert a sortable integer to a text representation. + + Args: + x (int): The integer to be converted. + shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. + + Returns: + str: The text representation of the sortable integer. + + Notes: + This function converts a sortable integer to a text representation by shifting the integer (if specified) and encoding it using base85 encoding. + + Example: + >>> sortable_int_to_text(12345) + '0gV' + """ if shift: x >>= shift - # text = chr(shift) + u"%08x" % x text = chr(shift) + to_base85(x, False) return text def sortable_long_to_text(x, shift=0): + """ + Convert a sortable long integer to a text representation. + + Args: + x (int): The long integer to be converted. + shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. + + Returns: + str: The text representation of the sortable long integer. + + Notes: + This function converts a long integer to a text representation using base85 encoding. + The resulting text representation is prefixed with a character representing the shift value. + + Example: + >>> sortable_long_to_text(1234567890, 4) + 'E@9jqo' + """ if shift: x >>= shift - # text = chr(shift) + u"%016x" % x - # assert len(text) == 17 text = chr(shift) + to_base85(x, True) return text def text_to_sortable_int(text): - # assert len(text) == 9 - # return int(text[1:], 16) + """ + Converts a text representation of a sortable integer to an actual integer. + + Args: + text (str): The text representation of the sortable integer. 
+ + Returns: + int: The converted integer. + + Raises: + ValueError: If the text representation is invalid. + + Example: + >>> text_to_sortable_int('x12345678') + 305419896 + """ return from_base85(text[1:]) def text_to_sortable_long(text): - # assert len(text) == 17 - # return long(text[1:], 16) + """ + Converts a text string to a sortable long value. + + Parameters: + text (str): The text string to convert. + + Returns: + int: The converted sortable long value. + + Raises: + ValueError: If the input text is not a valid sortable long value. + + Example: + >>> text_to_sortable_long('0x123456789abcdef') + 81985529216486895 + """ return from_base85(text[1:]) diff --git a/src/whoosh/codec/whoosh3.py b/src/whoosh/codec/whoosh3.py index 96a06961..0ed453a6 100644 --- a/src/whoosh/codec/whoosh3.py +++ b/src/whoosh/codec/whoosh3.py @@ -76,6 +76,19 @@ class W3Codec(base.Codec): + """ + Codec implementation for the Whoosh 3 index format. + + This codec provides methods for reading and writing various components of the index, + such as term indexes, term postings, vector postings, and per-document value columns. + + Parameters: + - blocklimit (int): The maximum number of postings to store in a block. Defaults to 128. + - compression (int): The level of compression to use for the postings. Defaults to 3. + - inlinelimit (int): The maximum number of postings to inline in the term info object. Defaults to 1. + + """ + # File extensions TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings @@ -83,23 +96,60 @@ class W3Codec(base.Codec): COLUMN_EXT = ".col" # Per-document value columns def __init__(self, blocklimit=128, compression=3, inlinelimit=1): + """ + Initialize a new instance of the W3Codec class. + + Parameters: + - blocklimit (int): The maximum number of postings to store in a block. Defaults to 128. + - compression (int): The level of compression to use for the postings. Defaults to 3. + - inlinelimit (int): The maximum number of postings to inline in the term info object. Defaults to 1. + + """ self._blocklimit = blocklimit self._compression = compression self._inlinelimit = inlinelimit - # def automata(self): - # Per-document value writer def per_document_writer(self, storage, segment): + """ + Create a per-document value writer for the given storage and segment. + + Parameters: + - storage (Storage): The storage object for the index. + - segment (Segment): The segment object for the index. + + Returns: + - W3PerDocWriter: The per-document value writer. + + """ return W3PerDocWriter(self, storage, segment) - # Inverted index writer def field_writer(self, storage, segment): - return W3FieldWriter(self, storage, segment) + """ + Create an inverted index writer for the given storage and segment. + + Parameters: + - storage (Storage): The storage object for the index. + - segment (Segment): The segment object for the index. - # Postings + Returns: + - W3FieldWriter: The inverted index writer. + + """ + return W3FieldWriter(self, storage, segment) def postings_writer(self, dbfile, byteids=False): + """ + Create a postings writer for the given database file. + + Parameters: + - dbfile (File): The file object for the postings. + - byteids (bool): Whether to use byte-based document ids. Defaults to False. + + Returns: + - W3PostingsWriter: The postings writer. 
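+
+        Example (illustrative sketch; ``postfile`` stands in for a postings
+        file created through the storage API):
+
+            >>> codec = W3Codec()
+            >>> pwriter = codec.postings_writer(postfile)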
+ + """ return W3PostingsWriter( dbfile, blocklimit=self._blocklimit, @@ -109,6 +159,20 @@ def postings_writer(self, dbfile, byteids=False): ) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): + """ + Create a postings reader for the given database file and term info. + + Parameters: + - dbfile (File): The file object for the postings. + - terminfo (TermInfo): The term info object for the term. + - format_ (str): The format of the postings. + - term (str): The term to read the postings for. Defaults to None. + - scorer (Scorer): The scorer object for scoring the postings. Defaults to None. + + Returns: + - Matcher: The postings reader. + + """ if terminfo.is_inlined(): # If the postings were inlined into the terminfo object, pull them # out and use a ListMatcher to wrap them in a Matcher interface @@ -127,12 +191,32 @@ def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): m = W3LeafMatcher(dbfile, offset, length, format_, term=term, scorer=scorer) return m - # Readers - def per_document_reader(self, storage, segment): + """ + Create a per-document value reader for the given storage and segment. + + Parameters: + - storage (Storage): The storage object for the index. + - segment (Segment): The segment object for the index. + + Returns: + - W3PerDocReader: The per-document value reader. + + """ return W3PerDocReader(storage, segment) def terms_reader(self, storage, segment): + """ + Create a terms reader for the given storage and segment. + + Parameters: + - storage (Storage): The storage object for the index. + - segment (Segment): The segment object for the index. + + Returns: + - W3TermsReader: The terms reader. + + """ tiname = segment.make_filename(self.TERMS_EXT) tilen = storage.file_length(tiname) tifile = storage.open_file(tiname) @@ -142,20 +226,45 @@ def terms_reader(self, storage, segment): return W3TermsReader(self, tifile, tilen, postfile) # Graph methods provided by CodecWithGraph + def supports_columns(self): + """ + Check if the codec supports per-document value columns. - # Columns + Returns: + - bool: True if per-document value columns are supported, False otherwise. - def supports_columns(self): + """ return True @classmethod def column_filename(cls, segment, fieldname): + """ + Get the filename for the per-document value column of the given field in the segment. + + Parameters: + - segment (Segment): The segment object for the index. + - fieldname (str): The name of the field. + + Returns: + - str: The filename for the per-document value column. + + """ ext = "".join((".", fieldname, cls.COLUMN_EXT)) return segment.make_filename(ext) # Segments and generations - def new_segment(self, storage, indexname): + """ + Create a new segment for the given storage and index name. + + Parameters: + - storage (Storage): The storage object for the index. + - indexname (str): The name of the index. + + Returns: + - W3Segment: The new segment. + + """ return W3Segment(self, indexname) @@ -163,18 +272,95 @@ def new_segment(self, storage, indexname): def _vecfield(fieldname): + """ + Returns the vector field name for a given field. + + Parameters: + fieldname (str): The name of the field. + + Returns: + str: The vector field name. + + Example: + >>> _vecfield("title") + '_title_vec' + + This function takes a field name as input and returns the corresponding vector field name. + The vector field name is constructed by adding underscores before and after the field name. 
+ """ return f"_{fieldname}_vec" def _lenfield(fieldname): + """ + Returns the length field name for a given field. + + Parameters: + - fieldname (str): The name of the field. + + Returns: + - str: The length field name. + + Example: + >>> _lenfield("title") + '_title_len' + + This function is used to generate the length field name for a given field. The length field name is used in the Whoosh codec to store the length of a variable-length field. It appends "_len" to the field name to create the length field name. + + Usage: + >>> length_field = _lenfield("content") + >>> print(length_field) + '_content_len' + """ return f"_{fieldname}_len" # Per-doc information writer +class W3PerDocWriter(base.PerDocWriterWithColumns): + """ + This class is responsible for writing per-document data to the index for the Whoosh3 codec. + + It provides methods for adding fields, vectors, and other per-document information to the index. + + Usage: + ------ + 1. Create an instance of W3PerDocWriter by passing the codec, storage, and segment parameters to the constructor. + 2. Use the start_doc() method to indicate the start of a new document. + 3. Use the add_field() method to add a field to the document with its corresponding value and length. + 4. Use the add_vector_items() method to add vector items (text, weight, and vbytes) to the document. + 5. Use the finish_doc() method to indicate the end of the current document. + 6. Repeat steps 2-5 for each document. + 7. Call the close() method to finish writing the per-document data to the index. + + Note: + ----- + The close() method must be called after writing all the documents to the index. + + Attributes: + ----------- + - is_closed: A boolean attribute indicating whether the writer has been closed. + + Methods: + -------- + - start_doc(docnum): Indicates the start of a new document. + - add_field(fieldname, fieldobj, value, length): Adds a field to the document with its corresponding value and length. + - add_vector_items(fieldname, fieldobj, items): Adds vector items to the document. + - finish_doc(): Indicates the end of the current document. + - cancel_doc(): Cancels the current document. + - close(): Finishes writing the per-document data to the index. + """ -class W3PerDocWriter(base.PerDocWriterWithColumns): def __init__(self, codec, storage, segment): + """ + Initializes a new instance of W3PerDocWriter. + + Parameters: + ----------- + - codec: The codec used for encoding and decoding data. + - storage: The storage object used for storing the index files. + - segment: The segment object representing the current segment of the index. + """ self._codec = codec self._storage = storage self._segment = segment @@ -196,15 +382,49 @@ def __init__(self, codec, storage, segment): self._vpostfile = None def _create_file(self, ext): + """ + Creates a new file with the given extension in the current segment. + + Parameters: + ----------- + - ext: The extension of the file. + + Returns: + -------- + The created file object. + """ return self._segment.create_file(self._storage, ext) def _has_column(self, fieldname): + """ + Checks if a column with the given fieldname has been added. + + Parameters: + ----------- + - fieldname: The name of the field/column. + + Returns: + -------- + True if the column exists, False otherwise. + """ return fieldname in self._colwriters def _create_column(self, fieldname, column): + """ + Creates a new column with the given fieldname. + + Parameters: + ----------- + - fieldname: The name of the field/column. 
+ - column: The column object. + + Raises: + ------- + ValueError: If a column with the same fieldname has already been added. + """ writers = self._colwriters if fieldname in writers: - raise Exception(f"Already added column {fieldname!r}") + raise ValueError(f"Already added column {fieldname!r}") f = self._cols.create_file(fieldname) writers[fieldname] = column.writer(f) @@ -220,9 +440,9 @@ def _prep_vectors(self): def start_doc(self, docnum): if self._indoc: - raise Exception("Called start_doc when already in a doc") + raise ValueError("Called start_doc when already in a doc") if docnum != self._doccount: - raise Exception( + raise ValueError( f"Called start_doc({docnum!r}) was expecting {self._doccount!r}" ) @@ -299,6 +519,43 @@ def close(self): class W3FieldWriter(base.FieldWriter): + """ + Writes field data to the index for the Whoosh3 codec. + + This class is responsible for writing field data, including terms and postings, to the index. + It is used internally by the Whoosh3 codec and should not be instantiated directly. + + Parameters: + - codec (Codec): The codec used for encoding and decoding data. + - storage (Storage): The storage object used for creating files. + - segment (Segment): The segment object representing the current segment. + + Attributes: + - _codec (Codec): The codec used for encoding and decoding data. + - _storage (Storage): The storage object used for creating files. + - _segment (Segment): The segment object representing the current segment. + - _fieldname (str): The name of the current field being written. + - _fieldid (int): The ID of the current field being written. + - _btext (bytes): The binary representation of the current term being written. + - _fieldobj (Field): The field object associated with the current field being written. + - _format (Format): The format object associated with the current field being written. + - _tindex (OrderedHashWriter): The ordered hash writer for the terms index. + - _fieldmap (dict): A dictionary mapping field names to field IDs. + - _postfile (File): The file object for writing postings data. + - _postwriter (PostingsWriter): The postings writer for the current field being written. + - _infield (bool): Indicates whether the writer is currently inside a field. + - is_closed (bool): Indicates whether the writer has been closed. + + Methods: + - _create_file(ext): Creates a file with the given extension. + - start_field(fieldname, fieldobj): Starts writing a new field. + - start_term(btext): Starts writing a new term. + - add(docnum, weight, vbytes, length): Adds a posting to the current term. + - finish_term(): Finishes writing the current term. + - finish_field(): Finishes writing the current field. + - close(): Closes the writer and releases any resources. + """ + def __init__(self, codec, storage, segment): self._codec = codec self._storage = storage @@ -321,9 +578,29 @@ def __init__(self, codec, storage, segment): self.is_closed = False def _create_file(self, ext): + """ + Creates a file with the given extension. + + Parameters: + - ext (str): The file extension. + + Returns: + - File: The created file object. + """ return self._segment.create_file(self._storage, ext) def start_field(self, fieldname, fieldobj): + """ + Starts writing a new field. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj (Field): The field object. + + Raises: + - ValueError: If called before start_field. 
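+
+        Example (illustrative; ``schema["content"]`` is a hypothetical field object):
+            >>> fw.start_field("content", schema["content"])  # doctest: +SKIP
+            >>> fw.start_term(b"whoosh")                       # doctest: +SKIP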
+ + """ fmap = self._fieldmap if fieldname in fmap: self._fieldid = fmap[fieldname] @@ -340,15 +617,38 @@ def start_field(self, fieldname, fieldobj): self._postwriter = self._codec.postings_writer(self._postfile) def start_term(self, btext): + """ + Starts writing a new term. + + Parameters: + - btext (bytes): The binary representation of the term. + + Raises: + - ValueError: If called before start_field. + """ if self._postwriter is None: - raise Exception("Called start_term before start_field") + raise ValueError("Called start_term before start_field") self._btext = btext self._postwriter.start_postings(self._fieldobj.format, W3TermInfo()) def add(self, docnum, weight, vbytes, length): + """ + Adds a posting to the current term. + + Parameters: + - docnum (int): The document number. + - weight (float): The weight of the posting. + - vbytes (int): The number of bytes used to encode the posting value. + - length (int): The length of the posting. + + """ self._postwriter.add_posting(docnum, weight, vbytes, length) def finish_term(self): + """ + Finishes writing the current term. + + """ terminfo = self._postwriter.finish_postings() # Add row to term info table @@ -356,23 +656,29 @@ def finish_term(self): valbytes = terminfo.to_bytes() self._tindex.add(keybytes, valbytes) - # FieldWriterWithGraph.add_spell_word - def finish_field(self): + """ + Finishes writing the current field. + + Raises: + - ValueError: If called before start_field. + """ if not self._infield: - raise Exception("Called finish_field before start_field") + raise ValueError("Called finish_field before start_field") self._infield = False self._postwriter = None def close(self): + """ + Closes the writer and releases any resources. + + """ self._tindex.close() self._postfile.close() self.is_closed = True # Reader objects - - class W3PerDocReader(base.PerDocumentReader): def __init__(self, storage, segment): self._storage = storage @@ -497,7 +803,7 @@ def _vector_extent(self, docnum, fieldname): def has_vector(self, docnum, fieldname): if self.has_column(_vecfield(fieldname)): - offset, length = self._vector_extent(docnum, fieldname) + offset, _ = self._vector_extent(docnum, fieldname) return offset != 0 return False @@ -506,7 +812,7 @@ def vector(self, docnum, fieldname, format_): self._prep_vectors() offset, length = self._vector_extent(docnum, fieldname) if not offset: - raise Exception(f"Field {fieldname!r} has no vector in docnum {docnum}") + raise ValueError(f"Field {fieldname!r} has no vector in docnum {docnum}") m = W3LeafMatcher(self._vpostfile, offset, length, format_, byteids=True) return m @@ -521,7 +827,50 @@ def stored_fields(self, docnum): class W3FieldCursor(base.FieldCursor): + """Cursor for iterating over the terms in a field in a Whoosh 3 index. + + This cursor provides methods for iterating over the terms in a specific field + in a Whoosh 3 index. It allows you to navigate through the terms in the field, + retrieve the text representation of the current term, and access additional + information about the term. + + Attributes: + _tindex (TIndex): The TIndex object representing the index. + _fieldname (str): The name of the field. + _keycoder (callable): The function used to encode the field name and term + into a key. + _keydecoder (callable): The function used to decode a key into the field name + and term. + _fieldobj (Field): The Field object representing the field. + + Methods: + __init__(tindex, fieldname, keycoder, keydecoder, fieldobj): Initializes the + W3FieldCursor object. 
+ first(): Moves the cursor to the first term in the field and returns the text + representation of the term. + find(term): Moves the cursor to the specified term in the field and returns the + text representation of the term. + next(): Moves the cursor to the next term in the field and returns the text + representation of the term. + text(): Returns the text representation of the current term. + term_info(): Returns additional information about the current term. + is_valid(): Returns True if the cursor is currently pointing to a valid term, + False otherwise. + """ + def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj): + """ + Initializes a new instance of the W3FieldCursor class. + + Args: + tindex (TIndex): The TIndex object representing the index. + fieldname (str): The name of the field. + keycoder (callable): The function used to encode the field name and term + into a key. + keydecoder (callable): The function used to decode a key into the field name + and term. + fieldobj (Field): The Field object representing the field. + """ self._tindex = tindex self._fieldname = fieldname self._keycoder = keycoder @@ -538,10 +887,27 @@ def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj): self.next() def first(self): + """ + Moves the cursor to the first term in the field and returns the text + representation of the term. + + Returns: + str: The text representation of the first term in the field. + """ self._pos = self._startpos return self.next() def find(self, term): + """ + Moves the cursor to the specified term in the field and returns the text + representation of the term. + + Args: + term (bytes or str): The term to find in the field. + + Returns: + str: The text representation of the found term. + """ if not isinstance(term, bytes): term = self._fieldobj.to_bytes(term) key = self._keycoder(self._fieldname, term) @@ -549,6 +915,13 @@ def find(self, term): return self.next() def next(self): + """ + Moves the cursor to the next term in the field and returns the text + representation of the term. + + Returns: + str: The text representation of the next term in the field. + """ if self._pos is not None: keyrng = self._tindex.key_and_range_at(self._pos) if keyrng is not None: @@ -565,9 +938,21 @@ def next(self): return None def text(self): + """ + Returns the text representation of the current term. + + Returns: + str: The text representation of the current term. + """ return self._text def term_info(self): + """ + Returns additional information about the current term. + + Returns: + W3TermInfo: An object containing additional information about the current term. + """ if self._pos is None: return None @@ -575,11 +960,59 @@ def term_info(self): return W3TermInfo.from_bytes(databytes) def is_valid(self): + """ + Returns True if the cursor is currently pointing to a valid term, False otherwise. + + Returns: + bool: True if the cursor is currently pointing to a valid term, False otherwise. + """ return self._pos is not None class W3TermsReader(base.TermsReader): + """ + A terms reader for the Whoosh3 codec. + + This class is responsible for reading and retrieving terms, term information, and posting lists from the index. + + Parameters: + - codec (Codec): The codec associated with the index. + - dbfile (file-like object): The file-like object representing the terms index. + - length (int): The length of the terms index. + - postfile (file-like object): The file-like object representing the posting lists. 
+ + Attributes: + - _codec (Codec): The codec associated with the index. + - _dbfile (file-like object): The file-like object representing the terms index. + - _tindex (OrderedHashReader): The ordered hash reader for the terms index. + - _fieldmap (dict): A dictionary mapping field names to field numbers. + - _postfile (file-like object): The file-like object representing the posting lists. + - _fieldunmap (list): A list mapping field numbers to field names. + + """ + def __init__(self, codec, dbfile, length, postfile): + """ + Initialize a Whoosh3 object. + + Parameters: + - codec (object): The codec object used for encoding and decoding data. + - dbfile (str): The path to the database file. + - length (int): The length of the database file. + - postfile (str): The path to the postfile. + + This method initializes a Whoosh3 object by setting the codec, database file, + length, postfile, fieldmap, and fieldunmap attributes. The fieldmap is a + dictionary that maps field names to field numbers, and the fieldunmap is a + list that maps field numbers to field names. + + Example usage: + codec = MyCodec() + dbfile = "/path/to/database.db" + length = 1000 + postfile = "/path/to/postfile" + whoosh3 = Whoosh3(codec, dbfile, length, postfile) + """ self._codec = codec self._dbfile = dbfile self._tindex = filetables.OrderedHashReader(dbfile, length) @@ -591,34 +1024,112 @@ def __init__(self, codec, dbfile, length, postfile): self._fieldunmap[num] = fieldname def _keycoder(self, fieldname, tbytes): + """ + Encode the field name and term bytes into a key. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + + Returns: + - bytes: The encoded key. + + """ assert isinstance(tbytes, bytes), f"tbytes={tbytes!r}" fnum = self._fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def _keydecoder(self, keybytes): + """ + Decode the key bytes into the field name and term bytes. + + Parameters: + - keybytes (bytes): The key bytes. + + Returns: + - Tuple[str, bytes]: The field name and term bytes. + + """ fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0] return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:] def _range_for_key(self, fieldname, tbytes): + """ + Get the range of positions in the terms index for the given field name and term bytes. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + + Returns: + - Tuple[int, int]: The start and end positions in the terms index. + + """ return self._tindex.range_for_key(self._keycoder(fieldname, tbytes)) def __contains__(self, term): + """ + Check if the given term is present in the terms index. + + Parameters: + - term (Tuple[str, bytes]): The field name and term bytes. + + Returns: + - bool: True if the term is present, False otherwise. + + """ return self._keycoder(*term) in self._tindex def indexed_field_names(self): + """ + Get the names of the fields that are indexed. + + Returns: + - KeysView: A view object containing the names of the indexed fields. + + """ return self._fieldmap.keys() def cursor(self, fieldname, fieldobj): + """ + Create a cursor for iterating over the terms in the given field. + + Parameters: + - fieldname (str): The name of the field. + - fieldobj (Field): The field object. + + Returns: + - W3FieldCursor: The cursor object. 
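+
+        Example (illustrative; ``schema["title"]`` is a hypothetical field object):
+            >>> cur = treader.cursor("title", schema["title"])  # doctest: +SKIP
+            >>> cur.first()                                     # doctest: +SKIP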
+ + """ tindex = self._tindex coder = self._keycoder decoder = self._keydecoder return W3FieldCursor(tindex, fieldname, coder, decoder, fieldobj) def terms(self): + """ + Get an iterator over all the terms in the index. + + Yields: + - Tuple[str, bytes]: The field name and term bytes. + + """ keydecoder = self._keydecoder return (keydecoder(keybytes) for keybytes in self._tindex.keys()) def terms_from(self, fieldname, prefix): + """ + Get an iterator over the terms in the given field starting from the specified prefix. + + Parameters: + - fieldname (str): The name of the field. + - prefix (bytes): The prefix bytes. + + Yields: + - Tuple[str, bytes]: The field name and term bytes. + + """ prefixbytes = self._keycoder(fieldname, prefix) keydecoder = self._keydecoder return ( @@ -626,6 +1137,13 @@ def terms_from(self, fieldname, prefix): ) def items(self): + """ + Get an iterator over all the (term, term info) pairs in the index. + + Yields: + - Tuple[Tuple[str, bytes], W3TermInfo]: The (field name, term bytes) and term info. + + """ tidecoder = W3TermInfo.from_bytes keydecoder = self._keydecoder return ( @@ -634,6 +1152,17 @@ def items(self): ) def items_from(self, fieldname, prefix): + """ + Get an iterator over the (term, term info) pairs in the given field starting from the specified prefix. + + Parameters: + - fieldname (str): The name of the field. + - prefix (bytes): The prefix bytes. + + Yields: + - Tuple[Tuple[str, bytes], W3TermInfo]: The (field name, term bytes) and term info. + + """ prefixbytes = self._keycoder(fieldname, prefix) tidecoder = W3TermInfo.from_bytes keydecoder = self._keydecoder @@ -643,6 +1172,20 @@ def items_from(self, fieldname, prefix): ) def term_info(self, fieldname, tbytes): + """ + Get the term info for the given field name and term bytes. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + + Returns: + - W3TermInfo: The term info. + + Raises: + - TermNotFound: If the term is not found. + + """ key = self._keycoder(fieldname, tbytes) try: return W3TermInfo.from_bytes(self._tindex[key]) @@ -650,14 +1193,49 @@ def term_info(self, fieldname, tbytes): raise TermNotFound(f"No term {fieldname}:{tbytes!r}") def frequency(self, fieldname, tbytes): + """ + Get the frequency of the given term in the specified field. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + + Returns: + - int: The term frequency. + + """ datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_weight(self._dbfile, datapos) def doc_frequency(self, fieldname, tbytes): + """ + Get the document frequency of the given term in the specified field. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + + Returns: + - int: The document frequency. + + """ datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_doc_freq(self._dbfile, datapos) def matcher(self, fieldname, tbytes, format_, scorer=None): + """ + Create a matcher for the given term in the specified field. + + Parameters: + - fieldname (str): The name of the field. + - tbytes (bytes): The term bytes. + - format_ (str): The format of the posting lists. + - scorer (Scorer, optional): The scorer object. + + Returns: + - Matcher: The matcher object. 
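+
+        Example (illustrative; the field name and term bytes are placeholders):
+            >>> m = treader.matcher("content", b"whoosh", fieldobj.format)  # doctest: +SKIP
+            >>> while m.is_active():                                        # doctest: +SKIP
+            ...     print(m.id(), m.weight())
+            ...     m.next()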
+ + """ terminfo = self.term_info(fieldname, tbytes) m = self._codec.postings_reader( self._postfile, terminfo, format_, term=(fieldname, tbytes), scorer=scorer @@ -665,6 +1243,10 @@ def matcher(self, fieldname, tbytes, format_, scorer=None): return m def close(self): + """ + Close the terms reader and associated resources. + + """ self._tindex.close() self._postfile.close() @@ -676,6 +1258,13 @@ class W3PostingsWriter(base.PostingsWriter): """This object writes posting lists to the postings file. It groups postings into blocks and tracks block level statistics to makes it easier to skip through the postings. + + Parameters: + - postfile (file-like object): The file-like object to write the posting lists to. + - blocklimit (int): The maximum number of postings to buffer before writing them to the file. + - byteids (bool, optional): Whether the IDs should be stored as bytes or integers. Defaults to False. + - compression (int, optional): The compression level to use. Defaults to 3. + - inlinelimit (int, optional): The maximum number of postings to inline into the terminfo object. Defaults to 1. """ def __init__( @@ -692,13 +1281,26 @@ def __init__( self._terminfo = None def written(self): + """Check if any blocks have been written to the file. + + Returns: + bool: True if blocks have been written, False otherwise. + """ return self._blockcount > 0 def start_postings(self, format_, terminfo): - # Start a new term + """Start a new term. + + Parameters: + - format_ (formats.Format): The format object for the term. + - terminfo (Terminfo): The terminfo object for the term. + + Raises: + ValueError: If called while already in a term. + """ if self._terminfo: # If self._terminfo is not None, that means we are already in a term - raise Exception("Called start in a term") + raise ValueError("Called start in a term") assert isinstance(format_, formats.Format) self._format = format_ @@ -712,9 +1314,17 @@ def start_postings(self, format_, terminfo): self._startoffset = self._postfile.tell() def add_posting(self, id_, weight, vbytes, length=None): - # Add a posting to the buffered block + """Add a posting to the buffered block. - # If the number of buffered postings == the block limit, write out the + Parameters: + - id_ (str or int): The ID of the posting. + - weight (int or float): The weight of the posting. + - vbytes (bytes): The encoded payload of the posting. + - length (int, optional): The length of the field. Defaults to None. + + Raises: + AssertionError: If the types of the parameters are incorrect. + """ # buffered block and reset before adding this one if len(self._ids) >= self._blocklimit: self._write_block() @@ -743,9 +1353,19 @@ def add_posting(self, id_, weight, vbytes, length=None): self._maxlength = length def finish_postings(self): + """Finish writing the postings for the term. + + If there are fewer than "inlinelimit" postings in this posting list, + the postings are inlined into the terminfo object instead of writing them to the posting file. + + Returns: + Terminfo: The current terminfo object. + + Raises: + AssertionError: If the types of the parameters are incorrect. 
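+
+        Example (illustrative call sequence; ``fmt`` is assumed to be a
+        formats.Format instance):
+            >>> pw.start_postings(fmt, W3TermInfo())    # doctest: +SKIP
+            >>> pw.add_posting(0, 1.0, b"", length=3)   # doctest: +SKIP
+            >>> terminfo = pw.finish_postings()         # doctest: +SKIP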
+ """ terminfo = self._terminfo - # If we have fewer than "inlinelimit" postings in this posting list, - # "inline" the postings into the terminfo instead of writing them to + # the posting file if not self.written() and len(self) < self._inlinelimit: terminfo.add_block(self) @@ -764,8 +1384,7 @@ def finish_postings(self): return terminfo def _new_block(self): - # Reset block buffer - + """Reset the block buffer.""" # List of IDs (docnums for regular posting list, terms for vector PL) self._ids = [] if self._byteids else array("I") # List of weights @@ -778,8 +1397,11 @@ def _new_block(self): self._maxweight = 0 def _write_block(self, last=False): - # Write the buffered block to the postings file + """Write the buffered block to the postings file. + Parameters: + - last (bool, optional): Whether this is the last block. Defaults to False. + """ # If this is the first block, write a small header first if not self._blockcount: self._postfile.write(WHOOSH3_HEADER_MAGIC) @@ -796,7 +1418,8 @@ def _write_block(self, last=False): if len(databytes) < 20: comp = 0 # Compress the pickle (if self._compression > 0) - comp = self._compression + if self._compression > 0: + comp = self._compression if comp: databytes = zlib.compress(databytes, comp) @@ -840,18 +1463,15 @@ def _write_block(self, last=False): self._new_block() # Methods to reduce the byte size of the various lists - def _mini_ids(self): - # Minify IDs - + """Minify the IDs.""" ids = self._ids if not self._byteids: ids = delta_encode(ids) return tuple(ids) def _mini_weights(self): - # Minify weights - + """Minify the weights.""" weights = self._weights if all(w == 1.0 for w in weights): @@ -862,8 +1482,7 @@ def _mini_weights(self): return tuple(weights) def _mini_values(self): - # Minify values - + """Minify the values.""" fixedsize = self._format.fixed_value_size() values = self._values @@ -876,32 +1495,117 @@ def _mini_values(self): return vs # Block stats methods - def __len__(self): - # Returns the number of unwritten buffered postings + """Return the number of unwritten buffered postings. + + Returns: + int: The number of unwritten buffered postings. + """ return len(self._ids) def min_id(self): - # First ID in the buffered block + """Return the first ID in the buffered block. + + Returns: + str or int: The first ID in the buffered block. + """ return self._ids[0] def max_id(self): - # Last ID in the buffered block + """Return the last ID in the buffered block. + + Returns: + str or int: The last ID in the buffered block. + """ return self._ids[-1] def min_length(self): - # Shortest field length in the buffered block + """Return the shortest field length in the buffered block. + + Returns: + int or None: The shortest field length in the buffered block. + """ return self._minlength def max_length(self): - # Longest field length in the buffered block + """Return the longest field length in the buffered block. + + Returns: + int: The longest field length in the buffered block. + """ return self._maxlength def max_weight(self): - # Highest weight in the buffered block + """Return the highest weight in the buffered block. + + Returns: + int or float: The highest weight in the buffered block. + """ return self._maxweight +class W3LeafMatcher(LeafMatcher): + """Reads on-disk postings from the postings file and presents the + :class:`whoosh.matching.Matcher` interface. + + Parameters: + - postfile (file-like object): The file-like object representing the postings file. + - startoffset (int): The starting offset of the postings in the file. 
+ - length (int): The length of the postings. + - format_ (CodecFormat): The format of the postings. + - term (bytes, optional): The term associated with the postings. Defaults to None. + - byteids (bool, optional): Whether the IDs in the postings are stored as bytes. Defaults to None. + - scorer (Scorer, optional): The scorer to use for scoring the postings. Defaults to None. + + Attributes: + - _postfile (file-like object): The file-like object representing the postings file. + - _startoffset (int): The starting offset of the postings in the file. + - _length (int): The length of the postings. + - format (CodecFormat): The format of the postings. + - _term (bytes): The term associated with the postings. + - _byteids (bool): Whether the IDs in the postings are stored as bytes. + - scorer (Scorer): The scorer to use for scoring the postings. + - _fixedsize (int): The fixed size of the values in the postings. + - _baseoffset (int): The base offset of the postings (start of postings, after the header). + - _blocklength (int): The length of the current block of postings. + - _maxid (int): The maximum ID in the current block of postings. + - _maxweight (float): The maximum weight in the current block of postings. + - _compression (bool): Whether the block of postings is compressed. + - _minlength (int): The minimum length of the values in the current block of postings. + - _maxlength (int): The maximum length of the values in the current block of postings. + - _lastblock (bool): Whether the current block of postings is the last block. + - _atend (bool): Whether the matcher has reached the end of the postings. + - _data (tuple): The data tuple of the current block of postings. + - _ids (tuple): The IDs in the current block of postings. + - _weights (array): The weights in the current block of postings. + - _values (tuple): The values in the current block of postings. + - _i (int): The current position in the block of postings. + + Methods: + - _read_header(): Reads the header tag at the start of the postings. + - reset(): Resets the matcher to read the first block of postings. + - _goto(position): Reads the posting block at the given position. + - _next_block(): Moves to the next block of postings. + - _skip_to_block(skipwhile): Skips blocks as long as the skipwhile() function returns True. + - is_active(): Checks if the matcher is active (not at the end of the postings). + - id(): Returns the current ID (docnum for regular postings, term for vector). + - weight(): Returns the weight for the current posting. + - value(): Returns the value for the current posting. + - next(): Moves to the next posting. + - skip_to(targetid): Skips to the next ID equal to or greater than the given target ID. + - skip_to_quality(minquality): Skips blocks until finding one that might exceed the given minimum quality. + - block_min_id(): Returns the minimum ID in the current block of postings. + - block_max_id(): Returns the maximum ID in the current block of postings. + - block_min_length(): Returns the minimum length of the values in the current block of postings. + - block_max_length(): Returns the maximum length of the values in the current block of postings. + - block_max_weight(): Returns the maximum weight in the current block of postings. + - _read_data(): Loads the block data tuple from disk. + - _read_ids(): Loads the IDs from the block data. + - _read_weights(): Loads the weights from the block data. + - _read_values(): Loads the values from the block data. 
+ """ + + class W3LeafMatcher(LeafMatcher): """Reads on-disk postings from the postings file and presents the :class:`whoosh.matching.Matcher` interface. @@ -917,6 +1621,28 @@ def __init__( byteids=None, scorer=None, ): + """ + Initialize a Whoosh3 object. + + Args: + postfile (file-like object): The file-like object representing the postings file. + startoffset (int): The starting offset of the postings in the file. + length (int): The length of the postings in bytes. + format_ (CodecFormat): The codec format used for encoding and decoding the postings. + term (bytes, optional): The term associated with the postings. Defaults to None. + byteids (list of int, optional): The byte IDs associated with the postings. Defaults to None. + scorer (Scorer, optional): The scorer used for scoring the postings. Defaults to None. + + Attributes: + _postfile (file-like object): The file-like object representing the postings file. + _startoffset (int): The starting offset of the postings in the file. + _length (int): The length of the postings in bytes. + format (CodecFormat): The codec format used for encoding and decoding the postings. + _term (bytes): The term associated with the postings. + _byteids (list of int): The byte IDs associated with the postings. + scorer (Scorer): The scorer used for scoring the postings. + _fixedsize (int): The fixed size of each posting value. + """ self._postfile = postfile self._startoffset = startoffset self._length = length @@ -932,19 +1658,45 @@ def __init__( self.reset() def _read_header(self): - # Seek to the start of the postings and check the header tag + """ + Reads and verifies the header of the postings file. + + This method seeks to the start of the postings file, reads the header tag, and verifies its correctness. + It also sets the base offset to the current position in the file, which represents the start of the postings + after the header. + + Raises: + ValueError: If the header tag is incorrect. + + Usage: + Call this method to read and verify the header of the postings file before accessing the postings data. + + """ postfile = self._postfile postfile.seek(self._startoffset) magic = postfile.read(4) if magic != WHOOSH3_HEADER_MAGIC: - raise Exception(f"Block tag error {magic!r}") + raise ValueError(f"Block tag error {magic!r}") # Remember the base offset (start of postings, after the header) self._baseoffset = postfile.tell() def reset(self): - # Reset block stats + """ + Reset the codec's internal state. + + This method resets the block stats, including block length, maximum ID, maximum weight, + compression, minimum length, and maximum length. It also resets the flags indicating the + last block and whether the codec is at the end. + + After resetting the internal state, the method consumes the first block by calling the + `_goto` method with the base offset. + + Usage: + codec.reset() + + """ self._blocklength = None self._maxid = None self._maxweight = None @@ -958,8 +1710,33 @@ def reset(self): self._goto(self._baseoffset) def _goto(self, position): - # Read the posting block at the given position - + """ + Move the pointer to the given position in the posting file and load the block data. + + Args: + position (int): The position in the posting file to move the pointer to. + + Returns: + None + + Raises: + None + + This method is responsible for moving the pointer to the specified position in the posting file + and loading the block data from that position. It performs the following steps: + 1. Resets the block data attributes to None. + 2. 
Resets the pointer into the block to 0. + 3. Seeks to the start of the block in the posting file. + 4. Reads the length of the block. + 5. If the length is negative, sets the `_lastblock` attribute to True and makes the length positive. + 6. Remembers the offset of the next block. + 7. Reads the pickled block info tuple. + 8. Remembers the offset of the block's data. + 9. Decomposes the info tuple to set the current block info. + + Note: + This method assumes that the posting file is already open and assigned to the `_postfile` attribute. + """ postfile = self._postfile # Reset block data -- we'll lazy load the data from the new block as @@ -1000,10 +1777,24 @@ def _goto(self, position): self._maxlength = byte_to_length(mxlen) def _next_block(self): + """ + Move to the next block in the postings. + + This method is responsible for advancing the cursor to the next block in the postings. + It handles cases where the cursor is already at the end, reached the end of the postings, + or needs to move to the next block. + + Raises: + ValueError: If there is no next block. + + Usage: + Call this method to move the cursor to the next block in the postings. + + """ if self._atend: # We were already at the end, and yet somebody called _next_block() # again, so something is wrong somewhere - raise Exception("No next block") + raise ValueError("No next block") elif self._lastblock: # Reached the end of the postings self._atend = True @@ -1012,8 +1803,30 @@ def _next_block(self): self._goto(self._nextoffset) def _skip_to_block(self, skipwhile): - # Skip blocks as long as the skipwhile() function returns True - + """ + Skips blocks in the codec as long as the skipwhile() function returns True. + + Parameters: + - skipwhile (function): A function that takes no arguments and returns a boolean value. + It is called at each block to determine whether to skip to the next block or not. + + Returns: + - skipped (int): The number of blocks skipped. + + Notes: + - This method is used internally by the codec to skip blocks based on a condition. + - The skipwhile() function should return True if the current block should be skipped, + and False if the current block should not be skipped. + + Example usage: + ``` + def skip_condition(): + # Skip blocks until a certain condition is met + return some_condition() + + skipped_blocks = _skip_to_block(skip_condition) + ``` + """ skipped = 0 while self.is_active() and skipwhile(): self._next_block() @@ -1021,10 +1834,26 @@ def _skip_to_block(self, skipwhile): return skipped def is_active(self): + """ + Check if the current position in the file is active. + + Returns: + bool: True if the current position is active, False otherwise. + """ return not self._atend and self._i < self._blocklength def id(self): - # Get the current ID (docnum for regular postings, term for vector) + """ + Get the current ID. + + This method returns the current ID, which can be either the docnum for regular postings or the term for vectors. + + Returns: + int: The current ID. + + Raises: + ValueError: If the block IDs have not been loaded yet. + """ # If we haven't loaded the block IDs yet, load them now if self._ids is None: @@ -1033,8 +1862,19 @@ def id(self): return self._ids[self._i] def weight(self): - # Get the weight for the current posting + """ + Get the weight for the current posting. + + This method retrieves the weight associated with the current posting. + If the block weights have not been loaded yet, it loads them before + returning the weight. 
+ + Returns: + float: The weight of the current posting. + Raises: + Exception: If the block weights cannot be loaded. + """ # If we haven't loaded the block weights yet, load them now if self._weights is None: self._read_weights() @@ -1042,8 +1882,17 @@ def weight(self): return self._weights[self._i] def value(self): - # Get the value for the current posting + """ + Get the value for the current posting. + If the block values have not been loaded yet, this method will load them. + + Returns: + The value for the current posting. + + Raises: + IndexError: If the current posting index is out of range. + """ # If we haven't loaded the block values yet, load them now if self._values is None: self._read_values() @@ -1051,8 +1900,15 @@ def value(self): return self._values[self._i] def next(self): - # Move to the next posting + """ + Move to the next posting. + This method increments the in-block pointer by 1. If the pointer reaches the end of the block, + it moves to the next block and returns True. Otherwise, it returns False. + + Returns: + bool: True if the pointer reached the end of the block and moved to the next block, False otherwise. + """ # Increment the in-block pointer self._i += 1 # If we reached the end of the block, move to the next block @@ -1063,8 +1919,23 @@ def next(self): return False def skip_to(self, targetid): - # Skip to the next ID equal to or greater than the given target ID + """ + Skip to the next ID equal to or greater than the given target ID. + + Args: + targetid (int): The target ID to skip to. + Raises: + ReadTooFar: If the skip operation is attempted when the reader is not active. + + Notes: + - If the reader is already at or past the target ID, no skipping is performed. + - The method skips to the block that would contain the target ID. + - If the target ID is greater than the maximum ID in the current block, the method + skips to the next block that would contain the target ID. + - The method iterates through the IDs in the block until it finds or passes the target ID. + + """ if not self.is_active(): raise ReadTooFar @@ -1083,9 +1954,21 @@ def skip_to(self, targetid): self.next() def skip_to_quality(self, minquality): - # Skip blocks until we find one that might exceed the given minimum - # quality + """ + Skips to the next block with a quality greater than or equal to the given minimum quality. + + Parameters: + - minquality (float): The minimum quality threshold. + Returns: + - int: The number of blocks skipped. + + Notes: + - This method is used to skip blocks in a search index until a block with a quality greater than or equal to the given minimum quality is found. + - The block quality is determined by the `block_quality` attribute of the current object. + - If the quality of the current block is already higher than the minimum quality, no blocks are skipped. + - Blocks are skipped until a block with a quality greater than or equal to the minimum quality is found. + """ block_quality = self.block_quality # If the quality of this block is already higher than the minimum, @@ -1098,25 +1981,109 @@ def skip_to_quality(self, minquality): return self._skip_to_block(lambda: block_quality() <= minquality) def block_min_id(self): + """ + Returns the minimum ID of the block. + + This method retrieves the minimum ID of the block. If the IDs have not been + read yet, it reads them from the source. + + Returns: + int: The minimum ID of the block. 
+ + """ if self._ids is None: self._read_ids() return self._ids[0] def block_max_id(self): + """ + Returns the maximum ID of the block. + + This method returns the maximum ID of the block. The ID represents the highest + value assigned to a block. + + Returns: + int: The maximum ID of the block. + + Example: + >>> codec = WhooshCodec() + >>> codec.block_max_id() + 10 + """ return self._maxid def block_min_length(self): + """ + Returns the minimum length of a block. + + This method returns the minimum length of a block used by the codec. + The block length is an important parameter that affects the indexing + and searching process. It determines the size of the data chunks that + are read and written during these operations. + + Returns: + int: The minimum length of a block. + + """ return self._minlength def block_max_length(self): + """ + Returns the maximum length of a block in the codec. + + This method returns the maximum length of a block in the codec. A block is a unit of data used in the codec's + internal operations. The maximum length of a block can affect the performance and memory usage of the codec. + + Returns: + int: The maximum length of a block in the codec. + + Example: + >>> codec = WhooshCodec() + >>> codec.block_max_length() + 4096 + + Note: + The value returned by this method is determined by the codec implementation and may vary between different + codecs. + + """ return self._maxlength def block_max_weight(self): + """ + Returns the maximum weight of a block in the codec. + + This method returns the maximum weight that a block can have in the codec. + The weight of a block is a measure of its importance or relevance. + + Returns: + int: The maximum weight of a block. + + Example: + >>> codec = WhooshCodec() + >>> codec.block_max_weight() + 100 + + Note: + The maximum weight can be used to determine the importance of a block + when performing operations such as scoring or ranking. + """ return self._maxweight def _read_data(self): - # Load block data tuple from disk + """ + Reads and loads the block data tuple from disk. + This method reads the block data tuple from the disk, decompresses it if necessary, + and unpickles the data tuple. The unpickled data tuple is then saved in the `_data` + attribute of the object. + + Returns: + None + + Raises: + None + """ datalen = self._nextoffset - self._dataoffset b = self._postfile.get(self._dataoffset, datalen) @@ -1128,6 +2095,21 @@ def _read_data(self): self._data = loads(b) def _read_ids(self): + """ + Reads and initializes the document IDs from disk. + + This method loads the document IDs from disk if they haven't been loaded yet. + It then de-minifies the IDs if necessary and sets the `_ids` attribute. + + Returns: + None + + Raises: + Any exceptions that occur during the data loading process. + + Usage: + Call this method to load and initialize the document IDs before using them. + """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -1140,6 +2122,21 @@ def _read_ids(self): self._ids = ids def _read_weights(self): + """ + Reads and initializes the weights for the index. + + If the data has not been loaded from disk yet, it loads it first. + The weights are then de-minified and stored in the `_weights` attribute. 
+ + Returns: + None + + Raises: + None + + Usage: + _read_weights() + """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -1155,6 +2152,27 @@ def _read_weights(self): self._weights = weights def _read_values(self): + """ + Reads and de-minifies the values from the data. + + If the data has not been loaded from disk yet, it will be loaded before processing. + + Parameters: + None + + Returns: + None + + Raises: + None + + Usage: + Call this method to read and de-minify the values from the data. + It is recommended to call this method before accessing the values. + + Example: + _read_values() + """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -1174,26 +2192,50 @@ def _read_values(self): # Term info implementation +class W3TermInfo(TermInfo): + """ + Represents term information for the Whoosh3 codec. + This class is responsible for storing and manipulating term information such as + weights, document frequencies, lengths, and IDs. It provides methods to add blocks + of information, set extents, inline postings, and convert the term info to bytes. + + Attributes: + _struct (struct.Struct): The struct format used to pack and unpack the term info. + _offset (int): The offset of the term info in the posting file. + _length (int): The length of the term info in the posting file. + _inlined (tuple): A tuple containing the inlined postings (IDs, weights, values). + + """ -class W3TermInfo(TermInfo): - # B | Flags - # f | Total weight - # I | Total doc freq - # B | Min length (encoded as byte) - # B | Max length (encoded as byte) - # f | Max weight - # I | Minimum (first) ID - # I | Maximum (last) ID _struct = struct.Struct("!BfIBBfII") def __init__(self, *args, **kwargs): + """ + Initializes a new instance of the W3TermInfo class. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + """ TermInfo.__init__(self, *args, **kwargs) self._offset = None self._length = None self._inlined = None def add_block(self, block): + """ + Adds a block of information to the term info. + + This method updates the total weight, document frequency, minimum length, + maximum length, maximum weight, minimum ID, and maximum ID based on the + information in the given block. + + Args: + block (Block): The block of information to add. + + """ self._weight += sum(block._weights) self._df += len(block) @@ -1210,22 +2252,72 @@ def add_block(self, block): self._maxid = block.max_id() def set_extent(self, offset, length): + """ + Sets the extent of the term info in the posting file. + + This method sets the offset and length of the term info in the posting file. + + Args: + offset (int): The offset of the term info. + length (int): The length of the term info. + + """ self._offset = offset self._length = length def extent(self): + """ + Returns the extent of the term info in the posting file. + + Returns: + tuple: A tuple containing the offset and length of the term info. + + """ return self._offset, self._length def set_inlined(self, ids, weights, values): + """ + Sets the inlined postings for the term info. + + This method sets the inlined postings, which are represented as tuples of IDs, + weights, and values. + + Args: + ids (tuple): A tuple of IDs. + weights (tuple): A tuple of weights. + values (tuple): A tuple of values. + + """ self._inlined = (tuple(ids), tuple(weights), tuple(values)) def is_inlined(self): + """ + Checks if the term info has inlined postings. 
+ + Returns: + bool: True if the term info has inlined postings, False otherwise. + + """ return self._inlined is not None def inlined_postings(self): + """ + Returns the inlined postings for the term info. + + Returns: + tuple: A tuple containing the inlined postings (IDs, weights, values). + + """ return self._inlined def to_bytes(self): + """ + Converts the term info to bytes. + + Returns: + bytes: The term info encoded as bytes. + + """ isinlined = self.is_inlined() # Encode the lengths as 0-255 values @@ -1258,6 +2350,16 @@ def to_bytes(self): @classmethod def from_bytes(cls, s): + """ + Creates a new W3TermInfo instance from bytes. + + Args: + s (bytes): The bytes representing the term info. + + Returns: + W3TermInfo: A new instance of the W3TermInfo class. + + """ st = cls._struct vals = st.unpack(s[: st.size]) terminfo = cls() @@ -1285,14 +2387,47 @@ def from_bytes(cls, s): @classmethod def read_weight(cls, dbfile, datapos): + """ + Reads the weight from the database file. + + Args: + dbfile (DatabaseFile): The database file. + datapos (int): The position of the weight in the file. + + Returns: + float: The weight. + + """ return dbfile.get_float(datapos + 1) @classmethod def read_doc_freq(cls, dbfile, datapos): + """ + Reads the document frequency from the database file. + + Args: + dbfile (DatabaseFile): The database file. + datapos (int): The position of the document frequency in the file. + + Returns: + int: The document frequency. + + """ return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) @classmethod def read_min_and_max_length(cls, dbfile, datapos): + """ + Reads the minimum and maximum length from the database file. + + Args: + dbfile (DatabaseFile): The database file. + datapos (int): The position of the lengths in the file. + + Returns: + tuple: A tuple containing the minimum and maximum length. + + """ lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE ml = byte_to_length(dbfile.get_byte(lenpos)) xl = byte_to_length(dbfile.get_byte(lenpos + 1)) @@ -1300,14 +2435,43 @@ def read_min_and_max_length(cls, dbfile, datapos): @classmethod def read_max_weight(cls, dbfile, datapos): + """ + Reads the maximum weight from the database file. + + Args: + dbfile (DatabaseFile): The database file. + datapos (int): The position of the maximum weight in the file. + + Returns: + float: The maximum weight. + + """ weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 return dbfile.get_float(weightspos) # Segment implementation +class W3Segment(base.Segment): + """ + Represents a segment in the Whoosh index. + + Args: + codec (Codec): The codec used for encoding and decoding the segment. + indexname (str): The name of the index. + doccount (int, optional): The number of documents in the segment. Defaults to 0. + segid (str, optional): The unique identifier for the segment. If not provided, a random ID will be generated. + deleted (set, optional): A set of deleted document numbers. Defaults to None. + + Attributes: + indexname (str): The name of the index. + segid (str): The unique identifier for the segment. + compound (bool): Indicates whether the segment is a compound segment. + _codec (Codec): The codec used for encoding and decoding the segment. + _doccount (int): The number of documents in the segment. + _deleted (set): A set of deleted document numbers. 
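+
+    Example (illustrative; ``codec`` is an existing W3Codec instance):
+        >>> seg = W3Segment(codec, "MAIN")  # doctest: +SKIP
+        >>> seg.set_doc_count(100)          # doctest: +SKIP
+        >>> seg.delete_document(3)          # doctest: +SKIP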
+ """ -class W3Segment(base.Segment): def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): self.indexname = indexname self.segid = self._random_id() if segid is None else segid @@ -1318,26 +2482,69 @@ def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): self.compound = False def codec(self, **kwargs): + """ + Returns the codec used for encoding and decoding the segment. + + Returns: + Codec: The codec used for the segment. + + """ return self._codec def set_doc_count(self, dc): + """ + Sets the number of documents in the segment. + + Args: + dc (int): The number of documents. + + """ self._doccount = dc def doc_count_all(self): + """ + Returns the total number of documents in the segment. + + Returns: + int: The total number of documents. + + """ return self._doccount def deleted_count(self): + """ + Returns the number of deleted documents in the segment. + + Returns: + int: The number of deleted documents. + + """ if self._deleted is None: return 0 return len(self._deleted) def deleted_docs(self): + """ + Returns an iterator over the deleted document numbers in the segment. + + Returns: + Iterator[int]: An iterator over the deleted document numbers. + + """ if self._deleted is None: return () else: return iter(self._deleted) def delete_document(self, docnum, delete=True): + """ + Marks a document as deleted in the segment. + + Args: + docnum (int): The document number to delete. + delete (bool, optional): Whether to delete the document. Defaults to True. + + """ if delete: if self._deleted is None: self._deleted = set() @@ -1346,6 +2553,16 @@ def delete_document(self, docnum, delete=True): self._deleted.clear(docnum) def is_deleted(self, docnum): + """ + Checks if a document is marked as deleted in the segment. + + Args: + docnum (int): The document number to check. + + Returns: + bool: True if the document is marked as deleted, False otherwise. + + """ if self._deleted is None: return False return docnum in self._deleted diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py index 1ba30e45..d858b289 100644 --- a/src/whoosh/fields.py +++ b/src/whoosh/fields.py @@ -166,12 +166,11 @@ def index(self, value, **kwargs): """ if not self.format: - raise Exception( - "%s field %r cannot index without a format" - % (self.__class__.__name__, self) + raise ValueError( + f"{self.__class__.__name__} field {self} cannot index without a format" ) if not isinstance(value, (str, list, tuple)): - raise ValueError(f"{value!r} is not unicode or sequence") + raise ValueError(f"{value} is not unicode or sequence") assert isinstance(self.format, formats.Format) if "mode" not in kwargs: @@ -190,7 +189,7 @@ def tokenize(self, value, **kwargs): """ if not self.analyzer: - raise Exception(f"{self.__class__} field has no analyzer") + raise ValueError(f"{self.__class__} field has no analyzer") return self.analyzer(value, **kwargs) def process_text(self, qstring, mode="", **kwargs): @@ -203,7 +202,7 @@ def process_text(self, qstring, mode="", **kwargs): """ if not self.format: - raise Exception(f"{self} field has no format") + raise ValueError(f"{self} field has no format") return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) # Conversion @@ -367,9 +366,38 @@ def clean(self): # Events def on_add(self, schema, fieldname): + """ + This method is called when a field is added to a schema. + + Parameters: + schema (Schema): The schema object to which the field is being added. + fieldname (str): The name of the field being added. 
+ + Returns: + None + + Notes: + - This method can be overridden in subclasses to perform custom actions when a field is added. + - By default, this method does nothing. + """ pass def on_remove(self, schema, fieldname): + """ + This method is called when a field is removed from the schema. + + Parameters: + schema (Schema): The schema object from which the field is being removed. + fieldname (str): The name of the field being removed. + + Returns: + None + + Notes: + - This method can be overridden in a custom field class to perform any necessary cleanup or + additional actions when a field is removed from the schema. + - By default, this method does nothing. + """ pass @@ -487,7 +515,7 @@ def __init__( document. """ - self.analyzer = analyzer or analysis.IDAnalyzer() + self.analyzer = analyzer or analysis.id_analyzer() # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.stored = stored @@ -512,7 +540,7 @@ def __init__(self, stored=False, unique=False, expression=None, field_boost=1.0) """ expression = expression or re.compile(r"[^\r\n\t ,;]+") - self.analyzer = analysis.RegexAnalyzer(expression=expression) + self.analyzer = analysis.regex_analyzer(expression=expression) # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.stored = stored @@ -605,7 +633,7 @@ def __init__( raise TypeError(f"Can't use {numtype!r} as a type, use int or float") # Sanity check if numtype is float and decimal_places: - raise Exception( + raise ValueError( "A float type and decimal_places argument %r are " "incompatible" % decimal_places ) @@ -617,7 +645,7 @@ def __init__( bits = 64 # Floats are converted to 64 bit ints else: if bits not in intsizes: - raise Exception(f"Invalid bits {bits!r}, use 8, 16, 32, or 64") + raise ValueError(f"Invalid bits {bits!r}, use 8, 16, 32, or 64") # Type code for the *sortable* representation self.sortable_typecode = intcodes[intsizes.index(bits)] self._struct = struct.Struct(">" + str(self.sortable_typecode)) @@ -629,7 +657,7 @@ def __init__( self.decimal_places = decimal_places self.shift_step = shift_step self.signed = signed - self.analyzer = analysis.IDAnalyzer() + self.analyzer = analysis.id_analyzer() # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.min_value, self.max_value = self._min_max() @@ -641,8 +669,8 @@ def __init__( else: default = NaN elif not self.is_valid(default): - raise Exception( - f"The default {default!r} is not a valid number for this field" + raise ValueError( + f"The default {default} is not a valid number for this field" ) self.default = default @@ -853,11 +881,11 @@ def prepare_datetime(self, x): elif isinstance(x, bytes): return x else: - raise Exception(f"{x!r} is not a datetime") + raise ValueError(f"{x} is not a datetime") def to_column_value(self, x): if isinstance(x, bytes): - raise Exception(f"{x!r} is not a datetime") + raise ValueError(f"{x} is not a datetime") if isinstance(x, (list, tuple)): x = x[0] return self.prepare_datetime(x) @@ -897,7 +925,7 @@ def _parse_datestring(self, qstring): at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise Exception(f"{qstring!r} is not a parseable date") + raise ValueError(f"{qstring} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): @@ -1012,6 +1040,14 @@ class STORED(FieldType): stored = True def __init__(self): + """ + Initialize a 
new instance of the class. + + This method is called when a new object of the class is created. It does not take any arguments. + + Usage: + field = Field() + """ pass @@ -1067,7 +1103,7 @@ def __init__( """ if not analyzer: - analyzer = analysis.KeywordAnalyzer(lowercase=lowercase, commas=commas) + analyzer = analysis.keyword_analyzer(lowercase=lowercase, commas=commas) self.analyzer = analyzer # Store field lengths and weights along with doc ID @@ -1111,7 +1147,7 @@ def __init__( """ :param analyzer: The analysis.Analyzer to use to index the field contents. See the analysis module for more information. If you omit - this argument, the field uses analysis.StandardAnalyzer. + this argument, the field uses analysis.standard_analyzer. :param phrase: Whether the store positional information to allow phrase searching. :param chars: Whether to store character ranges along with positions. @@ -1129,7 +1165,7 @@ def __init__( column type. If you pass a :class:`whoosh.columns.Column` instance instead of True, the field will use the given column type. :param lang: automaticaly configure a - :class:`whoosh.analysis.LanguageAnalyzer` for the given language. + :class:`whoosh.analysis.language_analyzer` for the given language. This is ignored if you also specify an ``analyzer``. :param vector: if this value evaluates to true, store a list of the terms in this field in each document. If the value is an instance @@ -1141,9 +1177,9 @@ def __init__( if analyzer: self.analyzer = analyzer elif lang: - self.analyzer = analysis.LanguageAnalyzer(lang) + self.analyzer = analysis.language_analyzer(lang) else: - self.analyzer = analysis.StandardAnalyzer() + self.analyzer = analysis.standard_analyzer() if chars: formatclass = formats.Characters @@ -1264,9 +1300,9 @@ def __init__( if phrase: formatclass = formats.Positions - self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) + self.analyzer = analysis.ngram_analyzer(minsize, maxsize) self.format = formatclass(field_boost=field_boost) - self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) + self.analyzer = analysis.ngram_analyzer(minsize, maxsize) self.stored = stored self.queryor = queryor self.set_sortable(sortable) @@ -1323,7 +1359,7 @@ def __init__( default is to combine N-grams with an And query. """ - self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at) + self.analyzer = analysis.ngram_word_analyzer(minsize, maxsize, tokenizer, at=at) self.format = formats.Frequency(field_boost=field_boost) self.stored = stored self.queryor = queryor @@ -1661,7 +1697,7 @@ def merge_fielddict(d1, d2): field1 = d1.get(name) field2 = d2.get(name) if field1 and field2 and field1 != field2: - raise Exception(f"Inconsistent field {name!r}: {field1!r} != {field2!r}") + raise ValueError(f"Inconsistent field {name}: {field1} != {field2}") out[name] = field1 or field2 return out diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py index 26190152..a6657abc 100644 --- a/src/whoosh/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -52,9 +52,52 @@ def memoryview_(source, offset=None, length=None): class CompoundStorage(FileStorage): + """ + CompoundStorage is a class that represents a compound file storage for Whoosh indexes. + It provides methods to read and write files within the compound file. + + Parameters: + - dbfile (file-like object): The file-like object representing the compound file. + - use_mmap (bool, optional): Whether to use memory-mapped file for faster access. Defaults to True. 
+ - basepos (int, optional): The base position in the file. Defaults to 0. + + Attributes: + - readonly (bool): Whether the compound file is read-only. + - is_closed (bool): Whether the compound file is closed. + - _file (file-like object): The file-like object representing the compound file. + - _diroffset (int): The offset of the directory within the compound file. + - _dirlength (int): The length of the directory within the compound file. + - _dir (dict): The directory mapping file names to their offset and length within the compound file. + - _options (dict): Additional options associated with the compound file. + - _locks (dict): A dictionary of locks for file-level synchronization. + - _source (mmap.mmap or None): The memory-mapped object representing the compound file, if mmap is used. + + Methods: + - __init__(self, dbfile, use_mmap=True, basepos=0): Initializes a CompoundStorage object. + - __repr__(self): Returns a string representation of the CompoundStorage object. + - close(self): Closes the compound file. + - range(self, name): Returns the offset and length of a file within the compound file. + - open_file(self, name, *args, **kwargs): Opens a file within the compound file. + - list(self): Returns a list of file names within the compound file. + - file_exists(self, name): Checks if a file exists within the compound file. + - file_length(self, name): Returns the length of a file within the compound file. + - file_modified(self, name): Returns the modification time of a file within the compound file. + - lock(self, name): Returns a lock object for file-level synchronization. + - assemble(dbfile, store, names, **options): Assembles a compound file from multiple files. + - write_dir(dbfile, basepos, directory, options=None): Writes the directory and options to the compound file. + """ + readonly = True def __init__(self, dbfile, use_mmap=True, basepos=0): + """ + Initializes a CompoundStorage object. + + Parameters: + - dbfile (file-like object): The file-like object representing the compound file. + - use_mmap (bool, optional): Whether to use memory-mapped file for faster access. Defaults to True. + - basepos (int, optional): The base position in the file. Defaults to 0. + """ self._file = dbfile self.is_closed = False @@ -96,11 +139,19 @@ def __init__(self, dbfile, use_mmap=True, basepos=0): self._file = None def __repr__(self): + """ + Returns a string representation of the CompoundStorage object. + """ return f"<{self.__class__.__name__} ({self._name})>" def close(self): + """ + Closes the compound file. + """ if self.is_closed: - raise Exception("Already closed") + raise RuntimeError( + "Already closed" + ) # Replaced generic Exception with RuntimeError self.is_closed = True if self._source: @@ -112,6 +163,16 @@ def close(self): self._file.close() def range(self, name): + """ + Returns the offset and length of a file within the compound file. + + Parameters: + - name (str): The name of the file. + + Returns: + - offset (int): The offset of the file within the compound file. + - length (int): The length of the file. + """ try: fileinfo = self._dir[name] except KeyError: @@ -119,6 +180,17 @@ def range(self, name): return fileinfo["offset"], fileinfo["length"] def open_file(self, name, *args, **kwargs): + """ + Opens a file within the compound file. + + Parameters: + - name (str): The name of the file. + - *args: Additional positional arguments. + - **kwargs: Additional keyword arguments. + + Returns: + - f (file-like object): The file-like object representing the opened file. 
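+
+        Example (illustrative sketch; the storage directory, segment file and
+        inner file names below are hypothetical):
+            from whoosh.filedb.filestore import FileStorage
+
+            st = FileStorage("indexdir")
+            comp = CompoundStorage(st.open_file("_MAIN_1.seg"))
+            sub = comp.open_file("postings")
+            data = sub.read(16)
+            sub.close()
+            comp.close()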
+ """ if self.is_closed: raise StorageError("Storage was closed") @@ -134,26 +206,74 @@ def open_file(self, name, *args, **kwargs): return f def list(self): + """ + Returns a list of file names within the compound file. + """ return list(self._dir.keys()) def file_exists(self, name): + """ + Checks if a file exists within the compound file. + + Parameters: + - name (str): The name of the file. + + Returns: + - exists (bool): True if the file exists, False otherwise. + """ return name in self._dir def file_length(self, name): + """ + Returns the length of a file within the compound file. + + Parameters: + - name (str): The name of the file. + + Returns: + - length (int): The length of the file. + """ info = self._dir[name] return info["length"] def file_modified(self, name): + """ + Returns the modification time of a file within the compound file. + + Parameters: + - name (str): The name of the file. + + Returns: + - modified (float): The modification time of the file. + """ info = self._dir[name] return info["modified"] def lock(self, name): + """ + Returns a lock object for file-level synchronization. + + Parameters: + - name (str): The name of the file. + + Returns: + - lock (Lock): The lock object. + """ if name not in self._locks: self._locks[name] = Lock() return self._locks[name] @staticmethod def assemble(dbfile, store, names, **options): + """ + Assembles a compound file from multiple files. + + Parameters: + - dbfile (file-like object): The file-like object representing the compound file. + - store (FileStorage): The file storage object containing the files to be assembled. + - names (list): The list of file names to be assembled. + - **options: Additional options to be associated with the compound file. + """ assert names, names directory = {} @@ -164,7 +284,7 @@ def assemble(dbfile, store, names, **options): # Copy the files into the compound file for name in names: if name.endswith(".toc") or name.endswith(".seg"): - raise Exception(name) + raise ValueError(name) for name in names: offset = dbfile.tell() @@ -179,6 +299,15 @@ def assemble(dbfile, store, names, **options): @staticmethod def write_dir(dbfile, basepos, directory, options=None): + """ + Writes the directory and options to the compound file. + + Parameters: + - dbfile (file-like object): The file-like object representing the compound file. + - basepos (int): The base position in the file. + - directory (dict): The directory mapping file names to their offset and length within the compound file. + - options (dict, optional): Additional options to be associated with the compound file. Defaults to None. + """ options = options or {} dirpos = dbfile.tell() # Remember the start of the directory @@ -194,7 +323,55 @@ def write_dir(dbfile, basepos, directory, options=None): class SubFile: + """ + Represents a subset of a parent file. + + This class provides methods to read and manipulate a subset of a parent file. + It keeps track of the subset's position, length, and name. + + Attributes: + _file (file-like object): The parent file. + _offset (int): The offset of the subset within the parent file. + _length (int): The length of the subset. + _end (int): The end position of the subset. + _pos (int): The current position within the subset. + name (str): The name of the subset. + closed (bool): Indicates whether the subset is closed. + + Methods: + close(): Closes the subset. + subset(position, length, name=None): Creates a new subset from the current subset. + read(size=None): Reads data from the subset. 
+ readline(): Reads a line from the subset. + seek(where, whence=0): Moves the current position within the subset. + tell(): Returns the current position within the subset. + """ + def __init__(self, parentfile, offset, length, name=None): + """ + Initialize a CompoundFile object. + + Args: + parentfile (file-like object): The parent file object that represents the compound file. + offset (int): The offset within the parent file where the compound file starts. + length (int): The length of the compound file in bytes. + name (str, optional): The name of the compound file. Defaults to None. + + Attributes: + _file (file-like object): The parent file object that represents the compound file. + _offset (int): The offset within the parent file where the compound file starts. + _length (int): The length of the compound file in bytes. + _end (int): The end position of the compound file within the parent file. + _pos (int): The current position within the compound file. + name (str): The name of the compound file. + closed (bool): Indicates whether the compound file is closed. + + Raises: + None. + + Returns: + None. + """ self._file = parentfile self._offset = offset self._length = length @@ -205,9 +382,28 @@ def __init__(self, parentfile, offset, length, name=None): self.closed = False def close(self): + """ + Closes the subset. + + This method sets the `closed` attribute to True, indicating that the subset is closed. + """ self.closed = True def subset(self, position, length, name=None): + """ + Creates a new subset from the current subset. + + Args: + position (int): The position of the new subset within the current subset. + length (int): The length of the new subset. + name (str, optional): The name of the new subset. Defaults to None. + + Returns: + SubFile: The new subset. + + Raises: + AssertionError: If the position or length is out of bounds. + """ start = self._offset + position end = start + length name = name or self.name @@ -216,6 +412,19 @@ def subset(self, position, length, name=None): return SubFile(self._file, self._offset + position, length, name=name) def read(self, size=None): + """ + Reads data from the subset. + + Args: + size (int, optional): The number of bytes to read. If None, reads until the end of the subset. + Defaults to None. + + Returns: + bytes: The read data. + + Raises: + ValueError: If the size is negative. + """ if size is None: size = self._length - self._pos else: @@ -231,6 +440,15 @@ def read(self, size=None): return emptybytes def readline(self): + """ + Reads a line from the subset. + + Returns: + bytes: The read line. + + Raises: + ValueError: If the line length exceeds the remaining subset length. + """ maxsize = self._length - self._pos self._file.seek(self._offset + self._pos) data = self._file.readline() @@ -240,6 +458,18 @@ def readline(self): return data def seek(self, where, whence=0): + """ + Moves the current position within the subset. + + Args: + where (int): The new position. + whence (int, optional): The reference position for the new position. + 0 for absolute, 1 for relative to the current position, 2 for relative to the end. + Defaults to 0. + + Raises: + ValueError: If the `whence` value is invalid. + """ if whence == 0: # Absolute pos = where elif whence == 1: # Relative @@ -252,11 +482,58 @@ def seek(self, where, whence=0): self._pos = pos def tell(self): + """ + Returns the current position within the subset. + + Returns: + int: The current position. 
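+
+        Example (illustrative; "sub" is assumed to be a SubFile returned by
+        CompoundStorage.open_file):
+            sub.seek(10)
+            sub.tell()  # 10 -- the position is relative to the sub-file,
+                        # not to the underlying parent file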
+        """
         return self._pos


 class CompoundWriter:
+    """
+    A class for writing compound files in Whoosh.
+
+    CompoundWriter is responsible for creating compound files, which are files that contain multiple smaller files
+    combined into a single file. This class provides methods to create and manage substreams within the compound file,
+    and to save the compound file either as a single file or as separate files.
+
+    Args:
+        tempstorage (object): The temporary storage object used to create the compound file.
+        buffersize (int, optional): The size of the buffer used for writing data to the compound file. Defaults to
+            32 * 1024 bytes.
+
+    Attributes:
+        _tempstorage (object): The temporary storage object used to create the compound file.
+        _tempname (str): The name of the temporary file used for storing the compound file data.
+        _temp (file-like object): The temporary file object used for writing the compound file data.
+        _buffersize (int): The size of the buffer used for writing data to the compound file.
+        _streams (dict): A dictionary that maps substream names to their corresponding SubStream objects.
+
+    """
+
     def __init__(self, tempstorage, buffersize=32 * 1024):
+        """
+        Initialize a CompoundWriter object.
+
+        Args:
+            tempstorage (object): The temporary storage object used to create the compound file.
+            buffersize (int, optional): The buffer size in bytes for reading and writing data. Defaults to 32 * 1024.
+
+        Raises:
+            AssertionError: If the buffersize is not an integer.
+
+        Notes:
+            - The CompoundWriter object is responsible for managing a compound file, which is a file that contains multiple
+            smaller files combined into a single file.
+            - The tempstorage object should implement the `create_file` method to create a temporary file.
+            - The buffersize determines the size of the buffer used for reading and writing data to the compound file.
+
+        Example:
+            tempstorage = RamStorage()
+            writer = CompoundWriter(tempstorage, buffersize=64 * 1024)
+        """
         assert isinstance(buffersize, int)
         self._tempstorage = tempstorage
         self._tempname = f"{random_name()}.ctmp"
@@ -265,11 +542,50 @@ def __init__(self, tempstorage, buffersize=32 * 1024):
         self._streams = {}

     def create_file(self, name):
+        """
+        Creates a new file with the given name in the compound file.
+
+        Parameters:
+        - name (str): The name of the file to be created.
+
+        Returns:
+        - StructFile: A StructFile object representing the newly created file.
+
+        Description:
+        This method creates a new file with the given name in the compound file.
+        It internally creates a SubStream object with a temporary file and a buffer size.
+        The SubStream object is then stored in the _streams dictionary with the given name as the key.
+        Finally, a StructFile object is returned, which wraps the SubStream object.
+
+        Example usage:
+        writer = CompoundWriter(tempstorage)
+        f = writer.create_file("example.txt")
+        f.write(b"Hello, World!")
+        f.close()
+        """
         ss = self.SubStream(self._temp, self._buffersize)
         self._streams[name] = ss
         return StructFile(ss)

     def _readback(self):
+        """
+        Reads back the contents of the compound file.
+
+        This method reads back the contents of the compound file, yielding each substream's name and a generator that
+        yields the data blocks of the substream. The data blocks are read from either the substream or a temporary file,
+        depending on whether the substream is closed or not.
+ + Returns: + generator: A generator that yields tuples containing the name of the substream and a generator that yields + the data blocks of the substream. + + Example: + compound_file = CompoundFile() + for name, gen in compound_file._readback(): + print(f"Substream: {name}") + for data_block in gen(): + process_data_block(data_block) + """ temp = self._temp for name, substream in self._streams.items(): substream.close() @@ -286,6 +602,28 @@ def gen(): self._tempstorage.delete_file(self._tempname) def save_as_compound(self, dbfile): + """ + Save the current index as a compound file. + + This method writes the index data to a single file in a compound format. + The compound file contains multiple sub-files, each representing a segment + of the index. The directory structure of the compound file is stored at the + beginning of the file. + + Parameters: + dbfile (file-like object): The file-like object to write the compound file to. + + Returns: + None + + Raises: + IOError: If there is an error writing the compound file. + + Usage: + To save the index as a compound file, pass a file-like object to this method. + The file-like object should be opened in binary mode for writing. After calling + this method, the compound file will be written to the provided file-like object. + """ basepos = dbfile.tell() dbfile.write_long(0) # Directory offset dbfile.write_int(0) # Directory length @@ -300,6 +638,30 @@ def save_as_compound(self, dbfile): CompoundStorage.write_dir(dbfile, basepos, directory) def save_as_files(self, storage, name_fn): + """ + Save the compound file as separate files in the given storage. + + Args: + storage (Storage): The storage object where the files will be saved. + name_fn (callable): A function that takes a name and returns the filename. + + Returns: + None + + Raises: + Any exceptions raised by the storage object. + + Notes: + This method saves the compound file as separate files in the given storage. + Each file is created using the provided name_fn function, which takes a name + and returns the filename. The compound file is read back and written to the + separate files block by block. + + Example: + storage = MyStorage() + name_fn = lambda name: name + ".txt" + compound_file.save_as_files(storage, name_fn) + """ for name, blocks in self._readback(): f = storage.create_file(name_fn(name)) for block in blocks(): @@ -307,16 +669,76 @@ def save_as_files(self, storage, name_fn): f.close() class SubStream: + """A class representing a substream for writing data to a file. + + This class is used internally by the `CompoundFileWriter` class to write data to a file in blocks. + It provides methods for writing data to the substream and keeping track of the offsets and lengths of the blocks. + + Attributes: + _dbfile (file): The file object representing the main database file. + _buffersize (int): The maximum size of the buffer before writing to the main file. + _buffer (BytesIO): The buffer used to store the data before writing. + blocks (list): A list of tuples representing the blocks written to the main file. Each tuple contains: + - A BytesIO object if the block is in the buffer, or None if the block is in the main file. + - The offset of the block in the main file. + - The length of the block. + + Methods: + tell(): Returns the current position in the substream. + write(inbytes): Writes the given bytes to the substream. + close(): Closes the substream and writes any remaining data to the main file. 
+ + Usage: + # Create a SubStream object + substream = SubStream(dbfile, buffersize) + + # Write data to the substream + substream.write(inbytes) + + # Get the current position in the substream + position = substream.tell() + + # Close the substream + substream.close() + """ + def __init__(self, dbfile, buffersize): + """ + Initialize a CompoundFile object. + + Args: + dbfile (str): The path to the compound file. + buffersize (int): The size of the buffer used for reading and writing. + + Attributes: + _dbfile (str): The path to the compound file. + _buffersize (int): The size of the buffer used for reading and writing. + _buffer (BytesIO): The buffer used for temporary storage. + blocks (list): The list of blocks in the compound file. + + """ self._dbfile = dbfile self._buffersize = buffersize self._buffer = BytesIO() self.blocks = [] def tell(self): + """Returns the current position in the substream. + + Returns: + int: The current position in the substream. + """ return sum(b[2] for b in self.blocks) + self._buffer.tell() def write(self, inbytes): + """Writes the given bytes to the substream. + + If the length of the buffer exceeds the specified buffer size, the buffer is written to the main file + and a new block is created. + + Args: + inbytes (bytes): The bytes to write to the substream. + """ bio = self._buffer buflen = bio.tell() length = buflen + len(inbytes) @@ -331,6 +753,7 @@ def write(self, inbytes): bio.write(inbytes) def close(self): + """Closes the substream and writes any remaining data to the main file.""" bio = self._buffer length = bio.tell() if length: diff --git a/src/whoosh/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py index 9b3e9985..33f363bd 100644 --- a/src/whoosh/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -14,7 +14,6 @@ # limitations under the License. # =============================================================================== -import os import pickle import re from bisect import bisect_right @@ -31,7 +30,6 @@ LockError, OutOfDateError, ) -from whoosh.support.bitvector import BitVector from whoosh.system import _FLOAT_SIZE, _INT_SIZE _INDEX_VERSION = -105 @@ -68,6 +66,20 @@ def has_deletions(self): class FileIndex(SegmentDeletionMixin, Index): def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): + """ + Represents an index stored in a file-based storage. + + Args: + storage (Storage): The storage object used to store the index files. + schema (Schema): The schema object defining the fields and their types in the index. + create (bool, optional): Whether to create a new index. Defaults to False. + indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. + + Raises: + ValueError: If the provided schema is not a Schema object. + IndexError: If create is True but no schema is specified. + EmptyIndexError: If the index does not exist in the storage. + """ self.storage = storage self.indexname = indexname @@ -107,9 +119,20 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self.segment_num_lock = None def __repr__(self): + """ + Returns a string representation of the FileIndex object. + + Returns: + str: The string representation of the FileIndex object. + """ return f"{self.__class__.__name__}({self.storage!r}, {self.indexname!r})" def _acquire_readlocks(self): + """ + Acquires read locks on the segment files. + + This is used to keep the underlying files open so they don't get deleted from underneath us. 
+ """ self._readlocks = [ self.storage.open_file(name, mapped=False) for name in self.segments.filenames() @@ -117,35 +140,61 @@ def _acquire_readlocks(self): ] def _release_readlocks(self): + """ + Releases the read locks on the segment files. + """ (f.close() for f in self._readlocks) self._readlocks = [] def close(self): + """ + Closes the FileIndex object by releasing the read locks on the segment files. + """ self._release_readlocks() def latest_generation(self): + """ + Returns the latest generation number of the index files. + + Returns: + int: The latest generation number of the index files. + """ pattern = _toc_pattern(self.indexname) - max = -1 + maximum = -1 for filename in self.storage: m = pattern.match(filename) if m: num = int(m.group(1)) - if num > max: - max = num - return max + if num > maximum: + maximum = num + return maximum def refresh(self): + """ + Refreshes the FileIndex object by creating a new instance with the same storage and schema. + + Returns: + FileIndex: The refreshed FileIndex object. + """ if not self.up_to_date(): return self.__class__(self.storage, self.schema, indexname=self.indexname) else: return self def up_to_date(self): + """ + Checks if the FileIndex object is up to date. + + Returns: + bool: True if the FileIndex object is up to date, False otherwise. + """ return self.generation == self.latest_generation() def _write(self): - # Writes the content of this index to the .toc file. + """ + Writes the content of this index to the .toc file. + """ self.schema.clean() # stream = self.storage.create_file(self._toc_filename()) @@ -172,7 +221,18 @@ def _write(self): self.storage.rename_file(tempfilename, self._toc_filename(), safe=True) def _read(self, schema): - # Reads the content of this index from the .toc file. + """ + Reads the content of this index from the .toc file. + + Args: + schema (Schema): The schema object to use. If None, the pickled schema from the saved index will be loaded. + + Raises: + IndexError: If the index was created on an architecture with different data sizes. + IndexError: If there is a byte order problem. + IndexVersionError: If the format of the index is not supported. + + """ stream = self.storage.open_file(self._toc_filename()) if stream.read_varint() != _INT_SIZE or stream.read_varint() != _FLOAT_SIZE: @@ -180,7 +240,7 @@ def _read(self, schema): "Index was created on an architecture with different data sizes" ) - if not stream.read_int() == -12345: + if stream.read_int() != -12345: raise IndexError("Number misread: byte order problem") version = stream.read_int() @@ -208,7 +268,15 @@ def _read(self, schema): stream.close() def _next_segment_name(self): - # Returns the name of the next segment in sequence. + """ + Returns the name of the next segment in sequence. + + Returns: + str: The name of the next segment in sequence. + + Raises: + LockError: If the segment number lock cannot be acquired. + """ if self.segment_num_lock is None: self.segment_num_lock = Lock() @@ -222,21 +290,51 @@ def _next_segment_name(self): raise LockError def _toc_filename(self): - # Returns the computed filename of the TOC for this index name and - # generation. + """ + Returns the computed filename of the TOC (Table of Contents) for this index name and generation. + + Returns: + str: The computed filename of the TOC for this index name and generation. + """ return f"_{self.indexname}_{self.generation}.toc" def last_modified(self): + """ + Returns the last modified timestamp of the TOC file. 
+ + Returns: + float: The last modified timestamp of the TOC file. + """ return self.storage.file_modified(self._toc_filename()) def is_empty(self): - """Low-level: Returns the number of segments in this index.""" + """ + Checks if the index is empty. + + Returns: + bool: True if the index is empty, False otherwise. + """ return len(self.segments) == 0 def segment_count(self): + """ + Returns the number of segments in the index. + + Returns: + int: The number of segments in the index. + """ return len(self.segments) def optimize(self): + """ + Optimizes the index by merging segments if necessary. + + This operation improves search performance. + + Note: + This method only performs optimization if there are more than 1 segments and no deletions. + + """ if len(self.segments) < 2 and not self.segments.has_deletions(): return @@ -246,6 +344,17 @@ def optimize(self): w.commit(OPTIMIZE) def commit(self, new_segments=None): + """ + Commits changes to the index. + + Args: + new_segments (SegmentSet, optional): The new segments to replace the existing segments in the index. + + Raises: + OutOfDateError: If the index is not up to date. + ValueError: If new_segments is provided but is not a SegmentSet. + + """ self._release_readlocks() if not self.up_to_date(): @@ -266,11 +375,13 @@ def commit(self, new_segments=None): self._acquire_readlocks() def _clean_files(self): - # Attempts to remove unused index files (called when a new generation - # is created). If existing Index and/or reader objects have the files - # open, they may not be deleted immediately (i.e. on Windows) but will - # probably be deleted eventually by a later call to clean_files. + """ + Attempts to remove unused index files. + This method is called when a new generation is created. + If existing Index and/or reader objects have the files open, they may not be deleted immediately (i.e. on Windows) + but will probably be deleted eventually by a later call to clean_files. + """ storage = self.storage current_segment_names = {s.name for s in self.segments} @@ -297,18 +408,54 @@ def _clean_files(self): pass def doc_count_all(self): + """ + Returns the total number of documents in the index, including deleted documents. + + Returns: + int: The total number of documents in the index, including deleted documents. + """ return self.segments.doc_count_all() def doc_count(self): + """ + Returns the number of non-deleted documents in the index. + + Returns: + int: The number of non-deleted documents in the index. + """ return self.segments.doc_count() def field_length(self, fieldnum): + """ + Returns the total length of a field in the index. + + Args: + fieldnum (int): The field number. + + Returns: + int: The total length of the field in the index. + """ return sum(s.field_length(fieldnum) for s in self.segments) def reader(self): + """ + Returns a reader object for the index. + + Returns: + IndexReader: The reader object for the index. + """ return self.segments.reader(self.storage, self.schema) def writer(self, **kwargs): + """ + Returns a writer object for the index. + + Args: + **kwargs: Additional keyword arguments to pass to the writer constructor. + + Returns: + IndexWriter: The writer object for the index. + """ from whoosh.filedb.filewriting import SegmentWriter return SegmentWriter(self, **kwargs) @@ -318,8 +465,33 @@ def writer(self, **kwargs): class SegmentSet: - """This class is never instantiated by the user. It is used by the Index - object to keep track of the segments in the index. 
+ """ + This class is used by the Index object to keep track of the segments in the index. + + Attributes: + segments (list): A list of segments in the index. + _doc_offsets (list): A list of document offsets for each segment. + + Methods: + __init__(segments=None): Initializes a new instance of the SegmentSet class. + __repr__(): Returns a string representation of the segments in the set. + __len__(): Returns the number of segments in this set. + __iter__(): Returns an iterator over the segments in this set. + __getitem__(n): Returns the segment at the specified index. + append(segment): Adds a segment to this set. + _document_segment(docnum): Returns the index.Segment object containing the given document number. + _segment_and_docnum(docnum): Returns an (index.Segment, segment_docnum) pair for the segment containing the given document number. + copy(): Returns a deep copy of this set. + filenames(): Returns a set of filenames associated with the segments in this set. + doc_offsets(): Recomputes the document offset list. + doc_count_all(): Returns the total number of documents, DELETED or UNDELETED, in this set. + doc_count(): Returns the number of undeleted documents in this set. + has_deletions(): Returns True if this index has documents that are marked deleted but haven't been optimized out of the index yet. + delete_document(docnum, delete=True): Deletes a document by number. + deleted_count(): Returns the total number of deleted documents in this index. + is_deleted(docnum): Returns True if a given document number is deleted but not yet optimized out of the index. + reader(storage, schema): Returns a reader object for accessing the segments in this set. + """ def __init__(self, segments=None): @@ -335,7 +507,10 @@ def __repr__(self): def __len__(self): """ - :returns: the number of segments in this set. + Returns the number of segments in this set. + + Returns: + int: The number of segments in this set. """ return len(self.segments) @@ -346,44 +521,73 @@ def __getitem__(self, n): return self.segments.__getitem__(n) def append(self, segment): - """Adds a segment to this set.""" + """ + Adds a segment to this set. + Args: + segment (object): The segment to be added. + """ self.segments.append(segment) self._doc_offsets = self.doc_offsets() def _document_segment(self, docnum): - """Returns the index.Segment object containing the given document - number. """ + Returns the index.Segment object containing the given document number. + Args: + docnum (int): The document number. + + Returns: + int: The index of the segment containing the document. + """ offsets = self._doc_offsets if len(offsets) == 1: return 0 return bisect_right(offsets, docnum) - 1 def _segment_and_docnum(self, docnum): - """Returns an (index.Segment, segment_docnum) pair for the segment - containing the given document number. """ + Returns an (index.Segment, segment_docnum) pair for the segment containing the given document number. + Args: + docnum (int): The document number. + + Returns: + tuple: A tuple containing the index.Segment object and the segment_docnum. + """ segmentnum = self._document_segment(docnum) offset = self._doc_offsets[segmentnum] segment = self.segments[segmentnum] return segment, docnum - offset def copy(self): - """:returns: a deep copy of this set.""" + """ + Returns a deep copy of this set. + + Returns: + SegmentSet: A deep copy of this set. 
+ """ return self.__class__([s.copy() for s in self.segments]) def filenames(self): + """ + Returns a set of filenames associated with the segments in this set. + + Returns: + set: A set of filenames. + """ nameset = set() for segment in self.segments: nameset |= segment.filenames() return nameset def doc_offsets(self): - # Recomputes the document offset list. This must be called if you - # change self.segments. + """ + Recomputes the document offset list. This must be called if you change self.segments. + + Returns: + list: A list of document offsets. + """ offsets = [] base = 0 for s in self.segments: @@ -393,51 +597,75 @@ def doc_offsets(self): def doc_count_all(self): """ - :returns: the total number of documents, DELETED or UNDELETED, in this - set. + Returns the total number of documents, DELETED or UNDELETED, in this set. + + Returns: + int: The total number of documents. """ return sum(s.doc_count_all() for s in self.segments) def doc_count(self): """ - :returns: the number of undeleted documents in this set. + Returns the number of undeleted documents in this set. + + Returns: + int: The number of undeleted documents. """ return sum(s.doc_count() for s in self.segments) def has_deletions(self): """ - :returns: True if this index has documents that are marked deleted but - haven't been optimized out of the index yet. This includes - deletions that haven't been written to disk with Index.commit() - yet. + Returns True if this index has documents that are marked deleted but haven't been optimized out of the index yet. + + Returns: + bool: True if there are deleted documents, False otherwise. """ return any(s.has_deletions() for s in self.segments) def delete_document(self, docnum, delete=True): - """Deletes a document by number. - - You must call Index.commit() for the deletion to be written to disk. """ + Deletes a document by number. + Args: + docnum (int): The document number. + delete (bool, optional): Whether to mark the document as deleted. Defaults to True. + """ segment, segdocnum = self._segment_and_docnum(docnum) segment.delete_document(segdocnum, delete=delete) def deleted_count(self): """ - :returns: the total number of deleted documents in this index. + Returns the total number of deleted documents in this index. + + Returns: + int: The total number of deleted documents. """ return sum(s.deleted_count() for s in self.segments) def is_deleted(self, docnum): """ - :returns: True if a given document number is deleted but not yet - optimized out of the index. - """ + Returns True if a given document number is deleted but not yet optimized out of the index. + Args: + docnum (int): The document number. + + Returns: + bool: True if the document is deleted, False otherwise. + """ segment, segdocnum = self._segment_and_docnum(docnum) return segment.is_deleted(segdocnum) def reader(self, storage, schema): + """ + Returns a reader object for accessing the segments in this set. + + Args: + storage (object): The storage object. + schema (object): The schema object. + + Returns: + object: A reader object. + """ from whoosh.filedb.filereading import SegmentReader segments = self.segments @@ -451,17 +679,32 @@ def reader(self, storage, schema): class Segment: - """Do not instantiate this object directly. It is used by the Index object - to hold information about a segment. A list of objects of this class are - pickled as part of the TOC file. - - The TOC file stores a minimal amount of information -- mostly a list of - Segment objects. Segments are the real reverse indexes. 
Having multiple - segments allows quick incremental indexing: just create a new segment for - the new documents, and have the index overlay the new segment over previous - ones for purposes of reading/search. "Optimizing" the index combines the - contents of existing segments into one (removing any deleted documents - along the way). + """Represents a segment in the index. + + Segments are used by the Index object to hold information about a segment. + A segment is a real reverse index that stores a subset of the documents in the index. + Multiple segments allow for quick incremental indexing and efficient searching. + + Attributes: + name (str): The name of the segment. + doccount (int): The maximum document number in the segment. + fieldlength_totals (dict): A dictionary mapping field numbers to the total number of terms in that field across all documents in the segment. + fieldlength_maxes (dict): A dictionary mapping field numbers to the maximum length of the field in any of the documents in the segment. + deleted (set): A set of deleted document numbers, or None if no deleted documents exist in this segment. + + Methods: + __init__(name, doccount, fieldlength_totals, fieldlength_maxes, deleted=None): Initializes a Segment object. + __repr__(): Returns a string representation of the Segment object. + copy(): Creates a copy of the Segment object. + filenames(): Returns a set of filenames associated with the segment. + doc_count_all(): Returns the total number of documents, deleted or undeleted, in this segment. + doc_count(): Returns the number of undeleted documents in this segment. + has_deletions(): Returns True if any documents in this segment are deleted. + deleted_count(): Returns the total number of deleted documents in this segment. + field_length(fieldnum, default=0): Returns the total number of terms in the given field across all documents in this segment. + max_field_length(fieldnum, default=0): Returns the maximum length of the given field in any of the documents in the segment. + delete_document(docnum, delete=True): Deletes or undeletes a document in the segment. + is_deleted(docnum): Returns True if the given document number is deleted. """ EXTENSIONS = { @@ -477,15 +720,14 @@ def __init__( self, name, doccount, fieldlength_totals, fieldlength_maxes, deleted=None ): """ - :param name: The name of the segment (the Index object computes this - from its name and the generation). - :param doccount: The maximum document number in the segment. - :param term_count: Total count of all terms in all documents. - :param fieldlength_totals: A dictionary mapping field numbers to the - total number of terms in that field across all documents in the - segment. - :param deleted: A set of deleted document numbers, or None if no - deleted documents exist in this segment. + Initializes a Segment object. + + Args: + name (str): The name of the segment (the Index object computes this from its name and the generation). + doccount (int): The maximum document number in the segment. + fieldlength_totals (dict): A dictionary mapping field numbers to the total number of terms in that field across all documents in the segment. + fieldlength_maxes (dict): A dictionary mapping field numbers to the maximum length of the field in any of the documents in the segment. + deleted (set, optional): A set of deleted document numbers, or None if no deleted documents exist in this segment. 
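+
+        Example (hypothetical values, for illustration only):
+            seg = Segment("_MAIN_1", doccount=100,
+                          fieldlength_totals={0: 2500},
+                          fieldlength_maxes={0: 90})
+            seg.doc_count_all()  # 100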
""" self.name = name @@ -495,15 +737,27 @@ def __init__( self.deleted = deleted self._filenames = set() - for attr, ext in self.EXTENSIONS.iteritems(): + for attr, ext in self.EXTENSIONS.items(): fname = f"{self.name}.{ext}" setattr(self, attr + "_filename", fname) self._filenames.add(fname) def __repr__(self): + """ + Returns a string representation of the Segment object. + + Returns: + str: A string representation of the Segment object. + """ return f"{self.__class__.__name__}({self.name!r})" def copy(self): + """ + Creates a copy of the Segment object. + + Returns: + Segment: A copy of the Segment object. + """ if self.deleted: deleted = set(self.deleted) else: @@ -517,59 +771,91 @@ def copy(self): ) def filenames(self): + """ + Returns a set of filenames associated with the segment. + + Returns: + set: A set of filenames associated with the segment. + """ return self._filenames def doc_count_all(self): """ - :returns: the total number of documents, DELETED OR UNDELETED, in this - segment. + Returns the total number of documents, deleted or undeleted, in this segment. + + Returns: + int: The total number of documents in this segment. """ return self.doccount def doc_count(self): """ - :returns: the number of (undeleted) documents in this segment. + Returns the number of undeleted documents in this segment. + + Returns: + int: The number of undeleted documents in this segment. """ return self.doccount - self.deleted_count() def has_deletions(self): """ - :returns: True if any documents in this segment are deleted. + Returns True if any documents in this segment are deleted. + + Returns: + bool: True if any documents in this segment are deleted, False otherwise. """ return self.deleted_count() > 0 def deleted_count(self): """ - :returns: the total number of deleted documents in this segment. + Returns the total number of deleted documents in this segment. + + Returns: + int: The total number of deleted documents in this segment. """ if self.deleted is None: return 0 return len(self.deleted) def field_length(self, fieldnum, default=0): - """Returns the total number of terms in the given field across all - documents in this segment. + """ + Returns the total number of terms in the given field across all documents in this segment. + + Args: + fieldnum (int): The internal number of the field. + default (int, optional): The default value to return if the field number is not found. - :param fieldnum: the internal number of the field. + Returns: + int: The total number of terms in the given field across all documents in this segment. """ return self.fieldlength_totals.get(fieldnum, default) def max_field_length(self, fieldnum, default=0): - """Returns the maximum length of the given field in any of the - documents in the segment. + """ + Returns the maximum length of the given field in any of the documents in the segment. + + Args: + fieldnum (int): The internal number of the field. + default (int, optional): The default value to return if the field number is not found. - :param fieldnum: the internal number of the field. + Returns: + int: The maximum length of the given field in any of the documents in the segment. """ return self.fieldlength_maxes.get(fieldnum, default) def delete_document(self, docnum, delete=True): - """Deletes the given document number. The document is not actually - removed from the index until it is optimized. - - :param docnum: The document number to delete. - :param delete: If False, this undeletes a deleted document. 
""" + Deletes or undeletes the given document number. + + The document is not actually removed from the index until it is optimized. + Args: + docnum (int): The document number to delete or undelete. + delete (bool, optional): If True, deletes the document. If False, undeletes a deleted document. + + Raises: + KeyError: If the document number is already deleted or not deleted. + """ if delete: if self.deleted is None: self.deleted = set() @@ -586,8 +872,15 @@ def delete_document(self, docnum, delete=True): self.deleted.clear(docnum) def is_deleted(self, docnum): - """:returns: True if the given document number is deleted.""" + """ + Returns True if the given document number is deleted. + + Args: + docnum (int): The document number. + Returns: + bool: True if the given document number is deleted, False otherwise. + """ if self.deleted is None: return False return docnum in self.deleted @@ -597,16 +890,40 @@ def is_deleted(self, docnum): def _toc_pattern(indexname): - """Returns a regular expression object that matches TOC filenames. - name is the name of the index. """ - + Returns a regular expression object that matches TOC filenames. + + Parameters: + indexname (str): The name of the index. + + Returns: + re.Pattern: A regular expression object that matches TOC filenames. + + Example: + >>> pattern = _toc_pattern("myindex") + >>> pattern.match("_myindex_1.toc") + + >>> pattern.match("_myindex_2.toc") + + >>> pattern.match("_otherindex_1.toc") + None + """ return re.compile(f"_{indexname}_([0-9]+).toc") def _segment_pattern(indexname): - """Returns a regular expression object that matches segment filenames. - name is the name of the index. """ + Returns a regular expression object that matches segment filenames. + + Args: + indexname (str): The name of the index. + Returns: + re.Pattern: A regular expression object that matches segment filenames. + + Example: + >>> pattern = _segment_pattern("my_index") + >>> pattern.match("_my_index_001.fdt") + + """ return re.compile(f"(_{indexname}_[0-9]+).({Segment.EXTENSIONS.values()})") diff --git a/src/whoosh/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py index abbae1b5..cb409505 100644 --- a/src/whoosh/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -26,6 +26,19 @@ class BlockInfo: + """ + Represents information about a block in a file-based posting list. + + Attributes: + nextoffset (int): The offset of the next block in the file. + postcount (int): The number of postings in the block. + maxweight (int): The maximum weight of the postings in the block. + maxwol (float): The maximum weight of a single posting in the block. + minlength (int): The minimum length of the terms in the block. + maxid (int or str): The maximum term ID in the block. + dataoffset (int): The offset of the block's data in the file. + """ + __slots__ = ( "nextoffset", "postcount", @@ -49,6 +62,18 @@ def __init__( maxid=None, dataoffset=None, ): + """ + Initializes a new instance of the BlockInfo class. + + Args: + nextoffset (int, optional): The offset of the next block in the file. + postcount (int, optional): The number of postings in the block. + maxweight (int, optional): The maximum weight of the postings in the block. + maxwol (float, optional): The maximum weight of a single posting in the block. + minlength (int, optional): The minimum length of the terms in the block. + maxid (int or str, optional): The maximum term ID in the block. + dataoffset (int, optional): The offset of the block's data in the file. 
+ """ self.nextoffset = nextoffset self.postcount = postcount self.maxweight = maxweight @@ -58,6 +83,12 @@ def __init__( self.dataoffset = dataoffset def __repr__(self): + """ + Returns a string representation of the BlockInfo object. + + Returns: + str: A string representation of the BlockInfo object. + """ return ( "<%s nextoffset=%r postcount=%r maxweight=%r" " maxwol=%r minlength=%r" @@ -75,6 +106,12 @@ def __repr__(self): ) def to_file(self, file): + """ + Writes the BlockInfo object to a file. + + Args: + file (file-like object): The file to write to. + """ file.write( self._struct.pack( self.nextoffset, @@ -94,10 +131,26 @@ def to_file(self, file): file.write_uint(maxid) def _read_id(self, file): + """ + Reads the maximum term ID from a file. + + Args: + file (file-like object): The file to read from. + """ self.maxid = file.read_uint() @staticmethod def from_file(file, stringids=False): + """ + Creates a new BlockInfo object from a file. + + Args: + file (file-like object): The file to read from. + stringids (bool, optional): Whether the term IDs are stored as strings. + + Returns: + BlockInfo: A new BlockInfo object. + """ ( nextoffset, xi1, @@ -128,6 +181,36 @@ def from_file(file, stringids=False): class FilePostingWriter(PostingWriter): + """ + A class for writing posting lists to a file-based index. + + Args: + schema (Schema): The schema of the index. + postfile (file): The file object to write the posting lists to. + stringids (bool, optional): Whether the document ids are strings. Defaults to False. + blocklimit (int, optional): The maximum number of postings to store in a block. Defaults to 128. + + Raises: + ValueError: If the blocklimit argument is greater than 255 or less than 1. + + Attributes: + schema (Schema): The schema of the index. + postfile (file): The file object to write the posting lists to. + stringids (bool): Whether the document ids are strings. + blocklimit (int): The maximum number of postings to store in a block. + inblock (bool): Indicates if currently inside a block. + fieldnum (int): The field number being written. + format (Codec): The codec for the field being written. + blockcount (int): The number of blocks written. + posttotal (int): The total number of postings written. + startoffset (int): The offset in the file where the current block starts. + blockids (list): The list of document ids in the current block. + blockweights (list): The list of weights in the current block. + blockvalues (list): The list of values in the current block. + blockoffset (int): The offset in the file where the current block is written. + + """ + def __init__(self, schema, postfile, stringids=False, blocklimit=128): self.schema = schema self.postfile = postfile @@ -141,6 +224,9 @@ def __init__(self, schema, postfile, stringids=False, blocklimit=128): self.inblock = False def _reset_block(self): + """ + Resets the current block's data structures. + """ if self.stringids: self.blockids = [] else: @@ -150,8 +236,21 @@ def _reset_block(self): self.blockoffset = self.postfile.tell() def start(self, fieldnum): + """ + Starts a new block for writing postings. + + Args: + fieldnum (int): The field number being written. + + Returns: + int: The offset in the file where the block starts. + + Raises: + ValueError: If called while already inside a block. 
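+
+        Example (illustrative sketch; the schema, posting file, doc id and
+        encoded posting value are assumed to exist already):
+            writer = FilePostingWriter(schema, postfile)
+            offset = writer.start(0)          # begin postings for field 0
+            writer.write(5, encoded_value)    # add a posting for document 5
+            total = writer.finish()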
+ + """ if self.inblock: - raise Exception("Called start() in a block") + raise ValueError("Cannot call start() while already in a block") self.fieldnum = fieldnum self.format = self.schema[fieldnum].format @@ -168,6 +267,14 @@ def start(self, fieldnum): return self.startoffset def write(self, id, valuestring): + """ + Writes a posting to the current block. + + Args: + id: The document id. + valuestring: The value associated with the document. + + """ self.blockids.append(id) self.blockvalues.append(valuestring) self.blockweights.append(self.format.decode_weight(valuestring)) @@ -175,13 +282,23 @@ def write(self, id, valuestring): self._write_block() def finish(self): + """ + Finishes writing the current block. + + Returns: + int: The total number of postings written. + + Raises: + ValueError: If called when not in a block. + + """ if not self.inblock: - raise Exception("Called finish() when not in a block") + raise ValueError("Called finish() when not in a block") if self.blockids: self._write_block() - # Seek back to the start of this list of posting blocks and writer the + # Seek back to the start of this list of posting blocks and write the # number of blocks pf = self.postfile pf.flush() @@ -194,11 +311,19 @@ def finish(self): return self.posttotal def close(self): + """ + Closes the posting writer. + + """ if hasattr(self, "blockids") and self.blockids: self.finish() self.postfile.close() def _write_block(self): + """ + Writes the current block to the file. + + """ posting_size = self.format.posting_size dfl_fn = self.dfl_fn fieldnum = self.fieldnum @@ -267,7 +392,67 @@ def _write_block(self): class FilePostingReader(Matcher): + """ + A class for reading posting data from a file-like object. + + This class is responsible for reading posting data from a file-like object and providing + convenient methods to access the IDs, values, and weights of the postings. + + Args: + postfile (file-like object): The file-like object representing the posting file. + offset (int): The offset in the file where the posting data starts. + format (PostingFormat): The format of the posting data. + scorefns (tuple, optional): A tuple of score functions (score, quality, block_quality). + Defaults to None. + stringids (bool, optional): Indicates whether the IDs are stored as strings. + Defaults to False. + + Attributes: + postfile (file-like object): The file-like object representing the posting file. + startoffset (int): The offset in the file where the posting data starts. + format (PostingFormat): The format of the posting data. + _scorefns (tuple): A tuple of score functions (score, quality, block_quality). + stringids (bool): Indicates whether the IDs are stored as strings. + blockcount (int): The number of blocks in the posting file. + baseoffset (int): The offset in the file where the posting data starts. + _active (bool): Indicates whether the FilePostingReader object is active. + currentblock (int): The index of the current block being read. + ids (list): The IDs of the postings in the current block. + values (list): The values of the postings in the current block. + weights (list): The weights of the postings in the current block. + i (int): The index of the current posting within the current block. + + Methods: + copy(): Creates a copy of the FilePostingReader object. + is_active(): Checks if the FilePostingReader object is active. + id(): Returns the ID of the current posting. + value(): Returns the value of the current posting. + weight(): Returns the weight of the current posting. 
+ all_ids(): Generator that yields all the IDs in the posting file. + next(): Moves to the next posting in the posting file. + skip_to(id): Skips to the posting with the specified ID. + + """ + def __init__(self, postfile, offset, format, scorefns=None, stringids=False): + """ + Initializes a FilePostingReader object. + + Args: + postfile (file-like object): The file-like object representing the posting file. + offset (int): The offset in the file where the posting data starts. + format (PostingFormat): The format of the posting data. + scorefns (tuple, optional): A tuple of score functions (score, quality, block_quality). + Defaults to None. + stringids (bool, optional): Indicates whether the IDs are stored as strings. + Defaults to False. + + Raises: + None + + Returns: + None + """ self.postfile = postfile self.startoffset = offset self.format = format @@ -292,6 +477,18 @@ def __init__(self, postfile, offset, format, scorefns=None, stringids=False): self._next_block() def copy(self): + """ + Creates a copy of the FilePostingReader object. + + Args: + None + + Raises: + None + + Returns: + FilePostingReader: A copy of the FilePostingReader object. + """ return self.__class__( self.postfile, self.startoffset, @@ -301,18 +498,78 @@ def copy(self): ) def is_active(self): + """ + Checks if the FilePostingReader object is active. + + Args: + None + + Raises: + None + + Returns: + bool: True if the FilePostingReader object is active, False otherwise. + """ return self._active def id(self): + """ + Returns the ID of the current posting. + + Args: + None + + Raises: + None + + Returns: + int or str: The ID of the current posting. + """ return self.ids[self.i] def value(self): + """ + Returns the value of the current posting. + + Args: + None + + Raises: + None + + Returns: + object: The value of the current posting. + """ return self.values[self.i] def weight(self): + """ + Returns the weight of the current posting. + + Args: + None + + Raises: + None + + Returns: + float: The weight of the current posting. + """ return self.weights[self.i] def all_ids(self): + """ + Generator that yields all the IDs in the posting file. + + Args: + None + + Raises: + None + + Yields: + int or str: The IDs in the posting file. + """ nextoffset = self.baseoffset for _ in range(self.blockcount): blockinfo = self._read_blockinfo(nextoffset) @@ -321,6 +578,18 @@ def all_ids(self): yield from ids def next(self): + """ + Moves to the next posting in the posting file. + + Args: + None + + Raises: + None + + Returns: + bool: True if there is a next posting, False otherwise. + """ if self.i == self.blockinfo.postcount - 1: self._next_block() return True @@ -329,6 +598,18 @@ def next(self): return False def skip_to(self, id): + """ + Skips to the posting with the specified ID. + + Args: + id (int or str): The ID to skip to. + + Raises: + ReadTooFar: If the skip operation goes beyond the end of the posting file. + + Returns: + None + """ if not self.is_active(): raise ReadTooFar @@ -355,11 +636,36 @@ def skip_to(self, id): self.i = i def _read_blockinfo(self, offset): + """ + Reads the block information from the posting file. + + Args: + offset (int): The offset in the posting file where the block information starts. + + Raises: + None + + Returns: + BlockInfo: The block information. + """ pf = self.postfile pf.seek(offset) return BlockInfo.from_file(pf, self.stringids) def _read_ids(self, offset, postcount): + """ + Reads the IDs from the posting file. 
+ + Args: + offset (int): The offset in the posting file where the IDs start. + postcount (int): The number of IDs to read. + + Raises: + None + + Returns: + tuple: A tuple containing the IDs and the offset after reading. + """ pf = self.postfile pf.seek(offset) @@ -372,10 +678,37 @@ def _read_ids(self, offset, postcount): return (ids, pf.tell()) def _read_weights(self, offset, postcount): + """ + Reads the weights from the posting file. + + Args: + offset (int): The offset in the posting file where the weights start. + postcount (int): The number of weights to read. + + Raises: + None + + Returns: + tuple: A tuple containing the weights and the offset after reading. + """ weights = self.postfile.get_array(offset, "f", postcount) return (weights, offset + _FLOAT_SIZE * postcount) def _read_values(self, startoffset, endoffset, postcount): + """ + Reads the values from the posting file. + + Args: + startoffset (int): The offset in the posting file where the values start. + endoffset (int): The offset in the posting file where the values end. + postcount (int): The number of values to read. + + Raises: + None + + Returns: + list: A list of values. + """ pf = self.postfile posting_size = self.format.posting_size @@ -412,6 +745,18 @@ def _read_values(self, startoffset, endoffset, postcount): return values def _consume_block(self): + """ + Consumes the current block by reading the IDs, weights, and values. + + Args: + None + + Raises: + None + + Returns: + None + """ postcount = self.blockinfo.postcount self.ids, woffset = self._read_ids(self.blockinfo.dataoffset, postcount) self.weights, voffset = self._read_weights(woffset, postcount) @@ -419,12 +764,31 @@ def _consume_block(self): self.i = 0 def _next_block(self, consume=True): + """ + Moves to the next block in the posting file. + + Args: + consume (bool, optional): Indicates whether to consume the block by reading the IDs, weights, and values. + Defaults to True. + + Raises: + None + + Returns: + None + """ self.currentblock += 1 if self.currentblock == self.blockcount: self._active = False return if self.currentblock == 0: + self.blockinfo = self._read_blockinfo(self.baseoffset) + else: + self.blockinfo = self._read_blockinfo(self.blockinfo.nextoffset) + + if consume: + self._consume_block() pos = self.baseoffset else: pos = self.blockinfo.nextoffset @@ -434,6 +798,18 @@ def _next_block(self, consume=True): self._consume_block() def _skip_to_block(self, targetfn): + """ + Skips to the block that satisfies the target function. + + Args: + targetfn (function): The target function that determines whether to skip to the next block. + + Raises: + None + + Returns: + int: The number of blocks skipped. + """ skipped = 0 while self._active and targetfn(): self._next_block(consume=False) @@ -445,19 +821,79 @@ def _skip_to_block(self, targetfn): return skipped def supports_quality(self): + """ + Checks if the FilePostingReader object supports quality scoring. + + Args: + None + + Raises: + None + + Returns: + bool: True if the FilePostingReader object supports quality scoring, False otherwise. + """ return True def skip_to_quality(self, minquality): + """ + Skips to the block with the minimum quality score. + + Args: + minquality (float): The minimum quality score. + + Raises: + None + + Returns: + int: The number of blocks skipped. 
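+
+        Example:
+            An illustrative sketch only; it assumes this reader was created
+            with a ``scorefns`` tuple so that ``block_quality()`` is usable,
+            and the names ``reader`` and ``min_q`` are hypothetical:
+
+                while reader.is_active():
+                    if reader.block_quality() <= min_q:
+                        # Jump past blocks that cannot improve on min_q.
+                        reader.skip_to_quality(min_q)
+                        continue
+                    docnum = reader.id()
+                    # ...collect/score docnum here...
+                    reader.next()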
+ """ bq = self.block_quality if bq() > minquality: return 0 return self._skip_to_block(lambda: bq() <= minquality) def quality(self): - raise Exception("No quality function given") + """ + Raises a ValueError indicating that no quality function is given. + + Args: + None + + Raises: + ValueError: No quality function given. + + Returns: + None + """ + raise ValueError("No quality function given") def block_quality(self): - raise Exception("No block_quality function given") + """ + Raises a ValueError indicating that no block_quality function is given. + + Args: + None + + Raises: + ValueError: No block_quality function given. + + Returns: + None + """ + raise ValueError("No block_quality function given") def score(self): - raise Exception("No score function given") + """ + Raises a ValueError indicating that no score function is given. + + Args: + None + + Raises: + ValueError: No score function given. + + Returns: + None + """ + raise ValueError("No score function given") diff --git a/src/whoosh/filedb/filereading.py b/src/whoosh/filedb/filereading.py index 8b3ea6e4..b3a320b7 100644 --- a/src/whoosh/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -35,7 +35,87 @@ class SegmentReader(IndexReader): + """ + A class for reading data from a segment in a Whoosh index. + + This class provides methods for accessing various information and data stored in a segment of a Whoosh index. + It is used internally by the Whoosh library and should not be instantiated directly by users. + + Parameters: + - storage (Storage): The storage object representing the index storage. + - segment (Segment): The segment object representing the segment to read from. + - schema (Schema): The schema object representing the index schema. + + Attributes: + - storage (Storage): The storage object representing the index storage. + - segment (Segment): The segment object representing the segment being read from. + - schema (Schema): The schema object representing the index schema. + - termsindex (FileTableReader): The file table reader for the term index. + - postfile (File): The file object for the term postings file. + - vectorindex (StructHashReader): The struct hash reader for the vector index. + - vpostfile (File): The file object for the vector postings file. + - storedfields (FileListReader): The file list reader for the stored fields file. + - fieldlengths (list): A list of field lengths. + - has_deletions (bool): Indicates whether the segment has deletions. + - is_deleted (callable): A callable object that checks if a document is deleted. + - doc_count (int): The number of documents in the segment. + - dc (int): The total number of documents in the segment, including deleted documents. + - is_closed (bool): Indicates whether the segment reader is closed. + - _sync_lock (Lock): A lock object for synchronization. + + Methods: + - _open_vectors(): Opens the vector index and vector postings file. + - _open_postfile(): Opens the term postings file. + - close(): Closes the segment reader. + - doc_count_all(): Returns the total number of documents in the segment. + - stored_fields(docnum): Returns the stored fields for a given document number. + - all_stored_fields(): Returns an iterator over all stored fields in the segment. + - field_length(fieldnum): Returns the length of a field in the segment. + - doc_field_length(docnum, fieldnum, default=0): Returns the length of a field in a document. + - max_field_length(fieldnum): Returns the maximum length of a field in the segment. 
+ - has_vector(docnum, fieldnum): Checks if a document has a vector for a given field. + - __iter__(): Returns an iterator over the terms in the segment. + - iter_from(fieldnum, text): Returns an iterator over the terms starting from a given field and text. + - _term_info(fieldnum, text): Returns the term info for a given field and text. + - doc_frequency(fieldid, text): Returns the document frequency of a term in a field. + - frequency(fieldid, text): Returns the frequency of a term in a field. + - lexicon(fieldid): Returns an iterator over the terms in a field. + - expand_prefix(fieldid, prefix): Returns an iterator over the terms with a given prefix in a field. + - postings(fieldid, text, exclude_docs=frozenset()): Returns a posting reader for a term in a field. + - vector(docnum, fieldid): Returns a vector reader for a document and field. + + """ + def __init__(self, storage, segment, schema): + """ + Initialize a Filereading object. + + Args: + storage (Storage): The storage object used to access the index files. + segment (Segment): The segment object representing a segment of the index. + schema (Schema): The schema object representing the index schema. + + Attributes: + storage (Storage): The storage object used to access the index files. + segment (Segment): The segment object representing a segment of the index. + schema (Schema): The schema object representing the index schema. + termsindex (FileTableReader): The file table reader for the term index. + postfile (None or FileTableReader): The file table reader for the term postings file. + vectorindex (None or FileTableReader): The file table reader for the vector index. + vpostfile (None or FileTableReader): The file table reader for the vector postings file. + storedfields (FileListReader): The file list reader for the stored fields file. + fieldlengths (list): The list of field lengths. + has_deletions (bool): Indicates if the segment has deletions. + is_deleted (function): Function to check if a document is deleted. + doc_count (int): The number of documents in the segment. + dc (int): The total number of documents in the segment, including deleted documents. + is_closed (bool): Indicates if the Filereading object is closed. + _sync_lock (Lock): Lock object for synchronization. + + Note: + The Filereading object provides access to various index files and information related to a segment of the index. + It is used internally by the Whoosh library and should not be instantiated directly by the user. + """ self.storage = storage self.segment = segment self.schema = schema @@ -84,6 +164,27 @@ def decode_storedfields(value): self._sync_lock = Lock() def _open_vectors(self): + """ + Opens the vector index and vector postings file. + + This method is responsible for opening the vector index and vector postings file + associated with the current storage and segment. It initializes the `vectorindex` + attribute with a StructHashReader object for reading the vector index, and sets + the `vpostfile` attribute to the opened vector postings file. + + Note: + This method assumes that the `vectorindex_filename` and `vectorposts_filename` + attributes of the segment object have been properly set. + + Args: + None + + Returns: + None + + Raises: + None + """ if self.vectorindex: return @@ -97,6 +198,21 @@ def _open_vectors(self): self.vpostfile = storage.open_file(segment.vectorposts_filename, mapped=False) def _open_postfile(self): + """ + Opens the postfile for reading. 
+ + This method is responsible for opening the postfile associated with the segment + for reading. If the postfile is already open, this method does nothing. + + Returns: + None + + Raises: + None + + Usage: + _open_postfile() + """ if self.postfile: return self.postfile = self.storage.open_file( @@ -104,13 +220,50 @@ def _open_postfile(self): ) def __repr__(self): + """ + Return a string representation of the object. + + This method returns a string that represents the object in a unique and + human-readable format. It is used primarily for debugging and logging + purposes. + + Returns: + str: A string representation of the object. + """ return f"{self.__class__.__name__}({self.segment})" @protected def __contains__(self, term): + """ + Check if a term is present in the index. + + Args: + term (tuple): A tuple representing the term to be checked. The tuple should + contain two elements: the first element is the term's numeric + representation, and the second element is the term's string + representation. + + Returns: + bool: True if the term is present in the index, False otherwise. + """ return (self.schema.to_number(term[0]), term[1]) in self.termsindex def close(self): + """ + Closes the file reader and releases any associated resources. + + This method closes the stored fields, terms index, post file, and vector index + if they are open. It also marks the file reader as closed. + + Note: + If the `fieldlengths` attribute is uncommented, it will also be closed. + + Usage: + Call this method when you are finished using the file reader to release + any resources it holds. After calling this method, the file reader should + not be used again. + + """ self.storedfields.close() self.termsindex.close() if self.postfile: @@ -122,55 +275,267 @@ def close(self): self.is_closed = True def doc_count_all(self): + """ + Returns the total number of documents in the index. + + This method retrieves the document count from the index and returns it. + + Returns: + int: The total number of documents in the index. + + Example: + >>> reader = FileReader() + >>> reader.doc_count_all() + 100 + """ return self.dc @protected def stored_fields(self, docnum): + """ + Retrieve the stored fields for a given document number. + + Parameters: + docnum (int): The document number for which to retrieve the stored fields. + + Returns: + dict: A dictionary containing the stored fields for the specified document number. + + Raises: + IndexError: If the specified document number is out of range. + + Example: + >>> reader = FileReading() + >>> reader.stored_fields(0) + {'title': 'Sample Document', 'author': 'John Doe', 'content': 'This is a sample document.'} + """ return self.storedfields[docnum] @protected def all_stored_fields(self): + """ + Generator that yields the stored fields of all non-deleted documents in the segment. + + Yields: + dict: A dictionary containing the stored fields of a document. + + Notes: + - This method iterates over all document numbers in the segment and checks if each document is deleted. + - If a document is not deleted, it yields the stored fields of that document. + - The stored fields are returned as a dictionary. + + Example: + >>> reader = FileReading() + >>> for fields in reader.all_stored_fields(): + ... print(fields) + {'title': 'Document 1', 'content': 'This is the content of document 1'} + {'title': 'Document 2', 'content': 'This is the content of document 2'} + {'title': 'Document 3', 'content': 'This is the content of document 3'} + ... 
+ """ is_deleted = self.segment.is_deleted for docnum in range(self.segment.doc_count_all()): if not is_deleted(docnum): yield self.storedfields[docnum] def field_length(self, fieldnum): + """ + Returns the length of a field in the segment. + + Parameters: + - fieldnum (int): The field number. + + Returns: + - int: The length of the field. + + Raises: + - ValueError: If the field number is invalid. + + This method retrieves the length of a field in the segment. The field number + should be a valid field number. If the field number is invalid, a ValueError + is raised. + + Example usage: + >>> segment = Segment() + >>> field_length = segment.field_length(0) + >>> print(field_length) + 10 + """ return self.segment.field_length(fieldnum) @protected def doc_field_length(self, docnum, fieldnum, default=0): + """ + Returns the length of a field in a document. + + Parameters: + - docnum (int): The document number. + - fieldnum (int): The field number. + - default (int, optional): The default value to return if the field length is not found. Defaults to 0. + + Returns: + - int: The length of the field in the document. + + Raises: + - IndexError: If the field number is out of range. + - IndexError: If the document number is out of range. + + This method retrieves the length of a field in a document from the internal data structure. + It uses the document number and field number to calculate the position in the fieldlengths array, + and then converts the byte value at that position to the corresponding length using the byte_to_length function. + + Example usage: + ``` + reader = FileReader() + length = reader.doc_field_length(10, 2) + print(length) # Output: 42 + ``` + """ index = self.indices[fieldnum] pos = index * self.dc + docnum return byte_to_length(self.fieldlengths[pos]) def max_field_length(self, fieldnum): + """ + Returns the maximum length of a field in the segment. + + Parameters: + fieldnum (int): The field number. + + Returns: + int: The maximum length of the field. + + Raises: + ValueError: If the field number is invalid. + + This method retrieves the maximum length of a field in the segment. The field number + should be a valid field number within the segment. If the field number is invalid, + a ValueError is raised. + + Example usage: + segment = Segment() + field_length = segment.max_field_length(0) + print(field_length) # Output: 100 + """ return self.segment.max_field_length(fieldnum) @protected def has_vector(self, docnum, fieldnum): + """ + Check if a vector exists for a given document number and field number. + + Parameters: + docnum (int): The document number. + fieldnum (int): The field number. + + Returns: + bool: True if the vector exists, False otherwise. + + Raises: + None + + Notes: + - This method assumes that the vectors have been opened using the _open_vectors() method. + - The vectorindex is a dictionary that stores the document and field numbers as keys, and the vectors as values. + """ self._open_vectors() return (docnum, fieldnum) in self.vectorindex @protected def __iter__(self): + """ + Iterate over the terms index and yield tuples containing file name, term, post count, and total frequency. + + Yields: + tuple: A tuple containing the file name, term, post count, and total frequency. + + Notes: + This method is used to iterate over the terms index in the `filereading` module. The terms index is a list of + tuples, where each tuple contains information about a term in the index. 
The tuple structure is as follows: + ((file_name, term), (total_frequency, _, post_count)). + + The method iterates over each tuple in the terms index and yields a tuple containing the file name, term, + post count, and total frequency. + + Example: + >>> reader = FileReader() + >>> for file_name, term, post_count, total_freq in reader: + ... print(file_name, term, post_count, total_freq) + """ for (fn, t), (totalfreq, _, postcount) in self.termsindex: yield (fn, t, postcount, totalfreq) @protected def iter_from(self, fieldnum, text): + """ + Iterates over the terms index starting from a specific field number and text. + + Args: + fieldnum (int): The field number to start iterating from. + text (str): The text to start iterating from. + + Yields: + tuple: A tuple containing the field number, term, postcount, and total frequency. + + """ tt = self.termsindex for (fn, t), (totalfreq, _, postcount) in tt.items_from((fieldnum, text)): yield (fn, t, postcount, totalfreq) @protected def _term_info(self, fieldnum, text): + """ + Retrieve the term information for a given field and text. + + This method returns the term information (e.g., frequency, positions) for a specific term in a specific field. + It looks up the term in the termsindex dictionary, which is a mapping of (fieldnum, text) tuples to term information. + + Parameters: + - fieldnum (int): The field number of the term. + - text (str): The text of the term. + + Returns: + - TermInfo: An object containing the term information. + + Raises: + - TermNotFound: If the term is not found in the termsindex dictionary. + + Usage: + term_info = _term_info(fieldnum, text) + """ + try: return self.termsindex[(fieldnum, text)] except KeyError: raise TermNotFound(f"{fieldnum}:{text!r}") def doc_frequency(self, fieldid, text): + """ + Returns the document frequency of a given term in a specific field. + + Parameters: + - fieldid (str): The ID of the field. + - text (str): The term to calculate the document frequency for. + + Returns: + - int: The document frequency of the term in the field. + + Raises: + - TermNotFound: If the term is not found in the field. + + This method calculates the document frequency of a given term in a specific field. + It first converts the field ID to a field number using the schema. + Then, it retrieves the term information using the field number and the term. + Finally, it returns the document frequency from the term information. + + Example usage: + ``` + field_id = "content" + term = "python" + frequency = doc_frequency(field_id, term) + print(f"The document frequency of '{term}' in field '{field_id}' is {frequency}.") + ``` + """ try: fieldnum = self.schema.to_number(fieldid) return self._term_info(fieldnum, text)[2] @@ -178,6 +543,23 @@ def doc_frequency(self, fieldid, text): return 0 def frequency(self, fieldid, text): + """ + Returns the frequency of a given term in a specified field. + + Args: + fieldid (str): The ID of the field. + text (str): The term to get the frequency for. + + Returns: + int: The frequency of the term in the field. + + Raises: + TermNotFound: If the term is not found in the field. 
+ + Example: + >>> frequency("title", "python") + 3 + """ try: fieldnum = self.schema.to_number(fieldid) return self._term_info(fieldnum, text)[0] @@ -186,10 +568,29 @@ def frequency(self, fieldid, text): @protected def lexicon(self, fieldid): - # The base class has a lexicon() implementation that uses iter_from() - # and throws away the value, but overriding to use - # FileTableReader.keys_from() is much, much faster. + """ + Returns an iterator over the terms in the lexicon for the specified field. + + Args: + fieldid (str): The field identifier. + + Yields: + str: The terms in the lexicon for the specified field. + Raises: + None. + + Notes: + - This method overrides the base class implementation to use FileTableReader.keys_from() + for faster performance. + - The lexicon is a collection of unique terms in a field. + - The terms are yielded in lexicographic order. + + Example: + reader = FileTableReader() + for term in reader.lexicon("content"): + print(term) + """ tt = self.termsindex fieldid = self.schema.to_number(fieldid) for fn, t in tt.keys_from((fieldid, "")): @@ -199,10 +600,31 @@ def lexicon(self, fieldid): @protected def expand_prefix(self, fieldid, prefix): - # The base class has an expand_prefix() implementation that uses - # iter_from() and throws away the value, but overriding to use - # FileTableReader.keys_from() is much, much faster. + """ + Expand a prefix in a specific field. + + This method expands a given prefix in a specific field of the index. It uses the `FileTableReader.keys_from()` method for faster performance compared to the base class implementation. + + Parameters: + - fieldid (str): The ID of the field to expand the prefix in. + - prefix (str): The prefix to expand. + + Yields: + - str: The expanded terms that match the given prefix in the specified field. + Note: + - The `fieldid` parameter should be a valid field ID defined in the schema. + - The `prefix` parameter should be a string representing the prefix to expand. + + Example: + ``` + reader = FileTableReader() + for term in reader.expand_prefix("title", "comp"): + print(term) + ``` + + This will print all the terms in the "title" field that start with the prefix "comp". + """ tt = self.termsindex fieldid = self.schema.to_number(fieldid) for fn, t in tt.keys_from((fieldid, prefix)): @@ -211,9 +633,43 @@ def expand_prefix(self, fieldid, prefix): yield t def postings(self, fieldid, text, exclude_docs=frozenset()): + """ + Returns a postreader object that allows iterating over the postings (document ids) for a given field and text. + + Args: + fieldid (str): The field identifier. + text (str): The text to search for in the field. + exclude_docs (frozenset, optional): A set of document ids to exclude from the postings. Defaults to an empty set. + + Returns: + FilePostingReader: A postreader object that provides access to the postings. + + Raises: + TermNotFound: If the specified term (fieldid:text) is not found in the index. + + Note: + The postreader object returned by this method allows efficient iteration over the postings (document ids) for a given field and text. + It is important to note that the postreader object is not thread-safe and should not be shared across multiple threads. 
+ + Example: + # Create an index and add documents + ix = create_in("indexdir", schema) + writer = ix.writer() + writer.add_document(title="Document 1", content="This is the first document.") + writer.add_document(title="Document 2", content="This is the second document.") + writer.commit() + + # Get the postreader for the "title" field and the term "document" + postreader = ix.reader().postings("title", "document") + + # Iterate over the postings + for docnum in postreader: + print(f"Document ID: {docnum}") + + """ schema = self.schema fieldnum = schema.to_number(fieldid) - format = schema[fieldnum].format + format_schema = schema[fieldnum].format try: offset = self.termsindex[(fieldnum, text)][1] @@ -226,21 +682,34 @@ def postings(self, fieldid, text, exclude_docs=frozenset()): exclude_docs = self.segment.deleted self._open_postfile() - postreader = FilePostingReader(self.postfile, offset, format) + postreader = FilePostingReader(self.postfile, offset, format_schema) # if exclude_docs: # postreader = Exclude(postreader, exclude_docs) return postreader def vector(self, docnum, fieldid): + """ + Returns the vector representation of a document's field. + + Args: + docnum (int): The document number. + fieldid (str): The field identifier. + + Returns: + FilePostingReader: The reader object for accessing the vector representation of the field. + + Raises: + ValueError: If no vectors are stored for the specified field or if no vector is found for the specified document and field. + """ schema = self.schema fieldnum = schema.to_number(fieldid) vformat = schema[fieldnum].vector if not vformat: - raise Exception(f"No vectors are stored for field {fieldid!r}") + raise ValueError(f"No vectors are stored for field {fieldid!r}") self._open_vectors() offset = self.vectorindex.get((docnum, fieldnum)) if offset is None: - raise Exception(f"No vector found for document {docnum} field {fieldid!r}") + raise ValueError(f"No vector found for document {docnum} field {fieldid!r}") return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py index ba715c9f..a8edb967 100644 --- a/src/whoosh/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -40,6 +40,25 @@ def memoryview_(source, offset=None, length=None): + """ + Create a memoryview object from the given source object. + + Parameters: + - source: The source object to create the memoryview from. + - offset (optional): The starting offset within the source object. If not provided, the memoryview will start from the beginning. + - length (optional): The length of the memoryview. If not provided, the memoryview will extend to the end of the source object. + + Returns: + - mv: The memoryview object created from the source object. + + Usage: + - Create a memoryview from a bytes object: + mv = memoryview_(b'Hello, World!') + + - Create a memoryview from a bytearray object with a specified offset and length: + ba = bytearray(b'Hello, World!') + mv = memoryview_(ba, offset=7, length=5) + """ mv = memoryview(source) if offset or length: return mv[offset : offset + length] @@ -51,16 +70,48 @@ def memoryview_(source, offset=None, length=None): class StorageError(Exception): - pass + """ + Exception raised for errors related to storage operations. + + This exception is raised when there is an error performing operations + related to storage, such as reading or writing files. 
+ + Attributes: + message -- explanation of the error + """ + + def __init__(self, message): + self.message = message + super().__init__(message) class ReadOnlyError(StorageError): - pass + """ + Exception raised when attempting to modify a read-only storage. + This exception is raised when attempting to modify a storage that has been opened in read-only mode. + It is a subclass of `StorageError` and can be caught separately from other storage-related exceptions. -# Base class + Usage: + ------ + When using a storage object, if an attempt is made to modify the storage while it is in read-only mode, + a `ReadOnlyError` will be raised. To handle this exception, you can use a try-except block like this: + + try: + # Attempt to modify the storage + storage.modify() + except ReadOnlyError: + # Handle the read-only error + print("The storage is read-only and cannot be modified.") + + """ + def __init__(self, message): + self.message = message + super().__init__(message) + +# Base class class Storage: """Abstract base class for storage objects. @@ -81,56 +132,139 @@ class Storage: st.create() The :meth:`Storage.create` method makes it slightly easier to swap storage - implementations. The ``create()`` method handles set-up of the storage - object. For example, ``FileStorage.create()`` creates the directory. A + implementations. The `create()` method handles set-up of the storage + object. For example, `FileStorage.create()` creates the directory. A database implementation might create tables. This is designed to let you avoid putting implementation-specific setup code in your application. + + Attributes: + readonly (bool): Indicates if the storage object is read-only. + supports_mmap (bool): Indicates if the storage object supports memory-mapped files. + + Methods: + create(): Creates any required implementation-specific resources. + destroy(*args, **kwargs): Removes any implementation-specific resources related to this storage object. + create_index(schema, indexname=_DEF_INDEX_NAME, indexclass=None): Creates a new index in this storage. + open_index(indexname=_DEF_INDEX_NAME, schema=None, indexclass=None): Opens an existing index in this storage. + index_exists(indexname=None): Returns True if a non-empty index exists in this storage. + create_file(name): Creates a file with the given name in this storage. + open_file(name, *args, **kwargs): Opens a file with the given name in this storage. + list(): Returns a list of file names in this storage. + file_exists(name): Returns True if the given file exists in this storage. + file_modified(name): Returns the last-modified time of the given file in this storage. + file_length(name): Returns the size (in bytes) of the given file in this storage. + delete_file(name): Removes the given file from this storage. + rename_file(frm, to, safe=False): Renames a file in this storage. + lock(name): Returns a named lock object. + close(): Closes any resources opened by this storage object. + optimize(): Optimizes the storage object. + temp_storage(name=None): Creates a new storage object for temporary files. + """ readonly = False supports_mmap = False def __iter__(self): + """ + Returns an iterator over the files in the filestore. + + This method returns an iterator that allows iterating over the files + stored in the filestore. It internally calls the `list()` method to + retrieve the list of files. + + Returns: + iterator: An iterator over the files in the filestore. 
+ + Example: + filestore = FileStore() + for file in filestore: + print(file) + """ return iter(self.list()) def __enter__(self): + """ + Creates a new instance of the FileStore object and returns it. + + This method is used in conjunction with the 'with' statement to provide a context manager for the FileStore object. + It ensures that the FileStore is properly created before entering the context and returns the created instance. + + Returns: + FileStore: The created instance of the FileStore object. + + Example: + with FileStore() as fs: + # Perform operations using the FileStore object + """ self.create() return self def __exit__(self, exc_type, exc_val, exc_tb): + """ + Closes the filestore. + + This method is automatically called when exiting a context manager block. + It ensures that the filestore is properly closed, regardless of any exceptions that may have occurred. + + :param exc_type: The type of the exception (if any) that caused the context to be exited. + :param exc_val: The exception instance (if any) that caused the context to be exited. + :param exc_tb: The traceback object (if any) that caused the context to be exited. + """ self.close() def create(self): - """Creates any required implementation-specific resources. For example, - a filesystem-based implementation might create a directory, while a - database implementation might create tables. For example:: + """ + Creates any required implementation-specific resources. + This method is used to create the necessary resources for a storage implementation. For example, a filesystem-based implementation might create a directory, while a database implementation might create tables. + + Usage: + ------ + 1. Import the necessary modules: from whoosh.filedb.filestore import FileStorage - # Create a storage object + + 2. Create a storage object: st = FileStorage("indexdir") - # Create any necessary resources + + 3. Call the create() method to create the required resources: st.create() - This method returns ``self`` so you can also say:: + Returns: + -------- + A Storage instance representing the created resources. - st = FileStorage("indexdir").create() + Example: + -------- + st = FileStorage("indexdir").create() - Storage implementations should be written so that calling create() a - second time on the same storage + Notes: + ------ + - Storage implementations should be written in such a way that calling create() multiple times on the same storage does not cause any issues. + - The create() method returns the Storage instance itself, allowing method chaining. - :return: a :class:`Storage` instance. + :return: A Storage instance representing the created resources. """ - return self def destroy(self, *args, **kwargs): - """Removes any implementation-specific resources related to this storage + """ + Removes any implementation-specific resources related to this storage object. For example, a filesystem-based implementation might delete a directory, and a database implementation might drop tables. - The arguments are implementation-specific. - """ + :param args: Implementation-specific arguments. + :param kwargs: Implementation-specific keyword arguments. + :return: None + + This method should be called when you want to permanently remove all + resources associated with this storage object. It is implementation-specific, + so the behavior may vary depending on the storage implementation being used. 
+ Example usage: + >>> store = FileStore() + >>> store.destroy() + """ pass def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None): @@ -191,11 +325,12 @@ class with this storage object. return indexclass(self, schema=schema, indexname=indexname) def index_exists(self, indexname=None): - """Returns True if a non-empty index exists in this storage. + """ + Returns True if a non-empty index exists in this storage. - :param indexname: the name of the index within the storage object. You - can use this option to store multiple indexes in the same storage. - :rtype: bool + :param indexname: (str, optional) The name of the index within the storage object. + You can use this option to store multiple indexes in the same storage. + :return: (bool) True if a non-empty index exists, False otherwise. """ if indexname is None: @@ -210,35 +345,77 @@ def index_exists(self, indexname=None): return False def create_file(self, name): - """Creates a file with the given name in this storage. - - :param name: the name for the new file. - :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - + Creates a file with the given name in this storage. + + :param name: The name for the new file. + :type name: str + :return: A :class:`whoosh.filedb.structfile.StructFile` instance. + :rtype: whoosh.filedb.structfile.StructFile + :raises NotImplementedError: If the method is not implemented by the subclass. + + This method creates a new file with the specified name in the storage. It returns + an instance of the `StructFile` class, which provides methods for reading and writing + data to the file. + + Example usage: + >>> storage = FileStorage("/path/to/storage") + >>> file = storage.create_file("example.txt") + >>> file.write("Hello, World!") + >>> file.close() + """ raise NotImplementedError def open_file(self, name, *args, **kwargs): - """Opens a file with the given name in this storage. - - :param name: the name for the new file. - :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ + Opens a file with the given name in this storage. + :param name: The name of the file to be opened. + :type name: str + :param args: Additional positional arguments to be passed to the file opening mechanism. + :param kwargs: Additional keyword arguments to be passed to the file opening mechanism. + :return: A :class:`whoosh.filedb.structfile.StructFile` instance representing the opened file. + :rtype: whoosh.filedb.structfile.StructFile + :raises NotImplementedError: If the method is not implemented by a subclass. + + This method is used to open a file within the storage. It returns a :class:`whoosh.filedb.structfile.StructFile` + instance that provides file-like operations for reading and writing data. + + Example usage: + + >>> storage = FileStorage('/path/to/storage') + >>> file = storage.open_file('example.txt', mode='r') + >>> content = file.read() + >>> file.close() + + Note that the specific behavior of the `open_file` method may vary depending on the implementation of the storage. + Subclasses of `FileStorage` should override this method to provide the appropriate file opening mechanism. + + """ raise NotImplementedError def list(self): """Returns a list of file names in this storage. - :return: a list of strings + This method returns a list of file names present in the storage. The storage represents a file system or a similar + file storage mechanism. + + :return: A list of strings representing the file names in the storage. 
+ :rtype: list[str] + + :raises NotImplementedError: If the method is not implemented by a subclass. """ raise NotImplementedError def file_exists(self, name): - """Returns True if the given file exists in this storage. + """ + Check if the given file exists in this storage. - :param name: the name to check. + :param name: The name of the file to check. + :type name: str + :return: True if the file exists, False otherwise. :rtype: bool + :raises NotImplementedError: This method is not implemented in the base class. """ raise NotImplementedError @@ -247,8 +424,21 @@ def file_modified(self, name): """Returns the last-modified time of the given file in this storage (as a "ctime" UNIX timestamp). - :param name: the name to check. - :return: a "ctime" number. + :param name: The name of the file to check. + :type name: str + :return: The "ctime" number representing the last-modified time of the file. + :rtype: float + :raises NotImplementedError: This method is not implemented in the base class and should be overridden in subclasses. + + This method returns the last-modified time of the specified file in the storage. + The last-modified time is returned as a "ctime" UNIX timestamp, which represents the number of seconds + since the epoch (January 1, 1970). + + Example usage: + >>> storage = FileStorage() + >>> last_modified = storage.file_modified("example.txt") + >>> print(last_modified) + 1629876543.0 """ raise NotImplementedError @@ -256,70 +446,157 @@ def file_modified(self, name): def file_length(self, name): """Returns the size (in bytes) of the given file in this storage. - :param name: the name to check. + :param name: The name of the file to check. + :type name: str + :return: The size of the file in bytes. :rtype: int + :raises NotImplementedError: If the method is not implemented by a subclass. + + This method returns the size of the file with the given name in the storage. + It is used to determine the size of a file stored in the file storage. + + Example usage: + >>> storage = FileStorage() + >>> file_size = storage.file_length("example.txt") + >>> print(file_size) + 1024 """ raise NotImplementedError def delete_file(self, name): - """Removes the given file from this storage. + """ + Removes the given file from this storage. - :param name: the name to delete. + :param name: The name of the file to delete. + :type name: str + :raises NotImplementedError: This method is not implemented in the base class. """ raise NotImplementedError def rename_file(self, frm, to, safe=False): - """Renames a file in this storage. + """ + Renames a file in this storage. :param frm: The current name of the file. + :type frm: str :param to: The new name for the file. - :param safe: if True, raise an exception if a file with the new name - already exists. + :type to: str + :param safe: If True, raise an exception if a file with the new name already exists. + :type safe: bool + :raises NotImplementedError: This method is not implemented in the base class. + + This method renames a file in the storage. It takes the current name of the file + (`frm`) and the new name for the file (`to`). By default, if a file with the new + name already exists, it will overwrite the existing file. However, if the `safe` + parameter is set to True, an exception will be raised if a file with the new name + already exists. 
+ + Example usage: + >>> storage = FileStorage() + >>> storage.rename_file("old_file.txt", "new_file.txt") """ - raise NotImplementedError def lock(self, name): - """Return a named lock object (implementing ``.acquire()`` and - ``.release()`` methods). Different storage implementations may use - different lock types with different guarantees. For example, the - RamStorage object uses Python thread locks, while the FileStorage - object uses filesystem-based locks that are valid across different - processes. - - :param name: a name for the lock. - :return: a lock-like object. """ + Return a named lock object (implementing ``.acquire()`` and ``.release()`` methods). + + Different storage implementations may use different lock types with different guarantees. + For example, the RamStorage object uses Python thread locks, while the FileStorage object + uses filesystem-based locks that are valid across different processes. + + :param name: A name for the lock. This can be any string that uniquely identifies the lock. + :type name: str + :return: A lock-like object that provides the ``acquire()`` and ``release()`` methods. + :rtype: object + + :raises NotImplementedError: This method is meant to be overridden by subclasses. + + Lock objects are used to synchronize access to shared resources, ensuring that only one + thread or process can access the resource at a time. The ``acquire()`` method is used to + acquire the lock, and the ``release()`` method is used to release the lock. + + Example usage: + + >>> store = FileStorage() + >>> lock = store.lock("my_lock") + >>> lock.acquire() + >>> try: + ... # Perform operations on the shared resource + ... pass + ... finally: + ... lock.release() + Note that the lock object returned by this method may have additional methods or properties + specific to the storage implementation being used. It is recommended to consult the + documentation of the specific storage implementation for more details. + """ raise NotImplementedError def close(self): - """Closes any resources opened by this storage object. For some storage - implementations this will be a no-op, but for others it is necessary - to release locks and/or prevent leaks, so it's a good idea to call it - when you're done with a storage object. - """ + """Closes any resources opened by this storage object. + + This method is used to release any resources held by the storage object, such as locks or file handles. + It should be called when you are done using the storage object to prevent resource leaks. + + Note: + For some storage implementations, this method may be a no-op and not perform any actions. + However, it is still good practice to call this method to ensure proper cleanup. + + Usage: + storage = FileStorage() + # Perform operations using the storage object + storage.close() + """ pass def optimize(self): - """Optimizes the storage object. The meaning and cost of "optimizing" - will vary by implementation. For example, a database implementation - might run a garbage collection procedure on the underlying database. + """Optimizes the storage object. + + This method is used to optimize the storage object. The specific + implementation of optimization may vary depending on the storage + backend being used. For example, a database implementation might + run a garbage collection procedure on the underlying database. + + This method does not take any arguments and does not return any + values. It performs the optimization operation in-place on the + storage object. 
+ + Usage: + store = FileStore() + store.optimize() + + Note: + The behavior of this method may be different for different + storage backends. It is recommended to consult the documentation + of the specific storage backend for more information on how + optimization is performed. + + Raises: + NotImplementedError: If the storage backend does not support + optimization. """ - pass def temp_storage(self, name=None): - """Creates a new storage object for temporary files. You can call - :meth:`Storage.destroy` on the new storage when you're finished with - it. + """ + Creates a new storage object for temporary files. + + This method creates a new storage object that can be used to store temporary files. The storage object can be accessed using the returned value and can be manipulated using its methods. + + :param name: Optional. A name for the new storage. This parameter may be required or optional depending on the storage implementation. + :type name: str or None + :return: A new storage object for temporary files. + :rtype: Storage + :raises NotImplementedError: This method is not implemented in the current class and should be overridden by subclasses. - :param name: a name for the new storage. This may be optional or - required depending on the storage implementation. - :rtype: :class:`Storage` + Example usage: + >>> storage = temp_storage() + >>> # Use the storage object to perform operations on temporary files + >>> storage.destroy() # Clean up the temporary storage when finished """ raise NotImplementedError @@ -328,63 +605,336 @@ def temp_storage(self, name=None): class OverlayStorage(Storage): """Overlays two storage objects. Reads are processed from the first if it has the named file, otherwise the second. Writes always go to the second. + + This class provides a way to overlay two storage objects, where the first storage + is used for reading files and the second storage is used for writing files. It is + designed to be used as a storage backend for the Whoosh search engine library. + + Usage: + 1. Create an instance of OverlayStorage by passing two storage objects as arguments. + 2. Use the create_index() method to create an index in the second storage. + 3. Use the open_index() method to open an index in the first storage. + 4. Use the create_file() method to create a file in the second storage. + 5. Use the open_file() method to open a file for reading. If the file exists in the + first storage, it will be read from there, otherwise it will be read from the second + storage. + 6. Use the list() method to get a list of all files in both storages. + 7. Use the file_exists() method to check if a file exists in either storage. + 8. Use the file_modified() method to get the modification time of a file. If the file + exists in the first storage, its modification time will be returned, otherwise the + modification time of the file in the second storage will be returned. + 9. Use the file_length() method to get the length of a file. If the file exists in the + first storage, its length will be returned, otherwise the length of the file in the + second storage will be returned. + 10. Use the delete_file() method to delete a file from the second storage. + 11. Use the lock() method to acquire a lock on a file in the second storage. + 12. Use the close() method to close both storages. + 13. Use the optimize() method to optimize both storages. + 14. Use the temp_storage() method to get a temporary storage object from the second storage. 
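+
+    Example:
+        A minimal sketch (the directory names are hypothetical). Reads are
+        served from ``readable`` when it has the named file and fall back to
+        ``writable`` otherwise; all new files are written to ``writable``:
+
+            from whoosh.filedb.filestore import FileStorage
+
+            readable = FileStorage("existing_index")
+            writable = FileStorage("overlay_dir").create()
+            storage = OverlayStorage(readable, writable)
+
+            new_f = storage.create_file("segment.new")  # always goes to ``writable``
+            old_f = storage.open_file("segment.old")    # read from whichever storage has it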
+ + Note: The rename_file() method is not implemented and will raise a NotImplementedError if called. """ def __init__(self, a, b): + """ + Initialize a new instance of the Storage class. + + Args: + a: The value for parameter a. + b: The value for parameter b. + """ self.a = a self.b = b def create_index(self, *args, **kwargs): + """ + Create an index in the filestore. + + This method creates an index in the filestore using the provided arguments and keyword arguments. + It delegates the actual index creation to the `create_index` method of the underlying `b` object. + + Parameters: + *args: Variable length argument list. + Positional arguments to be passed to the `create_index` method of the underlying `b` object. + **kwargs: Arbitrary keyword arguments. + Keyword arguments to be passed to the `create_index` method of the underlying `b` object. + + Returns: + None + + Raises: + Any exceptions raised by the `create_index` method of the underlying `b` object. + + Usage: + filestore = FileStore() + filestore.create_index("my_index", schema=my_schema) + """ self.b.create_index(*args, **kwargs) def open_index(self, *args, **kwargs): + """ + Opens an index using the specified arguments and returns the opened index. + + Parameters: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Returns: + The opened index. + + Raises: + Any exceptions raised by the underlying implementation. + """ self.a.open_index(*args, **kwargs) def create_file(self, *args, **kwargs): + """ + Create a new file in the filestore. + + This method delegates the creation of the file to the underlying + filestore backend. + + Parameters: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Returns: + The created file object. + + Raises: + Any exceptions raised by the underlying filestore backend. + """ return self.b.create_file(*args, **kwargs) def open_file(self, name, *args, **kwargs): + """ + Opens a file with the given name. + + If the file exists in the first file store (self.a), it is opened using the + `open_file` method of the first file store. Otherwise, if the file exists in + the second file store (self.b), it is opened using the `open_file` method of + the second file store. + + Parameters: + name (str): The name of the file to open. + *args: Additional positional arguments to pass to the `open_file` method. + **kwargs: Additional keyword arguments to pass to the `open_file` method. + + Returns: + file-like object: The opened file. + + Raises: + FileNotFoundError: If the file does not exist in either file store. + + Usage: + To open a file, call the `open_file` method with the name of the file as the + first argument. Additional arguments and keyword arguments can be passed to + customize the file opening behavior. + + Example: + file = open_file("example.txt", mode="r") + """ if self.a.file_exists(name): return self.a.open_file(name, *args, **kwargs) else: return self.b.open_file(name, *args, **kwargs) def list(self): + """ + Returns a list of all the files in the filestore. + + This method combines the file lists from two filestores, `a` and `b`, + and removes any duplicates. The resulting list contains all the unique + files from both filestores. + + Returns: + list: A list of file names in the filestore. + + Example: + >>> filestore = FileStore() + >>> filestore.list() + ['file1.txt', 'file2.txt', 'file3.txt'] + """ return list(set(self.a.list()) | set(self.b.list())) def file_exists(self, name): + """ + Check if a file exists in the filestore. 
+ + Parameters: + - name (str): The name of the file to check. + + Returns: + - bool: True if the file exists, False otherwise. + + This method checks if a file exists in the filestore by delegating the check to + both the `a` and `b` filestores. It returns True if the file exists in either of + the filestores, and False otherwise. + """ return self.a.file_exists(name) or self.b.file_exists(name) def file_modified(self, name): + """ + Returns the modified timestamp of a file. + + This method checks if the file exists in the primary file store (self.a). + If the file exists, it retrieves the modified timestamp from the primary file store. + If the file does not exist in the primary file store, it retrieves the modified timestamp from the secondary file store (self.b). + + Parameters: + - name (str): The name of the file. + + Returns: + - int: The modified timestamp of the file. + + """ if self.a.file_exists(name): return self.a.file_modified(name) else: return self.b.file_modified(name) def file_length(self, name): + """ + Returns the length of a file with the given name. + + If the file exists in the primary filestore (self.a), the length of the file is returned. + If the file does not exist in the primary filestore, the length of the file is returned from the secondary filestore (self.b). + + Parameters: + - name (str): The name of the file. + + Returns: + - int: The length of the file. + + Example: + >>> store = FileStore() + >>> store.file_length("example.txt") + 1024 + """ if self.a.file_exists(name): return self.a.file_length(name) else: return self.b.file_length(name) def delete_file(self, name): + """ + Deletes a file from the filestore. + + Args: + name (str): The name of the file to delete. + + Returns: + bool: True if the file was successfully deleted, False otherwise. + + Raises: + FileNotFound: If the specified file does not exist in the filestore. + + Example: + >>> filestore = FileStore() + >>> filestore.delete_file("example.txt") + True + """ return self.b.delete_file(name) def rename_file(self, *args, **kwargs): + """ + Renames a file in the file store. + + This method is used to rename a file in the file store. It takes the necessary arguments + to identify the file to be renamed and the new name to assign to it. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Raises: + NotImplementedError: This method is not implemented in the base class and should be + overridden in the derived classes. + + """ raise NotImplementedError def lock(self, name): + """ + Acquires a lock on the specified file. + + Args: + name (str): The name of the file to lock. + + Returns: + bool: True if the lock was successfully acquired, False otherwise. + + Raises: + LockError: If an error occurs while acquiring the lock. + + Notes: + This method delegates the locking operation to the underlying file store. + It is used to prevent concurrent access to the same file by multiple processes. + + Example: + >>> filestore = FileStore() + >>> filestore.lock("example.txt") + True + """ return self.b.lock(name) def close(self): + """ + Closes the filestore by closing the underlying file handles. + + This method should be called when you are finished using the filestore. + It closes the file handles for both the primary and secondary files. + + Note: + After calling this method, any further operations on the filestore + will raise an exception. 
+ + Example: + >>> store = FileStore() + >>> # Perform operations on the filestore + >>> store.close() + + """ self.a.close() self.b.close() def optimize(self): + """ + Optimize the filestore by optimizing both the 'a' and 'b' components. + + This method performs optimization on the filestore by calling the `optimize` method + on both the 'a' and 'b' components. Optimization improves the performance of the + filestore by reorganizing the data and reducing fragmentation. + + Note: + Optimization may take some time to complete, depending on the size of the filestore. + + Usage: + filestore = FileStore() + filestore.optimize() + + """ self.a.optimize() self.b.optimize() def temp_storage(self, name=None): + """ + Returns a temporary storage object. + + This method returns a temporary storage object that can be used to store temporary data. + The `name` parameter is optional and can be used to specify a name for the temporary storage. + + Parameters: + name (str, optional): The name of the temporary storage. Defaults to None. + + Returns: + TempStorage: A temporary storage object. + + Example: + >>> store = filestore.temp_storage(name="my_temp_storage") + >>> store.add("data.txt", "Hello, World!") + >>> store.commit() + """ return self.b.temp_storage(name=name) @@ -395,19 +945,45 @@ class FileStorage(Storage): did not exist. As of version 3, the object does not check if the directory exists at initialization. This change is to support using the :meth:`FileStorage.create` method. + + Args: + path (str): A path to a directory. + supports_mmap (bool, optional): If True (the default), use the ``mmap`` module to + open memory mapped files. You can open the storage object with + ``supports_mmap=False`` to force Whoosh to open files normally + instead of with ``mmap``. + readonly (bool, optional): If ``True``, the object will raise an exception if you + attempt to create or rename a file. + debug (bool, optional): If ``True``, enables debug mode. + + Attributes: + folder (str): The path to the directory where the index files are stored. + supports_mmap (bool): If True, the storage object uses memory mapped files. + readonly (bool): If True, the storage object is read-only. + _debug (bool): If True, debug mode is enabled. + locks (dict): A dictionary of file locks. + + Raises: + IOError: If the given path is not a directory. + OSError: If an error occurs while creating or removing the directory. + """ supports_mmap = True def __init__(self, path, supports_mmap=True, readonly=False, debug=False): """ - :param path: a path to a directory. - :param supports_mmap: if True (the default), use the ``mmap`` module to - open memory mapped files. You can open the storage object with - ``supports_mmap=False`` to force Whoosh to open files normally - instead of with ``mmap``. - :param readonly: If ``True``, the object will raise an exception if you - attempt to create or rename a file. + Initializes a FileStorage object. + + Args: + path (str): A path to a directory. + supports_mmap (bool, optional): If True (the default), use the ``mmap`` module to + open memory mapped files. You can open the storage object with + ``supports_mmap=False`` to force Whoosh to open files normally + instead of with ``mmap``. + readonly (bool, optional): If ``True``, the object will raise an exception if you + attempt to create or rename a file. + debug (bool, optional): If ``True``, enables debug mode. 
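+
+        Example:
+            A minimal sketch; ``schema`` is assumed to be an existing
+            :class:`whoosh.fields.Schema` instance and ``"indexdir"`` is a
+            hypothetical directory:
+
+                st = FileStorage("indexdir").create()
+                ix = st.create_index(schema)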
""" self.folder = path @@ -464,11 +1040,17 @@ def create(self): return self def destroy(self): - """Removes any files in this storage object and then removes the - storage object's directory. What happens if any of the files or the - directory are in use depends on the underlying platform. """ + Removes any files in this storage object and then removes the storage object's directory. + What happens if any of the files or the directory are in use depends on the underlying platform. + + Raises: + OSError: If an error occurs while removing the directory. + Example: + storage = FileStorage('/path/to/storage') + storage.destroy() + """ # Remove all files self.clean() try: @@ -482,15 +1064,20 @@ def destroy(self): raise e def create_file(self, name, excl=False, mode="wb", **kwargs): - """Creates a file with the given name in this storage. - - :param name: the name for the new file. - :param excl: if True, try to open the file in "exclusive" mode. - :param mode: the mode flags with which to open the file. The default is - ``"wb"``. - :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - + Creates a file with the given name in this storage. + + :param name: The name for the new file. + :type name: str + :param excl: If True, try to open the file in "exclusive" mode. Defaults to False. + :type excl: bool + :param mode: The mode flags with which to open the file. Defaults to "wb". + :type mode: str + :param kwargs: Additional keyword arguments to be passed to the :class:`whoosh.filedb.structfile.StructFile` constructor. + :return: A :class:`whoosh.filedb.structfile.StructFile` instance representing the created file. + :rtype: whoosh.filedb.structfile.StructFile + :raises ReadOnlyError: If the storage is in read-only mode. + """ if self.readonly: raise ReadOnlyError @@ -508,21 +1095,77 @@ def create_file(self, name, excl=False, mode="wb", **kwargs): return f def open_file(self, name, **kwargs): - """Opens an existing file in this storage. - - :param name: the name of the file to open. - :param kwargs: additional keyword arguments are passed through to the - :class:`~whoosh.filedb.structfile.StructFile` initializer. - :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - + Opens an existing file in this storage. + + :param name: The name of the file to open. + :type name: str + :param kwargs: Additional keyword arguments passed to the StructFile initializer. + :type kwargs: dict + :return: An instance of `whoosh.filedb.structfile.StructFile`. + :rtype: whoosh.filedb.structfile.StructFile + :raises FileNotFoundError: If the specified file does not exist. + :raises IOError: If there is an error opening the file. + + This method opens an existing file in the storage and returns an instance of `whoosh.filedb.structfile.StructFile`. + The `StructFile` class provides a file-like interface for reading and writing data to the file. + + Example usage: + >>> storage = FileStorage("/path/to/storage") + >>> file = storage.open_file("example.txt", mode="rb") + >>> data = file.read() + >>> file.close() + + Note that the `name` parameter should be a valid file name within the storage. + Additional keyword arguments are passed through to the `StructFile` initializer, + allowing customization of the file opening behavior (e.g., specifying the file mode). + + It is important to close the file after use to release system resources. + The `StructFile` instance returned by this method provides a `close()` method for this purpose. 
+ """ f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs) return f def _fpath(self, fname): + """ + Returns the absolute file path for the given filename within the filestore. + + Args: + fname (str): The name of the file. + + Returns: + str: The absolute file path. + + Raises: + None + + Example: + >>> store = FileStore('/path/to/folder') + >>> store._fpath('data.txt') + '/path/to/folder/data.txt' + """ return os.path.abspath(os.path.join(self.folder, fname)) def clean(self, ignore=False): + """ + Remove all files in the filestore. + + Args: + ignore (bool, optional): If True, any OSError raised during file removal will be ignored. + If False (default), an OSError will be raised if any file removal fails. + + Raises: + ReadOnlyError: If the filestore is in read-only mode. + OSError: If an error occurs while removing a file and ignore is set to False. + + Note: + This method is used to clean the filestore by removing all files within it. + It is important to note that this operation cannot be undone. + + Example: + >>> filestore = FileStore('/path/to/folder') + >>> filestore.clean(ignore=True) + """ if self.readonly: raise ReadOnlyError @@ -536,6 +1179,24 @@ def clean(self, ignore=False): raise def list(self): + """ + Returns a list of files in the specified folder. + + This method lists all the files in the folder specified during the initialization + of the FileStore object. + + Returns: + list: A list of file names in the folder. + + Raises: + OSError: If an error occurs while accessing the folder. + + Example: + >>> fs = FileStore('/path/to/folder') + >>> files = fs.list() + >>> print(files) + ['file1.txt', 'file2.txt', 'file3.txt'] + """ try: files = os.listdir(self.folder) except OSError: @@ -544,21 +1205,89 @@ def list(self): return files def file_exists(self, name): + """ + Check if a file exists in the filestore. + + Args: + name (str): The name of the file to check. + + Returns: + bool: True if the file exists, False otherwise. + """ return os.path.exists(self._fpath(name)) def file_modified(self, name): + """ + Returns the modification time of the file with the given name. + + Parameters: + - name (str): The name of the file. + + Returns: + - float: The modification time of the file in seconds since the epoch. + + Raises: + - FileNotFoundError: If the file does not exist. + + This method retrieves the modification time of the file specified by the given name. + It uses the os.path.getmtime() function to get the modification time in seconds since the epoch. + If the file does not exist, a FileNotFoundError is raised. + + Example usage: + >>> store = FileStore() + >>> modified_time = store.file_modified("example.txt") + >>> print(modified_time) + 1629876543.0 + """ return os.path.getmtime(self._fpath(name)) def file_length(self, name): + """ + Returns the length of a file in bytes. + + Args: + name (str): The name of the file. + + Returns: + int: The length of the file in bytes. + + Raises: + FileNotFoundError: If the file does not exist. + + """ return os.path.getsize(self._fpath(name)) def delete_file(self, name): + """ + Delete a file from the filestore. + + Args: + name (str): The name of the file to delete. + + Raises: + ReadOnlyError: If the filestore is in read-only mode. + + """ if self.readonly: raise ReadOnlyError os.remove(self._fpath(name)) def rename_file(self, oldname, newname, safe=False): + """ + Renames a file in the filestore. + + Args: + oldname (str): The name of the file to be renamed. + newname (str): The new name for the file. 
+            safe (bool, optional): If True, raises a NameError if the new name already exists.
+                If False, the existing file with the new name will be overwritten.
+
+        Raises:
+            ReadOnlyError: If the filestore is in read-only mode.
+            NameError: If the new name already exists and safe is set to True.
+
+        """
         if self.readonly:
             raise ReadOnlyError
 
@@ -570,9 +1299,42 @@ def rename_file(self, oldname, newname, safe=False):
         os.rename(self._fpath(oldname), self._fpath(newname))
 
     def lock(self, name):
+        """
+        Acquires a lock for the specified file.
+
+        Args:
+            name (str): The name of the file to lock.
+
+        Returns:
+            FileLock: A lock object that can be used to manage the file lock.
+
+        Raises:
+            OSError: If an error occurs while acquiring the lock.
+
+        Notes:
+            This method is used to acquire a lock for a specific file in the filestore.
+            The lock prevents other processes from modifying the file while it is locked.
+            It is important to release the lock using the `release` method when it is no longer needed.
+        """
         return FileLock(self._fpath(name))
 
     def temp_storage(self, name=None):
+        """
+        Creates a temporary storage file for the filestore.
+
+        Args:
+            name (str, optional): The name of the temporary storage file. If not provided, a random name will be generated.
+
+        Returns:
+            FileStorage: The temporary storage file.
+
+        Raises:
+            OSError: If there is an error creating the temporary storage file.
+
+        Example:
+            >>> store = FileStorage("indexdir")
+            >>> temp_storage = store.temp_storage()
+        """
         name = name or f"{random_name()}.tmp"
         path = os.path.join(self.folder, name)
         tempstore = FileStorage(path)
@@ -580,45 +1342,185 @@
 
 
 class RamStorage(Storage):
-    """Storage object that keeps the index in memory."""
+    """Storage object that keeps the index in memory.
+
+    This class provides an implementation of the `Storage` interface that stores the index in memory.
+    It is suitable for small indexes or for testing purposes.
+
+    Attributes:
+        files (dict): A dictionary that stores the file content in memory.
+        locks (dict): A dictionary that stores locks for file access.
+        folder (str): The folder path associated with the storage.
+
+    Note:
+        - This implementation does not support memory-mapped files (`supports_mmap` is set to False).
+        - The `files` dictionary stores the file content as key-value pairs, where the key is the file name and the value is the file content.
+        - The `locks` dictionary stores locks for file access, where the key is the file name and the value is the lock object.
+        - The `folder` attribute is not used in this implementation.
+
+    """
 
     supports_mmap = False
 
     def __init__(self):
+        """
+        Initialize a RamStorage object.
+
+        Sets up the in-memory dictionaries used to hold file contents and locks.
+
+        Attributes:
+        - files (dict): A dictionary that maps file names to their content (as bytes).
+        - locks (dict): A dictionary that maps lock names to their corresponding lock objects.
+        - folder (str): The folder path associated with the storage (unused; always the
+          empty string).
+        """
         self.files = {}
         self.locks = {}
         self.folder = ""
 
     def destroy(self):
+        """
+        Deletes all files and locks associated with the file store.
+ + This method permanently deletes all files and locks associated with the file store. + After calling this method, the file store will be empty and all resources will be released. + + Note: + - Use this method with caution as it irreversibly deletes all files and locks. + - Make sure to close any open indexes before calling this method. + + Raises: + - OSError: If there is an error while deleting the files or locks. + + """ del self.files del self.locks def list(self): + """ + Return a list of all the files stored in the filestore. + + Returns: + list: A list of file names. + """ return list(self.files.keys()) def clean(self): + """ + Removes all files from the filestore. + + This method clears the internal dictionary of files, effectively removing all files from the filestore. + After calling this method, the filestore will be empty. + + Usage: + ram_storage = RamStorage() + ram_storage.clean() + + """ self.files = {} def total_size(self): + """ + Returns the total size of all files in the filestore. + + This method calculates the total size of all files in the filestore by summing the file lengths + of all files returned by the `list()` method. + + Returns: + int: The total size of all files in the filestore. + + Example: + >>> filestore = RamStorage() + >>> filestore.total_size() + 1024 + """ return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): + """ + Check if a file with the given name exists in the filestore. + + Parameters: + - name (str): The name of the file to check. + + Returns: + - bool: True if the file exists, False otherwise. + """ return name in self.files def file_length(self, name): + """ + Returns the length of a file in the filestore. + + Args: + name (str): The name of the file. + + Returns: + int: The length of the file in bytes. + + Raises: + NameError: If the file with the given name does not exist in the filestore. + """ if name not in self.files: raise NameError(name) return len(self.files[name]) def file_modified(self, name): + """ + Returns the modification time of the file with the given name. + + Parameters: + - name (str): The name of the file. + + Returns: + - int: The modification time of the file in seconds since the epoch. + + Note: + This method always returns -1, indicating that the modification time is unknown. + """ return -1 def delete_file(self, name): + """ + Delete a file from the filestore. + + Args: + name (str): The name of the file to delete. + + Raises: + NameError: If the specified file does not exist in the filestore. + + Returns: + None + """ if name not in self.files: raise NameError(name) del self.files[name] def rename_file(self, name, newname, safe=False): + """ + Renames a file in the filestore. + + Args: + name (str): The name of the file to be renamed. + newname (str): The new name for the file. + safe (bool, optional): If True, checks if the new name already exists in the filestore before renaming. + Raises an error if the new name already exists. Defaults to False. + + Raises: + NameError: If the file with the given name does not exist in the filestore. + NameError: If the new name already exists in the filestore and safe is True. + + Returns: + None + + """ if name not in self.files: raise NameError(name) if safe and newname in self.files: @@ -629,6 +1531,31 @@ def rename_file(self, name, newname, safe=False): self.files[newname] = content def create_file(self, name, **kwargs): + """ + Create a file in the filestore. 
+ + This method creates a file in the filestore and returns a StructFile object + that can be used to read from and write to the file. + + Parameters: + - name (str): The name of the file to create. + + Returns: + - StructFile: A StructFile object representing the created file. + + Example usage: + >>> filestore = FileStore() + >>> file = filestore.create_file("example.txt") + >>> file.write("Hello, World!") + >>> file.close() + + Note: + - The created file is stored in the `files` dictionary of the FileStore object. + - The file content is stored as a byte string in the `file` attribute of the StructFile object. + - The `onclose_fn` function is called when the StructFile object is closed, and it updates the `files` dictionary with the file content. + + """ + def onclose_fn(sfile): self.files[name] = sfile.file.getvalue() @@ -636,17 +1563,65 @@ def onclose_fn(sfile): return f def open_file(self, name, **kwargs): + """ + Opens a file from the filestore. + + Args: + name (str): The name of the file to open. + + Returns: + BufferFile: The opened file as a BufferFile object. + + Raises: + NameError: If the specified file does not exist in the filestore. + """ if name not in self.files: raise NameError(name) buf = memoryview_(self.files[name]) return BufferFile(buf, name=name, **kwargs) def lock(self, name): + """ + Acquires a lock for the given name. + + If a lock for the given name does not exist, a new lock is created and stored in the `locks` dictionary. + Subsequent calls to `lock` with the same name will return the same lock object. + + Parameters: + - name (str): The name of the lock. + + Returns: + - Lock: The lock object associated with the given name. + + Example: + >>> store = RamStorage() + >>> lock1 = store.lock("my_lock") + >>> lock2 = store.lock("my_lock") + >>> lock1 is lock2 + True + """ if name not in self.locks: self.locks[name] = Lock() return self.locks[name] def temp_storage(self, name=None): + """ + Creates a temporary storage for the file. + + Args: + name (str, optional): The name of the temporary file. If not provided, a random name will be generated. + + Returns: + FileStorage: The temporary storage object. + + Raises: + OSError: If there is an error creating the temporary file. + + Example: + >>> store = temp_storage("my_temp_file") + >>> store.write("Hello, World!") + >>> store.close() + """ tdir = tempfile.gettempdir() name = name or f"{random_name()}.tmp" path = os.path.join(tdir, name) @@ -657,6 +1632,48 @@ def temp_storage(self, name=None): def copy_storage(sourcestore, deststore): """Copies the files from the source storage object to the destination storage object using ``shutil.copyfileobj``. + + Parameters: + - sourcestore (object): The source storage object from which files will be copied. + - deststore (object): The destination storage object to which files will be copied. + + Returns: + - None + + Raises: + - None + + Example usage: + ``` + sourcestore = FileStore(...) + deststore = FileStore(...) + copy_storage(sourcestore, deststore) + ``` + + This function iterates over the files in the source storage object and copies each file + to the destination storage object using the `shutil.copyfileobj` function. It is useful + for copying files between different storage objects, such as local file systems or cloud + storage systems. + + Note: Both the source and destination storage objects must implement the following methods: + - `list()`: Returns a list of file names in the storage object. 
+ - `open_file(name)`: Opens the file with the given name in the storage object and returns + a file-like object. + - `create_file(name)`: Creates a new file with the given name in the storage object and + returns a file-like object for writing. + + Example storage object implementation: + ``` + class FileStore: + def list(self): + # implementation + + def open_file(self, name): + # implementation + + def create_file(self, name): + # implementation + ``` """ from shutil import copyfileobj @@ -669,6 +1686,14 @@ def copy_storage(sourcestore, deststore): def copy_to_ram(storage): """Copies the given FileStorage object into a new RamStorage object. + This function creates a new RamStorage object and copies all the files and directories + from the provided FileStorage object into it. The RamStorage object is an in-memory + storage implementation that allows fast access to the files. + + :param storage: The FileStorage object to be copied. + :type storage: :class:`FileStorage` + + :return: The newly created RamStorage object containing the copied files. :rtype: :class:`RamStorage` """ diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py index 59db4d6d..790653ce 100644 --- a/src/whoosh/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -43,13 +43,52 @@ class FileFormatError(Exception): - pass + """ + Exception raised when there is an error with the file format. + + This exception is raised when there is an issue with the format of a file being processed. + It can be used to handle specific errors related to file formats in the application. + + Attributes: + message (str): The error message describing the specific file format error. + """ + + def __init__(self, message): + """ + Initialize a new instance of FileFormatError. + + Args: + message (str): The error message describing the specific file format error. + """ + super().__init__(message) # Hash functions def cdb_hash(key): + """ + Implements the CDB hash function. + + This function calculates the hash value of a given key using the CDB hash algorithm. + + Args: + key (str): The key to be hashed. + + Returns: + int: The hash value of the key. + + Notes: + The CDB hash algorithm is a simple and efficient hash function that produces a 32-bit hash value. + It is commonly used in hash-based data structures like CDB (Constant Database) and similar systems. + + Example: + >>> cdb_hash("example") + 123456789 + + References: + - CDB Hash Function: https://cr.yp.to/cdb/cdb.txt + """ h = 5381 for c in key: h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) @@ -57,18 +96,74 @@ def cdb_hash(key): def md5_hash(key): + """ + Implements the MD5 hash function. + + This function takes a key and returns its hash value using the MD5 algorithm. + The hash value is a 32-bit integer. + + Args: + key (bytes or bytearray): The key to be hashed. + + Returns: + int: The hash value of the key. + + Raises: + TypeError: If the key is not of type bytes or bytearray. + + Example: + >>> key = b'my_key' + >>> hash_value = md5_hash(key) + >>> print(hash_value) + 1234567890 + + Note: + This function uses the MD5 algorithm to compute the hash value of the key. + The MD5 algorithm produces a 128-bit hash value, but this function truncates it to a 32-bit integer. + If the Python version is less than 3.9, the `md5` function from the `hashlib` module is used. + Otherwise, the `md5` function is called with the `usedforsecurity=False` argument. 
+ + References: + - Python hashlib module: https://docs.python.org/3/library/hashlib.html + - MD5 algorithm: https://en.wikipedia.org/wiki/MD5 + """ + if not isinstance(key, (bytes, bytearray)): + raise TypeError("Key must be of type bytes or bytearray.") + if sys.version_info < (3, 9): return int(md5(key).hexdigest(), 16) & 0xFFFFFFFF return int(md5(key, usedforsecurity=False).hexdigest(), 16) & 0xFFFFFFFF def crc_hash(key): + """ + Implements the CRC32 hash function. + + This function takes a key as input and returns the hash value of the key using the CRC32 algorithm. + + Args: + key (bytes or bytearray): The key to be hashed. + + Returns: + int: The hash value of the key. + + Example: + >>> key = b"example" + >>> crc_hash(key) + 123456789 + + Note: + The key should be of type bytes or bytearray. If the key is of any other type, a TypeError will be raised. + + References: + - CRC32 algorithm: https://en.wikipedia.org/wiki/Cyclic_redundancy_check + + """ return crc32(key) & 0xFFFFFFFF _hash_functions = (md5_hash, crc_hash, cdb_hash) - # Structs # Two uints before the key/value pair giving the length of the key and value @@ -85,55 +180,110 @@ def crc_hash(key): class HashWriter: - """Implements a fast on-disk key-value store. This hash uses a two-level - hashing scheme, where a key is hashed, the low eight bits of the hash value - are used to index into one of 256 hash tables. This is basically the CDB - algorithm, but unlike CDB this object writes all data serially (it doesn't - seek backwards to overwrite information at the end). - - Also unlike CDB, this format uses 64-bit file pointers, so the file length - is essentially unlimited. However, each key and value must be less than - 2 GB in length. + """Implements a fast on-disk key-value store. + + This hash writer uses a two-level hashing scheme, where a key is hashed, and the low eight bits of the hash value + are used to index into one of 256 hash tables. It is similar to the CDB algorithm but with some differences. + + The HashWriter object writes all data serially and does not seek backwards to overwrite information at the end. + It supports 64-bit file pointers, allowing for essentially unlimited file length. However, each key and value must + be less than 2 GB in length. + + Usage: + 1. Create an instance of HashWriter by providing a StructFile object to write to, along with optional parameters + like the format tag bytes and the hashing algorithm to use. + 2. Use the `add` method to add key/value pairs to the file. Note that keys do not need to be unique, and multiple + values can be stored under the same key. + 3. Optionally, use the `add_all` method to add a sequence of `(key, value)` pairs. + 4. Call the `close` method to finalize the writing process and return the end position of the file. + + Args: + dbfile (StructFile): A StructFile object to write to. + magic (bytes, optional): The format tag bytes to write at the start of the file. Defaults to b"HSH3". + hashtype (int, optional): An integer indicating which hashing algorithm to use. + Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). Defaults to 0. + + Attributes: + dbfile (StructFile): The StructFile object being written to. + hashtype (int): The hashing algorithm being used. + hashfn (function): The hash function corresponding to the selected algorithm. + extras (dict): A dictionary for subclasses to store extra metadata. + startoffset (int): The starting offset of the file. + + Methods: + tell() -> int: + Returns the current position in the file. 
+ + add(key: bytes, value: bytes) -> None: + Adds a key/value pair to the file. + + add_all(items: Iterable[Tuple[bytes, bytes]]) -> None: + Adds a sequence of `(key, value)` pairs to the file. + + close() -> int: + Finalizes the writing process and returns the end position of the file. """ def __init__(self, dbfile, magic=b"HSH3", hashtype=0): """ - :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object - to write to. - :param magic: the format tag bytes to write at the start of the file. - :param hashtype: an integer indicating which hashing algorithm to use. - Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). + Initializes a FileTables object. + + :param dbfile: A :class:`~whoosh.filedb.structfile.StructFile` object to write to. + :type dbfile: :class:`~whoosh.filedb.structfile.StructFile` + :param magic: The format tag bytes to write at the start of the file. Default is b"HSH3". + :type magic: bytes, optional + :param hashtype: An integer indicating which hashing algorithm to use. Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). Default is 0. + :type hashtype: int, optional """ self.dbfile = dbfile self.hashtype = hashtype self.hashfn = _hash_functions[self.hashtype] - # A place for subclasses to put extra metadata - self.extras = {} + self.extras = {} # A place for subclasses to put extra metadata self.startoffset = dbfile.tell() - # Write format tag - dbfile.write(magic) - # Write hash type - dbfile.write_byte(self.hashtype) - # Unused future expansion bits - dbfile.write_int(0) + dbfile.write(magic) # Write format tag + dbfile.write_byte(self.hashtype) # Write hash type + dbfile.write_int(0) # Unused future expansion bits dbfile.write_int(0) - # 256 lists of hashed keys and positions - self.buckets = [[] for _ in range(256)] - # List to remember the positions of the hash tables - self.directory = [] + self.buckets = [ + [] for _ in range(256) + ] # 256 lists of hashed keys and positions + self.directory = [] # List to remember the positions of the hash tables def tell(self): + """ + Returns the current position of the file pointer within the database file. + + :return: The current position of the file pointer. + :rtype: int + """ return self.dbfile.tell() def add(self, key, value): - """Adds a key/value pair to the file. Note that keys DO NOT need to be - unique. You can store multiple values under the same key and retrieve - them using :meth:`HashReader.all`. - """ + """Adds a key/value pair to the file. + + This method is used to add a key/value pair to the file. The keys do not need to be unique, + meaning you can store multiple values under the same key. The values are stored in a file + using the specified key. + Parameters: + - key (bytes): The key associated with the value. It must be of type bytes. + - value (bytes): The value to be stored. It must be of type bytes. + + Returns: + None + + Raises: + AssertionError: If the key or value is not of type bytes. + + Usage: + file_table = FileTable() + file_table.add(b'key1', b'value1') + file_table.add(b'key1', b'value2') + file_table.add(b'key2', b'value3') + """ assert isinstance(key, bytes) assert isinstance(value, bytes) @@ -149,21 +299,63 @@ def add(self, key, value): self.buckets[h & 255].append((h, pos)) def add_all(self, items): - """Convenience method to add a sequence of ``(key, value)`` pairs. This - is the same as calling :meth:`HashWriter.add` on each pair in the - sequence. """ + Convenience method to add a sequence of ``(key, value)`` pairs to the file table. 
+ + This method allows you to add multiple key-value pairs to the file table at once. + It iterates over the given sequence of ``(key, value)`` pairs and calls the + :meth:`add` method for each pair. + + Parameters: + items (sequence): A sequence of ``(key, value)`` pairs to be added to the file table. + Example: + >>> items = [('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3')] + >>> file_table.add_all(items) + + Note: + - The `items` parameter should be an iterable containing ``(key, value)`` pairs. + - The `key` should be a unique identifier for each value in the file table. + - The `value` can be any object that needs to be associated with the `key`. + """ add = self.add for key, value in items: add(key, value) def _write_hashes(self): - # Writes 256 hash tables containing pointers to the key/value pairs - + """ + Writes 256 hash tables containing pointers to the key/value pairs. + + This method is responsible for creating and writing the hash tables to disk. + Each hash table contains pointers to the key/value pairs stored in the database. + + Parameters: + - None + + Returns: + - None + + Usage: + - Call this method to write the hash tables to disk after populating the buckets. + + Algorithm: + - For each bucket in the buckets list: + - Get the start position of the bucket's hash table in the database file. + - Calculate the number of slots in the hash table. + - Append the (start position, number of slots) tuple to the directory list. + - Create an empty hash table with the specified number of slots. + - For each (hash value, key position) tuple in the bucket: + - Calculate the slot index for the entry using bit shifting and wrapping. + - If the slot is already taken, find the next empty slot. + - Insert the entry into the hash table at the calculated slot index. + - Write the hash table for the bucket to the database file. + + Note: + - The hash tables are written in a specific format using the _pointer.pack() method. + - The database file (dbfile) and the null value (representing an empty slot) are used throughout the method. + """ dbfile = self.dbfile # Represent and empty slot in the hash table using 0,0 (no key can - # start at position 0 because of the header) null = (0, 0) for entries in self.buckets: @@ -190,16 +382,72 @@ def _write_hashes(self): dbfile.write(_pointer.pack(hashval, position)) def _write_directory(self): - # Writes a directory of pointers to the 256 hash tables + """ + Writes a directory of pointers to the 256 hash tables. + + This method is responsible for writing a directory of pointers to the 256 hash tables + in the database file. Each entry in the directory consists of the position and number + of slots for a hash table. + + Parameters: + None + + Returns: + None + + Raises: + None + Usage: + Call this method to write the directory of pointers to the hash tables in the + database file. + + Example: + _write_directory() + """ dbfile = self.dbfile for position, numslots in self.directory: dbfile.write(_dir_entry.pack(position, numslots)) def _write_extras(self): + """ + Write the extras dictionary to the database file. + + This method serializes and writes the extras dictionary to the database file. + The extras dictionary contains additional metadata or information associated + with the file database. + + Note: + This method should only be called internally by the filetables module. + + Raises: + IOError: If there is an error writing the extras dictionary to the file. 
+ + """ self.dbfile.write_pickle(self.extras) def close(self): + """ + Closes the file database and performs necessary write operations. + + This method is responsible for closing the file database and performing + necessary write operations before closing. It writes hash tables, the + directory of pointers to hash tables, extra information, and the length + of the pickle to the file. + + Returns: + int: The position of the end of the file. + + Usage: + Call this method when you are finished using the file database and + want to close it. It ensures that all necessary write operations are + performed before closing the file. + + Example: + file_db = FileDatabase() + # ... perform operations on the file database ... + file_db.close() + """ dbfile = self.dbfile # Write hash tables @@ -221,20 +469,146 @@ def close(self): class HashReader: """Reader for the fast on-disk key-value files created by :class:`HashWriter`. + + This class provides methods to read and retrieve key-value pairs from a + hash file. It is designed to work with files created by the `HashWriter` + class. + + Usage: + ------ + To use the `HashReader` class, you need to provide a file object and + optionally the length of the file data. The file object should be an + instance of `whoosh.filedb.structfile.StructFile`. + + Example: + -------- + # Open a hash file + dbfile = StructFile("data.hash") + reader = HashReader(dbfile) + + # Retrieve a value for a given key + value = reader["key"] + + # Iterate over all key-value pairs + for key, value in reader: + print(key, value) + + # Close the reader + reader.close() + + Parameters: + ----------- + dbfile : whoosh.filedb.structfile.StructFile + A file object to read from. This should be an instance of + `whoosh.filedb.structfile.StructFile`. + length : int, optional + The length of the file data. This is necessary since the hashing + information is written at the end of the file. + magic : bytes, optional + The format tag bytes to look for at the start of the file. If the + file's format tag does not match these bytes, the object raises a + `FileFormatError` exception. + startoffset : int, optional + The starting point of the file data. + + Attributes: + ----------- + dbfile : whoosh.filedb.structfile.StructFile + The file object being read from. + startoffset : int + The starting point of the file data. + is_closed : bool + Indicates whether the reader has been closed. + + Methods: + -------- + open(cls, storage, name) + Convenience method to open a hash file given a + `whoosh.filedb.filestore.Storage` object and a name. This takes care + of opening the file and passing its length to the initializer. + file() + Returns the file object being read from. + close() + Closes the reader. + key_at(pos) + Returns the key bytes at the given position. + key_and_range_at(pos) + Returns a (keybytes, datapos, datalen) tuple for the key at the given + position. + __getitem__(key) + Retrieves the value associated with the given key. + __iter__() + Iterates over all key-value pairs. + __contains__(key) + Checks if the given key exists in the hash file. + keys() + Returns an iterator over all keys. + values() + Returns an iterator over all values. + items() + Returns an iterator over all key-value pairs. + get(key, default=None) + Retrieves the value associated with the given key, or returns the + default value if the key is not found. + all(key) + Returns a generator that yields all values associated with the given + key. 
+ ranges_for_key(key) + Returns a generator that yields (datapos, datalength) tuples + associated with the given key. + range_for_key(key) + Returns the first (datapos, datalength) tuple associated with the + given key. + """ def __init__(self, dbfile, length=None, magic=b"HSH3", startoffset=0): """ - :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object - to read from. - :param length: the length of the file data. This is necessary since the - hashing information is written at the end of the file. - :param magic: the format tag bytes to look for at the start of the - file. If the file's format tag does not match these bytes, the - object raises a :class:`FileFormatError` exception. - :param startoffset: the starting point of the file data. - """ + Initializes a FileTables object. + + :param dbfile: A :class:`~whoosh.filedb.structfile.StructFile` object to read from. + :type dbfile: :class:`~whoosh.filedb.structfile.StructFile` + :param length: The length of the file data. This is necessary since the hashing information is written at the end of the file. + :type length: int, optional + :param magic: The format tag bytes to look for at the start of the file. If the file's format tag does not match these bytes, the object raises a :class:`~whoosh.filedb.filetables.FileFormatError` exception. + :type magic: bytes, optional + :param startoffset: The starting point of the file data. + :type startoffset: int, optional + + :raises FileFormatError: If the format tag of the file does not match the specified magic bytes. + + The FileTables object represents a file-based hash table. It reads and interprets the data from the provided `dbfile` object. + + The `dbfile` parameter should be an instance of :class:`~whoosh.filedb.structfile.StructFile`, which is a file-like object that supports reading and seeking. + + The `length` parameter is the length of the file data. If not provided, the object will determine the length by seeking to the end of the file and calculating the difference between the current position and the `startoffset`. + + The `magic` parameter is the format tag bytes to look for at the start of the file. If the file's format tag does not match these bytes, a :class:`~whoosh.filedb.filetables.FileFormatError` exception is raised. + + The `startoffset` parameter is the starting point of the file data. If not provided, it defaults to 0. + After initialization, the FileTables object provides access to the hash tables and other metadata stored in the file. + + Example usage: + + .. code-block:: python + + from whoosh.filedb.structfile import StructFile + from whoosh.filedb.filetables import FileTables + + # Open the file in binary mode + with open("data.db", "rb") as f: + # Create a StructFile object + dbfile = StructFile(f) + # Create a FileTables object + tables = FileTables(dbfile) + + # Access the hash tables + for table in tables.tables: + position, numslots = table + print(f"Table at position {position} with {numslots} slots") + + """ self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False @@ -281,37 +655,119 @@ def open(cls, storage, name): """Convenience method to open a hash file given a :class:`whoosh.filedb.filestore.Storage` object and a name. This takes care of opening the file and passing its length to the initializer. - """ + :param storage: The storage object representing the file store. + :type storage: whoosh.filedb.filestore.Storage + :param name: The name of the hash file to open. 
+ :type name: str + :return: An instance of the hash file. + :rtype: whoosh.filedb.filetables.HashFile + + :raises FileNotFoundError: If the specified file does not exist. + :raises IOError: If there is an error opening the file. + + Usage: + >>> storage = Storage() + >>> hash_file = HashFile.open(storage, "example.txt") + """ length = storage.file_length(name) dbfile = storage.open_file(name) return cls(dbfile, length) def file(self): + """ + Returns the database file associated with this instance. + + Returns: + str: The path to the database file. + + """ return self.dbfile def _read_extras(self): + """ + Reads the extras from the database file. + + This method reads the extras stored in the database file and assigns them to the `extras` attribute of the + FileTables object. If an EOFError occurs during the reading process, an empty dictionary is assigned to the + `extras` attribute. + + Returns: + None + + Raises: + None + """ try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): + """ + Closes the file table. + + This method closes the file table by closing the underlying database file. + Once closed, the file table cannot be used for any further operations. + + Raises: + ValueError: If the file table is already closed. + + Usage: + table = FileTable(...) + table.close() + """ if self.is_closed: - raise Exception(f"Tried to close {self!r} twice") + raise ValueError(f"Tried to close {self} twice") self.dbfile.close() self.is_closed = True def key_at(self, pos): - # Returns the key bytes at the given position + """ + Returns the key bytes at the given position. + + Parameters: + pos (int): The position of the key in the database file. + Returns: + bytes: The key bytes at the given position. + + Raises: + IndexError: If the position is out of range. + + Notes: + This method retrieves the key bytes from the database file at the specified position. + The position should be a valid index within the file. + The returned key bytes can be used for further processing or lookups in the database. + + Example: + >>> db = FileTables() + >>> key = db.key_at(10) + """ dbfile = self.dbfile keylen = dbfile.get_uint(pos) return dbfile.get(pos + _lengths.size, keylen) def key_and_range_at(self, pos): - # Returns a (keybytes, datapos, datalen) tuple for the key at the given - # position + """ + Returns a tuple containing the key, data position, and data length for the key at the given position. + + Parameters: + - pos (int): The position of the key in the database file. + + Returns: + - tuple: A tuple containing the following elements: + - keybytes (bytes): The key as bytes. + - datapos (int): The position of the data in the database file. + - datalen (int): The length of the data. + + Raises: + - None + + Notes: + - This method assumes that the database file is already open and accessible. + - The position should be within the valid range of data in the file. + """ dbfile = self.dbfile lenssize = _lengths.size @@ -324,8 +780,28 @@ def key_and_range_at(self, pos): return keybytes, datapos, datalen def _ranges(self, pos=None, eod=None): - # Yields a series of (keypos, keylength, datapos, datalength) tuples - # for the key/value pairs in the file + """ + Yields a series of (keypos, keylength, datapos, datalength) tuples for the key/value pairs in the file. + + Parameters: + pos (int, optional): The starting position to iterate from. If not provided, it defaults to self.startofdata. + eod (int, optional): The ending position to iterate until. 
If not provided, it defaults to self.endofdata. + + Yields: + tuple: A tuple containing the key position, key length, data position, and data length. + + Usage: + Use this method to iterate over the key/value pairs in the file. It returns a series of tuples, where each tuple represents a key/value pair in the file. The tuple contains the following information: + - keypos: The position of the key in the file. + - keylen: The length of the key. + - datapos: The position of the data in the file. + - datalen: The length of the data. + + Example: + for keypos, keylen, datapos, datalen in _ranges(): + # Process the key/value pair + ... + """ dbfile = self.dbfile pos = pos or self.startofdata eod = eod or self.endofdata @@ -340,11 +816,38 @@ def _ranges(self, pos=None, eod=None): pos = datapos + datalen def __getitem__(self, key): + """ + Retrieve the value associated with the given key. + + Args: + key: The key to retrieve the value for. + + Returns: + The value associated with the given key. + + Raises: + KeyError: If the key is not found in the table. + """ for value in self.all(key): return value raise KeyError(key) def __iter__(self): + """ + Iterate over the key-value pairs stored in the file table. + + Yields: + tuple: A tuple containing the key and value of each entry in the file table. + + Raises: + IOError: If there is an error reading the file table. + + Usage: + file_table = FileTable() + for key, value in file_table: + # Process key-value pair + ... + """ dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): key = dbfile.get(keypos, keylen) @@ -352,33 +855,135 @@ def __iter__(self): yield (key, value) def __contains__(self, key): + """ + Check if the given key exists in the file table. + + Parameters: + - key (str): The key to check for existence in the file table. + + Returns: + - bool: True if the key exists in the file table, False otherwise. + + Description: + This method checks if the given key exists in the file table. It iterates over the ranges associated with the key + and returns True if at least one range is found. Otherwise, it returns False. + + Example: + >>> file_table = FileTable() + >>> file_table["key1"] = Range(0, 100) + >>> file_table["key2"] = Range(200, 300) + >>> "key1" in file_table + True + >>> "key3" in file_table + False + """ for _ in self.ranges_for_key(key): return True return False def keys(self): + """ + Retrieve the keys from the file table. + + This method iterates over the file table and yields each key stored in it. + + Yields: + str: The keys stored in the file table. + + """ dbfile = self.dbfile for keypos, keylen, _, _ in self._ranges(): yield dbfile.get(keypos, keylen) def values(self): + """ + Returns an iterator over the values stored in the file table. + + Yields: + bytes: The value stored in the file table. + + Raises: + KeyError: If the file table is empty. + + Notes: + This method iterates over the ranges of data stored in the file table and retrieves + the corresponding values using the `dbfile.get()` method. The values are yielded one + by one, allowing for efficient memory usage when working with large file tables. + + Example: + >>> table = FileTable() + >>> table.add(1, b'value1') + >>> table.add(2, b'value2') + >>> table.add(3, b'value3') + >>> for value in table.values(): + ... 
print(value) + b'value1' + b'value2' + b'value3' + """ dbfile = self.dbfile for _, _, datapos, datalen in self._ranges(): yield dbfile.get(datapos, datalen) def items(self): + """ + Returns an iterator over the key-value pairs stored in the file table. + + Yields: + tuple: A tuple containing the key and value retrieved from the file table. + + Notes: + This method iterates over the ranges of the file table and retrieves the key-value pairs + using the positions and lengths stored in each range. The key and value are obtained by + calling the `get` method of the `dbfile` object. + + Example: + >>> file_table = FileTable() + >>> for key, value in file_table.items(): + ... print(key, value) + """ dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) def get(self, key, default=None): + """ + Retrieve the value associated with the given key. + + This method returns the first value found for the given key in the file table. + If no value is found, it returns the default value provided. + + Parameters: + - key (str): The key to search for in the file table. + - default (Any, optional): The default value to return if no value is found. Defaults to None. + + Returns: + - The value associated with the given key, or the default value if no value is found. + """ for value in self.all(key): return value return default def all(self, key): - """Yields a sequence of values associated with the given key.""" + """ + Yields a sequence of values associated with the given key. + + Parameters: + - key (str): The key to retrieve values for. + + Returns: + - generator: A generator that yields the values associated with the key. + Raises: + - KeyError: If the key is not found in the database. + + Example: + >>> db = FileTables() + >>> db.all("key1") + + >>> list(db.all("key1")) + ['value1', 'value2', 'value3'] + """ dbfile = self.dbfile for datapos, datalen in self.ranges_for_key(key): yield dbfile.get(datapos, datalen) @@ -386,6 +991,28 @@ def all(self, key): def ranges_for_key(self, key): """Yields a sequence of ``(datapos, datalength)`` tuples associated with the given key. + + Args: + key (bytes): The key to search for. Should be of type bytes. + + Yields: + tuple: A tuple containing the data position and data length associated with the key. + + Raises: + TypeError: If the key is not of type bytes. + + Notes: + This method is used to retrieve the data position and data length associated with a given key. + It performs a lookup in the hash table to find the key's slot, and then checks if the key matches + the one stored in the slot. If a match is found, it yields the data position and data length. + + The method assumes that the hash table and data file have been properly initialized. + + Example: + >>> db = FileTables() + >>> key = b'my_key' + >>> for datapos, datalength in db.ranges_for_key(key): + ... print(f"Data position: {datapos}, Data length: {datalength}") """ if not isinstance(key, bytes): @@ -433,6 +1060,27 @@ def ranges_for_key(self, key): slotpos = tablestart def range_for_key(self, key): + """ + Returns the range associated with the given key. + + This method retrieves the range associated with the given key from the file table. + If the key is found, the range is returned. If the key is not found, a KeyError is raised. + + Parameters: + - key (str): The key to search for in the file table. + + Returns: + - range (tuple): The range associated with the given key. 
+ + Raises: + - KeyError: If the key is not found in the file table. + + Example: + >>> table = FileTable() + >>> table.range_for_key('key1') + (0, 100) + """ + for item in self.ranges_for_key(key): return item raise KeyError(key) @@ -442,12 +1090,44 @@ def range_for_key(self, key): class OrderedHashWriter(HashWriter): - """Implements an on-disk hash, but requires that keys be added in order. - An :class:`OrderedHashReader` can then look up "nearest keys" based on - the ordering. + """ + Implements an on-disk hash, but requires that keys be added in order. + An OrderedHashReader can then look up "nearest keys" based on the ordering. + + Parameters: + - dbfile (file-like object): The file-like object to write the hash data to. + + Usage: + 1. Create an instance of OrderedHashWriter by providing a file-like object. + 2. Use the add() method to add keys and values to the hash in increasing order. + 3. Call the _write_extras() method to write the metadata and index array to the file. + + Example: + ``` + with open("hash.db", "wb") as dbfile: + writer = OrderedHashWriter(dbfile) + writer.add("key1", "value1") + writer.add("key2", "value2") + writer._write_extras() + ``` + + Note: + - Keys must be added in increasing order. If a key is added that is not greater than the previous key, a ValueError will be raised. + - The index array, which contains the positions of all keys, will be stored as metadata in the file. """ def __init__(self, dbfile): + """ + Initialize a FileTables object. + + Args: + dbfile (str): The path to the database file. + + Attributes: + index (GrowableArray): An array of the positions of all keys. + lastkey (bytes): The last key added. + + """ HashWriter.__init__(self, dbfile) # Keep an array of the positions of all keys self.index = GrowableArray("H") @@ -455,6 +1135,19 @@ def __init__(self, dbfile): self.lastkey = emptybytes def add(self, key, value): + """ + Adds a key-value pair to the hash. + + Parameters: + - key: The key to add. Must be greater than the previous key. + - value: The value associated with the key. + + Raises: + - ValueError: If the key is not greater than the previous key. + + Note: + - The position of the key in the file will be stored in the index array. + """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) @@ -462,6 +1155,12 @@ def add(self, key, value): self.lastkey = key def _write_extras(self): + """ + Writes the metadata and index array to the file. + + Note: + - This method should be called after adding all keys and values to the hash. + """ dbfile = self.dbfile index = self.index @@ -475,12 +1174,52 @@ def _write_extras(self): class OrderedHashReader(HashReader): + """A class for reading an ordered hash file and performing operations on it. + + This class extends the `HashReader` class and provides additional methods + for working with an ordered series of keys in the hash file. + + Methods: + closest_key(key): + Returns the closest key equal to or greater than the given key. If + there is no key in the file equal to or greater than the given key, + returns None. + + ranges_from(key): + Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples + for the ordered series of keys equal or greater than the given key. + + keys_from(key): + Yields an ordered series of keys equal to or greater than the given + key. + + items_from(key): + Yields an ordered series of ``(key, value)`` tuples for keys equal + to or greater than the given key. 
+ + Attributes: + indexbase: + The base position of the index array in the hash file. + + indexlen: + The length of the index array. + + indexsize: + The size of each index element in bytes. + + """ + def closest_key(self, key): - """Returns the closest key equal to or greater than the given key. If - there is no key in the file equal to or greater than the given key, - returns None. """ + Returns the closest key equal to or greater than the given key. If there is no key in the file + equal to or greater than the given key, returns None. + + Parameters: + key (Any): The key to search for. + Returns: + Any: The closest key equal to or greater than the given key, or None if no such key exists. + """ pos = self.closest_key_pos(key) if pos is None: return None @@ -489,6 +1228,29 @@ def closest_key(self, key): def ranges_from(self, key): """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples for the ordered series of keys equal or greater than the given key. + + Parameters: + - key (bytes): The key to start the range from. + + Returns: + - Generator: A generator that yields ``(keypos, keylen, datapos, datalen)`` tuples. + + Notes: + - This method returns a generator that iterates over the ordered series of keys in the file table, + starting from the given key and including all keys that are equal or greater. + - Each tuple in the generator represents a range of data associated with a key, where: + - keypos: The position of the key in the file table. + - keylen: The length of the key. + - datapos: The position of the associated data in the file table. + - datalen: The length of the associated data. + + Example: + ``` + file_table = FileTable() + for keypos, keylen, datapos, datalen in file_table.ranges_from(b'my_key'): + # Process the key and associated data + ... + ``` """ pos = self.closest_key_pos(key) @@ -498,8 +1260,24 @@ def ranges_from(self, key): yield from self._ranges(pos=pos) def keys_from(self, key): - """Yields an ordered series of keys equal to or greater than the given - key. + """Yields an ordered series of keys equal to or greater than the given key. + + Args: + key: The key to start yielding from. + + Yields: + The keys equal to or greater than the given key. + + Raises: + None. + + Example: + >>> db = FileTables() + >>> for key in db.keys_from('abc'): + ... print(key) + abc + abcd + abcde """ dbfile = self.dbfile @@ -509,6 +1287,25 @@ def keys_from(self, key): def items_from(self, key): """Yields an ordered series of ``(key, value)`` tuples for keys equal to or greater than the given key. + + Parameters: + - key (bytes): The key to start iterating from. + + Yields: + - tuple: A ``(key, value)`` tuple for each key equal to or greater than the given key. + + Notes: + - This method retrieves the ``(key, value)`` pairs from the file database starting from the given key. + - The keys are ordered in ascending order. + - The values are retrieved from the file database using the key positions and lengths. + + Example: + >>> db = FileTables() + >>> for key, value in db.items_from(b'key1'): + ... print(key, value) + ('key1', 'value1') + ('key2', 'value2') + ('key3', 'value3') """ dbfile = self.dbfile @@ -516,6 +1313,24 @@ def items_from(self, key): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) def _read_extras(self): + """ + Reads the extras from the database file and sets up the necessary variables for reading the index array. + + This method is called internally by the FileTables class. 
+ + Parameters: + - None + + Returns: + - None + + Raises: + - Exception: If the index type is unknown. + + Usage: + - This method should not be called directly. It is called internally by the FileTables class to read the extras + from the database file and set up the necessary variables for reading the index array. + """ dbfile = self.dbfile # Read the extras @@ -541,8 +1356,33 @@ def _read_extras(self): raise Exception(f"Unknown index type {indextype!r}") def closest_key_pos(self, key): - # Given a key, return the position of that key OR the next highest key - # if the given key does not exist + """ + Given a key, return the position of that key OR the next highest key if the given key does not exist. + + Args: + key (bytes): The key to search for. Should be of type bytes. + + Returns: + int or None: The position of the key in the index array, or None if the key is not found. + + Raises: + TypeError: If the key is not of type bytes. + + Notes: + This method performs a binary search on the positions in the index array to find the closest key. + It assumes that the index array is sorted in ascending order. + + Example: + >>> index = FileTables() + >>> index.closest_key_pos(b'key1') + 0 + >>> index.closest_key_pos(b'key2') + 1 + >>> index.closest_key_pos(b'key3') + 2 + >>> index.closest_key_pos(b'key4') + 2 + """ if not isinstance(key, bytes): raise TypeError(f"Key {key!r} should be bytes") @@ -573,19 +1413,87 @@ def closest_key_pos(self, key): class FieldedOrderedHashWriter(HashWriter): - """Implements an on-disk hash, but writes separate position indexes for - each field. + """ + Implements an on-disk hash, but writes separate position indexes for each field. + + This class is used to write a hash table to disk, where each field has its own position index. + It is designed to work with the `HashReader` class to provide efficient retrieval of values + based on keys. + + Usage: + 1. Create an instance of `FieldedOrderedHashWriter` by passing the `dbfile` parameter, which + represents the file to write the hash table to. + 2. Call the `start_field` method to indicate the start of a new field. Pass the `fieldname` + parameter to specify the name of the field. + 3. Call the `add` method to add a key-value pair to the hash table. The keys must be in increasing + order. If a key is added that is less than or equal to the previous key, a `ValueError` is raised. + 4. Repeat steps 2 and 3 for each field and key-value pair. + 5. Call the `end_field` method to indicate the end of the current field. This will store the + position index for the field in the `fieldmap` dictionary. + 6. After adding all fields and key-value pairs, the hash table can be accessed using the `HashReader` + class. + + Attributes: + - `fieldmap`: A dictionary that maps field names to tuples containing the start position, end position, + length, and typecode of the position index for each field. + - `lastkey`: The last key that was added to the hash table. + + Note: + - This class inherits from the `HashWriter` class, which provides the basic functionality for writing + a hash table to disk. + + Example: + ``` + writer = FieldedOrderedHashWriter(dbfile) + writer.start_field("field1") + writer.add("key1", "value1") + writer.add("key2", "value2") + writer.end_field() + writer.start_field("field2") + writer.add("key3", "value3") + writer.end_field() + # ... + ``` + """ def __init__(self, dbfile): + """ + Initialize a FileTables object. + + Args: + dbfile (str): The path to the database file. 
+ + Attributes: + fieldmap (dict): A dictionary mapping field names to tuples containing + the start position, index position, length, and type code. + lastkey (bytes): The last key added to the FileTables object. + + """ HashWriter.__init__(self, dbfile) # Map field names to (startpos, indexpos, length, typecode) self.fieldmap = self.extras["fieldmap"] = {} - # Keep track of the last key added self.lastkey = emptybytes def start_field(self, fieldname): + """ + Start a new field in the hash table. + + This method is used to initialize a new field in the hash table. It sets the current position in the database file + as the starting position for the field and stores the field name. It also initializes an array to keep track of the + positions of all keys associated with this field. + + Args: + fieldname (str): The name of the field. + + Returns: + None + + Example: + To start a new field named "title", you can call this method as follows: + >>> start_field("title") + """ self.fieldstart = self.dbfile.tell() self.fieldname = fieldname # Keep an array of the positions of all keys @@ -593,6 +1501,33 @@ def start_field(self, fieldname): self.lastkey = emptybytes def add(self, key, value): + """ + Add a key-value pair to the hash table. + + Args: + - `key` (int): The key to add. It should be greater than any previously added key. + - `value` (Any): The value associated with the key. + + Raises: + - `ValueError`: If the key is less than or equal to the previous key. + + Returns: + - None + + Notes: + - This method appends the position of the value in the database file to the `poses` list. + - The `HashWriter.add` method is called to actually add the key-value pair to the hash table. + - The `lastkey` attribute is updated with the newly added key. + + Example usage: + ``` + table = FileTable() + table.add(1, "Value 1") + table.add(2, "Value 2") + table.add(3, "Value 3") + ``` + + """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.poses.append(self.dbfile.tell() - self.fieldstart) @@ -600,6 +1535,39 @@ def add(self, key, value): self.lastkey = key def end_field(self): + """ + End the current field in the hash table. + + This method stores the position index for the field in the `fieldmap` dictionary. + The `fieldmap` dictionary is used to keep track of the start and end positions of each field + in the hash table, as well as the number of positions and the typecode of the positions. + + Usage: + ------ + Call this method after adding all the positions for a field in the hash table. + It will update the `fieldmap` dictionary with the relevant information for the field. + + Example: + -------- + # Create a FileTables object + filetables = FileTables() + + # Add positions for a field + filetables.add_position(1) + filetables.add_position(2) + filetables.add_position(3) + + # End the field and update the fieldmap + filetables.end_field() + + Parameters: + ----------- + None + + Returns: + -------- + None + """ dbfile = self.dbfile fieldname = self.fieldname poses = self.poses @@ -613,66 +1581,295 @@ def end_field(self): class FieldedOrderedHashReader(HashReader): + """ + A subclass of HashReader that provides additional functionality for reading fielded ordered hash data. + + This class extends the HashReader class and adds methods for working with fielded ordered hash data. + It provides methods for iterating over terms, retrieving term data, checking if a term exists, + finding the closest term, and more. + + Usage: + 1. 
Create an instance of FieldedOrderedHashReader by passing the necessary arguments to the constructor. + 2. Use the various methods provided by this class to interact with the fielded ordered hash data. + + Example: + ``` + reader = FieldedOrderedHashReader(...) + for fieldname, term in reader.iter_terms(): + print(fieldname, term) + ``` + + Args: + *args: Variable length argument list to be passed to the parent class constructor. + **kwargs: Arbitrary keyword arguments to be passed to the parent class constructor. + + Attributes: + fieldmap (dict): A dictionary mapping field names to their corresponding start and end ranges. + fieldlist (list): A sorted list of field names with their start and end ranges. + + Methods: + field_start(fieldname): Get the start position of a field. + fielded_ranges(pos=None, eod=None): Generate fielded ranges for the given position range. + iter_terms(): Iterate over the terms in the fielded ordered hash data. + iter_term_items(): Iterate over the term items in the fielded ordered hash data. + contains_term(fieldname, btext): Check if a term exists in the fielded ordered hash data. + range_for_term(fieldname, btext): Get the range (position and length) of a term in the fielded ordered hash data. + term_data(fieldname, btext): Get the data associated with a term in the fielded ordered hash data. + term_get(fieldname, btext, default=None): Get the data associated with a term, or a default value if the term does not exist. + closest_term_pos(fieldname, key): Get the position of the closest term to the given key. + closest_term(fieldname, btext): Get the closest term to the given term in the fielded ordered hash data. + term_ranges_from(fieldname, btext): Generate term ranges starting from the given term in the fielded ordered hash data. + terms_from(fieldname, btext): Iterate over the terms starting from the given term in the fielded ordered hash data. + term_items_from(fieldname, btext): Iterate over the term items starting from the given term in the fielded ordered hash data. + """ + def __init__(self, *args, **kwargs): + """ + Initialize the FileTables object. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Raises: + None. + + Returns: + None. + + Notes: + This method initializes the FileTables object by calling the __init__ method of the HashReader class. + It also sets the fieldmap attribute using the extras dictionary passed as a keyword argument. + The fieldmap is a dictionary that maps field names to their corresponding start position, index position, and other information. + The fieldlist attribute is then created as a sorted list of tuples, where each tuple contains the field name, start position, and index position. + + Usage: + filetables = FileTables(*args, **kwargs) + """ HashReader.__init__(self, *args, **kwargs) self.fieldmap = self.extras["fieldmap"] # Make a sorted list of the field names with their start and end ranges self.fieldlist = [] for fieldname in sorted(self.fieldmap.keys()): - startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] + startpos, ixpos, _, __ = self.fieldmap[fieldname] self.fieldlist.append((fieldname, startpos, ixpos)) def field_start(self, fieldname): + """ + Returns the start position of the specified field in the file. + + Parameters: + fieldname (str): The name of the field. + + Returns: + int: The start position of the field in the file. + + Raises: + KeyError: If the specified fieldname does not exist in the fieldmap. 
+ + Example: + >>> field_start('title') + 10 + + Note: + The start position of a field represents the byte offset in the file where the field's data begins. + This method is used internally by the filetables module to retrieve the start position of a field. + """ return self.fieldmap[fieldname][0] def fielded_ranges(self, pos=None, eod=None): + """ + Generator that yields field information for each key-value pair in the filetable. + + Args: + pos (int, optional): The starting position to iterate from. Defaults to None. + eod (int, optional): The ending position to iterate until. Defaults to None. + + Yields: + tuple: A tuple containing the field name, key position, key length, data position, and data length. + + Raises: + IndexError: If the starting position is out of range. + + Notes: + - This method is used to iterate over the field information of each key-value pair in the filetable. + - The field information includes the field name, key position, key length, data position, and data length. + - If the starting position is not specified, the iteration starts from the beginning of the filetable. + - If the ending position is not specified, the iteration continues until the end of the filetable. + - If the starting position is out of range, an IndexError is raised. + """ flist = self.fieldlist fpos = 0 - fieldname, start, end = flist[fpos] + fieldname, _, end = flist[fpos] for keypos, keylen, datapos, datalen in self._ranges(pos, eod): if keypos >= end: fpos += 1 - fieldname, start, end = flist[fpos] + fieldname, _, end = flist[fpos] yield fieldname, keypos, keylen, datapos, datalen def iter_terms(self): + """ + Iterates over the terms in the filetable. + + Yields tuples containing the field name and the term value for each term in the filetable. + + Returns: + Iterator[tuple]: An iterator over the terms in the filetable. + + Notes: + This method retrieves the terms from the filetable using the `get` method of the `dbfile` object. + It iterates over the fielded ranges in the filetable and yields tuples containing the field name + and the term value for each term. + + Example: + >>> for fieldname, term in filetable.iter_terms(): + ... print(fieldname, term) + """ get = self.dbfile.get for fieldname, keypos, keylen, _, _ in self.fielded_ranges(): yield fieldname, get(keypos, keylen) def iter_term_items(self): + """ + Iterates over the term items in the file table. + + Yields tuples containing the field name, key, and data for each term item. + + Parameters: + - None + + Returns: + - Generator: A generator that yields tuples of the form (fieldname, key, data). + + Example usage: + ``` + for fieldname, key, data in iter_term_items(): + # Process the fieldname, key, and data + ... + ``` + """ get = self.dbfile.get for item in self.fielded_ranges(): fieldname, keypos, keylen, datapos, datalen = item yield fieldname, get(keypos, keylen), get(datapos, datalen) def contains_term(self, fieldname, btext): + """ + Checks if the given term exists in the specified field. + + Parameters: + fieldname (str): The name of the field to search in. + btext (bytes): The term to search for, encoded as bytes. + + Returns: + bool: True if the term exists in the field, False otherwise. + + Raises: + KeyError: If the field or term does not exist. 
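The ``fielded_ranges()``/``iter_terms()`` machinery above walks the global key ranges in file order and advances through the sorted ``(fieldname, startpos, ixpos)`` list whenever a key position passes the current field's end offset. A simplified, in-memory illustration of that bookkeeping (the positions are hypothetical, not the real file layout):

```python
# Hypothetical offsets; shows how key positions are attributed to fields by
# stepping through a sorted (fieldname, start, end) list, as fielded_ranges() does.
def fielded_keys(fieldlist, key_positions):
    """fieldlist: sorted [(fieldname, start, end)]; key_positions: ascending offsets."""
    fpos = 0
    fieldname, _, end = fieldlist[fpos]
    for keypos in key_positions:
        if keypos >= end:          # this key belongs to the next field's region
            fpos += 1
            fieldname, _, end = fieldlist[fpos]
        yield fieldname, keypos

fields = [("body", 0, 100), ("title", 100, 180)]
print(list(fielded_keys(fields, [10, 40, 120])))
# [('body', 10), ('body', 40), ('title', 120)]
```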
+ + Example: + >>> table = FileTables() + >>> table.contains_term("title", b"example") + True + """ try: - x = self.range_for_term(fieldname, btext) + _ = self.range_for_term(fieldname, btext) return True except KeyError: return False def range_for_term(self, fieldname, btext): - start, ixpos, ixsize, code = self.fieldmap[fieldname] + """ + Returns the range (datapos, datalen) for a given term in a specific field. + + Args: + fieldname (str): The name of the field. + btext (bytes): The term to search for. + + Returns: + tuple: A tuple containing the data position (datapos) and data length (datalen) for the term. + + Raises: + KeyError: If the term is not found in the field. + + """ + start, ixpos, _, __ = self.fieldmap[fieldname] for datapos, datalen in self.ranges_for_key(btext): if start < datapos < ixpos: return datapos, datalen raise KeyError((fieldname, btext)) def term_data(self, fieldname, btext): + """ + Retrieve the data associated with a term in a specific field. + + Args: + fieldname (str): The name of the field. + btext (bytes): The term to retrieve the data for. + + Returns: + bytes: The data associated with the term. + + Raises: + KeyError: If the term or field does not exist. + + Notes: + This method retrieves the data associated with a term in a specific field + from the file database. It uses the `range_for_term` method to determine + the position and length of the data in the database file, and then retrieves + the data using the `get` method of the `dbfile` object. + + Example usage: + ``` + fieldname = "title" + term = b"example" + data = term_data(fieldname, term) + print(data) + ``` + """ datapos, datalen = self.range_for_term(fieldname, btext) return self.dbfile.get(datapos, datalen) def term_get(self, fieldname, btext, default=None): + """ + Retrieve the term data for a given field and term text. + + Args: + fieldname (str): The name of the field. + btext (bytes): The term text in bytes. + default: The value to return if the term data is not found. + + Returns: + The term data for the given field and term text, or the default value if not found. + """ try: return self.term_data(fieldname, btext) except KeyError: return default def closest_term_pos(self, fieldname, key): - # Given a key, return the position of that key OR the next highest key - # if the given key does not exist + """ + Given a key, return the position of that key OR the next highest key if the given key does not exist. + + Args: + fieldname (str): The name of the field. + key (bytes): The key to search for. + + Returns: + int or None: The position of the key in the index array, or None if the key is not found. + + Raises: + TypeError: If the key is not of type bytes. + ValueError: If the index type is unknown. + + Note: + This method assumes that the index array is sorted in ascending order. + + Example: + >>> db = FileTables() + >>> db.closest_term_pos("title", b"apple") + 10 + """ if not isinstance(key, bytes): raise TypeError(f"Key {key!r} should be bytes") @@ -691,7 +1888,7 @@ def closest_term_pos(self, fieldname, key): elif ixtype == "q": get_pos = dbfile.get_long else: - raise Exception(f"Unknown index type {ixtype!r}") + raise ValueError(f"Unknown index type {ixtype}") # Do a binary search of the positions in the index array lo = 0 @@ -711,25 +1908,82 @@ def closest_term_pos(self, fieldname, key): return startpos + get_pos(ixpos + lo * ixsize) def closest_term(self, fieldname, btext): + """ + Returns the closest term to the given text in the specified field. 
+ + Args: + fieldname (str): The name of the field to search in. + btext (bytes): The text to find the closest term for. + + Returns: + str or None: The closest term to the given text in the specified field, + or None if no term is found. + + """ pos = self.closest_term_pos(fieldname, btext) if pos is None: return None return self.key_at(pos) def term_ranges_from(self, fieldname, btext): + """ + Returns a generator that yields term ranges for a given field and binary text. + + Args: + fieldname (str): The name of the field. + btext (bytes): The binary text to search for. + + Yields: + tuple: A tuple representing a term range. Each tuple contains two integers, + representing the start and end positions of the term in the index. + + Returns None if no term is found for the given field and binary text. + """ + pos = self.closest_term_pos(fieldname, btext) if pos is None: return - startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] + _, ixpos, __, ___ = self.fieldmap[fieldname] yield from self._ranges(pos, ixpos) def terms_from(self, fieldname, btext): + """ + Retrieves terms from the specified field that match the given binary text. + + Args: + fieldname (str): The name of the field to retrieve terms from. + btext (bytes): The binary text to match against the terms. + + Yields: + bytes: The terms that match the given binary text. + + """ dbfile = self.dbfile for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext): yield dbfile.get(keypos, keylen) def term_items_from(self, fieldname, btext): + """ + Retrieves term items from the file database for a given field and binary text. + + Args: + fieldname (str): The name of the field to retrieve term items from. + btext (bytes): The binary text to match against. + + Yields: + tuple: A tuple containing the key and data associated with each term item. + + Returns: + None + + Raises: + None + + Example: + >>> for key, data in term_items_from("title", b"example"): + ... print(key, data) + """ dbfile = self.dbfile for item in self.term_ranges_from(fieldname, btext): keypos, keylen, datapos, datalen = item diff --git a/src/whoosh/filedb/filewriting.py b/src/whoosh/filedb/filewriting.py index 36060a68..91ca817c 100644 --- a/src/whoosh/filedb/filewriting.py +++ b/src/whoosh/filedb/filewriting.py @@ -42,16 +42,52 @@ def NO_MERGE(ix, writer, segments): - """This policy does not merge any existing segments.""" + """ + This policy does not merge any existing segments. + + Parameters: + - ix (Index): The index object. + - writer (IndexWriter): The index writer object. + - segments (list): The list of existing segments. + + Returns: + - list: The list of existing segments, unchanged. + + Usage: + - Use this policy when you want to prevent any merging of existing segments in the index. + - This can be useful in scenarios where you want to maintain the original segment structure without any merging. + """ _ = ix, writer return segments def MERGE_SMALL(ix, writer, segments): - """This policy merges small segments, where "small" is defined using a - heuristic based on the fibonacci sequence. """ - + Merge small segments based on a heuristic using the Fibonacci sequence. + + This policy merges small segments, where "small" is defined using a heuristic based on the Fibonacci sequence. + The segments are sorted based on their document count, and then merged according to the heuristic. + + Parameters: + - ix (Index): The Whoosh index object. + - writer (IndexWriter): The writer object used for merging segments. 
+ - segments (list): A list of segments to be merged. + + Returns: + - newsegments (SegmentSet): The merged segments. + + Usage: + - Call this function to merge small segments in an index. Pass the index object, writer object, and the list of segments to be merged. + - The function will merge the segments based on the Fibonacci sequence heuristic and return the merged segments. + + Example: + ``` + ix = Index("/path/to/index") + writer = ix.writer() + segments = [segment1, segment2, segment3] + newsegments = MERGE_SMALL(ix, writer, segments) + ``` + """ from whoosh.filedb.filereading import SegmentReader newsegments = SegmentSet() @@ -68,8 +104,31 @@ def MERGE_SMALL(ix, writer, segments): def OPTIMIZE(ix, writer, segments): - """This policy merges all existing segments.""" + """ + Merge all existing segments into a single segment. + + This function merges all the segments specified in the `segments` list into a single segment. + It uses the `writer` object to add a reader for each segment, and then returns an empty `SegmentSet`. + Parameters: + - ix (Index): The index object. + - writer (IndexWriter): The index writer object. + - segments (list): A list of segment names to be merged. + + Returns: + - SegmentSet: An empty `SegmentSet` object. + + Example: + >>> ix = Index(...) + >>> writer = IndexWriter(...) + >>> segments = ['segment1', 'segment2', 'segment3'] + >>> OPTIMIZE(ix, writer, segments) + + + Note: + - This function assumes that the `SegmentReader` class is imported from `whoosh.filedb.filereading`. + - The `SegmentSet` object returned by this function is not used or modified further in the code snippet provided. + """ from whoosh.filedb.filereading import SegmentReader for seg in segments: @@ -78,6 +137,78 @@ def OPTIMIZE(ix, writer, segments): class SegmentWriter(SegmentDeletionMixin, IndexWriter): + """A class for writing segments in an index. + + This class is responsible for writing segments in an index. It handles the creation + of temporary segment files, writing term indexes, term postings, vector indexes, + vector postings, stored fields, and field lengths. + + Parameters: + - ix (Index): The index to write the segment to. + - poolclass (class, optional): The class to use for the pool. Defaults to None. + - procs (int, optional): The number of processes to use for the pool. Defaults to 0. + - blocklimit (int, optional): The block limit for the posting writer. Defaults to 128. + - timeout (float, optional): The timeout for acquiring the lock. Defaults to 0.0. + - delay (float, optional): The delay between attempts to acquire the lock. Defaults to 0.1. + - **poolargs: Additional keyword arguments to pass to the pool class. + + Attributes: + - lock (Lock): The lock object used to acquire the lock for writing the segment. + - index (Index): The index to write the segment to. + - segments (list): The list of segments in the index. + - blocklimit (int): The block limit for the posting writer. + - schema (Schema): The schema of the index. + - name (str): The name of the segment. + - _searcher (Searcher): The searcher object for the index. + - docnum (int): The document number. + - fieldlength_totals (defaultdict): The total field lengths. + - termsindex (FileTableWriter): The file table writer for the terms index. + - postwriter (FilePostingWriter): The file posting writer for the term postings. + - vectorindex (StructHashWriter): The struct hash writer for the vector index. + - vpostwriter (FilePostingWriter): The file posting writer for the vector postings. 
+ - storedfields (FileListWriter): The file list writer for the stored fields. + - fieldlengths (File): The file for the field lengths. + - pool (Pool): The pool object for the field lengths. + + Methods: + - searcher(): Returns a searcher object for the index. + - add_reader(reader): Adds a reader object to the segment writer. + - add_document(**fields): Adds a document to the segment writer. + - _add_stored_fields(storeddict): Adds stored fields to the segment writer. + - _add_vector(fieldnum, vlist): Adds a vector to the segment writer. + - _close_all(): Closes all files used by the segment writer. + - commit(mergetype=MERGE_SMALL): Commits the segment writer and releases the lock. + - cancel(): Cancels the segment writer and releases the lock. + + Usage: + 1. Create an instance of SegmentWriter by providing the index to write the segment to. + 2. Optionally, you can specify the pool class, the number of processes to use for the pool, + the block limit for the posting writer, the timeout for acquiring the lock, and the delay + between attempts to acquire the lock. + 3. Use the various methods provided by SegmentWriter to add documents, stored fields, and vectors + to the segment writer. + 4. Call the commit() method to commit the segment writer and release the lock. + 5. If needed, you can cancel the segment writer and release the lock by calling the cancel() method. + + Example: + ```python + from whoosh import index + from whoosh.filedb.filewriting import SegmentWriter + + # Open an existing index + ix = index.open_dir("my_index") + + # Create a SegmentWriter + writer = SegmentWriter(ix) + + # Add a document to the segment writer + writer.add_document(title="Example Document", content="This is an example document.") + + # Commit the segment writer + writer.commit() + ``` + """ + def __init__( self, ix, @@ -88,6 +219,26 @@ def __init__( delay=0.1, **poolargs, ): + """ + Initialize a FileWriter object. + + Parameters: + - ix (Index): The index object to write to. + - poolclass (class, optional): The class to use for multiprocessing. If not provided, it defaults to MultiPool if procs > 1, otherwise TempfilePool. + - procs (int, optional): The number of processes to use for multiprocessing. Defaults to 0, which means no multiprocessing. + - blocklimit (int, optional): The maximum number of documents to write in a single block. Defaults to 128. + - timeout (float, optional): The maximum time to wait for acquiring the lock. Defaults to 0.0, which means no timeout. + - delay (float, optional): The delay between attempts to acquire the lock. Defaults to 0.1 seconds. + - **poolargs (dict, optional): Additional keyword arguments to pass to the poolclass constructor. + + Raises: + - LockError: If the lock cannot be acquired within the specified timeout. + + Usage: + - Create an instance of FileWriter by passing an Index object. + - Optionally, specify the poolclass, procs, blocklimit, timeout, delay, and additional poolargs. + - Use the FileWriter object to write documents to the index. + """ self.lock = ix.storage.lock(ix.indexname + "_LOCK") if not try_for(self.lock.acquire, timeout=timeout, delay=delay): raise LockError @@ -151,9 +302,52 @@ def encode_storedfields(fielddict): self.pool = poolclass(self.fieldlengths, procs=procs, **poolargs) def searcher(self): + """ + Returns a searcher object for the index. + + This method creates and returns a searcher object that can be used to search the index. + The searcher object provides methods for executing queries and retrieving search results. 
+ + Returns: + Searcher: A searcher object for the index. + + Example: + >>> index = Index() + >>> writer = index.writer() + >>> # ... add documents to the index ... + >>> searcher = writer.searcher() + >>> results = searcher.search(Query("hello")) + """ return self.index.searcher() def add_reader(self, reader): + """ + Adds documents from the given reader to the index. + + Parameters: + - reader (Reader): The reader object containing the documents to be added. + + This method adds stored documents, vectors, and field lengths from the given reader + to the index. It also handles deletions, if any, and updates the document mapping accordingly. + + Note: + - The reader object must implement the following methods: + - `has_deletions()`: Returns True if the reader has deleted documents, False otherwise. + - `doc_count_all()`: Returns the total number of documents in the reader. + - `is_deleted(docnum)`: Returns True if the document with the given docnum is deleted, False otherwise. + - `stored_fields(docnum)`: Returns the stored fields of the document with the given docnum. + - `scorable_fields()`: Returns a list of field numbers that are scorable. + - `doc_field_length(docnum, fieldnum)`: Returns the length of the field with the given fieldnum in the document with the given docnum. + - `has_vector(docnum, fieldnum)`: Returns True if the document with the given docnum has a vector for the field with the given fieldnum, False otherwise. + - `vector(docnum, fieldnum)`: Returns the vector for the field with the given fieldnum in the document with the given docnum. + - `postings(fieldnum, text)`: Returns a Postings object for the given fieldnum and text. + + Returns: + None + + Raises: + None + """ startdoc = self.docnum has_deletions = reader.has_deletions() @@ -203,6 +397,27 @@ def add_reader(self, reader): self.pool.add_posting(fieldnum, text, newdoc, freq, valuestring) def add_document(self, **fields): + """ + Add a document to the index. + + Args: + **fields: Keyword arguments representing the fields of the document. + The field names should match the names defined in the schema. + + Raises: + UnknownFieldError: If a field name provided does not exist in the schema. + + Notes: + - The fields are sorted based on their order in the schema. + - The indexed fields are added to the index. + - The vector fields are processed and added to the index. + - The stored fields are stored in the index. + + Example: + schema = Schema(title=TEXT(stored=True), content=TEXT) + writer = IndexWriter(index, schema) + writer.add_document(title="Document 1", content="This is the content of Document 1") + """ schema = self.schema name2num = schema.name_to_number @@ -250,9 +465,46 @@ def add_document(self, **fields): self.docnum += 1 def _add_stored_fields(self, storeddict): + """ + Adds a stored field dictionary to the list of stored fields. + + Args: + storeddict (dict): A dictionary containing the stored field data. + + Returns: + None + + Notes: + - The stored field dictionary should contain key-value pairs representing the field name and its value. + - The stored fields are used to store additional data associated with a document. + - The stored fields can be retrieved later during search or retrieval operations. + + Example: + storeddict = {"title": "Sample Document", "author": "John Doe"} + _add_stored_fields(storeddict) + """ self.storedfields.append(storeddict) def _add_vector(self, fieldnum, vlist): + """ + Add a vector to the index for a given field. + + Args: + fieldnum (int): The field number. 
+ vlist (list): A list of tuples containing the text and valuestring for each vector. + + Raises: + AssertionError: If the text is not of type unicode. + + Notes: + This method adds a vector to the index for a specific field. It takes a list of tuples, where each tuple contains the text and valuestring for a vector. The text should be of type unicode. + + The method uses the vpostwriter to write the vectors to the index file. It starts by obtaining the vformat from the schema for the given field. It then iterates over the vlist and writes each vector to the vpostwriter. Finally, it finishes writing the vectors and adds the vector offset to the vectorindex. + + Example: + vlist = [(u"example text", "valuestring1"), (u"another text", "valuestring2")] + _add_vector(0, vlist) + """ vpostwriter = self.vpostwriter vformat = self.schema[fieldnum].vector @@ -265,6 +517,28 @@ def _add_vector(self, fieldnum, vlist): self.vectorindex.add((self.docnum, fieldnum), offset) def _close_all(self): + """ + Closes all the file resources used by the writer. + + This method is responsible for closing the terms index, post writer, vector index, + vpost writer, stored fields, and field lengths. It ensures that all the resources + are properly closed to prevent any data corruption or resource leaks. + + Usage: + Call this method when you are done writing to the index and want to release + the file resources. It is important to call this method to ensure that all + changes are persisted and the files are closed properly. + + Note: + - If the vector index or vpost writer is not initialized, they will not be closed. + - The field lengths are only closed if they are not already closed. + + Raises: + None + + Returns: + None + """ self.termsindex.close() self.postwriter.close() if self.vectorindex: @@ -276,6 +550,22 @@ def _close_all(self): self.fieldlengths.close() def commit(self, mergetype=MERGE_SMALL): + """ + Commits the changes made by the writer to the index. + + This method finalizes the changes made by the writer and commits them to the index. + It performs the following steps: + 1. Calls the merge policy function to determine if any segments need to be merged into the writer's pool. + 2. Informs the pool to add its accumulated data to the terms index and posting file. + 3. Creates a new segment object for the segment created by this writer and adds it to the list of remaining segments. + 4. Closes all files, writes a new TOC (Table of Contents) with the updated segment list, and releases the lock. + + Parameters: + - mergetype (optional): The merge policy function to be used for determining which segments to merge. Default is MERGE_SMALL. + + Returns: + None + """ # Call the merge policy function. The policy may choose to merge other # segments into this writer's pool new_segments = mergetype(self.index, self, self.segments) @@ -302,6 +592,21 @@ def commit(self, mergetype=MERGE_SMALL): self.lock.release() def cancel(self): + """ + Cancels the current operation and releases any acquired resources. + + This method cancels the current operation by calling the `cancel` method of the underlying + thread pool. It also closes all open file handles and releases the lock held by the current + thread. + + Note: + This method should be called if the current operation needs to be canceled or if any + acquired resources need to be released. 
+ + Example: + >>> writer.cancel() + + """ self.pool.cancel() self._close_all() self.lock.release() diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py index bfd1b80c..302b421a 100644 --- a/src/whoosh/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -15,6 +15,49 @@ To open an existing index:: ix = DatastoreStorage().open_index() + +This module provides the following classes: + +- `DatastoreFile`: A file-like object that is backed by a BytesIO() object whose contents + is loaded from a BlobProperty in the app engine datastore. + +- `MemcacheLock`: A lock object that uses the Google App Engine memcache service for synchronization. + +- `DatastoreStorage`: An implementation of `whoosh.store.Storage` that stores files in + the app engine datastore as blob properties. + +Usage: +1. Creating an index: + storage = DatastoreStorage() + schema = Schema(...) + index = storage.create_index(schema) + +2. Opening an existing index: + storage = DatastoreStorage() + index = storage.open_index() + +3. Listing all files in the storage: + storage = DatastoreStorage() + files = storage.list() + +4. Deleting a file: + storage = DatastoreStorage() + storage.delete_file(filename) + +5. Renaming a file: + storage = DatastoreStorage() + storage.rename_file(old_filename, new_filename) + +6. Creating a new file: + storage = DatastoreStorage() + file = storage.create_file(filename) + +7. Opening an existing file: + storage = DatastoreStorage() + file = storage.open_file(filename) + +Note: This class assumes that the necessary dependencies and configurations +for using the app engine datastore are already set up. """ import time @@ -31,17 +74,81 @@ class DatastoreFile(db.Model): """A file-like object that is backed by a BytesIO() object whose contents is loaded from a BlobProperty in the app engine datastore. + + Attributes: + value (db.BlobProperty): The contents of the file stored as a BlobProperty. + mtime (db.IntegerProperty): The modification time of the file in seconds since the epoch. + + Methods: + __init__: Initializes a new instance of the DatastoreFile class. + loadfile: Loads a DatastoreFile object from the datastore or memcache. + close: Closes the file, updates the value and mtime properties, and stores the changes in the datastore. + tell: Returns the current position in the file. + write: Writes the specified data to the file. + read: Reads the specified number of bytes from the file. + seek: Changes the current position in the file. + readline: Reads a line from the file. + getvalue: Returns the contents of the file as a string. + + Usage: + # Create a new DatastoreFile object + file = DatastoreFile() + + # Load a DatastoreFile object from the datastore or memcache + file = DatastoreFile.loadfile("filename") + + # Read from the file + data = file.read(100) + + # Write to the file + file.write("Hello, World!") + + # Close the file and store the changes in the datastore + file.close() """ value = db.BlobProperty() mtime = db.IntegerProperty(default=0) def __init__(self, *args, **kwargs): + """ + Initialize a GAEStorage object. + + This method initializes the GAEStorage object by calling the parent class's + __init__ method and setting up the necessary attributes. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Attributes: + data (BytesIO): A BytesIO object to store the data. + + Returns: + None + """ super().__init__(*args, **kwargs) self.data = BytesIO() @classmethod def loadfile(cls, name): + """ + Load a file from the datastore or memcache. 
+ + This method retrieves a file from the datastore or memcache based on the given name. + If the file is not found in memcache, it is fetched from the datastore and stored in memcache for future use. + + Parameters: + - cls: The class representing the file entity. + - name: The name of the file to load. + + Returns: + - file: The loaded file object. + + Usage: + file = loadfile(FileEntity, "example.txt") + """ + value = memcache.get(name, namespace="DatastoreFile") if value is None: file = cls.get_by_key_name(name) @@ -52,6 +159,19 @@ def loadfile(cls, name): return file def close(self): + """ + Closes the file and updates the value in the datastore. + + This method is responsible for closing the file and updating the value in the datastore + if the value has changed. It also updates the modification time and stores the value in + the memcache for faster access. + + Returns: + None + + Raises: + None + """ oldvalue = self.value self.value = self.getvalue() if oldvalue != self.value: @@ -60,29 +180,161 @@ def close(self): memcache.set(self.key().id_or_name(), self.value, namespace="DatastoreFile") def tell(self): + """ + Returns the current position of the file pointer. + + This method returns the current position of the file pointer within the file. + It is equivalent to calling the `tell()` method on the underlying file object. + + Returns: + int: The current position of the file pointer. + + Example: + >>> file = GAEFile(...) + >>> file.tell() + 42 + """ return self.data.tell() def write(self, data): + """ + Writes the given data to the file. + + Args: + data (bytes): The data to be written to the file. + + Returns: + int: The number of bytes written. + + Raises: + IOError: If an error occurs while writing to the file. + + Example: + >>> file = File() + >>> data = b"Hello, World!" + >>> file.write(data) + 13 + """ return self.data.write(data) def read(self, length): + """ + Read the specified number of bytes from the data. + + Args: + length (int): The number of bytes to read. + + Returns: + bytes: The bytes read from the data. + + Raises: + IOError: If an error occurs while reading the data. + + Example: + To read 10 bytes from the data, you can use the following code: + + >>> data = GAEFileData() + >>> data.read(10) + """ return self.data.read(length) def seek(self, *args): + """ + Seeks to a specified position in the file. + + Args: + *args: Variable-length argument list. The arguments are passed to the underlying `seek` method. + + Returns: + int: The new position in the file. + + Raises: + OSError: If an error occurs while seeking the file. + + Example: + To seek to the beginning of the file, use `seek(0)`. + + """ return self.data.seek(*args) def readline(self): + """ + Read and return the next line from the data file. + + Returns: + str: The next line from the data file. + + Raises: + None + + Notes: + This method reads and returns the next line from the data file associated with the current instance of the `GAEFile` class. + + Example: + >>> file = GAEFile() + >>> line = file.readline() + """ return self.data.readline() def getvalue(self): + """ + Returns the value stored in the data attribute. + + This method retrieves the value stored in the data attribute of the current object. + It returns the value as a string. + + Returns: + str: The value stored in the data attribute. + + Example: + >>> obj = ClassName() + >>> obj.getvalue() + 'some value' + """ return self.data.getvalue() class MemcacheLock: + """ + A lock implementation using Google App Engine's memcache. 
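``loadfile()`` and ``close()`` above implement a simple read-through/write-through cache: reads consult memcache first and fall back to the datastore, while ``close()`` persists the new value and refreshes the cache only when the contents actually changed. A back-end-agnostic sketch of that pattern, with plain dicts standing in for memcache and the datastore:

```python
# Plain dicts stand in for memcache and the datastore; the caching pattern is
# the point here, not the App Engine APIs.
cache, store = {}, {}

def load(name):
    value = cache.get(name)               # read-through: try the cache first
    if value is None:
        value = store.get(name, b"")      # fall back to the backing store
        cache[name] = value               # and prime the cache for next time
    return value

def save(name, new_value):
    if store.get(name) != new_value:      # write only when the contents changed
        store[name] = new_value
        cache[name] = new_value           # keep the cache in sync

save("f1", b"hello")
print(load("f1"))   # b'hello' (served from the cache on later calls)
```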
+ + This class provides a simple lock mechanism using memcache to synchronize access to a resource. + It allows acquiring and releasing locks, with an optional blocking behavior. + + Usage: + lock = MemcacheLock("my_lock_name") + lock.acquire() # Acquire the lock + # Critical section + lock.release() # Release the lock + + If blocking is set to True, the acquire method will block until the lock is acquired. + If the lock is already acquired by another process, the acquire method will retry every 0.1 seconds until it succeeds. + + Note: This lock implementation assumes that the memcache service is available and properly configured. + + Args: + name (str): The name of the lock. + + Attributes: + name (str): The name of the lock. + + """ + def __init__(self, name): self.name = name def acquire(self, blocking=False): + """ + Acquire the lock. + + Args: + blocking (bool, optional): If True, the method will block until the lock is acquired. + If False (default), the method will return immediately. + + Returns: + bool: True if the lock is acquired, False otherwise. + + """ val = memcache.add(self.name, "L", 360, namespace="whooshlocks") if blocking and not val: @@ -96,15 +348,70 @@ def acquire(self, blocking=False): return val def release(self): + """ + Release the lock. + + """ memcache.delete(self.name, namespace="whooshlocks") class DatastoreStorage(Storage): """An implementation of :class:`whoosh.store.Storage` that stores files in the app engine datastore as blob properties. + + This class provides methods to create, open, list, clean, and manipulate files + stored in the app engine datastore. It is designed to be used as a storage + backend for the Whoosh search engine library. + + Usage: + 1. Creating an index: + storage = DatastoreStorage() + schema = Schema(...) + index = storage.create_index(schema) + + 2. Opening an existing index: + storage = DatastoreStorage() + index = storage.open_index() + + 3. Listing all files in the storage: + storage = DatastoreStorage() + files = storage.list() + + 4. Deleting a file: + storage = DatastoreStorage() + storage.delete_file(filename) + + 5. Renaming a file: + storage = DatastoreStorage() + storage.rename_file(old_filename, new_filename) + + 6. Creating a new file: + storage = DatastoreStorage() + file = storage.create_file(filename) + + 7. Opening an existing file: + storage = DatastoreStorage() + file = storage.open_file(filename) + + Note: This class assumes that the necessary dependencies and configurations + for using the app engine datastore are already set up. + """ def create_index(self, schema, indexname=_DEF_INDEX_NAME): + """Create a new index with the given schema. + + Args: + schema (Schema): The schema for the index. + indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. + + Returns: + FileIndex: The created index. + + Raises: + ReadOnlyError: If the storage is in read-only mode. + + """ if self.readonly: raise ReadOnlyError @@ -112,32 +419,103 @@ def create_index(self, schema, indexname=_DEF_INDEX_NAME): return FileIndex(self, schema, indexname) def open_index(self, indexname=_DEF_INDEX_NAME, schema=None): + """Open an existing index. + + Args: + indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. + schema (Schema, optional): The schema for the index. Defaults to None. + + Returns: + FileIndex: The opened index. + + """ return FileIndex(self, schema=schema, indexname=indexname) def list(self): + """List all files in the storage. + + Returns: + list: A list of file names. 
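The lock above relies on the fact that a memcache "add" only succeeds when the key does not already exist, so whichever process adds the key first holds the lock; blocking callers simply retry until their add succeeds. A self-contained sketch of the same idea, with an in-memory dict standing in for memcache (no real concurrency, just the control flow):

```python
import time

_store = {}   # stands in for memcache

def _add(key, value):
    """Succeed only if the key is not already present (memcache.add semantics)."""
    if key in _store:
        return False
    _store[key] = value
    return True

def acquire(name, blocking=False, delay=0.1):
    while True:
        if _add(name, "L"):
            return True
        if not blocking:
            return False
        time.sleep(delay)      # blocking callers retry until the add succeeds

def release(name):
    _store.pop(name, None)

print(acquire("my_lock"))      # True
print(acquire("my_lock"))      # False, already held
release("my_lock")
print(acquire("my_lock"))      # True again
```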
+ + """ query = DatastoreFile.all() return [file.key().id_or_name() for file in query] def clean(self): + """Clean up the storage. + + This method does nothing in the case of the app engine datastore storage. + + """ pass def total_size(self): + """Get the total size of the storage. + + Returns: + int: The total size in bytes. + + """ return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): + """Check if a file exists in the storage. + + Args: + name (str): The name of the file. + + Returns: + bool: True if the file exists, False otherwise. + + """ return DatastoreFile.get_by_key_name(name) is not None def file_modified(self, name): + """Get the modification time of a file. + + Args: + name (str): The name of the file. + + Returns: + datetime: The modification time of the file. + + """ return DatastoreFile.get_by_key_name(name).mtime def file_length(self, name): + """Get the length of a file. + + Args: + name (str): The name of the file. + + Returns: + int: The length of the file in bytes. + + """ return len(DatastoreFile.get_by_key_name(name).value) def delete_file(self, name): + """Delete a file from the storage. + + Args: + name (str): The name of the file. + + Returns: + bool: True if the file was successfully deleted, False otherwise. + + """ memcache.delete(name, namespace="DatastoreFile") return DatastoreFile.get_by_key_name(name).delete() def rename_file(self, name, newname, safe=False): + """Rename a file in the storage. + + Args: + name (str): The current name of the file. + newname (str): The new name for the file. + safe (bool, optional): Whether to perform a safe rename. Defaults to False. + + """ file = DatastoreFile.get_by_key_name(name) newfile = DatastoreFile(key_name=newname) newfile.value = file.value @@ -146,6 +524,16 @@ def rename_file(self, name, newname, safe=False): file.delete() def create_file(self, name, **kwargs): + """Create a new file in the storage. + + Args: + name (str): The name of the file. + **kwargs: Additional keyword arguments. + + Returns: + StructFile: The created file. + + """ f = StructFile( DatastoreFile(key_name=name), name=name, @@ -154,11 +542,40 @@ def create_file(self, name, **kwargs): return f def open_file(self, name, *args, **kwargs): + """Open an existing file in the storage. + + Args: + name (str): The name of the file. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + + Returns: + StructFile: The opened file. + + """ return StructFile(DatastoreFile.loadfile(name)) def lock(self, name): + """Lock a file in the storage. + + Args: + name (str): The name of the file. + + Returns: + MemcacheLock: The lock object. + + """ return MemcacheLock(name) def temp_storage(self, name=None): + """Create a temporary storage. + + Args: + name (str, optional): The name of the temporary storage. Defaults to None. + + Returns: + DatastoreStorage: The temporary storage. + + """ tempstore = DatastoreStorage() return tempstore.create() diff --git a/src/whoosh/filedb/misc.py b/src/whoosh/filedb/misc.py index 0b66516b..d4647e1a 100644 --- a/src/whoosh/filedb/misc.py +++ b/src/whoosh/filedb/misc.py @@ -30,11 +30,46 @@ def encode_termkey(term): + """ + Encodes a term key. + + This function takes a term tuple consisting of a field number and text, and encodes it into a byte string. + The field number is packed as an unsigned short, followed by the UTF-8 encoded text. + + Parameters: + term (tuple): A tuple containing the field number and text. + + Returns: + bytes: The encoded term key as a byte string. 
+ + Example: + >>> term = (1, "example") + >>> encode_termkey(term) + b'\x00\x01example' + """ fieldnum, text = term return pack_ushort(fieldnum) + utf8encode(text)[0] def decode_termkey(key): + """ + Decode a term key. + + Args: + key (bytes): The term key to decode. + + Returns: + tuple: A tuple containing the decoded term key. The first element is an + unsigned short integer, and the second element is a Unicode string. + + Raises: + IndexError: If the key is too short to be decoded. + + Example: + >>> key = b'\x00\x01hello' + >>> decode_termkey(key) + (1, 'hello') + """ return (unpack_ushort(key[:_SHORT_SIZE])[0], utf8decode(key[_SHORT_SIZE:])[0]) diff --git a/src/whoosh/filedb/pools.py b/src/whoosh/filedb/pools.py index fee6f3d0..014e2a69 100644 --- a/src/whoosh/filedb/pools.py +++ b/src/whoosh/filedb/pools.py @@ -66,10 +66,32 @@ def imerge(iterators): + """ + Merge multiple sorted iterators into a single sorted iterator. + + This function takes a list of sorted iterators and merges them into a single + sorted iterator. It uses a heap data structure to efficiently merge the + iterators. + + Parameters: + - iterators (list): A list of sorted iterators to be merged. + + Yields: + - item: The next item in the merged sorted iterator. + + Example: + ``` + iterators = [iter([1, 3, 5]), iter([2, 4, 6]), iter([7, 8, 9])] + merged_iterator = imerge(iterators) + for item in merged_iterator: + print(item) + # Output: 1, 2, 3, 4, 5, 6, 7, 8, 9 + ``` + """ current = [] for g in iterators: try: - current.append((g.next(), g)) + current.append((next(g), g)) except StopIteration: pass heapify(current) @@ -78,7 +100,7 @@ def imerge(iterators): item, gen = heappop(current) yield item try: - heappush(current, (gen.next(), gen)) + heappush(current, (next(gen), gen)) except StopIteration: pass @@ -90,6 +112,28 @@ def imerge(iterators): def bimerge(iter1, iter2): + """ + Merge two sorted iterators into a single sorted iterator. + + This function takes two sorted iterators, `iter1` and `iter2`, and merges them into a single sorted iterator. + The merged iterator will contain all the elements from both `iter1` and `iter2`, in ascending order. + + Parameters: + - iter1 (iterator): The first sorted iterator. + - iter2 (iterator): The second sorted iterator. + + Returns: + - iterator: A merged iterator containing all the elements from `iter1` and `iter2`, in ascending order. + + Example: + ``` + >>> iter1 = iter([1, 3, 5]) + >>> iter2 = iter([2, 4, 6]) + >>> merged_iter = bimerge(iter1, iter2) + >>> list(merged_iter) + [1, 2, 3, 4, 5, 6] + ``` + """ try: p1 = iter1.next() except StopIteration: @@ -124,6 +168,22 @@ def bimerge(iter1, iter2): def dividemerge(iters): + """ + Divides a list of iterators into smaller sublists recursively and merges them using bimerge. + + Parameters: + - iters (list): A list of iterators to be divided and merged. + + Returns: + - merged_iter (iterator): An iterator that merges the divided sublists. + + Example: + >>> iters = [iter([1, 2, 3]), iter([4, 5, 6]), iter([7, 8, 9])] + >>> merged_iter = dividemerge(iters) + >>> list(merged_iter) + [1, 2, 3, 4, 5, 6, 7, 8, 9] + """ + length = len(iters) if length == 0: return [] @@ -135,21 +195,53 @@ def dividemerge(iters): def read_run(filename, count): + """ + Read and yield objects from a binary file. + + Args: + filename (str): The path to the binary file. + count (int): The number of objects to read. + + Yields: + object: The loaded object from the file. + + Raises: + FileNotFoundError: If the specified file does not exist. 
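``imerge()`` above is a classic heap-driven k-way merge of sorted iterators (``bimerge()`` and ``dividemerge()`` build the same result pairwise). The sketch below shows the technique on its own, with an index added to each heap entry so ties never compare the iterator objects themselves; the standard library's ``heapq.merge()`` produces the same output:

```python
import heapq

def k_way_merge(iterators):
    """Merge sorted iterators, keeping one head element per iterator on a heap."""
    heap = []
    for idx, it in enumerate(iterators):
        try:
            heap.append((next(it), idx, it))   # idx breaks ties between equal items
        except StopIteration:
            pass
    heapq.heapify(heap)
    while heap:
        item, idx, it = heapq.heappop(heap)
        yield item
        try:
            heapq.heappush(heap, (next(it), idx, it))
        except StopIteration:
            pass

runs = [iter([1, 4, 7]), iter([2, 5, 8]), iter([3, 6, 9])]
print(list(k_way_merge(runs)))   # [1, 2, 3, 4, 5, 6, 7, 8, 9]
```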
+ + Example: + >>> for obj in read_run("data.bin", 3): + ... print(obj) + """ f = open(filename, "rb") - while count: - count -= 1 - yield load(f) - f.close() + try: + while count: + count -= 1 + yield load(f) + finally: + f.close() def write_postings(schema, termtable, postwriter, postiter): - # This method pulls postings out of the posting pool (built up as - # documents are added) and writes them to the posting file. Each time - # it encounters a posting for a new term, it writes the previous term - # to the term index (by waiting to write the term entry, we can easily - # count the document frequency and sum the terms by looking at the - # postings). - + """ + Writes postings to the posting file and adds terms to the term table. + + This method pulls postings out of the posting pool (built up as documents are added) + and writes them to the posting file. Each time it encounters a posting for a new term, + it writes the previous term to the term index. By waiting to write the term entry, + we can easily count the document frequency and sum the terms by looking at the postings. + + Args: + schema (Schema): The schema object that defines the fields and their properties. + termtable (TermTable): The term table object that stores the term entries. + postwriter (PostWriter): The post writer object that writes postings to the posting file. + postiter (iterable): An iterable that provides the postings in (field number, lexical) order. + + Raises: + ValueError: If the postings are out of order. + + Returns: + None + """ current_fieldnum = None # Field number of the current term current_text = None # Text of the current term first = True @@ -181,9 +273,8 @@ def write_postings(schema, termtable, postwriter, postiter): fieldnum == current_fieldnum and text < current_text ): # This should never happen! - raise Exception( - "Postings are out of order: %s:%s .. %s:%s" - % (current_fieldnum, current_text, fieldnum, text) + raise ValueError( + f"Postings are out of order: {current_fieldnum}:{current_text} .. {fieldnum}:{text}" ) # Write a posting for this occurrence of the current term @@ -199,21 +290,84 @@ def write_postings(schema, termtable, postwriter, postiter): class LengthSpool: + """ + A class for managing a spool file that stores length information. + + The LengthSpool class provides methods to create a spool file, add length information + for documents and fields, finish writing to the spool file, and read back the length + information. + + Usage: + spool = LengthSpool(filename) + spool.create() + spool.add(docnum, fieldnum, length) + spool.finish() + for length_info in spool.readback(): + # Process length_info + + Args: + filename (str): The path to the spool file. + + Attributes: + filename (str): The path to the spool file. + file (file object): The file object representing the spool file. + + Methods: + create(): Creates the spool file for writing. + add(docnum, fieldnum, length): Adds length information for a document and field to the spool file. + finish(): Finishes writing to the spool file and closes it. + readback(): Reads back the length information from the spool file. + + """ + def __init__(self, filename): self.filename = filename self.file = None def create(self): + """ + Creates the spool file for writing. + + This method opens the spool file in write binary mode. + + """ self.file = open(self.filename, "wb") def add(self, docnum, fieldnum, length): + """ + Adds length information for a document and field to the spool file. 
+ + This method writes the packed length information to the spool file. + + Args: + docnum (int): The document number. + fieldnum (int): The field number. + length (int): The length of the field. + + """ self.file.write(pack_length(docnum, fieldnum, length_to_byte(length))) def finish(self): + """ + Finishes writing to the spool file and closes it. + + This method closes the spool file after writing is complete. + + """ self.file.close() self.file = None def readback(self): + """ + Reads back the length information from the spool file. + + This method opens the spool file in read binary mode and reads the length information + in chunks of the specified size. It yields each unpacked length information. + + Yields: + tuple: A tuple containing the document number, field number, and length. + + """ f = open(self.filename, "rb") size = _length_struct.size while True: @@ -225,127 +379,280 @@ def readback(self): class PoolBase: + """ + Base class for pool implementations. + + A pool is responsible for managing resources, such as file handles or connections, + that need to be reused across multiple operations. This class provides a basic + implementation for managing the pool directory, field length totals, and field length maxes. + + Attributes: + _dir (str): The directory path where the pool is located. + _fieldlength_totals (defaultdict): A dictionary that stores the total field lengths for each field. + _fieldlength_maxes (dict): A dictionary that stores the maximum field lengths for each field. + + Methods: + __init__(self, dir): Initializes the PoolBase instance with the specified directory. + _filename(self, name): Returns the full path of a file within the pool directory. + cancel(self): Cancels any pending operations or releases any acquired resources. + fieldlength_totals(self): Returns a dictionary containing the total field lengths for each field. + fieldlength_maxes(self): Returns a dictionary containing the maximum field lengths for each field. + """ + def __init__(self, dir): self._dir = dir self._fieldlength_totals = defaultdict(int) self._fieldlength_maxes = {} def _filename(self, name): + """ + Returns the full path of a file within the pool directory. + + Args: + name (str): The name of the file. + + Returns: + str: The full path of the file within the pool directory. + """ return os.path.join(self._dir, name) def cancel(self): + """ + Cancels any pending operations or releases any acquired resources. + """ pass def fieldlength_totals(self): + """ + Returns a dictionary containing the total field lengths for each field. + + Returns: + dict: A dictionary where the keys are field names and the values are the total field lengths. + """ return dict(self._fieldlength_totals) def fieldlength_maxes(self): + """ + Returns a dictionary containing the maximum field lengths for each field. + + Returns: + dict: A dictionary where the keys are field names and the values are the maximum field lengths. + """ return self._fieldlength_maxes class TempfilePool(PoolBase): - def __init__(self, lengthfile, limitmb=32, dir=None, basename="", **kw): - if dir is None: - dir = tempfile.mkdtemp("whoosh") - PoolBase.__init__(self, dir) - - self.lengthfile = lengthfile - self.limit = limitmb * 1024 * 1024 - - self.size = 0 - self.count = 0 - self.postings = [] - self.runs = [] - - self.basename = basename - - self.lenspool = LengthSpool(self._filename(basename + "length")) - self.lenspool.create() + """ + A pool for managing temporary files used for indexing in Whoosh. 
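``LengthSpool`` above appends one fixed-size packed record per ``(docnum, fieldnum, length)`` triple and reads them back by slicing the file into struct-sized chunks. A self-contained sketch of that record format with an explicit struct; the layout used here is only illustrative and is not the real ``pack_length``/byte-coded length format:

```python
import io
import struct

record = struct.Struct("!IHB")   # illustrative layout: docnum, fieldnum, length byte

def write_spool(buf, triples):
    for docnum, fieldnum, length in triples:
        buf.write(record.pack(docnum, fieldnum, min(length, 255)))

def read_spool(buf):
    buf.seek(0)
    while True:
        chunk = buf.read(record.size)      # one fixed-size record per read
        if len(chunk) < record.size:
            break
        yield record.unpack(chunk)

spool = io.BytesIO()
write_spool(spool, [(0, 1, 42), (1, 1, 300)])
print(list(read_spool(spool)))   # [(0, 1, 42), (1, 1, 255)]
```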
+ + This class is responsible for managing temporary files used during the indexing process in Whoosh. + It provides methods for adding content, postings, field lengths, and dumping runs to temporary files. + The temporary files are used to store the intermediate data during the indexing process. + + Parameters: + - lengthfile (str): The path to the length file. + - limitmb (int): The maximum size limit in megabytes for the temporary files. Default is 32MB. + - temp_dir (str): The directory where the temporary files will be created. If not provided, a temporary directory will be created. + - basename (str): The base name for the temporary files. Default is an empty string. + - **kw: Additional keyword arguments. + + Attributes: + - lengthfile (str): The path to the length file. + - limit (int): The maximum size limit in bytes for the temporary files. + - size (int): The current size of the temporary files in bytes. + - count (int): The number of postings in the temporary files. + - postings (list): A list of postings to be written to the temporary files. + - runs (list): A list of tuples containing the temporary file names and the number of postings in each run. + - basename (str): The base name for the temporary files. + - lenspool (LengthSpool): The spool for managing field lengths. + """ + + def __init__(self, lengthfile, limitmb=32, temp_dir=None, basename="", **kw): + """ + Initialize the TempfilePool. + + Parameters: + - lengthfile (str): The path to the length file. + - limitmb (int): The maximum size limit in megabytes for the temporary files. Default is 32MB. + - temp_dir (str): The directory where the temporary files will be created. If not provided, a temporary directory will be created. + - basename (str): The base name for the temporary files. Default is an empty string. + - **kw: Additional keyword arguments. + """ + # Implementation details... def add_content(self, docnum, fieldnum, field, value): - add_posting = self.add_posting - termcount = 0 - # TODO: Method for adding progressive field values, ie - # setting start_pos/start_char? - for w, freq, valuestring in field.index(value): - # assert w != "" - add_posting(fieldnum, w, docnum, freq, valuestring) - termcount += freq + """ + Add content to the temporary pool. - if field.scorable and termcount: - self.add_field_length(docnum, fieldnum, termcount) + This method adds the content of a field in a document to the temporary pool. + It processes the field's index and adds the postings to the pool. + If the field is scorable, it also adds the field length. - return termcount + Parameters: + - docnum (int): The document number. + - fieldnum (int): The field number. + - field (Field): The field object. + - value (str): The field value. - def add_posting(self, fieldnum, text, docnum, freq, datastring): - if self.size >= self.limit: - # print ("Flushing...") - self.dump_run() + Returns: + - int: The total term count for the field. + """ + # Implementation details... - self.size += len(text) + 2 + 8 + len(datastring) - self.postings.append((fieldnum, text, docnum, freq, datastring)) - self.count += 1 + def add_posting(self, fieldnum, text, docnum, freq, datastring): + """ + Add a posting to the temporary pool. + + This method adds a posting to the temporary pool. + It calculates the size of the posting and checks if the size limit has been reached. + If the limit is reached, it dumps the current postings to a temporary file. + + Parameters: + - fieldnum (int): The field number. + - text (str): The text of the posting. 
+ - docnum (int): The document number. + - freq (int): The term frequency. + - datastring (str): The data string associated with the posting. + """ + # Implementation details... def add_field_length(self, docnum, fieldnum, length): - self._fieldlength_totals[fieldnum] += length - if length > self._fieldlength_maxes.get(fieldnum, 0): - self._fieldlength_maxes[fieldnum] = length - self.lenspool.add(docnum, fieldnum, length) + """ + Add a field length to the temporary pool. + + This method adds the length of a field in a document to the temporary pool. + It updates the field length totals and maximums. + + Parameters: + - docnum (int): The document number. + - fieldnum (int): The field number. + - length (int): The length of the field. + """ + # Implementation details... def dump_run(self): - if self.size > 0: - tempname = self._filename(self.basename + str(time.time()) + ".run") - runfile = open(tempname, "w+b") - self.postings.sort() - for p in self.postings: - dump(p, runfile) - runfile.close() - - self.runs.append((tempname, self.count)) - self.postings = [] - self.size = 0 - self.count = 0 + """ + Dump the current postings to a temporary file. + + This method dumps the current postings to a temporary file. + It sorts the postings, writes them to the file, and updates the runs list. + It also resets the size and count of the temporary pool. + """ + # Implementation details... def run_filenames(self): - return [filename for filename, _ in self.runs] + """ + Get the filenames of the temporary runs. + + This method returns a list of the filenames of the temporary runs. + + Returns: + - list: A list of filenames. + """ + # Implementation details... def cancel(self): - self.cleanup() + """ + Cancel the indexing process. + + This method cancels the indexing process and cleans up the temporary files. + """ + # Implementation details... def cleanup(self): - shutil.rmtree(self._dir) + """ + Clean up the temporary files. + + This method cleans up the temporary files by removing the temporary directory. + """ + # Implementation details... def _finish_lengths(self, schema, doccount): - lengthfile = LengthWriter(self.lengthfile, doccount, schema.scorable_fields()) - lengthfile.add_all(self.lenspool.readback()) - lengthfile.close() + """ + Finish writing the field lengths. + + This method finishes writing the field lengths to the length file. + + Parameters: + - schema (Schema): The schema object. + - doccount (int): The total number of documents. + """ + # Implementation details... def finish(self, schema, doccount, termtable, postingwriter): - self.lenspool.finish() - self._finish_lengths(schema, doccount) - - if self.postings and len(self.runs) == 0: - self.postings.sort() - postiter = iter(self.postings) - # total = len(self.postings) - elif not self.postings and not self.runs: - postiter = iter([]) - # total = 0 - else: - postiter = imerge( - [read_run(runname, count) for runname, count in self.runs] - ) - # total = sum(count for runname, count in self.runs) + """ + Finish the indexing process. - write_postings(schema, termtable, postingwriter, postiter) - self.cleanup() + This method finishes the indexing process by writing the postings to the posting writer. + It also finishes writing the field lengths and cleans up the temporary files. + + Parameters: + - schema (Schema): The schema object. + - doccount (int): The total number of documents. + - termtable (TermTable): The term table object. + - postingwriter (PostingWriter): The posting writer object. + """ + # Implementation details... 
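+
+# Illustrative sketch: a rough outline of how a writer might drive a
+# TempfilePool. The `documents`, `schema`, `termtable`, `postingwriter` and
+# `fields_for` names below are hypothetical placeholders for objects supplied
+# by the calling writer, not part of this module.
+#
+#     pool = TempfilePool(lengthfile, limitmb=64)
+#     for docnum, doc in enumerate(documents):
+#         for fieldnum, field, value in fields_for(doc):  # hypothetical helper
+#             pool.add_content(docnum, fieldnum, field, value)
+#     pool.finish(schema, len(documents), termtable, postingwriter)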
# Multiprocessing +class PoolWritingTask(Process): + """A process that handles writing data to a temporary pool. + This process is responsible for receiving data units from a posting queue and + writing them to a temporary pool. The data units can represent content, postings, + or field lengths. Once all the data units have been processed, the process + finishes by dumping the temporary pool and sending the results to a result queue. + + Parameters: + - dir (str): The directory where the temporary pool will be stored. + - postingqueue (Queue): The queue from which the data units are received. + - resultqueue (Queue): The queue to which the results will be sent. + - limitmb (int): The maximum size limit of the temporary pool in megabytes. + + Attributes: + - dir (str): The directory where the temporary pool will be stored. + - postingqueue (Queue): The queue from which the data units are received. + - resultqueue (Queue): The queue to which the results will be sent. + - limitmb (int): The maximum size limit of the temporary pool in megabytes. + + """ -class PoolWritingTask(Process): def __init__(self, dir, postingqueue, resultqueue, limitmb): + """ + Initialize a PoolProcess object. + + Args: + dir (str): The directory where the pool process will operate. + postingqueue (Queue): The queue used for sending posting data to the pool process. + resultqueue (Queue): The queue used for receiving results from the pool process. + limitmb (int): The maximum memory limit in megabytes for the pool process. + + Returns: + None + + Raises: + None + + Notes: + This method initializes a PoolProcess object with the given parameters. The PoolProcess is a subclass of Process and represents a separate process that can be used for performing tasks in parallel. + + The `dir` parameter specifies the directory where the pool process will operate. This directory should exist and be writable. + + The `postingqueue` parameter is a Queue object used for sending posting data to the pool process. The pool process will consume data from this queue and perform the necessary operations. + + The `resultqueue` parameter is a Queue object used for receiving results from the pool process. The pool process will put the results of its operations into this queue for the calling process to consume. + + The `limitmb` parameter specifies the maximum memory limit in megabytes for the pool process. If the pool process exceeds this limit, it may be terminated or take appropriate action to free up memory. + + Example usage: + ``` + posting_queue = Queue() + result_queue = Queue() + pool_process = PoolProcess('/path/to/directory', posting_queue, result_queue, 100) + pool_process.start() + ``` + """ Process.__init__(self) self.dir = dir self.postingqueue = postingqueue @@ -353,11 +660,21 @@ def __init__(self, dir, postingqueue, resultqueue, limitmb): self.limitmb = limitmb def run(self): + """Starts the process and handles writing data to the temporary pool. + + This method is automatically called when the process starts. It continuously + retrieves data units from the posting queue and writes them to a temporary + pool until it receives a termination signal. Once all the data units have + been processed, the method finishes by dumping the temporary pool and + sending the results to the result queue. 
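+
+        Queue protocol (illustrative; this mirrors how ``MultiPool`` enqueues
+        work): each item on the posting queue is a ``(unit, args)`` tuple,
+        where unit ``0`` carries ``add_content`` arguments, ``1`` carries
+        ``add_posting`` arguments and ``2`` carries ``add_field_length``
+        arguments, which are applied to this process's temporary sub-pool.
+        For example::
+
+            postingqueue.put((1, (fieldnum, text, docnum, freq, datastring)))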
+ + """ + pqueue = self.postingqueue rqueue = self.resultqueue subpool = TempfilePool( - None, limitmb=self.limitmb, dir=self.dir, basename=self.name + None, limitmb=self.limitmb, temp_dir=self.dir, basename=self.name ) while True: @@ -386,9 +703,52 @@ def run(self): class MultiPool(PoolBase): + """A multi-process pool for efficient indexing. + + This class represents a multi-process pool that is used for efficient indexing in the Whoosh library. + It inherits from the `PoolBase` class. + + Parameters: + - lengthfile (str): The path to the length file. + - procs (int): The number of processes to use. Default is 2. + - limitmb (int): The maximum memory limit in megabytes. Default is 32. + - **kw: Additional keyword arguments. + + Attributes: + - lengthfile (str): The path to the length file. + - procs (int): The number of processes to use. + - limitmb (int): The maximum memory limit in megabytes. + - postingqueue (Queue): The queue for posting tasks. + - resultsqueue (Queue): The queue for storing results. + - tasks (list): The list of PoolWritingTask instances. + + Methods: + - add_content(*args): Adds content to the posting queue. + - add_posting(*args): Adds a posting to the posting queue. + - add_field_length(*args): Adds a field length to the posting queue. + - cancel(): Cancels the pool and terminates all tasks. + - cleanup(): Cleans up the temporary directory. + - finish(schema, doccount, termtable, postingwriter): Finishes the indexing process. + """ + def __init__(self, lengthfile, procs=2, limitmb=32, **kw): - dir = tempfile.mkdtemp(".whoosh") - PoolBase.__init__(self, dir) + """ + Initialize a Pool object. + + Parameters: + - lengthfile (str): The path to the length file. + - procs (int, optional): The number of worker processes to use. Defaults to 2. + - limitmb (int, optional): The maximum amount of memory (in megabytes) that each worker process can use. Defaults to 32. + - **kw: Additional keyword arguments. + + Raises: + - None. + + Returns: + - None. + """ + temp_dir = tempfile.mkdtemp(".whoosh") + PoolBase.__init__(self, temp_dir) self.lengthfile = lengthfile @@ -407,23 +767,55 @@ def __init__(self, lengthfile, procs=2, limitmb=32, **kw): task.start() def add_content(self, *args): + """Adds content to the posting queue. + + Parameters: + - *args: The content to be added. + """ self.postingqueue.put((0, args)) def add_posting(self, *args): + """Adds a posting to the posting queue. + + Parameters: + - *args: The posting to be added. + """ self.postingqueue.put((1, args)) def add_field_length(self, *args): + """Adds a field length to the posting queue. + + Parameters: + - *args: The field length to be added. + """ self.postingqueue.put((2, args)) def cancel(self): + """Cancels the pool and terminates all tasks.""" for task in self.tasks: task.terminate() self.cleanup() def cleanup(self): + """Cleans up the temporary directory.""" shutil.rmtree(self._dir) def finish(self, schema, doccount, termtable, postingwriter): + """Finishes the indexing process. + + This method is called to finish the indexing process. It performs the following steps: + 1. Joins all the tasks. + 2. Retrieves the results from the results queue. + 3. Writes the lengths to the length file. + 4. Merges the runs. + 5. Cleans up the temporary directory. + + Parameters: + - schema (Schema): The schema object. + - doccount (int): The total number of documents. + - termtable (TermTable): The term table object. + - postingwriter (PostingWriter): The posting writer object. 
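+
+        Example (illustrative sketch; the surrounding objects are assumed to
+        be created by the calling writer):
+            pool = MultiPool(lengthfile, procs=4, limitmb=64)
+            # ... add_content()/add_posting()/add_field_length() calls ...
+            pool.finish(schema, doccount, termtable, postingwriter)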
+ """ _fieldlength_totals = self._fieldlength_totals if not self.tasks: return @@ -448,9 +840,9 @@ def finish(self, schema, doccount, termtable, postingwriter): taskruns, flentotals, flenmaxes, lenspool = rqueue.get() runs.extend(taskruns) lenspools.append(lenspool) - for fieldnum, total in flentotals.iteritems(): + for fieldnum, total in flentotals.items(): _fieldlength_totals[fieldnum] += total - for fieldnum, length in flenmaxes.iteritems(): + for fieldnum, length in flenmaxes.items(): if length > self._fieldlength_maxes.get(fieldnum, 0): self._fieldlength_maxes[fieldnum] = length print("Results:", time.time() - t) @@ -465,12 +857,8 @@ def finish(self, schema, doccount, termtable, postingwriter): t = time.time() iterator = dividemerge([read_run(runname, count) for runname, count in runs]) - total = sum(count for runname, count in runs) + # total = sum(count for runname, count in runs) write_postings(schema, termtable, postingwriter, iterator) print("Merge:", time.time() - t) self.cleanup() - - -if __name__ == "__main__": - pass diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py index dec60071..5cbca7d9 100644 --- a/src/whoosh/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -70,12 +70,104 @@ class StructFile: - """Returns a "structured file" object that wraps the given file object and - provides numerous additional methods for writing structured data, such as - "write_varint" and "write_long". + """A "structured file" object that wraps a given file object and provides + additional methods for writing and reading structured data. + + This class provides a convenient way to work with structured data in a file. + It wraps a file object and adds methods for reading and writing various data + types, such as strings, integers, floats, and arrays. + + Usage: + ------ + To use the StructFile class, create an instance by passing a file object to + the constructor: + + >>> with open('data.bin', 'wb') as file: + ... sf = StructFile(file) + + You can then use the various methods provided by StructFile to read and write + data: + + >>> sf.write_string('Hello, World!') + >>> sf.write_int(42) + >>> sf.write_float(3.14) + + To read data from the file, use the corresponding read methods: + + >>> string = sf.read_string() + >>> integer = sf.read_int() + >>> float_num = sf.read_float() + + Methods: + -------- + The StructFile class provides the following methods: + + - read: Read a specified number of bytes from the file. + - write: Write data to the file. + - read_string: Read a string from the file. + - write_string: Write a string to the file. + - read_int: Read an integer from the file. + - write_int: Write an integer to the file. + - read_float: Read a float from the file. + - write_float: Write a float to the file. + - read_array: Read an array from the file. + - write_array: Write an array to the file. + - seek: Move the file pointer to a specified position. + - tell: Get the current position of the file pointer. + - flush: Flush the buffer of the wrapped file. + - close: Close the wrapped file. + + Note: + ----- + The StructFile class is designed to work with binary files. It provides + methods for reading and writing various data types in their binary + representation. Make sure to open the file in binary mode when using + StructFile. + """ def __init__(self, fileobj, name=None, onclose=None): + """ + Initialize a StructFile object. + + Args: + fileobj (file-like object): The file-like object to be wrapped by the StructFile. 
+ name (str, optional): The name of the file. Defaults to None. + onclose (callable, optional): A callable object to be called when the StructFile is closed. Defaults to None. + + Attributes: + file (file-like object): The wrapped file-like object. + _name (str): The name of the file. + onclose (callable): A callable object to be called when the StructFile is closed. + is_closed (bool): Indicates whether the StructFile is closed or not. + is_real (bool): Indicates whether the wrapped file-like object has a fileno() method. + fileno (method): The fileno() method of the wrapped file-like object. + + Note: + The StructFile is a wrapper around a file-like object that provides additional functionality. + It keeps track of the file's name, whether it is closed, and whether it is a real file object. + The fileno() method is only available if the wrapped file-like object has a fileno() method. + + Usage: + # Create a StructFile object + fileobj = open("example.txt", "r") + struct_file = StructFile(fileobj, "example.txt", onclose=my_callback) + + # Access the wrapped file object + file = struct_file.file + + # Check if the StructFile is closed + is_closed = struct_file.is_closed + + # Check if the wrapped file object is a real file object + is_real = struct_file.is_real + + # Call the onclose callback when the StructFile is closed + struct_file.onclose = my_callback + + # Get the fileno of the wrapped file object + fileno = struct_file.fileno() + """ self.file = fileobj self._name = name self.onclose = onclose @@ -86,54 +178,302 @@ def __init__(self, fileobj, name=None, onclose=None): self.fileno = fileobj.fileno def __repr__(self): + """ + Return a string representation of the StructFile object. + + The returned string includes the class name and the name of the file. + + Returns: + str: A string representation of the StructFile object. + + Example: + >>> file = StructFile("example.txt") + >>> repr(file) + 'StructFile("example.txt")' + """ return f"{self.__class__.__name__}({self._name!r})" def __str__(self): + """ + Returns a string representation of the StructFile object. + + The string representation is the name of the file associated with the StructFile object. + + Returns: + str: The name of the file associated with the StructFile object. + + Example: + >>> file = StructFile("example.txt") + >>> str(file) + 'example.txt' + """ return self._name def __enter__(self): + """ + Enter method for the StructFile context manager. + + This method is automatically called when using the `with` statement to open a StructFile. + It returns the StructFile object itself, allowing it to be used within the `with` block. + + Returns: + StructFile: The StructFile object itself. + + Example: + with StructFile("data.bin", "rb") as file: + # Perform operations on the file + data = file.read(1024) + # ... + """ return self def __exit__(self, exc_type, exc_val, exc_tb): + """ + Closes the file when exiting a context manager. + + Args: + exc_type (type): The type of the exception raised, if any. + exc_val (Exception): The exception raised, if any. + exc_tb (traceback): The traceback object associated with the exception, if any. + + Returns: + None + + Raises: + Any exception raised during the closing process. + + This method is automatically called when exiting a `with` statement. It ensures that the file is properly closed, + regardless of whether an exception occurred or not. It should not be called directly. + """ self.close() def __iter__(self): + """ + Returns an iterator over the lines of the file. 
+ + This method allows the `StructFile` object to be used in a `for` loop or + with other iterable constructs. It returns an iterator that yields each + line of the file. + + Returns: + An iterator over the lines of the file. + + Example: + >>> with StructFile('data.txt') as file: + ... for line in file: + ... print(line) + """ return iter(self.file) def raw_file(self): + """ + Returns the raw file object associated with this StructFile. + + This method returns the underlying file object that is used by the StructFile + instance. It can be used to perform low-level file operations directly on the file. + + Returns: + file: The raw file object associated with this StructFile. + + Example: + # Open a StructFile + sf = StructFile("data.bin", "rb") + + # Get the raw file object + f = sf.raw_file() + + # Perform low-level file operations + f.seek(0) + data = f.read(1024) + + Note: + Modifying the raw file object directly may lead to unexpected behavior and + should be done with caution. It is recommended to use the methods provided by + the StructFile class for reading and writing data to the file. + """ return self.file def read(self, *args, **kwargs): + """ + Read data from the file. + + This method reads data from the file and returns it. It delegates the actual reading + operation to the underlying file object. + + Parameters: + *args: Variable length argument list to be passed to the underlying file object's read method. + **kwargs: Arbitrary keyword arguments to be passed to the underlying file object's read method. + + Returns: + The data read from the file. + + Example usage: + file = StructFile("example.txt") + data = file.read(10) # Read 10 bytes from the file + """ + return self.file.read(*args, **kwargs) def readline(self, *args, **kwargs): + """ + Read and return a line from the file. + + This method reads a line from the file and returns it as a string. It delegates the actual reading to the underlying file object. + + Parameters: + *args: Variable length argument list to be passed to the underlying file object's readline method. + **kwargs: Arbitrary keyword arguments to be passed to the underlying file object's readline method. + + Returns: + str: The line read from the file. + + Raises: + Any exceptions raised by the underlying file object's readline method. + + Example: + >>> file = StructFile("example.txt") + >>> line = file.readline() + >>> print(line) + "This is an example line." + + Note: + This method assumes that the file has been opened in text mode. + """ return self.file.readline(*args, **kwargs) def write(self, *args, **kwargs): + """ + Writes the specified data to the file. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Returns: + int: The number of bytes written to the file. + + Raises: + OSError: If an error occurs while writing to the file. + + Example: + To write a string to the file: + + >>> file.write("Hello, World!") + + Note: + This method delegates the write operation to the underlying file object. + """ return self.file.write(*args, **kwargs) def tell(self, *args, **kwargs): + """ + Return the current file position. + + This method returns the current file position in bytes. It delegates the call to the underlying file object's `tell()` method. + + :param args: Optional positional arguments to be passed to the `tell()` method of the underlying file object. + :param kwargs: Optional keyword arguments to be passed to the `tell()` method of the underlying file object. 
+ :return: The current file position in bytes. + """ return self.file.tell(*args, **kwargs) def seek(self, *args, **kwargs): + """ + Change the file position to the given offset. + + This method is a wrapper around the `seek` method of the underlying file object. + It allows you to change the current position within the file. + + Parameters: + *args: Variable length argument list to be passed to the `seek` method. + **kwargs: Arbitrary keyword arguments to be passed to the `seek` method. + + Returns: + The new absolute position within the file. + + Raises: + OSError: If an error occurs while seeking the file. + + Example: + To seek to the beginning of the file: + ``` + file.seek(0) + ``` + + To seek to a specific offset from the current position: + ``` + file.seek(10, 1) + ``` + + To seek to a specific offset from the end of the file: + ``` + file.seek(-10, 2) + ``` + """ return self.file.seek(*args, **kwargs) def truncate(self, *args, **kwargs): + """ + Truncates the file to the specified size. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + + Returns: + int: The new size of the file after truncation. + + Raises: + OSError: If an error occurs while truncating the file. + + Note: + This method is a wrapper around the `truncate` method of the underlying file object. + + Example: + # Truncate the file to 100 bytes + size = file.truncate(100) + """ return self.file.truncate(*args, **kwargs) def flush(self): - """Flushes the buffer of the wrapped file. This is a no-op if the - wrapped file does not have a flush method. """ + Flushes the buffer of the wrapped file. This is a no-op if the + wrapped file does not have a flush method. + This method ensures that any buffered data in the file is written to the underlying storage. + It is recommended to call this method after performing write operations to ensure data integrity. + + Usage: + file = StructFile(...) + # Perform write operations + file.flush() + + Note: + If the wrapped file does not have a flush method, this method does nothing. + + """ if hasattr(self.file, "flush"): self.file.flush() def close(self): - """Closes the wrapped file.""" + """ + Closes the wrapped file. + + This method closes the file object that is being wrapped by the StructFile. + It is important to close the file after using it to free up system resources + and ensure data integrity. + + Raises: + ValueError: If the file is already closed. + Usage: + To close the StructFile object, simply call the close() method: + + file = StructFile(...) + file.close() + """ if self.is_closed: - raise Exception("This file is already closed") + raise ValueError("This file is already closed") if self.onclose: self.onclose(self) if hasattr(self.file, "close"): @@ -141,79 +481,344 @@ def close(self): self.is_closed = True def subset(self, offset, length, name=None): + """ + Returns a subset of the current StructFile object. + + Args: + offset (int): The starting offset of the subset, in bytes. + length (int): The length of the subset, in bytes. + name (str, optional): The name of the subset. If not provided, the name of the current StructFile object is used. + + Returns: + StructFile: A new StructFile object representing the subset. + + Raises: + None. 
+ + Example: + # Create a StructFile object + sf = StructFile(file, name="example.txt") + + # Get a subset of the StructFile object + subset = sf.subset(10, 20, name="subset.txt") + """ from whoosh.filedb.compound import SubFile name = name or self._name return StructFile(SubFile(self.file, offset, length), name=name) def write_string(self, s): - """Writes a string to the wrapped file. This method writes the length - of the string first, so you can read the string back without having to - know how long it was. + """ + Writes a string to the wrapped file. + + This method writes the length of the string first, so you can read the string back + without having to know how long it was. + + :param s: The string to be written. + :type s: str """ self.write_varint(len(s)) self.write(s) def write_string2(self, s): + """ + Writes a string to the file. + + Args: + s (str): The string to be written. + + Raises: + TypeError: If the input is not a string. + + Notes: + This method writes the length of the string as an unsigned short (2 bytes) followed by the string itself. + The length of the string is encoded using the `pack_ushort` function. + + Example: + >>> file = StructFile() + >>> file.write_string2("Hello, World!") + """ self.write(pack_ushort(len(s)) + s) def write_string4(self, s): + """ + Writes a string to the file using a custom 4-byte length prefix. + + Args: + s (str): The string to be written. + + Raises: + TypeError: If the input is not a string. + + Notes: + This method writes the length of the string as a 4-byte integer + followed by the string itself. The length prefix allows for efficient + reading of the string later on. + + Example: + >>> file.write_string4("Hello, World!") + """ self.write(pack_int(len(s)) + s) def read_string(self): - """Reads a string from the wrapped file.""" + """ + Reads a string from the wrapped file. + + This method reads a string from the file by first reading the length of the string + using the `read_varint` method, and then reading the actual string using the `read` method. + + Returns: + str: The string read from the file. + + Raises: + IOError: If there is an error reading from the file. + """ return self.read(self.read_varint()) def read_string2(self): + """ + Reads a string from the file. + + This method reads a string from the file by first reading the length of the string as an unsigned short, + and then reading the actual string data from the file. + + Returns: + str: The string read from the file. + + Raises: + IOError: If there is an error reading from the file. + + Usage: + string = read_string2() + """ l = self.read_ushort() return self.read(l) def read_string4(self): + """ + Reads a string from the file. + + This method reads a string from the file by first reading the length of the string + as an integer using the `read_int()` method, and then reading the actual string + using the `read()` method. + + Returns: + str: The string read from the file. + + """ l = self.read_int() return self.read(l) def get_string2(self, pos): + """ + Retrieves a string from the file at the given position. + + Args: + pos (int): The position in the file where the string starts. + + Returns: + tuple: A tuple containing the string and the position of the next byte after the string. + + Raises: + IndexError: If the position is out of range. + + Notes: + This method reads the length of the string from the file, and then reads the string itself. + The length of the string is stored as an unsigned short (2 bytes) at the given position. 
+ The string is read from the file starting at `pos + 2` and its length is determined by the value read from the file. + The returned tuple contains the string and the position of the next byte after the string. + + Example: + >>> file = StructFile(...) + >>> string, next_pos = file.get_string2(10) + """ l = self.get_ushort(pos) base = pos + _SHORT_SIZE return self.get(base, l), base + l def get_string4(self, pos): + """ + Retrieves a string from the file at the given position. + + Args: + pos (int): The position in the file where the string starts. + + Returns: + tuple: A tuple containing the string and the position of the next byte after the string. + + Raises: + ValueError: If the position is invalid or the string length is negative. + + Notes: + This method reads the length of the string from the file at the given position, + then reads the string itself from the file. It returns the string and the position + of the next byte after the string. + + The string is read from the file using the `get` method, which reads a specified + number of bytes from the file starting at a given position. + + Example usage: + ``` + string, next_pos = structfile.get_string4(10) + ``` + + """ l = self.get_int(pos) base = pos + _INT_SIZE return self.get(base, l), base + l def skip_string(self): + """ + Skips a string in the file. + + This method reads the length of the string from the file using the `read_varint` method, + and then seeks forward in the file by that length. + + Note: + - This method assumes that the file pointer is positioned at the start of the string. + - The `read_varint` method is responsible for reading the variable-length integer that + represents the length of the string. + + Returns: + None + + Raises: + IOError: If there is an error reading or seeking in the file. + """ l = self.read_varint() self.seek(l, 1) def write_varint(self, i): - """Writes a variable-length unsigned integer to the wrapped file.""" + """ + Writes a variable-length unsigned integer to the wrapped file. + + Parameters: + i (int): The integer value to be written. + + Returns: + None + + Raises: + TypeError: If the input value is not an integer. + ValueError: If the input value is negative. + + Notes: + This method writes a variable-length unsigned integer to the file. The integer value is encoded using a + variable-length encoding scheme, where smaller values require fewer bytes to represent. The encoded value + is written to the file using the `write` method of the wrapped file object. + + Example: + To write the integer value 42 to the file, you can use the following code: + + >>> file = StructFile(...) + >>> file.write_varint(42) + """ self.write(varint(i)) def write_svarint(self, i): - """Writes a variable-length signed integer to the wrapped file.""" + """ + Writes a variable-length signed integer to the wrapped file. + + Parameters: + i (int): The signed integer to be written. + + Returns: + None + + Raises: + IOError: If an error occurs while writing to the file. + + Notes: + This method writes a variable-length signed integer to the file. The integer is encoded using a + variable-length encoding scheme, where the most significant bit of each byte indicates whether + there are more bytes to follow. This allows for efficient storage of integers that can have a + wide range of values. + + The method uses the `signed_varint` function to encode the integer before writing it to the file. 
+ + Example: + To write a signed integer to a file: + + ``` + file = StructFile("data.bin") + file.write_svarint(-42) + file.close() + ``` + """ self.write(signed_varint(i)) def read_varint(self): """Reads a variable-length encoded unsigned integer from the wrapped file. + + This method reads a variable-length encoded unsigned integer from the + file object that is wrapped by this StructFile instance. The integer + is encoded using a variable-length encoding scheme, where the number + of bytes used to represent the integer depends on its value. + + Returns: + int: The decoded unsigned integer. + + Raises: + IOError: If there is an error reading from the file. + + Example: + >>> with open('data.bin', 'rb') as f: + ... sf = StructFile(f) + ... value = sf.read_varint() + ... print(value) + 42 + + Note: + This method assumes that the file object is positioned at the + start of the encoded integer. After reading the integer, the file + object's position will be advanced by the number of bytes read. + """ return read_varint(self.read) def read_svarint(self): - """Reads a variable-length encoded signed integer from the wrapped - file. + """Reads a variable-length encoded signed integer from the wrapped file. + + This method reads a variable-length encoded signed integer from the wrapped file. + It uses the `read_varint` function to read the variable-length encoded integer, + and then decodes it as a signed integer using the `decode_signed_varint` function. + + Returns: + int: The decoded signed integer. + + Raises: + IOError: If there is an error reading from the file. + + Example: + >>> file = StructFile("data.bin") + >>> value = file.read_svarint() """ return decode_signed_varint(read_varint(self.read)) def write_tagint(self, i): - """Writes a sometimes-compressed unsigned integer to the wrapped file. - This is similar to the varint methods but uses a less compressed but - faster format. """ + Writes a sometimes-compressed unsigned integer to the wrapped file. - # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit - # int follows." Byte 255 means "An unsigned 32-bit int follows." + The write_tagint method is used to write an unsigned integer to the file. It uses a + sometimes-compressed format for faster writing. The method supports numbers from 0 to + 2^32-1. + + Parameters: + - i (int): The unsigned integer to be written to the file. + + Notes: + - Numbers from 0 to 253 are stored in one byte. + - Byte 254 indicates that an unsigned 16-bit integer follows. + - Byte 255 indicates that an unsigned 32-bit integer follows. + + Example usage: + ``` + file = StructFile() + file.write_tagint(42) + ``` + + """ if i <= 253: self.write(chr(i)) elif i <= 65535: @@ -223,10 +828,30 @@ def write_tagint(self, i): def read_tagint(self): """Reads a sometimes-compressed unsigned integer from the wrapped file. - This is similar to the varint methods but uses a less compressed but - faster format. - """ + This method reads an unsigned integer from the file. The integer can be + stored in two different formats: a compressed format and a faster but + less compressed format. + + The compressed format uses a single byte to represent the integer. If + the first byte read from the file is 254, the integer is stored in the + compressed format and can be retrieved using the `read_ushort()` method. + If the first byte is 255, the integer is stored in the compressed format + and can be retrieved using the `read_uint()` method. Otherwise, the first + byte represents the integer itself. 
+ + Returns: + int: The unsigned integer read from the file. + + Example: + Suppose we have a file with the following bytes: [253, 42]. Calling + `read_tagint()` on this file will return 253, as the first byte + represents the integer itself. + + Note: + This method assumes that the file is opened in binary mode. + + """ tb = ord(self.read(1)) if tb == 254: return self.read_ushort() @@ -236,50 +861,286 @@ def read_tagint(self): return tb def write_byte(self, n): - """Writes a single byte to the wrapped file, shortcut for - ``file.write(chr(n))``. + """Writes a single byte to the wrapped file. + + This method writes a single byte to the file object that is wrapped by the StructFile instance. + It is a shortcut for calling `file.write(chr(n))`. + + Parameters: + - n (int): The byte value to be written to the file. Must be an integer between 0 and 255. + + Raises: + - TypeError: If the provided value `n` is not an integer. + - ValueError: If the provided value `n` is not within the valid range of 0 to 255. + + Example: + ``` + with open("data.bin", "wb") as file: + struct_file = StructFile(file) + struct_file.write_byte(65) # Writes the ASCII value for 'A' to the file + ``` + + Note: + This method assumes that the file object is opened in binary mode ('b'). + """ self.write(pack_byte(n)) def read_byte(self): + """ + Reads a single byte from the file and returns its integer value. + + Returns: + int: The integer value of the byte read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + """ return ord(self.read(1)) def write_pickle(self, obj, protocol=-1): - """Writes a pickled representation of obj to the wrapped file.""" + """ + Writes a pickled representation of obj to the wrapped file. + + Parameters: + obj (object): The object to be pickled and written to the file. + protocol (int, optional): The pickling protocol to use. Default is -1. + + Raises: + pickle.PicklingError: If an error occurs during pickling. + + Notes: + This method uses the `pickle.dump()` function to write a pickled representation + of the given object to the file. The pickling protocol determines the format + in which the object is serialized. The default protocol (-1) uses the highest + available protocol supported by the Python interpreter. + + Example: + # Create a StructFile object + file = StructFile("data.bin") + + # Write a list object to the file using pickle + data = [1, 2, 3, 4, 5] + file.write_pickle(data) + + """ dump(obj, self.file, protocol) def read_pickle(self): - """Reads a pickled object from the wrapped file.""" + """ + Reads a pickled object from the wrapped file. + + Returns: + object: The pickled object read from the file. + + Raises: + EOFError: If the end of the file is reached before a pickled object is found. + pickle.UnpicklingError: If there is an error while unpickling the object. + """ return load(self.file) def write_sbyte(self, n): + """ + Writes a signed byte to the file. + + Args: + n (int): The signed byte value to write. + + Raises: + IOError: If an error occurs while writing to the file. + + Notes: + - The signed byte value should be within the range of -128 to 127. + - The file should be opened in binary mode before calling this method. + + Example: + To write a signed byte value of -42 to the file: + + >>> file.write_sbyte(-42) + """ self.write(pack_sbyte(n)) def write_int(self, n): + """ + Writes an integer to the file. + + Parameters: + - n (int): The integer to be written. 
+ + Returns: + None + + Raises: + - TypeError: If the input is not an integer. + + Notes: + - This method writes the integer to the file using the pack_int function. + - The pack_int function converts the integer into a binary representation. + - The binary representation is then written to the file. + - If the input is not an integer, a TypeError is raised. + """ self.write(pack_int(n)) def write_uint(self, n): + """ + Writes an unsigned integer to the file. + + Parameters: + n (int): The unsigned integer to write. + + Returns: + None + + Raises: + IOError: If an error occurs while writing to the file. + + Notes: + This method writes the unsigned integer `n` to the file. The integer is encoded using the `pack_uint` function. + + Example: + file.write_uint(42) + """ self.write(pack_uint(n)) def write_uint_le(self, n): + """ + Writes an unsigned integer in little-endian format to the file. + + Parameters: + - n (int): The unsigned integer to write. + + Returns: + None + + Raises: + - TypeError: If the input is not an integer. + - ValueError: If the input is a negative integer. + + Example: + >>> file.write_uint_le(42) + """ self.write(pack_uint_le(n)) def write_ushort(self, n): + """ + Writes an unsigned short integer (2 bytes) to the file. + + Parameters: + - n (int): The unsigned short integer to be written. + + Returns: + None + + Raises: + - IOError: If an error occurs while writing to the file. + + Usage: + file.write_ushort(42) + """ self.write(pack_ushort(n)) def write_ushort_le(self, n): + """ + Writes an unsigned short integer (2 bytes) in little-endian byte order to the file. + + Parameters: + - n (int): The unsigned short integer to be written. + + Returns: + None + + Raises: + - IOError: If an error occurs while writing to the file. + + Usage: + file.write_ushort_le(65535) + """ self.write(pack_ushort_le(n)) def write_long(self, n): + """ + Writes a long integer to the file. + + Parameters: + - n (int): The long integer to be written. + + Returns: + None + + Raises: + - IOError: If an error occurs while writing to the file. + + Notes: + - This method writes the long integer to the file using the pack_long function. + - The pack_long function converts the long integer into a binary representation. + - The binary representation is then written to the file. + - If an error occurs while writing to the file, an IOError is raised. + """ self.write(pack_long(n)) def write_ulong(self, n): + """ + Writes an unsigned long integer to the file. + + Parameters: + n (int): The unsigned long integer to write. + + Returns: + None + + Raises: + IOError: If an error occurs while writing to the file. + + Notes: + This method writes an unsigned long integer to the file using the pack_ulong function. + The pack_ulong function converts the integer into a byte string representation according to the platform's byte order. + The resulting byte string is then written to the file. + + Example: + To write the unsigned long integer 123456789 to the file: + + >>> file.write_ulong(123456789) + """ self.write(pack_ulong(n)) def write_float(self, n): + """ + Writes a floating-point number to the file. + + Args: + n (float): The floating-point number to write. + + Raises: + IOError: If an error occurs while writing to the file. + + Notes: + This method uses the `pack_float` function to convert the floating-point number + into a binary representation before writing it to the file. 
+ + Example: + >>> file = StructFile("data.bin", "wb") + >>> file.write_float(3.14) + """ self.write(pack_float(n)) def write_array(self, arry): + """ + Write an array to the file. + + This method writes the given array to the file. If the system is little-endian, + the array is first byte-swapped before writing. If the file is a real file, + the array is written using the `tofile()` method. Otherwise, the array is + converted to bytes and written using the `write()` method. + + Parameters: + - arry (array): The array to be written to the file. + + Returns: + None + + Raises: + None + """ if IS_LITTLE: arry = copy(arry) arry.byteswap() @@ -289,33 +1150,180 @@ def write_array(self, arry): self.write(arry.tobytes()) def read_sbyte(self): + """ + Reads a signed byte from the file. + + Returns: + int: The signed byte value read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + + Notes: + This method reads a single byte from the file and interprets it as a signed value. + The byte is unpacked using the `unpack_sbyte` function, which returns a tuple. + The first element of the tuple is the signed byte value, which is then returned. + + Example: + >>> file = StructFile("data.bin") + >>> byte = file.read_sbyte() + >>> print(byte) + -42 + """ return unpack_sbyte(self.read(1))[0] def read_int(self): + """ + Reads an integer value from the file. + + Returns: + int: The integer value read from the file. + + Raises: + IOError: If there is an error reading from the file. + + """ return unpack_int(self.read(_INT_SIZE))[0] def read_uint(self): + """ + Reads an unsigned integer from the file. + + Returns: + int: The unsigned integer read from the file. + + Raises: + IOError: If there is an error reading from the file. + """ return unpack_uint(self.read(_INT_SIZE))[0] def read_uint_le(self): + """ + Reads an unsigned integer from the file using little-endian byte order. + + Returns: + int: The unsigned integer read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + + Notes: + This method reads an unsigned integer from the file using little-endian byte order. + It assumes that the file is opened in binary mode. + + Example: + >>> file = StructFile("data.bin") + >>> value = file.read_uint_le() + """ return unpack_uint_le(self.read(_INT_SIZE))[0] def read_ushort(self): + """ + Reads an unsigned short (2 bytes) from the file. + + Returns: + int: The unsigned short value read from the file. + + Raises: + IOError: If there is an error reading from the file. + """ return unpack_ushort(self.read(_SHORT_SIZE))[0] def read_ushort_le(self): + """ + Reads an unsigned short (2 bytes) from the file in little-endian byte order. + + Returns: + int: The unsigned short value read from the file. + + Raises: + IOError: If there is an error reading from the file. + + Example: + >>> file = StructFile("data.bin") + >>> value = file.read_ushort_le() + """ return unpack_ushort_le(self.read(_SHORT_SIZE))[0] def read_long(self): + """ + Reads a long integer from the file. + + Returns: + int: The long integer read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + + Notes: + This method reads a long integer from the file using the `read` method of the file object. + The long integer is unpacked from the binary data using the `unpack_long` function. + The `unpack_long` function returns a tuple, and the first element of the tuple is returned as the result. 
+ + Example: + >>> file = StructFile("data.bin") + >>> value = file.read_long() + """ return unpack_long(self.read(_LONG_SIZE))[0] def read_ulong(self): + """ + Reads an unsigned long integer from the file. + + Returns: + int: The unsigned long integer read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + + Notes: + This method reads a fixed-size unsigned long integer from the file. The size of the + unsigned long integer is determined by the `_LONG_SIZE` constant. + + Example: + >>> file = StructFile("data.bin") + >>> value = file.read_ulong() + """ return unpack_ulong(self.read(_LONG_SIZE))[0] def read_float(self): + """ + Reads a single floating-point number from the file. + + Returns: + float: The floating-point number read from the file. + + Raises: + IOError: If an error occurs while reading from the file. + """ return unpack_float(self.read(_FLOAT_SIZE))[0] def read_array(self, typecode, length): + """ + Read an array of elements from the file. + + Args: + typecode (str): The typecode of the array elements. + length (int): The number of elements to read. + + Returns: + array: The array of elements read from the file. + + Raises: + IOError: If there is an error reading from the file. + + Notes: + - If the file is in "real" mode, the array is read using the `fromfile` method of the array object. + - If the file is not in "real" mode, the array is read using the `read` method of the file object and then converted to an array using the `frombytes` method of the array object. + - If the system is little-endian, the byte order of the array is swapped using the `byteswap` method of the array object. + + Example: + # Create a StructFile object + file = StructFile("data.bin") + + # Read an array of integers from the file + arr = file.read_array('i', 10) + """ a = array(typecode) if self.is_real: a.fromfile(self.file, length) @@ -326,40 +1334,240 @@ def read_array(self, typecode, length): return a def get(self, position, length): + """ + Reads a specified number of bytes from the file starting at the given position. + + Args: + position (int): The position in the file to start reading from. + length (int): The number of bytes to read from the file. + + Returns: + bytes: The bytes read from the file. + + Raises: + OSError: If an error occurs while reading from the file. + + Example: + >>> file = StructFile("data.bin") + >>> data = file.get(10, 20) + """ self.seek(position) return self.read(length) def get_byte(self, position): + """ + Retrieves a single byte from the file at the specified position. + + Parameters: + position (int): The position in the file from which to retrieve the byte. + + Returns: + int: The byte value at the specified position. + + Raises: + IndexError: If the position is out of range. + + Example: + # Create a StructFile object + file = StructFile("data.bin") + + # Get the byte at position 10 + byte = file.get_byte(10) + """ return unpack_byte(self.get(position, 1))[0] def get_sbyte(self, position): + """ + Retrieves a signed byte (8-bit integer) from the file at the specified position. + + Parameters: + - position (int): The position in the file from which to read the signed byte. + + Returns: + - int: The signed byte value read from the file. + + Raises: + - IndexError: If the position is out of range. 
+ + Example: + ``` + file = StructFile("data.bin") + byte = file.get_sbyte(10) + print(byte) # Output: -42 + ``` + """ return unpack_sbyte(self.get(position, 1))[0] def get_int(self, position): + """ + Retrieves an integer value from the file at the specified position. + + Parameters: + position (int): The position in the file from which to retrieve the integer value. + + Returns: + int: The integer value retrieved from the file. + + Raises: + IndexError: If the position is out of range. + + """ return unpack_int(self.get(position, _INT_SIZE))[0] def get_uint(self, position): + """ + Retrieves an unsigned integer from the file at the given position. + + Parameters: + - position (int): The position in the file from which to read the unsigned integer. + + Returns: + - int: The unsigned integer value read from the file. + + Raises: + - IndexError: If the position is out of range. + """ return unpack_uint(self.get(position, _INT_SIZE))[0] def get_ushort(self, position): + """ + Retrieves an unsigned short integer (2 bytes) from the file at the specified position. + + Parameters: + - position (int): The position in the file from which to read the unsigned short integer. + + Returns: + - ushort (int): The unsigned short integer value read from the file. + + Raises: + - IndexError: If the position is out of range. + + Example: + ``` + file = StructFile("data.bin") + ushort_value = file.get_ushort(10) + ``` + """ return unpack_ushort(self.get(position, _SHORT_SIZE))[0] def get_long(self, position): + """ + Retrieves a long integer value from the file at the given position. + + Parameters: + position (int): The position in the file from which to read the long integer. + + Returns: + int: The long integer value read from the file. + + Raises: + ValueError: If the position is out of bounds or if the file is not open. + + Notes: + - This method reads a long integer value from the file at the specified position. + - The file must be open before calling this method. + - The position must be a valid position within the file. + """ return unpack_long(self.get(position, _LONG_SIZE))[0] def get_ulong(self, position): + """ + Retrieves an unsigned long integer from the file at the specified position. + + Parameters: + position (int): The position in the file from which to read the unsigned long integer. + + Returns: + int: The unsigned long integer value read from the file. + + Raises: + IndexError: If the position is out of range. + + Notes: + - The unsigned long integer is read from the file using the `get` method. + - The `unpack_ulong` function is used to convert the byte string to an unsigned long integer. + - Only the first value of the unpacked result is returned. + + Example: + # Create a StructFile object + file = StructFile("data.bin") + + # Read an unsigned long integer from the file at position 100 + value = file.get_ulong(100) + """ return unpack_ulong(self.get(position, _LONG_SIZE))[0] def get_float(self, position): + """ + Retrieves a float value from the file at the specified position. + + Parameters: + position (int): The position in the file where the float value is located. + + Returns: + float: The float value retrieved from the file. + + Raises: + IndexError: If the position is out of range. + + """ return unpack_float(self.get(position, _FLOAT_SIZE))[0] def get_array(self, position, typecode, length): + """ + Reads an array of elements from the file starting at the given position. + + Args: + position (int): The position in the file to start reading from. 
+ typecode (str): The typecode of the elements in the array. + length (int): The number of elements to read. + + Returns: + list: A list containing the elements read from the file. + + Raises: + OSError: If there is an error reading the file. + + Example: + To read an array of 10 integers starting from position 100 in the file: + + >>> file = StructFile("data.bin") + >>> array = file.get_array(100, 'i', 10) + """ self.seek(position) return self.read_array(typecode, length) class BufferFile(StructFile): + """ + A class representing a file stored in memory as a buffer. + + This class provides methods to manipulate and retrieve data from the buffer. + + Attributes: + _buf (bytes): The buffer containing the file data. + _name (str): The name of the file. + file (BytesIO): A BytesIO object representing the file. + onclose (callable): A callback function to be called when the file is closed. + is_real (bool): Indicates whether the file is a real file or a buffer. + is_closed (bool): Indicates whether the file is closed. + + Methods: + __init__(self, buf, name=None, onclose=None): Initializes a BufferFile object. + subset(self, position, length, name=None): Creates a new BufferFile object representing a subset of the current file. + get(self, position, length): Retrieves a portion of the file data. + get_array(self, position, typecode, length): Retrieves an array of data from the file. + """ + def __init__(self, buf, name=None, onclose=None): + """ + Initializes a BufferFile object. + + Args: + buf (bytes): The buffer containing the file data. + name (str, optional): The name of the file. Defaults to None. + onclose (callable, optional): A callback function to be called when the file is closed. Defaults to None. + """ self._buf = buf self._name = name self.file = BytesIO(buf) @@ -369,13 +1577,45 @@ def __init__(self, buf, name=None, onclose=None): self.is_closed = False def subset(self, position, length, name=None): + """ + Creates a new BufferFile object that represents a subset of the current file. + + Args: + position (int): The starting position of the subset. + length (int): The length of the subset. + name (str, optional): The name of the new file. Defaults to None. + + Returns: + BufferFile: A new BufferFile object representing the subset of the current file. + """ name = name or self._name return BufferFile(self.get(position, length), name=name) def get(self, position, length): + """ + Retrieves a portion of the file data. + + Args: + position (int): The starting position of the data. + length (int): The length of the data to retrieve. + + Returns: + bytes: The requested portion of the file data. + """ return bytes(self._buf[position : position + length]) def get_array(self, position, typecode, length): + """ + Retrieves an array of data from the file. + + Args: + position (int): The starting position of the array. + typecode (str): The typecode of the array elements. + length (int): The length of the array. + + Returns: + array: An array of data retrieved from the file. + """ a = array(typecode) a.frombytes(self.get(position, length * _SIZEMAP[typecode])) if IS_LITTLE: @@ -384,6 +1624,39 @@ def get_array(self, position, typecode, length): class ChecksumFile(StructFile): + """ + A file-like object that calculates a checksum of the data read or written. + + This class inherits from StructFile and provides additional functionality to calculate a checksum + using the CRC32 algorithm from the zlib module. The checksum is updated as data is read or written. 
+ + Note: This class does not support seeking. + + Usage: + - Create an instance of ChecksumFile by passing the file path or file object to the constructor. + - Read or write data using the file-like methods provided by ChecksumFile. + - Call the checksum() method to get the calculated checksum. + + Example: + ``` + with ChecksumFile("data.txt", "rb") as file: + data = file.read(1024) + print(file.checksum()) + ``` + + Attributes: + - _check: The current checksum value. + - _crc32: The CRC32 function from the zlib module. + + Methods: + - __iter__(): Returns an iterator over the lines of the file. + - seek(): Raises a ValueError as seeking is not supported. + - read(): Reads data from the file and updates the checksum. + - write(): Writes data to the file and updates the checksum. + - checksum(): Returns the calculated checksum. + + """ + def __init__(self, *args, **kwargs): StructFile.__init__(self, *args, **kwargs) self._check = 0 @@ -395,16 +1668,41 @@ def __iter__(self): yield line def seek(self, *args): - raise Exception("Cannot seek on a ChecksumFile") + raise ValueError("Cannot seek on a ChecksumFile") def read(self, *args, **kwargs): + """ + Read data from the file and update the checksum. + + Args: + - *args: Variable length argument list to pass to the underlying file's read() method. + - **kwargs: Arbitrary keyword arguments to pass to the underlying file's read() method. + + Returns: + - b: The read data. + + """ b = self.file.read(*args, **kwargs) self._check = self._crc32(b, self._check) return b def write(self, b): + """ + Write data to the file and update the checksum. + + Args: + - b: The data to write. + + """ self._check = self._crc32(b, self._check) self.file.write(b) def checksum(self): + """ + Get the calculated checksum. + + Returns: + - The calculated checksum as an unsigned 32-bit integer. + + """ return self._check & 0xFFFFFFFF diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 27b1a05a..29095b7f 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -39,8 +39,8 @@ * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order - they appear in the document (FIRST) or show higher-scoring fragments first - (SCORE) + they appear in the document (first) or show higher-scoring fragments first + (score) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. @@ -360,7 +360,7 @@ class SentenceFragmenter(Fragmenter): When highlighting with this fragmenter, you should use an analyzer that does NOT remove stop words, for example:: - sa = StandardAnalyzer(stoplist=None) + sa = standard_analyzer(stoplist=None) """ def __init__(self, maxchars=200, sentencechars=".!?", charlimit=DEFAULT_CHARLIMIT): @@ -584,12 +584,12 @@ def fragment_matches(self, text, tokens): currentlen = right - left while j < len(tokens) - 1 and currentlen < maxchars: - next = tokens[j + 1] - ec = next.endchar + next_token = tokens[j + 1] + ec = next_token.endchar if ec - right <= surround and ec - left <= maxchars: j += 1 right = ec - currentlen += ec - next.startchar + currentlen += ec - next_token.startchar else: break @@ -623,22 +623,22 @@ def __call__(self, f): # Fragment sorters -def SCORE(fragment): +def score(fragment): "Sorts higher scored passages first." return 1 -def FIRST(fragment): +def first(fragment): "Sorts passages from earlier in the document first." 
return fragment.startchar -def LONGER(fragment): +def longer(fragment): "Sorts longer passages first." return 0 - len(fragment) -def SHORTER(fragment): +def shorter(fragment): "Sort shorter passages first." return len(fragment) @@ -934,7 +934,7 @@ def highlight( top=3, scorer=None, minscore=1, - order=FIRST, + order=first, mode="query", ): if scorer is None: @@ -965,7 +965,7 @@ def __init__( scorer=None, formatter=None, always_retokenize=False, - order=FIRST, + order=first, ): self.fragmenter = fragmenter or ContextFragmenter() self.scorer = scorer or BasicFragmentScorer() diff --git a/src/whoosh/legacy.py b/src/whoosh/legacy.py index 13b21e79..928ac0e2 100644 --- a/src/whoosh/legacy.py +++ b/src/whoosh/legacy.py @@ -52,8 +52,8 @@ def load_110_toc(stream, gen, schema, version): "wcw2": "whoosh.codec.whoosh2", } objmap = { - "%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", - "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", + "%(wf)s.NUMERIC": "%(wcw2)s.old_numeric", + "%(wf)s.DATETIME": "%(wcw2)s.old_datetime", "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py index ffbb4fc7..839b234b 100644 --- a/src/whoosh/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -46,6 +46,8 @@ class DateParseError(Exception): "Represents an error in parsing date text." + pass + # Utility functions diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py index dd08eff5..01212aad 100644 --- a/src/whoosh/query/terms.py +++ b/src/whoosh/query/terms.py @@ -402,7 +402,7 @@ def _find_prefix(self, text): lp = len(prefix) if lp < len(text) and text[lp] in "*?": # we stripped something starting from * or ? - they both MAY mean - # "0 times". As we had stripped starting from FIRST special char, + # "0 times". As we had stripped starting from first special char, # that implies there were only ordinary chars left of it. Thus, # the very last of them is not part of the real prefix: prefix = prefix[:-1] diff --git a/src/whoosh/support/base85.py b/src/whoosh/support/base85.py index adb9e74c..38e8ecb2 100644 --- a/src/whoosh/support/base85.py +++ b/src/whoosh/support/base85.py @@ -25,19 +25,41 @@ def to_base85(x, islong=False): - "Encodes the given integer using base 85." + """ + Encodes the given integer using base 85. + Parameters: + - x: The integer to be encoded. + - islong: A boolean indicating whether the integer is a long integer or not. Default is False. + + Returns: + - The base 85 encoded string. + + Example: + >>> to_base85(12345) + '3qo' + """ size = 10 if islong else 5 rems = "" - for i in range(size): + for _ in range(size): rems = b85chars[x % 85] + rems x //= 85 return rems def from_base85(text): - "Decodes the given base 85 text into an integer." + """ + Decodes the given base 85 text into an integer. + + Parameters: + text (str): The base 85 encoded text to be decoded. + Returns: + int: The decoded integer value. + + Raises: + KeyError: If the input text contains characters not present in the base 85 encoding table. + """ acc = 0 for c in text: acc = acc * 85 + b85dec[c] @@ -45,9 +67,28 @@ def from_base85(text): # Bytes encoding and decoding functions +def b85encode(text, pad=False): + """ + Encode the given text using Base85 encoding. + Args: + text (str): The text to be encoded. + pad (bool, optional): Whether to pad the encoded output. Defaults to False. 
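# Illustrative round trip, not part of this patch, for the integer helpers
# documented above: from_base85(to_base85(x)) recovers x because the leading
# pad characters decode to zero. The exact encoded strings depend on the
# b85chars table, so none are asserted here.
from whoosh.support.base85 import from_base85, to_base85

for x in (0, 12345, 2**31 - 1):
    assert from_base85(to_base85(x)) == x                            # 5-character form
assert from_base85(to_base85(2**63 - 1, islong=True)) == 2**63 - 1   # 10-character form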
-def b85encode(text, pad=False): + Returns: + str: The Base85 encoded string. + + Raises: + None + + Example: + >>> b85encode("Hello World") + '87cURD]j7BEbo80' + + Note: + Base85 encoding is a binary-to-text encoding scheme that represents binary data in an ASCII string format. + It is commonly used in various applications such as data compression and data transmission. + """ l = len(text) r = l % 4 if r: @@ -75,6 +116,39 @@ def b85encode(text, pad=False): def b85decode(text): + """ + Decode a base85 encoded string. + + Args: + text (str): The base85 encoded string to decode. + + Returns: + bytes: The decoded binary data. + + Raises: + TypeError: If the input string contains invalid base85 characters. + OverflowError: If the decoded value exceeds the maximum representable value. + + Example: + >>> encoded = "9jqo^BlbD-BleB1DJ+*+F(f,q" + >>> decoded = b85decode(encoded) + >>> print(decoded) + b'Hello, World!' + + This function decodes a base85 encoded string and returns the corresponding binary data. + Base85 encoding is a method of representing binary data as ASCII text using 85 different characters. + The function takes a base85 encoded string as input and returns the decoded binary data. + + The function raises a TypeError if the input string contains invalid base85 characters. + It also raises an OverflowError if the decoded value exceeds the maximum representable value. + + Example usage: + >>> encoded = "9jqo^BlbD-BleB1DJ+*+F(f,q" + >>> decoded = b85decode(encoded) + >>> print(decoded) + b'Hello, World!' + """ + l = len(text) out = [] for i in range(0, len(text), 5): diff --git a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py index de7aed6c..0f78eb81 100644 --- a/src/whoosh/support/bench.py +++ b/src/whoosh/support/bench.py @@ -59,31 +59,51 @@ def __init__(self, d): class Module: def __init__(self, bench, options, args): + """ + Initializes a Module object. + + Args: + bench (object): The benchmark object. + options (object): The options object. + args (object): The arguments object. + """ self.bench = bench self.options = options self.args = args def __repr__(self): + """ + Returns a string representation of the Module object. + """ return self.__class__.__name__ def indexer(self, **kwargs): """ - This method is responsible for indexing the data using the specified keyword arguments. + Indexes the data using the specified keyword arguments. - Parameters: - - kwargs: Additional keyword arguments for configuring the indexing process. + Args: + **kwargs: Additional keyword arguments for configuring the indexing process. Returns: - - None + None """ pass def index_document(self, d): + """ + Indexes a document. + + Args: + d (object): The document object. + + Raises: + NotImplementedError: If the method is not implemented in the subclass. + """ raise NotImplementedError def finish(self, **kwargs): """ - Finish the benchmark and perform any necessary cleanup. + Finishes the benchmark and performs any necessary cleanup. Args: **kwargs: Additional keyword arguments. @@ -94,6 +114,15 @@ def finish(self, **kwargs): pass def _process_result(self, d): + """ + Processes the result. + + Args: + d (object): The result object. + + Returns: + The processed result. + """ attrname = f"process_result_{self.options.lib}" if hasattr(self.bench.spec, attrname): method = getattr(self.bench.spec, attrname) @@ -105,33 +134,97 @@ def _process_result(self, d): def searcher(self): """ - This method returns a searcher object. + Returns a searcher object. 
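# A hypothetical minimal subclass, not part of this patch, showing the
# override points the Module interface documented in this hunk describes:
# indexer(), index_document() and finish() on the write side, searcher(),
# query() and find() on the read side. The in-memory "index" is purely
# illustrative.
from whoosh.support.bench import Module

class InMemoryModule(Module):
    def indexer(self, **kwargs):
        self._docs = []                    # stand-in for a real index

    def index_document(self, d):
        self._docs.append(d)

    def finish(self, **kwargs):
        pass                               # nothing to flush

    def searcher(self):
        pass                               # no search handle needed

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return [d for d in self._docs if q in d.get("body", "")]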
""" pass def query(self): + """ + Executes a query. + + Raises: + NotImplementedError: If the method is not implemented in the subclass. + """ raise NotImplementedError def find(self, q): + """ + Finds a query. + + Args: + q (object): The query object. + + Raises: + NotImplementedError: If the method is not implemented in the subclass. + """ raise NotImplementedError def findterms(self, terms): + """ + Finds terms. + + Args: + terms (object): The terms object. + + Raises: + NotImplementedError: If the method is not implemented in the subclass. + """ raise NotImplementedError def results(self, r): + """ + Generates processed results. + + Args: + r (object): The results object. + + Yields: + The processed results. + """ for hit in r: yield self._process_result(hit) class Spec: + """ + The Spec class represents a benchmark specification. + + Attributes: + headline_field (str): The name of the field containing the headline. + main_field (str): The name of the main field. + options (object): The benchmark options. + args (list): The benchmark arguments. + + Methods: + __init__(self, options, args): Initializes a new instance of the Spec class. + documents(self): Abstract method to be implemented by subclasses. + setup(self): Performs the setup for the benchmark. + print_results(self, ls): Prints the benchmark results. + + Usage: + spec = Spec(options, args) + spec.setup() + spec.print_results(ls) + """ + headline_field = "title" main_field = "body" def __init__(self, options, args): + """ + Initializes a new instance of the Spec class. + + Args: + options (object): The benchmark options. + args (list): The benchmark arguments. + """ self.options = options self.args = args def documents(self): + """ + Abstract method to be implemented by subclasses. + """ raise NotImplementedError def setup(self): @@ -141,6 +234,12 @@ def setup(self): pass def print_results(self, ls): + """ + Prints the benchmark results. + + Args: + ls (list): The list of benchmark results. + """ showbody = self.options.showbody snippets = self.options.snippets limit = self.options.limit @@ -156,7 +255,47 @@ def print_results(self, ls): class WhooshModule(Module): + """ + A module for interacting with the Whoosh search engine. + + This module provides methods for indexing documents, searching the index, and retrieving search results. + + Attributes: + writer: An instance of the Whoosh IndexWriter used for adding documents to the index. + srch: An instance of the Whoosh IndexSearcher used for searching the index. + parser: An instance of the Whoosh QueryParser used for parsing search queries. + + Methods: + indexer(create=True): Initializes the Whoosh index and sets up the IndexWriter. + index_document(d): Indexes a document in the Whoosh index. + finish(merge=True, optimize=False): Commits changes to the index. + searcher(): Initializes the IndexSearcher and QueryParser. + query(): Parses the search query string and returns a Query object. + find(q): Executes a search query and returns the search results. + findterms(terms): Executes multiple search queries for each term and returns the search results. + + Usage: + module = WhooshModule() + module.indexer() + module.index_document(document) + module.finish() + module.searcher() + query = module.query() + results = module.find(query) + """ + def indexer(self, create=True): + """ + Creates or opens an index using the specified schema and options. + + Args: + create (bool, optional): If True, creates a new index if it doesn't exist. + If False, opens an existing index. 
+ Defaults to True. + + Returns: + IndexWriter: An instance of IndexWriter for the created or opened index. + """ schema = self.bench.spec.whoosh_schema() path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") @@ -184,30 +323,123 @@ def indexer(self, create=True): self._procdoc = self.bench.spec.process_document_whoosh def index_document(self, d): + """ + Indexes a document in the Whoosh index. + + Args: + d (dict): The document to be indexed. The keys represent the field names and the values represent the field values. + + Returns: + None + """ _procdoc = self._procdoc if _procdoc: _procdoc(d) self.writer.add_document(**d) def finish(self, merge=True, optimize=False): + """ + Commits the changes made to the index. + + Args: + merge (bool, optional): Specifies whether to perform a merge operation before committing. + Defaults to True. + optimize (bool, optional): Specifies whether to optimize the index after committing. + Defaults to False. + + Returns: + None + + Raises: + Any exceptions raised by the underlying writer.commit() method. + + Notes: + - This method should be called after making changes to the index to ensure that the changes + are persisted. + - By default, a merge operation is performed before committing. This helps in optimizing + the index by merging smaller segments into larger ones. + - If the `optimize` parameter is set to True, the index will be further optimized after + committing. This can improve search performance but may take longer to complete. + + Usage: + bench = Bench() + # ... perform index modifications ... + bench.finish(merge=True, optimize=False) + """ self.writer.commit(merge=merge, optimize=optimize) def searcher(self): + """ + Creates and returns a searcher object for performing searches on the index. + + Returns: + Searcher: A searcher object that can be used to perform searches on the index. + + Raises: + OSError: If there is an error while opening the index directory. + """ path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") ix = index.open_dir(path) self.srch = ix.searcher(weighting=scoring.PL2()) self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) def query(self): + """ + Parses the query string and returns a parsed query object. + + Args: + None + + Returns: + A parsed query object. + + Raises: + None + + Example: + bench = Bench() + bench.query() # Returns a parsed query object + """ qstring = " ".join(self.args).decode("utf-8") return self.parser.parse(qstring) def find(self, q): + """ + Executes a search query and returns the results. + + Args: + q (str): The search query string. + + Returns: + list: A list of search results. + + """ return self.srch.search( q, limit=int(self.options.limit), optimize=self.options.optimize ) def findterms(self, terms): + """ + Searches for the given terms in the specified field and returns the search results. + + Args: + terms (list): A list of terms to search for. + + Yields: + whoosh.searching.Results: The search results for each term. + + Returns: + None + + Raises: + None + + Example: + bench = Bench() + terms = ["term1", "term2", "term3"] + for result in bench.findterms(terms): + print(result) + """ limit = int(self.options.limit) s = self.srch q = query.Term(self.bench.spec.main_field, None) @@ -217,12 +449,75 @@ def findterms(self, terms): class XappyModule(Module): + """ + A module for indexing and searching documents using Xappy. 
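# Rough sketch, not part of this patch, of the plain Whoosh calls that the
# WhooshModule methods above wrap: build an index, add a document, commit,
# then parse and run a query. The directory and field names are illustrative.
import os
from whoosh import fields, index
from whoosh.qparser import QueryParser

schema = fields.Schema(title=fields.ID(stored=True), body=fields.TEXT)
os.makedirs("bench_ix", exist_ok=True)
ix = index.create_in("bench_ix", schema)

writer = ix.writer()
writer.add_document(title="1", body="hello benchmark world")
writer.commit()                                   # what finish() does

with ix.searcher() as searcher:                   # what searcher() sets up
    parser = QueryParser("body", schema=ix.schema)
    q = parser.parse("hello")                     # what query() returns
    for hit in searcher.search(q, limit=10):      # what find() runs
        print(hit["title"])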
+ + This module provides methods for indexing documents, performing searches, + and retrieving search results using the Xappy library. + + Usage: + 1. Create an instance of XappyModule. + 2. Call the `indexer` method to obtain a connection to the Xappy index. + 3. Use the `index_document` method to add documents to the index. + 4. Call the `finish` method to flush any pending changes to the index. + 5. Call the `searcher` method to obtain a connection for searching the index. + 6. Use the `query` method to create a query object for searching. + 7. Call the `find` method to perform a search and retrieve the results. + 8. Use the `results` method to iterate over the search results. + + Note: Before using this module, make sure to install the Xappy library. + + Attributes: + options (object): An object containing configuration options. + bench (object): An object representing the benchmarking tool. + + Methods: + indexer(**kwargs): Returns a connection to the Xappy index. + index_document(conn=None, d=None): Indexes a document in the Xappy index. + finish(conn): Flushes any pending changes to the Xappy index. + searcher(): Returns a connection for searching the Xappy index. + query(conn=None): Creates a query object for searching the Xappy index. + find(conn=None, q=None): Performs a search and retrieves the results. + findterms(conn=None, terms=None): Performs searches for multiple terms. + results(r): Iterates over the search results. + + """ + def indexer(self, **kwargs): + """ + Creates and returns a connection to the Xappy index. + + Args: + **kwargs: Additional keyword arguments to be passed to the Xappy connection. + + Returns: + Xappy connection: A connection to the Xappy index. + + Raises: + None. + + Example usage: + conn = indexer() + # Use the connection to perform operations on the Xappy index + """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") conn = self.bench.spec.xappy_connection(path) return conn def index_document(self, conn=None, d=None): + """ + Indexes a document in the Xappy index. + + Args: + conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. + d (dict): The document to be indexed. + + Returns: + None. + + Raises: + None. + """ if hasattr(self.bench, "process_document_xappy"): self.bench.process_document_xappy(d) doc = xappy.UnprocessedDocument() @@ -234,25 +529,99 @@ def index_document(self, conn=None, d=None): conn.add(doc) def finish(self, conn): + """ + Flushes any pending changes to the Xappy index. + + Args: + conn (Xappy connection): The connection to the Xappy index. + + Returns: + None. + + Raises: + None. + """ conn.flush() def searcher(self): + """ + Returns a connection for searching the Xappy index. + + Args: + None. + + Returns: + Xappy connection: A connection for searching the Xappy index. + + Raises: + None. + """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") return xappy.SearchConnection(path) def query(self, conn=None): + """ + Creates a query object for searching the Xappy index. + + Args: + conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. + + Returns: + Xappy query: A query object for searching the Xappy index. + + Raises: + None. + """ return conn.query_parse(" ".join(self.args)) def find(self, conn=None, q=None): + """ + Performs a search and retrieves the results. + + Args: + conn (Xappy connection, optional): The connection to the Xappy index. 
If not provided, a new connection will be created. + q (Xappy query): The query object for searching the Xappy index. + + Returns: + Xappy results: The search results. + + Raises: + None. + """ return conn.search(q, 0, int(self.options.limit)) def findterms(self, conn=None, terms=None): + """ + Performs searches for multiple terms. + + Args: + conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. + terms (list): The list of terms to search for. + + Returns: + generator: A generator that yields the search results for each term. + + Raises: + None. + """ limit = int(self.options.limit) for term in terms: q = conn.query_field(self.bench.spec.main_field, term) yield conn.search(q, 0, limit) def results(self, r): + """ + Iterates over the search results. + + Args: + r (Xappy results): The search results. + + Returns: + generator: A generator that yields each search result. + + Raises: + None. + """ hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: @@ -260,12 +629,41 @@ def results(self, r): class XapianModule(Module): + """ + XapianModule is a module that provides indexing and searching capabilities using Xapian. + + Args: + Module (class): The base class for all modules. + + Attributes: + database (xapian.WritableDatabase): The Xapian writable database. + ixer (xapian.TermGenerator): The Xapian term generator. + db (xapian.Database): The Xapian database. + enq (xapian.Enquire): The Xapian enquire object. + qp (xapian.QueryParser): The Xapian query parser. + + """ + def indexer(self, **kwargs): + """ + Initializes the Xapian indexer. + + Args: + **kwargs: Additional keyword arguments. + + """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xapian") self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) self.ixer = xapian.TermGenerator() def index_document(self, d): + """ + Indexes a document in the Xapian database. + + Args: + d (dict): The document to be indexed. + + """ if hasattr(self.bench, "process_document_xapian"): self.bench.process_document_xapian(d) doc = xapian.Document() @@ -276,9 +674,20 @@ def index_document(self, d): self.database.add_document(doc) def finish(self, **kwargs): + """ + Flushes the Xapian database. + + Args: + **kwargs: Additional keyword arguments. + + """ self.database.flush() def searcher(self): + """ + Initializes the Xapian searcher. + + """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") self.db = xapian.Database(path) self.enq = xapian.Enquire(self.db) @@ -286,13 +695,40 @@ def searcher(self): self.qp.set_database(self.db) def query(self): + """ + Parses and returns the query. + + Returns: + xapian.Query: The parsed query. + + """ return self.qp.parse_query(" ".join(self.args)) def find(self, q): + """ + Finds and returns the matching documents for the given query. + + Args: + q (xapian.Query): The query to search for. + + Returns: + xapian.MSet: The matching documents. + + """ self.enq.set_query(q) return self.enq.get_mset(0, int(self.options.limit)) def findterms(self, terms): + """ + Finds and returns the matching documents for each term in the given list. + + Args: + terms (list): The list of terms to search for. + + Yields: + xapian.MSet: The matching documents for each term. 
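# Compressed sketch, not part of this patch, of the xapian calls that the
# XapianModule docstrings above describe; the database path and text are
# placeholders.
import xapian

db = xapian.WritableDatabase("xapian_ix", xapian.DB_CREATE_OR_OPEN)
tg = xapian.TermGenerator()

doc = xapian.Document()
tg.set_document(doc)
tg.index_text("hello benchmark world")
db.add_document(doc)
db.flush()                                    # what finish() does

rdb = xapian.Database("xapian_ix")
qp = xapian.QueryParser()
qp.set_database(rdb)
enq = xapian.Enquire(rdb)
enq.set_query(qp.parse_query("hello"))        # what query() returns
print(enq.get_mset(0, 10).size())             # what find() retrieves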
+ + """ limit = int(self.options.limit) for term in terms: q = self.qp.parse_query(term) @@ -300,6 +736,16 @@ def findterms(self, terms): yield self.enq.get_mset(0, limit) def results(self, matches): + """ + Processes and yields the results from the given matches. + + Args: + matches (xapian.MSet): The matches to process. + + Yields: + dict: The processed result for each match. + + """ hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for m in matches: @@ -309,47 +755,176 @@ def results(self, matches): class SolrModule(Module): + """ + A module for interacting with Apache Solr. + + This module provides methods for indexing documents, searching for documents, + and retrieving search results from an Apache Solr server. + + Args: + Module (class): The base class for all modules. + + Attributes: + solr_doclist (list): A list to store the documents to be indexed. + conn (pysolr.Solr): A connection object to interact with the Solr server. + solr (pysolr.Solr): A connection object to interact with the Solr server for searching. + + """ + def indexer(self, **kwargs): + """ + Initializes the SolrModule for indexing. + + This method initializes the SolrModule by creating a connection to the Solr server, + deleting all existing documents in the server, and committing the changes. + + Args: + **kwargs: Additional keyword arguments. + + """ + self.solr_doclist = [] self.conn = pysolr.Solr(self.options.url) self.conn.delete("*:*") self.conn.commit() def index_document(self, d): + """ + Adds a document to the list of documents to be indexed. + + This method adds a document to the list of documents to be indexed. + If the number of documents in the list reaches the batch size specified in the options, + the documents are added to the Solr server and the list is cleared. + + Args: + d (dict): The document to be indexed. + + """ + self.solr_doclist.append(d) if len(self.solr_doclist) >= int(self.options.batch): self.conn.add(self.solr_doclist, commit=False) self.solr_doclist = [] def finish(self, **kwargs): + """ + Finalizes the indexing process. + + This method finalizes the indexing process by adding any remaining documents in the list + to the Solr server, optimizing the server, and cleaning up resources. + + Args: + **kwargs: Additional keyword arguments. + + """ + if self.solr_doclist: self.conn.add(self.solr_doclist) del self.solr_doclist self.conn.optimize(block=True) def searcher(self): + """ + Initializes the SolrModule for searching. + + This method initializes the SolrModule by creating a connection to the Solr server + specifically for searching. + + """ + self.solr = pysolr.Solr(self.options.url) def query(self): + """ + Constructs a query string. + + This method constructs a query string by joining the arguments passed to the script. + + Returns: + str: The constructed query string. + + """ + return " ".join(self.args) def find(self, q): + """ + Executes a search query. + + This method executes a search query on the Solr server using the provided query string. + + Args: + q (str): The query string. + + Returns: + pysolr.Results: The search results. + + """ + return self.solr.search(q, limit=int(self.options.limit)) def findterms(self, terms): + """ + Executes search queries for each term. + + This method executes search queries on the Solr server for each term in the provided list. + The search queries are constructed by appending the term to the "body:" field. + + Args: + terms (list): The list of terms to search for. 
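# Hedged sketch, not part of this patch, of the pysolr calls that the
# SolrModule docstrings above describe; the URL and fields are placeholders
# and assume a core whose schema accepts them.
import pysolr

solr = pysolr.Solr("http://localhost:8983/solr/bench")
solr.delete(q="*:*")                                          # clear the core
solr.add([{"id": "1", "body": "hello benchmark world"}], commit=False)
solr.commit()
solr.optimize()                                               # what finish() does

for hit in solr.search("body:hello", rows=10):                # what find() runs
    print(hit["id"])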
+ + Yields: + pysolr.Results: The search results for each term. + + """ + limit = int(self.options.limit) for term in terms: yield self.solr.search("body:" + term, limit=limit) class ZcatalogModule(Module): + """ + A module for indexing and searching documents using ZCatalog. + + This module provides functionality for indexing and searching documents using ZCatalog, + which is a powerful indexing and search system for Python applications. + + Usage: + 1. Create an instance of ZcatalogModule. + 2. Call the `indexer` method to set up the indexing environment. + 3. Call the `index_document` method to index a document. + 4. Call the `finish` method to commit the changes and clean up resources. + 5. Call the `searcher` method to set up the searching environment. + 6. Call the `query` method to specify the search query. + 7. Call the `find` method to retrieve search results. + 8. Call the `findterms` method to retrieve search results for each term in a list. + 9. Call the `results` method to process and iterate over search results. + + Note: This module requires the ZODB package to be installed. + + Attributes: + - cat: The ZCatalog instance used for indexing and searching. + - zcatalog_count: The count of indexed documents. + + """ + def indexer(self, **kwargs): - import transaction # type: ignore # type: ignore @UnresolvedImport - from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport - from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport - from ZODB.FileStorage import ( - FileStorage, # type: ignore # type: ignore @UnresolvedImport - ) + """ + Set up the indexing environment. + + This method creates the necessary directory and storage for indexing, + initializes the ZCatalog instance, and commits the changes. + + Args: + - kwargs: Additional keyword arguments. + + """ + + import transaction + from zcatalog import catalog + from ZODB.DB import DB + from ZODB.FileStorage import FileStorage directory = os.path.join(self.options.dir, f"{self.options.indexname}_zcatalog") if os.path.exists(directory): @@ -368,28 +943,57 @@ def indexer(self, **kwargs): self.zcatalog_count = 0 def index_document(self, d): + """ + Index a document. + + This method indexes a document by processing it with the `process_document_zcatalog` + method (if available), creating a ZDoc instance, and indexing the document using the + ZCatalog instance. It also commits the changes periodically based on the `zcatalog_count` + attribute. + + Args: + - d: The document to be indexed. + + """ + if hasattr(self.bench, "process_document_zcatalog"): self.bench.process_document_zcatalog(d) doc = ZDoc(d) self.cat.index_doc(doc) self.zcatalog_count += 1 if self.zcatalog_count >= 100: - import transaction # type: ignore # type: ignore @UnresolvedImport + import transaction transaction.commit() self.zcatalog_count = 0 def finish(self, **kwargs): - import transaction # type: ignore # type: ignore @UnresolvedImport + """ + Finish indexing and clean up resources. + + This method commits the changes made during indexing and cleans up resources. + + Args: + - kwargs: Additional keyword arguments. + + """ + + import transaction transaction.commit() del self.zcatalog_count def searcher(self): - from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport - from ZODB.FileStorage import ( - FileStorage, # type: ignore # type: ignore @UnresolvedImport - ) + """ + Set up the searching environment. 
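# Minimal ZODB bookkeeping sketch, not part of this patch, matching the calls
# the ZcatalogModule.indexer docstring above describes: open a FileStorage-
# backed database, stash an object under the root, and commit. The real
# module stores a zcatalog Catalog here; a plain dict stands in for it.
import os
import transaction
from ZODB.DB import DB
from ZODB.FileStorage import FileStorage

os.makedirs("zcatalog_ix", exist_ok=True)
storage = FileStorage(os.path.join("zcatalog_ix", "index"))
db = DB(storage)
conn = db.open()
conn.root()["cat"] = {"placeholder": "catalog goes here"}
transaction.commit()
conn.close()
db.close()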
+ + This method sets up the searching environment by opening the ZODB connection, + retrieving the ZCatalog instance, and assigning it to the `cat` attribute. + + """ + + from ZODB.DB import DB + from ZODB.FileStorage import FileStorage path = os.path.join( self.options.dir, f"{self.options.indexname}_zcatalog", "index" @@ -401,16 +1005,66 @@ def searcher(self): self.cat = conn.root()["cat"] def query(self): + """ + Get the search query. + + This method returns the search query as a string. + + Returns: + - The search query. + + """ + return " ".join(self.args) def find(self, q): + """ + Find search results. + + This method performs a search using the ZCatalog instance and the specified query. + + Args: + - q: The search query. + + Returns: + - The search results. + + """ + return self.cat.searchResults(body=q) def findterms(self, terms): + """ + Find search results for each term. + + This method performs a search for each term in the specified list using the ZCatalog instance. + + Args: + - terms: The list of terms to search for. + + Yields: + - The search results for each term. + + """ + for term in terms: yield self.cat.searchResults(body=term) def results(self, r): + """ + Process and iterate over search results. + + This method processes and iterates over the search results, retrieving the headline and main + fields for each hit. + + Args: + - r: The search results. + + Yields: + - The processed search results. + + """ + hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: @@ -419,7 +1073,17 @@ def results(self, r): class NucularModule(Module): + """ + A module for indexing and searching documents using the Nucular library. + """ + def indexer(self, create=True): + """ + Indexes a document using the Nucular library. + + Args: + create (bool, optional): Whether to create a new index. Defaults to True. + """ import shutil from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport @@ -435,6 +1099,12 @@ def indexer(self, create=True): self.count = 0 def index_document(self, d): + """ + Indexes a document. + + Args: + d (dict): The document to be indexed. + """ try: self.archive.indexDictionary(str(self.count), d) except ValueError: @@ -442,17 +1112,22 @@ def index_document(self, d): raise self.count += 1 if not self.count % int(self.options.batch): - t = now() self.archive.store(lazy=True) self.indexer(create=False) def finish(self, **kwargs): + """ + Finishes the indexing process. + """ self.archive.store(lazy=False) self.archive.aggregateRecent(fast=False, verbose=True) self.archive.moveTransientToBase(verbose=True) self.archive.cleanUp() def searcher(self): + """ + Initializes the searcher for querying the indexed documents. + """ from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport directory = os.path.join( @@ -461,12 +1136,36 @@ def searcher(self): self.archive = Nucular.Nucular(directory) def query(self): + """ + Constructs a query string from the arguments. + + Returns: + str: The constructed query string. + """ return " ".join(self.args) def find(self, q): + """ + Finds documents matching the given query. + + Args: + q (str): The query string. + + Returns: + list: A list of dictionaries representing the matching documents. + """ return self.archive.dictionaries(q) def findterms(self, terms): + """ + Finds documents containing the given terms. + + Args: + terms (list): A list of terms to search for. + + Yields: + list: A list of dictionaries representing the matching documents for each term. 
+ """ for term in terms: q = self.archive.Query() q.anyWord(term) @@ -474,6 +1173,10 @@ def findterms(self, terms): class Bench: + """ + The Bench class provides methods for indexing and searching documents using different libraries. + """ + libs = { "whoosh": WhooshModule, "xappy": XappyModule, @@ -484,6 +1187,23 @@ class Bench: } def index(self, lib): + """ + Indexes documents using the specified library. + + Args: + lib: The library to use for indexing. + + Returns: + None + + Raises: + None + + Example: + bench = Bench() + bench.index(MyLibrary()) + """ + print(f"Indexing with {lib}...") options = self.options @@ -533,6 +1253,18 @@ def index(self, lib): print(f"Indexed {count / totaltime:0.3f} docs/s") def search(self, lib): + """ + Perform a search using the given library. + + Args: + lib: The library object to use for searching. + + Returns: + None + + Raises: + None + """ lib.searcher() t = now() @@ -546,6 +1278,19 @@ def search(self, lib): print("Print time:", now() - t) def search_file(self, lib): + """ + Searches for terms in a file using the specified library. + + Args: + lib (str): The name of the library to use for searching. + + Returns: + None + + Raises: + FileNotFoundError: If the termfile specified in the options does not exist. + + """ f = open(self.options.termfile, "rb") terms = [line.strip() for line in f] f.close() @@ -553,12 +1298,50 @@ def search_file(self, lib): print(f"Searching {len(terms)} terms with {lib}") lib.searcher() starttime = now() - for r in lib.findterms(terms): + for _ in lib.findterms(terms): pass searchtime = now() - starttime print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime) def _parser(self, name): + """ + Create an OptionParser object with predefined options for command-line parsing. + + Parameters: + - name (str): The name used as a prefix for the index name. + + Returns: + - OptionParser: The OptionParser object with predefined options. + + The _parser function creates an OptionParser object and adds several options to it. + These options are used for command-line parsing in the bench.py script. + + Options: + - -x, --lib: Name of the library to use to index/search. Default is "whoosh". + - -d, --dir: Directory in which to store index. Default is the current directory. + - -s, --setup: Set up any support files or caches. Default is False. + - -i, --index: Index the documents. Default is False. + - -n, --name: Index name prefix. Default is "{name}_index". + - -U, --url: Solr URL. Default is "http://localhost:8983/solr". + - -m, --mb: Max. memory usage, in MB. Default is "128". + - -c, --chunk: Number of documents to index between progress messages. Default is 1000. + - -B, --batch: Batch size for batch adding documents. Default is 1000. + - -k, --skip: Index every Nth document. Default is 1. + - -e, --commit-every: Commit every NUM documents. Default is None. + - -M, --no-merge: Don't merge segments when doing multiple commits. Default is True. + - -u, --upto: Index up to this document number. Default is 600000. + - -p, --procs: Number of processors to use. Default is 0. + - -l, --limit: Maximum number of search results to retrieve. Default is 10. + - -b, --body: Show the body text in search results. Default is False. + - -g, --gen: Generate a list at most N terms present in all libraries. Default is None. + - -f, --file: Search using the list of terms in this file. Default is None. + - -t, --tempdir: Whoosh temp dir. Default is None. + - -P, --pool: Whoosh pool class. Default is None. 
+ - -X, --xms: Experimental Whoosh feature. Default is False. + - -Z, --storebody: Store the body text in index. Default is False. + - -q, --snippets: Show highlighted snippets. Default is False. + - -O, --no-optimize: Turn off searcher optimization. Default is True. + """ p = OptionParser() p.add_option( "-x", @@ -748,13 +1531,35 @@ def _parser(self, name): return p def run(self, specclass): + """ + Runs the benchmarking process. + + Args: + specclass: The benchmark specification class. + + Raises: + ValueError: If the specified library is unknown. + + Notes: + This method parses the command line arguments, initializes the benchmark options and arguments, + creates an instance of the specified library, and executes the benchmark action based on the + command line options. + + Example: + To run the benchmark using a specific specification class: + + ``` + bench = Benchmark() + bench.run(MySpecClass) + ``` + """ parser = self._parser(specclass.name) options, args = parser.parse_args() self.options = options self.args = args if options.lib not in self.libs: - raise Exception(f"Unknown library: {options.lib!r}") + raise ValueError(f"Unknown library: {options.lib!r}") lib = self.libs[options.lib](self, options, args) self.spec = specclass(options, args) diff --git a/src/whoosh/support/bitstream.py b/src/whoosh/support/bitstream.py index 50984639..c326e4ba 100644 --- a/src/whoosh/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -1,7 +1,7 @@ """ From a post by Patrick Maupin on the Python mailing list: -http://mail.python.org/pipermail/python-list/2003-November/237481.html +https://mail.python.org/pipermail/python-list/2003-November/237481.html """ from array import array @@ -13,6 +13,15 @@ class BitStreamReader: def __init__(self, source): + """ + Initializes a BitStreamReader object. + + Parameters: + - source: The source data to read from. + + The BitStreamReader reads binary data from the given source and provides methods to seek, tell, and read bits from the data. + """ + self._totalbits = len(source) * _bitsperlong self._position = 0 @@ -24,16 +33,47 @@ def __init__(self, source): self._bitstream = bits def seek(self, offset): + """ + Sets the current position in the bitstream. + + Parameters: + - offset: The new position to set. + + The offset is specified in bits from the beginning of the bitstream. + """ + self._position = offset def tell(self): + """ + Returns the current position in the bitstream. + + Returns: + - The current position in bits from the beginning of the bitstream. + """ + return self._position def read(self, numbits): + """ + Reads the specified number of bits from the bitstream. + + Parameters: + - numbits: The number of bits to read. + + Returns: + - The value of the read bits. + + Raises: + - IndexError: If the specified number of bits exceeds the available bits in the bitstream. + + The read method reads the specified number of bits from the current position in the bitstream and advances the position accordingly. 
+ """ + position = self._position if position < 0 or position + numbits > self._totalbits: - raise (IndexError, "Invalid bitarray._position/numbits") + raise IndexError("Invalid bitarray._position/numbits") longaddress, bitoffset = divmod(position, _bitsperlong) diff --git a/src/whoosh/support/bitvector.py b/src/whoosh/support/bitvector.py index d7ef507d..ff3352ef 100644 --- a/src/whoosh/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -6,267 +6,7 @@ from array import array #: Table of the number of '1' bits in each byte (0-255) -BYTE_COUNTS = array( - "B", - [ - 0, - 1, - 1, - 2, - 1, - 2, - 2, - 3, - 1, - 2, - 2, - 3, - 2, - 3, - 3, - 4, - 1, - 2, - 2, - 3, - 2, - 3, - 3, - 4, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 1, - 2, - 2, - 3, - 2, - 3, - 3, - 4, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 1, - 2, - 2, - 3, - 2, - 3, - 3, - 4, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 4, - 5, - 5, - 6, - 5, - 6, - 6, - 7, - 1, - 2, - 2, - 3, - 2, - 3, - 3, - 4, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 4, - 5, - 5, - 6, - 5, - 6, - 6, - 7, - 2, - 3, - 3, - 4, - 3, - 4, - 4, - 5, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 4, - 5, - 5, - 6, - 5, - 6, - 6, - 7, - 3, - 4, - 4, - 5, - 4, - 5, - 5, - 6, - 4, - 5, - 5, - 6, - 5, - 6, - 6, - 7, - 4, - 5, - 5, - 6, - 5, - 6, - 6, - 7, - 5, - 6, - 6, - 7, - 6, - 7, - 7, - 8, - ], -) +BYTE_COUNTS = array("B", [bin(byte).count("1") for byte in range(256)]) class BitVector: @@ -302,6 +42,14 @@ class BitVector: """ def __init__(self, size, source=None, bits=None): + """ + Initializes a BitVector object. + + Args: + size (int): The size of the BitVector. + source (iterable, optional): An iterable of integers representing bit positions to turn on. Defaults to None. + bits (array, optional): An array of bytes representing the bit values. Defaults to None. + """ self.size = size if bits: @@ -310,51 +58,123 @@ def __init__(self, size, source=None, bits=None): self.bits = array("B", ([0x00] * ((size >> 3) + 1))) if source: - set = self.set + set_var = self.set for num in source: - set(num) + set_var(num) self.bcount = None def __eq__(self, other): + """ + Checks if two BitVector objects are equal. + + Args: + other (BitVector): The other BitVector object to compare. + + Returns: + bool: True if the BitVector objects are equal, False otherwise. + """ if isinstance(other, BitVector): return self.bits == other.bits return False def __repr__(self): + """ + Returns a string representation of the BitVector object. + + Returns: + str: A string representation of the BitVector object. + """ return f"" def __len__(self): - # This returns the count of "on" bits instead of the size to - # make BitVector exchangeable with a set() object. + """ + Returns the number of "on" bits in the BitVector. + + Returns: + int: The number of "on" bits in the BitVector. + """ return self.count() def __contains__(self, index): + """ + Checks if a given index is present in the BitVector. + + Args: + index (int): The index to check. 
+ + Returns: + bool: True if the index is present in the BitVector, False otherwise. + """ return self[index] def __iter__(self): + """ + Returns an iterator over the "on" bits in the BitVector. + + Yields: + int: The indices of the "on" bits in the BitVector. + """ get = self.__getitem__ for i in range(0, self.size): if get(i): yield i def __str__(self): + """ + Returns a string representation of the BitVector object. + + Returns: + str: A string representation of the BitVector object. + """ get = self.__getitem__ return "".join("1" if get(i) else "0" for i in range(0, self.size)) def __nonzero__(self): + """ + Checks if the BitVector has any "on" bits. + + Returns: + bool: True if the BitVector has any "on" bits, False otherwise. + """ return self.count() > 0 def __getitem__(self, index): + """ + Returns the value of the bit at the given index. + + Args: + index (int): The index of the bit to retrieve. + + Returns: + bool: True if the bit is "on", False otherwise. + """ return self.bits[index >> 3] & (1 << (index & 7)) != 0 def __setitem__(self, index, value): + """ + Sets the value of the bit at the given index. + + Args: + index (int): The index of the bit to set. + value (bool): The value to set the bit to. + """ if value: self.set(index) else: self.clear(index) def _logic(self, op, bitv): + """ + Performs a bit-wise logic operation between two BitVector objects. + + Args: + op (function): The bit-wise logic operation to perform. + bitv (BitVector): The other BitVector object to perform the operation with. + + Returns: + BitVector: The result of the bit-wise logic operation. + """ if self.size != bitv.size: raise ValueError("Can't combine bitvectors of different sizes") res = BitVector(size=self.size) @@ -363,47 +183,124 @@ def _logic(self, op, bitv): return res def union(self, other): + """ + Performs a union operation between two BitVector objects. + + Args: + other (BitVector): The other BitVector object to perform the union with. + + Returns: + BitVector: The result of the union operation. + """ return self.__or__(other) def intersection(self, other): + """ + Performs an intersection operation between two BitVector objects. + + Args: + other (BitVector): The other BitVector object to perform the intersection with. + + Returns: + BitVector: The result of the intersection operation. + """ return self.__and__(other) def __and__(self, other): + """ + Performs a bit-wise AND operation between two BitVector objects. + + Args: + other (BitVector): The other BitVector object to perform the AND operation with. + + Returns: + BitVector: The result of the bit-wise AND operation. + """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__and__, other) def __or__(self, other): + """ + Performs a bit-wise OR operation between two BitVector objects. + + Args: + other (BitVector): The other BitVector object to perform the OR operation with. + + Returns: + BitVector: The result of the bit-wise OR operation. + """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__or__, other) def __ror__(self, other): + """ + Performs a bit-wise OR operation between a BitVector object and another object. + + Args: + other (BitVector): The other object to perform the OR operation with. + + Returns: + BitVector: The result of the bit-wise OR operation. + """ return self.__or__(other) def __rand__(self, other): + """ + Performs a bit-wise AND operation between a BitVector object and another object. 
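# Small illustration, not part of this patch, of the BitVector behaviour
# documented above: positional set/test, len() as the count of "on" bits, and
# the set-like union/intersection operators (plain iterables are coerced).
from whoosh.support.bitvector import BitVector

a = BitVector(16, source=[1, 3, 5])
b = BitVector(16, source=[3, 5, 7])

assert 3 in a and 2 not in a
assert len(a) == 3                        # counts "on" bits, not the size
assert list(a.union(b)) == [1, 3, 5, 7]
assert list(a.intersection(b)) == [3, 5]
assert list(a & [5, 7]) == [5]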
+ + Args: + other (BitVector): The other object to perform the AND operation with. + + Returns: + BitVector: The result of the bit-wise AND operation. + """ return self.__and__(other) def __xor__(self, other): + """ + Performs a bit-wise XOR operation between two BitVector objects. + + Args: + other (BitVector): The other BitVector object to perform the XOR operation with. + + Returns: + BitVector: The result of the bit-wise XOR operation. + """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__xor__, other) def __invert__(self): + """ + Performs a bit-wise inversion operation on the BitVector. + + Returns: + BitVector: The result of the bit-wise inversion operation. + """ return BitVector( self.size, source=(x for x in range(self.size) if x not in self) ) def count(self): - """Returns the number of "on" bits in the bit array.""" + """ + Returns the number of "on" bits in the BitVector. + Returns: + int: The number of "on" bits in the BitVector. + """ if self.bcount is None: self.bcount = sum(BYTE_COUNTS[b & 0xFF] for b in self.bits) return self.bcount def set(self, index): - """Turns the bit at the given position on.""" + """ + Turns the bit at the given position on. + Args: + index (int): The index of the bit to turn on. + """ if index >= self.size: raise IndexError( f"Position {repr(index)} greater than the size of the vector" @@ -412,23 +309,33 @@ def set(self, index): self.bcount = None def clear(self, index): - """Turns the bit at the given position off.""" + """ + Turns the bit at the given position off. + Args: + index (int): The index of the bit to turn off. + """ self.bits[index >> 3] &= ~(1 << (index & 7)) self.bcount = None def set_from(self, iterable): - """Takes an iterable of integers representing positions, and turns - on the bits at those positions. """ + Turns on the bits at the positions specified by an iterable of integers. - set = self.set + Args: + iterable (iterable): An iterable of integers representing positions. + """ + set_var = self.set for index in iterable: - set(index) + set_var(index) def copy(self): - """Returns a copy of this BitArray.""" + """ + Returns a copy of the BitVector. + Returns: + BitVector: A copy of the BitVector. + """ return BitVector(self.size, bits=self.bits) diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py index 7334d853..8da99a40 100644 --- a/src/whoosh/support/charset.py +++ b/src/whoosh/support/charset.py @@ -1303,11 +1303,11 @@ def charset_table_to_dict(tablestring): character or None if the character is not a valid word character. The Sphinx charset table format is described at - http://www.sphinxsearch.com/docs/current.html#conf-charset-table. + https://www.sphinxsearch.com/docs/current.html#conf-charset-table. 
""" - # map = {} - map = defaultdict(lambda: None) + # map_dict = {} + map_dict = defaultdict(lambda: None) for line in tablestring.split("\n"): if not line or line.startswith("#"): continue @@ -1326,7 +1326,7 @@ def charset_table_to_dict(tablestring): for fromord, tooord in zip( range(start1, end1 + 1), range(start2, end2 + 1) ): - map[fromord] = chr(tooord) + map_dict[fromord] = chr(tooord) except ValueError: pass continue @@ -1336,16 +1336,16 @@ def charset_table_to_dict(tablestring): fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) try: - map[fromord] = chr(toord) + map_dict[fromord] = chr(toord) except ValueError: pass continue match = _stray_char.match(item) if match: - ord = charspec_to_int(match.group(0)) + ord_charspec = charspec_to_int(match.group(0)) try: - map[ord] = chr(ord) + map_dict[ord_charspec] = chr(ord_charspec) except ValueError: pass continue @@ -1355,8 +1355,8 @@ def charset_table_to_dict(tablestring): start = charspec_to_int(match.group(1)) end = charspec_to_int(match.group(2)) try: - for ord in range(start, end + 1): - map[ord] = chr(ord) + for ord_charspec in range(start, end + 1): + map_dict[ord_charspec] = chr(ord_charspec) except ValueError: pass continue @@ -1366,13 +1366,13 @@ def charset_table_to_dict(tablestring): fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) assert toord - fromord % 2 == 0 - for ord in range(fromord, toord + 1, 2): + for ord_charspec in range(fromord, toord + 1, 2): try: - map[ord] = chr(ord + 1) - map[ord + 1] = chr(ord + 1) + map_dict[ord_charspec] = chr(ord_charspec + 1) + map_dict[ord_charspec + 1] = chr(ord_charspec + 1) except ValueError: pass continue - raise Exception(f"Don't know what to do with {item!r}") - return dict(map) + raise ValueError(f"Don't know what to do with {item}") + return dict(map_dict) diff --git a/src/whoosh/support/pyparsing.py b/src/whoosh/support/pyparsing.py index ea133368..04711766 100644 --- a/src/whoosh/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -810,9 +810,13 @@ def set_default_whitespace_chars(chars): set_default_whitespace_chars = staticmethod(set_default_whitespace_chars) def __init__(self, savelist=False): + """Initialize the ParserElement. + + Args: + savelist (bool, optional): Whether to save the results as a list. Defaults to False. + """ self.parse_action = [] self.fail_action = None - # ~ self.name = "" # don't define self.name, let subclasses try/except upcall self.str_repr = None self.results_name = None self.saveas_list = savelist @@ -833,8 +837,11 @@ def __init__(self, savelist=False): self.call_during_try = False def copy(self): - """Make a copy of this ParserElement. Useful for defining different parse actions - for the same parsing pattern, using copies of the original parse element.""" + """Make a copy of this ParserElement. + + Returns: + ParserElement: A copy of the original ParserElement. + """ cpy = copy.copy(self) cpy.parse_action = self.parse_action[:] cpy.ignore_exprs = self.ignore_exprs[:] @@ -843,7 +850,14 @@ def copy(self): return cpy def set_name(self, name): - """Define name for this expression, for use in debugging.""" + """Define name for this expression, for use in debugging. + + Args: + name (str): The name of the expression. + + Returns: + ParserElement: The ParserElement object. 
+ """ self.name = name self.errmsg = "Expected " + self.name if hasattr(self, "exception"): @@ -853,9 +867,13 @@ def set_name(self, name): def set_results_name(self, name, list_all_matches=False): """Define name for referencing matching tokens as a nested attribute of the returned parse results. - NOTE: this returns a *copy* of the original ParserElement object; - this is so that the client can define a basic element, such as an - integer, and reference it in multiple places with different names. + + Args: + name (str): The name of the results. + list_all_matches (bool, optional): Whether to list all matches. Defaults to False. + + Returns: + ParserElement: A copy of the original ParserElement with the results name set. """ newself = self.copy() newself.results_name = name @@ -864,8 +882,13 @@ def set_results_name(self, name, list_all_matches=False): def set_break(self, break_flag=True): """Method to invoke the Python pdb debugger when this element is - about to be parsed. Set break_flag to True to enable, False to - disable. + about to be parsed. + + Args: + break_flag (bool, optional): Whether to enable the debugger. Defaults to True. + + Returns: + ParserElement: The ParserElement object. """ if break_flag: _parse_method = self._parse @@ -885,7 +908,14 @@ def breaker(instring, loc, do_actions=True, call_pre_parse=True): def _normalize_parse_action_args(f): """Internal method used to decorate parse actions that take fewer than 3 arguments, - so that all parse actions can be called as f(s,l,t).""" + so that all parse actions can be called as f(s,l,t). + + Args: + f (callable): The parse action function. + + Returns: + callable: The normalized parse action function. + """ STAR_ARGS = 4 try: @@ -894,8 +924,6 @@ def _normalize_parse_action_args(f): restore = f f = f.__init__ - # codeObj = f.code - if f.code.co_flags & STAR_ARGS: return f numargs = f.code.co_argcount @@ -906,8 +934,6 @@ def _normalize_parse_action_args(f): f = restore except AttributeError: try: - # call_im_func_code = f.__code__ - # not a function, must be a callable object, get info from the # im_func binding of its bound __call__ method if f.__code__.co_flags & STAR_ARGS: @@ -917,8 +943,6 @@ def _normalize_parse_action_args(f): if hasattr(f.__call__, "__self__"): numargs -= 0 except AttributeError: - # call_func_code = f.__call__.__code__ - # not a bound method, get info directly from __call__ method if f.__call__.__code__.co_flags & STAR_ARGS: return f @@ -946,7 +970,7 @@ def tmp(_, l, t): def tmp(_, __, t): return f(t) - else: # ~ numargs == 0: + else: def tmp(_, __, ___): return f() @@ -992,7 +1016,15 @@ def set_parse_action(self, *fns, **kwargs): return self def add_parse_action(self, *fns, **kwargs): - """Add parse action to expression's list of parse actions. See L{I{set_parse_action}}.""" + """Add parse action to expression's list of parse actions. + + Args: + *fns (callable): The parse action functions. + **kwargs: Additional keyword arguments. + + Returns: + ParserElement: The ParserElement object. + """ self.parse_action += list(map(self._normalize_parse_action_args, list(fns))) self.call_during_try = self.call_during_try or ( "call_during_try" in kwargs and kwargs["call_during_try"] @@ -1013,6 +1045,15 @@ def set_fail_action(self, fn): return self def _skip_ignorables(self, instring, loc): + """Skip over ignored expressions. + + Args: + instring (str): The input string. + loc (int): The current location in the string. + + Returns: + int: The updated location. 
+ """ exprs_found = True while exprs_found: exprs_found = False @@ -1026,6 +1067,15 @@ def _skip_ignorables(self, instring, loc): return loc def pre_parse(self, instring, loc): + """Perform pre-parsing operations. + + Args: + instring (str): The input string. + loc (int): The current location in the string. + + Returns: + int: The updated location. + """ if self.ignore_exprs: loc = self._skip_ignorables(instring, loc) @@ -1038,13 +1088,43 @@ def pre_parse(self, instring, loc): return loc def parse_impl(self, instring, loc, do_actions=True): + """Implementation of the parsing logic. + + Args: + instring (str): The input string. + loc (int): The current location in the string. + do_actions (bool, optional): Whether to perform parse actions. Defaults to True. + + Returns: + tuple: The updated location and the list of matched tokens. + """ return loc, [] def post_parse(self, instring, loc, tokenlist): + """Perform post-parsing operations. + + Args: + instring (str): The input string. + loc (int): The current location in the string. + tokenlist (list): The list of matched tokens. + + Returns: + list: The updated list of tokens. + """ return tokenlist # ~ @profile def _parse_no_cache(self, instring, loc, do_actions=True, call_pre_parse=True): + """Parse the input string without using the cache. + + Args: + instring (str): The input string. + loc (int): The current location in the string. + do_actions (bool, optional): Whether to perform parse actions. Defaults to True. + call_pre_parse (bool, optional): Whether to call the pre_parse method. Defaults to True. + """ + # Implementation details omitted for brevity + pass debugging = self.debug # and do_actions ) if debugging or self.fail_action: @@ -1455,7 +1535,14 @@ def __xor__(self, other): return Or([self, other]) def __rxor__(self, other): - """Implementation of ^ operator when left operand is not a ParserElement""" + """Implementation of ^ operator when left operand is not a ParserElement + + Args: + other (str or ParserElement): The right operand of the ^ operator. + + Returns: + ParserElement: The result of the ^ operation. + """ if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1468,7 +1555,14 @@ def __rxor__(self, other): return other ^ self def __and__(self, other): - """Implementation of & operator - returns Each""" + """Implementation of & operator - returns Each + + Args: + other (str or ParserElement): The element to combine with. + + Returns: + Each: A new `Each` object containing both `self` and `other`. + """ if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1481,7 +1575,14 @@ def __and__(self, other): return Each([self, other]) def __rand__(self, other): - """Implementation of & operator when left operand is not a ParserElement""" + """Implementation of & operator when left operand is not a ParserElement + + Args: + other (str or ParserElement): The left operand of the & operator. + + Returns: + ParserElement: The result of combining the left operand with self using the & operator. + """ if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1494,49 +1595,84 @@ def __rand__(self, other): return other & self def __invert__(self): - """Implementation of ~ operator - returns NotAny""" + """Implementation of ~ operator - returns NotAny + + Returns: + NotAny: A new instance of the NotAny class. 
+        """
         return NotAny(self)
 
     def __call__(self, name):
         """Shortcut for set_results_name, with list_all_matches=default::
-            userdata = Word(alphas).set_results_name("name") + Word(nums+"-").set_results_name("socsecno")
+            userdata = Word(alphas).set_results_name("name") + Word(nums+"-").set_results_name("socsecno")
         could be written as::
-            userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
+            userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
+
+        Args:
+            name (str): The name to assign to the parsed results.
+
+        Returns:
+            ParserElement: A copy of this ParserElement with the given results name assigned.
         """
         return self.set_results_name(name)
 
     def suppress(self):
         """Suppresses the output of this ParserElement; useful to keep punctuation from
         cluttering up returned output.
+
+        Returns:
+            Suppress: A new ParserElement that suppresses the output of the original ParserElement.
         """
         return Suppress(self)
 
     def leave_whitespace(self):
-        """Disables the skipping of whitespace before matching the characters in the
-        ParserElement's defined pattern. This is normally only used internally by
+        """
+        Disables the skipping of whitespace before matching the characters in the
+        ParserElement's defined pattern. This is normally only used internally by
        the pyparsing module, but may be needed in some whitespace-sensitive grammars.
+
+        Returns:
+            ParserElement: The ParserElement object with whitespace skipping disabled.
         """
         self.skip_whitespace = False
         return self
 
     def set_whitespace_chars(self, chars):
-        """Overrides the default whitespace chars"""
+        """
+        Overrides the default whitespace chars.
+
+        Args:
+            chars (str): The characters to be considered as whitespace.
+
+        Returns:
+            self: The current instance of the class.
+        """
         self.skip_whitespace = True
         self.white_chars = chars
         self.copy_default_white_chars = False
         return self
 
     def parse_with_tabs(self):
-        """Overrides default behavior to expand <TAB>s to spaces before parsing the input string.
+        """
+        Overrides default behavior to expand <TAB>s to spaces before parsing the input string.
         Must be called before parse_string when the input grammar contains elements that
-        match <TAB> characters."""
+        match <TAB> characters.
+
+        Returns:
+            self: The current instance of the class.
+        """
         self.keep_tabs = True
         return self
 
     def ignore(self, other):
-        """Define expression to be ignored (e.g., comments) while doing pattern
-        matching; may be called repeatedly, to define multiple comment or other
-        ignorable patterns.
+        """
+        Define expression to be ignored (e.g., comments) while doing pattern matching; may be called repeatedly to define multiple comment or other ignorable patterns.
+
+        Args:
+            other (str or pyparsing.ParserElement): The expression to be ignored.
+
+        Returns:
+            pyparsing.ParserElement: The current instance of the ParserElement.
         """
         if isinstance(other, Suppress):
             if other not in self.ignore_exprs:
@@ -1546,7 +1682,18 @@ def ignore(self, other):
         return self
 
     def set_debug_actions(self, start_action, success_action, exception_action):
-        """Enable display of debugging messages while doing pattern matching."""
+        """
+        Enable display of debugging messages while doing pattern matching.
+
+        Args:
+            start_action (callable): The action to perform when pattern matching starts.
+            success_action (callable): The action to perform when pattern matching succeeds.
+            exception_action (callable): The action to perform when an exception occurs during pattern matching.
+
+        Returns:
+            self: The current instance of the class.
+ + """ self.debug_actions = ( start_action or _default_start_debug_action, success_action or _default_success_debug_action, @@ -1556,8 +1703,14 @@ def set_debug_actions(self, start_action, success_action, exception_action): return self def set_debug(self, flag=True): - """Enable display of debugging messages while doing pattern matching. - Set flag to True to enable, False to disable.""" + """Enable or disable display of debugging messages while doing pattern matching. + + Args: + flag (bool, optional): Set to True to enable debugging messages, False to disable. Defaults to True. + + Returns: + self: The current instance of the class. + """ if flag: self.set_debug_actions( _default_start_debug_action, @@ -1575,6 +1728,12 @@ def __repr__(self): return str(self) def streamline(self): + """ + Streamlines the object by marking it as streamlined and resetting the string representation. + + Returns: + The streamlined object. + """ self.streamlined = True self.str_repr = None return self diff --git a/src/whoosh/util/__init__.py b/src/whoosh/util/__init__.py index 81593209..e47d91b7 100644 --- a/src/whoosh/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -45,10 +45,32 @@ def random_name(size=28): + """ + Generates a random name consisting of alphanumeric characters. + + Parameters: + - size (int): The length of the random name to generate. Default is 28. + + Returns: + - str: The randomly generated name. + """ return "".join(random.choice(IDCHARS) for _ in range(size)) def random_bytes(size=28): + """ + Generate a random byte string of the specified size. + + Parameters: + - size (int): The size of the byte string to generate. Default is 28. + + Returns: + - bytes: A random byte string of the specified size. + + Example: + >>> random_bytes(16) + b'\x8f\x9a\x0b\x1e\x9c\x8d\x8c\x9e\x1f\x9d\x9e\x0e\x1e\x9e\x1e\x9e' + """ return bytes(random.randint(0, 255) for _ in range(size)) @@ -56,13 +78,38 @@ def make_binary_tree(fn, args, **kwargs): """Takes a function/class that takes two positional arguments and a list of arguments and returns a binary tree of results/instances. - >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) - UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) + Args: + fn (callable): A function or class that takes two positional arguments. + args (list): A list of arguments to be used to construct the binary tree. - Any keyword arguments given to this function are passed to the class - initializer. - """ + Keyword Args: + **kwargs: Additional keyword arguments to be passed to the class initializer. + Returns: + object: The binary tree of results/instances. + + Raises: + ValueError: If called with an empty list. + + Examples: + >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) + UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) + + This function takes a function or class `fn` that takes two positional arguments, + and a list of arguments `args`. It constructs a binary tree of results/instances + by recursively splitting the `args` list into two halves and calling `fn` with + the left and right halves as arguments. + + If the `args` list contains only one element, that element is returned as is. + + Any additional keyword arguments given to this function are passed to the class + initializer of `fn`. + + Note: + The `fn` should be a function or class that can be called with two positional + arguments and returns a result/instance. 
+ + """ count = len(args) if not count: raise ValueError("Called make_binary_tree with empty list") @@ -78,11 +125,30 @@ def make_binary_tree(fn, args, **kwargs): def make_weighted_tree(fn, ls, **kwargs): - """Takes a function/class that takes two positional arguments and a list of + """ + Takes a function/class that takes two positional arguments and a list of (weight, argument) tuples and returns a huffman-like weighted tree of results/instances. - """ + Args: + fn (function/class): The function or class that takes two positional arguments. + ls (list): A list of (weight, argument) tuples. + **kwargs: Additional keyword arguments that can be passed to the function/class. + + Returns: + object: The huffman-like weighted tree of results/instances. + + Raises: + ValueError: If the input list is empty. + + Example: + >>> def combine(a, b): + ... return a + b + ... + >>> ls = [(1, 'a'), (2, 'b'), (3, 'c')] + >>> make_weighted_tree(combine, ls) + 'abc' + """ if not ls: raise ValueError("Called make_weighted_tree with empty list") @@ -100,8 +166,19 @@ def make_weighted_tree(fn, ls, **kwargs): def fib(n): - """Returns the nth value in the Fibonacci sequence.""" + """ + Returns the nth value in the Fibonacci sequence. + + Parameters: + - n (int): The position of the value in the Fibonacci sequence to be returned. + + Returns: + - int: The nth value in the Fibonacci sequence. + Notes: + - The Fibonacci sequence starts with 0 and 1, and each subsequent value is the sum of the two preceding values. + - The function uses memoization to improve performance by caching previously calculated values. + """ if n <= 2: return n if n in _fib_cache: @@ -117,6 +194,23 @@ def fib(n): def synchronized(func): """Decorator for storage-access methods, which synchronizes on a threading lock. The parent object must have 'is_closed' and '_sync_lock' attributes. + + Args: + func (callable): The function to be decorated. + + Returns: + callable: The decorated function. + + Example: + >>> class MyClass: + ... def __init__(self): + ... self._sync_lock = threading.Lock() + ... + ... @synchronized + ... def my_method(self): + ... # Access shared storage here + ... pass + """ @wraps(func) @@ -130,6 +224,23 @@ def synchronized_wrapper(self, *args, **kwargs): def unclosed(method): """ Decorator to check if the object is closed. + + This decorator can be used to wrap methods in a class to ensure that the object is not closed before executing the method. + If the object is closed, a ValueError is raised. + + Parameters: + - method: The method to be wrapped. + + Returns: + - The wrapped method. + + Example usage: + ``` + class MyClass: + @unclosed + def my_method(self): + # Method implementation + ``` """ @wraps(method) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 2f643834..36d23c0e 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -432,8 +432,8 @@ def test_url(): sample = "Visit https://github.com/sygil-dev/whoosh-reloaded or urn:isbn:5930502 or http://www.apple.com/." 
anas = [ - analysis.SimpleAnalyzer(analysis.url_pattern), - analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None), + analysis.simple_analyzer(analysis.url_pattern), + analysis.standard_analyzer(analysis.url_pattern, stoplist=None), ] for ana in anas: ts = [t.text for t in ana(sample)] @@ -543,14 +543,14 @@ def test_language_analyzer(): ] for lang, source, target in domain: - ana = analysis.LanguageAnalyzer(lang) + ana = analysis.language_analyzer(lang) words = [t.text for t in ana(source)] assert words == target @pytest.mark.skipif("sys.version_info < (2,6)") def test_la_pickleability(): - ana = analysis.LanguageAnalyzer("en") + ana = analysis.language_analyzer("en") _ = dumps(ana, -1) @@ -558,7 +558,7 @@ def test_charset_pickeability(): from whoosh.support import charset charmap = charset.charset_table_to_dict(charset.default_charset) - ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap) + ana = analysis.standard_analyzer() | analysis.CharsetFilter(charmap) _ = dumps(ana, -1) ana = analysis.CharsetTokenizer(charmap) @@ -638,7 +638,7 @@ def test_stop_lang(): def test_issue358(): t = analysis.RegexTokenizer(r"\w+") with pytest.raises(analysis.CompositionError): - _ = t | analysis.StandardAnalyzer() + _ = t | analysis.standard_analyzer() def test_ngramwords_tokenizer(): diff --git a/tests/test_classify.py b/tests/test_classify.py index d2a43c8d..c3a51a52 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -16,7 +16,7 @@ def create_index(): - analyzer = analysis.StandardAnalyzer() + analyzer = analysis.standard_analyzer() vector_format = formats.Frequency() schema = fields.Schema( path=fields.ID(stored=True), @@ -94,7 +94,7 @@ def _check(schema, **kwargs): schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True)) _check(schema) - ana = analysis.StandardAnalyzer() + ana = analysis.standard_analyzer() schema = fields.Schema( id=fields.ID(stored=True), text=fields.TEXT(analyzer=ana, vector=formats.Frequency()), diff --git a/tests/test_codecs.py b/tests/test_codecs.py index b757bd0b..f8d9e5ec 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -508,9 +508,9 @@ def test_skip(): # # # def test_special_spelled_field(): -# from whoosh.analysis import StemmingAnalyzer +# from whoosh.analysis import stemming_analyzer # -# field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True) +# field = fields.TEXT(analyzer=stemming_analyzer(), spelling=True) # st, codec, seg = _make_codec() # # fw = codec.field_writer(st, seg) @@ -537,7 +537,7 @@ def test_skip(): def test_plaintext_codec(): pytest.importorskip("ast") - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( a=fields.TEXT(vector=True, sortable=True), b=fields.STORED, @@ -612,7 +612,7 @@ def test_plaintext_codec(): def test_memory_codec(): - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( a=fields.TEXT(vector=True), b=fields.STORED, diff --git a/tests/test_fields.py b/tests/test_fields.py index aba2e903..3b9a39e6 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -648,7 +648,9 @@ def test_pickle_schema(): from whoosh import analysis from whoosh.support.charset import accent_map - freetext_analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map) + freetext_analyzer = analysis.stemming_analyzer() | analysis.CharsetFilter( + accent_map + ) schema = fields.Schema( path=fields.ID(stored=True, unique=True), diff --git a/tests/test_highlighting.py 
b/tests/test_highlighting.py index eee21cd0..89fb873e 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -14,7 +14,7 @@ def u(s): def test_null_fragment(): terms = frozenset(("bravo", "india")) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() nf = highlight.WholeFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, nf, uc) @@ -89,7 +89,7 @@ def test_sentence_fragment(): + "This sentence is the second. Third sentence here." ) terms = ("sentence",) - sa = analysis.StandardAnalyzer(stoplist=None) + sa = analysis.standard_analyzer(stoplist=None) sf = highlight.SentenceFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(text, terms, sa, sf, uc) @@ -101,7 +101,7 @@ def test_sentence_fragment(): def test_context_fragment(): terms = frozenset(("bravo", "india")) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() cf = highlight.ContextFragmenter(surround=6) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) @@ -110,7 +110,7 @@ def test_context_fragment(): def test_context_at_start(): terms = frozenset(["alfa"]) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() cf = highlight.ContextFragmenter(surround=15) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) @@ -119,7 +119,7 @@ def test_context_at_start(): def test_html_format(): terms = frozenset(("bravo", "india")) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter() htext = highlight.highlight(_doc, terms, sa, cf, hf) @@ -131,7 +131,7 @@ def test_html_format(): def test_html_escape(): terms = frozenset(["bravo"]) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() wf = highlight.WholeFragmenter() hf = highlight.HtmlFormatter() htext = highlight.highlight('alfa delta', terms, sa, wf, hf) @@ -143,7 +143,7 @@ def test_html_escape(): def test_maxclasses(): terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo")) - sa = analysis.StandardAnalyzer() + sa = analysis.standard_analyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2) htext = highlight.highlight(_doc, terms, sa, cf, hf) @@ -325,7 +325,7 @@ def test_highlight_ngrams(): def test_issue324(): - sa = analysis.StemmingAnalyzer() + sa = analysis.stemming_analyzer() result = highlight.highlight( "Indexed!\n1", ["index"], diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 363c8a17..1dc6910a 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -608,7 +608,7 @@ def test_indentical_fields(): def test_multivalue(): - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( id=fields.STORED, date=fields.DATETIME, @@ -636,7 +636,7 @@ def test_multivalue(): def test_multi_language(): # Analyzer for English - ana_eng = analysis.StemmingAnalyzer() + ana_eng = analysis.stemming_analyzer() # analyzer for Pig Latin def stem_piglatin(w): @@ -644,7 +644,9 @@ def stem_piglatin(w): w = w[:-2] return w - ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"], stemfn=stem_piglatin) + ana_pig = analysis.stemming_analyzer( + stoplist=["nday", "roay"], stemfn=stem_piglatin + ) # Dictionary mapping languages to analyzers analyzers = {"eng": ana_eng, "pig": ana_pig} diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py 
index d5b398c5..d8ee7ed6 100644 --- a/tests/test_parse_plugins.py +++ b/tests/test_parse_plugins.py @@ -190,7 +190,7 @@ def test_daterange_empty_field(): def test_free_dates(): - a = analysis.StandardAnalyzer(stoplist=None) + a = analysis.standard_analyzer(stoplist=None) schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME) qp = qparser.QueryParser("text", schema) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) @@ -332,7 +332,7 @@ def test_copyfield(): str(qp.parse("hello c:matt")) == "((a:hello OR c:hello) AND (c:matt OR a:matt))" ) - ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter() + ana = analysis.regex_analyzer(r"\w+") | analysis.DoubleMetaphoneFilter() fmt = formats.Frequency() ft = fields.FieldType(fmt, ana, multitoken_query="or") schema = fields.Schema(name=fields.KEYWORD, name_phone=ft) @@ -434,7 +434,7 @@ def rev_text(node): def test_fuzzy_plugin(): - ana = analysis.StandardAnalyzer("\\S+") + ana = analysis.standard_analyzer("\\S+") schema = fields.Schema(f=fields.TEXT(analyzer=ana)) qp = default.QueryParser("f", schema) qp.add_plugin(plugins.FuzzyTermPlugin()) @@ -665,7 +665,7 @@ def test_sequence_andmaybe(): def test_sequence_complex(): - ana = analysis.StandardAnalyzer(stoplist=None) + ana = analysis.standard_analyzer(stoplist=None) schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), diff --git a/tests/test_parsing.py b/tests/test_parsing.py index dd1d22b1..de0c9028 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -725,7 +725,7 @@ def test_numrange_multi(): def test_nonexistant_fieldnames(): # Need an analyzer that won't mangle a URL - a = analysis.SimpleAnalyzer("\\S+") + a = analysis.simple_analyzer("\\S+") schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a)) qp = default.QueryParser("text", schema) @@ -747,7 +747,7 @@ def test_stopped(): def test_analyzing_terms(): - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana)) qp = default.QueryParser("text", schema) q = qp.parse("Indexed!") @@ -996,7 +996,7 @@ def test_star_paren(): def test_dash(): - ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+") + ana = analysis.standard_analyzer("[^ \t\r\n()*?]+") schema = fields.Schema( title=fields.TEXT(analyzer=ana), text=fields.TEXT(analyzer=ana), time=fields.ID ) @@ -1013,7 +1013,7 @@ def test_dash(): assert str(q) == "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)" -def test_bool_True(): +def test_bool_true(): schema = fields.Schema(text=fields.TEXT, bool=fields.BOOLEAN) qp = default.QueryParser("text", schema) q = qp.parse("bool:True") diff --git a/tests/test_postings.py b/tests/test_postings.py index 8d87c97b..29478d5f 100644 --- a/tests/test_postings.py +++ b/tests/test_postings.py @@ -15,7 +15,7 @@ def _roundtrip(content, format_, astype, ana=None): with TempStorage("roundtrip") as st: codec = default_codec() seg = codec.new_segment(st, "") - ana = ana or analysis.StandardAnalyzer() + ana = ana or analysis.standard_analyzer() field = fields.FieldType(format=format_, analyzer=ana) fw = codec.field_writer(st, seg) diff --git a/tests/test_results.py b/tests/test_results.py index 6a586fe1..1d0b0f7e 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -389,7 +389,7 @@ def test_highlight_setters(): def test_snippets(): - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema(text=fields.TEXT(stored=True, analyzer=ana)) 
ix = RamStorage().create_index(schema) w = ix.writer() @@ -437,7 +437,7 @@ def test_snippets(): def test_keyterms(): - ana = analysis.StandardAnalyzer() + ana = analysis.standard_analyzer() vectorformat = formats.Frequency() schema = fields.Schema( path=fields.ID, content=fields.TEXT(analyzer=ana, vector=vectorformat) diff --git a/tests/test_searching.py b/tests/test_searching.py index 4caaf95b..9cb8eac4 100644 --- a/tests/test_searching.py +++ b/tests/test_searching.py @@ -635,7 +635,7 @@ def test_stop_phrase(): def test_phrase_order(): - tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()) + tfield = fields.TEXT(stored=True, analyzer=analysis.simple_analyzer()) schema = fields.Schema(text=tfield) storage = RamStorage() ix = storage.create_index(schema) @@ -1277,7 +1277,7 @@ def test_scorer(): def test_pos_scorer(): - ana = analysis.SimpleAnalyzer() + ana = analysis.simple_analyzer() schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana)) ix = RamStorage().create_index(schema) w = ix.writer() diff --git a/tests/test_spans.py b/tests/test_spans.py index 01c78731..341000de 100644 --- a/tests/test_spans.py +++ b/tests/test_spans.py @@ -16,7 +16,7 @@ def get_index(): return _ix charfield = fields.FieldType( - formats.Characters(), analysis.SimpleAnalyzer(), scorable=True, stored=True + formats.Characters(), analysis.simple_analyzer(), scorable=True, stored=True ) schema = fields.Schema(text=charfield) st = RamStorage() @@ -93,10 +93,10 @@ def test_span_term(): ids = set() while m.is_active(): - id = m.id() + matcher_id = m.id() sps = m.spans() - ids.add(id) - original = list(s.stored_fields(id)["text"]) + ids.add(matcher_id) + original = list(s.stored_fields(matcher_id)["text"]) assert word in original if word != "bravo": @@ -208,7 +208,7 @@ def test_near_unordered(): def test_span_near_tree(): - ana = analysis.SimpleAnalyzer() + ana = analysis.simple_analyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) st = RamStorage() ix = st.create_index(schema) diff --git a/tests/test_spelling.py b/tests/test_spelling.py index 3f773785..6c67cb28 100644 --- a/tests/test_spelling.py +++ b/tests/test_spelling.py @@ -186,7 +186,7 @@ def test_correct_query(): def test_spelling_field(): text = "rendering shading modeling reactions" - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) assert schema["text"].spelling @@ -214,7 +214,7 @@ def test_spelling_field(): def test_correct_spell_field(): - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) with TempIndex(schema) as ix: with ix.writer() as w: @@ -328,7 +328,7 @@ def test_very_long_words(): strings1 = [u(chr(i) * length) for i in range(65, 70)] strings2 = [u(chr(i) * length) for i in range(71, 75)] - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( text=fields.TEXT( analyzer=ana, diff --git a/tests/test_writing.py b/tests/test_writing.py index 0014c98b..05dd5f21 100644 --- a/tests/test_writing.py +++ b/tests/test_writing.py @@ -395,7 +395,7 @@ def test_add_reader_spelling(): # Test whether add_spell_word() items get copied over in a merge # Because b is stemming and spelled, it will use add_spell_word() - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( a=fields.TEXT(analyzer=ana), b=fields.TEXT(analyzer=ana, spelling=True) ) @@ 
-455,7 +455,7 @@ def test_clear(): def test_spellable_list(): # Make sure a spellable field works with a list of pre-analyzed tokens - ana = analysis.StemmingAnalyzer() + ana = analysis.stemming_analyzer() schema = fields.Schema( Location=fields.STORED, Lang=fields.STORED, From ee659a0c2c3e5cd9f8f1a7af23c77af151398f7a Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Tue, 20 Feb 2024 18:28:52 -0700 Subject: [PATCH 2/4] Added auto-merge workflow to merge automatically PRs from dependabot and make sure we are always using the latest version of everything. If anything breaks --- .github/workflows/auto-merge.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/auto-merge.yml diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml new file mode 100644 index 00000000..e7ed69d3 --- /dev/null +++ b/.github/workflows/auto-merge.yml @@ -0,0 +1,22 @@ +name: Dependabot auto-merge +on: pull_request + +permissions: + contents: write + pull-requests: write + +jobs: + dependabot: + runs-on: ubuntu-latest + if: github.actor == 'dependabot[bot]' + steps: + - name: Dependabot metadata + id: metadata + uses: dependabot/fetch-metadata@v1 + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + - name: Enable auto-merge for Dependabot PRs + run: gh pr merge --auto --merge "$PR_URL" + env: + PR_URL: ${{github.event.pull_request.html_url}} + GH_TOKEN: ${{secrets.GITHUB_TOKEN}} From 564de5d801e0b19dceb6582ddbd6ac0138c7bbce Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Sat, 24 Feb 2024 03:34:32 -0700 Subject: [PATCH 3/4] Install instructions updated in the readme.md --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6703235e..0f47980c 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,10 @@ [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) [![Technical Debt](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=sqale_index)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) - -------------------------------------- > **Notice:** This repository (**whoosh-reloaded**) is a fork and continuation of the Whoosh project. - > This fork **is actively maintained** by the Sygil-Dev Organization. -------------------------------------- @@ -64,8 +62,8 @@ Installing Whoosh If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: - # install the old version from PyPI - $ pip install Whoosh + # Install the stable version from Pypi + $ pip install whoosh-reloaded # Install the development version from GitHub. $ pip install git+https://github.com/Sygil-Dev/whoosh-reloaded.git @@ -79,6 +77,7 @@ You can check out the latest version of the source code on GitHub using git: Contributing ============ + We use pre-commit to format the code and run some checks before committing to avoid common mistakes. 
To install it, run the following commands: ```bash @@ -101,7 +100,6 @@ Learning more * File bug reports and issues at https://github.com/Sygil-Dev/whoosh-reloaded/issues - Maintainers =========== @@ -110,4 +108,5 @@ Maintainers Discord Server ============== + - [Sygil-Dev - Resources](https://discord.gg/H5mftKP5S9) From 885e612cdf2824c23438bf1af696522efa5e57bc Mon Sep 17 00:00:00 2001 From: Alejandro Gil Date: Sat, 24 Feb 2024 03:46:31 -0700 Subject: [PATCH 4/4] Revert "Add docstrings to every function and class providing information on how it works and how it should be used." This reverts commit 198fe3824bb46c4913ea92358ac9ecf1c8fbcd7f. --- benchmark/dictionary.py | 29 +- benchmark/enron.py | 103 +- benchmark/marc21.py | 306 +--- benchmark/reuters.py | 36 +- docs/source/analysis.rst | 6 +- docs/source/api/analysis.rst | 20 +- docs/source/batch.rst | 2 +- docs/source/highlight.rst | 16 +- docs/source/ngrams.rst | 2 +- docs/source/recipes.rst | 6 +- docs/source/releases/1_0.rst | 6 +- docs/source/releases/2_0.rst | 2 +- docs/source/schema.rst | 8 +- docs/source/spelling.rst | 2 +- docs/source/stemming.rst | 12 +- scripts/make_checkpoint.py | 24 +- scripts/read_checkpoint.py | 15 +- src/whoosh/analysis/__init__.py | 22 +- src/whoosh/analysis/acore.py | 104 +- src/whoosh/analysis/analyzers.py | 330 +--- src/whoosh/analysis/filters.py | 535 ++---- src/whoosh/analysis/intraword.py | 138 +- src/whoosh/analysis/morph.py | 425 +---- src/whoosh/analysis/ngrams.py | 160 +- src/whoosh/analysis/tokenizers.py | 240 +-- src/whoosh/automata/fsa.py | 1461 +--------------- src/whoosh/automata/fst.py | 1435 ++-------------- src/whoosh/automata/glob.py | 34 - src/whoosh/automata/lev.py | 12 - src/whoosh/automata/reg.py | 111 -- src/whoosh/codec/__init__.py | 15 - src/whoosh/codec/base.py | 1630 +----------------- src/whoosh/codec/memory.py | 688 +------- src/whoosh/codec/plaintext.py | 754 +------- src/whoosh/codec/whoosh2.py | 2658 +---------------------------- src/whoosh/codec/whoosh3.py | 1361 +-------------- src/whoosh/fields.py | 86 +- src/whoosh/filedb/compound.py | 427 +---- src/whoosh/filedb/fileindex.py | 487 +----- src/whoosh/filedb/filepostings.py | 448 +---- src/whoosh/filedb/filereading.py | 489 +----- src/whoosh/filedb/filestore.py | 1213 +------------ src/whoosh/filedb/filetables.py | 1410 +-------------- src/whoosh/filedb/filewriting.py | 315 +--- src/whoosh/filedb/gae.py | 417 ----- src/whoosh/filedb/misc.py | 35 - src/whoosh/filedb/pools.py | 590 ++----- src/whoosh/filedb/structfile.py | 1354 +-------------- src/whoosh/highlight.py | 24 +- src/whoosh/legacy.py | 4 +- src/whoosh/qparser/dateparse.py | 2 - src/whoosh/query/terms.py | 2 +- src/whoosh/support/base85.py | 82 +- src/whoosh/support/bench.py | 847 +-------- src/whoosh/support/bitstream.py | 44 +- src/whoosh/support/bitvector.py | 453 +++-- src/whoosh/support/charset.py | 28 +- src/whoosh/support/pyparsing.py | 229 +-- src/whoosh/util/__init__.py | 127 +- tests/test_analysis.py | 12 +- tests/test_classify.py | 4 +- tests/test_codecs.py | 8 +- tests/test_fields.py | 4 +- tests/test_highlighting.py | 16 +- tests/test_indexing.py | 8 +- tests/test_parse_plugins.py | 8 +- tests/test_parsing.py | 8 +- tests/test_postings.py | 2 +- tests/test_results.py | 4 +- tests/test_searching.py | 4 +- tests/test_spans.py | 10 +- tests/test_spelling.py | 6 +- tests/test_writing.py | 4 +- 73 files changed, 1552 insertions(+), 20367 deletions(-) diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py index 5cacf071..d5855f4a 100644 --- 
a/benchmark/dictionary.py +++ b/benchmark/dictionary.py @@ -6,26 +6,11 @@ class VulgarTongue(Spec): - """ - A class representing a VulgarTongue dictionary. - - Attributes: - name (str): The name of the dictionary. - filename (str): The filename of the dictionary file. - headline_field (str): The field name for the headline. - """ - name = "dictionary" filename = "dcvgr10.txt.gz" headline_field = "head" def documents(self): - """ - Generator function that yields documents from the dictionary file. - - Yields: - dict: A dictionary representing a document with 'head' and 'body' fields. - """ path = os.path.join(self.options.dir, self.filename) f = gzip.GzipFile(path) @@ -43,13 +28,7 @@ def documents(self): yield {"head": head, "body": head + body} def whoosh_schema(self): - """ - Returns the Whoosh schema for the VulgarTongue dictionary. - - Returns: - Schema: The Whoosh schema for the dictionary. - """ - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True) @@ -57,12 +36,6 @@ def whoosh_schema(self): return schema def zcatalog_setup(self, cat): - """ - Sets up the ZCatalog indexes for the VulgarTongue dictionary. - - Args: - cat (ZCatalog): The ZCatalog instance. - """ from zcatalog import indexes # type: ignore @UnresolvedImport cat["head"] = indexes.FieldIndex(field_name="head") diff --git a/benchmark/enron.py b/benchmark/enron.py index b3167dbb..38504221 100644 --- a/benchmark/enron.py +++ b/benchmark/enron.py @@ -14,12 +14,10 @@ from whoosh.support.bench import Bench, Spec from whoosh.util import now +# Benchmark class -class Enron(Spec): - """ - The Enron class provides functionality for downloading, caching, and processing the Enron email archive. - """ +class Enron(Spec): name = "enron" enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz" @@ -42,16 +40,10 @@ class Enron(Spec): cachefile = None - def download_archive(self, archive): - """ - Downloads the Enron email archive from the specified URL and saves it to the given file path. - - Args: - archive (str): The file path to save the downloaded archive. + # Functions for downloading and then reading the email archive and caching + # the messages in an easier-to-digest format - Raises: - FileNotFoundError: If the archive file does not exist. - """ + def download_archive(self, archive): print(f"Downloading Enron email archive to {archive}...") t = now() urlretrieve(self.enron_archive_url, archive) @@ -59,15 +51,6 @@ def download_archive(self, archive): @staticmethod def get_texts(archive): - """ - Generator function that yields the text content of each email in the given archive. - - Args: - archive (str): The file path of the archive. - - Yields: - str: The text content of each email. - """ archive = tarfile.open(archive, "r:gz") while True: entry = next(archive) @@ -81,16 +64,6 @@ def get_texts(archive): @staticmethod def get_messages(archive, headers=True): - """ - Generator function that yields the parsed messages from the given email archive. - - Args: - archive (str): The file path of the archive. - headers (bool, optional): Whether to include message headers. Defaults to True. - - Yields: - dict: The dictionary representation of each message. 
- """ header_to_field = Enron.header_to_field for text in Enron.get_texts(archive): message = message_from_string(text) @@ -110,16 +83,6 @@ def get_messages(archive, headers=True): yield d def cache_messages(self, archive, cache): - """ - Caches the messages from the given email archive into a pickle file. - - Args: - archive (str): The file path of the archive. - cache (str): The file path to save the cached messages. - - Raises: - FileNotFoundError: If the archive file does not exist. - """ print(f"Caching messages in {cache}...") if not os.path.exists(archive): @@ -137,9 +100,6 @@ def cache_messages(self, archive, cache): print(f"Cached messages in {now() - t} seconds") def setup(self): - """ - Sets up the Enron email archive by downloading it if necessary and caching the messages. - """ archive = os.path.abspath( os.path.join(self.options.dir, self.enron_archive_filename) ) @@ -156,15 +116,6 @@ def setup(self): print("Cache is OK") def documents(self): - """ - Generator function that yields the cached messages from the pickle file. - - Yields: - dict: The dictionary representation of each message. - - Raises: - FileNotFoundError: If the message cache does not exist. - """ if not os.path.exists(self.cache_filename): raise FileNotFoundError("Message cache does not exist, use --setup") @@ -179,13 +130,7 @@ def documents(self): f.close() def whoosh_schema(self): - """ - Returns the Whoosh schema for indexing the Enron email archive. - - Returns: - whoosh.fields.Schema: The schema for indexing the emails. - """ - ana = analysis.stemming_analyzer(maxsize=40, cachesize=None) + ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None) storebody = self.options.storebody schema = fields.Schema( body=fields.TEXT(analyzer=ana, stored=storebody), @@ -200,15 +145,6 @@ def whoosh_schema(self): return schema def xappy_indexer_connection(self, path): - """ - Creates and returns an Xapian indexer connection for indexing the Enron email archive. - - Args: - path (str): The path to the Xapian index. - - Returns: - xappy.IndexerConnection: The Xapian indexer connection. - """ conn = xappy.IndexerConnection(path) conn.add_field_action("body", xappy.FieldActions.INDEX_FREETEXT, language="en") if self.options.storebody: @@ -228,12 +164,6 @@ def xappy_indexer_connection(self, path): return conn def zcatalog_setup(self, cat): - """ - Sets up the ZCatalog indexes for indexing the Enron email archive. - - Args: - cat (zcatalog.catalog.Catalog): The ZCatalog catalog. - """ from zcatalog import indexes # type: ignore for name in ("date", "frm"): @@ -242,27 +172,12 @@ def zcatalog_setup(self, cat): cat[name] = indexes.TextIndex(field_name=name) def process_document_whoosh(self, d): - """ - Processes a document for indexing with Whoosh. - - Args: - d (dict): The document to process. - """ d["filepos"] = self.filepos if self.options.storebody: mf = self.main_field d[f"_stored_{mf}"] = compress(d[mf], 9) def process_result_whoosh(self, d): - """ - Processes a search result from Whoosh. - - Args: - d (dict): The search result. - - Returns: - dict: The processed search result. - """ mf = self.main_field if mf in d: d.fields()[mf] = decompress(d[mf]) @@ -276,12 +191,6 @@ def process_result_whoosh(self, d): return d def process_document_xapian(self, d): - """ - Processes a document for indexing with Xapian. - - Args: - d (dict): The document to process. 
- """ d[self.main_field] = " ".join([d.get(name, "") for name in self.field_order]) diff --git a/benchmark/marc21.py b/benchmark/marc21.py index dbbe3ba0..07fde36f 100644 --- a/benchmark/marc21.py +++ b/benchmark/marc21.py @@ -1,9 +1,9 @@ import fnmatch import logging -import os +import os.path import re -from whoosh import analysis, fields, index, qparser, scoring +from whoosh import analysis, fields, index, qparser, query, scoring from whoosh.util import now log = logging.getLogger(__name__) @@ -21,20 +21,6 @@ def read_file(dbfile, tags=None): - """ - Reads records from a database file. - - Args: - dbfile (file): The file object representing the database file. - tags (list, optional): A list of tags to filter the records. Defaults to None. - - Yields: - tuple: A tuple containing the parsed record and its position in the file. - - Raises: - ValueError: If the length of the record is invalid. - - """ while True: pos = dbfile.tell() first5 = dbfile.read(5) @@ -48,23 +34,6 @@ def read_file(dbfile, tags=None): def read_record(filename, pos, tags=None): - """ - Read a MARC21 record from a file. - - Args: - filename (str): The path to the MARC21 file. - pos (int): The position in the file where the record starts. - tags (list[str], optional): A list of tags to include in the parsed record. - If None, all tags will be included. Defaults to None. - - Returns: - dict: A dictionary representing the parsed MARC21 record. - - Raises: - FileNotFoundError: If the specified file does not exist. - ValueError: If the specified position is invalid. - - """ f = open(filename, "rb") f.seek(pos) first5 = f.read(5) @@ -74,32 +43,6 @@ def read_record(filename, pos, tags=None): def parse_record(data, tags=None): - """ - Parse a MARC21 record from the given data. - - Args: - data (str): The MARC21 record data. - tags (list[str], optional): List of tags to include in the parsed result. If not provided, all tags will be included. - - Returns: - dict: A dictionary representing the parsed MARC21 record, where the keys are the tags and the values are the corresponding data. - - Raises: - AssertionError: If the length of the leader is not equal to LEADER_LEN. - AssertionError: If the dataoffset is not greater than 0. - AssertionError: If the dataoffset is not less than the length of the data. - AssertionError: If the difference between dirend and dirstart is not divisible by DIRECTORY_ENTRY_LEN. - - Example: - data = "..." - tags = ["245", "260"] - result = parse_record(data, tags) - # Returns: - # { - # "245": ["Title"], - # "260": ["Publisher"] - # } - """ leader = data[:LEADER_LEN] assert len(leader) == LEADER_LEN @@ -140,16 +83,6 @@ def parse_record(data, tags=None): def subfield(vs, code): - """ - Extracts the value of a subfield from a list of subfields. - - Parameters: - - vs (list): The list of subfields to search in. - - code (str): The code of the subfield to extract. - - Returns: - - str or None: The value of the subfield if found, None otherwise. - """ for v in vs: if v.startswith(code): return v[1:] @@ -157,56 +90,14 @@ def subfield(vs, code): def joinsubfields(vs): - """ - Joins the subfields of a MARC21 record. - - This function takes a list of subfields and joins them into a single string, - excluding any subfields starting with "6". - - Args: - vs (list): A list of subfields. - - Returns: - str: The joined subfields as a single string. 
- - Example: - >>> subfields = ['a', 'b', 'c', '6d', 'e'] - >>> joinsubfields(subfields) - 'a b c e' - """ return " ".join(v[1:] for v in vs if v and v[0] != "6") def getfields(d, *tags): - """ - Retrieve the values from a dictionary `d` for the given `tags`. - - Args: - d (dict): The dictionary to retrieve values from. - tags (str): Variable number of tags to retrieve values for. - - Returns: - generator: A generator that yields the values for the given tags. - - Example: - >>> d = {'tag1': 'value1', 'tag2': 'value2', 'tag3': 'value3'} - >>> fields = getfields(d, 'tag1', 'tag3') - >>> list(fields) - ['value1', 'value3'] - """ return (d[tag] for tag in tags if tag in d) def title(d): - """ - Extracts the title from a MARC21 record dictionary. - - Args: - d (dict): The MARC21 record dictionary. - - Returns: - str: The extracted title, or None if no title is found. - """ title = None if "245" in d: svs = d["245"] @@ -219,24 +110,6 @@ def title(d): def isbn(d): - """ - Extracts the ISBN number from the MARC21 record. - - Parameters: - - d (dict): The MARC21 record dictionary. - - Returns: - - str: The extracted ISBN number without hyphens. - - Example: - >>> record = { - ... "020": { - ... "a": "978-0132350884" - ... } - ... } - >>> isbn(record) - '9780132350884' - """ if "020" in d: num = subfield(d["020"], "a") if num: @@ -246,18 +119,6 @@ def isbn(d): def author(d): - """ - Returns the author information from the given dictionary. - - Parameters: - - d (dict): The dictionary containing the MARC21 record. - - Returns: - - str: The author information. - - Raises: - - KeyError: If the dictionary does not contain any author fields (100, 110, or 111). - """ if "100" in d: return joinsubfields(d["100"]) elif "110" in d: @@ -267,27 +128,6 @@ def author(d): def uniform_title(d): - """ - Returns the uniform title from the MARC21 record dictionary. - - Parameters: - - d (dict): The MARC21 record dictionary. - - Returns: - - str: The uniform title. - - Raises: - - None. - - Examples: - >>> record = {"130": ["Uniform Title"]} - >>> uniform_title(record) - 'Uniform Title' - - >>> record = {"240": ["Uniform Title"]} - >>> uniform_title(record) - 'Uniform Title' - """ if "130" in d: return joinsubfields(d["130"]) elif "240" in d: @@ -300,139 +140,35 @@ def uniform_title(d): def subjects(d): - """ - Returns a string containing the joined subfields of the given document's subject fields. - - Parameters: - - d: The document to extract subject fields from. - - Returns: - A string containing the joined subfields of the subject fields. - """ return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields)) def physical(d): - """ - Returns the physical description of a MARC21 record. - - Parameters: - - d (dict): The MARC21 record dictionary. - - Returns: - - str: The physical description of the record. - """ return joinsubfields(d["300"]) def location(d): - """ - Returns the location of a record in the MARC21 format. - - Parameters: - - d (dict): The MARC21 record dictionary. - - Returns: - - str: The location of the record. - """ return joinsubfields(d["852"]) def publisher(d): - """ - Extracts the publisher information from the MARC21 record. - - Args: - d (dict): The MARC21 record dictionary. - - Returns: - str: The publisher information, or None if not found. - """ if "260" in d: return subfield(d["260"], "b") def pubyear(d): - """ - Extracts the publication year from a MARC21 record. - - Args: - d (dict): The MARC21 record dictionary. - - Returns: - str: The publication year, or None if not found. 
- """ if "260" in d: return subfield(d["260"], "c") def uni(v): - """ - Converts a byte string to a Unicode string. - - Parameters: - v (bytes): The byte string to be converted. - - Returns: - str: The converted Unicode string. - - Raises: - None - - Examples: - >>> uni(b'hello') - 'hello' - >>> uni(None) - '' - """ return "" if v is None else v.decode("utf-8", "replace") # Indexing and searching -def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*.mrc"): - """ - Create an index for MARC21 records. - - Args: - basedir (str): The base directory containing the MARC21 files. - ixdir (str): The directory to store the index. - procs (int, optional): The number of processors to use for indexing. Defaults to 4. - limitmb (int, optional): The memory limit per processor in megabytes. Defaults to 128. - multisegment (bool, optional): Whether to use multisegment indexing. Defaults to True. - glob (str, optional): The file pattern to match for indexing. Defaults to "*.mrc". - - Returns: - None - - Raises: - OSError: If the specified `ixdir` directory does not exist and cannot be created. - - Notes: - This function creates an index for MARC21 records using the Whoosh library. It takes the base directory - containing the MARC21 files (`basedir`), the directory to store the index (`ixdir`), and optional parameters - for configuring the indexing process. - - The `procs` parameter specifies the number of processors to use for indexing. By default, it is set to 4. - - The `limitmb` parameter sets the memory limit per processor in megabytes. The default value is 128. - The `multisegment` parameter determines whether to use multisegment indexing. If set to True (default), the - index will be split into multiple segments for better performance. - The `glob` parameter specifies the file pattern to match for indexing. By default, it is set to "*.mrc". - - If the specified `ixdir` directory does not exist, it will be created before creating the index. - - The function uses a multi-lingual stop words list for text analysis and defines a schema for the index - containing fields for title, author, subject, file, and position. - - The MARC fields to extract are specified in the `mfields` set. - - The function prints the indexing configuration and starts the indexing process. It creates the index in the - specified `ixdir` directory and uses the Whoosh writer to add documents to the index. - - After indexing is complete, the function returns None. - """ +def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*.mrc"): if not os.path.exists(ixdir): os.mkdir(ixdir) @@ -441,7 +177,7 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*. "de la der und le die et en al no von di du da " "del zur ein".split() ) # Schema - ana = analysis.stemming_analyzer(stoplist=stoplist) + ana = analysis.StemmingAnalyzer(stoplist=stoplist) schema = fields.Schema( title=fields.TEXT(analyzer=ana), author=fields.TEXT(phrase=False), @@ -484,22 +220,6 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*. def print_record(no, basedir, filename, pos): - """ - Print the record information. - - Args: - no (int): The record number. - basedir (str): The base directory. - filename (str): The name of the file. - pos (int): The position of the record. - - Returns: - None - - Raises: - None - - """ path = os.path.join(basedir, filename) record = read_record(path, pos) print("% 5d. 
%s" % (no + 1, title(record))) @@ -512,24 +232,6 @@ def print_record(no, basedir, filename, pos): def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True): - """ - Perform a search on the index using the given query string. - - Args: - qstring (str): The query string to search for. - ixdir (str): The directory path where the index is located. - basedir (str): The base directory path. - limit (int, optional): The maximum number of results to return. Defaults to None. - optimize (bool, optional): Whether to optimize the search. Defaults to True. - scores (bool, optional): Whether to include scores in the search results. Defaults to True. - - Returns: - None - - Raises: - None - - """ ix = index.open_dir(ixdir) qp = qparser.QueryParser("title", ix.schema) q = qp.parse(qstring) diff --git a/benchmark/reuters.py b/benchmark/reuters.py index ba0b1ff7..dde05363 100644 --- a/benchmark/reuters.py +++ b/benchmark/reuters.py @@ -7,30 +7,14 @@ class Reuters(Spec): - """ - The Reuters class represents a benchmark for the Reuters dataset. - - Attributes: - name (str): The name of the benchmark. - filename (str): The name of the file containing the dataset. - main_field (str): The main field in the dataset. - headline_text (str): The field representing the headline text in the dataset. - """ - name = "reuters" filename = "reuters21578.txt.gz" main_field = "text" headline_text = "headline" def whoosh_schema(self): - """ - Returns the schema for the Whoosh index. - - Returns: - Schema: The schema for the Whoosh index. - """ - # ana = analysis.stemming_analyzer() - ana = analysis.standard_analyzer() + # ana = analysis.StemmingAnalyzer() + ana = analysis.StandardAnalyzer() schema = fields.Schema( id=fields.ID(stored=True), headline=fields.STORED, @@ -39,12 +23,6 @@ def whoosh_schema(self): return schema def zcatalog_setup(self, cat): - """ - Sets up the ZCatalog index. - - Args: - cat (ZCatalog): The ZCatalog instance to set up. - """ from zcatalog import indexes # type: ignore @UnresolvedImport cat["id"] = indexes.FieldIndex(field_name="id") @@ -52,18 +30,12 @@ def zcatalog_setup(self, cat): cat["body"] = indexes.TextIndex(field_name="text") def documents(self): - """ - Generates documents from the dataset. - - Yields: - dict: A document from the dataset. - """ path = os.path.join(self.options.dir, self.filename) f = gzip.GzipFile(path) for line in f: - id_var, text = line.decode("latin1").split("\t") - yield {"id": id_var, "text": text, "headline": text[:70]} + id, text = line.decode("latin1").split("\t") + yield {"id": id, "text": text, "headline": text[:70]} if __name__ == "__main__": diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index ca6457a8..ebbb72a9 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -85,7 +85,7 @@ Using analyzers When you create a field in a schema, you can specify your analyzer as a keyword argument to the field object:: - schema = Schema(content=TEXT(analyzer=stemming_analyzer())) + schema = Schema(content=TEXT(analyzer=StemmingAnalyzer())) Advanced Analysis @@ -276,8 +276,8 @@ be removed from the stream or left in. 
:: - >>> from whoosh.analysis import standard_analyzer - >>> analyzer = standard_analyzer() + >>> from whoosh.analysis import StandardAnalyzer + >>> analyzer = StandardAnalyzer() >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")] [(u'test', False)] >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)] diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst index b8da7b7a..81805618 100644 --- a/docs/source/api/analysis.rst +++ b/docs/source/api/analysis.rst @@ -7,16 +7,16 @@ Analyzers ========= -.. autofunction:: id_analyzer -.. autofunction:: keyword_analyzer -.. autofunction:: regex_analyzer -.. autofunction:: simple_analyzer -.. autofunction:: standard_analyzer -.. autofunction:: stemming_analyzer -.. autofunction:: fancy_analyzer -.. autofunction:: ngram_analyzer -.. autofunction:: ngram_word_analyzer -.. autofunction:: language_analyzer +.. autofunction:: IDAnalyzer +.. autofunction:: KeywordAnalyzer +.. autofunction:: RegexAnalyzer +.. autofunction:: SimpleAnalyzer +.. autofunction:: StandardAnalyzer +.. autofunction:: StemmingAnalyzer +.. autofunction:: FancyAnalyzer +.. autofunction:: NgramAnalyzer +.. autofunction:: NgramWordAnalyzer +.. autofunction:: LanguageAnalyzer Tokenizers diff --git a/docs/source/batch.rst b/docs/source/batch.rst index 6f749d7f..b8a741f0 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -13,7 +13,7 @@ of documents at once (batch indexing). The following settings and alternate workflows can make batch indexing faster. -stemming_analyzer cache +StemmingAnalyzer cache ====================== The stemming analyzer by default uses a least-recently-used (LRU) cache to limit diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst index 6313263e..bc266c8c 100644 --- a/docs/source/highlight.rst +++ b/docs/source/highlight.rst @@ -15,8 +15,8 @@ The highlighting system works as a pipeline, with four component types. * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order - they appear in the document (first) or show higher-scoring fragments first - (score) + they appear in the document (FIRST) or show higher-scoring fragments first + (SCORE) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. @@ -199,19 +199,19 @@ fragments with lower values appear before fragments with higher values). The ``highlight`` module has the following order functions. -``first`` (the default) +``FIRST`` (the default) Show fragments in the order they appear in the document. -``score`` +``SCORE`` Show highest scoring fragments first. -The ``highlight`` module also includes ``longer`` (longer fragments first) and -``shorter`` (shorter fragments first), but they probably aren't as generally +The ``highlight`` module also includes ``LONGER`` (longer fragments first) and +``SHORTER`` (shorter fragments first), but they probably aren't as generally useful. To use a different order:: - results.order = highlight.score + results.order = highlight.SCORE Formatter @@ -371,7 +371,7 @@ an analyzer:: from whoosh.highlight import highlight excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3, - scorer=BasicFragmentScorer, minscore=1, order=first) + scorer=BasicFragmentScorer, minscore=1, order=FIRST) ``text`` The original text of the document. 
diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst index 1eeac448..56bfe22f 100644 --- a/docs/source/ngrams.rst +++ b/docs/source/ngrams.rst @@ -33,7 +33,7 @@ separation. :: - >>> my_analyzer = standard_analyzer() | NgramFilter(minsize=2, maxsize=4) + >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) >>> [token.text for token in my_analyzer(u"rendering shaders")] [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst index 8ef1d21e..29c9571b 100644 --- a/docs/source/recipes.rst +++ b/docs/source/recipes.rst @@ -61,17 +61,17 @@ Find every document iTunes-style search-as-you-type ------------------------------- -Use the :class:` whoosh.analysis.ngram_word_analyzer` as the analyzer for the +Use the :class:` whoosh.analysis.NgramWordAnalyzer` as the analyzer for the field you want to search as the user types. You can save space in the index by turning off positions in the field using ``phrase=False``, since phrase searching on N-gram fields usually doesn't make much sense:: # For example, to search the "title" field as the user types - analyzer = analysis.ngram_word_analyzer() + analyzer = analysis.NgramWordAnalyzer() title_field = fields.TEXT(analyzer=analyzer, phrase=False) schema = fields.Schema(title=title_field) -See the documentation for the :class:`~ whoosh.analysis.ngram_word_analyzer` class +See the documentation for the :class:`~ whoosh.analysis.NgramWordAnalyzer` class for information on the available options. diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst index 7b66e063..08887f53 100644 --- a/docs/source/releases/1_0.rst +++ b/docs/source/releases/1_0.rst @@ -77,14 +77,14 @@ analyzer as the inverted index. Alternatively, you can pass a Format subclass and Whoosh will instantiate it for you. For example, to store term vectors using the same settings as the inverted -index (Positions format and standard_analyzer):: +index (Positions format and StandardAnalyzer):: from whoosh.fields import Schema, TEXT schema = Schema(content=TEXT(vector=True)) To store term vectors that use the same analyzer as the inverted index -(standard_analyzer by default) but only store term frequency:: +(StandardAnalyzer by default) but only store term frequency:: from whoosh.formats import Frequency @@ -351,7 +351,7 @@ Fixed bug where files could be deleted before a reader could open them in threaded situations. New :class:` whoosh.analysis.NgramFilter` filter, -:class:` whoosh.analysis.ngram_word_analyzer` analyzer, and +:class:` whoosh.analysis.NgramWordAnalyzer` analyzer, and :class:` whoosh.fields.NGRAMWORDS` field type allow producing n-grams from tokenized text. diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst index 158feaa2..20569387 100644 --- a/docs/source/releases/2_0.rst +++ b/docs/source/releases/2_0.rst @@ -46,7 +46,7 @@ Whoosh 2.5 * Whoosh now includes pure-Python implementations of the Snowball stemmers and stop word lists for various languages adapted from NLTK. These are available - through the :class:` whoosh.analysis.language_analyzer` analyzer or through the + through the :class:` whoosh.analysis.LanguageAnalyzer` analyzer or through the ``lang=`` keyword argument to the :class:`~ whoosh.fields.TEXT` field. 
diff --git a/docs/source/schema.rst b/docs/source/schema.rst index fbe8a91d..58da2fc7 100644 --- a/docs/source/schema.rst +++ b/docs/source/schema.rst @@ -31,9 +31,9 @@ Whoosh provides some useful predefined field types: This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. - ``TEXT`` fields use :class:`~ whoosh.analysis.standard_analyzer` by default. To specify a different + ``TEXT`` fields use :class:`~ whoosh.analysis.StandardAnalyzer` by default. To specify a different analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. - ``TEXT(analyzer=analysis.stemming_analyzer())``. See :doc:`analysis`. + ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. By default, ``TEXT`` fields store position information for each indexed term, to allow you to search for phrases. If you don't need to be able to search for @@ -104,12 +104,12 @@ Creating a Schema To create a schema:: from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED - from whoosh.analysis import stemming_analyzer + from whoosh.analysis import StemmingAnalyzer schema = Schema(from_addr=ID(stored=True), to_addr=ID(stored=True), subject=TEXT(stored=True), - body=TEXT(analyzer=stemming_analyzer()), + body=TEXT(analyzer=StemmingAnalyzer()), tags=KEYWORD) If you aren't specifying any constructor keyword arguments to one of the diff --git a/docs/source/spelling.rst b/docs/source/spelling.rst index 84ecbf87..36fbb777 100644 --- a/docs/source/spelling.rst +++ b/docs/source/spelling.rst @@ -37,7 +37,7 @@ However, if you have an analyzer that modifies the indexed words (such as stemming), you can add ``spelling=True`` to a field to have it store separate unmodified versions of the terms for spelling suggestions:: - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=TEXT(analyzer=ana, spelling=True)) You can then use the :meth:` whoosh.searching.Searcher.corrector` method diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst index 7dab76be..0d30b569 100644 --- a/docs/source/stemming.rst +++ b/docs/source/stemming.rst @@ -50,13 +50,13 @@ analyzer chain. >>> [token.text for token in stemmer(stream)] [u"fundament", u"willow"] -The :func:` whoosh.analysis.stemming_analyzer` is a pre-packaged analyzer that +The :func:` whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: from whoosh import fields - from whoosh.analysis import stemming_analyzer + from whoosh.analysis import StemmingAnalyzer - stem_ana = stemming_analyzer() + stem_ana = StemmingAnalyzer() schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), content=TEXT(analyzer=stem_ana)) @@ -170,12 +170,12 @@ text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to ``u'cafe', u'resume', ...``. 
This is usually the method you'll want to use unless you need to use a charset to tokenize terms:: - from whoosh.analysis import CharsetFilter, stemming_analyzer + from whoosh.analysis import CharsetFilter, StemmingAnalyzer from whoosh import fields from whoosh.support.charset import accent_map # For example, to add an accent-folding filter to a stemming analyzer: - my_analyzer = stemming_analyzer() | CharsetFilter(accent_map) + my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) # To use this analyzer in your schema: my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) @@ -197,7 +197,7 @@ required by ``CharsetTokenizer`` and ``CharsetFilter``:: from whoosh.analysis import CharsetFilter from whoosh.support.charset import default_charset, charset_table_to_dict charmap = charset_table_to_dict(default_charset) - my_analyzer = stemming_analyzer() | CharsetFilter(charmap) + my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) diff --git a/scripts/make_checkpoint.py b/scripts/make_checkpoint.py index da900fb9..9c1b818b 100644 --- a/scripts/make_checkpoint.py +++ b/scripts/make_checkpoint.py @@ -1,28 +1,8 @@ #!python -""" -This script creates a "checkpoint" index using the Whoosh library. The checkpoint index captures the index format created by a certain version of Whoosh. +# Make a "checkpoint" index, capturing the index format created by a certain +# version of Whoosh -Usage: make_checkpoint.py - -Parameters: - (str): The directory where the checkpoint index will be created. - -The script generates a checkpoint index with the following fields: -- path: A unique identifier for each document. -- num: An integer field stored in the index. -- frac: A float field stored in the index. -- dt: A datetime field stored in the index. -- tag: A keyword field. -- title: A text field stored in the index. -- ngrams: A field for generating n-grams from the title. - -The script creates a directory if it doesn't exist and initializes the index with the specified schema. It then adds documents to the index with randomly generated data. The number of documents and the data for each document are determined by the loop iterations. - -Finally, the script deletes specific documents from the index and prints the total number of documents in the index. - -Note: The Whoosh library must be installed in order to run this script. -""" import os.path import random diff --git a/scripts/read_checkpoint.py b/scripts/read_checkpoint.py index 7f5d2fa8..d8a9d77c 100644 --- a/scripts/read_checkpoint.py +++ b/scripts/read_checkpoint.py @@ -1,20 +1,7 @@ #!python -""" -This script reads a "checkpoint" index to check for backwards compatibility. +# Read a "checkpoint" index, to check backwards compatibility -The script takes a directory path as a command-line argument and reads the checkpoint index located in that directory. -It then performs various checks on the index to ensure its integrity and compatibility. - -Usage: read_checkpoint.py - -Args: - (str): The directory path where the checkpoint index is located. - -Example: - $ python read_checkpoint.py /path/to/index - -""" import sys diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py index d76c3c6e..0c116bf6 100644 --- a/src/whoosh/analysis/__init__.py +++ b/src/whoosh/analysis/__init__.py @@ -45,7 +45,7 @@ generator. 
* Analyzers are convenience functions/classes that "package up" a tokenizer and - zero or more filters into a single unit. For example, the standard_analyzer + zero or more filters into a single unit. For example, the StandardAnalyzer combines a RegexTokenizer, LowercaseFilter, and StopFilter. Every analyzer is a callable that takes a string and returns a token @@ -69,14 +69,14 @@ ) from whoosh.analysis.analyzers import ( Analyzer, - fancy_analyzer, - id_analyzer, - keyword_analyzer, - language_analyzer, - regex_analyzer, - simple_analyzer, - standard_analyzer, - stemming_analyzer, + FancyAnalyzer, + IDAnalyzer, + KeywordAnalyzer, + LanguageAnalyzer, + RegexAnalyzer, + SimpleAnalyzer, + StandardAnalyzer, + StemmingAnalyzer, ) from whoosh.analysis.filters import ( STOP_WORDS, @@ -103,10 +103,10 @@ ) from whoosh.analysis.morph import DoubleMetaphoneFilter, PyStemmerFilter, StemFilter from whoosh.analysis.ngrams import ( + NgramAnalyzer, NgramFilter, NgramTokenizer, - ngram_analyzer, - ngram_word_analyzer, + NgramWordAnalyzer, ) from whoosh.analysis.tokenizers import ( CharsetTokenizer, diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py index f7ac3949..318f1129 100644 --- a/src/whoosh/analysis/acore.py +++ b/src/whoosh/analysis/acore.py @@ -29,16 +29,6 @@ class CompositionError(Exception): - """ - Exception raised when there is an error in the composition of analysis components. - - This exception is raised when there is an error in the composition of analysis components, - such as when incompatible components are combined together. - - Attributes: - message -- explanation of the error - """ - pass @@ -46,15 +36,7 @@ class CompositionError(Exception): def unstopped(tokenstream): - """ - Removes tokens from a token stream where token.stopped = True. - - Parameters: - - tokenstream (generator): A generator of tokens. - - Returns: - - generator: A generator of tokens where token.stopped = False. - """ + """Removes tokens from a token stream where token.stopped = True.""" return (t for t in tokenstream if not t.stopped) @@ -66,26 +48,8 @@ def entoken( with the attributes filled in with reasonable values (for example, if ``positions`` or ``chars`` is True, the function assumes each token was separated by one space). - - Args: - textstream (Iterable[str]): A sequence of unicode strings. - positions (bool, optional): Whether to include position information in the Token objects. Defaults to False. - chars (bool, optional): Whether to include character information in the Token objects. Defaults to False. - start_pos (int, optional): The starting position for the Token objects. Defaults to 0. - start_char (int, optional): The starting character position for the Token objects. Defaults to 0. - **kwargs: Additional keyword arguments to be passed to the Token objects. - - Yields: - Token: A Token object with the attributes filled in based on the input parameters. - - Examples: - >>> textstream = ["Hello", "world"] - >>> for token in entoken(textstream, positions=True, chars=True): - ... print(token.text, token.pos, token.startchar, token.endchar) - ... - Hello 0 0 5 - world 1 5 10 """ + pos = start_pos char = start_char t = Token(positions=positions, chars=chars, **kwargs) @@ -106,6 +70,8 @@ def entoken( # Token object + + class Token: """ Represents a "token" (usually a word) extracted from the source text being @@ -139,18 +105,14 @@ def __init__( self, positions=False, chars=False, removestops=True, mode="", **kwargs ): """ - Initializes a Token object. 
- :param positions: Whether tokens should have the token position in the 'pos' attribute. :param chars: Whether tokens should have character offsets in the 'startchar' and 'endchar' attributes. - :param removestops: Whether to remove stop words from the stream (if + :param removestops: whether to remove stop words from the stream (if the tokens pass through a stop filter). - :param mode: Contains a string describing the purpose for which the + :param mode: contains a string describing the purpose for which the analyzer is being called, i.e. 'index' or 'query'. - :param kwargs: Additional keyword arguments to be stored as attributes - of the Token object. """ self.positions = positions @@ -162,22 +124,10 @@ def __init__( self.__dict__.update(kwargs) def __repr__(self): - """ - Returns a string representation of the Token object. - - :return: A string representation of the Token object. - """ - parms = ", ".join(f"{name}={value!r}" for name, value in self.__dict__.items()) return f"{self.__class__.__name__}({parms})" def copy(self): - """ - Creates a copy of the Token object. - - :return: A copy of the Token object. - """ - # This is faster than using the copy module return Token(**self.__dict__) @@ -186,37 +136,9 @@ def copy(self): class Composable: - """ - A base class for composable objects in the analysis pipeline. - - Composable objects can be combined using the '|' operator to create composite analyzers. - - Attributes: - is_morph (bool): Indicates whether the composable object has morphological analysis. - - Methods: - __or__(self, other): Combines the current composable object with another composable object to create a composite analyzer. - __repr__(self): Returns a string representation of the composable object. - has_morph(self): Checks if the composable object has morphological analysis. - - """ - is_morph = False def __or__(self, other): - """ - Combines the current composable object with another composable object to create a composite analyzer. - - Args: - other (Composable): The composable object to be combined with. - - Returns: - CompositeAnalyzer: The composite analyzer created by combining the two composable objects. - - Raises: - TypeError: If the 'other' object is not an instance of Composable. - - """ from whoosh.analysis.analyzers import CompositeAnalyzer if not isinstance(other, Composable): @@ -224,13 +146,6 @@ def __or__(self, other): return CompositeAnalyzer(self, other) def __repr__(self): - """ - Returns a string representation of the composable object. - - Returns: - str: The string representation of the composable object. - - """ attrs = "" if self.__dict__: attrs = ", ".join( @@ -239,11 +154,4 @@ def __repr__(self): return self.__class__.__name__ + f"({attrs})" def has_morph(self): - """ - Checks if the composable object has morphological analysis. - - Returns: - bool: True if the composable object has morphological analysis, False otherwise. - - """ return self.is_morph diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py index 45deb597..236733d9 100644 --- a/src/whoosh/analysis/analyzers.py +++ b/src/whoosh/analysis/analyzers.py @@ -39,31 +39,16 @@ ) from whoosh.lang.porter import stem - # Analyzers -class Analyzer(Composable): - """Abstract base class for analyzers. - - An analyzer is responsible for processing text data and producing a stream of tokens. - Subclasses of Analyzer should implement the __call__ method to define the tokenization process. 
- Attributes: - None - Methods: - __repr__: Returns a string representation of the analyzer. - __eq__: Checks if two analyzers are equal. - __call__: Processes the input value and returns a stream of tokens. - clean: Cleans up any resources used by the analyzer. - - """ +class Analyzer(Composable): + """Abstract base class for analyzers.""" def __repr__(self): - """Returns a string representation of the analyzer.""" return f"{self.__class__.__name__}()" def __eq__(self, other): - """Checks if two analyzers are equal.""" return ( other and self.__class__ is other.__class__ @@ -71,65 +56,15 @@ def __eq__(self, other): ) def __call__(self, value, **kwargs): - """Processes the input value and returns a stream of tokens. - - Args: - value (str): The input value to be analyzed. - **kwargs: Additional keyword arguments that may be required by specific analyzers. - - Returns: - generator: A generator that yields the tokens produced by the analyzer. - - Raises: - NotImplementedError: If the __call__ method is not implemented by a subclass. - - """ raise NotImplementedError def clean(self): - """Cleans up any resources used by the analyzer. - - This method is intentionally left empty. - - Args: - None - - Returns: - None - - """ + # This method is intentionally left empty. pass class CompositeAnalyzer(Analyzer): - """ - A composite analyzer that combines multiple analyzers and tokenizers into a single analyzer. - - Args: - *composables: Variable number of analyzers and tokenizers to be combined. - - Raises: - CompositionError: If more than one tokenizer is provided at the start of the analyzer. - - Example: - analyzer = CompositeAnalyzer(standard_analyzer(), LowercaseFilter()) - tokens = analyzer("Hello World") - for token in tokens: - print(token) - - """ - def __init__(self, *composables): - """ - Initializes the CompositeAnalyzer. - - Args: - *composables: Variable number of analyzers and tokenizers to be combined. - - Raises: - CompositionError: If more than one tokenizer is provided at the start of the analyzer. - - """ self.items = [] for comp in composables: @@ -138,6 +73,9 @@ def __init__(self, *composables): else: self.items.append(comp) + # Tokenizers must start a chain, and then only filters after that + # (because analyzers take a string and return a generator of tokens, + # and filters take and return generators of tokens) for item in self.items[1:]: if isinstance(item, Tokenizer): raise CompositionError( @@ -145,132 +83,65 @@ def __init__(self, *composables): ) def __repr__(self): - """ - Returns a string representation of the CompositeAnalyzer. - - Returns: - str: String representation of the CompositeAnalyzer. - - """ return "{}({})".format( self.__class__.__name__, ", ".join(repr(item) for item in self.items), ) def __call__(self, value, no_morph=False, **kwargs): - """ - Applies the composite analyzer to the given value and returns a generator of tokens. - - Args: - value (str): The input value to be analyzed. - no_morph (bool, optional): Flag to skip morphological analysis. Defaults to False. - **kwargs: Additional keyword arguments to be passed to the analyzers and tokenizers. - - Returns: - generator: A generator of tokens. - - """ items = self.items + # Start with tokenizer gen = items[0](value, **kwargs) + # Run filters for item in items[1:]: if not (no_morph and hasattr(item, "is_morph") and item.is_morph): gen = item(gen) return gen def __getitem__(self, item): - """ - Returns the item at the specified index. - - Args: - item (int): The index of the item to retrieve. 
- - Returns: - object: The item at the specified index. - - """ return self.items.__getitem__(item) def __len__(self): - """ - Returns the number of items in the CompositeAnalyzer. - - Returns: - int: The number of items in the CompositeAnalyzer. - - """ return len(self.items) def __eq__(self, other): - """ - Checks if the CompositeAnalyzer is equal to another object. - - Args: - other (object): The object to compare with. - - Returns: - bool: True if the CompositeAnalyzer is equal to the other object, False otherwise. - - """ return other and self.__class__ is other.__class__ and self.items == other.items def clean(self): - """ - Cleans up any resources used by the CompositeAnalyzer. - - """ for item in self.items: if hasattr(item, "clean"): item.clean() def has_morph(self): - """ - Checks if the CompositeAnalyzer has any morphological analysis. - - Returns: - bool: True if the CompositeAnalyzer has morphological analysis, False otherwise. - - """ return any(item.is_morph for item in self.items) # Functions that return composed analyzers -def id_analyzer(lowercase=False): +def IDAnalyzer(lowercase=False): + """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if + desired. """ - Returns an analyzer that tokenizes input text into individual tokens using the IDTokenizer. - If lowercase is set to True, it also applies the LowercaseFilter to convert tokens to lowercase. - - Parameters: - - lowercase (bool): Whether to convert tokens to lowercase. Default is False. - - Returns: - - tokenizer (Analyzer): The configured analyzer. - Deprecated: This function is deprecated. It is recommended to use IDTokenizer directly, with a LowercaseFilter if desired. - """ tokenizer = IDTokenizer() if lowercase: tokenizer = tokenizer | LowercaseFilter() return tokenizer -def keyword_analyzer(lowercase=False, commas=False): - """ - Parses whitespace- or comma-separated tokens. - - This analyzer is used to parse whitespace- or comma-separated tokens from a given text. - It can be configured to lowercase the tokens and treat items separated by commas instead of whitespace. +def KeywordAnalyzer(lowercase=False, commas=False): + """Parses whitespace- or comma-separated tokens. - Example usage: - >>> ana = keyword_analyzer() + >>> ana = KeywordAnalyzer() >>> [token.text for token in ana("Hello there, this is a TEST")] ["Hello", "there,", "this", "is", "a", "TEST"] - :param lowercase: A boolean indicating whether to lowercase the tokens. Default is False. - :param commas: A boolean indicating whether items are separated by commas instead of whitespace. Default is False. - :return: A tokenizer object that can be used to tokenize the input text. + :param lowercase: whether to lowercase the tokens. + :param commas: if True, items are separated by commas rather than + whitespace. """ + if commas: tokenizer = CommaSeparatedTokenizer() else: @@ -280,51 +151,34 @@ def keyword_analyzer(lowercase=False, commas=False): return tokenizer -def regex_analyzer(expression=r"\w+(\.?\w+)*", gaps=False): - r""" - Deprecated, just use a RegexTokenizer directly. - - Args: - expression (str, optional): The regular expression pattern to match. Defaults to r"\w+(\.?\w+)*". - gaps (bool, optional): Whether to split on gaps (non-matching substrings) or matches. Defaults to False. - - Returns: - RegexTokenizer: A tokenizer that tokenizes text using a regular expression pattern. 
-    """
+def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
+    """Deprecated, just use a RegexTokenizer directly."""
     return RegexTokenizer(expression=expression, gaps=gaps)


-def simple_analyzer(expression=default_pattern, gaps=False):
-    """
-    Composes a RegexTokenizer with a LowercaseFilter.
-
-    This function creates an analyzer that tokenizes text using a regular expression pattern and converts the tokens to lowercase.
+def SimpleAnalyzer(expression=default_pattern, gaps=False):
+    """Composes a RegexTokenizer with a LowercaseFilter.

-    Example usage:
-    >>> ana = simple_analyzer()
+    >>> ana = SimpleAnalyzer()
     >>> [token.text for token in ana("Hello there, this is a TEST")]
     ["hello", "there", "this", "is", "a", "test"]

-    :param expression: The regular expression pattern to use for token extraction. Defaults to `default_pattern`.
-    :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. Defaults to False.
-    :return: An analyzer object that tokenizes text using the specified regular expression pattern and converts the tokens to lowercase.
+    :param expression: The regular expression pattern to use to extract tokens.
+    :param gaps: If True, the tokenizer *splits* on the expression, rather
+        than matching on the expression.
     """
+
     return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


-def standard_analyzer(
+def StandardAnalyzer(
     expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False
 ):
     """Composes a RegexTokenizer with a LowercaseFilter and optional StopFilter.

-    This analyzer is used to tokenize and filter text into a stream of tokens.
-    It applies a regular expression pattern to extract tokens, converts them to lowercase,
-    and optionally removes stop words.
-
-    Example usage:
-    >>> ana = standard_analyzer()
+    >>> ana = StandardAnalyzer()
     >>> [token.text for token in ana("Testing is testing and testing")]
     ["testing", "testing", "testing"]

@@ -332,11 +186,11 @@ def standard_analyzer(
     :param stoplist: A list of stop words. Set this to None to disable the
         stop word filter.
     :param minsize: Words smaller than this are removed from the stream.
     :param maxsize: Words longer than this are removed from the stream.
     :param gaps: If True, the tokenizer *splits* on the expression, rather
         than matching on the expression.
-    :return: A chain of tokenizers and filters that can be used to analyze text.
     """
+
     ret = RegexTokenizer(expression=expression, gaps=gaps)
     chain = ret | LowercaseFilter()
     if stoplist is not None:
@@ -344,7 +198,7 @@ def standard_analyzer(
     return chain


-def stemming_analyzer(
+def StemmingAnalyzer(
     expression=default_pattern,
     stoplist=STOP_WORDS,
     minsize=2,
@@ -354,49 +208,26 @@ def stemming_analyzer(
     ignore=None,
     cachesize=50000,
 ):
-    r"""
-    Composes a RegexTokenizer with a lower case filter, an optional stop
+    """Composes a RegexTokenizer with a lower case filter, an optional stop
     filter, and a stemming filter.

-    Args:
-        expression (str, optional): The regular expression pattern to use to extract tokens.
-        stoplist (list, optional): A list of stop words. Set this to None to disable the stop word filter.
-        minsize (int, optional): Words smaller than this are removed from the stream.
-        maxsize (int, optional): Words longer than this are removed from the stream.
-        gaps (bool, optional): If True, the tokenizer *splits* on the expression, rather than matching on the expression.
-        stemfn (function, optional): The stemming function to use. Defaults to the `stem` function.
-        ignore (set, optional): A set of words to not stem.
-        cachesize (int, optional): The maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use. Use None for no cache, or -1 for an unbounded cache.
-
-    Returns:
-        Analyzer: The composed analyzer.
-
-    Examples:
-        >>> ana = stemming_analyzer()
-        >>> [token.text for token in ana("Testing is testing and testing")]
-        ["test", "test", "test"]
-
-    This function composes an analyzer that tokenizes text using a regular expression pattern,
-    converts tokens to lowercase, applies an optional stop word filter, and performs stemming
-    on the tokens.
-
-    The `expression` parameter specifies the regular expression pattern to use for token extraction.
-    The `stoplist` parameter is a list of stop words to be filtered out. If set to None, the stop word
-    filter is disabled. The `minsize` and `maxsize` parameters control the minimum and maximum word
-    lengths to keep in the token stream. The `gaps` parameter determines whether the tokenizer splits
-    on the expression or matches on it.
-
-    The `stemfn` parameter specifies the stemming function to use. By default, it uses the `stem` function.
-    The `ignore` parameter is a set of words that should not be stemmed. The `cachesize` parameter sets
-    the maximum number of stemmed words to cache, improving performance at the cost of memory usage.
-
-    The function returns the composed analyzer, which can be used to process text and extract tokens.
-
-    Example usage:
-    >>> analyzer = stemming_analyzer(expression=r'\w+', stoplist=['is', 'and'], minsize=3)
-    >>> [token.text for token in analyzer("Testing is testing and testing")]
+    >>> ana = StemmingAnalyzer()
+    >>> [token.text for token in ana("Testing is testing and testing")]
     ["test", "test", "test"]
+
+    :param expression: The regular expression pattern to use to extract tokens.
+    :param stoplist: A list of stop words. Set this to None to disable
+        the stop word filter.
+    :param minsize: Words smaller than this are removed from the stream.
+    :param maxsize: Words longer than this are removed from the stream.
+    :param gaps: If True, the tokenizer *splits* on the expression, rather
+        than matching on the expression.
+    :param ignore: a set of words to not stem.
+    :param cachesize: the maximum number of stemmed words to cache. The larger
+        this number, the faster stemming will be but the more memory it will
+        use. Use None for no cache, or -1 for an unbounded cache.
     """
+
     ret = RegexTokenizer(expression=expression, gaps=gaps)
     chain = ret | LowercaseFilter()
     if stoplist is not None:
@@ -404,7 +235,7 @@ def stemming_analyzer(
     return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)


-def fancy_analyzer(
+def FancyAnalyzer(
     expression=r"\s+",
     stoplist=STOP_WORDS,
     minsize=2,
@@ -414,36 +245,22 @@ def fancy_analyzer(
     mergewords=False,
     mergenums=False,
 ):
-    """
-    Composes a fancy_analyzer with a RegexTokenizer, IntraWordFilter, LowercaseFilter, and StopFilter.
-
-    This analyzer tokenizes text using a regular expression pattern, applies intra-word filtering,
-    converts tokens to lowercase, and removes stop words.
+    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
+    StopFilter.

-    Example usage:
-    >>> ana = fancy_analyzer()
+    >>> ana = FancyAnalyzer()
     >>> [token.text for token in ana("Should I call getInt or get_real?")]
     ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

-    :param expression: The regular expression pattern to use for token extraction.
-    :type expression: str, optional
-    :param stoplist: A list of stop words. Set this to None to disable the stop word filter.
-    :type stoplist: list or None, optional
-    :param minsize: Words smaller than this are removed from the token stream.
-    :type minsize: int, optional
-    :param gaps: If True, the tokenizer splits on the expression, rather than matching on the expression.
-    :type gaps: bool, optional
-    :param splitwords: If True, intra-word filtering splits words.
-    :type splitwords: bool, optional
-    :param splitnums: If True, intra-word filtering splits numbers.
-    :type splitnums: bool, optional
-    :param mergewords: If True, intra-word filtering merges words.
-    :type mergewords: bool, optional
-    :param mergenums: If True, intra-word filtering merges numbers.
-    :type mergenums: bool, optional
-    :return: A composed analyzer.
-    :rtype: Analyzer
+    :param expression: The regular expression pattern to use to extract tokens.
+    :param stoplist: A list of stop words. Set this to None to disable
+        the stop word filter.
+    :param minsize: Words smaller than this are removed from the stream.
+    :param gaps: If True, the tokenizer *splits* on the expression, rather
+        than matching on the expression.
     """
+
     return (
         RegexTokenizer(expression=expression, gaps=gaps)
         | IntraWordFilter(
@@ -457,24 +274,27 @@ def fancy_analyzer(
     )


-def language_analyzer(lang, expression=default_pattern, gaps=False, cachesize=50000):
-    """
-    Configures a simple analyzer for the given language, with a LowercaseFilter, StopFilter, and StemFilter.
-
-    :param lang: The language code for the analyzer. The list of available languages is in `whoosh.lang.languages`.
-    :param expression: The regular expression pattern to use to extract tokens.
-    :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression.
-    :param cachesize: The maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use.
-    :return: The configured analyzer chain.
+def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000):
+    """Configures a simple analyzer for the given language, with a
+    LowercaseFilter, StopFilter, and StemFilter.

-    Example usage:
-    >>> ana = language_analyzer("es")
+    >>> ana = LanguageAnalyzer("es")
     >>> [token.text for token in ana("Por el mar corren las liebres")]
     ['mar', 'corr', 'liebr']

     The list of available languages is in `whoosh.lang.languages`.
-    You can use `whoosh.lang.has_stemmer` and `whoosh.lang.has_stopwords` to check if a given language has a stemming function and/or stop word list available.
+    You can use :func:`whoosh.lang.has_stemmer` and
+    :func:`whoosh.lang.has_stopwords` to check if a given language has a
+    stemming function and/or stop word list available.
+
+    :param expression: The regular expression pattern to use to extract tokens.
+    :param gaps: If True, the tokenizer *splits* on the expression, rather
+        than matching on the expression.
+    :param cachesize: the maximum number of stemmed words to cache. The larger
+        this number, the faster stemming will be but the more memory it will
+        use.
""" + from whoosh.lang import NoStemmer, NoStopWords # Make the start of the chain diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py index bbf649cc..1fabefa8 100644 --- a/src/whoosh/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -103,15 +103,6 @@ class Filter(Composable): """ def __eq__(self, other): - """ - Compare this object with another object for equality. - - Args: - other: The object to compare with. - - Returns: - bool: True if the objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -119,27 +110,9 @@ def __eq__(self, other): ) def __ne__(self, other): - """ - Check if the current object is not equal to another object. - - Parameters: - - other: The object to compare with. - - Returns: - - bool: True if the objects are not equal, False otherwise. - """ return self != other def __call__(self, tokens): - """ - Applies the filter to the given list of tokens. - - Args: - tokens (list): The list of tokens to be filtered. - - Returns: - list: The filtered list of tokens. - """ raise NotImplementedError @@ -147,35 +120,18 @@ class PassFilter(Filter): """An identity filter: passes the tokens through untouched.""" def __call__(self, tokens): - """ - Apply the pass filter to the given tokens. - - Parameters: - tokens (list): The list of tokens to be filtered. - - Returns: - list: The filtered list of tokens, which is the same as the input list. - """ return tokens class LoggingFilter(Filter): """Prints the contents of every filter that passes through as a debug log entry. - - This filter is used to log the contents of each token that passes through it. It can be helpful for debugging purposes or for monitoring the tokenization process. - - Args: - logger (Logger, optional): The logger to use for logging the token contents. If not provided, the "whoosh.analysis" logger is used. - """ def __init__(self, logger=None): """ - Initializes a new instance of the LoggingFilter class. - - Args: - logger (Logger, optional): The logger to use. If omitted, the "whoosh.analysis" logger is used. + :param target: the logger to use. If omitted, the "whoosh.analysis" + logger is used. """ if logger is None: @@ -185,17 +141,6 @@ def __init__(self, logger=None): self.logger = logger def __call__(self, tokens): - """ - Applies the filter to the given tokens. - - Args: - tokens (iterable): The tokens to filter. - - Yields: - Token: The filtered tokens. - - """ - logger = self.logger for t in tokens: logger.debug(repr(t)) @@ -205,22 +150,6 @@ def __call__(self, tokens): class MultiFilter(Filter): """Chooses one of two or more sub-filters based on the 'mode' attribute of the token stream. - - This class is used to apply different filters to a token stream based on - the value of the 'mode' attribute of each token. It allows you to associate - different filters with different 'mode' attribute values and apply the - appropriate filter to each token. - - Attributes: - default_filter (Filter): The default filter to use when no matching - 'mode' attribute is found. Defaults to PassFilter(). - filters (dict): A dictionary that maps 'mode' attribute values to - instantiated filters. 
- - Example: - >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) - >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) - >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) """ default_filter = PassFilter() @@ -229,25 +158,16 @@ def __init__(self, **kwargs): """Use keyword arguments to associate mode attribute values with instantiated filters. - Args: - **kwargs: Keyword arguments where the key is the 'mode' attribute - value and the value is the instantiated filter. + >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) + >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) + >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) - Note: - This class expects that the value of the mode attribute is consistent - among all tokens in a token stream. + This class expects that the value of the mode attribute is consistent + among all tokens in a token stream. """ self.filters = kwargs def __eq__(self, other): - """Check if two MultiFilter instances are equal. - - Args: - other (MultiFilter): The other MultiFilter instance to compare. - - Returns: - bool: True if the two MultiFilter instances are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -255,17 +175,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """Apply the appropriate filter to each token in the token stream. - - Args: - tokens (iterable): An iterable of tokens. - - Returns: - iterable: An iterable of filtered tokens. - - Note: - Only the first token is used to determine the appropriate filter to apply. - """ # Only selects on the first token t = next(tokens) selected_filter = self.filters.get(t.mode, self.default_filter) @@ -275,12 +184,9 @@ def __call__(self, tokens): class TeeFilter(Filter): r"""Interleaves the results of two or more filters (or filter chains). - This filter takes the output of multiple filters or filter chains and interleaves them together. - It is useful when you want to apply different transformations to the same input and combine the results. + NOTE: because it needs to create copies of each token for each sub-filter, + this filter is quite slow. - NOTE: This filter can be slow because it needs to create copies of each token for each sub-filter. - - Usage: >>> target = "ALFA BRAVO CHARLIE" >>> # In one branch, we'll lower-case the tokens >>> f1 = LowercaseFilter() @@ -301,41 +207,14 @@ class TeeFilter(Filter): """ def __init__(self, *filters): - """ - Initialize the TeeFilter with the provided filters. - - Args: - *filters: Variable number of filters or filter chains to be interleaved. - - Raises: - ValueError: If less than two filters are provided. - """ if len(filters) < 2: raise ValueError("TeeFilter requires two or more filters") self.filters = filters def __eq__(self, other): - """ - Check if two TeeFilter instances are equal. - - Args: - other: Another TeeFilter instance. - - Returns: - bool: True if the two instances are equal, False otherwise. - """ return self.__class__ is other.__class__ and self.filters == other.fitlers def __call__(self, tokens): - """ - Apply the TeeFilter to the input tokens. - - Args: - tokens: The input tokens to be filtered. - - Yields: - Token: The interleaved tokens from the filters. - """ from itertools import tee count = len(self.filters) @@ -360,119 +239,36 @@ def __call__(self, tokens): class ReverseTextFilter(Filter): """Reverses the text of each token. - This filter takes a stream of tokens and reverses the text of each token. 
- It can be used as part of an analysis pipeline to modify the text of tokens. - - Example: - >>> ana = RegexTokenizer() | ReverseTextFilter() - >>> [token.text for token in ana("hello there")] - ["olleh", "ereht"] - + >>> ana = RegexTokenizer() | ReverseTextFilter() + >>> [token.text for token in ana("hello there")] + ["olleh", "ereht"] """ def __call__(self, tokens): - """Apply the reverse text transformation to each token. - - Args: - tokens (iterable): A stream of tokens. - - Yields: - Token: A token with the reversed text. - - """ for t in tokens: t.text = t.text[::-1] yield t class LowercaseFilter(Filter): - """A filter that uses unicode.lower() to lowercase token text. - - This filter converts the text of each token to lowercase using the unicode.lower() method. - It is commonly used in text analysis pipelines to normalize the case of tokens. - - Example: - >>> rext = RegexTokenizer() - >>> stream = rext("This is a TEST") - >>> [token.text for token in LowercaseFilter(stream)] - ["this", "is", "a", "test"] - - Usage: - 1. Create an instance of the LowercaseFilter class. - 2. Pass a stream of tokens to the instance using the __call__ method. - 3. Iterate over the filtered tokens to access the lowercase text. - - Note: - The LowercaseFilter modifies the text of each token in-place. It does not create new tokens. + """Uses unicode.lower() to lowercase token text. + >>> rext = RegexTokenizer() + >>> stream = rext("This is a TEST") + >>> [token.text for token in LowercaseFilter(stream)] + ["this", "is", "a", "test"] """ def __call__(self, tokens): - """Applies the lowercase transformation to each token in the stream. - - Args: - tokens (iterable): A stream of tokens. - - Yields: - Token: A token with its text converted to lowercase. - - """ for t in tokens: t.text = t.text.lower() yield t class StripFilter(Filter): - """Calls unicode.strip() on the token text. - - This filter is used to remove leading and trailing whitespace from the token text. - It is typically used in text analysis pipelines to clean up the tokenized text. - - Example usage: - ------------- - from whoosh.analysis import Token, Tokenizer, TokenFilter - - class MyTokenizer(Tokenizer): - def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, - start_pos=0, start_char=0, mode='', **kwargs): - # Tokenize the value - tokens = self.tokenizer(value, positions=positions, chars=chars, - keeporiginal=keeporiginal, removestops=removestops, - start_pos=start_pos, start_char=start_char, mode=mode, **kwargs) - - # Apply the StripFilter to remove leading and trailing whitespace - tokens = StripFilter()(tokens) - - return tokens - - # Create an instance of MyTokenizer - tokenizer = MyTokenizer() - - # Tokenize a text - text = " Hello, World! " - tokens = tokenizer(text) - - # Print the tokens - for token in tokens: - print(token.text) - - Output: - ------- - Hello, - World! - - """ + """Calls unicode.strip() on the token text.""" def __call__(self, tokens): - """Applies the strip() method to the token text. - - Args: - tokens (iterable of whoosh.analysis.Token): The input tokens. - - Yields: - whoosh.analysis.Token: The modified tokens with leading and trailing whitespace removed. - - """ for t in tokens: t.text = t.text.strip() yield t @@ -484,58 +280,33 @@ class StopFilter(Filter): Make sure you precede this filter with a :class:`LowercaseFilter`. - Args: - stoplist (collection, optional): A collection of words to remove from the stream. - This is converted to a frozenset. 
The default is a list of - common English stop words. - minsize (int, optional): The minimum length of token texts. Tokens with - text smaller than this will be stopped. The default is 2. - maxsize (int, optional): The maximum length of token texts. Tokens with text - larger than this will be stopped. Use None to allow any length. - renumber (bool, optional): Change the 'pos' attribute of unstopped tokens - to reflect their position with the stopped words removed. - lang (str, optional): Automatically get a list of stop words for the given - language. - - Attributes: - stops (frozenset): The set of stop words. - min (int): The minimum length of token texts. - max (int): The maximum length of token texts. - renumber (bool): Indicates whether the 'pos' attribute of unstopped tokens - should be changed to reflect their position with the stopped words removed. - - Examples: - >>> stopper = RegexTokenizer() | StopFilter() - >>> [token.text for token in stopper(u"this is a test")] - ["test"] - >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") - >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] - ["lapiz", "mesa"] - - Note: - The list of available languages is in `whoosh.lang.languages`. - You can use :func:`whoosh.lang.has_stopwords` to check if a given language - has a stop word list available. + >>> stopper = RegexTokenizer() | StopFilter() + >>> [token.text for token in stopper(u"this is a test")] + ["test"] + >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") + >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] + ["lapiz", "mesa"] + + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stopwords` to check if a given language + has a stop word list available. """ def __init__( self, stoplist=STOP_WORDS, minsize=2, maxsize=None, renumber=True, lang=None ): """ - Initialize the StopFilter. - - Args: - stoplist (collection, optional): A collection of words to remove from the stream. - This is converted to a frozenset. The default is a list of - common English stop words. - minsize (int, optional): The minimum length of token texts. Tokens with - text smaller than this will be stopped. The default is 2. - maxsize (int, optional): The maximum length of token texts. Tokens with text - larger than this will be stopped. Use None to allow any length. - renumber (bool, optional): Change the 'pos' attribute of unstopped tokens - to reflect their position with the stopped words removed. - lang (str, optional): Automatically get a list of stop words for the given - language + :param stoplist: A collection of words to remove from the stream. + This is converted to a frozenset. The default is a list of + common English stop words. + :param minsize: The minimum length of token texts. Tokens with + text smaller than this will be stopped. The default is 2. + :param maxsize: The maximum length of token texts. Tokens with text + larger than this will be stopped. Use None to allow any length. + :param renumber: Change the 'pos' attribute of unstopped tokens + to reflect their position with the stopped words removed. + :param lang: Automatically get a list of stop words for the given + language """ stops = set() @@ -552,15 +323,6 @@ def __init__( self.renumber = renumber def __eq__(self, other): - """ - Compare the StopFilter with another object for equality. - - Args: - other (object): The object to compare with. - - Returns: - bool: True if the objects are equal, False otherwise. 
- """ return ( other and self.__class__ is other.__class__ @@ -570,15 +332,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """ - Apply the StopFilter to the tokens. - - Args: - tokens (iterable): The input tokens. - - Yields: - Token: The filtered tokens. - """ stoplist = self.stops minsize = self.min maxsize = self.max @@ -610,65 +363,45 @@ def __call__(self, tokens): class CharsetFilter(Filter): - """ - Translates the text of tokens by calling unicode.translate() using the + """Translates the text of tokens by calling unicode.translate() using the supplied character mapping object. This is useful for case and accent folding. - The `whoosh.support.charset` module has a useful map for accent folding. - - Example usage: - - ```python - from whoosh.support.charset import accent_map - from whoosh.analysis import RegexTokenizer + The ``whoosh.support.charset`` module has a useful map for accent folding. - retokenizer = RegexTokenizer() - chfilter = CharsetFilter(accent_map) - tokens = chfilter(retokenizer(u'café')) - [t.text for t in tokens] - # Output: [u'cafe'] - ``` + >>> from whoosh.support.charset import accent_map + >>> retokenizer = RegexTokenizer() + >>> chfilter = CharsetFilter(accent_map) + >>> [t.text for t in chfilter(retokenizer(u'café'))] + [u'cafe'] Another way to get a character mapping object is to convert a Sphinx - charset table file using `whoosh.support.charset.charset_table_to_dict`. - - Example usage: - - ```python - from whoosh.support.charset import charset_table_to_dict, default_charset - from whoosh.analysis import RegexTokenizer + charset table file using + :func:`whoosh.support.charset.charset_table_to_dict`. - retokenizer = RegexTokenizer() - charmap = charset_table_to_dict(default_charset) - chfilter = CharsetFilter(charmap) - tokens = chfilter(retokenizer(u'Stra\\xdfe')) - [t.text for t in tokens] - # Output: [u'strase'] - ``` + >>> from whoosh.support.charset import charset_table_to_dict + >>> from whoosh.support.charset import default_charset + >>> retokenizer = RegexTokenizer() + >>> charmap = charset_table_to_dict(default_charset) + >>> chfilter = CharsetFilter(charmap) + >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))] + [u'strase'] The Sphinx charset table format is described at - https://www.sphinxsearch.com/docs/current.html#conf-charset-table. + http://www.sphinxsearch.com/docs/current.html#conf-charset-table. """ __inittypes__ = {"charmap": dict} def __init__(self, charmap): """ - Initializes a CharsetFilter object. - - :param charmap: A dictionary mapping from integer character numbers to + :param charmap: a dictionary mapping from integer character numbers to unicode characters, as required by the unicode.translate() method. """ + self.charmap = charmap def __eq__(self, other): - """ - Checks if two CharsetFilter objects are equal. - - :param other: The other CharsetFilter object to compare. - :return: True if the two objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -676,12 +409,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """ - Applies the CharsetFilter to a sequence of tokens. - - :param tokens: An iterable sequence of tokens. - :return: A generator that yields the transformed tokens. - """ assert hasattr(tokens, "__iter__") charmap = self.charmap for t in tokens: @@ -696,61 +423,37 @@ class DelimitedAttributeFilter(Filter): The defaults are set up to use the ``^`` character as a delimiter and store the value after the ``^`` as the boost for the token. 
- Args: - delimiter (str): A string that, when present in a token's text, separates - the actual text from the "data" payload. - attribute (str): The name of the attribute in which to store the data on - the token. - default (Any): The value to use for the attribute for tokens that don't have - delimited data. - type (type): The type of the data, for example ``str`` or ``float``. This is - used to convert the string value of the data before storing it in the - attribute. - - Example: - >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") - >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() - >>> for t in ana(u("image render^2 file^0.5")): - ... print("%r %f" % (t.text, t.boost)) - 'image' 1.0 - 'render' 2.0 - 'file' 0.5 - - Note: - You need to make sure your tokenizer includes the delimiter and data as part - of the token! + >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") + >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() + >>> for t in ana(u("image render^2 file^0.5")) + ... print("%r %f" % (t.text, t.boost)) + 'image' 1.0 + 'render' 2.0 + 'file' 0.5 + + Note that you need to make sure your tokenizer includes the delimiter and + data as part of the token! """ def __init__(self, delimiter="^", attribute="boost", default=1.0, type=float): """ - Initialize the DelimitedAttributeFilter. - - Args: - delimiter (str): A string that, when present in a token's text, separates - the actual text from the "data" payload. - attribute (str): The name of the attribute in which to store the data on - the token. - default (Any): The value to use for the attribute for tokens that don't have - delimited data. - type (type): The type of the data, for example ``str`` or ``float``. This is - used to convert the string value of the data before storing it in the - attribute. + :param delimiter: a string that, when present in a token's text, + separates the actual text from the "data" payload. + :param attribute: the name of the attribute in which to store the + data on the token. + :param default: the value to use for the attribute for tokens that + don't have delimited data. + :param type: the type of the data, for example ``str`` or ``float``. + This is used to convert the string value of the data before + storing it in the attribute. """ + self.delim = delimiter self.attr = attribute self.default = default self.type = type def __eq__(self, other): - """ - Compare the DelimitedAttributeFilter with another object for equality. - - Args: - other (Any): The object to compare with. - - Returns: - bool: True if the objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -760,15 +463,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """ - Apply the DelimitedAttributeFilter to a sequence of tokens. - - Args: - tokens (Iterable[Token]): The sequence of tokens to filter. - - Yields: - Token: The filtered tokens. - """ delim = self.delim attr = self.attr default = self.default @@ -791,59 +485,33 @@ def __call__(self, tokens): class SubstitutionFilter(Filter): """Performs a regular expression substitution on the token text. - This filter applies a regular expression substitution to the text of each token. - It is particularly useful for removing or replacing specific patterns of text within tokens. - The filter utilizes the `re.sub()` method to perform the substitution. 
- - Example usage: - -------------- - # Create an analyzer that removes hyphens from tokens - tokenizer = RegexTokenizer(r"\\S+") - substitution_filter = SubstitutionFilter("-", "") - analyzer = tokenizer | substitution_filter - - Parameters: - ----------- - pattern : str or Pattern - A pattern string or compiled regular expression object describing the text to replace. - replacement : str - The substitution text. - - Methods: - -------- - __call__(tokens) - Applies the substitution filter to the given tokens. + This is especially useful for removing text from tokens, for example + hyphens:: + + ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") + Because it has the full power of the re.sub() method behind it, this filter + can perform some fairly complex transformations. For example, to take + tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', + 'f=e'``:: + + # Analyzer that swaps the text on either side of an equal sign + rt = RegexTokenizer(r"\\S+") + sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") + ana = rt | sf """ def __init__(self, pattern, replacement): """ - Initializes a SubstitutionFilter object. - - Parameters: - ----------- - pattern : str or Pattern - A pattern string or compiled regular expression object describing the text to replace. - replacement : str - The substitution text. + :param pattern: a pattern string or compiled regular expression object + describing the text to replace. + :param replacement: the substitution text. """ + self.pattern = rcompile(pattern) self.replacement = replacement def __eq__(self, other): - """ - Checks if two SubstitutionFilter objects are equal. - - Parameters: - ----------- - other : SubstitutionFilter - The other SubstitutionFilter object to compare. - - Returns: - -------- - bool - True if the two SubstitutionFilter objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -852,19 +520,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """ - Applies the substitution filter to the given tokens. - - Parameters: - ----------- - tokens : iterable - An iterable of Token objects. - - Yields: - ------- - Token - The modified Token objects after applying the substitution filter. - """ pattern = self.pattern replacement = self.replacement diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py index 9eea4c78..ae22e58b 100644 --- a/src/whoosh/analysis/intraword.py +++ b/src/whoosh/analysis/intraword.py @@ -43,49 +43,27 @@ class CompoundWordFilter(Filter): The ``keep_compound`` argument lets you decide whether to keep the compound word in the token stream along with the word segments. - Args: - wordset (object): An object with a ``__contains__`` method, such as a - set, containing strings to look for inside the tokens. - keep_compound (bool, optional): If True (the default), the original compound - token will be retained in the stream before the subwords. 
- - Example: - >>> cwf = CompoundWordFilter(wordset, keep_compound=True) - >>> analyzer = RegexTokenizer(r"\S+") | cwf - >>> [t.text for t in analyzer("I do not like greeneggs and ham")] - ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] - >>> cwf.keep_compound = False - >>> [t.text for t in analyzer("I do not like greeneggs and ham")] - ["I", "do", "not", "like", "green", "eggs", "and", "ham"] + >>> cwf = CompoundWordFilter(wordset, keep_compound=True) + >>> analyzer = RegexTokenizer(r"\S+") | cwf + >>> [t.text for t in analyzer("I do not like greeneggs and ham") + ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] + >>> cwf.keep_compound = False + >>> [t.text for t in analyzer("I do not like greeneggs and ham") + ["I", "do", "not", "like", "green", "eggs", "and", "ham"] """ def __init__(self, wordset, keep_compound=True): """ - Initialize the CompoundWordFilter. - - Args: - wordset (object): An object with a ``__contains__`` method, such as a - set, containing strings to look for inside the tokens. - keep_compound (bool, optional): If True (the default), the original compound - token will be retained in the stream before the subwords. + :param wordset: an object with a ``__contains__`` method, such as a + set, containing strings to look for inside the tokens. + :param keep_compound: if True (the default), the original compound + token will be retained in the stream before the subwords. """ self.wordset = wordset self.keep_compound = keep_compound def subwords(self, s, memo): - """ - Recursively break a compound word into its individual parts. - - Args: - s (str): The compound word to be broken down. - memo (dict): A dictionary to store previously computed subwords. - - Returns: - list or None: A list of subwords if the compound word can be broken down, - None otherwise. - """ - if s in self.wordset: return [s] if s in memo: @@ -104,16 +82,6 @@ def subwords(self, s, memo): return None def __call__(self, tokens): - """ - Apply the CompoundWordFilter to a stream of tokens. - - Args: - tokens (iterable): The input stream of tokens. - - Yields: - Token: The modified tokens after applying the filter. - """ - keep_compound = self.keep_compound memo = {} subwords = self.subwords @@ -130,45 +98,27 @@ def __call__(self, tokens): class BiWordFilter(Filter): - """Merges adjacent tokens into "bi-word" tokens. + """Merges adjacent tokens into "bi-word" tokens, so that for example:: - This filter merges adjacent tokens into "bi-word" tokens. For example, the tokens - "the", "sign", "of", "four" would be transformed into "the-sign", "sign-of", "of-four". + "the", "sign", "of", "four" - Bi-word tokens can be used to create fields for pseudo-phrase searching. If all the - terms in a query match the document, it probably contains the phrase. Using bi-word - tokens can make the searching faster than actually doing a phrase search on individual - word terms. + becomes:: - The `BiWordFilter` is much faster than using the otherwise equivalent `ShingleFilter(2)`. + "the-sign", "sign-of", "of-four" - Args: - sep (str): The separator to use when merging adjacent tokens. Default is "-". + This can be used to create fields for pseudo-phrase searching, where if + all the terms match the document probably contains the phrase, but the + searching is faster than actually doing a phrase search on individual word + terms. + The ``BiWordFilter`` is much faster than using the otherwise equivalent + ``ShingleFilter(2)``. 
""" def __init__(self, sep="-"): - """ - Initializes the IntrawordFilter with the specified separator character. - - Args: - sep (str): The separator character used to split words. Defaults to "-". - """ self.sep = sep def __call__(self, tokens): - """Merges adjacent tokens into bi-word tokens. - - This method takes a stream of tokens and merges adjacent tokens into "bi-word" tokens. - It yields the bi-word tokens as it iterates through the input token stream. - - Args: - tokens (iterable): The input token stream. - - Yields: - Token: The bi-word tokens. - - """ sep = self.sep prev_text = None prev_startchar = None @@ -383,15 +333,6 @@ def __init__( self.mergenums = mergenums def __eq__(self, other): - """ - Check if this object is equal to another object. - - Args: - other: The object to compare with. - - Returns: - bool: True if the objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -399,16 +340,6 @@ def __eq__(self, other): ) def _split(self, string): - """ - Splits the given string into indexable substrings based on the specified boundaries. - - Args: - string (str): The input string to be split. - - Yields: - tuple: A tuple containing the start and end indices of each indexable substring. - - """ bound = self.boundary # Yields (startchar, endchar) pairs for each indexable substring in @@ -460,21 +391,6 @@ def _split(self, string): yield (part_start, part_end) def _merge(self, parts): - """ - Merges consecutive parts in the given list based on their type (alpha or digit). - - Args: - parts (list): The list of parts to be merged. Each part is a tuple of the form (text, pos, startchar, endchar). - - Returns: - None. The original list of parts is modified in-place. - - Example: - parts = [('hello', 0, 0, 4), ('world', 1, 6, 10), ('123', 2, 12, 14)] - _merge(parts) - print(parts) - # Output: [('helloworld', 0, 0, 10), ('123', 2, 12, 14)] - """ mergewords = self.mergewords mergenums = self.mergenums @@ -533,18 +449,6 @@ def insert_item(buf, at, newpos): insert_item(buf, len(parts), pos) def __call__(self, tokens): - """ - Applies the intraword filter to the given tokens. - - This filter renumbers tokens as it expands them. It splits tokens on delimiters, word and/or number boundaries, - and merges consecutive runs of all-letters and/or all-numbers if the options are set. - - Parameters: - - tokens (list): The list of tokens to be processed. - - Returns: - - generator: A generator that yields the processed tokens. - """ mergewords = self.mergewords mergenums = self.mergenums diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py index 796eb28f..addbfad6 100644 --- a/src/whoosh/analysis/morph.py +++ b/src/whoosh/analysis/morph.py @@ -37,81 +37,50 @@ class StemFilter(Filter): root word (for example, "rendering", "renders", "rendered", etc.) to a single word in the index. - Args: - stemfn (object): The function to use for stemming. Default is the Porter stemming algorithm for English. - lang (str): If not None, overrides the stemfn with a language stemmer from the `whoosh.lang.snowball` package. - ignore (list): A set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. - cachesize (int): The maximum number of words to cache. Use -1 for an unbounded cache, or None for no caching. - - Attributes: - is_morph (bool): Indicates if the filter is a morphological filter. 
- - Methods: - __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): Initializes the StemFilter object. - __getstate__(self): Returns the state of the object for pickling. - __setstate__(self, state): Sets the state of the object after unpickling. - clear(self): Clears the stem function and sets it based on the provided parameters. - cache_info(self): Returns information about the cache used by the stem function. - __eq__(self, other): Compares two StemFilter objects for equality. - __call__(self, tokens): Applies stemming to the tokens. - - Examples: - stemmer = RegexTokenizer() | StemFilter() - [token.text for token in stemmer("fundamentally willows")] - Output: ["fundament", "willow"] - - stemfilter = StemFilter(stem_function) - stemfilter = StemFilter(lang="ru") - """ - - __inittypes__ = {"stemfn": object, "ignore": list} - is_morph = True - - def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): - """ - Initializes the StemFilter object. - - Args: - stemfn (object): The function to use for stemming. Default is the Porter stemming algorithm for English. - lang (str): If not None, overrides the stemfn with a language stemmer from the `whoosh.lang.snowball` package. - ignore (list): A set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. - cachesize (int): The maximum number of words to cache. Use -1 for an unbounded cache, or None for no caching. - - Raises: - TypeError: If the `stemfn` argument is not callable. - ValueError: If the `cachesize` argument is not a positive integer or None. - - Notes: - The StemFilter object is used to apply stemming to tokens during the analysis process. Stemming is the process of reducing words to their base or root form, which can help improve search accuracy by treating different forms of the same word as equivalent. + >>> stemmer = RegexTokenizer() | StemFilter() + >>> [token.text for token in stemmer("fundamentally willows")] + ["fundament", "willow"] - The `stemfn` argument specifies the function to use for stemming. By default, the Porter stemming algorithm for English is used. You can provide your own custom stemming function if desired. + You can pass your own stemming function to the StemFilter. The default + is the Porter stemming algorithm for English. - The `lang` argument allows you to override the `stemfn` with a language stemmer from the `whoosh.lang.snowball` package. If `lang` is not None, the stemmer for the specified language will be used instead of the `stemfn`. + >>> stemfilter = StemFilter(stem_function) - The `ignore` argument is a set/list of words that should not be stemmed. If you omit this argument, all tokens will be stemmed. The `ignore` set/list is converted into a frozenset for efficient lookup. + You can also use one of the Snowball stemming functions by passing the + `lang` keyword argument. - The `cachesize` argument specifies the maximum number of words to cache. Caching can improve performance by avoiding redundant stemming operations. Use -1 for an unbounded cache, or None for no caching. + >>> stemfilter = StemFilter(lang="ru") - Example: - # Initialize StemFilter with default settings - stem_filter = StemFilter() + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stemmer` to check if a given language has + a stemming function available. 
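+
+    A minimal sketch of guarding on stemmer availability (``"ru"`` is only an
+    example language)::
+
+        from whoosh.lang import has_stemmer
+
+        if has_stemmer("ru"):
+            ana = RegexTokenizer() | StemFilter(lang="ru")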
- # Initialize StemFilter with custom stemming function - def custom_stemmer(word): - # custom stemming logic - return stemmed_word + By default, this class wraps an LRU cache around the stemming function. The + ``cachesize`` keyword argument sets the size of the cache. To make the + cache unbounded (the class caches every input), use ``cachesize=-1``. To + disable caching, use ``cachesize=None``. - stem_filter = StemFilter(stemfn=custom_stemmer) + If you compile and install the py-stemmer library, the + :class:`PyStemmerFilter` provides slightly easier access to the language + stemmers in that library. + """ - # Initialize StemFilter with language stemmer - stem_filter = StemFilter(lang='english') + __inittypes__ = {"stemfn": object, "ignore": list} - # Initialize StemFilter with ignored words - stem_filter = StemFilter(ignore=['apple', 'banana', 'orange']) + is_morph = True - # Initialize StemFilter with caching disabled - stem_filter = StemFilter(cachesize=None) + def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): + """ + :param stemfn: the function to use for stemming. + :param lang: if not None, overrides the stemfn with a language stemmer + from the ``whoosh.lang.snowball`` package. + :param ignore: a set/list of words that should not be stemmed. This is + converted into a frozenset. If you omit this argument, all tokens + are stemmed. + :param cachesize: the maximum number of words to cache. Use ``-1`` for + an unbounded cache, or ``None`` for no caching. """ + self.stemfn = stemfn self.lang = lang self.ignore = frozenset() if ignore is None else frozenset(ignore) @@ -120,77 +89,13 @@ def custom_stemmer(word): self.clear() def __getstate__(self): - """ - Get the state of the object for pickling. - - This method is called by the pickle module when pickling an object. - It returns a dictionary representing the state of the object, excluding - the '_stem' attribute. - - Returns: - dict: The state of the object without the '_stem' attribute. - - Example: - >>> obj = MyObject() - >>> state = obj.__getstate__() - >>> print(state) - {'attr1': value1, 'attr2': value2, ...} - - Note: - This method is automatically called by the pickle module and should - not be called directly by user code. - """ # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): - """ - Set the state of the object during unpickling. - - This method is called by the pickle module when unpickling an object. - It sets the state of the object based on the provided state dictionary. - - Parameters: - - state (dict): The state dictionary containing the object's attributes. - - Notes: - - This method is primarily used for backward compatibility with older versions - of the StemFilter class. - - It checks for old instances of StemFilter class and updates the state - accordingly. - - If the 'cachesize' attribute is not present in the state dictionary, it - sets the 'cachesize' attribute to a default value of 50000. - - If the 'ignores' attribute is present in the state dictionary, it sets the - 'ignore' attribute to the value of 'ignores'. - - If the 'ignore' attribute is not present in the state dictionary, it sets - the 'ignore' attribute to an empty frozenset. - - If the 'lang' attribute is not present in the state dictionary, it sets the - 'lang' attribute to None. 
- - If the 'cache' attribute is present in the state dictionary, it removes the - 'cache' attribute from the state dictionary. - - Returns: - - None - - Example: - >>> state = { - ... 'cachesize': 10000, - ... 'ignores': {'word1', 'word2'}, - ... 'lang': 'en', - ... 'cache': {}, - ... } - >>> obj = StemFilter() - >>> obj.__setstate__(state) - >>> obj.cachesize - 10000 - >>> obj.ignore - {'word1', 'word2'} - >>> obj.lang - 'en' - >>> 'cache' in obj.__dict__ - False - """ + # Check for old instances of StemFilter class, which didn't have a + # cachesize attribute and pickled the cache attribute if "cachesize" not in state: self.cachesize = 50000 if "ignores" in state: @@ -203,28 +108,10 @@ def __setstate__(self, state): del state["cache"] self.__dict__.update(state) + # Set the _stem attribute self.clear() def clear(self): - """ - Clears the stem function and sets it based on the provided parameters. - - This method clears the current stem function and sets it based on the provided parameters. - If the language is specified, it retrieves the stemmer function for that language from the 'whoosh.lang' module. - Otherwise, it uses the stem function that was previously set. - - If the 'cachesize' parameter is an integer and not equal to 0, it creates a cache for the stem function. - If 'cachesize' is a negative integer, an unbound cache is created using the stem function. - If 'cachesize' is a positive integer greater than 1, an LFU (Least Frequently Used) cache is created with the specified size. - - If 'cachesize' is not an integer or equal to 0, no cache is created and the stem function is used directly. - - Note: The stem function is responsible for transforming words into their base or root form. - - Usage: - morph = MorphAnalyzer() - morph.clear() - """ if self.lang: from whoosh.lang import stemmer_for_language @@ -241,67 +128,16 @@ def clear(self): self._stem = stemfn def cache_info(self): - """ - Returns information about the cache used by the stem function. - - The cache_info method provides information about the cache used by the stem function. - It returns an object that contains details such as the number of cache hits, misses, - and the current size of the cache. - - Returns: - cache_info (object): An object containing information about the cache used by the stem function. - The object has the following attributes: - - hits (int): The number of cache hits. - - misses (int): The number of cache misses. - - maxsize (int): The maximum size of the cache. - - currsize (int): The current size of the cache. - - Returns None if caching is disabled. - """ if self.cachesize <= 1: return None return self._stem.cache_info() def __eq__(self, other): - """ - Compares two StemFilter objects for equality. - - This method compares the current StemFilter object with another StemFilter object - to determine if they are equal. Two StemFilter objects are considered equal if they - are of the same class and have the same stem function. - - Args: - other (StemFilter): The other StemFilter object to compare. - - Returns: - bool: True if the two StemFilter objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ and self.stemfn == other.stemfn ) def __call__(self, tokens): - """ - Applies stemming to the tokens. - - This method applies stemming to the given tokens using the specified stemmer. - It iterates over the tokens, checks if the token is not stopped, and if the token's text - is not in the ignore list. 
If the conditions are met, the token's text is stemmed using
-        the stemmer's stem function.
-
-        Args:
-            tokens (iterable): The tokens to apply stemming to.
-
-        Yields:
-            Token: The stemmed tokens.
-
-        Example:
-            >>> stemmer = Stemmer()
-            >>> tokens = [Token("running"), Token("jumps"), Token("jumping")]
-            >>> stemmed_tokens = stemmer(tokens)
-            >>> list(stemmed_tokens)
-            [Token("run"), Token("jump"), Token("jump")]
-        """
         stemfn = self._stem
         ignore = self.ignore
 
@@ -318,45 +154,19 @@ class PyStemmerFilter(StemFilter):
     third-party library. You must have the py-stemmer library installed to use
     this filter.
 
-    Args:
-        lang (str, optional): A string identifying the stemming algorithm to use.
-            You can get a list of available algorithms by using the `algorithms()`
-            method. The identification strings are directly from the py-stemmer library.
-            Defaults to "english".
-        ignore (set or list, optional): A set or list of words that should not be stemmed.
-            If provided, these words will be excluded from the stemming process.
-            Defaults to None.
-        cachesize (int, optional): The maximum number of words to cache. Defaults to 10000.
-
-    Attributes:
-        lang (str): The language identifier for the stemming algorithm.
-        ignore (frozenset): The set of words to be ignored during stemming.
-        cachesize (int): The maximum number of words to cache.
-        _stem (function): The stemmer function used for stemming.
-
-    Methods:
-        algorithms(): Returns a list of stemming algorithms provided by the py-stemmer library.
-        cache_info(): Returns information about the cache (not implemented).
-        __getstate__(): Returns the state of the object for pickling (excluding _stem attribute).
-        __setstate__(): Sets the state of the object after unpickling.
-
-    Example:
-        >>> filter = PyStemmerFilter("spanish")
+    >>> PyStemmerFilter("spanish")
     """
 
     def __init__(self, lang="english", ignore=None, cachesize=10000):
         """
-        Initialize the PyStemmerFilter.
-
-        Args:
-            lang (str, optional): A string identifying the stemming algorithm to use.
-                You can get a list of available algorithms by using the `algorithms()`
-                method. The identification strings are directly from the py-stemmer library.
-                Defaults to "english".
-            ignore (set or list, optional): A set or list of words that should not be stemmed.
-                If provided, these words will be excluded from the stemming process.
-                Defaults to None.
-            cachesize (int, optional): The maximum number of words to cache. Defaults to 10000.
+        :param lang: a string identifying the stemming algorithm to use. You
+            can get a list of available algorithms with the
+            :meth:`PyStemmerFilter.algorithms` method. The identification
+            strings are directly from the py-stemmer library.
+        :param ignore: a set/list of words that should not be stemmed. This is
+            converted into a frozenset. If you omit this argument, all tokens
+            are stemmed.
+        :param cachesize: the maximum number of words to cache.
         """
 
         self.lang = lang
@@ -365,52 +175,18 @@ def __init__(self, lang="english", ignore=None, cachesize=10000):
         self._stem = self._get_stemmer_fn()
 
     def algorithms(self):
+        """Returns a list of stemming algorithms provided by the py-stemmer
+        library.
        """
-        Returns a list of stemming algorithms provided by the py-stemmer library.
-
-        This method uses the py-stemmer library to retrieve a list of available stemming algorithms.
-        Stemming algorithms are used to reduce words to their base or root form, which can be useful
-        in natural language processing tasks such as information retrieval, text mining, and language
-        modeling.
- Returns: - list: A list of strings representing the names of available stemming algorithms. - - Example: - >>> analyzer = Analyzer() - >>> algorithms = analyzer.algorithms() - >>> print(algorithms) - ['porter', 'snowball'] - """ import Stemmer # type: ignore @UnresolvedImport return Stemmer.algorithms() def cache_info(self): - """Returns information about the cache. - - This method is not implemented and always returns None. - - Returns: - None: This method does not provide any information about the cache. - """ return None def _get_stemmer_fn(self): - """ - Returns a stemmer function for the specified language. - - This function imports the Stemmer module and initializes a stemmer object - with the specified language. The stemmer object is then configured with - the specified cache size. Finally, the stemWord method of the stemmer - object is returned as the stemmer function. - - Returns: - callable: A stemmer function that takes a word as input and returns its stem. - - Raises: - ImportError: If the Stemmer module cannot be imported. - """ import Stemmer # type: ignore @UnresolvedImport stemmer = Stemmer.Stemmer(self.lang) @@ -418,53 +194,13 @@ def _get_stemmer_fn(self): return stemmer.stemWord def __getstate__(self): - """ - Get the state of the object for pickling. - - This method is called by the pickle module when pickling an object. - It returns a dictionary representing the object's state, excluding the - '_stem' attribute. - - Returns: - dict: A dictionary representing the object's state. - - Note: - The '_stem' attribute is excluded from the state because dynamic - functions cannot be pickled. - - """ + # Can't pickle a dynamic function, so we have to remove the _stem + # attribute from the state return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): - """ - Set the state of the object during unpickling. - - This method is called by the pickle module when unpickling an object. - It is responsible for setting the state of the object based on the - provided `state` dictionary. - - Parameters: - state (dict): The dictionary containing the state of the object. - - Returns: - None - - Raises: - None - - Notes: - - This method is used to handle backward compatibility with old - instances of the `StemFilter` class. - - If the `state` dictionary does not contain the key "cachesize", - the `cachesize` attribute is set to the default value of 10000. - - If the `state` dictionary contains the key "ignores", the `ignore` - attribute is set to the value of "ignores". - - If the `state` dictionary does not contain the key "ignore", the - `ignore` attribute is set to an empty frozenset. - - The "cache" key is removed from the `state` dictionary. - - The `state` dictionary is used to update the object's attributes. - - The `_stem` attribute is set using the `_get_stemmer_fn` method. - """ + # Check for old instances of StemFilter class, which didn't have a + # cachesize attribute and pickled the cache attribute if "cachesize" not in state: self.cachesize = 10000 if "ignores" in state: @@ -484,45 +220,26 @@ class DoubleMetaphoneFilter(Filter): Metaphone algorithm. This algorithm attempts to encode words in such a way that similar-sounding words reduce to the same code. This may be useful for fields containing the names of people and places, and other uses where - tolerance of spelling differences is desirable. - - Args: - primary_boost (float, optional): The boost to apply to the token containing the - primary code. Defaults to 1.0. 
-        secondary_boost (float, optional): The boost to apply to the token containing the
-            secondary code, if any. Defaults to 0.5.
-        combine (bool, optional): If True, the original unencoded tokens are kept in the
-            stream, preceding the encoded tokens. Defaults to False.
+    tolerance of spelling differences is desirable.
     """
 
     is_morph = True
 
     def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
         """
-        Initialize a MorphAnalyzer object.
-
-        Args:
-            primary_boost (float, optional): The boost factor for primary morphological analysis. Defaults to 1.0.
-            secondary_boost (float, optional): The boost factor for secondary morphological analysis. Defaults to 0.5.
-            combine (bool, optional): Whether to combine the results of primary and secondary analysis. Defaults to False.
+        :param primary_boost: the boost to apply to the token containing the
+            primary code.
+        :param secondary_boost: the boost to apply to the token containing the
+            secondary code, if any.
+        :param combine: if True, the original unencoded tokens are kept in the
+            stream, preceding the encoded tokens.
         """
+
         self.primary_boost = primary_boost
         self.secondary_boost = secondary_boost
         self.combine = combine
 
     def __eq__(self, other):
-        """
-        Check if two objects are equal.
-
-        This method compares the current object with another object to determine if they are equal.
-        The comparison is based on the class type and the primary_boost attribute.
-
-        Parameters:
-        - other: The object to compare with.
-
-        Returns:
-        - bool: True if the objects are equal, False otherwise.
-        """
         return (
             other
             and self.__class__ is other.__class__
@@ -530,30 +247,6 @@ def __eq__(self, other):
         )
 
     def __call__(self, tokens):
-        """
-        Applies morphological analysis to a sequence of tokens.
-
-        Args:
-            tokens (iterable): The input tokens to be analyzed.
-
-        Yields:
-            Token: The analyzed tokens with modified text and boost.
-
-        Notes:
-            This method applies morphological analysis to each token in the input sequence.
-            It uses the double metaphone algorithm to generate primary and secondary forms of the token's text.
-            The token's text and boost are then modified based on the generated forms and yielded.
-
-        Example:
-            >>> analyzer = MorphAnalyzer()
-            >>> tokens = [Token("running", boost=1.0), Token("swimming", boost=0.8)]
-            >>> analyzed_tokens = list(analyzer(tokens))
-            >>> for token in analyzed_tokens:
-            ...     print(token.text, token.boost)
-            ...
-            run 1.0
-            swim 0.8
-        """
         primary_boost = self.primary_boost
         secondary_boost = self.secondary_boost
         combine = self.combine
diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py
index 61f0d06d..a42fc37e 100644
--- a/src/whoosh/analysis/ngrams.py
+++ b/src/whoosh/analysis/ngrams.py
@@ -35,59 +35,32 @@
 class NgramTokenizer(Tokenizer):
     """Splits input text into N-grams instead of words.
 
-    This tokenizer splits the input text into N-grams, where an N-gram is a
-    contiguous sequence of N characters. The N-grams emitted by this tokenizer
-    may contain whitespace, punctuation, and other characters. If you only want
-    sub-word N-grams without whitespace, you can combine a RegexTokenizer with
-    NgramFilter instead.
-
-    Example:
-        ngt = NgramTokenizer(4)
-        tokens = [token.text for token in ngt("hi there")]
-        # tokens = ["hi t", "i th", " the", "ther", "here"]
-
-    Note:
-        This tokenizer does not use a regular expression to extract words, so
-        the N-grams emitted by it will contain whitespace, punctuation, etc.
-        You may want to massage the input or add a custom filter to this
-        tokenizer's output.
- - Args: - minsize (int): The minimum size of the N-grams. - maxsize (int, optional): The maximum size of the N-grams. If not - provided, maxsize will be set to minsize. - - Attributes: - min (int): The minimum size of the N-grams. - max (int): The maximum size of the N-grams. + >>> ngt = NgramTokenizer(4) + >>> [token.text for token in ngt("hi there")] + ["hi t", "i th", " the", "ther", "here"] + Note that this tokenizer does NOT use a regular expression to extract + words, so the grams emitted by it will contain whitespace, punctuation, + etc. You may want to massage the input or add a custom filter to this + tokenizer's output. + + Alternatively, if you only want sub-word grams without whitespace, you + could combine a RegexTokenizer with NgramFilter instead. """ __inittypes__ = {"minsize": int, "maxsize": int} def __init__(self, minsize, maxsize=None): """ - Initialize the NgramTokenizer. - - Args: - minsize (int): The minimum size of the N-grams. - maxsize (int, optional): The maximum size of the N-grams. If not - provided, maxsize will be set to minsize. - + :param minsize: The minimum size of the N-grams. + :param maxsize: The maximum size of the N-grams. If you omit + this parameter, maxsize == minsize. """ + self.min = minsize self.max = maxsize or minsize def __eq__(self, other): - """ - Check if two ngram objects are equal. - - Args: - other (Ngram): The other ngram object to compare with. - - Returns: - bool: True if the ngram objects are equal, False otherwise. - """ if self.__class__ is other.__class__: if self.min == other.min and self.max == other.max: return True @@ -105,37 +78,6 @@ def __call__( mode="", **kwargs, ): - """ - Tokenizes the given value into n-grams. - - Args: - value (str): The input string to be tokenized. - positions (bool, optional): Whether to include position information in the tokens. Defaults to False. - chars (bool, optional): Whether to include character offset information in the tokens. Defaults to False. - keeporiginal (bool, optional): Whether to keep the original token text. Defaults to False. - removestops (bool, optional): Whether to remove stop words from the tokens. Defaults to True. - start_pos (int, optional): The starting position for position information. Defaults to 0. - start_char (int, optional): The starting character offset. Defaults to 0. - mode (str, optional): The tokenization mode. Defaults to "". - - Yields: - Token: The generated tokens. - - Raises: - AssertionError: If the input value is not a string. - - Note: - This method tokenizes the input string into n-grams based on the specified parameters. It generates tokens - by sliding a window of size `self.min` to `self.max` over the input string. The generated tokens can include - position information, character offset information, and original token text depending on the specified - parameters. - - If `mode` is set to "query", the method generates tokens by sliding a window of size `self.max` over the - input string. This is typically used for query tokenization. - - If `mode` is not set to "query", the method generates tokens by sliding a window of size `self.min` to - `self.max` over the input string. This is typically used for indexing tokenization. - """ assert isinstance(value, str), f"{value!r} is not unicode" inlen = len(value) @@ -180,6 +122,8 @@ def __call__( # Filter + + class NgramFilter(Filter): """Splits token text into N-grams. 
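+
+    A minimal sketch of how the filter is usually composed (output shown for
+    illustration)::
+
+        >>> rext = RegexTokenizer()
+        >>> ngf = NgramFilter(4)
+        >>> [token.text for token in ngf(rext("hello there"))]
+        ["hell", "ello", "ther", "here"]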
@@ -211,15 +155,6 @@ def __init__(self, minsize, maxsize=None, at=None): self.at = 1 def __eq__(self, other): - """ - Check if two ngrams objects are equal. - - Args: - other (object): The object to compare with. - - Returns: - bool: True if the two ngrams objects are equal, False otherwise. - """ return ( other and self.__class__ is other.__class__ @@ -228,29 +163,6 @@ def __eq__(self, other): ) def __call__(self, tokens): - """ - Tokenizes the given tokens into N-grams. - - Args: - tokens (iterable): The input tokens to be tokenized. - - Yields: - Token: The generated N-gram tokens. - - Raises: - AssertionError: If the input tokens are not iterable. - - Note: - This method tokenizes the input tokens into N-grams based on the specified parameters. It generates N-gram tokens by sliding a window of size `self.min` to `self.max` over the input tokens. - - If the token's text length is less than `self.min`, the token is skipped. - - If the token's mode is set to "query", the method generates N-gram tokens by sliding a window of size `self.max` over the token's text. This is typically used for query tokenization. - - If the token's mode is not set to "query", the method generates N-gram tokens by sliding a window of size `self.min` to `self.max` over the token's text. This is typically used for indexing tokenization. - - The generated N-gram tokens can include position information, character offset information, and original token text depending on the specified parameters. - """ assert hasattr(tokens, "__iter__") at = self.at for t in tokens: @@ -321,44 +233,18 @@ def __call__(self, tokens): # Analyzers -def ngram_analyzer(minsize, maxsize=None): - """ - Composes an NgramTokenizer and a LowercaseFilter. - - Args: - minsize (int): The minimum size of the n-grams. - maxsize (int, optional): The maximum size of the n-grams. Defaults to None. - - Returns: - Analyzer: An analyzer that tokenizes text into n-grams and applies lowercase filtering. +def NgramAnalyzer(minsize, maxsize=None): + """Composes an NgramTokenizer and a LowercaseFilter. - Examples: - >>> ana = ngram_analyzer(4) - >>> [token.text for token in ana("hi there")] - ["hi t", "i th", " the", "ther", "here"] + >>> ana = NgramAnalyzer(4) + >>> [token.text for token in ana("hi there")] + ["hi t", "i th", " the", "ther", "here"] """ + return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter() -def ngram_word_analyzer(minsize, maxsize=None, tokenizer=None, at=None): - """ - Creates an analyzer that tokenizes text into n-grams. - - Args: - minsize (int): The minimum size of the n-grams. - maxsize (int, optional): The maximum size of the n-grams. Defaults to None. - tokenizer (Tokenizer, optional): The tokenizer to use. Defaults to None. - at (str, optional): The position at which to split the n-grams. Defaults to None. - - Returns: - Analyzer: The n-gram word analyzer. - - Example: - >>> analyzer = ngram_word_analyzer(2, 3) - >>> tokens = analyzer("Hello world") - >>> list(tokens) - ['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld'] - """ +def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None): if not tokenizer: tokenizer = RegexTokenizer() return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at) diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py index 5058d295..449576ab 100644 --- a/src/whoosh/analysis/tokenizers.py +++ b/src/whoosh/analysis/tokenizers.py @@ -35,37 +35,9 @@ class Tokenizer(Composable): - """Base class for tokenizers. 
- - Tokenizers are responsible for breaking text into individual tokens. This base class - provides the basic structure and behavior that all tokenizers should follow. - - Subclasses should override the `tokenize` method to implement the tokenization logic. - - Example usage: - tokenizer = Tokenizer() - tokens = tokenizer.tokenize("Hello, world!") - for token in tokens: - print(token) - - Attributes: - None - - Methods: - __eq__(self, other): Compare if two tokenizers are equal. - - """ + """Base class for Tokenizers.""" def __eq__(self, other): - """Compare if two tokenizers are equal. - - Args: - other (object): The other tokenizer object to compare. - - Returns: - bool: True if the tokenizers are equal, False otherwise. - - """ return other and self.__class__ is other.__class__ @@ -73,23 +45,9 @@ class IDTokenizer(Tokenizer): """Yields the entire input string as a single token. For use in indexed but untokenized fields, such as a document's path. - Example: - idt = IDTokenizer() - [token.text for token in idt("/a/b 123 alpha")] - Output: ["/a/b 123 alpha"] - - Args: - positions (bool, optional): Whether to store token positions. Defaults to False. - chars (bool, optional): Whether to store token character offsets. Defaults to False. - keeporiginal (bool, optional): Whether to store the original token text. Defaults to False. - removestops (bool, optional): Whether to remove stop words. Defaults to True. - start_pos (int, optional): The starting position of the token. Defaults to 0. - start_char (int, optional): The starting character offset of the token. Defaults to 0. - mode (str, optional): The tokenization mode. Defaults to "". - **kwargs: Additional keyword arguments. - - Yields: - Token: The token object containing the token information. + >>> idt = IDTokenizer() + >>> [token.text for token in idt("/a/b 123 alpha")] + ["/a/b 123 alpha"] """ def __call__( @@ -104,27 +62,6 @@ def __call__( mode="", **kwargs, ): - """ - Tokenizes the given value and yields a Token object. - - Args: - value (str): The input string to be tokenized. - positions (bool, optional): Whether to include position information in the Token object. Defaults to False. - chars (bool, optional): Whether to include character information in the Token object. Defaults to False. - keeporiginal (bool, optional): Whether to store the original value in the Token object. Defaults to False. - removestops (bool, optional): Whether to remove stop words from the Token object. Defaults to True. - start_pos (int, optional): The starting position of the Token object. Defaults to 0. - start_char (int, optional): The starting character position of the Token object. Defaults to 0. - mode (str, optional): The tokenization mode. Defaults to "". - **kwargs: Additional keyword arguments to be passed to the Token object. - - Yields: - Token: A Token object representing a tokenized value. - - Raises: - AssertionError: If the input value is not a string. - - """ assert isinstance(value, str), f"{value!r} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) t.text = value @@ -143,47 +80,25 @@ class RegexTokenizer(Tokenizer): """ Uses a regular expression to extract tokens from text. - Example: >>> rex = RegexTokenizer() >>> [token.text for token in rex("hi there 3.141 big-time under_score")] ["hi", "there", "3.141", "big", "time", "under_score"] + """ - Args: - expression (Union[str, Pattern]): A regular expression object or string. 
Each match + def __init__(self, expression=default_pattern, gaps=False): + """ + :param expression: A regular expression object or string. Each match of the expression equals a token. Group 0 (the entire matched text) is used as the text of the token. If you require more complicated handling of the expression match, simply write your own tokenizer. - gaps (bool): If True, the tokenizer *splits* on the expression, rather + :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. - """ - - def __init__(self, expression=default_pattern, gaps=False): - """ - Initialize the RegexTokenizer. - - Args: - expression (Union[str, Pattern]): A regular expression object or string. Each match - of the expression equals a token. Group 0 (the entire matched text) - is used as the text of the token. If you require more complicated - handling of the expression match, simply write your own tokenizer. - gaps (bool): If True, the tokenizer *splits* on the expression, rather - than matching on the expression. """ self.expression = rcompile(expression) self.gaps = gaps def __eq__(self, other): - """ - Compare the RegexTokenizer with another object for equality. - - Args: - other (object): The object to compare with. - - Returns: - bool: True if the objects are equal, False otherwise. - """ - if self.__class__ is other.__class__: if self.expression.pattern == other.expression.pattern: return True @@ -203,21 +118,16 @@ def __call__( **kwargs, ): """ - Tokenize the input value using the RegexTokenizer. - - Args: - value (str): The unicode string to tokenize. - positions (bool): Whether to record token positions in the token. - chars (bool): Whether to record character offsets in the token. - keeporiginal (bool): Whether to keep the original text of the token. - removestops (bool): Whether to remove stop words from the token. - start_pos (int): The position number of the first token. - start_char (int): The offset of the first character of the first token. - tokenize (bool): If True, the text should be tokenized. - mode (str): The tokenization mode. - - Yields: - Token: The generated tokens. + :param value: The unicode string to tokenize. + :param positions: Whether to record token positions in the token. + :param chars: Whether to record character offsets in the token. + :param start_pos: The position number of the first token. For example, + if you set start_pos=2, the tokens will be numbered 2,3,4,... + instead of 0,1,2,... + :param start_char: The offset of the first character of the first + token. For example, if you set start_char=2, the text "aaa bbb" + will have chars (2,5),(6,9) instead (0,3),(4,7). + :param tokenize: if True, the text should be tokenized. """ assert isinstance(value, str), f"{repr(value)} is not unicode" @@ -315,24 +225,12 @@ class CharsetTokenizer(Tokenizer): def __init__(self, charmap): """ - Initialize the Tokenizer with a character map. - - :param charmap: A mapping from integer character numbers to Unicode + :param charmap: a mapping from integer character numbers to unicode characters, as used by the unicode.translate() method. - :type charmap: dict """ self.charmap = charmap def __eq__(self, other): - """ - Compare this tokenizer with another tokenizer for equality. - - Parameters: - - other: The other tokenizer to compare with. - - Returns: - - True if the tokenizers are equal, False otherwise. 
- """ return ( other and self.__class__ is other.__class__ @@ -353,22 +251,16 @@ def __call__( **kwargs, ): """ - Tokenizes a given unicode string. - :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. - :param keeporiginal: Whether to keep the original text in the token. - :param removestops: Whether to remove stop words from the token. - :param start_pos: The position number of the first token. - :param start_char: The offset of the first character of the first token. - :param tokenize: If True, the text should be tokenized. - :param mode: The tokenization mode. - :param kwargs: Additional keyword arguments. - - :return: A generator that yields Token objects. - - :raises AssertionError: If the value is not a unicode string. + :param start_pos: The position number of the first token. For example, + if you set start_pos=2, the tokens will be numbered 2,3,4,... + instead of 0,1,2,... + :param start_char: The offset of the first character of the first + token. For example, if you set start_char=2, the text "aaa bbb" + will have chars (2,5),(6,9) instead (0,3),(4,7). + :param tokenize: if True, the text should be tokenized. """ assert isinstance(value, str), f"{value!r} is not unicode" @@ -424,48 +316,27 @@ def __call__( def SpaceSeparatedTokenizer(): - """ - Returns a RegexTokenizer that splits tokens by whitespace. - - This tokenizer splits input text into tokens based on whitespace characters (spaces, tabs, newlines). - It uses a regular expression pattern to match and extract tokens. - - Example: - sst = SpaceSeparatedTokenizer() - tokens = [token.text for token in sst("hi there big-time, what's up")] - print(tokens) - # Output: ["hi", "there", "big-time,", "what's", "up"] - - Returns: - A RegexTokenizer object that tokenizes input text based on whitespace. - - Note: - The regular expression pattern used by this tokenizer is r"[^ \t\r\n]+", - which matches one or more characters that are not whitespace. + """Returns a RegexTokenizer that splits tokens by whitespace. + >>> sst = SpaceSeparatedTokenizer() + >>> [token.text for token in sst("hi there big-time, what's up")] + ["hi", "there", "big-time,", "what's", "up"] """ + return RegexTokenizer(r"[^ \t\r\n]+") def CommaSeparatedTokenizer(): - """ - Tokenizes text by splitting tokens using commas. - - This tokenizer splits the input text into tokens by using commas as the delimiter. - It also applies the `StripFilter` to remove leading and trailing whitespace from each token. + """Splits tokens by commas. - Example: - >>> cst = CommaSeparatedTokenizer() - >>> [token.text for token in cst("hi there, what's , up")] - ["hi there", "what's", "up"] - - Returns: - A tokenizer object that can be used to tokenize text. - - Note: - The tokenizer relies on the `RegexTokenizer` and `StripFilter` classes from the `whoosh.analysis` module. + Note that the tokenizer calls unicode.strip() on each match of the regular + expression. + >>> cst = CommaSeparatedTokenizer() + >>> [token.text for token in cst("hi there, what's , up")] + ["hi there", "what's", "up"] """ + from whoosh.analysis.filters import StripFilter return RegexTokenizer(r"[^,]+") | StripFilter() @@ -474,45 +345,12 @@ def CommaSeparatedTokenizer(): class PathTokenizer(Tokenizer): """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens ``["/a", "/a/b", "/a/b/c"]``. 
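+
+    A short sketch of the expected output (illustrative)::
+
+        >>> pt = PathTokenizer()
+        >>> [t.text for t in pt("/alfa/bravo/charlie")]
+        ["/alfa", "/alfa/bravo", "/alfa/bravo/charlie"]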
- - Args: - expression (str, optional): The regular expression pattern used to tokenize the input string. - Defaults to "[^/]+". - - Attributes: - expr (Pattern): The compiled regular expression pattern. - """ def __init__(self, expression="[^/]+"): - """ - Initialize the Tokenizer with the given regular expression pattern. - - Args: - expression (str, optional): The regular expression pattern used for tokenization. - Defaults to "[^/]+". - - Returns: - None - """ self.expr = rcompile(expression) def __call__(self, value, positions=False, start_pos=0, **kwargs): - """Tokenizes the input string. - - Args: - value (str): The input string to be tokenized. - positions (bool, optional): Whether to include token positions. Defaults to False. - start_pos (int, optional): The starting position for token positions. Defaults to 0. - **kwargs: Additional keyword arguments. - - Yields: - Token: The generated tokens. - - Raises: - AssertionError: If the input value is not a string. - - """ assert isinstance(value, str), f"{value!r} is not unicode" token = Token(positions, **kwargs) pos = start_pos diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py index 791d7842..024dc8c6 100644 --- a/src/whoosh/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -9,43 +9,10 @@ class Marker: - """ - Represents a marker object. - - Markers are used to identify specific points in a program or data structure. - They can be used to mark positions in a Finite State Automaton (FSA) or any - other context where a named reference is needed. - - Attributes: - name (str): The name of the marker. - - Methods: - __repr__(): Returns a string representation of the marker. - - Example: - >>> marker = Marker("start") - >>> marker.name - 'start' - >>> repr(marker) - '' - """ - def __init__(self, name): - """ - Initializes a new Marker object. - - Args: - name (str): The name of the marker. - """ self.name = name def __repr__(self): - """ - Returns a string representation of the marker. - - Returns: - str: A string representation of the marker. - """ return f"<{self.name}>" @@ -54,76 +21,18 @@ def __repr__(self): # Base class -class FSA: - """ - Finite State Automaton (FSA) class. - - This class represents a finite state automaton, which is a mathematical model used to describe - sequential logic circuits and pattern matching algorithms. It consists of states, transitions, - and final states. - - Attributes: - initial (object): The initial state of the automaton. - transitions (dict): A dictionary that maps source states to a dictionary of labels and - destination states. - final_states (set): A set of final states in the automaton. - - Methods: - __len__(): Returns the total number of states in the automaton. - __eq__(other): Checks if two automata are equal. - all_states(): Returns a set of all states in the automaton. - all_labels(): Returns a set of all labels used in the automaton. - get_labels(src): Returns an iterator of labels for a given source state. - generate_all(state=None, sofar=""): Generates all possible strings accepted by the automaton. - start(): Returns the initial state of the automaton. - next_state(state, label): Returns the next state given the current state and a label. - is_final(state): Checks if a given state is a final state. - add_transition(src, label, dest): Adds a transition from a source state to a destination state - with a given label. - add_final_state(state): Adds a final state to the automaton. - to_dfa(): Converts the automaton to a deterministic finite automaton (DFA). 
- accept(string, debug=False): Checks if a given string is accepted by the automaton. - append(fsa): Appends another automaton to the current automaton. - """ +class FSA: def __init__(self, initial): - """ - Initialize a Finite State Automaton (FSA) with the given initial state. - - Args: - initial: The initial state of the FSA. - - Attributes: - initial (State): The initial state of the FSA. - transitions (dict): A dictionary mapping states to dictionaries of transitions. - Each transition dictionary maps input symbols to destination states. - final_states (set): A set of final states in the FSA. - - """ self.initial = initial self.transitions = {} self.final_states = set() def __len__(self): - """ - Returns the number of states in the finite state automaton. - - :return: The number of states in the automaton. - :rtype: int - """ return len(self.all_states()) def __eq__(self, other): - """ - Check if two Finite State Automata (FSAs) are equal. - - Args: - other (FSA): The other FSA to compare with. - - Returns: - bool: True if the FSAs are equal, False otherwise. - """ if self.initial != other.initial: return False if self.final_states != other.final_states: @@ -133,114 +42,21 @@ def __eq__(self, other): return st == ot def all_states(self): - """ - Returns a set of all states in the automaton. - - This method iterates over the transitions in the automaton and collects all the states - encountered. It returns a set containing all the unique states. - - Returns: - set: A set of all states in the automaton. - - Example: - >>> automaton = FSA() - >>> automaton.add_transition('A', 'B', 'a') - >>> automaton.add_transition('B', 'C', 'b') - >>> automaton.add_transition('C', 'D', 'c') - >>> automaton.all_states() - {'A', 'B', 'C', 'D'} - - """ stateset = set(self.transitions) for trans in self.transitions.values(): stateset.update(trans.values()) return stateset def all_labels(self): - """ - Returns a set of all labels used in the automaton. - - This method iterates over all transitions in the automaton and collects - all unique labels used in those transitions. The labels are returned as - a set. - - Returns: - set: A set of all labels used in the automaton. - - Example: - >>> automaton = FSA() - >>> automaton.add_transition(0, 1, 'a') - >>> automaton.add_transition(1, 2, 'b') - >>> automaton.add_transition(2, 3, 'a') - >>> automaton.all_labels() - {'a', 'b'} - - """ labels = set() for trans in self.transitions.values(): labels.update(trans) return labels def get_labels(self, src): - """ - Returns an iterator of labels for a given source state. - - Args: - src (object): The source state. - - Returns: - iterator: An iterator of labels for the given source state. - - Raises: - None - - Examples: - >>> fsa = FSA() - >>> src_state = State() - >>> fsa.add_transition(src_state, 'a', State()) - >>> fsa.add_transition(src_state, 'b', State()) - >>> labels = fsa.get_labels(src_state) - >>> list(labels) - ['a', 'b'] - - Notes: - - This method returns an iterator of labels for the given source state. - - If the source state has no transitions, an empty iterator will be returned. - """ return iter(self.transitions.get(src, [])) def generate_all(self, state=None, sofar=""): - """ - Generates all possible strings accepted by the automaton. - - Args: - state (object, optional): The current state. Defaults to the initial state. - sofar (str, optional): The string generated so far. Defaults to an empty string. - - Yields: - str: The generated string. 
- - Returns: - None - - Raises: - None - - Examples: - # Create an automaton - automaton = Automaton() - - # Generate all possible strings - for string in automaton.generate_all(): - print(string) - - Notes: - - This method uses a recursive approach to generate all possible strings accepted by the automaton. - - The `state` parameter represents the current state of the automaton. If not provided, it defaults to the initial state. - - The `sofar` parameter represents the string generated so far. If not provided, it defaults to an empty string. - - The method yields each generated string one by one, allowing for efficient memory usage when dealing with large automata. - - """ state = self.start() if state is None else state if self.is_final(state): yield sofar @@ -249,177 +65,24 @@ def generate_all(self, state=None, sofar=""): yield from self.generate_all(newstate, sofar + label) def start(self): - """ - Returns the initial state of the automaton. - - Returns: - object: - The initial state of the automaton. - - Raises: - None. - - Examples: - >>> automaton = FSA() - >>> initial_state = automaton.start() - """ return self.initial def next_state(self, state, label): - """ - Returns the next state given the current state and a label. - - Args: - state (object): The current state. - The current state of the finite state automaton. - - label (object): The label. - The label representing the transition from the current state to the next state. - - Returns: - object: The next state. - The next state of the finite state automaton based on the current state and label. - - Raises: - NotImplementedError: This method should be implemented in a subclass. - This exception is raised when the `next_state` method is called on the base class - and not overridden in a subclass. - - """ raise NotImplementedError def is_final(self, state): - """ - Checks if a given state is a final state. - - Args: - state (object): The state to check. - - Returns: - bool: True if the state is a final state, False otherwise. - - Raises: - NotImplementedError: This method should be implemented in a subclass. - - Examples: - >>> fsa = FSA() - >>> fsa.is_final(0) - False - >>> fsa.is_final(1) - True - - Notes: - This method should be implemented in a subclass to provide the specific logic for determining - whether a state is a final state or not. By default, it raises a NotImplementedError. - - """ raise NotImplementedError def add_transition(self, src, label, dest): - """ - Adds a transition from a source state to a destination state with a given label. - - Args: - src (object): The source state. - label (object): The label. - dest (object): The destination state. - - Raises: - NotImplementedError: This method should be implemented in a subclass. - - Returns: - None - - Example: - >>> fsa = FSA() - >>> src = State('A') - >>> dest = State('B') - >>> label = 'transition' - >>> fsa.add_transition(src, label, dest) - - """ raise NotImplementedError def add_final_state(self, state): - """ - Adds a final state to the automaton. - - Args: - state (object): The final state to add. - - Raises: - NotImplementedError: This method should be implemented in a subclass. - - Example: - >>> automaton = Automaton() - >>> automaton.add_final_state(5) - - This method should be implemented in a subclass to add a final state to the automaton. - A final state is a state that marks the end of a sequence of transitions in the automaton. - The `state` parameter should be an object representing the final state to be added. 
- - Note: - This method raises a NotImplementedError to indicate that it should be implemented in a subclass. - - """ raise NotImplementedError def to_dfa(self): - """ - Converts the automaton to a deterministic finite automaton (DFA). - - This method takes the current automaton and converts it into an equivalent - deterministic finite automaton (DFA). The resulting DFA will have the same - language recognition capabilities as the original automaton, but with a - potentially different internal representation. - - Returns: - DFA: The converted DFA. - - Raises: - NotImplementedError: This method should be implemented in a subclass. - - Example: - >>> nfa = NFA() - >>> # Add states, transitions, and final states to the NFA - >>> dfa = nfa.to_dfa() - >>> # Use the converted DFA for further processing - - Note: - The `to_dfa` method should be implemented in a subclass to provide the - conversion logic specific to that automaton type. - - """ raise NotImplementedError def accept(self, string, debug=False): - """ - Checks if a given string is accepted by the automaton. - - Args: - string (str): The string to check. - debug (bool, optional): Whether to print debug information. Defaults to False. - - Returns: - bool: True if the string is accepted, False otherwise. - - Raises: - None - - Examples: - >>> automaton = Automaton() - >>> automaton.accept("abc") - True - >>> automaton.accept("def") - False - - Notes: - This method iterates over each character in the input string and transitions the automaton - to the next state based on the current state and the input label. If the automaton reaches - a non-final state or encounters an invalid label, it breaks the loop and returns False. - If the automaton reaches a final state after processing the entire string, it returns True. - - """ state = self.start() for label in string: @@ -433,33 +96,6 @@ def accept(self, string, debug=False): return self.is_final(state) def append(self, fsa): - """ - Appends another automaton to the current automaton. - - Args: - fsa (FSA): The automaton to append. - - Returns: - None - - Raises: - None - - Notes: - This method appends the transitions and final states of the given automaton - to the current automaton. It updates the transitions dictionary by adding - the transitions from the given automaton. It also adds epsilon transitions - from each final state of the current automaton to the initial state of the - given automaton. Finally, it updates the final states of the current automaton - to be the final states of the given automaton. - - Example: - fsa1 = FSA() - fsa2 = FSA() - # ... code to define transitions and final states for fsa1 and fsa2 ... - fsa1.append(fsa2) - # Now fsa1 contains the appended transitions and final states from fsa2. - """ self.transitions.update(fsa.transitions) for state in self.final_states: self.add_transition(state, EPSILON, fsa.initial) @@ -470,66 +106,12 @@ def append(self, fsa): class NFA(FSA): - """ - NFA (Non-Deterministic Finite Automaton) class represents a non-deterministic finite automaton. - It is a subclass of FSA (Finite State Automaton). - - Attributes: - transitions (dict): A dictionary that maps source states to a dictionary of labels and destination states. - final_states (set): A set of final states. - initial: The initial state of the NFA. - - Methods: - dump(stream=sys.stdout): Prints a textual representation of the NFA to the specified stream. - start(): Returns the initial state of the NFA as a frozenset. 
- add_transition(src, label, dest): Adds a transition from source state to destination state with the specified label. - add_final_state(state): Adds a final state to the NFA. - triples(): Generates all possible triples (source state, label, destination state) in the NFA. - is_final(states): Checks if any of the given states is a final state. - _expand(states): Expands the given set of states by following epsilon transitions. - next_state(states, label): Returns the set of states that can be reached from the given states with the specified label. - get_labels(states): Returns the set of labels that can be reached from the given states. - embed(other): Copies all transitions from another NFA into this NFA. - insert(src, other, dest): Connects the source state to the initial state of another NFA, and the final states of the other NFA to the destination state. - to_dfa(): Converts the NFA to a DFA (Deterministic Finite Automaton). - """ - def __init__(self, initial): - """ - Initializes a Finite State Automaton (FSA) object. - - Parameters: - - initial: The initial state of the FSA. - - Attributes: - - transitions: A dictionary representing the transitions between states. - - final_states: A set containing the final states of the FSA. - - initial: The initial state of the FSA. - """ self.transitions = {} self.final_states = set() self.initial = initial def dump(self, stream=sys.stdout): - """ - Prints a textual representation of the NFA to the specified stream. - - Args: - stream (file): The stream to print the representation to. Defaults to sys.stdout. - - Returns: - None - - Raises: - None - - Example: - nfa = NFA() - nfa.add_transition(0, 'a', 1) - nfa.add_transition(1, 'b', 2) - nfa.add_transition(2, 'c', 3) - nfa.dump() # Prints the NFA representation to sys.stdout - """ starts = self.start() for src in self.transitions: beg = "@" if src in starts else " " @@ -540,116 +122,24 @@ def dump(self, stream=sys.stdout): _ = "||" if self.is_final(dests) else "" def start(self): - """ - Returns the initial state of the NFA as a frozenset. - - This method returns the initial state of the NFA (Non-Deterministic Finite Automaton) - as a frozenset. The initial state is the starting point of the automaton. - - Returns: - frozenset: The initial state of the NFA. - """ return frozenset(self._expand({self.initial})) def add_transition(self, src, label, dest): - """ - Adds a transition from the source state to the destination state with the specified label. - - This method is used to define transitions between states in a finite state automaton. - - Args: - src (object): The source state. - label (object): The label of the transition. - dest (object): The destination state. - - Returns: - None - - Raises: - None - - Example: - >>> fsa = FSA() - >>> fsa.add_transition('state1', 'a', 'state2') - """ self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest) def add_final_state(self, state): - """ - Adds a final state to the NFA. - - Args: - state (object): The final state to add. - - Returns: - None - - Raises: - TypeError: If the state is not a valid object. - - Notes: - This method adds a final state to the NFA (Non-Deterministic Finite Automaton). - A final state is a state that, when reached during the execution of the NFA, - indicates that the input string has been accepted. 
- - Example: - >>> nfa = NFA() - >>> state = State() - >>> nfa.add_final_state(state) - """ self.final_states.add(state) def triples(self): - """ - Generates all possible triples (source state, label, destination state) in the NFA. - - This method iterates over the transitions of the NFA and yields a tuple for each triple found. - Each triple consists of the source state, the label of the transition, and the destination state. - - Yields: - tuple: A triple (source state, label, destination state). - """ for src, trans in self.transitions.items(): for label, dests in trans.items(): for dest in dests: yield src, label, dest def is_final(self, states): - """ - Checks if any of the given states is a final state. - - Args: - states (set): The set of states to check. - - Returns: - bool: True if any of the states is a final state, False otherwise. - """ return bool(self.final_states.intersection(states)) def _expand(self, states): - """ - Expands the given set of states by following epsilon transitions. - - This method takes a set of states and expands it by following epsilon transitions. - Epsilon transitions are transitions that do not consume any input symbol. - - Args: - states (set): The set of states to expand. - - Returns: - set: The expanded set of states. - - Example: - >>> automaton = FSA() - >>> initial_states = {0} - >>> expanded_states = automaton._expand(initial_states) - >>> print(expanded_states) - {0, 1, 2, 3} - - Note: - This method modifies the input set of states in-place by adding the newly expanded states to it. - If you want to keep the original set of states unchanged, make a copy before calling this method. - """ transitions = self.transitions frontier = set(states) while frontier: @@ -661,36 +151,6 @@ def _expand(self, states): return states def next_state(self, states, label): - """ - Returns the set of states that can be reached from the given states with the specified label. - - Args: - states (set): The set of states to start from. - label: The label of the transition. - - Returns: - frozenset: The set of states that can be reached. - - Raises: - None - - Example: - >>> automaton = FSA() - >>> automaton.add_transition(0, 'a', 1) - >>> automaton.add_transition(1, 'b', 2) - >>> automaton.add_transition(2, 'c', 3) - >>> automaton.next_state({0}, 'a') - frozenset({1}) - - This method takes a set of states and a label as input and returns the set of states that can be reached from the given states with the specified label. It considers the transitions defined in the automaton and follows them to determine the reachable states. - - The method first checks if each state in the input set has any outgoing transitions defined. If a transition with the specified label is found, the destination states are added to the result set. Additionally, if there is a transition with the special label 'ANY', the destination states of that transition are also added to the result set. - - The result set is then expanded to include all states reachable from the initial set of states, considering all possible transitions. - - Note: The input states should be a set of valid states in the automaton. The label can be any valid label defined in the automaton's transitions. - - """ transitions = self.transitions dest_states = set() for state in states: @@ -703,26 +163,6 @@ def next_state(self, states, label): return frozenset(self._expand(dest_states)) def get_labels(self, states): - """ - Returns the set of labels that can be reached from the given states. 
- - Args: - states (set): The set of states. - - Returns: - set: The set of labels. - - Raises: - None. - - Examples: - >>> automaton = FSA() - >>> automaton.add_transition(1, 'a', 2) - >>> automaton.add_transition(2, 'b', 3) - >>> automaton.add_transition(3, 'c', 4) - >>> automaton.get_labels({1, 2, 3}) - {'a', 'b', 'c'} - """ transitions = self.transitions labels = set() for state in states: @@ -731,33 +171,7 @@ def get_labels(self, states): return labels def embed(self, other): - """ - Copies all transitions from another NFA into this NFA. - - Args: - other (NFA): The other NFA to copy transitions from. - - Returns: - None - - Raises: - None - - Notes: - This method copies all transitions from the specified NFA (`other`) into the current NFA. - It updates the transitions of the current NFA by adding the transitions from `other`. - The transitions are copied based on the source state and the label of the transition. - If a transition with the same source state and label already exists in the current NFA, - the destination states are updated by adding the destination states from `other`. - - Example: - nfa1 = NFA() - nfa2 = NFA() - # ... add transitions to nfa1 and nfa2 ... - - nfa1.embed(nfa2) - # Now nfa1 contains all transitions from nfa2. - """ + # Copy all transitions from the other NFA into this one for s, othertrans in other.transitions.items(): trans = self.transitions.setdefault(s, {}) for label, otherdests in othertrans.items(): @@ -765,60 +179,15 @@ def embed(self, other): dests.update(otherdests) def insert(self, src, other, dest): - """ - Connects the source state to the initial state of another NFA, and the final states of the other NFA to the destination state. - - Args: - src (State): The source state to connect from. - other (NFA): The other NFA to connect. - dest (State): The destination state to connect to. - - Returns: - None - - Raises: - TypeError: If src or dest are not instances of the State class. - ValueError: If other is not an instance of the NFA class. - - Notes: - This method modifies the current NFA by embedding the other NFA into it. It connects the source state to the initial state of the other NFA, and connects the final states of the other NFA to the destination state. - - Example: - nfa = NFA() - src = State() - dest = State() - other = NFA() - # ... Initialize src, dest, and other with appropriate values ... - - nfa.insert(src, other, dest) - """ self.embed(other) + + # Connect src to the other NFA's initial state, and the other + # NFA's final states to dest self.add_transition(src, EPSILON, other.initial) for finalstate in other.final_states: self.add_transition(finalstate, EPSILON, dest) def to_dfa(self): - """ - Converts the NFA to a DFA (Deterministic Finite Automaton). - - This method performs the conversion of a Non-Deterministic Finite Automaton (NFA) to a - Deterministic Finite Automaton (DFA). The resulting DFA is constructed by exploring - the states and transitions of the NFA. - - Returns: - DFA: The converted DFA. - - Notes: - - The NFA must be initialized before calling this method. - - The NFA should have at least one start state. - - The NFA should have at least one final state. - - Example: - nfa = NFA() - # ... code to initialize the NFA ... - dfa = nfa.to_dfa() - # ... code to use the converted DFA ... - """ dfa = DFA(self.start()) frontier = [self.start()] seen = set() @@ -844,69 +213,7 @@ def to_dfa(self): class DFA(FSA): - """ - Deterministic Finite Automaton (DFA) class. 
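
A minimal illustrative sketch (not part of the patch) of how the NFA class above and its to_dfa() subset construction are typically driven; it uses only names visible in this module (NFA, the EPSILON marker, and the inherited accept()):

    from whoosh.automata.fsa import EPSILON, NFA

    # NFA for the pattern "ab?c"; states may be any hashable objects.
    nfa = NFA(0)
    nfa.add_transition(0, "a", 1)
    nfa.add_transition(1, "b", 2)
    nfa.add_transition(1, EPSILON, 2)   # makes the "b" optional
    nfa.add_transition(2, "c", 3)
    nfa.add_final_state(3)

    dfa = nfa.to_dfa()                  # each DFA state is a frozenset of NFA states
    assert nfa.accept("abc") and dfa.accept("abc")
    assert dfa.accept("ac")
    assert not dfa.accept("ab")

The _expand() helper above is what makes this work: it follows EPSILON arcs so that start() and next_state() always operate on the full epsilon-closure.
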
- - This class represents a DFA, which is a type of finite state automaton - where each input symbol uniquely determines the next state. DFAs are - commonly used in pattern matching and string searching algorithms. - - Attributes: - initial (object): The initial state of the DFA. - transitions (dict): A dictionary representing the transitions between - states. The keys are the source states, and the values are - dictionaries where the keys are the input labels and the values - are the destination states. - defaults (dict): A dictionary representing the default transitions - for states that do not have a specific transition defined for a - given input label. The keys are the source states, and the values - are the default destination states. - final_states (set): A set containing the final states of the DFA. - outlabels (dict): A dictionary caching the sorted output labels for - each state. - - Methods: - dump(stream=sys.stdout): Prints a textual representation of the DFA - to the specified stream. - start(): Returns the initial state of the DFA. - add_transition(src, label, dest): Adds a transition from the source - state to the destination state with the given input label. - set_default_transition(src, dest): Sets the default transition for - the source state to the specified destination state. - add_final_state(state): Adds the specified state as a final state of - the DFA. - is_final(state): Checks if the specified state is a final state of - the DFA. - next_state(src, label): Returns the next state of the DFA given the - current state and the input label. - next_valid_string(string, asbytes=False): Returns the lexicographically - smallest valid string that can be obtained by following the DFA - from the initial state using the characters in the input string. - find_next_edge(s, label, asbytes): Finds the next edge label for the - specified state and input label. - reachable_from(src, inclusive=True): Returns the set of states that - can be reached from the specified source state. - minimize(): Minimizes the DFA by removing unreachable states and - merging equivalent states. - to_dfa(): Returns a reference to itself (DFA). - - """ - def __init__(self, initial): - """ - Initializes a new instance of the DFA class. - - Args: - initial (object): The initial state of the DFA. - - Attributes: - initial (object): The initial state of the DFA. - transitions (dict): A dictionary mapping state and input symbol pairs to the next state. - defaults (dict): A dictionary mapping states to default next states. - final_states (set): A set of final states. - outlabels (dict): A dictionary mapping states to output labels. - - """ self.initial = initial self.transitions = {} self.defaults = {} @@ -914,33 +221,6 @@ def __init__(self, initial): self.outlabels = {} def dump(self, stream=sys.stdout): - """ - Prints a textual representation of the DFA to the specified stream. - - Args: - stream (file-like object, optional): The stream to print the - representation to. Defaults to sys.stdout. 
- - Returns: - None - - Raises: - None - - Example: - >>> dfa = DFA() - >>> dfa.add_transition(0, 'a', 1) - >>> dfa.add_transition(1, 'b', 2) - >>> dfa.add_transition(2, 'c', 3) - >>> dfa.dump() # Prints the DFA representation to sys.stdout - @ 0 - a -> 1 - 1 - b -> 2 - 2 - c -> 3|| - - """ for src in sorted(self.transitions): beg = "@" if src == self.initial else " " print(beg, src, file=stream) @@ -950,194 +230,25 @@ def dump(self, stream=sys.stdout): _ = "||" if self.is_final(dest) else "" def start(self): - """ - Returns the initial state of the DFA. - - Returns: - object: The initial state of the DFA. - - """ return self.initial def add_transition(self, src, label, dest): - """ - Adds a transition from the source state to the destination state with - the given input label. - - Args: - src (object): The source state. - label (object): The input label. - dest (object): The destination state. - - Returns: - None - - Raises: - None - - Examples: - >>> fsa = FSA() - >>> fsa.add_transition('A', 'a', 'B') - >>> fsa.add_transition('B', 'b', 'C') - - """ self.transitions.setdefault(src, {})[label] = dest def set_default_transition(self, src, dest): - """ - Sets the default transition for the source state to the specified - destination state. - - Args: - src (object): The source state. - dest (object): The default destination state. - - Returns: - None - - Raises: - None - - Examples: - # Create an instance of the FSA class - fsa = FSA() - - # Set the default transition from state 'A' to state 'B' - fsa.set_default_transition('A', 'B') - - Notes: - - This method allows you to define a default transition for a source state. - - If a specific transition is not defined for a given input in the FSA, - the default transition will be used. - """ self.defaults[src] = dest def add_final_state(self, state): - """ - Adds the specified state as a final state of the DFA. - - Args: - state (object): The final state to add. - - Returns: - None - - Raises: - TypeError: If the state is not of the expected type. - - Notes: - - This method adds a state to the set of final states of the DFA. - - Final states are used to determine whether a given input sequence is accepted by the DFA. - - Example: - >>> dfa = DFA() - >>> dfa.add_final_state(3) - >>> dfa.add_final_state(5) - """ self.final_states.add(state) def is_final(self, state): - """ - Checks if the specified state is a final state of the DFA. - - Args: - state (object): The state to check. - - Returns: - bool: True if the state is a final state, False otherwise. - - Raises: - None - - Examples: - >>> dfa = DFA() - >>> dfa.add_final_state('q1') - >>> dfa.is_final('q1') - True - >>> dfa.is_final('q2') - False - - Notes: - - This method is used to determine if a given state is a final state in a Deterministic Finite Automaton (DFA). - - A final state is a state in which the DFA accepts the input string and terminates. - - The method returns True if the specified state is a final state, and False otherwise. - """ return state in self.final_states def next_state(self, src, label): - """ - Returns the next state of the DFA given the current state and the - input label. - - Args: - src (object): The current state. - label (object): The input label. - - Returns: - object: The next state. - - Raises: - KeyError: If the current state or input label is not found in the DFA. - - Notes: - - If the current state is not found in the DFA transitions, the default - state for that source state will be returned. 
- - If the input label is not found in the transitions for the current state, - None will be returned. - - Example: - >>> dfa = DFA() - >>> dfa.add_transition('A', 'a', 'B') - >>> dfa.add_transition('B', 'b', 'C') - >>> dfa.next_state('A', 'a') - 'B' - >>> dfa.next_state('B', 'b') - 'C' - >>> dfa.next_state('C', 'c') - None - """ trans = self.transitions.get(src, {}) return trans.get(label, self.defaults.get(src, None)) def next_valid_string(self, string, asbytes=False): - """ - Returns the lexicographically smallest valid string that can be - obtained by following the DFA from the initial state using the - characters in the input string. - - Args: - string (str or bytes): The input string. - asbytes (bool, optional): Specifies whether the input string is - in bytes format. Defaults to False. - - Returns: - str or bytes: The lexicographically smallest valid string, or - None if no valid string can be obtained. - - Raises: - None - - Examples: - >>> fsa = FSA() - >>> fsa.add_transition(0, 'a', 1) - >>> fsa.add_transition(1, 'b', 2) - >>> fsa.add_transition(2, 'c', 3) - >>> fsa.set_final(3) - >>> fsa.next_valid_string('ab') # Returns 'abc' - >>> fsa.next_valid_string('abc') # Returns 'abc' - >>> fsa.next_valid_string('abcd') # Returns None - - Notes: - - The method follows the DFA (Deterministic Finite Automaton) from - the initial state using the characters in the input string. - - It returns the lexicographically smallest valid string that can be - obtained by following the DFA. - - If the input string is already a valid string, it is returned as is. - - If no valid string can be obtained, None is returned. - - The `asbytes` parameter specifies whether the input string is in - bytes format. By default, it is set to False. - - """ state = self.start() stack = [] @@ -1169,34 +280,6 @@ def next_valid_string(self, string, asbytes=False): return None def find_next_edge(self, s, label, asbytes): - """ - Finds the next edge label for the specified state and input label. - - Args: - s (object): The current state. - label (object): The current input label. - asbytes (bool): Specifies whether the labels are in bytes format. - - Returns: - object: The next edge label, or None if no label is found. - - Raises: - None - - Examples: - >>> automaton = FSA() - >>> automaton.find_next_edge(1, 'a', False) - 'b' - - Notes: - - This method is used to find the next edge label for a given state and input label in the automaton. - - The `s` parameter represents the current state in the automaton. - - The `label` parameter represents the current input label. - - The `asbytes` parameter specifies whether the labels are in bytes format. - - If `label` is None, it is set to b"\x00" if `asbytes` is True, or "\0" if `asbytes` is False. - - The method returns the next edge label if found, or None if no label is found. - - """ if label is None: label = b"\x00" if asbytes else "\0" else: @@ -1216,27 +299,6 @@ def find_next_edge(self, s, label, asbytes): return None def reachable_from(self, src, inclusive=True): - """ - Returns the set of states that can be reached from the specified - source state. - - Args: - src (object): The source state. - inclusive (bool, optional): Specifies whether the source state - should be included in the result. Defaults to True. - - Returns: - set: The set of reachable states. 
- - Example: - >>> automaton = FSA() - >>> automaton.add_state('A') - >>> automaton.add_state('B') - >>> automaton.add_transition('A', 'B') - >>> automaton.reachable_from('A') - {'A', 'B'} - - """ transitions = self.transitions reached = set() @@ -1255,24 +317,6 @@ def reachable_from(self, src, inclusive=True): return reached def minimize(self): - """ - Minimizes the DFA by removing unreachable states and merging equivalent states. - - This method performs the following steps: - 1. Deletes unreachable states from the DFA. - 2. Partitions the remaining states into equivalence sets. - 3. Chooses one representative state from each equivalence set and maps all equivalent states to it. - 4. Applies the mapping to the existing transitions. - 5. Removes dead states - non-final states with no outgoing arcs except to themselves. - - After the minimization process, the DFA will have a reduced number of states while preserving its language. - - Usage: - dfa = DFA(...) - dfa.minimize() - - :return: None - """ transitions = self.transitions initial = self.initial @@ -1360,26 +404,6 @@ def minimize(self): self.final_states = new_finals def to_dfa(self): - """ - Converts the Finite State Automaton (FSA) to a Deterministic Finite Automaton (DFA). - - This method returns a reference to itself, as the conversion from FSA to DFA is an in-place operation. - - Returns: - DFA: A reference to the converted DFA. - - Notes: - - The conversion from FSA to DFA eliminates non-determinism by creating a new DFA with equivalent language acceptance. - - The resulting DFA may have a larger number of states compared to the original FSA. - - The original FSA is not modified during the conversion process. - - Example: - >>> fsa = FSA() - >>> # Add states, transitions, and final states to the FSA - >>> dfa = fsa.to_dfa() - >>> # Use the converted DFA for further operations - - """ return self @@ -1387,41 +411,6 @@ def to_dfa(self): def renumber_dfa(dfa, base=0): - """ - Renumber the states of a DFA (Deterministic Finite Automaton) starting from a given base number. - - Args: - dfa (DFA): The DFA to renumber. - base (int, optional): The base number to start renumbering from. Defaults to 0. - - Returns: - DFA: The renumbered DFA. - - Raises: - None. - - Examples: - >>> dfa = DFA() - >>> dfa.add_state(0) - >>> dfa.add_state(1) - >>> dfa.add_transition(0, 'a', 1) - >>> dfa.add_transition(1, 'b', 0) - >>> dfa.set_initial_state(0) - >>> dfa.add_final_state(1) - >>> renumbered_dfa = renumber_dfa(dfa, base=10) - >>> renumbered_dfa.get_states() - [10, 11] - >>> renumbered_dfa.get_initial_state() - 10 - >>> renumbered_dfa.get_final_states() - [11] - - Note: - This function renumbers the states of a DFA by assigning new numbers to each state, starting from the base number. - It creates a new DFA object with the renumbered states and updates the transitions, final states, and default transitions accordingly. - The mapping between the old states and the new states is stored in a dictionary called 'mapping'. - - """ c = itertools.count(base) mapping = {} @@ -1445,30 +434,6 @@ def remap(state): def u_to_utf8(dfa, base=0): - """ - Converts Unicode labels in a DFA to UTF-8 labels. - - This function takes a DFA (Deterministic Finite Automaton) and converts - its Unicode labels to UTF-8 labels. It modifies the DFA in-place. - - Parameters: - - dfa (DFA): The DFA to convert. - - base (int): The base value for generating new state IDs. Defaults to 0. - - Raises: - - ValueError: If the DFA contains a transition with the label ANY. 
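
The next_valid_string()/find_next_edge() pair above is what lets a DFA be walked in lexicographic order. A small hand-checked sketch, illustrative only, using nothing beyond the DFA methods shown in this hunk:

    from whoosh.automata.fsa import DFA

    dfa = DFA(0)
    for i, char in enumerate("cat"):
        dfa.add_transition(i, char, i + 1)
    dfa.add_final_state(3)

    assert dfa.next_valid_string("ca") == "cat"    # extends to the nearest accepted key
    assert dfa.next_valid_string("cat") == "cat"   # already accepted, returned as-is
    assert dfa.next_valid_string("cu") is None     # nothing >= "cu" is accepted
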
- - Returns: - - None: The function modifies the DFA in-place. - - Example usage: - ``` - dfa = DFA() - # ... construct the DFA ... - u_to_utf8(dfa) - # ... continue using the modified DFA ... - ``` - """ c = itertools.count(base) transitions = dfa.transitions @@ -1478,7 +443,7 @@ def u_to_utf8(dfa, base=0): if label is EPSILON: continue elif label is ANY: - raise ValueError("DFA contains a transition with the label ANY") + raise ValueError else: assert isinstance(label, str) label8 = label.encode("utf8") @@ -1494,41 +459,17 @@ def u_to_utf8(dfa, base=0): def find_all_matches(dfa, lookup_func, first=unull): """ - Finds all words within a given Levenshtein distance of a target word. - - This function uses the provided `lookup_func` to find all words within a specified - Levenshtein distance (`k`) of a target word. It iterates through the DFA (Deterministic - Finite Automaton) `dfa` to generate all possible matches. + Uses lookup_func to find all words within levenshtein distance k of word. Args: - dfa (DFA): The DFA representing the search space. - lookup_func (function): A function that takes a word as input and returns the first - word in the database that is greater than or equal to the input word. - first (str): The first word to start the search from. Defaults to `unull`. - + word: The word to look up + k: Maximum edit distance + lookup_func: A single argument function that returns the first word in the + database that is greater than or equal to the input argument. Yields: - str: Every matching word within the specified Levenshtein distance `k` from the database. - - Example: - >>> dfa = DFA() - >>> lookup_func = lambda word: word - >>> matches = find_all_matches(dfa, lookup_func, first="hello") - >>> for match in matches: - ... print(match) - ... - hello - hallo - hullo - helio - ... - - Note: - The `dfa` parameter should be an instance of the DFA class, which represents the search space. - The `lookup_func` parameter should be a function that returns the first word in the database - that is greater than or equal to the input word. This function is used to efficiently search - for matches within the specified Levenshtein distance. - + Every matching word within levenshtein distance k from the database. """ + match = dfa.next_valid_string(first) while match: key = lookup_func(match) @@ -1544,25 +485,6 @@ def find_all_matches(dfa, lookup_func, first=unull): def reverse_nfa(n): - """ - Reverses the given NFA (Non-deterministic Finite Automaton). - - Args: - n (NFA): The NFA to be reversed. - - Returns: - NFA: The reversed NFA. - - Notes: - This function creates a new NFA by reversing the transitions of the given NFA. - It adds transitions from the destination states to the source states for each - transition in the original NFA. It also adds transitions from the initial state - of the original NFA to the final states of the original NFA. - - Example: - nfa = NFA(...) - reversed_nfa = reverse_nfa(nfa) - """ s = object() nfa = NFA(s) for src, trans in n.transitions.items(): @@ -1576,54 +498,6 @@ def reverse_nfa(n): def product(dfa1, op, dfa2): - """ - Compute the product of two DFAs. - - This function takes two deterministic finite automata (DFAs) represented by `dfa1` and `dfa2`, - and computes their product DFA based on the given binary operator `op`. - - Parameters: - - dfa1 (DFA): The first DFA. - - op (function): The binary operator used to combine the states of `dfa1` and `dfa2`. - - dfa2 (DFA): The second DFA. - - Returns: - - dfa (DFA): The product DFA. - - Algorithm: - 1. 
Convert `dfa1` and `dfa2` to DFAs if they are not already. - 2. Create the start state of the product DFA as a tuple of the start states of `dfa1` and `dfa2`. - 3. Initialize an empty stack and push the start state onto the stack. - 4. While the stack is not empty: - - Pop a state from the stack. - - Get the transitions of the corresponding states in `dfa1` and `dfa2`. - - For each label that is common to both sets of transitions: - - Compute the next states in `dfa1` and `dfa2` based on the label. - - If the binary operator `op` returns True for the next states, add a transition to the product DFA. - - Push the next state onto the stack. - - If both next states are final states, mark the next state in the product DFA as a final state. - 5. Return the product DFA. - - Note: - - The `op` function should take two boolean arguments and return a boolean value. - - The `DFA` class represents a deterministic finite automaton. - - Example usage: - ``` - dfa1 = DFA(...) - dfa2 = DFA(...) - product_dfa = product(dfa1, my_operator, dfa2) - ``` - - :param dfa1: The first DFA. - :type dfa1: DFA - :param op: The binary operator used to combine the states of `dfa1` and `dfa2`. - :type op: function - :param dfa2: The second DFA. - :type dfa2: DFA - :return: The product DFA. - :rtype: DFA - """ dfa1 = dfa1.to_dfa() dfa2 = dfa2.to_dfa() start = (dfa1.start(), dfa2.start()) @@ -1647,107 +521,22 @@ def product(dfa1, op, dfa2): def intersection(dfa1, dfa2): - """ - Compute the intersection of two deterministic finite automata (DFAs). - - This function takes two DFAs, `dfa1` and `dfa2`, and returns a new DFA that represents the intersection of the two DFAs. - The intersection of two DFAs is a new DFA that accepts only the strings that are accepted by both `dfa1` and `dfa2`. - - Parameters: - - dfa1 (DFA): The first DFA. - - dfa2 (DFA): The second DFA. - - Returns: - - DFA: The DFA representing the intersection of `dfa1` and `dfa2`. - - Example: - >>> dfa1 = DFA(...) - >>> dfa2 = DFA(...) - >>> result = intersection(dfa1, dfa2) - """ - return product(dfa1, operator.and_, dfa2) def union(dfa1, dfa2): - """ - Computes the union of two deterministic finite automata (DFAs). - - Parameters: - - dfa1 (DFA): The first DFA. - - dfa2 (DFA): The second DFA. - - Returns: - - DFA: The DFA resulting from the union of dfa1 and dfa2. - - Raises: - - TypeError: If either dfa1 or dfa2 is not a DFA object. - - Example: - >>> dfa1 = DFA(...) - >>> dfa2 = DFA(...) - >>> result = union(dfa1, dfa2) - """ - return product(dfa1, operator.or_, dfa2) def epsilon_nfa(): - """ - Creates an epsilon-NFA (non-deterministic finite automaton) with a single epsilon transition. - - Returns: - A basic NFA (Nondeterministic Finite Automaton) with a single epsilon transition. - - Notes: - - The epsilon transition allows the automaton to move from one state to another without consuming any input. - - This function is a helper function that creates a basic NFA with only an epsilon transition. - - The resulting NFA can be further modified and combined with other NFAs to build more complex automata. - - Example: - >>> nfa = epsilon_nfa() - >>> nfa - - """ return basic_nfa(EPSILON) def dot_nfa(): - """ - Creates a non-deterministic finite automaton (NFA) that matches any single character. - - Returns: - NFA: A non-deterministic finite automaton that matches any single character. 
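
find_all_matches() above is the consumer-facing piece: it ping-pongs between the DFA and a sorted key store, each side skipping the other forward. The sketch below is illustrative and makes two assumptions beyond this hunk: the levenshtein_automaton() helper lives in whoosh.automata.lev, and a sorted in-memory list stands in for the real lookup function.

    import bisect
    from whoosh.automata.fsa import find_all_matches
    from whoosh.automata.lev import levenshtein_automaton

    words = sorted(["cart", "cat", "dog", "kitten"])

    def lookup(key):
        # First stored word >= key, or None once we are past the end.
        i = bisect.bisect_left(words, key)
        return words[i] if i < len(words) else None

    dfa = levenshtein_automaton("cat", 1).to_dfa()
    print(list(find_all_matches(dfa, lookup)))   # expected: ['cart', 'cat']
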
- - Example: - >>> nfa = dot_nfa() - >>> nfa.match('a') - True - >>> nfa.match('b') - True - >>> nfa.match('1') - True - """ return basic_nfa(ANY) def basic_nfa(label): - """ - Creates a basic NFA (Non-Deterministic Finite Automaton) with a single transition. - - Parameters: - label (str): The label of the transition. - - Returns: - NFA: The created NFA. - - Example: - >>> nfa = basic_nfa('a') - >>> nfa.transitions - {: {'a': []}} - >>> nfa.final_states - {} - """ s = object() e = object() nfa = NFA(s) @@ -1757,19 +546,6 @@ def basic_nfa(label): def charset_nfa(labels): - """ - Constructs a non-deterministic finite automaton (NFA) that recognizes a character set. - - Parameters: - - labels (iterable): An iterable of labels representing the characters in the character set. - - Returns: - - NFA: The constructed NFA. - - Example: - >>> labels = ['a', 'b', 'c'] - >>> nfa = charset_nfa(labels) - """ s = object() e = object() nfa = NFA(s) @@ -1780,22 +556,6 @@ def charset_nfa(labels): def string_nfa(string): - """ - Creates a Non-Deterministic Finite Automaton (NFA) that recognizes the given string. - - Parameters: - - string (str): The string to be recognized by the NFA. - - Returns: - - NFA: The NFA object that recognizes the given string. - - Example: - >>> nfa = string_nfa("abc") - >>> nfa.matches("abc") - True - >>> nfa.matches("def") - False - """ s = object() e = object() nfa = NFA(s) @@ -1808,22 +568,6 @@ def string_nfa(string): def choice_nfa(n1, n2): - """ - Creates a non-deterministic finite automaton (NFA) that represents a choice between two NFAs. - - Parameters: - - n1: The first NFA to choose from. - - n2: The second NFA to choose from. - - Returns: - - nfa: The resulting NFA representing the choice between n1 and n2. - - Example: - nfa1 = NFA(...) - nfa2 = NFA(...) - choice = choice_nfa(nfa1, nfa2) - """ - s = object() e = object() nfa = NFA(s) @@ -1839,21 +583,6 @@ def choice_nfa(n1, n2): def concat_nfa(n1, n2): - """ - Concatenates two NFAs (n1 and n2) into a single NFA. - - Parameters: - - n1 (NFA): The first NFA to be concatenated. - - n2 (NFA): The second NFA to be concatenated. - - Returns: - - nfa (NFA): The resulting NFA after concatenation. - - Example: - nfa1 = NFA(...) - nfa2 = NFA(...) - concatenated_nfa = concat_nfa(nfa1, nfa2) - """ s = object() m = object() e = object() @@ -1865,78 +594,28 @@ def concat_nfa(n1, n2): def star_nfa(n): - r""" - Creates a non-deterministic finite automaton (NFA) that represents the Kleene star operation on the given NFA. - - Parameters: - - n (NFA): The input NFA. - - Returns: - - nfa (NFA): The resulting NFA after applying the Kleene star operation. - - Description: - The star_nfa function takes an NFA as input and constructs a new NFA that represents the Kleene star operation on the input NFA. - The resulting NFA accepts any number of repetitions (including zero) of the language accepted by the input NFA. - - The construction of the new NFA involves adding two new states, 's' and 'e', and modifying the transitions of the input NFA. - The new NFA has the following structure: - - -----<----- - / \ - s ---> n ---> e - \ / - ----->----- - - The state 's' is the start state of the new NFA, 'n' is the start state of the input NFA, and 'e' is a new final state. - The new NFA has transitions from 's' to 'n' and from 'e' to 's' to allow for repetitions of the input NFA's language. - The input NFA's final states are also connected to 's' to allow for zero repetitions of the input NFA's language. 
- - Example usage: - nfa = star_nfa(input_nfa) - """ - s = object() e = object() nfa = NFA(s) + # -----<----- + # / \ + # s ---> n ---> e + # \ / + # ----->----- nfa.insert(s, n, e) nfa.add_transition(s, EPSILON, e) for finalstate in n.final_states: nfa.add_transition(finalstate, EPSILON, s) nfa.add_final_state(e) - return nfa def plus_nfa(n): - """ - Constructs a non-deterministic finite automaton (NFA) that matches one or more occurrences of the given NFA. - - Parameters: - n (NFA): The NFA to be repeated one or more times. - - Returns: - NFA: The NFA that matches one or more occurrences of the given NFA. - - Example: - >>> nfa = plus_nfa(nfa1) - """ return concat_nfa(n, star_nfa(n)) def optional_nfa(n): - """ - Creates a non-deterministic finite automaton (NFA) that matches zero or one occurrence of the given NFA. - - Parameters: - - n: The NFA to match zero or one occurrence of. - - Returns: - - The NFA that matches zero or one occurrence of the given NFA. - - Example: - >>> nfa = optional_nfa(nfa1) - """ return choice_nfa(n, epsilon_nfa()) @@ -1944,89 +623,23 @@ def optional_nfa(n): class DMNode: - """ - Represents a deterministic finite state automaton (DFSA) node. - - Attributes: - n (int): The node identifier. - arcs (dict): A dictionary of arcs, where the keys are input symbols and the values are the next nodes. - final (bool): Indicates whether the node is a final state. - - Methods: - __init__(self, n: int): Initializes a new instance of the DMNode class. - __repr__(self) -> str: Returns a string representation of the DMNode. - __hash__(self) -> int: Returns the hash value of the DMNode. - tuple(self) -> tuple: Returns a tuple representation of the DMNode. - - """ - - def __init__(self, n: int): - """ - Initializes a new instance of the DMNode class. - - Args: - n (int): The node identifier. - - """ + def __init__(self, n): self.n = n self.arcs = {} self.final = False - def __repr__(self) -> str: - """ - Returns a string representation of the DMNode. - - Returns: - str: The string representation of the DMNode. - - """ + def __repr__(self): return f"<{self.n}, {self.tuple()!r}>" - def __hash__(self) -> int: - """ - Returns the hash value of the DMNode. - - Returns: - int: The hash value of the DMNode. - - """ + def __hash__(self): return hash(self.tuple()) - def tuple(self) -> tuple: - """ - Returns a tuple representation of the DMNode. - - Returns: - tuple: The tuple representation of the DMNode. - - """ + def tuple(self): arcs = tuple(sorted(self.arcs.items())) return arcs, self.final def strings_dfa(strings): - """ - Constructs a Deterministic Finite Automaton (DFA) from a list of strings. - - Args: - strings (list): A list of strings to construct the DFA from. - - Returns: - DFA: The constructed DFA. - - Raises: - ValueError: If the strings are not in lexicographical order or if an empty string is encountered. - - Notes: - - The DFA is constructed by iteratively adding strings to the automaton. - - The DFA is built incrementally, reusing common prefixes between strings to optimize space. - - The DFA is represented using DMNode objects, which store the state transitions and accept states. - - The DFA is returned as an instance of the DFA class. - - Example: - strings = ["apple", "banana", "cherry"] - dfa = strings_dfa(strings) - """ dfa = DFA(0) c = itertools.count(1) @@ -2066,35 +679,12 @@ def strings_dfa(strings): def add_suffix(dfa, nodes, last, downto, seen): - """ - Add a suffix to the given DFA. 
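
Taken together, the constructors above compose like regular-expression operators. An illustrative sketch (not part of the patch), assuming only the combinators defined in this module:

    from whoosh.automata.fsa import basic_nfa, concat_nfa, optional_nfa, plus_nfa

    # Roughly the pattern "ab+c?", built from the combinators above.
    nfa = concat_nfa(
        basic_nfa("a"),
        concat_nfa(plus_nfa(basic_nfa("b")), optional_nfa(basic_nfa("c"))),
    )
    dfa = nfa.to_dfa()

    assert dfa.accept("ab")
    assert dfa.accept("abbbc")
    assert not dfa.accept("ac")

strings_dfa() above takes the opposite route: it skips the NFA stage and builds a DFA directly from an already-sorted list of strings, reusing shared suffixes through the add_suffix()/seen bookkeeping below.
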
- - This function takes a DFA (Deterministic Finite Automaton) and adds a suffix to it. - The suffix is constructed from a list of nodes, starting from the last node and - going up to the specified downto index. - - Parameters: - - dfa (DFA): The DFA to which the suffix will be added. - - nodes (list): The list of nodes representing the suffix. - - last (list): The list of labels representing the transitions from the last node - to its parent nodes. - - downto (int): The index indicating the last node in the suffix to be added. - - seen (dict): A dictionary that keeps track of already seen nodes. - - Returns: - None - - Notes: - - If a node with the same characteristics (final/nonfinal, same arcs to same destinations) - is already seen, it is replaced with the already seen node. - - If a node is replaced with an already seen one, the parent node's pointer to this node is fixed. - - The node's transitions are added to the DFA. - - """ while len(nodes) > downto: node = nodes.pop() tup = node.tuple() + # If a node just like this one (final/nonfinal, same arcs to same + # destinations) is already seen, replace with it try: this = seen[tup] except KeyError: @@ -2103,9 +693,12 @@ def add_suffix(dfa, nodes, last, downto, seen): dfa.add_final_state(this) seen[tup] = this else: + # If we replaced the node with an already seen one, fix the parent + # node's pointer to this parent = nodes[-1] inlabel = last[len(nodes) - 1] parent.arcs[inlabel] = this + # Add the node's transitions to the DFA for label, dest in node.arcs.items(): dfa.add_transition(this, label, dest) diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py index 3c5ec4f9..0762b9ce 100644 --- a/src/whoosh/automata/fst.py +++ b/src/whoosh/automata/fst.py @@ -59,82 +59,18 @@ def b(s): - """ - Encodes the input string using the Latin-1 encoding. - - Args: - s (str): The string to be encoded. - - Returns: - bytes: The encoded string. - - Raises: - UnicodeEncodeError: If the input string cannot be encoded using the Latin-1 encoding. - - Example: - >>> b("hello") - b'hello' - """ return s.encode("latin-1") def u(s): - """ - Convert the input string to Unicode if it is a byte string. - - Parameters: - s (str or bytes): The input string to be converted. - - Returns: - str: The converted Unicode string. - - Raises: - None. - - Examples: - >>> u(b'hello') - 'hello' - >>> u('world') - 'world' - """ - return s.decode("ascii") if isinstance(s, bytes) else s class FileVersionError(Exception): - """ - Exception raised when there is a mismatch between the version of a file and the expected version. - - This exception is typically raised when a file is being read or processed and its version does not match the expected version. - It can be used to handle version-related errors in file handling operations. - - Attributes: - message (str): Explanation of the error. - """ - - def __init__(self, message): - """ - Initialize a new instance of FileVersionError. - - Args: - message (str): Explanation of the error. - """ - self.message = message - super().__init__(message) + pass class InactiveCursor(Exception): - """ - Exception raised when attempting to use an inactive cursor. - - An inactive cursor is a cursor that has been closed or is no longer valid. - This exception is raised to indicate that an operation cannot be performed - because the cursor is inactive. 
- - Attributes: - message -- explanation of the error - """ - pass @@ -150,231 +86,84 @@ class InactiveCursor(Exception): class Values: - """Base for classes that describe how to encode and decode FST values. - - This class provides a set of methods that define the behavior of FST values. - Subclasses should implement these methods to handle specific types of values. - - Attributes: - None - - Methods: - is_valid(v): Returns True if v is a valid object that can be stored by this class. - common(v1, v2): Returns the "common" part of the two values. - add(prefix, v): Adds the given prefix to the given value. - subtract(v, prefix): Subtracts the "common" part (the prefix) from the given value. - write(dbfile, v): Writes value v to a file. - read(dbfile): Reads a value from the given file. - skip(dbfile): Skips over a value in the given file. - to_bytes(v): Returns a str (Python 2.x) or bytes (Python 3) representation of the given value. - merge(v1, v2): Merges two values. - - """ + """Base for classes the describe how to encode and decode FST values.""" @staticmethod def is_valid(v): - """Returns True if v is a valid object that can be stored by this class. - - Args: - v: The value to check. - - Returns: - bool: True if v is a valid object, False otherwise. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - + """Returns True if v is a valid object that can be stored by this + class. """ raise NotImplementedError @staticmethod def common(v1, v2): - """Returns the "common" part of the two values. - - The definition of "common" depends on the specific subclass implementation. - For example, a string implementation would return the common shared prefix, - while an int implementation would return the minimum of the two numbers. + """Returns the "common" part of the two values, for whatever "common" + means for this class. For example, a string implementation would return + the common shared prefix, for an int implementation it would return + the minimum of the two numbers. If there is no common part, this method should return None. - - Args: - v1: The first value. - v2: The second value. - - Returns: - object: The common part of the two values, or None if there is no common part. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - """ raise NotImplementedError @staticmethod def add(prefix, v): - """Adds the given prefix to the given value. - - The prefix is the result of a call to the `common()` method. - - Args: - prefix: The prefix to add. - v: The value to add the prefix to. - - Returns: - object: The value with the prefix added. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - + """Adds the given prefix (the result of a call to common()) to the + given value. """ raise NotImplementedError @staticmethod def subtract(v, prefix): - """Subtracts the "common" part (the prefix) from the given value. - - Args: - v: The value to subtract the prefix from. - prefix: The prefix to subtract. - - Returns: - object: The value with the prefix subtracted. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - - """ + """Subtracts the "common" part (the prefix) from the given value.""" raise NotImplementedError @staticmethod def write(dbfile, v): - """Writes value v to a file. - - Args: - dbfile: The file to write the value to. - v: The value to write. - - Returns: - None - - Raises: - NotImplementedError: This method should be implemented by subclasses. 
- - """ + """Writes value v to a file.""" raise NotImplementedError @staticmethod def read(dbfile): - """Reads a value from the given file. - - Args: - dbfile: The file to read the value from. - - Returns: - object: The value read from the file. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - - """ + """Reads a value from the given file.""" raise NotImplementedError @classmethod def skip(cls, dbfile): - """Skips over a value in the given file. - - This method is a convenience method that calls the `read()` method. - - Args: - dbfile: The file to skip the value in. - - Returns: - None - - """ + """Skips over a value in the given file.""" cls.read(dbfile) @staticmethod def to_bytes(v): - """Returns a str (Python 2.x) or bytes (Python 3) representation of the given value. - - This method is used for calculating node digests. The representation should be - unique but fast to calculate, and does not have to be parseable. - - Args: - v: The value to convert. - - Returns: - str or bytes: The representation of the value. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - + """Returns a str (Python 2.x) or bytes (Python 3) representation of + the given value. This is used for calculating node digests, so it + should be unique but fast to calculate, and does not have to be + parseable. """ raise NotImplementedError @staticmethod def merge(v1, v2): - """Merges two values. - - The definition of "merge" depends on the specific subclass implementation. - - Args: - v1: The first value. - v2: The second value. - - Returns: - object: The merged value. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - - """ - raise NotImplementedError class IntValues(Values): - """Stores integer values in an FST. - - This class provides methods for working with integer values in a Finite State Transducer (FST). - It defines operations such as validation, common value calculation, addition, subtraction, and serialization. - - """ + """Stores integer values in an FST.""" @staticmethod def is_valid(v): - """Check if a value is a valid integer for the FST. - - Args: - v (int): The value to check. - - Returns: - bool: True if the value is a valid integer, False otherwise. - - """ return isinstance(v, int) and v >= 0 @staticmethod def common(v1, v2): - """Calculate the common value between two integers. - - Args: - v1 (int): The first integer value. - v2 (int): The second integer value. - - Returns: - int or None: The common value if it exists, None otherwise. - - """ if v1 is None or v2 is None: return None if v1 == v2: @@ -383,16 +172,6 @@ def common(v1, v2): @staticmethod def add(base, v): - """Add an integer value to a base value. - - Args: - base (int or None): The base value. - v (int or None): The value to add. - - Returns: - int or None: The result of the addition. - - """ if base is None: return v if v is None: @@ -401,16 +180,6 @@ def add(base, v): @staticmethod def subtract(v, base): - """Subtract a base value from an integer value. - - Args: - v (int or None): The integer value. - base (int or None): The base value. - - Returns: - int or None: The result of the subtraction. - - """ if v is None: return None if base is None: @@ -419,49 +188,18 @@ def subtract(v, base): @staticmethod def write(dbfile, v): - """Write an integer value to a database file. - - Args: - dbfile (file): The database file to write to. - v (int): The integer value to write. 
- - """ dbfile.write_uint(v) @staticmethod def read(dbfile): - """Read an integer value from a database file. - - Args: - dbfile (file): The database file to read from. - - Returns: - int: The read integer value. - - """ return dbfile.read_uint() @staticmethod def skip(dbfile): - """Skip a fixed number of bytes in a database file. - - Args: - dbfile (file): The database file to skip bytes in. - - """ dbfile.seek(_INT_SIZE, 1) @staticmethod def to_bytes(v): - """Convert an integer value to bytes. - - Args: - v (int): The integer value to convert. - - Returns: - bytes: The byte representation of the integer value. - - """ return pack_int(v) @@ -470,29 +208,10 @@ class SequenceValues(Values): @staticmethod def is_valid(self, v): - """ - Check if a value is a valid sequence. - - Parameters: - - v (object): The value to check. - - Returns: - - bool: True if the value is a list or tuple, False otherwise. - """ return isinstance(self, (list, tuple)) @staticmethod def common(v1, v2): - """ - Find the common prefix between two sequences. - - Parameters: - - v1 (list or tuple): The first sequence. - - v2 (list or tuple): The second sequence. - - Returns: - - list or tuple or None: The common prefix between v1 and v2, or None if there is no common prefix. - """ if v1 is None or v2 is None: return None @@ -512,16 +231,6 @@ def common(v1, v2): @staticmethod def add(prefix, v): - """ - Concatenate a prefix and a sequence. - - Parameters: - - prefix (list or tuple): The prefix sequence. - - v (list or tuple): The sequence to concatenate. - - Returns: - - list or tuple: The concatenation of prefix and v. - """ if prefix is None: return v if v is None: @@ -530,16 +239,6 @@ def add(prefix, v): @staticmethod def subtract(v, prefix): - """ - Remove a prefix from a sequence. - - Parameters: - - v (list or tuple): The sequence. - - prefix (list or tuple): The prefix to remove. - - Returns: - - list or tuple or None: The sequence with the prefix removed, or None if the prefix is not valid. - """ if prefix is None: return v if v is None: @@ -552,279 +251,67 @@ def subtract(v, prefix): @staticmethod def write(dbfile, v): - """ - Write a sequence to a database file. - - Parameters: - - dbfile (file): The database file to write to. - - v (list or tuple): The sequence to write. - """ dbfile.write_pickle(v) @staticmethod def read(dbfile): - """ - Read a sequence from a database file. - - Parameters: - - dbfile (file): The database file to read from. - - Returns: - - list or tuple: The sequence read from the database file. - """ return dbfile.read_pickle() class BytesValues(SequenceValues): - """Stores bytes objects (str in Python 2.x) in an FST. - - This class is used to store bytes objects in a Finite State Transducer (FST). - It provides methods for writing, reading, and skipping bytes objects in a database file. - - Attributes: - None - - Methods: - is_valid: Checks if a given value is a valid bytes object. - write: Writes a bytes object to a database file. - read: Reads a bytes object from a database file. - skip: Skips a bytes object in a database file. - to_bytes: Converts a value to bytes. - - """ + """Stores bytes objects (str in Python 2.x) in an FST.""" @staticmethod def is_valid(v): - """Checks if a given value is a valid bytes object. - - Args: - v (bytes): The value to check. - - Returns: - bool: True if the value is a bytes object, False otherwise. - - """ return isinstance(v, bytes) @staticmethod def write(dbfile, v): - """Writes a bytes object to a database file. 
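
The common()/add()/subtract() trio above is the core of the contract the FST code relies on: the shared part of two outputs is pushed onto the common prefix and only the remainders travel on the diverging arcs. A tiny illustration with IntValues, where "common" is simply the minimum:

    from whoosh.automata.fst import IntValues

    shared = IntValues.common(10, 7)            # -> 7
    assert IntValues.subtract(10, shared) == 3  # remainder stored past the shared prefix
    assert IntValues.subtract(7, shared) == 0
    assert IntValues.add(shared, 3) == 10       # reader reassembles the original value
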
- - Args: - dbfile (file): The database file to write to. - v (bytes): The bytes object to write. - - Returns: - None - - """ dbfile.write_int(len(v)) dbfile.write(v) @staticmethod def read(dbfile): - """Reads a bytes object from a database file. - - Args: - dbfile (file): The database file to read from. - - Returns: - bytes: The read bytes object. - - """ length = dbfile.read_int() return dbfile.read(length) @staticmethod def skip(dbfile): - """Skips a bytes object in a database file. - - Args: - dbfile (file): The database file to skip from. - - Returns: - None - - """ length = dbfile.read_int() dbfile.seek(length, 1) @staticmethod def to_bytes(v): - """Converts a value to bytes. - - Args: - v: The value to convert. - - Returns: - bytes: The converted bytes object. - - """ return v class ArrayValues(SequenceValues): - """Stores array.array objects in an FST. - - This class is used to store array.array objects in a finite state transducer (FST). - It provides methods for writing, reading, and skipping array.array objects in a database file. - - Args: - typecode (str): The typecode of the array.array objects to be stored. - - Attributes: - typecode (str): The typecode of the array.array objects. - itemsize (int): The size of each item in the array.array objects. - - """ + """Stores array.array objects in an FST.""" def __init__(self, typecode): - """ - Initialize a new FST object. - - Args: - typecode (str): The typecode of the array used to store the FST. - - Attributes: - typecode (str): The typecode of the array used to store the FST. - itemsize (int): The size of each item in the array. - - Note: - The FST (Finite State Transducer) is a data structure used for efficient string matching and lookup operations. - The typecode specifies the type of elements stored in the FST array, such as 'i' for integers or 'f' for floats. - The itemsize is calculated based on the typecode and represents the size (in bytes) of each element in the array. - """ self.typecode = typecode self.itemsize = array(self.typecode).itemsize def is_valid(self, v): - """ - Check if a value is a valid array.array object. - - Args: - v (Any): The value to be checked. - - Returns: - bool: True if the value is a valid array.array object, False otherwise. - - Raises: - None - - Examples: - >>> a = array.array('i', [1, 2, 3]) - >>> is_valid(a) - True - - >>> b = [1, 2, 3] - >>> is_valid(b) - False - - This method checks if the given value is a valid array.array object. It returns True if the value is a valid array.array object with the same typecode as the current instance, and False otherwise. - """ return isinstance(v, array) and v.typecode == self.typecode @staticmethod def write(dbfile, v): - """Write an array.array object to a database file. - - Args: - dbfile (file): The file object representing the database file. - v (array.array): The array.array object to be written. - - Raises: - TypeError: If `dbfile` is not a file object. - TypeError: If `v` is not an array.array object. - - Notes: - - The `dbfile` should be opened in binary mode. - - The `v` array.array object should contain elements of a single type. - - Example: - >>> import array - >>> v = array.array('i', [1, 2, 3, 4, 5]) - >>> with open('data.db', 'wb') as dbfile: - ... write(dbfile, v) - """ dbfile.write(b(v.typecode)) dbfile.write_int(len(v)) dbfile.write_array(v) def read(self, dbfile): - """Read an array.array object from a database file. - - Args: - dbfile (file): The file object representing the database file. 
- - Returns: - array.array: The read array.array object. - - Raises: - ValueError: If the file object is not valid or the data cannot be read. - - Notes: - This method reads an array.array object from a database file. The file object - should be opened in binary mode. The method reads the typecode of the array, - the length of the array, and then reads the array data from the file. The - method returns the read array.array object. - - Example: - >>> with open('data.db', 'rb') as file: - ... fst = FST() - ... array_obj = fst.read(file) - ... print(array_obj) - """ typecode = u(dbfile.read(1)) length = dbfile.read_int() return dbfile.read_array(typecode, length) def skip(self, dbfile): - """ - Skip an array.array object in a database file. - - This method is used to skip over an array.array object in a database file. - It reads the length of the array from the file, and then seeks forward in the file - by multiplying the length with the item size. - - Args: - dbfile (file): The file object representing the database file. - - Raises: - ValueError: If the length read from the file is negative. - - Example: - Suppose you have a database file containing an array.array object. - You can use this method to skip over the array.array object in the file. - - >>> with open('database.db', 'rb') as dbfile: - ... skip_array(dbfile) - - """ length = dbfile.read_int() - if length < 0: - raise ValueError(f"Invalid length: {length}") - dbfile.seek(length * self.itemsize, 1) @staticmethod def to_bytes(v): - """Convert an array.array object to bytes. - - Args: - v (array.array): The array.array object to be converted. - - Returns: - bytes: The converted bytes. - - Raises: - TypeError: If the input is not an array.array object. - - Example: - >>> import array - >>> a = array.array('B', [1, 2, 3]) - >>> to_bytes(a) - b'\x01\x02\x03' - """ - return v.tobytes() @@ -832,33 +319,10 @@ class IntListValues(SequenceValues): """Stores lists of positive, increasing integers (that is, lists of integers where each number is >= 0 and each number is greater than or equal to the number that precedes it) in an FST. - - This class provides methods to write and read lists of integers to/from a database file. - - Usage: - To write a list of integers to a database file: - IntListValues.write(dbfile, v) - - To read a list of integers from a database file: - result = IntListValues.read(dbfile) - - To convert a list of integers to bytes: - bytes_data = IntListValues.to_bytes(v) """ @staticmethod def is_valid(v): - """Check if a given value is a valid list of positive, increasing integers. - - This function checks if the given value is a list or tuple of positive, increasing integers. - It returns True if the value is valid, and False otherwise. - - Args: - v (list or tuple): The value to check. - - Returns: - bool: True if the value is a valid list of positive, increasing integers, False otherwise. - """ if isinstance(v, (list, tuple)): if len(v) < 2: return True @@ -870,12 +334,6 @@ def is_valid(v): @staticmethod def write(dbfile, v): - """Write a list of positive, increasing integers to a database file. - - Args: - dbfile: The database file to write to. - v (list or tuple): The list of positive, increasing integers to write. - """ base = 0 dbfile.write_varint(len(v)) for x in v: @@ -886,14 +344,6 @@ def write(dbfile, v): @staticmethod def read(dbfile): - """Read a list of positive, increasing integers from a database file. - - Args: - dbfile: The database file to read from. 
- - Returns: - list: The list of positive, increasing integers read from the database file. - """ length = dbfile.read_varint() result = [] if length > 0: @@ -905,14 +355,6 @@ def read(dbfile): @staticmethod def to_bytes(v): - """Convert a list of positive, increasing integers to bytes. - - Args: - v (list or tuple): The list of positive, increasing integers to convert. - - Returns: - bytes: The bytes representation of the list of positive, increasing integers. - """ return b(repr(v)) @@ -926,48 +368,22 @@ class Node: """ def __init__(self, owner, address, accept=False): - """ - Initialize a Node object. - - Args: - owner (GraphReader): The owner of the node. - address (int): The address of the node. - accept (bool, optional): Whether the node is an accept state. Defaults to False. - """ self.owner = owner self.address = address self._edges = None self.accept = accept def __iter__(self): - """ - Iterate over the keys of the outgoing edges. - - Returns: - Iterator: An iterator over the keys of the outgoing edges. - """ if not self._edges: self._load() return self._edges.keys() def __contains__(self, key): - """ - Check if the node has an outgoing edge with the given key. - - Args: - key: The key of the outgoing edge. - - Returns: - bool: True if the node has an outgoing edge with the given key, False otherwise. - """ if self._edges is None: self._load() return key in self._edges def _load(self): - """ - Load the outgoing edges of the node. - """ owner = self.owner if self.address is None: d = {} @@ -979,52 +395,21 @@ def _load(self): self._edges = d def keys(self): - """ - Get the keys of the outgoing edges. - - Returns: - list: A list of the keys of the outgoing edges. - """ if self._edges is None: self._load() return self._edges.keys() def all_edges(self): - """ - Get all the outgoing edges. - - Returns: - dict: A dictionary containing all the outgoing edges. - """ if self._edges is None: self._load() return self._edges def edge(self, key): - """ - Get the node reached by following the outgoing edge with the given key. - - Args: - key: The key of the outgoing edge. - - Returns: - Node: The node reached by following the outgoing edge with the given key. - """ if self._edges is None: self._load() return self._edges[key] def flatten(self, sofar=emptybytes): - """ - Flatten the node and yield all the strings that can be formed by concatenating - the keys of the outgoing edges. - - Args: - sofar (bytes, optional): The prefix string formed so far. Defaults to emptybytes. - - Yields: - bytes: The strings that can be formed by concatenating the keys of the outgoing edges. - """ if self.accept: yield sofar for key in sorted(self): @@ -1032,102 +417,38 @@ def flatten(self, sofar=emptybytes): yield from node.flatten(sofar + key) def flatten_strings(self): - """ - Flatten the node and yield all the strings that can be formed by concatenating - the keys of the outgoing edges. - - Yields: - str: The strings that can be formed by concatenating the keys of the outgoing edges. - """ return (utf8decode(k)[0] for k in self.flatten()) class ComboNode(Node): """Base class for nodes that blend the nodes of two different graphs. - This class serves as a base for nodes that combine the nodes of two different graphs. - Subclasses of ComboNode should implement the `edge()` method and may override the `accept` property. - - Attributes: - a (Node): The first node to be blended. - b (Node): The second node to be blended. 
+ Concrete subclasses need to implement the ``edge()`` method and possibly + override the ``accept`` property. """ def __init__(self, a, b): - """Initialize a new ComboNode. - - Args: - a (Node): The first node to be blended. - b (Node): The second node to be blended. - """ self.a = a self.b = b def __repr__(self): - """Return a string representation of the ComboNode. - - Returns: - str: A string representation of the ComboNode. - """ return f"<{self.__class__.__name__} {self.a!r} {self.b!r}>" def __contains__(self, key): - """Check if a key is present in the ComboNode. - - Args: - key: The key to check. - - Returns: - bool: True if the key is present in either `a` or `b`, False otherwise. - """ return key in self.a or key in self.b def __iter__(self): - """Iterate over the keys in the ComboNode. - - Returns: - iter: An iterator over the keys in the ComboNode. - """ return iter(set(self.a) | set(self.b)) @property def accept(self): - """Check if the ComboNode is an accept node. - - Returns: - bool: True if either `a` or `b` is an accept node, False otherwise. - """ return self.a.accept or self.b.accept class UnionNode(ComboNode): """Makes two graphs appear to be the union of the two graphs.""" - def __init__(self, a, b): - """ - Initialize a UnionNode with two graphs. - - Args: - a (Graph): The first graph. - b (Graph): The second graph. - """ - self.a = a - self.b = b - def edge(self, key): - """ - Get the edge for the given key. - - If the key is present in both graphs, returns a UnionNode with the edges from both graphs. - If the key is only present in the first graph, returns the edge from the first graph. - If the key is only present in the second graph, returns the edge from the second graph. - - Args: - key: The key to get the edge for. - - Returns: - UnionNode or Edge: The edge for the given key. - """ a = self.a b = self.b if key in a and key in b: @@ -1139,28 +460,9 @@ def edge(self, key): class IntersectionNode(ComboNode): - """Makes two graphs appear to be the intersection of the two graphs. - - This class represents a node in the intersection graph, which is created by taking the intersection of two graphs. - The intersection graph appears as if it contains only the common elements between the two original graphs. - - Attributes: - a (ComboNode): The first graph to be intersected. - b (ComboNode): The second graph to be intersected. - """ + """Makes two graphs appear to be the intersection of the two graphs.""" def edge(self, key): - """Returns the next node in the intersection graph for the given key. - - Args: - key: The key representing the edge to traverse. - - Returns: - IntersectionNode: The next node in the intersection graph for the given key. - - Raises: - KeyError: If the key is not present in both graphs. - """ a = self.a b = self.b if key in a and key in b: @@ -1174,63 +476,51 @@ class BaseCursor: """Base class for a cursor-type object for navigating an FST/word graph, represented by a :class:`GraphReader` object. - The cursor "rests" on arcs in the FSA/FST graph, rather than nodes. + >>> cur = GraphReader(dawgfile).cursor() + >>> for key in cur.follow(): + ... print(repr(key)) - Methods: - - is_active(): Returns True if this cursor is still active. - - label(): Returns the label bytes of the current arc. - - prefix(): Returns a sequence of the label bytes for the path from the root to the current arc. - - prefix_bytes(): Returns the label bytes for the path from the root to the current arc as a single joined bytes object. 
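# Illustrative sketch of navigating a word graph with a cursor; "dbfile" is
# assumed to be an open file containing a previously written graph, and the
# key "alfa" is made up.
cur = GraphReader(dbfile).cursor()
if cur.find_path("alfa"):           # follow the labels a, l, f, a from the root
    print(cur.prefix_string())      # labels followed so far, e.g. "alfa"
for key in cur.flatten_strings():   # remaining keys from the current position
    print(key)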
- - prefix_string(): Returns the labels of the path from the root to the current arc as a decoded unicode string. - - peek_key(): Returns a sequence of label bytes representing the next closest key in the graph. - - peek_key_bytes(): Returns the next closest key in the graph as a single bytes object. - - peek_key_string(): Returns the next closest key in the graph as a decoded unicode string. - - stopped(): Returns True if the current arc leads to a stop state. - - value(): Returns the value at the current arc, if reading an FST. - - accept(): Returns True if the current arc leads to an accept state. - - at_last_arc(): Returns True if the current arc is the last outgoing arc from the previous node. - - next_arc(): Moves to the next outgoing arc from the previous node. - - follow(): Follows the current arc. - - switch_to(label): Switches to the sibling arc with the given label bytes. - - skip_to(key): Moves the cursor to the path represented by the given key bytes. - - flatten(): Yields the keys in the graph, starting at the current position. - - flatten_v(): Yields (key, value) tuples in an FST, starting at the current position. - - flatten_strings(): Yields the keys in the graph as decoded unicode strings, starting at the current position. - - find_path(path): Follows the labels in the given path, starting at the current position. + The cursor "rests" on arcs in the FSA/FST graph, rather than nodes. """ def is_active(self): """Returns True if this cursor is still active, that is it has not read past the last arc in the graph. """ + raise NotImplementedError def label(self): """Returns the label bytes of the current arc.""" + raise NotImplementedError def prefix(self): """Returns a sequence of the label bytes for the path from the root to the current arc. """ + raise NotImplementedError def prefix_bytes(self): """Returns the label bytes for the path from the root to the current arc as a single joined bytes object. """ + return emptybytes.join(self.prefix()) def prefix_string(self): """Returns the labels of the path from the root to the current arc as a decoded unicode string. """ + return utf8decode(self.prefix_bytes())[0] def peek_key(self): """Returns a sequence of label bytes representing the next closest key in the graph. """ + yield from self.prefix() c = self.copy() while not c.stopped(): @@ -1239,44 +529,53 @@ def peek_key(self): def peek_key_bytes(self): """Returns the next closest key in the graph as a single bytes object.""" + return emptybytes.join(self.peek_key()) def peek_key_string(self): """Returns the next closest key in the graph as a decoded unicode string. """ + return utf8decode(self.peek_key_bytes())[0] def stopped(self): """Returns True if the current arc leads to a stop state.""" + raise NotImplementedError def value(self): """Returns the value at the current arc, if reading an FST.""" + raise NotImplementedError def accept(self): """Returns True if the current arc leads to an accept state (the end of a valid key). """ + raise NotImplementedError def at_last_arc(self): """Returns True if the current arc is the last outgoing arc from the previous node. 
""" + raise NotImplementedError def next_arc(self): """Moves to the next outgoing arc from the previous node.""" + raise NotImplementedError def follow(self): """Follows the current arc.""" + raise NotImplementedError def switch_to(self, label): """Switch to the sibling arc with the given label bytes.""" + _label = self.label _at_last_arc = self.at_last_arc _next_arc = self.next_arc @@ -1291,6 +590,7 @@ def switch_to(self, label): def skip_to(self, key): """Moves the cursor to the path represented by the given key bytes.""" + _accept = self.accept _prefix = self.prefix _next_arc = self.next_arc @@ -1307,6 +607,7 @@ def skip_to(self, key): def flatten(self): """Yields the keys in the graph, starting at the current position.""" + _is_active = self.is_active _accept = self.accept _stopped = self.stopped @@ -1328,17 +629,18 @@ def flatten_v(self): """Yields (key, value) tuples in an FST, starting at the current position. """ + for key in self.flatten(): yield key, self.value() def flatten_strings(self): - """Yields the keys in the graph as decoded unicode strings, starting at the current position.""" return (utf8decode(k)[0] for k in self.flatten()) def find_path(self, path): """Follows the labels in the given path, starting at the current position. """ + path = to_labels(path) _switch_to = self.switch_to _follow = self.follow @@ -1348,22 +650,17 @@ def find_path(self, path): for i, label in enumerate(path): if not first: _follow() - if not _switch_to(label) or (_stopped() and i < len(path) - 1): + if not _switch_to(label): return False + if _stopped(): + if i < len(path) - 1: + return False first = False return True class Cursor(BaseCursor): def __init__(self, graph, root=None, stack=None): - """ - Initializes a Cursor object. - - Args: - graph (Graph): The graph to navigate. - root (int, optional): The root node of the graph. Defaults to None. - stack (list, optional): The stack of arcs. Defaults to None. - """ self.graph = graph self.vtype = graph.vtype self.root = root if root is not None else graph.default_root() @@ -1372,87 +669,43 @@ def __init__(self, graph, root=None, stack=None): else: self.reset() - def is_active(self): - """ - Checks if the cursor is active. + def _current_attr(self, name): + stack = self.stack + if not stack: + raise InactiveCursor + return getattr(stack[-1], name) - Returns: - bool: True if the cursor is active, False otherwise. - """ + def is_active(self): return bool(self.stack) def stopped(self): - """ - Checks if the cursor has stopped. - - Returns: - bool: True if the cursor has stopped, False otherwise. - """ return self._current_attr("target") is None def accept(self): - """ - Checks if the cursor is in an accepting state. - - Returns: - bool: True if the cursor is in an accepting state, False otherwise. - """ return self._current_attr("accept") def at_last_arc(self): - """ - Checks if the cursor is at the last arc. - - Returns: - bool: True if the cursor is at the last arc, False otherwise. - """ return self._current_attr("lastarc") def label(self): - """ - Returns the label of the current arc. - - Returns: - object: The label of the current arc. - """ return self._current_attr("label") def reset(self): - """ - Resets the cursor to its initial state. - """ self.stack = [] self.sums = [None] self._push(self.graph.arc_at(self.root)) def copy(self): - """ - Creates a copy of the cursor. - - Returns: - Cursor: A copy of the cursor. 
- """ return self.__class__(self.graph, self.root, copy.deepcopy(self.stack)) def prefix(self): - """ - Returns the prefix labels of the current stack. - - Yields: - object: The prefix labels of the current stack. - """ stack = self.stack if not stack: raise InactiveCursor return (arc.label for arc in stack) + # Override: more efficient implementation using graph methods directly def peek_key(self): - """ - Returns an iterator over the labels of the current stack. - - Yields: - object: The labels of the current stack. - """ if not self.stack: raise InactiveCursor @@ -1464,18 +717,12 @@ def peek_key(self): yield arc.label def value(self): - """ - Returns the value associated with the current stack. - - Returns: - object: The value associated with the current stack. - """ stack = self.stack if not stack: raise InactiveCursor vtype = self.vtype if not vtype: - raise ValueError("No value type") + raise Exception("No value type") v = self.sums[-1] current = stack[-1] @@ -1486,12 +733,6 @@ def value(self): return v def next_arc(self): - """ - Moves the cursor to the next arc. - - Returns: - Arc: The next arc. - """ stack = self.stack if not stack: raise InactiveCursor @@ -1504,25 +745,14 @@ def next_arc(self): return current def follow(self): - """ - Follows the target arc. - - Returns: - Cursor: The updated cursor. - """ address = self._current_attr("target") if address is None: raise Exception("Can't follow a stop arc") self._push(self.graph.arc_at(address)) return self + # Override: more efficient implementation manipulating the stack def skip_to(self, key): - """ - Skips to the specified key. - - Args: - key (list): The key to skip to. - """ key = to_labels(key) stack = self.stack if not stack: @@ -1543,16 +773,8 @@ def skip_to(self, key): else: _next_arc() + # Override: more efficient implementation using find_arc def switch_to(self, label): - """ - Switches to the specified label. - - Args: - label (object): The label to switch to. - - Returns: - bool: True if the switch was successful, False otherwise. - """ stack = self.stack if not stack: raise InactiveCursor @@ -1571,9 +793,6 @@ def _push(self, arc): self.stack.append(arc) def pop(self): - """ - Pops the top arc from the stack. - """ self.stack.pop() if self.vtype: self.sums.pop() @@ -1597,63 +816,26 @@ def _pop_to_prefix(self, key): class UncompiledNode: - """ - Represents an "in-memory" node used by the GraphWriter before it is written to disk. - """ + # Represents an "in-memory" node used by the GraphWriter before it is + # written to disk. compiled = False def __init__(self, owner): - """ - Initializes a new instance of the UncompiledNode class. - - Parameters: - - owner: The owner of the node. - - Returns: - None - """ self.owner = owner self._digest = None self.clear() def clear(self): - """ - Clears the node by resetting its arcs, value, accept flag, and input count. - - Parameters: - None - - Returns: - None - """ self.arcs = [] self.value = None self.accept = False self.inputcount = 0 def __repr__(self): - """ - Returns a string representation of the node. - - Parameters: - None - - Returns: - str: The string representation of the node. - """ return f"<{[(a.label, a.value) for a in self.arcs]!r}>" def digest(self): - """ - Calculates and returns the digest of the node. - - Parameters: - None - - Returns: - bytes: The digest of the node. - """ if self._digest is None: d = sha1() vtype = self.owner.vtype @@ -1671,56 +853,16 @@ def digest(self): return self._digest def edges(self): - """ - Returns the arcs of the node. 
- - Parameters: - None - - Returns: - list: The arcs of the node. - """ return self.arcs def last_value(self, label): - """ - Returns the value of the last arc with the specified label. - - Parameters: - - label: The label of the arc. - - Returns: - object: The value of the last arc with the specified label. - """ assert self.arcs[-1].label == label return self.arcs[-1].value def add_arc(self, label, target): - """ - Adds a new arc to the node with the specified label and target. - - Parameters: - - label: The label of the arc. - - target: The target of the arc. - - Returns: - None - """ self.arcs.append(Arc(label, target)) def replace_last(self, label, target, accept, acceptval=None): - """ - Replaces the last arc with the specified label, target, accept flag, and accept value. - - Parameters: - - label: The label of the arc. - - target: The target of the arc. - - accept: The accept flag of the arc. - - acceptval: The accept value of the arc. - - Returns: - None - """ arc = self.arcs[-1] assert arc.label == label, f"{arc.label!r} != {label!r}" arc.target = target @@ -1728,45 +870,16 @@ def replace_last(self, label, target, accept, acceptval=None): arc.acceptval = acceptval def delete_last(self, label, target): - """ - Deletes the last arc with the specified label and target. - - Parameters: - - label: The label of the arc. - - target: The target of the arc. - - Returns: - None - """ arc = self.arcs.pop() assert arc.label == label assert arc.target == target def set_last_value(self, label, value): - """ - Sets the value of the last arc with the specified label. - - Parameters: - - label: The label of the arc. - - value: The value to set. - - Returns: - None - """ arc = self.arcs[-1] assert arc.label == label, f"{arc.label!r}->{label!r}" arc.value = value def prepend_value(self, prefix): - """ - Prepends the specified prefix to the values of all arcs and the node's value. - - Parameters: - - prefix: The prefix to prepend. - - Returns: - None - """ add = self.owner.vtype.add for arc in self.arcs: arc.value = add(prefix, arc.value) @@ -1778,21 +891,8 @@ class Arc: """ Represents a directed arc between two nodes in an FSA/FST graph. - Attributes: - label (bytes): The label bytes for this arc. For a word graph, this will be a character. - target (int): The address of the node at the endpoint of this arc. - value: The inner FST value at the endpoint of this arc. - accept (bool): Whether the endpoint of this arc is an accept state (e.g. the end of a valid word). - acceptval: If the endpoint of this arc is an accept state, the final FST value for that accepted state. - lastarc: True if this is the last outgoing arc from the previous node. - endpos: The end position of the arc. - - Methods: - __init__: Initializes a new instance of the Arc class. - __repr__: Returns a string representation of the Arc object. - __eq__: Compares two Arc objects for equality. - copy: Creates a copy of the Arc object. - + The ``lastarc`` attribute is True if this is the last outgoing arc from the + previous node. """ __slots__ = ("label", "target", "accept", "value", "lastarc", "acceptval", "endpos") @@ -1808,16 +908,14 @@ def __init__( endpos=None, ): """ - Initializes a new instance of the Arc class. - - Args: - label (bytes, optional): The label bytes for this arc. For a word graph, this will be a character. - target (int, optional): The address of the node at the endpoint of this arc. - value (optional): The inner FST value at the endpoint of this arc. 
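# Illustrative sketch: an Arc is a plain record of one transition in the graph.
# Here b"a" is the label and 42 is a hypothetical target node address.
arc = Arc(label=b"a", target=42, accept=True)
print(arc)                 # -> <b'a'-42 A>
assert arc == arc.copy()   # copy() duplicates every slot, so equality holds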
- accept (bool, optional): Whether the endpoint of this arc is an accept state (e.g. the end of a valid word). - acceptval (optional): If the endpoint of this arc is an accept state, the final FST value for that accepted state. - lastarc (optional): True if this is the last outgoing arc from the previous node. - endpos (optional): The end position of the arc. + :param label: The label bytes for this arc. For a word graph, this will + be a character. + :param target: The address of the node at the endpoint of this arc. + :param value: The inner FST value at the endpoint of this arc. + :param accept: Whether the endpoint of this arc is an accept state + (e.g. the end of a valid word). + :param acceptval: If the endpoint of this arc is an accept state, the + final FST value for that accepted state. """ self.label = label @@ -1829,12 +927,6 @@ def __init__( self.endpos = endpos def __repr__(self): - """ - Returns a string representation of the Arc object. - - Returns: - str: A string representation of the Arc object. - """ return "<{!r}-{} {}{}>".format( self.label, self.target, @@ -1843,15 +935,6 @@ def __repr__(self): ) def __eq__(self, other): - """ - Compares two Arc objects for equality. - - Args: - other (Arc): The other Arc object to compare. - - Returns: - bool: True if the two Arc objects are equal, False otherwise. - """ if ( isinstance(other, self.__class__) and self.accept == other.accept @@ -1864,12 +947,6 @@ def __eq__(self, other): return False def copy(self): - """ - Creates a copy of the Arc object. - - Returns: - Arc: A copy of the Arc object. - """ # This is faster than using the copy module return Arc( label=self.label, @@ -1888,11 +965,10 @@ def copy(self): class GraphWriter: """Writes an FSA/FST graph to disk. - The GraphWriter class is used to write an FSA/FST graph to disk. It provides - methods for inserting keys into the graph, starting and finishing fields, - and closing the graph. + Call ``insert(key)`` to insert keys into the graph. You must + insert keys in sorted order. Call ``close()`` to finish the graph and close + the file. - Usage: >>> gw = GraphWriter(my_file) >>> gw.insert("alfa") >>> gw.insert("bravo") @@ -1902,7 +978,6 @@ class GraphWriter: The graph writer can write separate graphs for multiple fields. Use ``start_field(name)`` and ``finish_field()`` to separate fields. - Usage: >>> gw = GraphWriter(my_file) >>> gw.start_field("content") >>> gw.insert("alfalfa") @@ -1912,31 +987,17 @@ class GraphWriter: >>> gw.insert("artichoke") >>> gw.finish_field() >>> gw.close() - - Attributes: - version (int): The version number of the graph writer. - - Args: - dbfile (file): The file to write the graph to. - vtype (class, optional): A class to use for storing values. Defaults to None. - merge (function, optional): A function that merges two values. Defaults to None. - - Raises: - ValueError: If the field name is equivalent to False. - Exception: If finish_field() is called before start_field(). - """ version = 1 def __init__(self, dbfile, vtype=None, merge=None): """ - Initializes a new instance of the GraphWriter class. - - Args: - dbfile (file): The file to write the graph to. - vtype (class, optional): A class to use for storing values. Defaults to None. - merge (function, optional): A function that merges two values. Defaults to None. + :param dbfile: the file to write to. + :param vtype: a :class:`Values` class to use for storing values. This + is only necessary if you will be storing values for the keys. 
+ :param merge: a function that takes two values and returns a single + value. This is called if you insert two identical keys with values. """ self.dbfile = dbfile @@ -1954,16 +1015,7 @@ def __init__(self, dbfile, vtype=None, merge=None): self._infield = False def start_field(self, fieldname): - """ - Starts a new graph for the given field. - - Args: - fieldname (str): The name of the field. - - Raises: - ValueError: If the field name is equivalent to False. - Exception: If start_field() is called while already in a field. - """ + """Starts a new graph for the given field.""" if not fieldname: raise ValueError("Field name cannot be equivalent to False") @@ -1977,12 +1029,7 @@ def start_field(self, fieldname): self._infield = True def finish_field(self): - """ - Finishes the graph for the current field. - - Raises: - Exception: If finish_field() is called before start_field(). - """ + """Finishes the graph for the current field.""" if not self._infield: raise Exception("Called finish_field before start_field") @@ -1992,9 +1039,7 @@ def finish_field(self): self.fieldname = None def close(self): - """ - Finishes the current graph and closes the underlying file. - """ + """Finishes the current graph and closes the underlying file.""" if self.fieldname is not None: self.finish_field() @@ -2007,17 +1052,12 @@ def close(self): dbfile.close() def insert(self, key, value=None): - """ - Inserts the given key into the graph. - - Args: - key (bytes, str): The key to insert into the graph. - value (object, optional): The value to encode in the graph along with the key. Defaults to None. + """Inserts the given key into the graph. - Raises: - Exception: If insert() is called before starting a field. - KeyError: If the key is null or out of order. - ValueError: If the value is not valid for the value type. + :param key: a sequence of bytes objects, a bytes object, or a string. + :param value: an optional value to encode in the graph along with the + key. If the writer was not instantiated with a value type, passing + a value here will raise an error. """ if not self._infield: @@ -2202,102 +1242,24 @@ def _write_node(self, uncnode): class BaseGraphReader: - """Base class for reading graph data structures.""" - def cursor(self, rootname=None): - """ - Returns a cursor object for traversing the graph. - - Args: - rootname (str, optional): The name of the root node. Defaults to None. - - Returns: - Cursor: A cursor object. - - """ return Cursor(self, self.root(rootname)) def has_root(self, rootname): - """ - Checks if the graph has a root node with the given name. - - Args: - rootname (str): The name of the root node. - - Returns: - bool: True if the root node exists, False otherwise. - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ raise NotImplementedError def root(self, rootname=None): - """ - Returns the root node of the graph. - - Args: - rootname (str, optional): The name of the root node. Defaults to None. - - Returns: - Node: The root node. - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ raise NotImplementedError # Low level methods def arc_at(self, address, arc): - """ - Retrieves the arc at the given address. - - Args: - address (int): The address of the arc. - arc (Arc): An arc object to store the retrieved arc. - - Raises: - NotImplementedError: This method must be implemented by subclasses. 
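# Illustrative write/read round trip, assuming "st" is a whoosh storage object
# (for example a FileStorage); the file name is a placeholder.
gw = GraphWriter(st.create_file("keys.fst"))
for key in ("alfa", "bravo", "charlie"):    # keys must be inserted in sorted order
    gw.insert(key)
gw.close()
gr = GraphReader(st.open_file("keys.fst"))
print(list(gr.cursor().flatten_strings()))  # -> ["alfa", "bravo", "charlie"]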
- - """ raise NotImplementedError def iter_arcs(self, address, arc=None): - """ - Iterates over the arcs starting from the given address. - - Args: - address (int): The starting address. - arc (Arc, optional): An arc object to store each iterated arc. Defaults to None. - - Yields: - Arc: The iterated arcs. - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ raise NotImplementedError def find_arc(self, address, label, arc=None): - """ - Finds the arc with the given label starting from the given address. - - Args: - address (int): The starting address. - label (str): The label of the arc to find. - arc (Arc, optional): An arc object to store the found arc. Defaults to None. - - Returns: - Arc: The found arc, or None if not found. - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ arc = arc or Arc() for arc in self.iter_arcs(address, arc): thislabel = arc.label @@ -2309,44 +1271,12 @@ def find_arc(self, address, label, arc=None): # Convenience methods def list_arcs(self, address): - """ - Returns a list of arcs starting from the given address. - - Args: - address (int): The starting address. - - Returns: - list: A list of arcs. - - """ return [arc.copy() for arc in self.iter_arcs(address)] def arc_dict(self, address): - """ - Returns a dictionary of arcs starting from the given address. - - Args: - address (int): The starting address. - - Returns: - dict: A dictionary of arcs, where the keys are the arc labels. - - """ return {arc.label: arc.copy() for arc in self.iter_arcs(address)} def find_path(self, path, arc=None, address=None): - """ - Finds a path in the graph based on a sequence of labels. - - Args: - path (list): A list of labels representing the path. - arc (Arc, optional): An arc object to store the found arc. Defaults to None. - address (int, optional): The starting address. Defaults to None. - - Returns: - Arc: The arc at the end of the path, or None if the path is not found. - - """ path = to_labels(path) if arc: @@ -2368,28 +1298,6 @@ def find_path(self, path, arc=None, address=None): class GraphReader(BaseGraphReader): - """ - A class for reading graph data from a database file. - - Args: - dbfile (file-like object): The database file to read from. - rootname (str, optional): The name of the root node. If not provided and there is only one root, it will be used automatically. Defaults to None. - vtype (object, optional): The type of values associated with the arcs. Defaults to None. - filebase (int, optional): The base offset in the file where the graph data starts. Defaults to 0. - - Attributes: - dbfile (file-like object): The database file being read. - vtype (object): The type of values associated with the arcs. - filebase (int): The base offset in the file where the graph data starts. - version (int): The version of the graph data. - roots (dict): A dictionary of root nodes in the graph. - _root (object): The current root node. - - Raises: - FileVersionError: If the database file has an invalid version. - - """ - def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): self.dbfile = dbfile self.vtype = vtype @@ -2412,79 +1320,28 @@ def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): self._root = self.root(rootname) def close(self): - """ - Close the database file. - - """ self.dbfile.close() - def has_root(self, rootname): - """ - Check if a root node with the given name exists in the graph. - - Args: - rootname (str): The name of the root node. 
- - Returns: - bool: True if the root node exists, False otherwise. + # Overrides - """ + def has_root(self, rootname): return rootname in self.roots def root(self, rootname=None): - """ - Get the root node of the graph. - - Args: - rootname (str, optional): The name of the root node. If not provided, returns the current root node. - - Returns: - object: The root node. - - """ if rootname is None: return self._root else: return self.roots[rootname] def default_root(self): - """ - Get the default root node of the graph. - - Returns: - object: The default root node. - - """ return self._root def arc_at(self, address, arc=None): - """ - Get the arc at the specified address in the graph. - - Args: - address (int): The address of the arc. - arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. - - Returns: - Arc: The arc at the specified address. - - """ arc = arc or Arc() self.dbfile.seek(address) return self._read_arc(arc) def iter_arcs(self, address, arc=None): - """ - Iterate over the arcs starting from the specified address in the graph. - - Args: - address (int): The address of the first arc. - arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. - - Yields: - Arc: The arcs in the graph. - - """ arc = arc or Arc() _read_arc = self._read_arc @@ -2496,18 +1353,6 @@ def iter_arcs(self, address, arc=None): break def find_arc(self, address, label, arc=None): - """ - Find the arc with the specified label starting from the specified address in the graph. - - Args: - address (int): The address of the first arc. - label (bytes): The label of the arc. - arc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. - - Returns: - Arc: The arc with the specified label, or None if not found. - - """ # Overrides the default scanning implementation arc = arc or Arc() @@ -2526,17 +1371,9 @@ def find_arc(self, address, label, arc=None): # search method return BaseGraphReader.find_arc(self, address, label, arc) - def _read_arc(self, toarc=None): - """ - Read an arc from the database file. - - Args: - toarc (Arc, optional): An instance of the Arc class to store the arc data. If not provided, a new Arc instance will be created. - - Returns: - Arc: The arc read from the database file. + # Implementations - """ + def _read_arc(self, toarc=None): toarc = toarc or Arc() dbfile = self.dbfile flags = dbfile.read_byte() @@ -2549,16 +1386,6 @@ def _read_arc(self, toarc=None): return self._read_arc_data(flags, toarc) def _read_label(self, flags): - """ - Read the label of an arc from the database file. - - Args: - flags (int): The flags indicating the label type. - - Returns: - bytes: The label of the arc. - - """ dbfile = self.dbfile if flags & MULTIBYTE_LABEL: length = dbfile.read_varint() @@ -2568,13 +1395,6 @@ def _read_label(self, flags): return label def _read_fixed_info(self): - """ - Read the fixed size information from the database file. - - Returns: - tuple: A tuple containing the size and count of the fixed size records, or None if not applicable. - - """ dbfile = self.dbfile flags = dbfile.read_byte() @@ -2586,17 +1406,6 @@ def _read_fixed_info(self): return None def _read_arc_data(self, flags, arc): - """ - Read the data of an arc from the database file. - - Args: - flags (int): The flags indicating the arc properties. - arc (Arc): An instance of the Arc class to store the arc data. 
- - Returns: - Arc: The arc with the data read from the database file. - - """ dbfile = self.dbfile accept = arc.accept = bool(flags & ARC_ACCEPT) arc.lastarc = flags & ARC_LAST @@ -2614,20 +1423,6 @@ def _read_arc_data(self, flags, arc): return arc def _binary_search(self, address, size, count, label, arc): - """ - Perform a binary search to find the arc with the specified label. - - Args: - address (int): The address of the first arc. - size (int): The size of each arc record. - count (int): The number of arcs. - label (bytes): The label of the arc to find. - arc (Arc): An instance of the Arc class to store the arc data. - - Returns: - Arc: The arc with the specified label, or None if not found. - - """ dbfile = self.dbfile _read_label = self._read_label @@ -2651,22 +1446,8 @@ def _binary_search(self, address, size, count, label, arc): def to_labels(key): - """ - Takes a string and returns a list of bytestrings, suitable for use as + """Takes a string and returns a list of bytestrings, suitable for use as a key or path in an FSA/FST graph. - - Args: - key (str or bytes or list or tuple): The input string. - - Returns: - tuple: A tuple of bytestrings representing the input string. - - Raises: - TypeError: If the input contains a non-bytestring. - - Example: - >>> to_labels('hello') - (b'h', b'e', b'l', b'l', b'o') """ # Convert to tuples of bytestrings (must be tuples so they can be hashed) @@ -2691,22 +1472,11 @@ def to_labels(key): def within(graph, text, k=1, prefix=0, address=None): - """ - Yields a series of keys in the given graph within ``k`` edit distance of + """Yields a series of keys in the given graph within ``k`` edit distance of ``text``. If ``prefix`` is greater than 0, all keys must match the first ``prefix`` characters of ``text``. - - Args: - graph (Graph): The graph to search within. - text (str): The text to search for. - k (int, optional): The maximum edit distance allowed. Defaults to 1. - prefix (int, optional): The number of characters that must match at the beginning of the keys. Defaults to 0. - address (int, optional): The starting address in the graph. Defaults to None. - - Yields: - str: A key within the specified edit distance of the text. - """ + text = to_labels(text) if address is None: address = graph._root @@ -2787,19 +1557,6 @@ def within(graph, text, k=1, prefix=0, address=None): def dump_graph(graph, address=None, tab=0, out=None): - """ - Dump the graph structure starting from the given address. - - Args: - graph (Graph): The graph object. - address (int, optional): The address to start dumping from. If not provided, the root address of the graph will be used. - tab (int, optional): The number of tabs to indent the output. Defaults to 0. - out (file-like object, optional): The output stream to write the dumped graph. Defaults to sys.stdout. - - Returns: - None - - """ if address is None: address = graph._root if out is None: diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py index a5b31018..c41074c7 100644 --- a/src/whoosh/automata/glob.py +++ b/src/whoosh/automata/glob.py @@ -38,25 +38,6 @@ def parse_glob( pattern, _glob_multi="*", _glob_single="?", _glob_range1="[", _glob_range2="]" ): - """ - Parse a glob pattern and generate tokens representing the pattern. - - Args: - pattern (str): The glob pattern to parse. - _glob_multi (str, optional): The character representing multiple wildcard. Defaults to "*". - _glob_single (str, optional): The character representing single wildcard. Defaults to "?". 
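# Illustrative sketch: parse_glob() yields (token_type, payload) pairs, which
# glob_automaton() below turns into NFA transitions; the pattern is made up.
for op, arg in parse_glob("wh?osh*"):
    print(op, arg)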
- _glob_range1 (str, optional): The character representing the start of a character range. Defaults to "[". - _glob_range2 (str, optional): The character representing the end of a character range. Defaults to "]". - - Yields: - tuple: A tuple containing the token type and additional information. - The token types are: - - _STAR: Represents the multiple wildcard. - - _QUEST: Represents the single wildcard. - - _RANGE: Represents a character range. - - _LIT: Represents a literal character. - - """ pos = 0 last = None while pos < len(pattern): @@ -91,21 +72,6 @@ def parse_glob( def glob_automaton(pattern): - """ - Constructs a non-deterministic finite automaton (NFA) from a glob pattern. - - Args: - pattern (str): The glob pattern to convert into an NFA. - - Returns: - NFA: The constructed NFA. - - Raises: - None. - - Examples: - >>> nfa = glob_automaton("*.txt") - """ nfa = NFA(0) i = -1 for i, (op, arg) in enumerate(parse_glob(pattern)): diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py index 53437f87..08317edd 100644 --- a/src/whoosh/automata/lev.py +++ b/src/whoosh/automata/lev.py @@ -2,18 +2,6 @@ def levenshtein_automaton(term, k, prefix=0): - """ - Generate a Levenshtein automaton for a given term and maximum edit distance. - - Args: - term (str): The term to generate the automaton for. - k (int): The maximum edit distance allowed. - prefix (int, optional): The length of the prefix to match exactly. Defaults to 0. - - Returns: - NFA: The generated Levenshtein automaton. - - """ nfa = NFA((0, 0)) if prefix: for i in range(prefix): diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py index b3a032ce..54a5ecf6 100644 --- a/src/whoosh/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -33,59 +33,19 @@ def parse(pattern): - """ - Parses a regular expression pattern and returns a parsed representation. - - Args: - pattern (str): The regular expression pattern to parse. - - Returns: - list: A list representing the parsed regular expression pattern. - - Example: - >>> parse("ab*c") - ['a', ('b', '*'), 'c'] - """ stack = [] ops = [] class RegexBuilder: - """ - A class for building regular expressions using a simplified NFA representation. - - This class provides methods for constructing various components of a regular expression, - such as epsilon, character, charset, dot, choice, concatenation, star, plus, and question. - - Usage: - rb = RegexBuilder() - nfa = rb.char('a') # Create an NFA for the character 'a' - nfa2 = rb.concat(nfa, rb.char('b')) # Concatenate two NFAs - """ - def __init__(self): - """ - Initialize the RegexBuilder object. - """ self.statenum = 1 def new_state(self): - """ - Generate a new state number. - - Returns: - int: The new state number. - """ self.statenum += 1 return self.statenum def epsilon(self): - """ - Create an NFA for the epsilon transition. - - Returns: - NFA: The NFA representing the epsilon transition. - """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -94,15 +54,6 @@ def epsilon(self): return nfa def char(self, label): - """ - Create an NFA for a single character. - - Args: - label (str): The character label. - - Returns: - NFA: The NFA representing the character. - """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -111,15 +62,6 @@ def char(self, label): return nfa def charset(self, chars): - """ - Create an NFA for a character set. - - Args: - chars (str): The characters in the set. - - Returns: - NFA: The NFA representing the character set. 
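# Illustrative sketch: the builder composes NFAs Thompson-style, so a pattern
# like "ab*" can be assembled from the char/star/concat combinators.
rb = RegexBuilder()
nfa = rb.concat(rb.char("a"), rb.star(rb.char("b")))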
- """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -129,12 +71,6 @@ def charset(self, chars): return e def dot(self): - """ - Create an NFA for the dot (matches any character). - - Returns: - NFA: The NFA representing the dot. - """ s = self.new_state() e = self.new_state() nfa = NFA(s) @@ -143,16 +79,6 @@ def dot(self): return nfa def choice(self, n1, n2): - """ - Create an NFA for the choice (|) operator. - - Args: - n1 (NFA): The first NFA. - n2 (NFA): The second NFA. - - Returns: - NFA: The NFA representing the choice operator. - """ s = self.new_state() s1 = self.new_state() s2 = self.new_state() @@ -170,16 +96,6 @@ def choice(self, n1, n2): return nfa def concat(self, n1, n2): - """ - Create an NFA for the concatenation operator. - - Args: - n1 (NFA): The first NFA. - n2 (NFA): The second NFA. - - Returns: - NFA: The NFA representing the concatenation operator. - """ s = self.new_state() m = self.new_state() e = self.new_state() @@ -190,15 +106,6 @@ def concat(self, n1, n2): return nfa def star(self, n): - """ - Create an NFA for the Kleene star (*) operator. - - Args: - n (NFA): The NFA to apply the star operator to. - - Returns: - NFA: The NFA representing the star operator. - """ s = self.new_state() m1 = self.new_state() m2 = self.new_state() @@ -213,25 +120,7 @@ def star(self, n): return nfa def plus(self, n): - """ - Create an NFA for the plus (+) operator. - - Args: - n (NFA): The NFA to apply the plus operator to. - - Returns: - NFA: The NFA representing the plus operator. - """ return self.concat(n, self.star(n)) def question(self, n): - """ - Create an NFA for the question mark (?) operator. - - Args: - n (NFA): The NFA to apply the question mark operator to. - - Returns: - NFA: The NFA representing the question mark operator. - """ return self.choice(n, self.epsilon()) diff --git a/src/whoosh/codec/__init__.py b/src/whoosh/codec/__init__.py index 3fc048f5..70445636 100644 --- a/src/whoosh/codec/__init__.py +++ b/src/whoosh/codec/__init__.py @@ -27,21 +27,6 @@ def default_codec(*args, **kwargs): - """ - Returns the default codec for Whoosh. - - This function imports and returns the W3Codec class from the whoosh.codec.whoosh3 module. - - Parameters: - *args: positional arguments to be passed to the W3Codec constructor. - **kwargs: keyword arguments to be passed to the W3Codec constructor. - - Returns: - W3Codec: an instance of the W3Codec class. - - Example: - codec = default_codec() - """ from whoosh.codec.whoosh3 import W3Codec return W3Codec(*args, **kwargs) diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py index 32ba38a4..e360ff52 100644 --- a/src/whoosh/codec/base.py +++ b/src/whoosh/codec/base.py @@ -42,17 +42,6 @@ class OutOfOrderError(Exception): - """ - Exception raised when encountering out-of-order data during decoding. - - This exception is raised when the codec encounters data that is out of order - during the decoding process. It typically indicates a corruption or - inconsistency in the data being decoded. - - Attributes: - message -- explanation of the error - """ - pass @@ -60,314 +49,79 @@ class OutOfOrderError(Exception): class Codec: - """ - The base class for defining codecs in Whoosh. - - A codec is responsible for defining how data is stored and retrieved from the index. - It provides implementations for various operations such as per-document value writing, - inverted index writing, postings writing and reading, index readers, and segment and - generation management. 
- - Subclasses of Codec should implement the abstract methods to provide the specific - functionality required by the codec. - - Attributes: - length_stats (bool): Indicates whether length statistics should be enabled for the codec. - - """ - length_stats = True + # Per document value writer + @abstractmethod def per_document_writer(self, storage, segment): - """ - Returns a per-document value writer for the given storage and segment. - - Args: - storage (Storage): The storage object for the index. - segment (Segment): The segment object representing a portion of the index. - - Returns: - PerDocumentWriter: The per-document value writer. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError + # Inverted index writer + @abstractmethod def field_writer(self, storage, segment): - """ - Returns an inverted index writer for the given storage and segment. - - Args: - storage (Storage): The storage object for the index. - segment (Segment): The segment object representing a portion of the index. - - Returns: - FieldWriter: The inverted index writer. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError + # Postings + @abstractmethod def postings_writer(self, dbfile, byteids=False): - """ - Returns a postings writer for the given database file. - - Args: - dbfile (File): The file object representing the database file. - byteids (bool, optional): Indicates whether the postings should be written using byte IDs. - - Returns: - PostingsWriter: The postings writer. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError @abstractmethod def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): - """ - Returns a postings reader for the given database file. - - Args: - dbfile (File): The file object representing the database file. - terminfo (TermInfo): The term information object. - format_ (str): The format of the postings. - term (Term, optional): The term to read the postings for. - scorer (Scorer, optional): The scorer object for scoring the postings. - - Returns: - PostingsReader: The postings reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError - def automata(self, storage, segment): - """ - Returns an automata object for the given storage and segment. - - Args: - storage (Storage): The storage object for the index. - segment (Segment): The segment object representing a portion of the index. - - Returns: - Automata: The automata object. - - """ + # Index readers + def automata(self, storage, segment): _ = storage, segment # Unused arguments return Automata() @abstractmethod def terms_reader(self, storage, segment): - """ - Returns a terms reader for the given storage and segment. - - Args: - storage (Storage): The storage object for the index. - segment (Segment): The segment object representing a portion of the index. - - Returns: - TermsReader: The terms reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError @abstractmethod def per_document_reader(self, storage, segment): - """ - Returns a per-document value reader for the given storage and segment. - - Args: - storage (Storage): The storage object for the index. - segment (Segment): The segment object representing a portion of the index. 
- - Returns: - PerDocumentReader: The per-document value reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError + # Segments and generations + @abstractmethod def new_segment(self, storage, indexname): - """ - Creates a new segment for the given storage and index name. - - Args: - storage (Storage): The storage object for the index. - indexname (str): The name of the index. - - Returns: - Segment: The new segment. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ - raise NotImplementedError class WrappingCodec(Codec): - """ - A codec that wraps another codec. - - This codec delegates all the method calls to the wrapped codec. - It can be used to extend or modify the behavior of an existing codec. - - Parameters: - - child (Codec): The codec to be wrapped. - - """ - def __init__(self, child): - """ - Initializes a new instance of the WrappingCodec class. - - Parameters: - - child (Codec): The codec to be wrapped. - - """ self._child = child def per_document_writer(self, storage, segment): - """ - Returns a per-document writer for the given storage and segment. - - Parameters: - - storage (Storage): The storage object. - - segment (Segment): The segment object. - - Returns: - - PerDocumentWriter: The per-document writer. - - """ return self._child.per_document_writer(storage, segment) def field_writer(self, storage, segment): - """ - Returns a field writer for the given storage and segment. - - Parameters: - - storage (Storage): The storage object. - - segment (Segment): The segment object. - - Returns: - - FieldWriter: The field writer. - - """ return self._child.field_writer(storage, segment) def postings_writer(self, dbfile, byteids=False): - """ - Returns a postings writer for the given dbfile. - - Parameters: - - dbfile (DBFile): The dbfile object. - - byteids (bool): Whether to use byteids. - - Returns: - - PostingsWriter: The postings writer. - - """ return self._child.postings_writer(dbfile, byteids=byteids) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): - """ - Returns a postings reader for the given dbfile, terminfo, format, term, and scorer. - - Parameters: - - dbfile (DBFile): The dbfile object. - - terminfo (TermInfo): The terminfo object. - - format_ (str): The format. - - term (Term): The term object. - - scorer (Scorer): The scorer object. - - Returns: - - PostingsReader: The postings reader. - - """ return self._child.postings_reader( dbfile, terminfo, format_, term=term, scorer=scorer ) def automata(self, storage, segment): - """ - Returns an automata object for the given storage and segment. - - Parameters: - - storage (Storage): The storage object. - - segment (Segment): The segment object. - - Returns: - - Automata: The automata object. - - """ return self._child.automata(storage, segment) def terms_reader(self, storage, segment): - """ - Returns a terms reader for the given storage and segment. - - Parameters: - - storage (Storage): The storage object. - - segment (Segment): The segment object. - - Returns: - - TermsReader: The terms reader. - - """ return self._child.terms_reader(storage, segment) def per_document_reader(self, storage, segment): - """ - Returns a per-document reader for the given storage and segment. - - Parameters: - - storage (Storage): The storage object. - - segment (Segment): The segment object. - - Returns: - - PerDocumentReader: The per-document reader. 
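# Illustrative sketch: a WrappingCodec subclass overrides only what it needs to
# change and lets every other call fall through to the wrapped codec.
class VerboseCodec(WrappingCodec):
    def new_segment(self, storage, indexname):
        segment = self._child.new_segment(storage, indexname)
        print("created segment", segment)
        return segment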
- - """ return self._child.per_document_reader(storage, segment) def new_segment(self, storage, indexname): - """ - Returns a new segment for the given storage and indexname. - - Parameters: - - storage (Storage): The storage object. - - indexname (str): The indexname. - - Returns: - - Segment: The new segment. - - """ return self._child.new_segment(storage, indexname) @@ -375,125 +129,23 @@ def new_segment(self, storage, indexname): class PerDocumentWriter: - """ - The PerDocumentWriter class is an abstract base class that defines the interface for writing per-document data - during the indexing process. - - Subclasses of PerDocumentWriter must implement the following methods: - - start_doc(docnum): Called at the beginning of writing a new document. - - add_field(fieldname, fieldobj, value, length): Called to add a field and its value to the document. - - add_column_value(fieldname, columnobj, value): Called to add a column value to the document. - - add_vector_items(fieldname, fieldobj, items): Called to add vector items to the document. - - The PerDocumentWriter class also provides default implementations for the following methods: - - add_vector_matcher(fieldname, fieldobj, vmatcher): Adds vector items to the document using a vector matcher. - - finish_doc(): Called at the end of writing a document. - - close(): Called to close the writer. - - Usage: - 1. Create a subclass of PerDocumentWriter. - 2. Implement the required methods. - 3. Use the subclass to write per-document data during the indexing process. - - Example: - ```python - class MyDocumentWriter(PerDocumentWriter): - def start_doc(self, docnum): - # Implementation goes here - - def add_field(self, fieldname, fieldobj, value, length): - # Implementation goes here - - def add_column_value(self, fieldname, columnobj, value): - # Implementation goes here - - def add_vector_items(self, fieldname, fieldobj, items): - # Implementation goes here - - writer = MyDocumentWriter() - writer.start_doc(1) - writer.add_field("title", fieldobj, "Sample Title", 1) - writer.finish_doc() - writer.close() - ``` - """ - @abstractmethod def start_doc(self, docnum): - """ - Called at the beginning of writing a new document. - - Parameters: - - docnum (int): The document number. - - Raises: - - NotImplementedError: If the method is not implemented by the subclass. - """ raise NotImplementedError @abstractmethod def add_field(self, fieldname, fieldobj, value, length): - """ - Called to add a field and its value to the document. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj: The field object. - - value: The value of the field. - - length (int): The length of the field. - - Raises: - - NotImplementedError: If the method is not implemented by the subclass. - """ raise NotImplementedError @abstractmethod def add_column_value(self, fieldname, columnobj, value): - """ - Called to add a column value to the document. - - Parameters: - - fieldname (str): The name of the field. - - columnobj: The column object. - - value: The value of the column. - - Raises: - - NotImplementedError: If the method is not implemented by the subclass. - """ raise NotImplementedError("Codec does not implement writing columns") @abstractmethod def add_vector_items(self, fieldname, fieldobj, items): - """ - Called to add vector items to the document. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj: The field object. - - items: An iterable of vector items. 
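# Illustrative sketch of the per-document calling sequence a writer drives;
# "pdw", "titlefield" and the stored value are hypothetical placeholders.
pdw.start_doc(0)
pdw.add_field("title", titlefield, "Hello world", 2)
pdw.finish_doc()
pdw.close()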
- - Raises: - - NotImplementedError: If the method is not implemented by the subclass. - """ raise NotImplementedError def add_vector_matcher(self, fieldname, fieldobj, vmatcher): - """ - Adds vector items to the document using a vector matcher. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj: The field object. - - vmatcher: The vector matcher. - - Note: - This method provides a default implementation that reads vector items from the vector matcher - and calls the add_vector_items method. - - Raises: - - NotImplementedError: If the add_vector_items method is not implemented by the subclass. - """ - def readitems(): while vmatcher.is_active(): text = vmatcher.id() @@ -505,85 +157,20 @@ def readitems(): self.add_vector_items(fieldname, fieldobj, readitems()) def finish_doc(self): - """ - Called at the end of writing a document. - - Note: - This method is intentionally left empty. - - Usage: - Subclasses can override this method to perform any necessary cleanup or finalization steps. - """ + # This method is intentionally left empty. pass def close(self): - """ - Called to close the writer. - - Note: - This method is intentionally left empty. - - Usage: - Subclasses can override this method to perform any necessary cleanup or closing steps. - """ + # This method is intentionally left empty. pass class FieldWriter: - """ - The FieldWriter class is responsible for translating a generator of postings into calls to various methods - such as start_field(), start_term(), add(), finish_term(), finish_field(), etc. It is used in the process - of writing fields and terms to an index. - - Usage: - 1. Create an instance of FieldWriter. - 2. Implement the abstract methods: start_field(), start_term(), add(), finish_term(). - 3. Optionally, implement the add_spell_word() method if you need to add spelling words. - 4. Use the add_postings() method to process a generator of postings and write them to the index. - 5. Call the close() method to perform any necessary cleanup. - - Example: - ```python - class MyFieldWriter(FieldWriter): - def start_field(self, fieldname, fieldobj): - # Implementation goes here - - def start_term(self, text): - # Implementation goes here - - def add(self, docnum, weight, vbytes, length): - # Implementation goes here - - def finish_term(self): - # Implementation goes here - - def add_spell_word(self, fieldname, text): - # Implementation goes here - - writer = MyFieldWriter() - writer.add_postings(schema, lengths, items) - writer.close() - ``` - - Note: The finish_field() method is intentionally left empty and does not need to be implemented. - """ - def add_postings(self, schema, lengths, items): - """ - Translates a generator of (fieldname, btext, docnum, w, v) postings into calls to start_field(), start_term(), - add(), finish_term(), finish_field(), etc. + # This method translates a generator of (fieldname, btext, docnum, w, v) + # postings into calls to start_field(), start_term(), add(), + # finish_term(), finish_field(), etc. - Parameters: - - schema (Schema): The schema object that defines the fields in the index. - - lengths (Lengths): The lengths object that provides the document field lengths. - - items (generator): A generator of (fieldname, btext, docnum, weight, value) postings. - - Raises: - - OutOfOrderError: If the postings are out of order. 
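# Illustrative sketch: add_postings() consumes (fieldname, btext, docnum,
# weight, value) tuples, which must arrive in sorted order; "fw", "schema" and
# "lengths" are assumed to exist, and the posting values here are made up.
items = [
    ("content", b"alfa", 0, 1.0, b""),
    ("content", b"alfa", 3, 2.0, b""),
    ("content", b"bravo", 1, 1.0, b""),
]
fw.add_postings(schema, lengths, items)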
- - Returns: - - None - """ start_field = self.start_field start_term = self.start_term add = self.add @@ -600,7 +187,7 @@ def add_postings(self, schema, lengths, items): # The bytes text of the previous posting lasttext = None # The (fieldname, btext) of the previous spelling posting - # lastspell = None + lastspell = None # The field object for the current field fieldobj = None for fieldname, btext, docnum, weight, value in items: @@ -658,491 +245,128 @@ def add_postings(self, schema, lengths, items): @abstractmethod def start_field(self, fieldname, fieldobj): - """ - This method is called when starting to process a new field during indexing or searching. - - Parameters: - - fieldname (str): The name of the field being processed. - - fieldobj: The field object representing the field being processed. - - Raises: - - NotImplementedError: This method should be implemented by subclasses. - - Notes: - - This method is typically used for initializing any necessary resources or state for processing the field. - - Subclasses should override this method to provide their own implementation. - """ raise NotImplementedError @abstractmethod def start_term(self, text): - """ - This method is called to indicate the start of a term during indexing or searching. - - Parameters: - - text (str): The text of the term. - - Raises: - - NotImplementedError: This method should be implemented by subclasses. - - """ raise NotImplementedError @abstractmethod def add(self, docnum, weight, vbytes, length): - """ - Adds a document to the codec. - - Args: - docnum (int): The document number. - weight (float): The weight of the document. - vbytes (bytes): The encoded document data. - length (int): The length of the document in bytes. - - Raises: - NotImplementedError: This method should be implemented by a subclass. - - """ raise NotImplementedError def add_spell_word(self, fieldname, text): - """ - Adds a spell word to the specified field. - - Args: - fieldname (str): The name of the field to add the spell word to. - text (str): The spell word to add. - - Raises: - NotImplementedError: This method is not implemented in the base class. - """ raise NotImplementedError @abstractmethod def finish_term(self): - """ - Finish processing the current term. - - This method is called to finalize the processing of the current term. Subclasses should implement this method - to perform any necessary cleanup or finalization steps for the term. - - Raises: - NotImplementedError: This method is meant to be overridden by subclasses. - """ raise NotImplementedError def finish_field(self): - """ - Finish processing the current field. - - This method is called after all the terms in a field have been processed. - It can be overridden in subclasses to perform any necessary finalization - steps for the field. - - Usage: - codec = BaseCodec() - codec.finish_field() - - """ # This method is intentionally left empty. pass def close(self): - """ - Closes the codec. - - This method is called when the codec needs to be closed. It should release any resources - held by the codec and perform any necessary cleanup. - - Example usage: - codec = MyCodec() - # ... do some operations with the codec ... - codec.close() - """ pass # Postings -class PostingsWriter: - """Abstract base class for writing postings lists to disk. - - This class defines the interface for writing postings lists to disk in a specific format. - Subclasses must implement the abstract methods to provide the necessary functionality. 
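# Illustrative sketch of writing a single postings list; "pw" is assumed to
# come from codec.postings_writer(dbfile), and the other names are placeholders.
pw.start_postings(fieldobj.format, terminfo)
pw.add_posting(docnum, weight, vbytes)
pw.finish_postings()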
- - Attributes: - None - - Methods: - start_postings(format_, terminfo): Start writing a new postings list. - add_posting(id_, weight, vbytes, length=None): Add a posting to the current postings list. - finish_postings(): Finish writing the current postings list. - written(): Check if this object has already written to disk. - """ +class PostingsWriter: @abstractmethod def start_postings(self, format_, terminfo): - """Start writing a new postings list. - - Args: - format_ (str): The format of the postings list. - terminfo (object): The term information associated with the postings list. - - Returns: - None - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ - raise NotImplementedError @abstractmethod def add_posting(self, id_, weight, vbytes, length=None): - """Add a posting to the current postings list. - - Args: - id_ (int): The identifier of the posting. - weight (float): The weight of the posting. - vbytes (bytes): The encoded bytes of the posting. - length (int, optional): The length of the posting. Defaults to None. - - Returns: - None - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ - raise NotImplementedError def finish_postings(self): - """Finish writing the current postings list. - - This method is intentionally left empty. - - Args: - None - - Returns: - None - - """ - + # This method is intentionally left empty. pass @abstractmethod def written(self): - """Check if this object has already written to disk. - - Args: - None - - Returns: - bool: True if this object has already written to disk, False otherwise. - - Raises: - NotImplementedError: This method must be implemented by subclasses. - - """ + """Returns True if this object has already written to disk.""" raise NotImplementedError # Reader classes -class FieldCursor: - """A cursor for navigating through a field's data. - - This class provides methods for navigating through a field's data, - such as moving to the first position, finding a specific string, - moving to the next position, and retrieving the current term. - - Usage: - cursor = FieldCursor() - cursor.first() # Move to the first position - cursor.find("example") # Find the position of the string "example" - cursor.next() # Move to the next position - term = cursor.term() # Retrieve the current term - - Note: - This class is meant to be subclassed and the methods should be - implemented according to the specific requirements of the field's - data format. - """ - def first(self): - """Move the cursor to the first position. - Raises: - NotImplementedError: This method should be implemented by - subclasses. - """ +class FieldCursor: + def first(self): raise NotImplementedError def find(self, string): - """Find the position of a specific string. - - Args: - string (str): The string to find. - - Raises: - NotImplementedError: This method should be implemented by - subclasses. - """ raise NotImplementedError def next(self): - """Move the cursor to the next position. - - Raises: - NotImplementedError: This method should be implemented by - subclasses. - """ raise NotImplementedError def term(self): - """Retrieve the current term. - - Returns: - str: The current term. - - Raises: - NotImplementedError: This method should be implemented by - subclasses. - """ raise NotImplementedError class TermsReader: - """A base class for reading terms and their associated information from an index. 
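As a sketch of how a FieldCursor implementation is meant to be driven, the helper below iterates a cursor to exhaustion. The function is hypothetical, and it assumes the concrete cursor also exposes an is_valid() check like the cursor implementations later in this file:

```python
def iter_cursor_terms(cursor):
    """Yield every term from a FieldCursor-style object, assuming it
    provides first()/next()/term() plus an is_valid() check."""
    cursor.first()
    while cursor.is_valid():
        yield cursor.term()
        cursor.next()
```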
- - This class provides methods for retrieving terms, term frequencies, document frequencies, - and creating term matchers for querying the index. - - Subclasses of `TermsReader` should implement the abstract methods to provide the necessary - functionality for reading terms from a specific index format. - - """ - @abstractmethod def __contains__(self, term): - """Check if a term exists in the index. - - Args: - term (str): The term to check. - - Returns: - bool: True if the term exists in the index, False otherwise. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def cursor(self, fieldname, fieldobj): - """Get a cursor for iterating over the terms in a field. - - Args: - fieldname (str): The name of the field. - fieldobj (object): The field object. - - Returns: - object: A cursor object for iterating over the terms in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def terms(self): - """Get a list of all terms in the index. - - Returns: - list: A list of all terms in the index. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def terms_from(self, fieldname, prefix): - """Get a list of terms starting with a given prefix in a specific field. - - Args: - fieldname (str): The name of the field. - prefix (str): The prefix to match. - - Returns: - list: A list of terms starting with the given prefix in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def items(self): - """Get a list of all (fieldname, term) pairs in the index. - - Returns: - list: A list of all (fieldname, term) pairs in the index. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def items_from(self, fieldname, prefix): - """Get a list of (fieldname, term) pairs starting with a given prefix in a specific field. - - Args: - fieldname (str): The name of the field. - prefix (str): The prefix to match. - - Returns: - list: A list of (fieldname, term) pairs starting with the given prefix in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def term_info(self, fieldname, text): - """Get the term information for a specific term in a field. - - Args: - fieldname (str): The name of the field. - text (str): The term to get information for. - - Returns: - object: The term information object for the specified term in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def frequency(self, fieldname, text): - """Get the term frequency for a specific term in a field. - - Args: - fieldname (str): The name of the field. - text (str): The term to get the frequency for. - - Returns: - int: The term frequency for the specified term in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ return self.term_info(fieldname, text).weight() @abstractmethod def doc_frequency(self, fieldname, text): - """Get the document frequency for a specific term in a field. - - Args: - fieldname (str): The name of the field. - text (str): The term to get the document frequency for. 
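Since frequency() and doc_frequency() above are thin wrappers over term_info(), a quick consistency check for any concrete TermsReader could look like the following hypothetical helper:

```python
def check_term_stats(terms_reader, fieldname, btext):
    """Assert that the derived helpers agree with term_info()."""
    ti = terms_reader.term_info(fieldname, btext)
    assert terms_reader.frequency(fieldname, btext) == ti.weight()
    assert terms_reader.doc_frequency(fieldname, btext) == ti.doc_frequency()
```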
- - Returns: - int: The document frequency for the specified term in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ return self.term_info(fieldname, text).doc_frequency() @abstractmethod def matcher(self, fieldname, text, format_, scorer=None): - """Create a term matcher for a specific term in a field. - - Args: - fieldname (str): The name of the field. - text (str): The term to create the matcher for. - format_ (object): The format object for the field. - scorer (object, optional): The scorer object to use for scoring the matches. - - Returns: - object: A term matcher for the specified term in the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError @abstractmethod def indexed_field_names(self): - """Get a list of all field names in the index. - - Returns: - list: A list of all field names in the index. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - - """ raise NotImplementedError def close(self): - """Close the terms reader. - - This method is intentionally left empty. - - """ + # This method is intentionally left empty. pass class Automata: - """ - The Automata class provides methods for working with automata used in string matching operations. - """ - @staticmethod def levenshtein_dfa(uterm, maxdist, prefix=0): - """ - Generates a deterministic finite automaton (DFA) for performing approximate string matching using the Levenshtein distance algorithm. - - Args: - uterm (str): The target term to match against. - maxdist (int): The maximum allowed edit distance between the target term and the matched terms. - prefix (int, optional): The length of the common prefix between the target term and the matched terms. Defaults to 0. - - Returns: - DFA: The generated DFA for performing approximate string matching. - """ return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa() @staticmethod def find_matches(dfa, cur): - """ - Finds all matches in a given cursor using a DFA. - - Args: - dfa (DFA): The DFA used for matching. - cur (Cursor): The cursor to search for matches. - - Yields: - str: The matched terms found in the cursor. - """ unull = chr(0) term = cur.text() @@ -1161,126 +385,43 @@ def find_matches(dfa, cur): match = dfa.next_valid_string(term) def terms_within(self, fieldcur, uterm, maxdist, prefix=0): - """ - Finds all terms within a given cursor that are within a specified edit distance of a target term. - - Args: - fieldcur (Cursor): The cursor representing the field to search within. - uterm (str): The target term to match against. - maxdist (int): The maximum allowed edit distance between the target term and the matched terms. - prefix (int, optional): The length of the common prefix between the target term and the matched terms. Defaults to 0. - - Returns: - Generator[str]: A generator that yields the matched terms found within the cursor. - """ dfa = self.levenshtein_dfa(uterm, maxdist, prefix) return self.find_matches(dfa, fieldcur) # Per-doc value reader -class PerDocumentReader: - """ - The PerDocumentReader class represents a base class for reading per-document data in a search index. - - This class provides methods for accessing and manipulating per-document data, such as deletions, columns, bitmaps, - lengths, vectors, and stored fields. - - Subclasses of PerDocumentReader should implement the abstract methods to provide the specific functionality - required for a particular codec. - - Usage: - 1. 
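The Automata helpers above can be exercised on their own. A small sketch follows; the query term and edit distance are arbitrary, and no particular output is implied:

```python
from whoosh.codec.base import Automata

# Build a DFA accepting every string within edit distance 1 of "whoosh".
dfa = Automata.levenshtein_dfa("whoosh", 1)

# next_valid_string() is what find_matches() uses to skip the field
# cursor forward to the next candidate term.
print(dfa.next_valid_string("whoos"))
```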
Create an instance of a subclass of PerDocumentReader. - 2. Use the provided methods to access and manipulate per-document data. - Example: - ``` - reader = MyPerDocumentReader() - count = reader.doc_count() - print(f"Total number of documents: {count}") - ``` - """ +class PerDocumentReader: def close(self): - """ - Closes the PerDocumentReader and releases any resources associated with it. - - This method should be called when the PerDocumentReader is no longer needed. - """ - + # This method is intentionally left empty. pass @abstractmethod def doc_count(self): - """ - Returns the number of documents in the reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def doc_count_all(self): - """ - Returns the total number of documents, including deleted documents, in the reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError # Deletions @abstractmethod def has_deletions(self): - """ - Returns True if the reader has deletions, False otherwise. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def is_deleted(self, docnum): - """ - Returns True if the document with the given docnum is deleted, False otherwise. - - Args: - docnum (int): The document number. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def deleted_docs(self): - """ - Returns a set of document numbers that are deleted. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError def all_doc_ids(self): """ Returns an iterator of all (undeleted) document IDs in the reader. - - Returns: - An iterator of document IDs. - - Example: - ``` - for doc_id in reader.all_doc_ids(): - print(doc_id) - ``` """ is_deleted = self.is_deleted @@ -1289,89 +430,28 @@ def all_doc_ids(self): ) def iter_docs(self): - """ - Returns an iterator over all (undeleted) documents in the reader. - - Yields: - Tuple[int, dict]: A tuple containing the document number and the stored fields of the document. - - Example: - ``` - for docnum, fields in reader.iter_docs(): - print(f"Document {docnum}: {fields}") - ``` - """ - for docnum in self.all_doc_ids(): yield docnum, self.stored_fields(docnum) # Columns def supports_columns(self): - """ - Returns True if the reader supports columns, False otherwise. - - Returns: - bool: True if the reader supports columns, False otherwise. - """ - return False def has_column(self, fieldname): - """ - Returns True if the reader has a column with the given fieldname, False otherwise. - - Args: - fieldname (str): The name of the column field. - - Returns: - bool: True if the reader has the column, False otherwise. - """ - _ = fieldname # Unused argument return False def list_columns(self): - """ - Returns a list of all column names in the reader. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError # Don't need to override this if supports_columns() returns False def column_reader(self, fieldname, column): - """ - Returns a reader for accessing the values in the specified column. - - Args: - fieldname (str): The name of the column field. - column (str): The name of the column. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. 
- """ - raise NotImplementedError # Bitmaps def field_docs(self, fieldname): - """ - Returns the bitmap of documents that have a value for the specified field. - - Args: - fieldname (str): The name of the field. - - Returns: - Bitmap or None: The bitmap of documents or None if the field does not exist. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - _ = fieldname # Unused argument return None @@ -1379,130 +459,44 @@ def field_docs(self, fieldname): @abstractmethod def doc_field_length(self, docnum, fieldname, default=0): - """ - Returns the length of the specified field in the specified document. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - default (int, optional): The default length to return if the field does not exist. Defaults to 0. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def field_length(self, fieldname): - """ - Returns the total length of the specified field across all documents. - - Args: - fieldname (str): The name of the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def min_field_length(self, fieldname): - """ - Returns the minimum length of the specified field across all documents. - - Args: - fieldname (str): The name of the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError @abstractmethod def max_field_length(self, fieldname): - """ - Returns the maximum length of the specified field across all documents. - - Args: - fieldname (str): The name of the field. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError # Vectors def has_vector(self, docnum, fieldname): - """ - Returns True if the specified document has a vector for the specified field, False otherwise. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - - Returns: - bool: True if the document has a vector, False otherwise. - """ - _ = docnum, fieldname # Unused arguments return False # Don't need to override this if has_vector() always returns False def vector(self, docnum, fieldname, format_): - """ - Returns the vector for the specified document and field. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - format_ (str): The format of the vector. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError # Stored @abstractmethod def stored_fields(self, docnum): - """ - Returns the stored fields of the specified document. - - Args: - docnum (int): The document number. - - Raises: - NotImplementedError: If the method is not implemented by the subclass. - """ - raise NotImplementedError def all_stored_fields(self): - """ - Returns an iterator over the stored fields of all (undeleted) documents in the reader. - - Yields: - dict: The stored fields of a document. - - Example: - ``` - for fields in reader.all_stored_fields(): - print(fields) - ``` - """ - for docnum in self.all_doc_ids(): yield self.stored_fields(docnum) # Segment base class + + class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. 
A list of objects of this class are @@ -1524,84 +518,37 @@ class Segment: # self.segid def __init__(self, indexname): - """ - Initializes a Segment object. - - :param indexname: The name of the index. - """ self.indexname = indexname self.segid = self._random_id() self.compound = False @classmethod def _random_id(cls, size=16): - """ - Generates a random ID for the segment. - - :param size: The size of the random ID. Default is 16. - :return: The random ID. - """ return random_name(size=size) def __repr__(self): - """ - Returns a string representation of the Segment object. - - :return: The string representation. - """ return f"<{self.__class__.__name__} {self.segment_id()}>" def __eq__(self, other): - """ - Checks if two Segment objects are equal. - - :param other: The other Segment object to compare. - :return: True if the objects are equal, False otherwise. - """ return isinstance(other, type(self)) and self.segment_id() == other.segment_id() def __hash__(self): - """ - Returns the hash value of the Segment object. - - :return: The hash value. - """ return hash(self.segment_id()) def codec(self): - """ - Returns the codec used by the segment. - - :return: The codec used by the segment. - """ raise NotImplementedError def index_name(self): - """ - Returns the name of the index. - - :return: The name of the index. - """ return self.indexname def segment_id(self): - """ - Returns the ID of the segment. - - :return: The ID of the segment. - """ if hasattr(self, "name"): # Old segment class return self.name else: return f"{self.index_name()}_{self.segid}" - def is_compound(self): - """ - Checks if the segment is a compound segment. - - :return: True if the segment is compound, False otherwise. - """ + def is_compound(self): if not hasattr(self, "compound"): return False return self.compound @@ -1609,54 +556,31 @@ def is_compound(self): # File convenience methods def make_filename(self, ext): - """ - Creates a filename for the segment with the given extension. - - :param ext: The extension of the filename. - :return: The filename. - """ return f"{self.segment_id()}{ext}" def list_files(self, storage): - """ - Lists the files associated with the segment in the given storage. - - :param storage: The storage object. - :return: A list of file names. - """ prefix = f"{self.segment_id()}." return [name for name in storage.list() if name.startswith(prefix)] def create_file(self, storage, ext, **kwargs): + """Convenience method to create a new file in the given storage named + with this segment's ID and the given extension. Any keyword arguments + are passed to the storage's create_file method. """ - Creates a new file in the given storage with the segment's ID and the given extension. - :param storage: The storage object. - :param ext: The extension of the file. - :param kwargs: Additional keyword arguments passed to the storage's create_file method. - :return: The created file object. - """ fname = self.make_filename(ext) return storage.create_file(fname, **kwargs) def open_file(self, storage, ext, **kwargs): + """Convenience method to open a file in the given storage named with + this segment's ID and the given extension. Any keyword arguments are + passed to the storage's open_file method. """ - Opens a file in the given storage with the segment's ID and the given extension. - :param storage: The storage object. - :param ext: The extension of the file. - :param kwargs: Additional keyword arguments passed to the storage's open_file method. - :return: The opened file object. 
- """ fname = self.make_filename(ext) return storage.open_file(fname, **kwargs) def create_compound_file(self, storage): - """ - Creates a compound file in the given storage by combining the segment's files. - - :param storage: The storage object. - """ segfiles = self.list_files(storage) assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles) cfile = self.create_file(storage, self.COMPOUND_EXT) @@ -1666,12 +590,6 @@ def create_compound_file(self, storage): self.compound = True def open_compound_file(self, storage): - """ - Opens the compound file associated with the segment in the given storage. - - :param storage: The storage object. - :return: The opened compound file object. - """ name = self.make_filename(self.COMPOUND_EXT) dbfile = storage.open_file(name) return CompoundStorage(dbfile, use_mmap=storage.supports_mmap) @@ -1684,28 +602,24 @@ def doc_count_all(self): Returns the total number of documents, DELETED OR UNDELETED, in this segment. """ + raise NotImplementedError def doc_count(self): """ Returns the number of (undeleted) documents in this segment. """ + return self.doc_count_all() - self.deleted_count() def set_doc_count(self, doccount): - """ - Sets the number of documents in the segment. - - :param doccount: The number of documents. - """ raise NotImplementedError def has_deletions(self): """ - Checks if any documents in this segment are deleted. - - :return: True if there are deleted documents, False otherwise. + Returns True if any documents in this segment are deleted. """ + return self.deleted_count() > 0 @abstractmethod @@ -1713,308 +627,106 @@ def deleted_count(self): """ Returns the total number of deleted documents in this segment. """ + raise NotImplementedError @abstractmethod def deleted_docs(self): - """ - Returns a list of deleted document numbers in this segment. - """ raise NotImplementedError @abstractmethod def delete_document(self, docnum, delete=True): - """ - Deletes or undeletes the given document number. + """Deletes the given document number. The document is not actually + removed from the index until it is optimized. - :param docnum: The document number to delete or undelete. - :param delete: If False, undeletes the document. Default is True. + :param docnum: The document number to delete. + :param delete: If False, this undeletes a deleted document. """ + raise NotImplementedError @abstractmethod def is_deleted(self, docnum): """ - Checks if the given document number is deleted. - - :param docnum: The document number. - :return: True if the document is deleted, False otherwise. + Returns True if the given document number is deleted. """ + raise NotImplementedError def should_assemble(self): - """ - Checks if the segment should be assembled. - - :return: True if the segment should be assembled, False otherwise. - """ return True # Wrapping Segment -class WrappingSegment(Segment): - """ - A segment that wraps another segment. - - This class serves as a wrapper around another segment, providing a way to modify or extend its behavior. - Args: - child (Segment): The segment to be wrapped. - - """ +class WrappingSegment(Segment): def __init__(self, child): self._child = child def codec(self): - """ - Get the codec used by the wrapped segment. - - Returns: - Codec: The codec used by the wrapped segment. - - """ return self._child.codec() def index_name(self): - """ - Get the name of the index associated with the wrapped segment. - - Returns: - str: The name of the index associated with the wrapped segment. 
- - """ return self._child.index_name() def segment_id(self): - """ - Get the unique identifier of the wrapped segment. - - Returns: - str: The unique identifier of the wrapped segment. - - """ return self._child.segment_id() def is_compound(self): - """ - Check if the wrapped segment is a compound segment. - - Returns: - bool: True if the wrapped segment is a compound segment, False otherwise. - - """ return self._child.is_compound() def should_assemble(self): - """ - Check if the wrapped segment should be assembled. - - Returns: - bool: True if the wrapped segment should be assembled, False otherwise. - - """ return self._child.should_assemble() def make_filename(self, ext): - """ - Generate a filename for the wrapped segment with the given extension. - - Args: - ext (str): The file extension. - - Returns: - str: The generated filename for the wrapped segment. - - """ return self._child.make_filename(ext) def list_files(self, storage): - """ - List all files associated with the wrapped segment in the given storage. - - Args: - storage: The storage object. - - Returns: - list: A list of filenames associated with the wrapped segment. - - """ return self._child.list_files(storage) def create_file(self, storage, ext, **kwargs): - """ - Create a new file for the wrapped segment with the given extension. - - Args: - storage: The storage object. - ext (str): The file extension. - **kwargs: Additional keyword arguments. - - Returns: - File: The created file object. - - """ return self._child.create_file(storage, ext, **kwargs) def open_file(self, storage, ext, **kwargs): - """ - Open an existing file for the wrapped segment with the given extension. - - Args: - storage: The storage object. - ext (str): The file extension. - **kwargs: Additional keyword arguments. - - Returns: - File: The opened file object. - - """ return self._child.open_file(storage, ext, **kwargs) def create_compound_file(self, storage): - """ - Create a compound file for the wrapped segment in the given storage. - - Args: - storage: The storage object. - - Returns: - CompoundFile: The created compound file object. - - """ return self._child.create_compound_file(storage) def open_compound_file(self, storage): - """ - Open a compound file for the wrapped segment in the given storage. - - Args: - storage: The storage object. - - Returns: - CompoundFile: The opened compound file object. - - """ return self._child.open_compound_file(storage) def delete_document(self, docnum, delete=True): - """ - Delete a document from the wrapped segment. - - Args: - docnum (int): The document number. - delete (bool): Whether to mark the document as deleted or not. Default is True. - - Returns: - bool: True if the document was successfully deleted, False otherwise. - - """ return self._child.delete_document(docnum, delete=delete) def has_deletions(self): - """ - Check if the wrapped segment has any deleted documents. - - Returns: - bool: True if the wrapped segment has deleted documents, False otherwise. - - """ return self._child.has_deletions() def deleted_count(self): - """ - Get the number of deleted documents in the wrapped segment. - - Returns: - int: The number of deleted documents. - - """ return self._child.deleted_count() def deleted_docs(self): - """ - Get a list of deleted document numbers in the wrapped segment. - - Returns: - list: A list of deleted document numbers. - - """ return self._child.deleted_docs() def is_deleted(self, docnum): - """ - Check if a document with the given number is deleted in the wrapped segment. 
- - Args: - docnum (int): The document number. - - Returns: - bool: True if the document is deleted, False otherwise. - - """ return self._child.is_deleted(docnum) def set_doc_count(self, doccount): - """ - Set the total number of documents in the wrapped segment. - - Args: - doccount (int): The total number of documents. - - """ self._child.set_doc_count(doccount) def doc_count(self): - """ - Get the total number of documents in the wrapped segment. - - Returns: - int: The total number of documents. - - """ return self._child.doc_count() def doc_count_all(self): - """ - Get the total number of documents, including deleted ones, in the wrapped segment. - - Returns: - int: The total number of documents. - - """ return self._child.doc_count_all() # Multi per doc reader -class MultiPerDocumentReader(PerDocumentReader): - """ - A reader that combines multiple per-document readers into a single reader. - - This class is used to read documents from multiple per-document readers and present them as a single reader. - It provides methods to access document counts, check for deletions, access columns, and retrieve field lengths. - - Parameters: - - readers (list): A list of per-document readers to be combined. - - offset (int): The offset to be applied to the document numbers of each reader. - - Attributes: - - _readers (list): The list of per-document readers. - - _doc_offsets (list): The list of document offsets for each reader. - - _doccount (int): The total number of documents across all readers. - - is_closed (bool): Indicates whether the reader is closed. - """ +class MultiPerDocumentReader(PerDocumentReader): def __init__(self, readers, offset=0): - """ - Initializes a MultiPerDocumentReader instance. - - Parameters: - - readers (list): A list of per-document readers to be combined. - - offset (int): The offset to be applied to the document numbers of each reader. - - """ self._readers = readers self._doc_offsets = [] @@ -2026,141 +738,52 @@ def __init__(self, readers, offset=0): self.is_closed = False def close(self): - """ - Closes the reader and releases any resources. - - """ for r in self._readers: r.close() self.is_closed = True def doc_count_all(self): - """ - Returns the total number of documents across all readers. - - Returns: - - int: The total number of documents. - - """ return self._doccount def doc_count(self): - """ - Returns the number of non-deleted documents across all readers. - - Returns: - - int: The number of non-deleted documents. - - """ total = 0 for r in self._readers: total += r.doc_count() return total def _document_reader(self, docnum): - """ - Returns the index of the reader that contains the specified document number. - - Parameters: - - docnum (int): The document number. - - Returns: - - int: The index of the reader. - - """ return max(0, bisect_right(self._doc_offsets, docnum) - 1) def _reader_and_docnum(self, docnum): - """ - Returns the reader index and the document number within the reader for the specified document number. - - Parameters: - - docnum (int): The document number. - - Returns: - - tuple: A tuple containing the reader index and the document number within the reader. - - """ rnum = self._document_reader(docnum) offset = self._doc_offsets[rnum] return rnum, docnum - offset - def has_deletions(self): - """ - Checks if any of the readers have deletions. - - Returns: - - bool: True if any of the readers have deletions, False otherwise. 
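The document-number mapping used by _document_reader() and _reader_and_docnum() is plain offset arithmetic over the per-reader offsets collected in __init__. A self-contained sketch with made-up offsets:

```python
from bisect import bisect_right

doc_offsets = [0, 10, 25]        # three sub-readers holding 10, 15, ... docs
docnum = 12

rnum = max(0, bisect_right(doc_offsets, docnum) - 1)   # reader index 1
local_docnum = docnum - doc_offsets[rnum]              # docnum 2 inside it
print(rnum, local_docnum)
```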
+ # Deletions - """ + def has_deletions(self): return any(r.has_deletions() for r in self._readers) def is_deleted(self, docnum): - """ - Checks if the specified document number is deleted. - - Parameters: - - docnum (int): The document number. - - Returns: - - bool: True if the document is deleted, False otherwise. - - """ x, y = self._reader_and_docnum(docnum) return self._readers[x].is_deleted(y) def deleted_docs(self): - """ - Yields the document numbers of all deleted documents across all readers. - - Yields: - - int: The document number of a deleted document. - - """ for r, offset in zip(self._readers, self._doc_offsets): for docnum in r.deleted_docs(): yield docnum + offset def all_doc_ids(self): - """ - Yields all document numbers across all readers. - - Yields: - - int: The document number. - - """ for r, offset in zip(self._readers, self._doc_offsets): for docnum in r.all_doc_ids(): yield docnum + offset - def has_column(self, fieldname): - """ - Checks if any of the readers have the specified column. - - Parameters: - - fieldname (str): The name of the column. - - Returns: - - bool: True if any of the readers have the column, False otherwise. + # Columns - """ + def has_column(self, fieldname): return any(r.has_column(fieldname) for r in self._readers) def column_reader(self, fieldname, column): - """ - Returns a column reader for the specified fieldname and column. - - Parameters: - - fieldname (str): The name of the field. - - column (Column): The column object. - - Returns: - - ColumnReader: The column reader. - - Raises: - - ValueError: If none of the readers have the specified column. - - """ if not self.has_column(fieldname): raise ValueError(f"No column {fieldname!r}") @@ -2178,213 +801,72 @@ def column_reader(self, fieldname, column): else: return columns.MultiColumnReader(colreaders) - def doc_field_length(self, docnum, fieldname, default=0): - """ - Returns the length of the specified field in the specified document. - - Parameters: - - docnum (int): The document number. - - fieldname (str): The name of the field. - - default (int): The default value to return if the field is not found. - - Returns: - - int: The length of the field in the document. + # Lengths - """ + def doc_field_length(self, docnum, fieldname, default=0): x, y = self._reader_and_docnum(docnum) return self._readers[x].doc_field_length(y, fieldname, default) def field_length(self, fieldname): - """ - Returns the total length of the specified field across all readers. - - Parameters: - - fieldname (str): The name of the field. - - Returns: - - int: The total length of the field. - - """ total = 0 for r in self._readers: total += r.field_length(fieldname) return total def min_field_length(self): - """ - Returns the minimum field length across all readers. - - Returns: - - int: The minimum field length. - - """ return min(r.min_field_length() for r in self._readers) def max_field_length(self): - """ - Returns the maximum field length across all readers. - - Returns: - - int: The maximum field length. - - """ return max(r.max_field_length() for r in self._readers) # Extended base classes -class PerDocWriterWithColumns(PerDocumentWriter): - """ - A subclass of PerDocumentWriter that supports columns for storing additional data per document. - - This class provides methods for adding and retrieving column values for a given fieldname. - - Attributes: - _storage (object): The storage object used for storing the column data. - _segment (object): The segment object representing the current segment. 
- _docnum (int): The document number. - - Methods: - _has_column(fieldname): Checks if a column with the given fieldname exists. - _create_column(fieldname, column): Creates a new column with the given fieldname and column object. - _get_column(fieldname): Retrieves the column object for the given fieldname. - add_column_value(fieldname, column, value): Adds a value to the column for the given fieldname. - """ +class PerDocWriterWithColumns(PerDocumentWriter): def __init__(self): PerDocumentWriter.__init__(self) + # Implementations need to set these attributes self._storage = None self._segment = None self._docnum = None @abstractmethod def _has_column(self, fieldname): - """ - Checks if a column with the given fieldname exists. - - Args: - fieldname (str): The name of the field. - - Returns: - bool: True if the column exists, False otherwise. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - """ raise NotImplementedError @abstractmethod def _create_column(self, fieldname, column): - """ - Creates a new column with the given fieldname and column object. - - Args: - fieldname (str): The name of the field. - column (object): The column object. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - """ raise NotImplementedError @abstractmethod def _get_column(self, fieldname): - """ - Retrieves the column object for the given fieldname. - - Args: - fieldname (str): The name of the field. - - Returns: - object: The column object. - - Raises: - NotImplementedError: This method should be implemented by subclasses. - """ raise NotImplementedError def add_column_value(self, fieldname, column, value): - """ - Adds a value to the column for the given fieldname. - - If the column does not exist, it will be created. - - Args: - fieldname (str): The name of the field. - column (object): The column object. - value (object): The value to be added to the column. - """ if not self._has_column(fieldname): self._create_column(fieldname, column) self._get_column(fieldname).add(self._docnum, value) # FieldCursor implementations -class EmptyCursor(FieldCursor): - """A cursor implementation that represents an empty cursor. - - This cursor is used when there are no matching terms in the index. - It provides methods to navigate through the non-existent terms and - retrieve information about them. - Note: This class is intended for internal use within the Whoosh library - and should not be instantiated directly by users. - - """ +class EmptyCursor(FieldCursor): def first(self): - """Move the cursor to the first term. - - Returns: - None: Always returns None as there are no terms to move to. - - """ return None def find(self, term): - """Find a specific term in the index. - - Args: - term (str): The term to find. - - Returns: - None: Always returns None as the term does not exist. - - """ return None def next(self): - """Move the cursor to the next term. - - Returns: - None: Always returns None as there are no terms to move to. - - """ return None def text(self): - """Get the text of the current term. - - Returns: - None: Always returns None as there are no terms. - - """ return None def term_info(self): - """Get information about the current term. - - Returns: - None: Always returns None as there are no terms. - - """ return None def is_valid(self): - """Check if the cursor is valid. - - Returns: - bool: Always returns False as the cursor is not valid. 
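EmptyCursor is the degenerate case of the cursor protocol: every probe reports that there is nothing to iterate, as this small illustrative check shows:

```python
from whoosh.codec.base import EmptyCursor

cur = EmptyCursor()
print(cur.is_valid())   # False: there is never a current term
print(cur.first())      # None
print(cur.next())       # None
```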
- - """ return False diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py index fbdc7828..de2f5cf5 100644 --- a/src/whoosh/codec/memory.py +++ b/src/whoosh/codec/memory.py @@ -36,217 +36,42 @@ class MemWriter(SegmentWriter): - """ - A class for writing segments to memory. - - This class extends the `SegmentWriter` class and provides functionality - for writing segments to memory instead of a file. - - Usage: - writer = MemWriter() - writer.commit() - - Args: - mergetype (str, optional): The type of merge to perform during commit. - Defaults to None. - optimize (bool, optional): Whether to optimize the index during commit. - Defaults to False. - merge (bool, optional): Whether to perform a merge during commit. - Defaults to True. - """ - def commit(self, mergetype=None, optimize=False, merge=True): - """ - Commits the changes made to the segment. - - This method finalizes the segment and performs any necessary - operations, such as merging and optimization. - - Args: - mergetype (str, optional): The type of merge to perform during commit. - Defaults to None. - optimize (bool, optional): Whether to optimize the index during commit. - Defaults to False. - merge (bool, optional): Whether to perform a merge during commit. - Defaults to True. - """ self._finalize_segment() class MemoryCodec(base.Codec): - """ - Codec implementation for in-memory storage. - - This codec provides an in-memory storage solution for the Whoosh library. - It uses a RamStorage object to store the index data. - - Usage: - codec = MemoryCodec() - writer = codec.writer(schema) - reader = codec.reader(schema) - per_doc_writer = codec.per_document_writer(storage, segment) - field_writer = codec.field_writer(storage, segment) - per_doc_reader = codec.per_document_reader(storage, segment) - terms_reader = codec.terms_reader(storage, segment) - new_segment = codec.new_segment(storage, indexname) - """ - def __init__(self): - """ - Initializes a MemoryCodec object. - - This method creates a RamStorage object to be used as the storage for the index data. - It also initializes a MemSegment object. - - Parameters: - None - - Returns: - None - """ from whoosh.filedb.filestore import RamStorage self.storage = RamStorage() self.segment = MemSegment(self, "blah") def writer(self, schema): - """ - Creates a writer object for the index. - - This method creates a MemWriter object for the given schema and returns it. - - Parameters: - - schema (whoosh.fields.Schema): The schema for the index. - - Returns: - - writer (MemWriter): The writer object for the index. - """ ix = self.storage.create_index(schema) return MemWriter(ix, _lk=False, codec=self, docbase=self.segment._doccount) def reader(self, schema): - """ - Creates a reader object for the index. - - This method creates a SegmentReader object for the given schema and returns it. - - Parameters: - - schema (whoosh.fields.Schema): The schema for the index. - - Returns: - - reader (SegmentReader): The reader object for the index. - """ return SegmentReader(self.storage, schema, self.segment, codec=self) def per_document_writer(self, storage, segment): - """ - Creates a per-document writer object. - - This method creates a MemPerDocWriter object for the given storage and segment and returns it. - - Parameters: - - storage (RamStorage): The storage object for the index. - - segment (MemSegment): The segment object for the index. - - Returns: - - per_doc_writer (MemPerDocWriter): The per-document writer object. 
- """ return MemPerDocWriter(self.storage, self.segment) def field_writer(self, storage, segment): - """ - Creates a field writer object. - - This method creates a MemFieldWriter object for the given storage and segment and returns it. - - Parameters: - - storage (RamStorage): The storage object for the index. - - segment (MemSegment): The segment object for the index. - - Returns: - - field_writer (MemFieldWriter): The field writer object. - """ return MemFieldWriter(self.storage, self.segment) def per_document_reader(self, storage, segment): - """ - Creates a per-document reader object. - - This method creates a MemPerDocReader object for the given storage and segment and returns it. - - Parameters: - - storage (RamStorage): The storage object for the index. - - segment (MemSegment): The segment object for the index. - - Returns: - - per_doc_reader (MemPerDocReader): The per-document reader object. - """ return MemPerDocReader(self.storage, self.segment) def terms_reader(self, storage, segment): - """ - Creates a terms reader object. - - This method creates a MemTermsReader object for the given storage and segment and returns it. - - Parameters: - - storage (RamStorage): The storage object for the index. - - segment (MemSegment): The segment object for the index. - - Returns: - - terms_reader (MemTermsReader): The terms reader object. - """ return MemTermsReader(self.storage, self.segment) def new_segment(self, storage, indexname): - """ - Creates a new segment object. - - This method returns the existing segment object. - - Parameters: - - storage (RamStorage): The storage object for the index. - - indexname (str): The name of the index. - - Returns: - - segment (MemSegment): The segment object. - """ return self.segment class MemPerDocWriter(base.PerDocWriterWithColumns): - """ - A class that writes per-document data to memory. - - This class is responsible for writing per-document data, such as stored fields, field lengths, and vectors, - to memory. It is used by the `MemoryCodec` to store document data in memory. - - Attributes: - _storage (Storage): The storage object used to create files for storing column data. - _segment (Segment): The segment object to which the per-document data is written. - is_closed (bool): Indicates whether the writer has been closed. - _colwriters (dict): A dictionary that maps field names to column writers. - _doccount (int): The total number of documents written. - - Methods: - _has_column(fieldname): Checks if a column with the given field name exists. - _create_column(fieldname, column): Creates a new column for the given field name. - _get_column(fieldname): Retrieves the column writer for the given field name. - start_doc(docnum): Starts writing data for a new document. - add_field(fieldname, fieldobj, value, length): Adds a field value and length to the current document. - add_vector_items(fieldname, fieldobj, items): Adds vector items to the current document. - finish_doc(): Finishes writing data for the current document. - close(): Closes the writer and finishes writing any remaining data. - """ - def __init__(self, storage, segment): - """ - Initializes a new instance of the MemPerDocWriter class. - - Args: - storage (Storage): The storage object used to create files for storing column data. - segment (Segment): The segment object to which the per-document data is written. 
- """ self._storage = storage self._segment = segment self.is_closed = False @@ -254,47 +79,16 @@ def __init__(self, storage, segment): self._doccount = 0 def _has_column(self, fieldname): - """ - Checks if a column with the given field name exists. - - Args: - fieldname (str): The name of the field. - - Returns: - bool: True if the column exists, False otherwise. - """ return fieldname in self._colwriters def _create_column(self, fieldname, column): - """ - Creates a new column for the given field name. - - Args: - fieldname (str): The name of the field. - column (Column): The column object used to write data to the column file. - """ colfile = self._storage.create_file(f"{fieldname}.c") self._colwriters[fieldname] = (colfile, column.writer(colfile)) def _get_column(self, fieldname): - """ - Retrieves the column writer for the given field name. - - Args: - fieldname (str): The name of the field. - - Returns: - ColumnWriter: The column writer object. - """ return self._colwriters[fieldname][1] def start_doc(self, docnum): - """ - Starts writing data for a new document. - - Args: - docnum (int): The document number. - """ self._doccount += 1 self._docnum = docnum self._stored = {} @@ -302,35 +96,15 @@ def start_doc(self, docnum): self._vectors = {} def add_field(self, fieldname, fieldobj, value, length): - """ - Adds a field value and length to the current document. - - Args: - fieldname (str): The name of the field. - fieldobj (Field): The field object. - value: The field value. - length: The field length. - """ if value is not None: self._stored[fieldname] = value if length is not None: self._lengths[fieldname] = length def add_vector_items(self, fieldname, fieldobj, items): - """ - Adds vector items to the current document. - - Args: - fieldname (str): The name of the field. - fieldobj (Field): The field object. - items (list): The vector items. - """ self._vectors[fieldname] = tuple(items) def finish_doc(self): - """ - Finishes writing data for the current document. - """ with self._segment._lock: docnum = self._docnum self._segment._stored[docnum] = self._stored @@ -338,9 +112,6 @@ def finish_doc(self): self._segment._vectors[docnum] = self._vectors def close(self): - """ - Closes the writer and finishes writing any remaining data. - """ colwriters = self._colwriters for fieldname in colwriters: colfile, colwriter = colwriters[fieldname] @@ -350,167 +121,45 @@ def close(self): class MemPerDocReader(base.PerDocumentReader): - """ - A class that provides read access to per-document data stored in memory. - - This class is responsible for reading per-document data from a memory storage - and a specific segment. It provides methods to retrieve information about the - documents, columns, field lengths, vectors, and stored fields. - - Usage: - 1. Create an instance of MemPerDocReader by passing the storage and segment. - 2. Use the various methods to access the desired information. - - Example: - ``` - storage = MemoryStorage() - segment = MemorySegment() - reader = MemPerDocReader(storage, segment) - doc_count = reader.doc_count() - has_deletions = reader.has_deletions() - stored_fields = reader.stored_fields(0) - reader.close() - ``` - - Note: - - The storage object should implement the necessary methods for file operations. - - The segment object should provide access to the per-document data. - - """ - def __init__(self, storage, segment): - """ - Initialize a MemPerDocReader instance. - - Args: - - storage: The storage object that provides file operations. 
- - segment: The segment object that provides access to the per-document data. - """ self._storage = storage self._segment = segment def doc_count(self): - """ - Get the number of documents in the segment. - - Returns: - - The number of documents in the segment. - """ return self._segment.doc_count() def doc_count_all(self): - """ - Get the total number of documents, including deleted documents. - - Returns: - - The total number of documents. - """ return self._segment.doc_count_all() def has_deletions(self): - """ - Check if the segment has deleted documents. - - Returns: - - True if the segment has deleted documents, False otherwise. - """ return self._segment.has_deletions() def is_deleted(self, docnum): - """ - Check if a document is deleted. - - Args: - - docnum: The document number. - - Returns: - - True if the document is deleted, False otherwise. - """ return self._segment.is_deleted(docnum) def deleted_docs(self): - """ - Get the set of deleted document numbers. - - Returns: - - A set containing the numbers of deleted documents. - """ return self._segment.deleted_docs() def supports_columns(self): - """ - Check if the segment supports columns. - - Returns: - - True if the segment supports columns, False otherwise. - """ return True def has_column(self, fieldname): - """ - Check if a column exists for a given field. - - Args: - - fieldname: The name of the field. - - Returns: - - True if the column exists, False otherwise. - """ filename = f"{fieldname}.c" return self._storage.file_exists(filename) def column_reader(self, fieldname, column): - """ - Get a reader for a specific column of a field. - - Args: - - fieldname: The name of the field. - - column: The column object. - - Returns: - - A reader for the column. - """ filename = f"{fieldname}.c" colfile = self._storage.open_file(filename) length = self._storage.file_length(filename) return column.reader(colfile, 0, length, self._segment.doc_count_all()) def doc_field_length(self, docnum, fieldname, default=0): - """ - Get the length of a field in a specific document. - - Args: - - docnum: The document number. - - fieldname: The name of the field. - - default: The default value to return if the field is not found. - - Returns: - - The length of the field in the document, or the default value if not found. - """ return self._segment._lengths[docnum].get(fieldname, default) def field_length(self, fieldname): - """ - Get the total length of a field across all documents. - - Args: - - fieldname: The name of the field. - - Returns: - - The total length of the field. - """ return sum(lens.get(fieldname, 0) for lens in self._segment._lengths.values()) def min_field_length(self, fieldname): - """ - Get the minimum length of a field across all documents. - - Args: - - fieldname: The name of the field. - - Returns: - - The minimum length of the field. - """ return min( lens[fieldname] for lens in self._segment._lengths.values() @@ -518,15 +167,6 @@ def min_field_length(self, fieldname): ) def max_field_length(self, fieldname): - """ - Get the maximum length of a field across all documents. - - Args: - - fieldname: The name of the field. - - Returns: - - The maximum length of the field. - """ return max( lens[fieldname] for lens in self._segment._lengths.values() @@ -534,102 +174,25 @@ def max_field_length(self, fieldname): ) def has_vector(self, docnum, fieldname): - """ - Check if a document has a vector for a given field. - - Args: - - docnum: The document number. - - fieldname: The name of the field. 
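The column support above hinges on a simple naming convention: each column lives in a per-field file named "<fieldname>.c" in the codec's RamStorage. A small illustrative check against an empty in-memory codec:

```python
from whoosh.codec.memory import MemoryCodec

codec = MemoryCodec()
reader = codec.per_document_reader(codec.storage, codec.segment)

print(reader.supports_columns())    # True
print(reader.has_column("title"))   # False until a "title.c" file exists
```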
- - Returns: - - True if the document has a vector for the field, False otherwise. - """ return ( docnum in self._segment._vectors and fieldname in self._segment._vectors[docnum] ) def vector(self, docnum, fieldname, format_): - """ - Get a vector for a specific document and field. - - Args: - - docnum: The document number. - - fieldname: The name of the field. - - format_: The format of the vector. - - Returns: - - A ListMatcher object representing the vector. - """ items = self._segment._vectors[docnum][fieldname] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_) def stored_fields(self, docnum): - """ - Get the stored fields of a specific document. - - Args: - - docnum: The document number. - - Returns: - - A dictionary containing the stored fields of the document. - """ return self._segment._stored[docnum] def close(self): - """ - Close the MemPerDocReader. - - This method is intentionally left empty. - """ + # This method is intentionally left empty. pass class MemFieldWriter(base.FieldWriter): - """ - The MemFieldWriter class is responsible for writing field data to memory. - - It provides methods for starting and finishing fields, terms, and adding data to the field. - - Attributes: - - _storage: The storage object used for storing the field data. - - _segment: The segment object representing the segment being written to. - - _fieldname: The name of the current field being written. - - _btext: The binary representation of the current term being written. - - is_closed: A flag indicating whether the writer has been closed. - - Methods: - - start_field(fieldname, fieldobj): Starts a new field. - - start_term(btext): Starts a new term within the current field. - - add(docnum, weight, vbytes, length): Adds data to the current term. - - finish_term(): Finishes the current term. - - finish_field(): Finishes the current field. - - close(): Closes the writer. - - Usage: - 1. Create an instance of MemFieldWriter with the storage and segment objects. - 2. Call start_field() to start a new field. - 3. Call start_term() to start a new term within the field. - 4. Call add() to add data to the term. - 5. Call finish_term() to finish the term. - 6. Repeat steps 3-5 for additional terms within the field. - 7. Call finish_field() to finish the field. - 8. Repeat steps 2-7 for additional fields. - 9. Call close() to close the writer. - - Example: - storage = ... - segment = ... - writer = MemFieldWriter(storage, segment) - writer.start_field("title", fieldobj) - writer.start_term(b"hello") - writer.add(1, 0.5, 10, 5) - writer.finish_term() - writer.finish_field() - writer.close() - """ - def __init__(self, storage, segment): self._storage = storage self._segment = segment @@ -638,18 +201,10 @@ def __init__(self, storage, segment): self.is_closed = False def start_field(self, fieldname, fieldobj): - """ - Starts a new field. - - Args: - - fieldname: The name of the field. - - fieldobj: The field object representing the field. - - Raises: - - ValueError: If start_field is called within a field. - """ if self._fieldname is not None: - raise ValueError("Called start_field in a field") + raise ValueError( + "Called start_field in a field" + ) # Replaced generic Exception with ValueError with self._segment._lock: invindex = self._segment._invindex @@ -660,15 +215,6 @@ def start_field(self, fieldname, fieldobj): self._fieldobj = fieldobj def start_term(self, btext): - """ - Starts a new term within the current field. 
- - Args: - - btext: The binary representation of the term. - - Raises: - - ValueError: If start_term is called within a term. - """ if self._btext is not None: raise ValueError("Called start_term in a term") fieldname = self._fieldname @@ -687,31 +233,10 @@ def start_term(self, btext): self._btext = btext def add(self, docnum, weight, vbytes, length): - """ - Adds data to the current term. - - Args: - - docnum: The document number. - - weight: The weight of the term in the document. - - vbytes: The number of bytes used to store the term's value. - - length: The length of the term. - - Raises: - - ValueError: If add is called outside a term. - """ - if self._btext is None: - raise ValueError("Called add outside a term") - self._postings.append((docnum, weight, vbytes)) self._terminfo.add_posting(docnum, weight, length) def finish_term(self): - """ - Finishes the current term. - - Raises: - - ValueError: If finish_term is called outside a term. - """ if self._btext is None: raise ValueError("Called finish_term outside a term") @@ -720,87 +245,30 @@ def finish_term(self): self._terminfo = None def finish_field(self): - """ - Finishes the current field. - - Raises: - - ValueError: If finish_field is called outside a field. - """ if self._fieldname is None: raise ValueError("Called finish_field outside a field") self._fieldname = None self._fieldobj = None def close(self): - """ - Closes the writer. - """ self.is_closed = True class MemTermsReader(base.TermsReader): - """ - A terms reader implementation for in-memory storage. - - This class provides methods to access and retrieve terms, term information, - and matchers from an in-memory index segment. - - Args: - storage (object): The storage object used for the index. - segment (object): The index segment object. - - Attributes: - _storage (object): The storage object used for the index. - _segment (object): The index segment object. - _invindex (dict): The inverted index of the segment. - - """ - def __init__(self, storage, segment): self._storage = storage self._segment = segment self._invindex = segment._invindex def __contains__(self, term): - """ - Check if a term exists in the segment. - - Args: - term (str): The term to check. - - Returns: - bool: True if the term exists, False otherwise. - - """ return term in self._segment._terminfos def terms(self): - """ - Get an iterator over all terms in the segment. - - Yields: - tuple: A tuple containing the field name and term. - - """ for fieldname in self._invindex: for btext in self._invindex[fieldname]: yield (fieldname, btext) def terms_from(self, fieldname, prefix): - """ - Get an iterator over terms starting with a given prefix in a specific field. - - Args: - fieldname (str): The field name. - prefix (str): The prefix to match. - - Yields: - tuple: A tuple containing the field name and term. - - Raises: - TermNotFound: If the field name is unknown. - - """ if fieldname not in self._invindex: raise TermNotFound(f"Unknown field {fieldname!r}") terms = sorted(self._invindex[fieldname]) @@ -811,99 +279,23 @@ def terms_from(self, fieldname, prefix): yield (fieldname, terms[i]) def term_info(self, fieldname, text): - """ - Get the term information for a specific term in a field. - - Args: - fieldname (str): The field name. - text (str): The term. - - Returns: - object: The term information object. - - """ return self._segment._terminfos[fieldname, text] def matcher(self, fieldname, btext, format_, scorer=None): - """ - Get a matcher for a specific term in a field. 
- - Args: - fieldname (str): The field name. - btext (bytes): The term as bytes. - format_ (object): The format object. - scorer (object, optional): The scorer object. Defaults to None. - - Returns: - object: The matcher object. - - """ items = self._invindex[fieldname][btext] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_, scorer=scorer) def indexed_field_names(self): - """ - Returns a list of field names that have been indexed. - - This method retrieves the keys from the inverted index dictionary - and returns them as a list. Each key represents a field name that - has been indexed. - - Returns: - list: A list of field names that have been indexed. - """ return self._invindex.keys() def close(self): - """ - Close the terms reader. - - This method is intentionally left empty. - - """ + # This method is intentionally left empty. pass class MemSegment(base.Segment): - """ - In-memory implementation of a segment for the Whoosh search engine. - - This class represents a segment of an index stored in memory. It provides methods for managing - documents, storing and retrieving data, and handling deletions. - - Attributes: - _codec (Codec): The codec used for encoding and decoding data. - _doccount (int): The total number of documents in the segment. - _stored (dict): A dictionary mapping document numbers to stored data. - _lengths (dict): A dictionary mapping document numbers to the length of the stored data. - _vectors (dict): A dictionary mapping document numbers to term vectors. - _invindex (dict): A dictionary mapping terms to inverted index entries. - _terminfos (dict): A dictionary mapping terms to term information. - _lock (Lock): A lock used for thread-safety. - - Methods: - codec(): Returns the codec used by the segment. - set_doc_count(doccount): Sets the total number of documents in the segment. - doc_count(): Returns the number of stored documents. - doc_count_all(): Returns the total number of documents in the segment, including deleted ones. - delete_document(docnum, delete=True): Deletes a document from the segment. - has_deletions(): Checks if the segment has any deleted documents. - is_deleted(docnum): Checks if a document is deleted. - deleted_docs(): Returns an iterator over the document numbers of deleted documents. - should_assemble(): Checks if the segment should be assembled. - - """ - def __init__(self, codec, indexname): - """ - Initializes a new instance of the MemSegment class. - - Args: - codec (Codec): The codec used for encoding and decoding data. - indexname (str): The name of the index. - - """ base.Segment.__init__(self, indexname) self._codec = codec self._doccount = 0 @@ -915,57 +307,18 @@ def __init__(self, codec, indexname): self._lock = Lock() def codec(self): - """ - Returns the codec used by the segment. - - Returns: - Codec: The codec used by the segment. - - """ return self._codec def set_doc_count(self, doccount): - """ - Sets the total number of documents in the segment. - - Args: - doccount (int): The total number of documents. - - """ self._doccount = doccount def doc_count(self): - """ - Returns the number of stored documents. - - Returns: - int: The number of stored documents. - - """ return len(self._stored) def doc_count_all(self): - """ - Returns the total number of documents in the segment, including deleted ones. - - Returns: - int: The total number of documents. - - """ return self._doccount def delete_document(self, docnum, delete=True): - """ - Deletes a document from the segment. 
- - Args: - docnum (int): The document number. - delete (bool): Whether to permanently delete the document. Default is True. - - Raises: - ValueError: If delete is False, as MemoryCodec does not support undeleting. - - """ if not delete: raise ValueError("MemoryCodec can't undelete") with self._lock: @@ -974,48 +327,17 @@ def delete_document(self, docnum, delete=True): del self._vectors[docnum] def has_deletions(self): - """ - Checks if the segment has any deleted documents. - - Returns: - bool: True if there are deleted documents, False otherwise. - - """ with self._lock: return self._doccount - len(self._stored) def is_deleted(self, docnum): - """ - Checks if a document is deleted. - - Args: - docnum (int): The document number. - - Returns: - bool: True if the document is deleted, False otherwise. - - """ return docnum not in self._stored def deleted_docs(self): - """ - Returns an iterator over the document numbers of deleted documents. - - Yields: - int: The document number of a deleted document. - - """ stored = self._stored for docnum in range(self.doc_count_all()): if docnum not in stored: yield docnum def should_assemble(self): - """ - Checks if the segment should be assembled. - - Returns: - bool: True if the segment should be assembled, False otherwise. - - """ return False diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py index b29bd64a..1e58ecb1 100644 --- a/src/whoosh/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -39,31 +39,7 @@ class LineWriter: - """ - A class for writing lines to a file with specified indentation and command. - - Attributes: - _dbfile (file): The file object to write the lines to. - - Methods: - _print_line(indent, command, **kwargs): Writes a line to the file with the specified indentation, command, and keyword arguments. - """ - def _print_line(self, indent, command, **kwargs): - """ - Writes a line to the file with the specified indentation, command, and keyword arguments. - - Args: - indent (int): The number of indentation levels for the line. - command (str): The command to write. - **kwargs: Additional keyword arguments to include in the line. - - Raises: - TypeError: If a keyword argument value is not of a valid type. - - Returns: - None - """ self._dbfile.write(b" " * indent) self._dbfile.write(command.encode("latin1")) for k, v in kwargs.items(): @@ -76,50 +52,17 @@ def _print_line(self, indent, command, **kwargs): class LineReader: - """A class for reading lines from a file and performing line-based operations.""" - def __init__(self, dbfile): - """ - Initialize a LineReader object. - - Parameters: - - dbfile (file): The file object to read lines from. - """ self._dbfile = dbfile def _reset(self): - """ - Reset the file pointer to the beginning of the file. - """ self._dbfile.seek(0) def _find_line(self, indent, command, **kwargs): - """ - Find the first line that matches the given indent, command, and keyword arguments. - - Parameters: - - indent (int): The indentation level of the line. - - command (str): The command to match. - - kwargs (dict): Keyword arguments to match against the line's arguments. - - Returns: - - tuple: A tuple containing the indent, command, and arguments of the matched line. - """ for largs in self._find_lines(indent, command, **kwargs): return largs def _find_lines(self, indent, command, **kwargs): - """ - Find all lines that match the given indent, command, and keyword arguments. - - Parameters: - - indent (int): The indentation level of the lines. 
- - command (str): The command to match. - - kwargs (dict): Keyword arguments to match against the lines' arguments. - - Yields: - - tuple: A tuple containing the indent, command, and arguments of each matched line. - """ while True: line = self._dbfile.readline() if not line: @@ -144,15 +87,6 @@ def _find_lines(self, indent, command, **kwargs): return def _parse_line(self, line): - """ - Parse a line and extract the indent, command, and arguments. - - Parameters: - - line (str): The line to parse. - - Returns: - - tuple: A tuple containing the indent, command, and arguments of the line. - """ line = line.decode("latin1") line = line.rstrip() l = len(line) @@ -171,18 +105,6 @@ def _parse_line(self, line): return (indent, command, args) def _find_root(self, command): - """ - Find the root section with the given command. - - Parameters: - - command (str): The command to match. - - Returns: - - tuple: A tuple containing the indent, command, and arguments of the root section. - - Raises: - - ValueError: If no root section with the given command is found. - """ self._reset() c = self._find_line(0, command) if c is None: @@ -190,341 +112,80 @@ def _find_root(self, command): # Codec class -class PlainTextCodec(base.Codec): - """ - Codec for storing and retrieving plain text documents in Whoosh. - - This codec provides the necessary methods for reading and writing plain text documents - in Whoosh. It is responsible for handling the storage, segmentation, and retrieval of - plain text data. - - Usage: - ------ - codec = PlainTextCodec() - per_doc_writer = codec.per_document_writer(storage, segment) - field_writer = codec.field_writer(storage, segment) - per_doc_reader = codec.per_document_reader(storage, segment) - terms_reader = codec.terms_reader(storage, segment) - segment = codec.new_segment(storage, indexname) - """ + +class PlainTextCodec(base.Codec): length_stats = False def per_document_writer(self, storage, segment): - """ - Returns a per-document writer for the given storage and segment. - - Parameters: - ----------- - storage : Storage - The storage object used for storing the documents. - segment : Segment - The segment object representing the current segment. - - Returns: - -------- - PlainPerDocWriter - The per-document writer for the given storage and segment. - """ return PlainPerDocWriter(storage, segment) def field_writer(self, storage, segment): - """ - Returns a field writer for the given storage and segment. - - Parameters: - ----------- - storage : Storage - The storage object used for storing the documents. - segment : Segment - The segment object representing the current segment. - - Returns: - -------- - PlainFieldWriter - The field writer for the given storage and segment. - """ return PlainFieldWriter(storage, segment) def per_document_reader(self, storage, segment): - """ - Returns a per-document reader for the given storage and segment. - - Parameters: - ----------- - storage : Storage - The storage object used for retrieving the documents. - segment : Segment - The segment object representing the current segment. - - Returns: - -------- - PlainPerDocReader - The per-document reader for the given storage and segment. - """ return PlainPerDocReader(storage, segment) def terms_reader(self, storage, segment): - """ - Returns a terms reader for the given storage and segment. - - Parameters: - ----------- - storage : Storage - The storage object used for retrieving the terms. - segment : Segment - The segment object representing the current segment. 
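Since PlainTextCodec simply plugs its writers and readers into the normal indexing machinery, a compact usage sketch may help. It assumes the segment writer accepts a `codec` keyword argument (true in recent Whoosh sources, but treat it as an assumption) and that the directory `indexdir` already exists:

    from whoosh import fields, index
    from whoosh.codec.plaintext import PlainTextCodec

    schema = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT(stored=True))
    ix = index.create_in("indexdir", schema)
    # Assumption: the writer forwards the codec keyword to the segment writer.
    with ix.writer(codec=PlainTextCodec()) as w:
        w.add_document(id="1", body="hello world")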
- - Returns: - -------- - PlainTermsReader - The terms reader for the given storage and segment. - """ return PlainTermsReader(storage, segment) def new_segment(self, storage, indexname): - """ - Creates a new segment for the given storage and index name. - - Parameters: - ----------- - storage : Storage - The storage object used for storing the segment. - indexname : str - The name of the index. - - Returns: - -------- - PlainSegment - The new segment for the given storage and index name. - """ return PlainSegment(indexname) class PlainPerDocWriter(base.PerDocumentWriter, LineWriter): - """ - A class that writes per-document data in plain text format. - - This class is responsible for writing per-document data, such as document fields, column values, and vector items, - in a plain text format. It inherits from the `PerDocumentWriter` and `LineWriter` classes. - - Usage: - 1. Create an instance of `PlainPerDocWriter` by providing a storage object and a segment object. - 2. Call the `start_doc` method to indicate the start of a new document. - 3. Call the `add_field` method to add a field to the document. - 4. Call the `add_column_value` method to add a column value to the document. - 5. Call the `add_vector_items` method to add vector items to the document. - 6. Call the `finish_doc` method to indicate the end of the current document. - 7. Call the `close` method to close the writer. - - Attributes: - - `_dbfile`: The file object used for writing per-document data. - - `is_closed`: A boolean indicating whether the writer has been closed. - """ - def __init__(self, storage, segment): - """ - Initializes a new instance of the PlainPerDocWriter class. - - Parameters: - - `storage`: The storage object used for creating the per-document data file. - - `segment`: The segment object representing the current segment. - - Returns: - None. - """ self._dbfile = storage.create_file(segment.make_filename(".dcs")) self._print_line(0, "DOCS") self.is_closed = False def start_doc(self, docnum): - """ - Indicates the start of a new document. - - Parameters: - - `docnum`: The document number. - - Returns: - None. - """ self._print_line(1, "DOC", dn=docnum) def add_field(self, fieldname, fieldobj, value, length): - """ - Adds a field to the current document. - - Parameters: - - `fieldname`: The name of the field. - - `fieldobj`: The field object. - - `value`: The value of the field. - - `length`: The length of the field value. - - Returns: - None. - """ if value is not None: value = dumps(value, 2) self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length) def add_column_value(self, fieldname, columnobj, value): - """ - Adds a column value to the current document. - - Parameters: - - `fieldname`: The name of the field. - - `columnobj`: The column object. - - `value`: The value of the column. - - Returns: - None. - """ self._print_line(2, "COLVAL", fn=fieldname, v=value) def add_vector_items(self, fieldname, fieldobj, items): - """ - Adds vector items to the current document. - - Parameters: - - `fieldname`: The name of the field. - - `fieldobj`: The field object. - - `items`: A list of vector items, where each item is a tuple containing the text, weight, and vector bytes. - - Returns: - None. - """ self._print_line(2, "VECTOR", fn=fieldname) for text, weight, vbytes in items: self._print_line(3, "VPOST", t=text, w=weight, v=vbytes) def finish_doc(self): - """ - Indicates the end of the current document. - - Returns: - None. - """ # This method is intentionally left empty. 
pass def close(self): - """ - Closes the writer. - - Returns: - None. - """ self._dbfile.close() self.is_closed = True class PlainPerDocReader(base.PerDocumentReader, LineReader): - """ - A reader for plain text per-document data in Whoosh index. - - This class provides methods to read per-document data stored in plain text format in a Whoosh index. - It inherits from the `PerDocumentReader` and `LineReader` classes. - - Attributes: - _dbfile (File): The file object representing the per-document data file. - _segment (Segment): The segment object representing the segment containing the per-document data. - is_closed (bool): Indicates whether the reader is closed or not. - - Methods: - doc_count(): Returns the number of documents in the segment. - doc_count_all(): Returns the total number of documents in the segment. - has_deletions(): Returns False, indicating that the segment does not have any deleted documents. - is_deleted(docnum): Returns False, indicating that the specified document is not deleted. - deleted_docs(): Returns an empty frozenset, indicating that there are no deleted documents. - _find_doc(docnum): Internal method to find a document by its number. - _iter_docs(): Internal method to iterate over the document numbers in the segment. - _iter_docfields(fieldname): Internal method to iterate over the lines of a specific field in the document. - _iter_lengths(fieldname): Internal method to iterate over the lengths of a specific field in the document. - doc_field_length(docnum, fieldname, default=0): Returns the length of a specific field in the document. - _column_values(fieldname): Internal method to iterate over the column values of a specific field in the document. - has_column(fieldname): Returns True if the specified field has column values in the document, False otherwise. - column_reader(fieldname, column): Returns a list of column values for a specific field in the document. - field_length(fieldname): Returns the total length of a specific field in the document. - min_field_length(fieldname): Returns the minimum length of a specific field in the document. - max_field_length(fieldname): Returns the maximum length of a specific field in the document. - has_vector(docnum, fieldname): Returns True if the document has a vector for the specified field, False otherwise. - vector(docnum, fieldname, format_): Returns a ListMatcher object representing the vector for the specified field in the document. - _read_stored_fields(): Internal method to read the stored fields of the document. - stored_fields(docnum): Returns a dictionary containing the stored fields of the document. - iter_docs(): Returns an iterator over the document numbers and their stored fields in the segment. - all_stored_fields(): Returns an iterator over the stored fields of all documents in the segment. - close(): Closes the reader and releases any associated resources. - """ - def __init__(self, storage, segment): - """ - Initializes a new instance of the PlainPerDocReader class. - - Args: - storage (Storage): The storage object representing the index storage. - segment (Segment): The segment object representing the segment containing the per-document data. - """ self._dbfile = storage.open_file(segment.make_filename(".dcs")) self._segment = segment self.is_closed = False def doc_count(self): - """ - Returns the number of documents in the segment. - - Returns: - int: The number of documents in the segment. 
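The per-document side follows the same start/add/finish protocol as the other codecs. A minimal sketch, assuming `storage`, `segment`, and `fieldobj` already exist (placeholder names):

    from whoosh.codec.plaintext import PlainPerDocWriter

    pdw = PlainPerDocWriter(storage, segment)
    pdw.start_doc(0)
    pdw.add_field("body", fieldobj, value="hello world", length=2)  # stored value + field length
    pdw.finish_doc()
    pdw.close()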
- """ return self._segment.doc_count() def doc_count_all(self): - """ - Returns the total number of documents in the segment. - - Returns: - int: The total number of documents in the segment. - """ return self._segment.doc_count() def has_deletions(self): - """ - Returns False, indicating that the segment does not have any deleted documents. - - Returns: - bool: False, indicating that the segment does not have any deleted documents. - """ return False def is_deleted(self, docnum): - """ - Returns False, indicating that the specified document is not deleted. - - Args: - docnum (int): The document number. - - Returns: - bool: False, indicating that the specified document is not deleted. - """ return False def deleted_docs(self): - """ - Returns an empty frozenset, indicating that there are no deleted documents. - - Returns: - frozenset: An empty frozenset, indicating that there are no deleted documents. - """ return frozenset() def _find_doc(self, docnum): - """ - Internal method to find a document by its number. - - Args: - docnum (int): The document number. - - Returns: - bool: True if the document is found, False otherwise. - """ self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: @@ -537,12 +198,6 @@ def _find_doc(self, docnum): return False def _iter_docs(self): - """ - Internal method to iterate over the document numbers in the segment. - - Yields: - int: The document number. - """ self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: @@ -550,42 +205,13 @@ def _iter_docs(self): c = self._find_line(1, "DOC") def _iter_docfields(self, fieldname): - """ - Internal method to iterate over the lines of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Yields: - dict: A dictionary representing a line of the field in the document. - """ for _ in self._iter_docs(): yield from self._find_lines(2, "DOCFIELD", fn=fieldname) def _iter_lengths(self, fieldname): - """ - Internal method to iterate over the lengths of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Yields: - int: The length of the field in the document. - """ return (c.get("len", 0) for c in self._iter_docfields(fieldname)) def doc_field_length(self, docnum, fieldname, default=0): - """ - Returns the length of a specific field in the document. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - default (int, optional): The default length to return if the field is not found. Defaults to 0. - - Returns: - int: The length of the field in the document, or the default length if the field is not found. - """ for dn in self._iter_docs(): if dn == docnum: c = self._find_line(2, "DOCFIELD", fn=fieldname) @@ -597,15 +223,6 @@ def doc_field_length(self, docnum, fieldname, default=0): return default def _column_values(self, fieldname): - """ - Internal method to iterate over the column values of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Yields: - Any: The column value. - """ for i, docnum in enumerate(self._iter_docs()): if i != docnum: raise ValueError(f"Missing column value for field {fieldname} doc {i}?") @@ -619,95 +236,28 @@ def _column_values(self, fieldname): yield c.get("v") def has_column(self, fieldname): - """ - Returns True if the specified field has column values in the document, False otherwise. - - Args: - fieldname (str): The name of the field. 
- - Returns: - bool: True if the specified field has column values in the document, False otherwise. - """ for _ in self._column_values(fieldname): return True return False def column_reader(self, fieldname, column): - """ - Returns a list of column values for a specific field in the document. - - Args: - fieldname (str): The name of the field. - column (int): The column number. - - Returns: - list: A list of column values for the specified field in the document. - """ return list(self._column_values(fieldname)) def field_length(self, fieldname): - """ - Returns the total length of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The total length of the field in the document. - """ return sum(self._iter_lengths(fieldname)) def min_field_length(self, fieldname): - """ - Returns the minimum length of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The minimum length of the field in the document. - """ return min(self._iter_lengths(fieldname)) def max_field_length(self, fieldname): - """ - Returns the maximum length of a specific field in the document. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The maximum length of the field in the document. - """ return max(self._iter_lengths(fieldname)) def has_vector(self, docnum, fieldname): - """ - Returns True if the document has a vector for the specified field, False otherwise. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - - Returns: - bool: True if the document has a vector for the specified field, False otherwise. - """ if self._find_doc(docnum) and self._find_line(2, "VECTOR"): return True return False def vector(self, docnum, fieldname, format_): - """ - Returns a ListMatcher object representing the vector for the specified field in the document. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - format_ (str): The format of the vector. - - Returns: - ListMatcher: A ListMatcher object representing the vector for the specified field in the document. - """ if not self._find_doc(docnum): raise ValueError("Document not found.") if not self._find_line(2, "VECTOR"): @@ -731,12 +281,6 @@ def vector(self, docnum, fieldname, format_): ) def _read_stored_fields(self): - """ - Internal method to read the stored fields of the document. - - Returns: - dict: A dictionary containing the stored fields of the document. - """ sfs = {} c = self._find_line(2, "DOCFIELD") while c is not None: @@ -748,128 +292,44 @@ def _read_stored_fields(self): return sfs def stored_fields(self, docnum): - """ - Returns a dictionary containing the stored fields of the document. - - Args: - docnum (int): The document number. - - Returns: - dict: A dictionary containing the stored fields of the document. - """ if not self._find_doc(docnum): raise ValueError("Document not found.") return self._read_stored_fields() def iter_docs(self): - """ - Returns an iterator over the document numbers and their stored fields in the segment. - - Yields: - tuple: A tuple containing the document number and its stored fields. - """ return enumerate(self.all_stored_fields()) def all_stored_fields(self): - """ - Returns an iterator over the stored fields of all documents in the segment. - - Yields: - dict: A dictionary containing the stored fields of a document. 
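Reading the per-document data back mirrors the writer. A short sketch, assuming `storage` and `segment` describe a segment previously written with PlainPerDocWriter:

    from whoosh.codec.plaintext import PlainPerDocReader

    pdr = PlainPerDocReader(storage, segment)
    for docnum, stored in pdr.iter_docs():    # (docnum, stored-fields dict) pairs
        print(docnum, stored)
    total = pdr.field_length("body")          # sum of the stored "body" lengths
    pdr.close()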
- """ for _ in self._iter_docs(): yield self._read_stored_fields() def close(self): - """ - Closes the reader and releases any associated resources. - """ self._dbfile.close() self.is_closed = True class PlainFieldWriter(base.FieldWriter, LineWriter): - """ - A class that writes field data in plain text format. - - This class is responsible for writing field data to a storage file in plain text format. - It implements the necessary methods to handle field, term, and posting information. - - Attributes: - _dbfile (File): The storage file for the field data. - _fieldobj (Field): The field object being written. - _terminfo (TermInfo): The term information being written. - - Methods: - __init__(self, storage, segment): Initializes a PlainFieldWriter instance. - is_closed(self): Checks if the writer is closed. - start_field(self, fieldname, fieldobj): Starts writing a new field. - start_term(self, btext): Starts writing a new term. - add(self, docnum, weight, vbytes, length): Adds a posting to the current term. - finish_term(self): Finishes writing the current term. - add_spell_word(self, fieldname, text): Adds a spell word to the current field. - close(self): Closes the writer and the storage file. - """ - def __init__(self, storage, segment): - """ - Initializes a PlainFieldWriter instance. - - Args: - storage (Storage): The storage object for the field data. - segment (Segment): The segment object for the field data. - """ self._dbfile = storage.create_file(segment.make_filename(".trm")) self._print_line(0, "TERMS") @property def is_closed(self): - """ - Checks if the writer is closed. - - Returns: - bool: True if the writer is closed, False otherwise. - """ return self._dbfile.is_closed def start_field(self, fieldname, fieldobj): - """ - Starts writing a new field. - - Args: - fieldname (str): The name of the field. - fieldobj (Field): The field object. - """ self._fieldobj = fieldobj self._print_line(1, "TERMFIELD", fn=fieldname) def start_term(self, btext): - """ - Starts writing a new term. - - Args: - btext (bytes): The term text in bytes. - """ self._terminfo = TermInfo() self._print_line(2, "BTEXT", t=btext) def add(self, docnum, weight, vbytes, length): - """ - Adds a posting to the current term. - - Args: - docnum (int): The document number. - weight (float): The weight of the posting. - vbytes (int): The number of bytes in the posting. - length (int): The length of the posting. - """ self._terminfo.add_posting(docnum, weight, length) self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes) def finish_term(self): - """ - Finishes writing the current term. - """ ti = self._terminfo self._print_line( 3, @@ -884,76 +344,24 @@ def finish_term(self): ) def add_spell_word(self, fieldname, text): - """ - Adds a spell word to the current field. - - Args: - fieldname (str): The name of the field. - text (str): The spell word text. - """ self._print_line(2, "SPELL", fn=fieldname, t=text) def close(self): - """ - Closes the writer and the storage file. - """ self._dbfile.close() class PlainTermsReader(base.TermsReader, LineReader): - """ - A reader for plain text terms in a Whoosh index. - - This class provides methods to read and retrieve terms, term information, - and perform term matching in a plain text index. - - Parameters: - - storage (Storage): The storage object representing the index. - - segment (Segment): The segment object representing the index segment. - - Attributes: - - _dbfile (File): The file object representing the terms file. 
- - _segment (Segment): The segment object representing the index segment. - - is_closed (bool): Indicates whether the reader is closed or not. - - """ - def __init__(self, storage, segment): - """ - Initializes a PlainTermsReader object. - - Parameters: - - storage (Storage): The storage object representing the index. - - segment (Segment): The segment object representing the index segment. - - """ self._dbfile = storage.open_file(segment.make_filename(".trm")) self._segment = segment self.is_closed = False def _find_field(self, fieldname): - """ - Finds the field with the given name in the terms file. - - Parameters: - - fieldname (str): The name of the field to find. - - Raises: - - TermNotFound: If the field with the given name is not found. - - """ self._find_root("TERMS") if self._find_line(1, "TERMFIELD", fn=fieldname) is None: raise TermNotFound(f"No field {fieldname!r}") def _iter_fields(self): - """ - Iterates over the field names in the terms file. - - Yields: - - str: The name of each field. - - """ self._find_root() c = self._find_line(1, "TERMFIELD") while c is not None: @@ -961,30 +369,12 @@ def _iter_fields(self): c = self._find_line(1, "TERMFIELD") def _iter_btexts(self): - """ - Iterates over the binary texts in the terms file. - - Yields: - - bytes: The binary text of each term. - - """ c = self._find_line(2, "BTEXT") while c is not None: yield c["t"] c = self._find_line(2, "BTEXT") def _find_term(self, fieldname, btext): - """ - Finds a term with the given field name and binary text in the terms file. - - Parameters: - - fieldname (str): The name of the field. - - btext (bytes): The binary text of the term. - - Returns: - - bool: True if the term is found, False otherwise. - - """ self._find_field(fieldname) for t in self._iter_btexts(): if t == btext: @@ -994,64 +384,22 @@ def _find_term(self, fieldname, btext): return False def _find_terminfo(self): - """ - Finds the term information in the terms file. - - Returns: - - TermInfo: The term information. - - """ c = self._find_line(3, "TERMINFO") return TermInfo(**c) def __contains__(self, term): - """ - Checks if a term is present in the terms file. - - Parameters: - - term (tuple): A tuple containing the field name and binary text of the term. - - Returns: - - bool: True if the term is present, False otherwise. - - """ fieldname, btext = term return self._find_term(fieldname, btext) def indexed_field_names(self): - """ - Returns the names of the indexed fields in the terms file. - - Returns: - - Iterator[str]: An iterator over the field names. - - """ return self._iter_fields() def terms(self): - """ - Returns an iterator over all the terms in the terms file. - - Yields: - - tuple: A tuple containing the field name and binary text of each term. - - """ for fieldname in self._iter_fields(): for btext in self._iter_btexts(): yield (fieldname, btext) def terms_from(self, fieldname, prefix): - """ - Returns an iterator over the terms with the given field name and prefix. - - Parameters: - - fieldname (str): The name of the field. - - prefix (bytes): The prefix of the terms. - - Yields: - - tuple: A tuple containing the field name and binary text of each term. - - """ self._find_field(fieldname) for btext in self._iter_btexts(): if btext < prefix: @@ -1059,67 +407,19 @@ def terms_from(self, fieldname, prefix): yield (fieldname, btext) def items(self): - """ - Returns an iterator over the terms and their corresponding term information. 
- - Yields: - - tuple: A tuple containing the term (field name and binary text) and its term information. - - """ for fieldname, btext in self.terms(): yield (fieldname, btext), self._find_terminfo() def items_from(self, fieldname, prefix): - """ - Returns an iterator over the terms with the given field name and prefix, and their corresponding term information. - - Parameters: - - fieldname (str): The name of the field. - - prefix (bytes): The prefix of the terms. - - Yields: - - tuple: A tuple containing the term (field name and binary text) and its term information. - - """ for fieldname, btext in self.terms_from(fieldname, prefix): yield (fieldname, btext), self._find_terminfo() def term_info(self, fieldname, btext): - """ - Retrieves the term information for the given field name and binary text. - - Parameters: - - fieldname (str): The name of the field. - - btext (bytes): The binary text of the term. - - Returns: - - TermInfo: The term information. - - Raises: - - TermNotFound: If the term is not found. - - """ if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) return self._find_terminfo() def matcher(self, fieldname, btext, format_, scorer=None): - """ - Creates a matcher for the given field name and binary text. - - Parameters: - - fieldname (str): The name of the field. - - btext (bytes): The binary text of the term. - - format_ (int): The format of the matcher. - - scorer (Scorer): The scorer object to use for scoring the matches. - - Returns: - - ListMatcher: The matcher object. - - Raises: - - TermNotFound: If the term is not found. - - """ if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) @@ -1136,73 +436,23 @@ def matcher(self, fieldname, btext, format_, scorer=None): return ListMatcher(ids, weights, values, format_, scorer=scorer) def close(self): - """ - Closes the PlainTermsReader object. - - """ self._dbfile.close() self.is_closed = True class PlainSegment(base.Segment): - """ - Represents a segment in a plain text index. - - This class is responsible for managing a segment in a plain text index. - It keeps track of the document count and provides methods to interact - with the segment. - - Attributes: - _doccount (int): The number of documents in the segment. - """ - def __init__(self, indexname): - """ - Initializes a PlainSegment object. - - Args: - indexname (str): The name of the index. - - """ base.Segment.__init__(self, indexname) self._doccount = 0 def codec(self): - """ - Returns the codec associated with the segment. - - Returns: - PlainTextCodec: The codec associated with the segment. - - """ return PlainTextCodec() def set_doc_count(self, doccount): - """ - Sets the document count for the segment. - - Args: - doccount (int): The number of documents in the segment. - - """ self._doccount = doccount def doc_count(self): - """ - Returns the document count for the segment. - - Returns: - int: The number of documents in the segment. - - """ return self._doccount def should_assemble(self): - """ - Determines whether the segment should be assembled. - - Returns: - bool: True if the segment should be assembled, False otherwise. 
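The terms side can be exercised the same way. A sketch, assuming `storage` and `segment` describe a segment written with PlainFieldWriter and `fmt` is the field's posting format object (placeholder names):

    from whoosh.codec.plaintext import PlainTermsReader

    tr = PlainTermsReader(storage, segment)
    for fieldname, btext in tr.terms():
        ti = tr.term_info(fieldname, btext)    # TermInfo for the term
    m = tr.matcher("body", b"hello", fmt)      # ListMatcher over the term's postings
    tr.close()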
- - """ return False diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py index 244357b2..66042554 100644 --- a/src/whoosh/codec/whoosh2.py +++ b/src/whoosh/codec/whoosh2.py @@ -35,10 +35,6 @@ from pickle import dumps, loads from struct import Struct -from iniconfig import ParseError - -from whoosh.qparser.dateparse import DateParseError - try: import zlib except ImportError: @@ -78,27 +74,6 @@ def cdb_hash(key): - """ - Calculate the hash value for a given key using the CDB hash algorithm. - - Args: - key (str): The key to calculate the hash value for. - - Returns: - int: The calculated hash value. - - Algorithm: - The CDB hash algorithm is a simple and efficient hash function. - It uses the following steps to calculate the hash value: - 1. Initialize the hash value to 5381. - 2. For each character in the key, update the hash value using the formula: - h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) - 3. Return the final hash value. - - Example: - >>> cdb_hash("hello") - 1934859637 - """ h = 5381 for c in key: h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) @@ -106,37 +81,10 @@ def cdb_hash(key): def md5_hash(key): - """ - Calculate the MD5 hash of the given key and return the hash value as an integer. - - Parameters: - key (str): The key to be hashed. - - Returns: - int: The MD5 hash value of the key as an integer. - - Example: - >>> md5_hash("hello") - 1234567890 - - Note: - This function uses the MD5 algorithm to calculate the hash value of the key. - The resulting hash value is converted to an integer and returned. - """ return int(md5(key).hexdigest(), 16) & 0xFFFFFFFF def crc_hash(key): - """ - Calculates the CRC hash value for the given key. - - Args: - key (bytes): The key to calculate the CRC hash for. - - Returns: - int: The CRC hash value. - - """ return crc32(key) & 0xFFFFFFFF @@ -162,49 +110,7 @@ def crc_hash(key): class HashWriter: - """ - A class for writing hash-based data to a file. - - Parameters: - - dbfile (file-like object): The file-like object to write the hash data to. - - hashtype (int, optional): The type of hashing function to use. Defaults to 2. - - Attributes: - - dbfile (file-like object): The file-like object to write the hash data to. - - hashtype (int): The type of hashing function used. - - extras (dict): Extra data associated with the hash data. - - startoffset (int): The starting offset in the file where the hash data is written. - - header_size (int): The size of the header in bytes. - - hash_func (function): The hashing function used. - - hashes (defaultdict): A dictionary of hashed values. - - Methods: - - add(key, value): Adds a key-value pair to the hash data. - - add_all(items): Adds multiple key-value pairs to the hash data. - - _write_hashes(): Writes the hash data to the file. - - _write_extras(): Writes the extra data to the file. - - _write_directory(): Writes the directory of hash values to the file. - - close(): Closes the file. - - """ - def __init__(self, dbfile, hashtype=2): - """ - Initialize a Whoosh2 codec object. - - Args: - dbfile (file-like object): The file-like object representing the database file. - hashtype (int, optional): The type of hashing function to be used. Defaults to 2. - - Attributes: - dbfile (file-like object): The file-like object representing the database file. - hashtype (int): The type of hashing function used. - extras (dict): A dictionary to store additional data. - startoffset (int): The starting offset in the database file. - header_size (int): The size of the header in bytes. 
- hash_func (function): The hashing function used. - hashes (defaultdict): A dictionary to store the directory of hashed values. - """ self.dbfile = dbfile self.hashtype = hashtype self.extras = {} @@ -225,32 +131,6 @@ def __init__(self, dbfile, hashtype=2): self.hashes = defaultdict(list) def add(self, key, value): - """ - Adds a key-value pair to the hash data. - - Parameters: - - key (bytes): The key to be hashed. - - value (bytes): The value associated with the key. - - Returns: - None - - Raises: - AssertionError: If the key or value is not of type bytes. - - Notes: - - This method writes the length of the key and value to the database file, followed by the key and value themselves. - - The key is hashed using the hash function specified during initialization. - - The hashed key and the position in the database file where the key-value pair is written are stored in a list for efficient retrieval. - - Usage: - ``` - db = HashDatabase() - key = b'my_key' - value = b'my_value' - db.add(key, value) - ``` - """ assert isinstance(key, bytes) assert isinstance(value, bytes) @@ -264,42 +144,11 @@ def add(self, key, value): self.hashes[h & 255].append((h, pos)) def add_all(self, items): - """ - Adds multiple key-value pairs to the hash data. - - Parameters: - - items (iterable): An iterable of (key, value) pairs. - - Usage: - - To add multiple key-value pairs to the hash data, pass an iterable of (key, value) pairs to the `add_all` method. - - Example: - >>> data = [('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3')] - >>> hash_data.add_all(data) - - """ add = self.add for key, value in items: add(key, value) def _write_hashes(self): - """ - Writes the hash data to the file. - - This method writes the hash data to the file, which is used for efficient - lookup of terms in the index. It generates a directory of positions and - number of slots for each hash value, and then writes the hash table entries - to the file. - - The hash table entries are stored in a list of tuples, where each tuple - contains the hash value and the position of the term in the index file. - - Usage: - _write_hashes() - - Returns: - None - """ dbfile = self.dbfile hashes = self.hashes directory = self.directory = [] @@ -327,16 +176,6 @@ def _write_hashes(self): self.extrasoffset = dbfile.tell() def _write_extras(self): - """ - Writes the extra data to the file. - - This method is responsible for writing the extra data to the file. - It first serializes the extras object using pickle and writes it to the file. - Then, it seeks back to the start offset + 8 and writes the pointer to the extras. - - Note: The extras object must be serializable using pickle. - - """ self.dbfile.write_pickle(self.extras) # Seek back and write the pointer to the extras self.dbfile.flush() @@ -344,18 +183,6 @@ def _write_extras(self): self.dbfile.write_long(self.extrasoffset) def _write_directory(self): - """ - Writes the directory of hash values to the file. - - This method is responsible for writing the directory of hash values to the file. - It seeks back to the header, writes the pointer to the end of the hashes, - and writes the pointers to the hash tables. - - Note: - This method assumes that the file has already been opened and positioned - correctly at the start offset. - - """ dbfile = self.dbfile directory = self.directory @@ -371,16 +198,6 @@ def _write_directory(self): assert dbfile.tell() == self.header_size def close(self): - """ - Closes the file. 
- - This method is responsible for closing the file and performing any necessary cleanup operations. - It writes the hashes, extras, and directory to the file, and then closes the file object. - - Note: - - After calling this method, the file object should no longer be used. - - """ self._write_hashes() self._write_extras() self._write_directory() @@ -388,31 +205,7 @@ def close(self): class HashReader: - """ - A class for reading and accessing data from a hash-based file format. - - Args: - dbfile (file-like object): The file-like object representing the hash-based file. - startoffset (int, optional): The starting offset in the file. Defaults to 0. - - Raises: - ValueError: If the file header is unknown. - - Attributes: - dbfile (file-like object): The file-like object representing the hash-based file. - startoffset (int): The starting offset in the file. - is_closed (bool): Indicates whether the HashReader is closed or not. - - """ - def __init__(self, dbfile, startoffset=0): - """ - Initialize a Whoosh2 object. - - Args: - dbfile (file-like object): The file-like object representing the Whoosh2 database file. - startoffset (int, optional): The starting offset in the file. Defaults to 0. - """ self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False @@ -443,92 +236,22 @@ def __init__(self, dbfile, startoffset=0): self._read_extras() def _read_extras(self): - """ - Read the extras section of the hash-based file. - - This method reads the extras section of the hash-based file and stores the - data in the `extras` attribute of the object. The extras section contains - additional metadata or auxiliary information associated with the file. - - Raises: - EOFError: If the end of the file is reached before reading the extras. - """ try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): - """ - Close the HashReader. - - This method closes the HashReader and releases any resources held by it. Once closed, - the HashReader cannot be used again. - - Raises: - ValueError: If the HashReader is already closed. - """ if self.is_closed: raise ValueError(f"Tried to close {self} twice") self.dbfile.close() self.is_closed = True def read(self, position, length): - """ - Read data from the hash-based file. - - Args: - position (int): The position in the file to start reading from. - length (int): The number of bytes to read. - - Returns: - bytes: The read data. - - Raises: - OSError: If there is an error reading the file. - - Notes: - This method reads data from the hash-based file at the specified position and with the specified length. - It is used to retrieve data from the file. - """ self.dbfile.seek(position) return self.dbfile.read(length) def _ranges(self, pos=None): - """ - Generate ranges of key-value pairs in the hash-based file. - - Args: - pos (int, optional): The starting position in the file. Defaults to None. - - Yields: - tuple: A tuple containing the key position, key length, data position, and data length. - - Raises: - ValueError: If the starting position is beyond the end of the file. - - Notes: - This method is used to iterate over the key-value pairs stored in the hash-based file. - It generates tuples containing the position and length of the key, as well as the position - and length of the corresponding data. - - The `pos` parameter allows you to specify a starting position in the file. If `pos` is not - provided, the method will start from the beginning of the file. 
- - The method uses the `read` method to read data from the file. The `read` method should be - implemented by the subclass to read the specified number of bytes from the file at the given - position. - - The method calculates the key position, key length, data position, and data length based on - the lengths stored in the file. It then updates the position to point to the next key-value - pair in the file. - - The method yields each tuple of key-value pair ranges, allowing you to process them one by one. - The caller can iterate over the yielded tuples using a for loop or any other iterable method. - - If the starting position is beyond the end of the file, a `ValueError` is raised. - - """ if pos is None: pos = self.header_size eod = self._start_of_hashes @@ -541,30 +264,9 @@ def _ranges(self, pos=None): yield (keypos, keylen, datapos, datalen) def __iter__(self): - """ - Iterate over the key-value pairs in the hash-based file. - - This method returns an iterator that allows iterating over the key-value pairs - stored in the hash-based file. Each iteration yields a tuple containing the key - and value. - - Returns: - iterator: An iterator over the key-value pairs in the hash-based file. - - Example: - >>> for key, value in hash_file: - ... print(key, value) - """ return iter(self.items()) def items(self): - """ - Iterate over the key-value pairs in the hash-based file. - - Yields: - tuple: A tuple containing the key and value. - - """ read = self.read for keypos, keylen, datapos, datalen in self._ranges(): key = read(keypos, keylen) @@ -572,119 +274,40 @@ def items(self): yield (key, value) def keys(self): - """ - Iterate over the keys in the hash-based file. - - This method returns an iterator that yields the keys stored in the hash-based file. - The keys are returned as bytes. - - Yields: - bytes: The key. - """ read = self.read for keypos, keylen, _, _ in self._ranges(): yield read(keypos, keylen) def values(self): - """ - Iterate over the values in the hash-based file. - - This method returns a generator that iterates over the values stored in the hash-based file. - Each value is read from the file using the `read` method. - - Yields: - bytes: The value. - """ read = self.read for _, _, datapos, datalen in self._ranges(): yield read(datapos, datalen) def __getitem__(self, key): - """ - Get the value associated with the given key. - - Args: - key (bytes): The key to retrieve the value for. - - Returns: - bytes: The value associated with the key. - - Raises: - KeyError: If the key is not found. - """ for data in self.all(key): return data raise KeyError(key) def get(self, key, default=None): - """ - Get the value associated with the given key, or a default value if the key is not found. - - Args: - key (bytes): The key to retrieve the value for. - default (Any, optional): The default value to return if the key is not found. Defaults to None. - - Returns: - bytes: The value associated with the key, or the default value if the key is not found. - """ for data in self.all(key): return data return default def all(self, key): - """ - Get all values associated with the given key. - - Args: - key (bytes): The key to retrieve the values for. - - Yields: - bytes: The values associated with the key. - """ read = self.read for datapos, datalen in self.ranges_for_key(key): yield read(datapos, datalen) def __contains__(self, key): - """ - Check if the given key is present in the hash-based file. - - Args: - key (bytes): The key to check. 
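HashWriter and HashReader form a simple on-disk key/value store over a storage file; both keys and values must be bytes. A round-trip sketch, assuming an existing directory `indexdir`:

    from whoosh.codec.whoosh2 import HashWriter, HashReader
    from whoosh.filedb.filestore import FileStorage

    st = FileStorage("indexdir")
    hw = HashWriter(st.create_file("terms.hsh"))
    hw.add(b"alpha", b"1")
    hw.add(b"beta", b"2")
    hw.close()                                 # writes hash slots, extras and directory

    hr = HashReader(st.open_file("terms.hsh"))
    assert hr.get(b"alpha") == b"1"
    assert b"beta" in hr
    hr.close()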
- - Returns: - bool: True if the key is present, False otherwise. - """ for _ in self.ranges_for_key(key): return True return False def _hashtable_info(self, keyhash): - """ - Get the directory position and number of hash entries for the given key hash. - - Args: - keyhash (int): The hash value of the key. - - Returns: - tuple: A tuple containing the directory position and number of hash entries. - """ # Return (directory_position, number_of_hash_entries) return self.buckets[keyhash & 255] def _key_position(self, key): - """ - Get the position of the given key in the hash-based file. - - Args: - key (bytes): The key to get the position for. - - Returns: - int: The position of the key. - - Raises: - KeyError: If the key is not found. - """ keyhash = self.hash_func(key) hpos, hslots = self._hashtable_info(keyhash) if not hslots: @@ -694,28 +317,10 @@ def _key_position(self, key): return self.dbfile.get_long(slotpos + _INT_SIZE) def _key_at(self, pos): - """ - Get the key at the given position in the hash-based file. - - Args: - pos (int): The position of the key. - - Returns: - bytes: The key. - """ keylen = self.dbfile.get_uint(pos) return self.read(pos + lengths_size, keylen) def ranges_for_key(self, key): - """ - Get the ranges of data associated with the given key. - - Args: - key (bytes): The key to retrieve the ranges for. - - Yields: - tuple: A tuple containing the data position and data length. - """ read = self.read if not isinstance(key, bytes): raise TypeError(f"Key {key} should be bytes") @@ -742,57 +347,18 @@ def ranges_for_key(self, key): yield (pos + lengths_size + keylen, datalen) def range_for_key(self, key): - """ - Get the first range of data associated with the given key. - - Args: - key (bytes): The key to retrieve the range for. - - Returns: - tuple: A tuple containing the data position and data length. - - Raises: - KeyError: If the key is not found. - """ for item in self.ranges_for_key(key): return item raise KeyError(key) class OrderedHashWriter(HashWriter): - """ - A class for writing key-value pairs to a hash-based database file with ordered keys. - - Inherits from HashWriter. - - Usage: - writer = OrderedHashWriter(dbfile) - writer.add(key, value) - writer.commit() - """ - def __init__(self, dbfile): - """ - Initializes an OrderedHashWriter object. - - Parameters: - - dbfile (file): The file object representing the hash-based database file. - """ HashWriter.__init__(self, dbfile) self.index = GrowableArray("H") self.lastkey = emptybytes def add(self, key, value): - """ - Adds a key-value pair to the database. - - Parameters: - - key: The key to be added. - - value: The value associated with the key. - - Raises: - - ValueError: If the keys are not in increasing order. - """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) @@ -800,9 +366,6 @@ def add(self, key, value): self.lastkey = key def _write_extras(self): - """ - Writes additional information about the index to the extras section of the database file. - """ dbfile = self.dbfile # Save information about the index in the extras @@ -819,31 +382,7 @@ def _write_extras(self): class OrderedHashReader(HashReader): - """ - A class for reading ordered hash data from a database file. - - Inherits from HashReader. - - Attributes: - indexbase (int): The base position of the index in the database file. - indexlen (int): The length of the index. - indextype (str): The type of the index. - _ixsize (int): The size of each index entry in bytes. 
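The ordered variants add a key index on top of the plain hash file: keys must be added in strictly increasing byte order, and the reader can then scan from an arbitrary starting key. A sketch under the same assumptions as the previous example:

    from whoosh.codec.whoosh2 import OrderedHashWriter, OrderedHashReader
    from whoosh.filedb.filestore import FileStorage

    st = FileStorage("indexdir")
    ow = OrderedHashWriter(st.create_file("ordered.hsh"))
    ow.add(b"apple", b"1")
    ow.add(b"banana", b"2")                    # keys must strictly increase
    ow.close()

    ohr = OrderedHashReader(st.open_file("ordered.hsh"))
    for key in ohr.keys_from(b"b"):            # keys at or after b"b"
        print(key)
    ohr.close()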
- _ixpos (function): A function for reading index values based on the indextype. - - Methods: - closest_key(key): Returns the closest key to the given key in the hash data. - items_from(key): Yields key-value pairs starting from the given key. - keys_from(key): Yields keys starting from the given key. - """ - def __init__(self, dbfile): - """ - Initializes an OrderedHashReader object. - - Args: - dbfile (file): The database file to read from. - """ HashReader.__init__(self, dbfile) self.indexbase = self.extras["indexbase"] self.indexlen = self.extras["indexlen"] @@ -864,15 +403,6 @@ def __init__(self, dbfile): raise ValueError(f"Unknown index type {indextype}") def _closest_key(self, key): - """ - Finds the closest key to the given key in the hash data. - - Args: - key (bytes): The key to search for. - - Returns: - int or None: The position of the closest key in the hash data, or None if not found. - """ key_at = self._key_at indexbase = self.indexbase ixpos, ixsize = self._ixpos, self._ixsize @@ -894,30 +424,13 @@ def _closest_key(self, key): return ixpos(indexbase + lo * ixsize) def closest_key(self, key): - """ - Returns the closest key to the given key in the hash data. - - Args: - key (bytes): The key to search for. - - Returns: - bytes or None: The closest key to the given key, or None if not found. - """ pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): - """ - Generates ranges of key-value pairs starting from the given key. - - Args: - key (bytes): The key to start from. - - Yields: - tuple: A tuple containing the key position, key length, data position, and data length. - """ + # read = self.read pos = self._closest_key(key) if pos is None: return @@ -925,29 +438,11 @@ def _ranges_from(self, key): yield from self._ranges(pos=pos) def items_from(self, key): - """ - Yields key-value pairs starting from the given key. - - Args: - key (bytes): The key to start from. - - Yields: - tuple: A tuple containing the key and value. - """ read = self.read for keypos, keylen, datapos, datalen in self._ranges_from(key): yield (read(keypos, keylen), read(datapos, datalen)) def keys_from(self, key): - """ - Yields keys starting from the given key. - - Args: - key (bytes): The key to start from. - - Yields: - bytes: The key. - """ read = self.read for keypos, keylen, _, _ in self._ranges_from(key): yield read(keypos, keylen) @@ -957,31 +452,6 @@ def keys_from(self, key): class W2Codec(base.Codec): - """ - Codec implementation for the Whoosh 2 index format. - - This codec provides the necessary methods for reading and writing - various components of the index, such as term index, term postings, - spelling graph, field lengths, vector index, vector postings, and - stored fields. - - Args: - blocklimit (int): The maximum number of terms to store in a block. - compression (int): The level of compression to apply to the index data. - loadlengths (bool): Whether to load field lengths during reading. - inlinelimit (int): The maximum number of terms to store in a field block. - - Attributes: - TERMS_EXT (str): The file extension for the term index. - POSTS_EXT (str): The file extension for the term postings. - DAWG_EXT (str): The file extension for the spelling graph. - LENGTHS_EXT (str): The file extension for the field lengths. - VECTOR_EXT (str): The file extension for the vector index. - VPOSTS_EXT (str): The file extension for the vector postings. - STORED_EXT (str): The file extension for the stored fields. 
- - """ - TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings DAWG_EXT = FST_EXT = ".dag" # Spelling graph file @@ -991,46 +461,19 @@ class W2Codec(base.Codec): STORED_EXT = ".sto" # Stored fields file def __init__(self, blocklimit=128, compression=3, loadlengths=False, inlinelimit=1): - """ - Initialize the W2Codec. - - Args: - blocklimit (int): The maximum number of terms to store in a block. - compression (int): The level of compression to apply to the index data. - loadlengths (bool): Whether to load field lengths during reading. - inlinelimit (int): The maximum number of terms to store in a field block. - """ self.blocklimit = blocklimit self.compression = compression self.loadlengths = loadlengths self.inlinelimit = inlinelimit + # Per-document value writer def per_document_writer(self, storage, segment): - """ - Create a per-document value writer. - - Args: - storage: The storage object for the index. - segment: The segment object for the index. - - Returns: - W2PerDocWriter: The per-document value writer. - """ return W2PerDocWriter( storage, segment, blocklimit=self.blocklimit, compression=self.compression ) + # Inverted index writer def field_writer(self, storage, segment): - """ - Create an inverted index writer. - - Args: - storage: The storage object for the index. - segment: The segment object for the index. - - Returns: - W2FieldWriter: The inverted index writer. - """ return W2FieldWriter( storage, segment, @@ -1039,65 +482,26 @@ def field_writer(self, storage, segment): inlinelimit=self.inlinelimit, ) - def terms_reader(self, storage, segment): - """ - Create a terms reader. - - Args: - storage: The storage object for the index. - segment: The segment object for the index. + # Readers - Returns: - W2TermsReader: The terms reader. - """ + def terms_reader(self, storage, segment): tifile = segment.open_file(storage, self.TERMS_EXT) postfile = segment.open_file(storage, self.POSTS_EXT) return W2TermsReader(tifile, postfile) def per_document_reader(self, storage, segment): - """ - Create a per-document reader. - - Args: - storage: The storage object for the index. - segment: The segment object for the index. - - Returns: - W2PerDocReader: The per-document reader. - """ return W2PerDocReader(storage, segment) def graph_reader(self, storage, segment): - """ - Create a graph reader. - - Args: - storage: The storage object for the index. - segment: The segment object for the index. - - Returns: - GraphReader: The graph reader. - - Raises: - NoGraphError: If the spelling graph file is not found. - """ try: dawgfile = segment.open_file(storage, self.DAWG_EXT) except ValueError: raise NoGraphError return GraphReader(dawgfile) - def new_segment(self, storage, indexname): - """ - Create a new segment. + # Segments and generations - Args: - storage: The storage object for the index. - indexname (str): The name of the index. - - Returns: - W2Segment: The new segment. - """ + def new_segment(self, storage, indexname): return W2Segment(indexname) @@ -1105,30 +509,9 @@ def new_segment(self, storage, indexname): class W2PerDocWriter(base.PerDocumentWriter): - """A class for writing per-document data in the Whoosh 2 codec. - - Args: - storage (Storage): The storage object to use for creating files. - segment (Segment): The segment object representing the current segment. - blocklimit (int, optional): The maximum number of vector items to store in a block. Defaults to 128. - compression (int, optional): The compression level to use when writing vector blocks. Defaults to 3. 
- - Attributes: - storage (Storage): The storage object used for creating files. - segment (Segment): The segment object representing the current segment. - blocklimit (int): The maximum number of vector items to store in a block. - compression (int): The compression level used when writing vector blocks. - doccount (int): The total number of documents written. - is_closed (bool): Indicates whether the writer has been closed. - - Note: - This class is used internally by the Whoosh 2 codec and should not be instantiated directly. - - """ - def __init__(self, storage, segment, blocklimit=128, compression=3): if not isinstance(blocklimit, int): - raise ValueError("blocklimit must be an integer") + raise ValueError self.storage = storage self.segment = segment self.blocklimit = blocklimit @@ -1147,62 +530,30 @@ def __init__(self, storage, segment, blocklimit=128, compression=3): self.vindex = self.vpostfile = None def _make_vector_files(self): - """Create the vector index and vector postings files.""" vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT) self.vindex = VectorWriter(vifile) self.vpostfile = self.segment.create_file(self.storage, W2Codec.VPOSTS_EXT) def start_doc(self, docnum): - """Start writing a new document. - - Args: - docnum (int): The document number. - - """ self.docnum = docnum self.storedfields = {} self.doccount = max(self.doccount, docnum + 1) def add_field(self, fieldname, fieldobj, value, length): - """Add a field to the current document. - - Args: - fieldname (str): The name of the field. - fieldobj (Field): The field object. - value (object): The field value. - length (int): The length of the field value. - - """ if length: self.lengths.add(self.docnum, fieldname, length) if value is not None: self.storedfields[fieldname] = value def _new_block(self, vformat): - """Create a new vector block. - - Args: - vformat (Format): The vector format. - - Returns: - W2Block: The new vector block. - - """ postingsize = vformat.posting_size return W2Block(postingsize, stringids=True) def add_vector_items(self, fieldname, fieldobj, items): - """Add vector items to the current document. - - Args: - fieldname (str): The name of the vector field. - fieldobj (Field): The vector field object. - items (list): A list of vector items in the format (text, weight, value_bytes). - - """ if self.vindex is None: self._make_vector_files() + # items = (text, weight, value_bytes) ... postfile = self.vpostfile blocklimit = self.blocklimit block = self._new_block(fieldobj.vector) @@ -1238,12 +589,10 @@ def add_vector_items(self, fieldname, fieldobj, items): self.vindex.add((self.docnum, fieldname), startoffset) def finish_doc(self): - """Finish writing the current document.""" self.stored.add(self.storedfields) self.storedfields = None def close(self): - """Close the writer.""" if self.storedfields is not None: self.stored.add(self.storedfields) self.stored.close() @@ -1259,64 +608,7 @@ def close(self): class W2FieldWriter(base.FieldWriter): - """ - The W2FieldWriter class is responsible for writing field data to the index files in the Whoosh search engine. - - Parameters: - - storage (Storage): The storage object used to store the index files. - - segment (base.Segment): The segment object representing the current segment being written. - - blocklimit (int): The maximum number of documents to store in a single block. - - compression (int): The level of compression to apply to the block data. 
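The per-document writer follows a start_doc / add_field / finish_doc contract: lengths are recorded as they arrive and the stored-field dict is committed when the document ends. A stripped-down sketch of that flow, using plain dicts instead of the real length and stored-field files (all names here are illustrative):

    class MiniPerDocWriter:
        """Collects stored values and field lengths per document."""
        def __init__(self):
            self.lengths = {}   # (docnum, fieldname) -> length
            self.stored = []    # one stored-field dict per finished document
            self.docnum = None
            self.storedfields = None

        def start_doc(self, docnum):
            self.docnum = docnum
            self.storedfields = {}

        def add_field(self, fieldname, value, length):
            if length:
                self.lengths[(self.docnum, fieldname)] = length
            if value is not None:
                self.storedfields[fieldname] = value

        def finish_doc(self):
            self.stored.append(self.storedfields)
            self.storedfields = None

    w = MiniPerDocWriter()
    w.start_doc(0)
    w.add_field("title", "Hello", length=1)
    w.add_field("body", None, length=42)
    w.finish_doc()
    print(w.stored)   # [{'title': 'Hello'}]
    print(w.lengths)  # {(0, 'title'): 1, (0, 'body'): 42}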
- - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. - - Attributes: - - storage (Storage): The storage object used to store the index files. - - segment (base.Segment): The segment object representing the current segment being written. - - fieldname (str): The name of the field being written. - - text (str): The text of the current term being written. - - field (Field): The field object being written. - - format (Format): The format object associated with the field. - - spelling (bool): Indicates whether the field has spelling enabled. - - termsindex (TermIndexWriter): The term index writer object. - - postfile (File): The file object for storing the posting data. - - dawg (GraphWriter): The DAWG (Directed Acyclic Word Graph) writer object. - - blocklimit (int): The maximum number of documents to store in a single block. - - compression (int): The level of compression to apply to the block data. - - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. - - block (W2Block): The current block being written. - - terminfo (FileTermInfo): The term info object for the current term. - - _infield (bool): Indicates whether the writer is currently inside a field. - - is_closed (bool): Indicates whether the writer has been closed. - - Methods: - - _make_dawg_files(): Creates the DAWG (Directed Acyclic Word Graph) files if needed. - - _new_block(): Creates a new block object. - - _reset_block(): Resets the current block. - - _write_block(): Writes the current block to the posting file. - - _start_blocklist(): Starts a new block list in the posting file. - - start_field(fieldname, fieldobj): Starts writing a new field. - - start_term(text): Starts writing a new term. - - add(docnum, weight, valuestring, length): Adds a document to the current block. - - add_spell_word(fieldname, text): Adds a spelling word to the DAWG. - - finish_term(): Finishes writing the current term. - - finish_field(): Finishes writing the current field. - - close(): Closes the writer and releases any resources. - """ - def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit=1): - """ - Initializes a new instance of the W2FieldWriter class. - - Parameters: - - storage (Storage): The storage object used to store the index files. - - segment (base.Segment): The segment object representing the current segment being written. - - blocklimit (int): The maximum number of documents to store in a single block. - - compression (int): The level of compression to apply to the block data. - - inlinelimit (int): The maximum number of documents to store inline without creating a separate block. - - Raises: - - AssertionError: If the input parameters are not of the expected types. - """ assert isinstance(storage, Storage) assert isinstance(segment, base.Segment) assert isinstance(blocklimit, int) @@ -1348,40 +640,22 @@ def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit= self.is_closed = False def _make_dawg_files(self): - """ - Creates the DAWG (Directed Acyclic Word Graph) files if needed. - """ dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT) self.dawg = GraphWriter(dawgfile) def _new_block(self): - """ - Creates a new block object. - - Returns: - - W2Block: The new block object. - """ return W2Block(self.format.posting_size) def _reset_block(self): - """ - Resets the current block. 
- """ self.block = self._new_block() def _write_block(self): - """ - Writes the current block to the posting file. - """ self.terminfo.add_block(self.block) self.block.to_file(self.postfile, compression=self.compression) self._reset_block() self.blockcount += 1 def _start_blocklist(self): - """ - Starts a new block list in the posting file. - """ postfile = self.postfile self._reset_block() @@ -1393,16 +667,6 @@ def _start_blocklist(self): postfile.write_uint(0) def start_field(self, fieldname, fieldobj): - """ - Starts writing a new field. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj (Field): The field object. - - Raises: - - ValueError: If called before finishing the previous field. - """ self.fieldname = fieldname self.field = fieldobj self.format = fieldobj.format @@ -1416,15 +680,6 @@ def start_field(self, fieldname, fieldobj): self._infield = True def start_term(self, text): - """ - Starts writing a new term. - - Parameters: - - text (str): The text of the term. - - Raises: - - ValueError: If called inside a block. - """ if self.block is not None: raise ValueError("Called start_term in a block") self.text = text @@ -1436,41 +691,16 @@ def start_term(self, text): self._start_blocklist() def add(self, docnum, weight, valuestring, length): - """ - Adds a document to the current block. - - Parameters: - - docnum (int): The document number. - - weight (float): The weight of the document. - - valuestring (str): The value string of the document. - - length (int): The length of the document. - - Raises: - - ValueError: If the block size exceeds the block limit, the current block is written to the posting file. - """ self.block.add(docnum, weight, valuestring, length) if len(self.block) > self.blocklimit: self._write_block() def add_spell_word(self, fieldname, text): - """ - Adds a spelling word to the DAWG (Directed Acyclic Word Graph). - - Parameters: - - fieldname (str): The name of the field. - - text (str): The spelling word. - """ if self.dawg is None: self._make_dawg_files() self.dawg.insert(text) def finish_term(self): - """ - Finishes writing the current term. - - Raises: - - ValueError: If called when not in a block. - """ block = self.block if block is None: raise ValueError("Called finish_term when not in a block") @@ -1503,12 +733,6 @@ def finish_term(self): self.termsindex.add((self.fieldname, self.text), terminfo) def finish_field(self): - """ - Finishes writing the current field. - - Raises: - - ValueError: If called before starting a field. - """ if not self._infield: raise ValueError("Called finish_field before start_field") self._infield = False @@ -1518,9 +742,6 @@ def finish_field(self): self._dawgfield = False def close(self): - """ - Closes the writer and releases any resources. - """ self.termsindex.close() self.postfile.close() if self.dawg is not None: @@ -1532,18 +753,6 @@ def close(self): class W2LeafMatcher(LeafMatcher): - """ - Represents a leaf matcher for the Whoosh 2 codec. - - Args: - postfile (file-like object): The file-like object containing the posting data. - startoffset (int): The starting offset of the leaf matcher in the postfile. - fmt (CodecFormat): The codec format used for encoding and decoding data. - scorer (Scorer, optional): The scorer used for scoring documents. Defaults to None. - term (Term, optional): The term associated with the leaf matcher. Defaults to None. - stringids (bool, optional): Whether the leaf matcher uses string-based document IDs. Defaults to False. 
- """ - def __init__( self, postfile, startoffset, fmt, scorer=None, term=None, stringids=False ): @@ -1567,54 +776,24 @@ def __init__( self._next_block() def id(self): - """ - Returns the document ID associated with the current posting. - - Returns: - int: The document ID. - """ return self.block.ids[self.i] def is_active(self): - """ - Checks if the leaf matcher is active. - - Returns: - bool: True if the leaf matcher is active, False otherwise. - """ return self._active def weight(self): - """ - Returns the weight of the current posting. - - Returns: - float: The weight of the posting. - """ weights = self.block.weights if not weights: weights = self.block.read_weights() return weights[self.i] def value(self): - """ - Returns the value of the current posting. - - Returns: - object: The value of the posting. - """ values = self.block.values if values is None: values = self.block.read_values() return values[self.i] def all_ids(self): - """ - Generator that yields all document IDs in the leaf matcher. - - Yields: - int: The document ID. - """ nextoffset = self.baseoffset for _ in range(self.blockcount): block = self._read_block(nextoffset) @@ -1623,12 +802,6 @@ def all_ids(self): yield from ids def next(self): - """ - Moves to the next posting in the leaf matcher. - - Returns: - bool: True if there is a next posting, False otherwise. - """ if self.i == self.block.count - 1: self._next_block() return True @@ -1637,15 +810,6 @@ def next(self): return False def skip_to(self, id): - """ - Skips to the posting with the specified document ID. - - Args: - id (int): The document ID to skip to. - - Raises: - ReadTooFar: If the leaf matcher has been read beyond the target ID. - """ if not self.is_active(): raise ReadTooFar @@ -1660,7 +824,8 @@ def skip_to(self, id): if not self.is_active(): return - # Iterate through the IDs in the block until we find or pass the target + # Iterate through the IDs in the block until we find or pass the + # target ids = self.block.ids i = self.i while ids[i] < id: @@ -1671,57 +836,21 @@ def skip_to(self, id): self.i = i def skip_to_quality(self, minquality): - """ - Skips to the posting with a quality greater than or equal to the specified minimum quality. - - Args: - minquality (float): The minimum quality. - - Returns: - int: The number of blocks skipped. - - Note: - The quality of a posting is determined by the block quality function. - """ bq = self.block_quality if bq() > minquality: return 0 return self._skip_to_block(lambda: bq() <= minquality) def block_min_length(self): - """ - Returns the minimum length of postings in the current block. - - Returns: - int: The minimum length. - """ return self.block.min_length() def block_max_length(self): - """ - Returns the maximum length of postings in the current block. - - Returns: - int: The maximum length. - """ return self.block.max_length() def block_max_weight(self): - """ - Returns the maximum weight of postings in the current block. - - Returns: - float: The maximum weight. - """ return self.block.max_weight() def block_max_wol(self): - """ - Returns the maximum weight of lengths of postings in the current block. - - Returns: - float: The maximum weight of lengths. - """ return self.block.max_wol() def _read_block(self, offset): @@ -1772,45 +901,14 @@ def _skip_to_block(self, targetfn): class TermIndexWriter(HashWriter): - """ - A class for writing term index data to a database file. - - Inherits from HashWriter. - - Attributes: - index (list): A list of positions in the database file where each term is stored. 
- fieldcounter (int): Counter for assigning field numbers. - fieldmap (dict): Mapping of field names to field numbers. - - Methods: - keycoder(term): Encodes a term into a key for storage in the database file. - valuecoder(terminfo): Encodes a TermInfo object into a string for storage in the database file. - add(key, value): Adds a term and its associated value to the database file. - _write_extras(): Writes additional data (index and fieldmap) to the database file. - """ - def __init__(self, dbfile): - """ - Initializes a TermIndexWriter object. - - Args: - dbfile (file): The database file to write the term index data to. - """ HashWriter.__init__(self, dbfile) self.index = [] self.fieldcounter = 0 self.fieldmap = {} def keycoder(self, term): - """ - Encodes a term into a key for storage in the database file. - - Args: - term (tuple): A tuple containing the field name and the term text. - - Returns: - bytes: The encoded key. - """ + # Encode term fieldmap = self.fieldmap fieldname, text = term @@ -1825,33 +923,14 @@ def keycoder(self, term): return key def valuecoder(self, terminfo): - """ - Encodes a TermInfo object into a string for storage in the database file. - - Args: - terminfo (TermInfo): The TermInfo object to encode. - - Returns: - str: The encoded string. - """ return terminfo.to_string() def add(self, key, value): - """ - Adds a term and its associated value to the database file. - - Args: - key (bytes): The encoded key representing the term. - value (str): The encoded value representing the term information. - """ pos = self.dbfile.tell() self.index.append(pos) HashWriter.add(self, self.keycoder(key), self.valuecoder(value)) def _write_extras(self): - """ - Writes additional data (index and fieldmap) to the database file. - """ dbfile = self.dbfile dbfile.write_uint(len(self.index)) for n in self.index: @@ -1860,27 +939,7 @@ def _write_extras(self): class VectorWriter(TermIndexWriter): - """A class for writing vector data to the index. - - This class is responsible for encoding and writing vector data to the index. - It provides methods for encoding keys and values. - - Attributes: - fieldmap (dict): A dictionary mapping field names to field numbers. - fieldcounter (int): A counter for assigning field numbers. - - """ - def keycoder(self, key): - """Encode the key (docnum, fieldname) into a binary representation. - - Args: - key (tuple): A tuple containing the document number and field name. - - Returns: - bytes: The binary representation of the key. - - """ fieldmap = self.fieldmap docnum, fieldname = key @@ -1894,15 +953,6 @@ def keycoder(self, key): return _vectorkey_struct.pack(docnum, fieldnum) def valuecoder(self, offset): - """Encode the offset into a binary representation. - - Args: - offset (int): The offset value. - - Returns: - bytes: The binary representation of the offset. - - """ return pack_long(offset) @@ -1910,33 +960,11 @@ def valuecoder(self, offset): class PostingIndexBase(HashReader): - """ - Base class for a posting index. - - This class provides methods for reading and manipulating a posting index. - - Args: - dbfile (file): The file object representing the database file. - postfile (file): The file object representing the posting file. - - Attributes: - postfile (file): The file object representing the posting file. - length (int): The length of the posting index. - indexbase (int): The base position of the posting index in the database file. - fieldmap (dict): A mapping of field names to field numbers. 
- names (list): A list of field names in the order of their field numbers. - """ - def __init__(self, dbfile, postfile): HashReader.__init__(self, dbfile) self.postfile = postfile def _read_extras(self): - """ - Read the extra information from the database file. - - This method reads the length, index base, field map, and field names from the database file. - """ dbfile = self.dbfile self.length = dbfile.read_uint() @@ -1949,15 +977,6 @@ def _read_extras(self): self.names[num] = name def _closest_key(self, key): - """ - Find the closest key in the posting index. - - Args: - key (bytes): The key to search for. - - Returns: - int: The position of the closest key in the posting index. - """ dbfile = self.dbfile key_at = self._key_at indexbase = self.indexbase @@ -1972,35 +991,19 @@ def _closest_key(self, key): lo = mid + 1 else: hi = mid + # i = max(0, mid - 1) if lo == self.length: return None return dbfile.get_long(indexbase + lo * _LONG_SIZE) def closest_key(self, key): - """ - Find the closest key in the posting index. - - Args: - key (bytes): The key to search for. - - Returns: - bytes: The closest key in the posting index. - """ pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): - """ - Generate ranges of key-value pairs starting from the given key. - - Args: - key (bytes): The key to start from. - - Yields: - tuple: A tuple containing the key position, key length, data position, and data length. - """ + # read = self.read pos = self._closest_key(key) if pos is None: return @@ -2008,31 +1011,10 @@ def _ranges_from(self, key): yield from self._ranges(pos=pos) def __getitem__(self, key): - """ - Get the value associated with the given key. - - Args: - key: The key to retrieve the value for. - - Returns: - object: The value associated with the key. - - Raises: - KeyError: If the key is not found in the posting index. - """ k = self.keycoder(key) return self.valuedecoder(HashReader.__getitem__(self, k)) def __contains__(self, key): - """ - Check if the given key is present in the posting index. - - Args: - key: The key to check. - - Returns: - bool: True if the key is present, False otherwise. - """ try: codedkey = self.keycoder(key) except KeyError: @@ -2040,77 +1022,27 @@ def __contains__(self, key): return HashReader.__contains__(self, codedkey) def range_for_key(self, key): - """ - Get the range of key-value pairs for the given key. - - Args: - key: The key to get the range for. - - Returns: - tuple: A tuple containing the start position and end position of the range. - """ return HashReader.range_for_key(self, self.keycoder(key)) def get(self, key, default=None): - """ - Get the value associated with the given key. - - Args: - key: The key to retrieve the value for. - default: The default value to return if the key is not found. - - Returns: - object: The value associated with the key, or the default value if the key is not found. - """ k = self.keycoder(key) return self.valuedecoder(HashReader.get(self, k, default)) def keys(self): - """ - Generate the keys in the posting index. - - Yields: - object: The keys in the posting index. - """ kd = self.keydecoder for k in HashReader.keys(self): yield kd(k) def items(self): - """ - Generate the key-value pairs in the posting index. - - Yields: - tuple: A tuple containing the key and value. 
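Both hash-reader subclasses locate a starting point with the same binary search: find the first indexed key that is not less than the requested key, then iterate forward from there. With the fixed-size on-disk index abstracted away to a sorted Python list, the search is equivalent to `bisect_left`:

    from bisect import bisect_left

    def closest_key(sorted_keys, key):
        """Return the first stored key >= key, or None if key is past the end."""
        lo = bisect_left(sorted_keys, key)
        if lo == len(sorted_keys):
            return None
        return sorted_keys[lo]

    keys = [b"apple", b"banana", b"cherry"]
    print(closest_key(keys, b"ba"))      # b'banana'
    print(closest_key(keys, b"cherry"))  # b'cherry'
    print(closest_key(keys, b"zebra"))   # None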
- """ kd = self.keydecoder vd = self.valuedecoder for key, value in HashReader.items(self): yield (kd(key), vd(value)) def terms_from(self, fieldname, prefix): - """ - Generate the terms in the posting index starting from the given field name and prefix. - - Args: - fieldname: The field name to start from. - prefix: The prefix to match. - - Yields: - object: The terms in the posting index. - """ return self.keys_from((fieldname, prefix)) def keys_from(self, key): - """ - Generate the keys in the posting index starting from the given key. - - Args: - key: The key to start from. - - Yields: - object: The keys in the posting index. - """ key = self.keycoder(key) kd = self.keydecoder read = self.read @@ -2118,16 +1050,6 @@ def keys_from(self, key): yield kd(read(keypos, keylen)) def items_from(self, fieldname, prefix): - """ - Generate the key-value pairs in the posting index starting from the given field name and prefix. - - Args: - fieldname: The field name to start from. - prefix: The prefix to match. - - Yields: - tuple: A tuple containing the key and value. - """ read = self.read key = self.keycoder((fieldname, prefix)) kd = self.keydecoder @@ -2136,98 +1058,27 @@ def items_from(self, fieldname, prefix): yield (kd(read(keypos, keylen)), vd(read(datapos, datalen))) def values(self): - """ - Generate the values in the posting index. - - Yields: - object: The values in the posting index. - """ vd = self.valuedecoder for v in HashReader.values(self): yield vd(v) def close(self): - """ - Close the posting index. - - This method closes the posting index and the associated files. - """ HashReader.close(self) self.postfile.close() class W2TermsReader(PostingIndexBase): - """ - A class that implements the TermsReader interface for the Whoosh2 codec. - - This class provides methods for reading terms, retrieving term information, - creating matchers for a given term, encoding and decoding keys, and decoding - values. - - Note: This class does not filter out deleted documents. A higher-level class - is expected to wrap the matcher to eliminate deleted documents. - - Args: - PostingIndexBase: The base class for the terms reader. - - Attributes: - postfile (PostingsFile): The postings file associated with the terms reader. - fieldmap (dict): A dictionary mapping field names to field numbers. - names (list): A list of field names. - dbfile (DatabaseFile): The database file associated with the terms reader. - - Methods: - terms(): Returns the list of terms in the index. - term_info(fieldname, text): Returns the term information for a given field and text. - matcher(fieldname, text, format_, scorer=None): Returns a matcher for a given field and text. - keycoder(key): Encodes a key. - keydecoder(v): Decodes a key. - valuedecoder(v): Decodes a value. - frequency(fieldname, btext): Returns the frequency of a term in a given field. - doc_frequency(fieldname, btext): Returns the document frequency of a term in a given field. - """ + # Implements whoosh.codec.base.TermsReader def terms(self): - """ - Returns the list of terms in the index. - - Returns: - list: A list of terms in the index. - """ return self.keys() def term_info(self, fieldname, text): - """ - Returns the term information for a given field and text. - - Args: - fieldname (str): The name of the field. - text (str): The text of the term. - - Returns: - TermInfo: The term information for the given field and text. - - Raises: - TermNotFound: If the term is not found in the index. 
- """ return self[fieldname, text] def matcher(self, fieldname, text, format_, scorer=None): - """ - Returns a matcher for a given field and text. - - Args: - fieldname (str): The name of the field. - text (str): The text of the term. - format_ (str): The format of the matcher. - scorer (Scorer, optional): The scorer to use for scoring documents. Defaults to None. - - Returns: - Matcher: A matcher for the given field and text. - - Raises: - TermNotFound: If the term is not found in the index. - """ + # Note this does not filter out deleted documents; a higher level is + # expected to wrap this matcher to eliminate deleted docs pf = self.postfile term = (fieldname, text) @@ -2247,71 +1098,24 @@ def matcher(self, fieldname, text, format_, scorer=None): return pr def keycoder(self, key): - """ - Encodes a key. - - Args: - key (tuple): The key to encode. - - Returns: - bytes: The encoded key. - """ fieldname, tbytes = key fnum = self.fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def keydecoder(self, v): - """ - Decodes a key. - - Args: - v (bytes): The key to decode. - - Returns: - tuple: The decoded key. - """ assert isinstance(v, bytes) return (self.names[unpack_ushort(v[:2])[0]], v[2:]) def valuedecoder(self, v): - """ - Decodes a value. - - Args: - v (bytes): The value to decode. - - Returns: - FileTermInfo: The decoded value. - """ assert isinstance(v, bytes) return FileTermInfo.from_string(v) def frequency(self, fieldname, btext): - """ - Returns the frequency of a term in a given field. - - Args: - fieldname (str): The name of the field. - btext (bytes): The encoded text of the term. - - Returns: - int: The frequency of the term in the given field. - """ assert isinstance(btext, bytes) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_weight(self.dbfile, datapos) def doc_frequency(self, fieldname, btext): - """ - Returns the document frequency of a term in a given field. - - Args: - fieldname (str): The name of the field. - btext (bytes): The encoded text of the term. - - Returns: - int: The document frequency of the term in the given field. - """ assert isinstance(btext, bytes) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_doc_freq(self.dbfile, datapos) @@ -2322,112 +1126,26 @@ def doc_frequency(self, fieldname, btext): class W2VectorReader(PostingIndexBase): - """ - Implements the VectorReader interface for the Whoosh2 codec. - - This class provides methods for reading vector data from the index. - - Attributes: - postfile (file): The file object representing the posting file. - fieldmap (dict): A mapping of field names to field numbers. - names (list): A list of field names. - - """ + # Implements whoosh.codec.base.VectorReader def matcher(self, docnum, fieldname, format_): - """ - Returns a matcher for the given document number, field name, and format. - - Args: - docnum (int): The document number. - fieldname (str): The field name. - format_ (str): The format of the vector data. - - Returns: - W2LeafMatcher: A matcher object for the given parameters. - - """ pf = self.postfile offset = self[(docnum, fieldname)] pr = W2LeafMatcher(pf, offset, format_, stringids=True) return pr def keycoder(self, key): - """ - Encodes the key into a binary representation. - - Args: - key (tuple): The key to encode, consisting of a document number and a field name. - - Returns: - bytes: The binary representation of the key. 
- - """ return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]]) def keydecoder(self, v): - """ - Decodes the binary representation of a key. - - Args: - v (bytes): The binary representation of the key. - - Returns: - tuple: The decoded key, consisting of a document number and a field name. - - """ docnum, fieldnum = _vectorkey_struct.unpack(v) return (docnum, self.names[fieldnum]) def valuedecoder(self, v): - """ - Decodes the binary representation of a value. - - Args: - v (bytes): The binary representation of the value. - - Returns: - int: The decoded value. - - """ return unpack_long(v)[0] class W2PerDocReader(base.PerDocumentReader): - """Reader for per-document data in a Whoosh 2 index segment. - - This class provides methods for accessing per-document data such as field lengths, - stored fields, and vectors in a Whoosh 2 index segment. - - Parameters: - - storage (Storage): The storage object for the index. - - segment (Segment): The segment object representing the index segment. - - Attributes: - - _storage (Storage): The storage object for the index. - - _segment (Segment): The segment object representing the index segment. - - _doccount (int): The total number of documents in the segment. - - _lengths (InMemoryLengths): The object for accessing field lengths. - - _stored (StoredFieldReader): The object for accessing stored fields. - - _vectors (W2VectorReader): The object for accessing vectors. - - Methods: - - supports_columns(): Check if the reader supports column storage. - - close(): Close the reader and release any resources. - - doc_count(): Get the number of documents in the segment. - - doc_count_all(): Get the total number of documents in the segment. - - has_deletions(): Check if the segment has deleted documents. - - is_deleted(docnum): Check if a document is deleted. - - deleted_docs(): Get the list of deleted document numbers. - - doc_field_length(docnum, fieldname, default=0): Get the length of a field in a document. - - field_length(fieldname): Get the total length of a field in all documents. - - min_field_length(fieldname): Get the minimum length of a field in all documents. - - max_field_length(fieldname): Get the maximum length of a field in all documents. - - has_vector(docnum, fieldname): Check if a document has a vector for a field. - - vector(docnum, fieldname, format_): Get the vector for a field in a document. - - stored_fields(docnum): Get the stored fields for a document. - """ - def __init__(self, storage, segment): self._storage = storage self._segment = segment @@ -2442,124 +1160,51 @@ def __init__(self, storage, segment): self._vectors = None # Lazy load def supports_columns(self): - """Check if the reader supports column storage. - - Returns: - - bool: True if the reader supports column storage, False otherwise. - """ return False def close(self): - """Close the reader and release any resources.""" self._lengths.close() if self._vectors: self._vectors.close() self._stored.close() def doc_count(self): - """Get the number of documents in the segment. - - Returns: - - int: The number of documents in the segment. - """ return self._segment.doc_count() def doc_count_all(self): - """Get the total number of documents in the segment. - - Returns: - - int: The total number of documents in the segment. - """ return self._doccount def has_deletions(self): - """Check if the segment has deleted documents. - - Returns: - - bool: True if the segment has deleted documents, False otherwise. 
- """ return self._segment.has_deletions() def is_deleted(self, docnum): - """Check if a document is deleted. - - Parameters: - - docnum (int): The document number. - - Returns: - - bool: True if the document is deleted, False otherwise. - """ return self._segment.is_deleted(docnum) def deleted_docs(self): - """Get the list of deleted document numbers. - - Returns: - - list[int]: The list of deleted document numbers. - """ return self._segment.deleted_docs() - def doc_field_length(self, docnum, fieldname, default=0): - """Get the length of a field in a document. - - Parameters: - - docnum (int): The document number. - - fieldname (str): The field name. - - default (int, optional): The default length to return if the field is not found. Defaults to 0. + # Lengths - Returns: - - int: The length of the field in the document, or the default length if the field is not found. - """ + def doc_field_length(self, docnum, fieldname, default=0): return self._lengths.doc_field_length(docnum, fieldname, default) def field_length(self, fieldname): - """Get the total length of a field in all documents. - - Parameters: - - fieldname (str): The field name. - - Returns: - - int: The total length of the field in all documents. - """ return self._lengths.field_length(fieldname) def min_field_length(self, fieldname): - """Get the minimum length of a field in all documents. - - Parameters: - - fieldname (str): The field name. - - Returns: - - int: The minimum length of the field in all documents. - """ return self._lengths.min_field_length(fieldname) def max_field_length(self, fieldname): - """Get the maximum length of a field in all documents. - - Parameters: - - fieldname (str): The field name. - - Returns: - - int: The maximum length of the field in all documents. - """ return self._lengths.max_field_length(fieldname) + # Vectors + def _prep_vectors(self): vifile = self._segment.open_file(self._storage, W2Codec.VECTOR_EXT) vpostfile = self._segment.open_file(self._storage, W2Codec.VPOSTS_EXT) self._vectors = W2VectorReader(vifile, vpostfile) def has_vector(self, docnum, fieldname): - """Check if a document has a vector for a field. - - Parameters: - - docnum (int): The document number. - - fieldname (str): The field name. - - Returns: - - bool: True if the document has a vector for the field, False otherwise. - """ if self._vectors is None: try: self._prep_vectors() @@ -2568,29 +1213,13 @@ def has_vector(self, docnum, fieldname): return (docnum, fieldname) in self._vectors def vector(self, docnum, fieldname, format_): - """Get the vector for a field in a document. - - Parameters: - - docnum (int): The document number. - - fieldname (str): The field name. - - format_ (str): The format of the vector. - - Returns: - - VectorMatcher: The vector matcher object. - """ if self._vectors is None: self._prep_vectors() return self._vectors.matcher(docnum, fieldname, format_) - def stored_fields(self, docnum): - """Get the stored fields for a document. - - Parameters: - - docnum (int): The document number. + # Stored - Returns: - - dict: The stored fields for the document. - """ + def stored_fields(self, docnum): return self._stored[docnum] @@ -2598,39 +1227,15 @@ def stored_fields(self, docnum): class ByteLengthsBase: - """ - Base class for storing byte lengths of fields in a document. - - This class provides methods to read and store byte lengths of fields in a document. 
- It also provides methods to retrieve the total number of documents, the length of a specific field, - and the minimum and maximum lengths of a field. - - Attributes: - magic (bytes): The magic number used to identify the file format. - """ - magic = b"~LN1" def __init__(self): - """ - Initializes a new instance of the ByteLengthsBase class. - """ self.starts = {} self.totals = {} self.minlens = {} self.maxlens = {} def _read_header(self, dbfile, doccount): - """ - Reads the header information from the database file. - - Args: - dbfile (file): The file object representing the database file. - doccount (int): The number of documents saved in the database. - - Raises: - AssertionError: If the magic number or version number is not as expected. - """ first = dbfile.read(4) # Magic assert first == self.magic version = dbfile.read_int() # Version number @@ -2653,113 +1258,31 @@ def _read_header(self, dbfile, doccount): self.starts[fieldname] += eoh def doc_count_all(self): - """ - Returns the total number of documents saved in the database. - - Returns: - int: The total number of documents. - """ return self._count def field_length(self, fieldname): - """ - Returns the total length of a specific field in the database. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The total length of the field. - - Raises: - KeyError: If the field name is not found in the database. - """ return self.totals.get(fieldname, 0) def min_field_length(self, fieldname): - """ - Returns the minimum length of a specific field in the database. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The minimum length of the field. - - Raises: - KeyError: If the field name is not found in the database. - """ return self.minlens.get(fieldname, 0) def max_field_length(self, fieldname): - """ - Returns the maximum length of a specific field in the database. - - Args: - fieldname (str): The name of the field. - - Returns: - int: The maximum length of the field. - - Raises: - KeyError: If the field name is not found in the database. - """ return self.maxlens.get(fieldname, 0) class InMemoryLengths(ByteLengthsBase): def __init__(self): - """ - Initialize the Whoosh2 codec. - - This method initializes the Whoosh2 codec by setting up the necessary data structures. - It inherits from the ByteLengthsBase class and initializes the totals and lengths dictionaries. - The totals dictionary keeps track of the total number of occurrences of each term in the index, - while the lengths dictionary stores the length of each term in bytes. - The _count variable is used to keep track of the number of terms. - - Usage: - codec = Whoosh2() - """ - ByteLengthsBase.__init__(self) self.totals = defaultdict(int) self.lengths = {} self._count = 0 def close(self): - """ - Closes the codec. - - This method is called to release any resources held by the codec. It should be called when the codec is no longer needed. - - """ pass # IO def to_file(self, dbfile, doccount): - """ - Write the index data to a file. - - Args: - dbfile (file): The file object to write the index data to. - doccount (int): The number of documents in the index. - - Raises: - IOError: If there is an error writing to the file. - - Notes: - This method writes the index data to a file in a specific format. - It writes the magic number, format version number, number of documents, - and number of fields to the file. Then, it writes per-field information, - including field name, field length, minimum field length, and maximum field length. 
- Finally, it writes the byte arrays for each field. - - Example: - >>> with open("index.db", "wb") as dbfile: - ... codec.to_file(dbfile, 1000) - """ self._pad_arrays(doccount) fieldnames = list(self.lengths.keys()) @@ -2782,21 +1305,6 @@ def to_file(self, dbfile, doccount): @classmethod def from_file(cls, dbfile, doccount=None): - """ - Load a Whoosh2 object from a file. - - Args: - cls (class): The class of the object to be loaded. - dbfile (file): The file object to read from. - doccount (int, optional): The number of documents in the object. Defaults to None. - - Returns: - obj: The loaded Whoosh2 object. - - Raises: - None. - - """ obj = cls() obj._read_header(dbfile, doccount) for fieldname, start in obj.starts.items(): @@ -2807,25 +1315,6 @@ def from_file(cls, dbfile, doccount=None): # Get def doc_field_length(self, docnum, fieldname, default=0): - """ - Returns the length of a field in a document. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - default (int, optional): The default length to return if the field is not found. Defaults to 0. - - Returns: - int: The length of the field in the document, or the default length if the field is not found. - - Raises: - None - - Example: - >>> codec = WhooshCodec() - >>> codec.doc_field_length(0, "title") - 10 - """ try: arry = self.lengths[fieldname] except KeyError: @@ -2837,18 +1326,6 @@ def doc_field_length(self, docnum, fieldname, default=0): # Min/max cache setup -- not meant to be called while adding def _minmax(self, fieldname, op, cache): - """ - Returns the minimum or maximum value for a given field, based on the provided operation. - - Args: - fieldname (str): The name of the field. - op (function): The operation to be performed on the field's lengths. - cache (dict): A dictionary used to cache previously computed results. - - Returns: - int: The minimum or maximum value for the field. - - """ if fieldname in cache: return cache[fieldname] else: @@ -2861,71 +1338,20 @@ def _minmax(self, fieldname, op, cache): return result def min_field_length(self, fieldname): - """ - Returns the minimum length allowed for a field. - - Parameters: - - fieldname (str): The name of the field. - - Returns: - - int: The minimum length allowed for the field. - - """ return self._minmax(fieldname, min, self.minlens) def max_field_length(self, fieldname): - """ - Returns the maximum field length for a given field. - - Parameters: - - fieldname (str): The name of the field. - - Returns: - - int: The maximum field length. - - """ return self._minmax(fieldname, max, self.maxlens) # Add def _create_field(self, fieldname, docnum): - """ - Create a new field for the given document number. - - Args: - fieldname (str): The name of the field. - docnum (int): The document number. - - Returns: - None - - Raises: - None - - Notes: - This method is used to create a new field for a document in the index. - It updates the lengths dictionary with the field's length information. - The _count attribute is also updated to reflect the maximum document number. - - """ dc = max(self._count, docnum + 1) self.lengths[fieldname] = array("B", (0 for _ in range(dc))) self._count = dc def _pad_arrays(self, doccount): - """ - Pad out arrays to full length. - - This method is used to ensure that the arrays storing the lengths of fields are - of the same length as the number of documents in the index. If the arrays are - shorter than the desired length, they are padded with zeros. 
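InMemoryLengths keeps one `array('B')` per field, padded out to the document count, so a length lookup is a plain index into a byte array. A stripped-down version of that bookkeeping (storing the raw length byte directly rather than the quantized encoding; `MiniLengths` is an invented name):

    from array import array

    class MiniLengths:
        def __init__(self):
            self.lengths = {}  # fieldname -> array('B'), one byte per document
            self.count = 0

        def _pad(self, doccount):
            for arry in self.lengths.values():
                if len(arry) < doccount:
                    arry.extend(0 for _ in range(doccount - len(arry)))
            self.count = doccount

        def add(self, docnum, fieldname, length):
            if fieldname not in self.lengths:
                self.lengths[fieldname] = array("B")
            self._pad(max(self.count, docnum + 1))
            self.lengths[fieldname][docnum] = min(length, 255)

        def doc_field_length(self, docnum, fieldname, default=0):
            arry = self.lengths.get(fieldname)
            if arry is None or docnum >= len(arry):
                return default
            return arry[docnum]

    ml = MiniLengths()
    ml.add(0, "body", 42)
    ml.add(3, "body", 7)
    print(ml.doc_field_length(3, "body"))   # 7
    print(ml.doc_field_length(2, "body"))   # 0
    print(ml.doc_field_length(0, "title"))  # 0 (unknown field -> default)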
- - Parameters: - - doccount (int): The desired length of the arrays. - - Returns: - None - """ + # Pad out arrays to full length for fieldname in self.lengths.keys(): arry = self.lengths[fieldname] if len(arry) < doccount: @@ -2934,28 +1360,6 @@ def _pad_arrays(self, doccount): self._count = doccount def add(self, docnum, fieldname, length): - """ - Add the length of a field for a specific document. - - Args: - docnum (int): The document number. - fieldname (str): The name of the field. - length (int): The length of the field. - - Returns: - None - - Raises: - None - - Notes: - This method updates the lengths and totals dictionaries to keep track of the field lengths - for each document. If the field does not exist in the lengths dictionary, it will be created. - The length is converted to a byte value using the length_to_byte function. The byte value is - then stored in the lengths dictionary for the specified document and field. The totals - dictionary is also updated to keep track of the total length of each field. - - """ lengths = self.lengths if length: if fieldname not in lengths: @@ -2973,97 +1377,30 @@ def add(self, docnum, fieldname, length): self.totals[fieldname] += length def add_other(self, other): - """ - Adds the lengths and totals from another instance of the Whoosh2 class to the current instance. - - Parameters: - - other (Whoosh2): Another instance of the Whoosh2 class. - - Returns: - None - """ - lengths = self.lengths totals = self.totals doccount = self._count - - # Add missing length arrays for fname in other.lengths: if fname not in lengths: lengths[fname] = array("B") self._pad_arrays(doccount) - # Extend length arrays with values from other instance for fname in other.lengths: lengths[fname].extend(other.lengths[fname]) self._count = doccount + other._count self._pad_arrays(self._count) - # Add totals from other instance for fname in other.totals: totals[fname] += other.totals[fname] class OnDiskLengths(ByteLengthsBase): - """ - A class that represents the on-disk lengths of fields in a Whoosh index. - - This class is responsible for reading and retrieving the lengths of fields - stored on disk. It inherits from the ByteLengthsBase class. - - Parameters: - - dbfile (file-like object): The file-like object representing the on-disk - storage of the field lengths. - - doccount (int, optional): The total number of documents in the index. If - not provided, it will be determined by reading the header of the dbfile. - - Methods: - - doc_field_length(docnum, fieldname, default=0): Retrieves the length of a - field in a specific document. If the field is not found, it returns the - default value. - - close(): Closes the dbfile. - - Example usage: - ``` - dbfile = open("lengths.db", "rb") - lengths = OnDiskLengths(dbfile) - length = lengths.doc_field_length(10, "title") - lengths.close() - ``` - """ - def __init__(self, dbfile, doccount=None): - """ - Initialize a Whoosh2 object. - - Args: - dbfile (str): The path to the Whoosh2 database file. - doccount (int, optional): The number of documents in the database. Defaults to None. - - Raises: - SomeException: An exception that may be raised under certain conditions. - - Returns: - None - """ ByteLengthsBase.__init__(self) self.dbfile = dbfile self._read_header(dbfile, doccount) def doc_field_length(self, docnum, fieldname, default=0): - """ - Retrieves the length of a field in a specific document. - - Parameters: - - docnum (int): The document number. - - fieldname (str): The name of the field. 
- - default (int, optional): The default value to return if the field is - not found. Default is 0. - - Returns: - - int: The length of the field in the specified document, or the default - value if the field is not found. - """ try: start = self.starts[fieldname] except KeyError: @@ -3071,15 +1408,6 @@ def doc_field_length(self, docnum, fieldname, default=0): return byte_to_length(self.dbfile.get_byte(start + docnum)) def close(self): - """ - Closes the dbfile. - - This method closes the dbfile associated with the codec. It should be called when you are done using the codec to free up system resources. - - Usage: - codec.close() - - """ self.dbfile.close() @@ -3092,34 +1420,7 @@ def close(self): class StoredFieldWriter: - """ - Class for writing stored fields to a database file. - - Args: - dbfile (file): The file object to write the stored fields to. - - Attributes: - dbfile (file): The file object to write the stored fields to. - length (int): The number of stored fields written. - directory (list): A list of pointers to the stored fields in the file. - names (list): A list of field names. - name_map (dict): A mapping of field names to their index in the `names` list. - """ - def __init__(self, dbfile): - """ - Initialize a Whoosh2 object. - - Args: - dbfile (file): The file object representing the database file. - - Attributes: - dbfile (file): The file object representing the database file. - length (int): The length of the database. - directory (list): A list of directory entries. - names (list): A list of names. - name_map (dict): A dictionary mapping names to their corresponding indices. - """ self.dbfile = dbfile self.length = 0 self.directory = [] @@ -3131,12 +1432,6 @@ def __init__(self, dbfile): self.name_map = {} def add(self, vdict): - """ - Adds a dictionary of field values to the stored fields. - - Args: - vdict (dict): A dictionary of field names and their corresponding values. - """ f = self.dbfile names = self.names name_map = self.name_map @@ -3156,20 +1451,11 @@ def add(self, vdict): f.write(vstring) def add_reader(self, sfreader): - """ - Adds stored fields from a reader object. - - Args: - sfreader (object): An object that provides an iterator over dictionaries of field values. - """ add = self.add for vdict in sfreader: add(vdict) def close(self): - """ - Closes the stored field writer and flushes the changes to the file. - """ f = self.dbfile dirpos = f.tell() f.write_pickle(self.names) @@ -3183,40 +1469,7 @@ def close(self): class StoredFieldReader: - """ - Reads stored fields from a database file. - - Args: - dbfile (file-like object): The database file to read from. - - Attributes: - dbfile (file-like object): The database file being read. - length (int): The number of stored fields in the database. - basepos (int): The base position in the database file. - names (list): The list of field names. - directory_offset (int): The offset of the directory in the database file. - - Methods: - close(): Closes the database file. - __iter__(): Iterates over the stored fields and yields a dictionary of field names and values. - __getitem__(num): Retrieves the stored field at the specified index. - - """ - def __init__(self, dbfile): - """ - Initialize a Whoosh2 object. - - Args: - dbfile (file-like object): The file-like object representing the Whoosh2 database file. - - Raises: - ValueError: If the database file is not valid. - - Notes: - This method reads the metadata from the database file and initializes the Whoosh2 object. 
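The stored-fields file is, in essence, a sequence of pickled per-document dicts followed by a directory of (offset, length) entries, so a single document can be fetched without reading the whole file. A self-contained sketch of that layout using `io.BytesIO`, `pickle`, and `struct`; the framing here is simplified and is not the exact Whoosh on-disk format.

    import io, pickle, struct

    def write_stored(docs):
        """Write pickled docs, then an (offset, length) directory, then a trailer."""
        buf = io.BytesIO()
        directory = []
        for doc in docs:
            data = pickle.dumps(doc)
            directory.append((buf.tell(), len(data)))
            buf.write(data)
        dirpos = buf.tell()
        for offset, length in directory:
            buf.write(struct.pack("!qI", offset, length))
        buf.write(struct.pack("!qI", dirpos, len(docs)))  # trailer: directory position + count
        return buf.getvalue()

    def read_stored(blob, num):
        """Fetch document `num` via the directory, without unpickling the others."""
        entry_size = struct.calcsize("!qI")
        dirpos, count = struct.unpack("!qI", blob[-entry_size:])
        if num >= count:
            raise IndexError(num)
        entry = dirpos + num * entry_size
        offset, length = struct.unpack("!qI", blob[entry:entry + entry_size])
        return pickle.loads(blob[offset:offset + length])

    blob = write_stored([{"title": "doc zero"}, {"title": "doc one"}])
    print(read_stored(blob, 1))  # {'title': 'doc one'}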
- - """ self.dbfile = dbfile dbfile.seek(0) @@ -3238,31 +1491,15 @@ def __init__(self, dbfile): self.directory_offset = dbfile.tell() def close(self): - """ - Closes the database file. - - This method closes the database file associated with the current instance of the class. - After calling this method, any further operations on the database file will raise an exception. - - Usage: - codec = WhooshCodec() - codec.close() - - Raises: - Any exceptions raised by the underlying file object's close() method. - """ self.dbfile.close() def __iter__(self): - """ - Iterates over the stored fields and yields a dictionary of field names and values. - """ dbfile = self.dbfile names = self.names lengths = array("I") dbfile.seek(self.directory_offset) - for _ in range(self.length): + for i in range(self.length): dbfile.seek(_LONG_SIZE, 1) lengths.append(dbfile.read_uint()) @@ -3275,20 +1512,6 @@ def __iter__(self): yield vdict def __getitem__(self, num): - """ - Retrieves the stored field at the specified index. - - Args: - num (int): The index of the stored field to retrieve. - - Returns: - dict: A dictionary of field names and values. - - Raises: - IndexError: If the specified index is out of range. - ValueError: If there is an error reading the stored field. - - """ if num > self.length - 1: raise IndexError(f"Tried to get document {num}, file has {self.length}") @@ -3318,17 +1541,14 @@ def __getitem__(self, num): class W2Segment(base.Segment): def __init__(self, indexname, doccount=0, segid=None, deleted=None): """ - Represents a segment in the Whoosh index. - - :param indexname: The name of the index. - :type indexname: str + :param name: The name of the segment (the Index object computes this + from its name and the generation). :param doccount: The maximum document number in the segment. - :type doccount: int - :param segid: The segment ID. If not provided, a random ID will be generated. - :type segid: str, optional - :param deleted: A set of deleted document numbers, or None if no deleted documents exist in this segment. - :type deleted: set, optional + :param term_count: Total count of all terms in all documents. + :param deleted: A set of deleted document numbers, or None if no + deleted documents exist in this segment. """ + assert isinstance(indexname, str) self.indexname = indexname assert isinstance(doccount, int) @@ -3338,98 +1558,39 @@ def __init__(self, indexname, doccount=0, segid=None, deleted=None): self.compound = False def codec(self, **kwargs): - """ - Returns the codec associated with this segment. - - :param kwargs: Additional keyword arguments to pass to the codec constructor. - :return: The codec associated with this segment. - :rtype: W2Codec - """ return W2Codec(**kwargs) def set_doc_count(self, dc): - """ - Sets the document count for this segment. - - :param dc: The document count. - :type dc: int - """ self.doccount = dc def doc_count_all(self): - """ - Returns the total count of all documents in this segment. - - :return: The total count of all documents. - :rtype: int - """ return self.doccount - def doc_count(self): - """ - Returns the count of non-deleted documents in this segment. - - :return: The count of non-deleted documents. - :rtype: int - """ + def doc_count(self): return self.doccount - self.deleted_count() def has_deletions(self): - """ - Checks if this segment has any deleted documents. - - :return: True if there are deleted documents, False otherwise. 
- :rtype: bool - """ return self.deleted is not None and bool(self.deleted) def deleted_count(self): - """ - Returns the count of deleted documents in this segment. - - :return: The count of deleted documents. - :rtype: int - """ if self.deleted is None: return 0 return len(self.deleted) def delete_document(self, docnum, delete=True): - """ - Marks a document as deleted or undeleted. - - :param docnum: The document number. - :type docnum: int - :param delete: True to mark the document as deleted, False to mark it as undeleted. - :type delete: bool, optional - """ if delete: if self.deleted is None: self.deleted = set() self.deleted.add(docnum) elif self.deleted is not None and docnum in self.deleted: - self.deleted.remove(docnum) + self.deleted.clear(docnum) def is_deleted(self, docnum): - """ - Checks if a document is marked as deleted. - - :param docnum: The document number. - :type docnum: int - :return: True if the document is marked as deleted, False otherwise. - :rtype: bool - """ if self.deleted is None: return False return docnum in self.deleted def deleted_docs(self): - """ - Returns an iterator over the deleted document numbers in this segment. - - :return: An iterator over the deleted document numbers. - :rtype: iterator - """ if self.deleted is None: return () else: @@ -3440,18 +1601,6 @@ def deleted_docs(self): class W2Block: - """ - Represents a block of data in the Whoosh index file format. - - Attributes: - magic (bytes): The magic number identifying the block format. - infokeys (tuple): The keys for the block information. - - Args: - postingsize (int): The size of the posting data. - stringids (bool, optional): Whether the block uses string IDs. Defaults to False. - """ - magic = b"Blk3" infokeys = ( @@ -3467,13 +1616,6 @@ class W2Block: ) def __init__(self, postingsize, stringids=False): - """ - Initializes a new instance of the W2Block class. - - Args: - postingsize (int): The size of the posting data. - stringids (bool, optional): Whether the block uses string IDs. Defaults to False. - """ self.postingsize = postingsize self.stringids = stringids self.ids = [] if stringids else array("I") @@ -3485,90 +1627,33 @@ def __init__(self, postingsize, stringids=False): self.maxweight = 0 def __len__(self): - """ - Returns the number of IDs in the block. - - Returns: - int: The number of IDs in the block. - """ return len(self.ids) def __nonzero__(self): - """ - Returns whether the block has any IDs. - - Returns: - bool: True if the block has IDs, False otherwise. - """ return bool(self.ids) def min_id(self): - """ - Returns the minimum ID in the block. - - Returns: - int: The minimum ID in the block. - - Raises: - IndexError: If the block has no IDs. - """ if self.ids: return self.ids[0] else: raise IndexError def max_id(self): - """ - Returns the maximum ID in the block. - - Returns: - int: The maximum ID in the block. - - Raises: - IndexError: If the block has no IDs. - """ if self.ids: return self.ids[-1] else: raise IndexError def min_length(self): - """ - Returns the minimum length of the values in the block. - - Returns: - int: The minimum length of the values in the block. - """ return self.minlength def max_length(self): - """ - Returns the maximum length of the values in the block. - - Returns: - int: The maximum length of the values in the block. - """ return self.maxlength def max_weight(self): - """ - Returns the maximum weight in the block. - - Returns: - float: The maximum weight in the block. 
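Deletions in a segment are tracked as a set of document numbers consulted at read time; the live document count is the stored count minus the size of that set, and undeleting removes a single docnum from the set. In sketch form (`MiniSegment` is an invented name):

    class MiniSegment:
        def __init__(self, doccount):
            self.doccount = doccount
            self.deleted = None  # lazily created set of deleted docnums

        def delete_document(self, docnum, delete=True):
            if delete:
                if self.deleted is None:
                    self.deleted = set()
                self.deleted.add(docnum)
            elif self.deleted is not None:
                self.deleted.discard(docnum)  # undelete just this docnum

        def is_deleted(self, docnum):
            return self.deleted is not None and docnum in self.deleted

        def doc_count(self):
            return self.doccount - (len(self.deleted) if self.deleted else 0)

    seg = MiniSegment(10)
    seg.delete_document(3)
    seg.delete_document(7)
    seg.delete_document(7, delete=False)
    print(seg.is_deleted(3), seg.doc_count())  # True 9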
- """ return self.maxweight def add(self, id_, weight, valuestring, length=None): - """ - Adds an ID, weight, and value to the block. - - Args: - id_ (int): The ID to add. - weight (float): The weight to add. - valuestring (str): The value string to add. - length (int, optional): The length of the value. Defaults to None. - """ self.ids.append(id_) self.weights.append(weight) if weight > self.maxweight: @@ -3584,13 +1669,6 @@ def add(self, id_, weight, valuestring, length=None): self.maxlength = length def to_file(self, postfile, compression=3): - """ - Writes the block data to a file. - - Args: - postfile (file): The file to write the block data to. - compression (int, optional): The compression level. Defaults to 3. - """ ids = self.ids idcode, idstring = minimize_ids(ids, self.stringids, compression) wtstring = minimize_weights(self.weights, compression) @@ -3621,17 +1699,6 @@ def to_file(self, postfile, compression=3): @classmethod def from_file(cls, postfile, postingsize, stringids=False): - """ - Reads a block from a file. - - Args: - postfile (file): The file to read the block from. - postingsize (int): The size of the posting data. - stringids (bool, optional): Whether the block uses string IDs. Defaults to False. - - Returns: - W2Block: The read block. - """ block = cls(postingsize, stringids=stringids) block.postfile = postfile @@ -3648,12 +1715,6 @@ def from_file(cls, postfile, postingsize, stringids=False): return block def read_ids(self): - """ - Reads the IDs from the block. - - Returns: - list: The read IDs. - """ offset = self.dataoffset self.postfile.seek(offset) idstring = self.postfile.read(self.idslen) @@ -3662,12 +1723,6 @@ def read_ids(self): return ids def read_weights(self): - """ - Reads the weights from the block. - - Returns: - list: The read weights. - """ if self.weightslen == 0: weights = [1.0] * self.count else: @@ -3679,12 +1734,6 @@ def read_weights(self): return weights def read_values(self): - """ - Reads the values from the block. - - Returns: - list: The read values. - """ postingsize = self.postingsize if postingsize == 0: values = [None] * self.count @@ -3705,39 +1754,19 @@ def read_values(self): class FileTermInfo(TermInfo): - """ - Represents term information stored in a file-based index. - - Attributes: - postings: The postings associated with the term. - """ - + # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID struct = Struct("!fIBBffII") def __init__(self, *args, **kwargs): - """ - Initializes a new instance of the FileTermInfo class. - - Args: - *args: Variable length arguments. - **kwargs: Keyword arguments. - - Keyword Args: - postings: The postings associated with the term. - """ self.postings = None if "postings" in kwargs: self.postings = kwargs["postings"] del kwargs["postings"] TermInfo.__init__(self, *args, **kwargs) - def add_block(self, block): - """ - Adds a block of postings to the term information. + # filedb specific methods - Args: - block: The block of postings to add. - """ + def add_block(self, block): self._weight += sum(block.weights) self._df += len(block) @@ -3754,12 +1783,6 @@ def add_block(self, block): self._maxid = block.ids[-1] def to_string(self): - """ - Converts the term information to a string representation. - - Returns: - The string representation of the term information. 
- """ # Encode the lengths as 0-255 values ml = 0 if self._minlength is None else length_to_byte(self._minlength) xl = length_to_byte(self._maxlength) @@ -3792,15 +1815,6 @@ def to_string(self): @classmethod def from_string(cls, s): - """ - Creates a new FileTermInfo instance from a string representation. - - Args: - s: The string representation of the term information. - - Returns: - A new FileTermInfo instance. - """ assert isinstance(s, bytes) if isinstance(s, str): @@ -3846,44 +1860,14 @@ def from_string(cls, s): @classmethod def read_weight(cls, dbfile, datapos): - """ - Reads the weight from the database file. - - Args: - dbfile: The database file. - datapos: The position of the weight in the file. - - Returns: - The weight. - """ return dbfile.get_float(datapos + 1) @classmethod def read_doc_freq(cls, dbfile, datapos): - """ - Reads the document frequency from the database file. - - Args: - dbfile: The database file. - datapos: The position of the document frequency in the file. - - Returns: - The document frequency. - """ return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) @classmethod def read_min_and_max_length(cls, dbfile, datapos): - """ - Reads the minimum and maximum length from the database file. - - Args: - dbfile: The database file. - datapos: The position of the lengths in the file. - - Returns: - A tuple containing the minimum and maximum length. - """ lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE ml = byte_to_length(dbfile.get_byte(lenpos)) xl = byte_to_length(dbfile.get_byte(lenpos + 1)) @@ -3891,16 +1875,6 @@ def read_min_and_max_length(cls, dbfile, datapos): @classmethod def read_max_weight(cls, dbfile, datapos): - """ - Reads the maximum weight from the database file. - - Args: - dbfile: The database file. - datapos: The position of the maximum weight in the file. - - Returns: - The maximum weight. - """ weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 return dbfile.get_float(weightspos) @@ -3909,27 +1883,6 @@ def read_max_weight(cls, dbfile, datapos): def minimize_ids(arry, stringids, compression=0): - """ - Minimizes the given array of IDs for efficient storage and retrieval. - - Args: - arry (array): The array of IDs to be minimized. - stringids (bool): Indicates whether the IDs are string-based or not. - compression (int, optional): The compression level to apply to the minimized IDs. Defaults to 0. - - Returns: - tuple: A tuple containing the typecode of the minimized IDs and the minimized IDs as a string. - - Raises: - None - - Notes: - - If the IDs are string-based, they will be serialized using the `pickle.dumps` function. - - If the IDs are not string-based, they will be converted to the appropriate typecode based on their maximum value. - - If the typecode of the array needs to be changed, a new array will be created with the updated typecode. - - If the system is big-endian, the byte order of the array will be swapped. - - If compression is enabled, the minimized IDs will be compressed using the zlib library. - """ amax = arry[-1] if stringids: @@ -3953,21 +1906,6 @@ def minimize_ids(arry, stringids, compression=0): def deminimize_ids(typecode, count, string, compression=0): - """ - Deserialize and decompress a string representation of an array of integers. - - Args: - typecode (str): The typecode of the array. - count (int): The number of elements in the array. - string (bytes): The serialized and optionally compressed string representation of the array. - compression (int, optional): The compression level used for the string. Defaults to 0. 
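The read_weight / read_doc_freq classmethods above pull single fields out of the packed term-info record at fixed byte offsets rather than unpacking the whole struct. A self-contained sketch of that offset arithmetic, assuming (as the +1 offsets suggest) that one flag byte precedes the "!fIBBffII" struct:

```
import struct

TERMINFO = struct.Struct("!fIBBffII")  # weight, doc freq, min/max length bytes,
                                       # max weight, unused, min ID, max ID
_FLOAT_SIZE = struct.calcsize("!f")

record = b"\x00" + TERMINFO.pack(3.5, 2, 1, 4, 2.5, 0.0, 10, 42)

def read_weight(buf, datapos=0):
    return struct.unpack_from("!f", buf, datapos + 1)[0]

def read_doc_freq(buf, datapos=0):
    return struct.unpack_from("!I", buf, datapos + 1 + _FLOAT_SIZE)[0]

assert read_weight(record) == 3.5
assert read_doc_freq(record) == 2
```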
- - Returns: - array: The deserialized and decompressed array of integers. - - Raises: - TypeError: If the typecode is not a valid array typecode. - """ if compression: string = zlib.decompress(string) if typecode == "": @@ -3981,33 +1919,6 @@ def deminimize_ids(typecode, count, string, compression=0): def minimize_weights(weights, compression=0): - """ - Minimizes the weights array by converting it to a compressed string representation. - - Args: - weights (array-like): The weights array to be minimized. - compression (int, optional): The compression level to be applied. Defaults to 0. - - Returns: - str: The minimized string representation of the weights array. - - Raises: - None - - Examples: - >>> weights = [1.0, 1.0, 1.0] - >>> minimize_weights(weights) - b'' - - >>> weights = [0.5, 0.75, 1.0] - >>> minimize_weights(weights, compression=6) - b'x\x9c\xcbH\xcd\xc9\xc9\x07\x00\x06\xcb\x01' - - Note: - - If all weights in the array are equal to 1.0, an empty string is returned. - - The weights array is expected to be a one-dimensional array-like object. - - The compression level should be an integer between 0 and 9, where 0 means no compression and 9 means maximum compression. - """ if all(w == 1.0 for w in weights): string = b"" else: @@ -4020,25 +1931,6 @@ def minimize_weights(weights, compression=0): def deminimize_weights(count, string, compression=0): - """ - Convert a serialized string representation of weights into an array of floats. - - Args: - count (int): The number of weights to be converted. - string (bytes): The serialized string representation of weights. - compression (int, optional): The compression level used for the serialized string. Defaults to 0. - - Returns: - array.array: An array of floats representing the weights. - - Raises: - None - - Examples: - >>> weights = deminimize_weights(3, b'\x00\x00\x80\x3f\x00\x00\x00\x40\x00\x00\x40\x40') - >>> print(weights) - array('f', [1.0, 2.0, 3.0]) - """ if not string: return array("f", (1.0 for _ in range(count))) if compression: @@ -4051,24 +1943,6 @@ def deminimize_weights(count, string, compression=0): def minimize_values(postingsize, values, compression=0): - """ - Minimizes the values by compressing them and returning the compressed string. - - Args: - postingsize (int): The size of the posting. - values (list): The list of values to be minimized. - compression (int, optional): The compression level. Defaults to 0. - - Returns: - str: The compressed string. - - Raises: - None - - Examples: - >>> minimize_values(10, ['value1', 'value2', 'value3'], 6) - 'compressed_string' - """ if postingsize < 0: string = dumps(values, -1)[2:] elif postingsize == 0: @@ -4081,26 +1955,6 @@ def minimize_values(postingsize, values, compression=0): def deminimize_values(postingsize, count, string, compression=0): - """ - Deminimizes a string into a list of values. - - Args: - postingsize (int): The size of each value in the string. - count (int): The number of values to extract from the string. - string (bytes): The string to deminimize. - compression (int, optional): The compression level of the string. Defaults to 0. - - Returns: - list: The deminimized list of values. 
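As a compact illustration of the round-trip performed by minimize_weights / deminimize_weights above: an all-1.0 weight list collapses to an empty byte string, anything else becomes a packed little-endian float array, optionally zlib-compressed. A sketch with my own function names:

```
import zlib
from array import array
from sys import byteorder

def pack_weights(weights, compression=0):
    if all(w == 1.0 for w in weights):
        return b""
    arr = array("f", weights)
    if byteorder == "big":
        arr.byteswap()          # store little-endian regardless of platform
    data = arr.tobytes()
    return zlib.compress(data, compression) if compression else data

def unpack_weights(count, data, compression=0):
    if not data:
        return array("f", (1.0 for _ in range(count)))
    if compression:
        data = zlib.decompress(data)
    arr = array("f")
    arr.frombytes(data)
    if byteorder == "big":
        arr.byteswap()
    return arr

ws = [0.5, 2.0, 1.0]
assert list(unpack_weights(3, pack_weights(ws, 6), 6)) == ws
assert pack_weights([1.0, 1.0]) == b""
```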
- - Raises: - None - - Examples: - >>> string = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' - >>> deminimize_values(2, 8, string) - [b'\x00\x01', b'\x02\x03', b'\x04\x05', b'\x06\x07', b'\x08\t', b'\n\x0b', b'\x0c\r', b'\x0e\x0f'] - """ if compression: string = zlib.decompress(string) @@ -4117,27 +1971,7 @@ def deminimize_values(postingsize, count, string, compression=0): from whoosh.fields import NUMERIC -class old_numeric(NUMERIC): - """ - A field type for storing numeric values in the index. - - This field type supports storing integers, floats, and decimals. - The values can be sorted and searched using numeric range queries. - - Parameters: - - type (type): The Python type of the numeric values to be stored. - - stored (bool): Whether the field should be stored in the index. - - unique (bool): Whether the field values should be unique. - - field_boost (float): The boost factor for the field. - - decimal_places (int): The number of decimal places to store for decimal values. - - shift_step (int): The number of bits to shift the values during sorting. - - signed (bool): Whether the values should be treated as signed or unsigned. - - Raises: - - TypeError: If the specified type is not supported by the field. - - """ - +class OLD_NUMERIC(NUMERIC): NUMERIC_DEFAULTS = { "b": 2**7 - 1, "B": 2**8 - 1, @@ -4161,22 +1995,6 @@ def __init__( shift_step=4, signed=True, ): - """ - Initialize the old_numeric field. - - Args: - - type (type): The Python type of the numeric values to be stored. - - stored (bool): Whether the field should be stored in the index. - - unique (bool): Whether the field values should be unique. - - field_boost (float): The boost factor for the field. - - decimal_places (int): The number of decimal places to store for decimal values. - - shift_step (int): The number of bits to shift the values during sorting. - - signed (bool): Whether the values should be treated as signed or unsigned. - - Raises: - - TypeError: If the specified type is not supported by the field. - - """ from whoosh import analysis, formats self.type = type @@ -4207,32 +2025,15 @@ def __init__( self.shift_step = shift_step self.signed = signed - self.analyzer = analysis.id_analyzer() + self.analyzer = analysis.IDAnalyzer() self.format = formats.Existence(field_boost=field_boost) def __setstate__(self, d): - """ - Set the state of the field. - - Args: - - d (dict): The state dictionary. - - """ self.__dict__.update(d) self.numtype = d["type"] self.bits = 64 def prepare_number(self, x): - """ - Prepare a numeric value for storage in the index. - - Args: - - x: The numeric value to prepare. - - Returns: - - The prepared numeric value. - - """ if x is None or x == emptybytes: return x if self.decimal_places: @@ -4242,16 +2043,6 @@ def prepare_number(self, x): return x def unprepare_number(self, x): - """ - Convert a prepared numeric value back to its original form. - - Args: - - x: The prepared numeric value. - - Returns: - - The original numeric value. - - """ dc = self.decimal_places if dc: s = str(x) @@ -4259,116 +2050,34 @@ def unprepare_number(self, x): return x def to_bytes(self, x, shift=0): - """ - Convert a numeric value to bytes. - - Args: - - x: The numeric value to convert. - - shift (int): The number of bits to shift the value. - - Returns: - - The bytes representation of the numeric value. - - """ if isinstance(x, bytes): return x return utf8encode(self.to_text(x, shift))[0] def from_bytes(self, bs): - """ - Convert bytes to a numeric value. 
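The prepare_number / unprepare_number pair above makes decimal values indexable by scaling them into integers with decimal_places. A simplified, standalone sketch of that idea (the real code reconstructs the Decimal by string slicing, which amounts to the same scaling):

```
from decimal import Decimal

def prepare_number(x, decimal_places):
    # Scale a decimal into an integer so it can be stored and sorted numerically.
    if decimal_places:
        x = Decimal(x) * (10 ** decimal_places)
    return int(x)

def unprepare_number(x, decimal_places):
    # Reverse the scaling when reading the stored value back.
    if decimal_places:
        return Decimal(x) / (10 ** decimal_places)
    return x

stored = prepare_number("3.25", 2)          # -> 325
assert unprepare_number(stored, 2) == Decimal("3.25")
```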
- - Args: - - bs (bytes): The bytes to convert. - - Returns: - - The numeric value. - - """ return self.from_text(utf8decode(bs)[0]) def sortable_to_bytes(self, x, shift=0): - """ - Convert a numeric value to sortable bytes. - - Args: - - x: The numeric value to convert. - - shift (int): The number of bits to shift the value. - - Returns: - - The sortable bytes representation of the numeric value. - - """ if shift: x >>= shift return pack_byte(shift) + self._to_text() def to_text(self, x, shift=0): - """ - Convert a numeric value to text. - - Args: - - x: The numeric value to convert. - - shift (int): The number of bits to shift the value. - - Returns: - - The text representation of the numeric value. - - """ x = self.prepare_number(x) x = self._to_text(x, shift=shift, signed=self.signed) return x def from_text(self, t): - """ - Convert text to a numeric value. - - Args: - - t (str): The text to convert. - - Returns: - - The numeric value. - - """ x = self._from_text(t, signed=self.signed) return self.unprepare_number(x) def process_text(self, text, **kwargs): - """ - Process the text value of the field. - - Args: - - text (str): The text value to process. - - Returns: - - A tuple containing the processed text value. - - """ return (self.to_text(text),) def self_parsing(self): - """ - Check if the field is self-parsing. - - Returns: - - True if the field is self-parsing, False otherwise. - - """ return True def parse_query(self, fieldname, qstring, boost=1.0): - """ - Parse a query string for the field. - - Args: - - fieldname (str): The name of the field. - - qstring (str): The query string to parse. - - boost (float): The boost factor for the query. - - Returns: - - A query object representing the parsed query. - - """ from whoosh import query if qstring == "*": @@ -4383,21 +2092,6 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, text, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): - """ - Parse a range query for the field. - - Args: - - fieldname (str): The name of the field. - - start: The start value of the range. - - end: The end value of the range. - - startexcl (bool): Whether the start value is exclusive. - - endexcl (bool): Whether the end value is exclusive. - - boost (float): The boost factor for the query. - - Returns: - - A query object representing the parsed range query. - - """ from whoosh import query from whoosh.qparser.common import QueryParserError @@ -4415,17 +2109,6 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): ) def sortable_terms(self, ixreader, fieldname): - """ - Generate sortable terms for the field. - - Args: - - ixreader: The index reader object. - - fieldname (str): The name of the field. - - Yields: - - Sortable terms for the field. - - """ for btext in ixreader.lexicon(fieldname): if btext[0:1] != "\x00": # Only yield the full-precision values @@ -4433,47 +2116,11 @@ def sortable_terms(self, ixreader, fieldname): yield btext -class old_datetime(old_numeric): - """ - A field type for storing and indexing datetime values. - - This field type stores datetime values as long integers internally, using the `datetime_to_long` function - to convert datetime objects to long integers, and the `long_to_datetime` function to convert long integers - back to datetime objects. - - Parameters: - - stored (bool): Whether the field should be stored in the index. Default is False. - - unique (bool): Whether the field should be unique in the index. 
Default is False. - - Example usage: - ``` - from whoosh.codec.whoosh2 import old_datetime - - # Create an instance of old_datetime field type - my_datetime_field = old_datetime(stored=True, unique=True) - ``` - - """ - +class OLD_DATETIME(OLD_NUMERIC): def __init__(self, stored=False, unique=False): - old_numeric.__init__(self, type=int, stored=stored, unique=unique, shift_step=8) + OLD_NUMERIC.__init__(self, type=int, stored=stored, unique=unique, shift_step=8) def to_text(self, x, shift=0): - """ - Convert a datetime value to a string representation. - - Parameters: - - x: The datetime value to convert. - - shift (int): The number of bits to shift the value by. Default is 0. - - Returns: - - str: The string representation of the datetime value. - - Raises: - - ValueError: If the datetime value cannot be converted to a string. - - """ - from datetime import datetime from whoosh.util.times import floor @@ -4490,41 +2137,16 @@ def to_text(self, x, shift=0): except ValueError: raise ValueError(f"DATETIME.to_text can't convert from {x!r}") - x = old_numeric.to_text(self, x, shift=shift) + x = OLD_NUMERIC.to_text(self, x, shift=shift) return x def from_text(self, x): - """ - Convert a string representation to a datetime value. - - Parameters: - - x (str): The string representation of the datetime value. - - Returns: - - datetime.datetime: The datetime value. - - """ - - x = old_numeric.from_text(self, x) + x = OLD_NUMERIC.from_text(self, x) return long_to_datetime(x) def _parse_datestring(self, qstring): - """ - Parse a simple datetime representation. - - This method parses a very simple datetime representation of the form YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]]. - - Parameters: - - qstring (str): The datetime string to parse. - - Returns: - - whoosh.util.times.adatetime: The parsed datetime value. - - Raises: - - Exception: If the datetime string is not parseable. - - """ - + # This method parses a very simple datetime representation of the form + # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] from whoosh.util.times import adatetime, fix, is_void qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") @@ -4546,23 +2168,10 @@ def _parse_datestring(self, qstring): at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise DateParseError(f"{qstring} is not a parseable date") + raise Exception(f"{qstring!r} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): - """ - Parse a query string into a query object. - - Parameters: - - fieldname (str): The name of the field to parse the query for. - - qstring (str): The query string to parse. - - boost (float): The boost factor for the query. Default is 1.0. - - Returns: - - whoosh.query.Query: The parsed query object. - - """ - from whoosh import query from whoosh.util.times import is_ambiguous @@ -4580,22 +2189,6 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, self.to_text(at), boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): - """ - Parse a range query into a query object. - - Parameters: - - fieldname (str): The name of the field to parse the range query for. - - start (str): The start value of the range query. - - end (str): The end value of the range query. - - startexcl (bool): Whether the start value is exclusive. Default is False. - - endexcl (bool): Whether the end value is exclusive. Default is False. - - boost (float): The boost factor for the query. Default is 1.0. 
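_parse_datestring above accepts progressively longer strings of the form YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]]. A standalone sketch of the slicing it performs, returning a plain dict instead of whoosh's adatetime:

```
def parse_datestring(qstring):
    q = qstring.replace(" ", "").replace("-", "").replace(".", "")
    fields = {}
    if len(q) >= 4:
        fields["year"] = int(q[:4])
    if len(q) >= 6:
        fields["month"] = int(q[4:6])
    if len(q) >= 8:
        fields["day"] = int(q[6:8])
    if len(q) >= 10:
        fields["hour"] = int(q[8:10])
    if len(q) >= 12:
        fields["minute"] = int(q[10:12])
    if len(q) >= 14:
        fields["second"] = int(q[12:14])
    if len(q) == 20:
        fields["microsecond"] = int(q[14:])
    return fields

assert parse_datestring("2012-02-10") == {"year": 2012, "month": 2, "day": 10}
```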
- - Returns: - - whoosh.query.Query: The parsed range query object. - - """ - from whoosh import query if start is None and end is None: @@ -4616,118 +2209,33 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): def int_to_text(x, shift=0, signed=True): - """ - Convert an integer to a sortable text representation. - - Args: - x (int): The integer to be converted. - shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. - signed (bool, optional): Whether the integer is signed or not. Defaults to True. - - Returns: - str: The sortable text representation of the integer. - """ x = to_sortable(int, 32, signed, x) return sortable_int_to_text(x, shift) def text_to_int(text, signed=True): - """ - Convert a text string to an integer representation. - - Args: - text (str): The text string to convert. - signed (bool, optional): Whether the resulting integer should be signed or unsigned. - Defaults to True. - - Returns: - int: The integer representation of the text string. - - """ x = text_to_sortable_int(text) x = from_sortable(int, 32, signed, x) return x def long_to_text(x, shift=0, signed=True): - """ - Convert a long integer to a text representation. - - Args: - x (int): The long integer to be converted. - shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. - signed (bool, optional): Whether the integer is signed or not. Defaults to True. - - Returns: - str: The text representation of the long integer. - - """ x = to_sortable(int, 64, signed, x) return sortable_long_to_text(x, shift) def text_to_long(text, signed=True): - """ - Converts a text string to a long integer. - - Args: - text (str): The text string to convert. - signed (bool, optional): Whether the resulting long integer should be signed. - Defaults to True. - - Returns: - int: The converted long integer. - - Raises: - None - - Examples: - >>> text_to_long("12345") - 12345 - >>> text_to_long("-54321") - -54321 - """ x = text_to_sortable_long(text) x = from_sortable(int, 64, signed, x) return x def float_to_text(x, shift=0, signed=True): - """ - Convert a floating-point number to a sortable text representation. - - Args: - x (float): The floating-point number to be converted. - shift (int, optional): The number of bits to shift the sortable representation. Defaults to 0. - signed (bool, optional): Whether the sortable representation should support negative numbers. Defaults to True. - - Returns: - str: The sortable text representation of the floating-point number. - """ x = to_sortable(float, 32, signed, x) return sortable_long_to_text(x, shift) def text_to_float(text, signed=True): - """ - Converts a text representation of a float to a float value. - - Args: - text (str): The text representation of the float. - signed (bool, optional): Whether the float is signed or not. Defaults to True. - - Returns: - float: The float value represented by the text. - - Raises: - ValueError: If the text cannot be converted to a float. - - Examples: - >>> text_to_float("3.14") - 3.14 - >>> text_to_float("-2.5", signed=True) - -2.5 - """ x = text_to_sortable_long(text) x = from_sortable(float, 32, signed, x) return x @@ -4739,89 +2247,29 @@ def text_to_float(text, signed=True): def sortable_int_to_text(x, shift=0): - """ - Convert a sortable integer to a text representation. - - Args: - x (int): The integer to be converted. - shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. 
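The int_to_text / text_to_int helpers above first remap signed integers onto an unsigned range (to_sortable) so that byte-wise ordering matches numeric ordering. A sketch of that remapping for 32-bit signed values; the real helper lives in whoosh.util, so treat this as illustrative:

```
def to_sortable_int32(x):
    # Bias by 2**31 so negative numbers sort below positive ones
    # when compared as unsigned 32-bit values.
    return (x + (1 << 31)) & 0xFFFFFFFF

def from_sortable_int32(u):
    return u - (1 << 31)

values = [-5, -1, 0, 3, 1000]
encoded = [to_sortable_int32(v) for v in values]
assert encoded == sorted(encoded)
assert [from_sortable_int32(u) for u in encoded] == values
```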
- - Returns: - str: The text representation of the sortable integer. - - Notes: - This function converts a sortable integer to a text representation by shifting the integer (if specified) and encoding it using base85 encoding. - - Example: - >>> sortable_int_to_text(12345) - '0gV' - """ if shift: x >>= shift + # text = chr(shift) + u"%08x" % x text = chr(shift) + to_base85(x, False) return text def sortable_long_to_text(x, shift=0): - """ - Convert a sortable long integer to a text representation. - - Args: - x (int): The long integer to be converted. - shift (int, optional): The number of bits to shift the integer before conversion. Defaults to 0. - - Returns: - str: The text representation of the sortable long integer. - - Notes: - This function converts a long integer to a text representation using base85 encoding. - The resulting text representation is prefixed with a character representing the shift value. - - Example: - >>> sortable_long_to_text(1234567890, 4) - 'E@9jqo' - """ if shift: x >>= shift + # text = chr(shift) + u"%016x" % x + # assert len(text) == 17 text = chr(shift) + to_base85(x, True) return text def text_to_sortable_int(text): - """ - Converts a text representation of a sortable integer to an actual integer. - - Args: - text (str): The text representation of the sortable integer. - - Returns: - int: The converted integer. - - Raises: - ValueError: If the text representation is invalid. - - Example: - >>> text_to_sortable_int('x12345678') - 305419896 - """ + # assert len(text) == 9 + # return int(text[1:], 16) return from_base85(text[1:]) def text_to_sortable_long(text): - """ - Converts a text string to a sortable long value. - - Parameters: - text (str): The text string to convert. - - Returns: - int: The converted sortable long value. - - Raises: - ValueError: If the input text is not a valid sortable long value. - - Example: - >>> text_to_sortable_long('0x123456789abcdef') - 81985529216486895 - """ + # assert len(text) == 17 + # return long(text[1:], 16) return from_base85(text[1:]) diff --git a/src/whoosh/codec/whoosh3.py b/src/whoosh/codec/whoosh3.py index 0ed453a6..96a06961 100644 --- a/src/whoosh/codec/whoosh3.py +++ b/src/whoosh/codec/whoosh3.py @@ -76,19 +76,6 @@ class W3Codec(base.Codec): - """ - Codec implementation for the Whoosh 3 index format. - - This codec provides methods for reading and writing various components of the index, - such as term indexes, term postings, vector postings, and per-document value columns. - - Parameters: - - blocklimit (int): The maximum number of postings to store in a block. Defaults to 128. - - compression (int): The level of compression to use for the postings. Defaults to 3. - - inlinelimit (int): The maximum number of postings to inline in the term info object. Defaults to 1. - - """ - # File extensions TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings @@ -96,60 +83,23 @@ class W3Codec(base.Codec): COLUMN_EXT = ".col" # Per-document value columns def __init__(self, blocklimit=128, compression=3, inlinelimit=1): - """ - Initialize a new instance of the W3Codec class. - - Parameters: - - blocklimit (int): The maximum number of postings to store in a block. Defaults to 128. - - compression (int): The level of compression to use for the postings. Defaults to 3. - - inlinelimit (int): The maximum number of postings to inline in the term info object. Defaults to 1. 
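sortable_int_to_text above prefixes the value with chr(shift) and base85-encodes the shifted integer; the commented-out lines show the older fixed-width hex form. A standalone round-trip using that hex form, since the to_base85 helper lives elsewhere in the tree:

```
def sortable_int_to_text(x, shift=0):
    if shift:
        x >>= shift
    # chr(shift) groups coarser-precision tiers together; the fixed-width
    # hex keeps values inside a tier sortable as plain text.
    return chr(shift) + "%08x" % x

def text_to_sortable_int(text):
    return int(text[1:], 16)

t = sortable_int_to_text(0x1234ABCD, shift=4)
assert text_to_sortable_int(t) == 0x1234ABCD >> 4
```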
- - """ self._blocklimit = blocklimit self._compression = compression self._inlinelimit = inlinelimit + # def automata(self): + # Per-document value writer def per_document_writer(self, storage, segment): - """ - Create a per-document value writer for the given storage and segment. - - Parameters: - - storage (Storage): The storage object for the index. - - segment (Segment): The segment object for the index. - - Returns: - - W3PerDocWriter: The per-document value writer. - - """ return W3PerDocWriter(self, storage, segment) + # Inverted index writer def field_writer(self, storage, segment): - """ - Create an inverted index writer for the given storage and segment. - - Parameters: - - storage (Storage): The storage object for the index. - - segment (Segment): The segment object for the index. - - Returns: - - W3FieldWriter: The inverted index writer. - - """ return W3FieldWriter(self, storage, segment) - def postings_writer(self, dbfile, byteids=False): - """ - Create a postings writer for the given database file. - - Parameters: - - dbfile (File): The file object for the postings. - - byteids (bool): Whether to use byte-based document ids. Defaults to False. - - Returns: - - W3PostingsWriter: The postings writer. + # Postings - """ + def postings_writer(self, dbfile, byteids=False): return W3PostingsWriter( dbfile, blocklimit=self._blocklimit, @@ -159,20 +109,6 @@ def postings_writer(self, dbfile, byteids=False): ) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): - """ - Create a postings reader for the given database file and term info. - - Parameters: - - dbfile (File): The file object for the postings. - - terminfo (TermInfo): The term info object for the term. - - format_ (str): The format of the postings. - - term (str): The term to read the postings for. Defaults to None. - - scorer (Scorer): The scorer object for scoring the postings. Defaults to None. - - Returns: - - Matcher: The postings reader. - - """ if terminfo.is_inlined(): # If the postings were inlined into the terminfo object, pull them # out and use a ListMatcher to wrap them in a Matcher interface @@ -191,32 +127,12 @@ def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): m = W3LeafMatcher(dbfile, offset, length, format_, term=term, scorer=scorer) return m - def per_document_reader(self, storage, segment): - """ - Create a per-document value reader for the given storage and segment. + # Readers - Parameters: - - storage (Storage): The storage object for the index. - - segment (Segment): The segment object for the index. - - Returns: - - W3PerDocReader: The per-document value reader. - - """ + def per_document_reader(self, storage, segment): return W3PerDocReader(storage, segment) def terms_reader(self, storage, segment): - """ - Create a terms reader for the given storage and segment. - - Parameters: - - storage (Storage): The storage object for the index. - - segment (Segment): The segment object for the index. - - Returns: - - W3TermsReader: The terms reader. - - """ tiname = segment.make_filename(self.TERMS_EXT) tilen = storage.file_length(tiname) tifile = storage.open_file(tiname) @@ -226,45 +142,20 @@ def terms_reader(self, storage, segment): return W3TermsReader(self, tifile, tilen, postfile) # Graph methods provided by CodecWithGraph - def supports_columns(self): - """ - Check if the codec supports per-document value columns. - Returns: - - bool: True if per-document value columns are supported, False otherwise. 
+ # Columns - """ + def supports_columns(self): return True @classmethod def column_filename(cls, segment, fieldname): - """ - Get the filename for the per-document value column of the given field in the segment. - - Parameters: - - segment (Segment): The segment object for the index. - - fieldname (str): The name of the field. - - Returns: - - str: The filename for the per-document value column. - - """ ext = "".join((".", fieldname, cls.COLUMN_EXT)) return segment.make_filename(ext) # Segments and generations - def new_segment(self, storage, indexname): - """ - Create a new segment for the given storage and index name. - Parameters: - - storage (Storage): The storage object for the index. - - indexname (str): The name of the index. - - Returns: - - W3Segment: The new segment. - - """ + def new_segment(self, storage, indexname): return W3Segment(self, indexname) @@ -272,95 +163,18 @@ def new_segment(self, storage, indexname): def _vecfield(fieldname): - """ - Returns the vector field name for a given field. - - Parameters: - fieldname (str): The name of the field. - - Returns: - str: The vector field name. - - Example: - >>> _vecfield("title") - '_title_vec' - - This function takes a field name as input and returns the corresponding vector field name. - The vector field name is constructed by adding underscores before and after the field name. - """ return f"_{fieldname}_vec" def _lenfield(fieldname): - """ - Returns the length field name for a given field. - - Parameters: - - fieldname (str): The name of the field. - - Returns: - - str: The length field name. - - Example: - >>> _lenfield("title") - '_title_len' - - This function is used to generate the length field name for a given field. The length field name is used in the Whoosh codec to store the length of a variable-length field. It appends "_len" to the field name to create the length field name. - - Usage: - >>> length_field = _lenfield("content") - >>> print(length_field) - '_content_len' - """ return f"_{fieldname}_len" # Per-doc information writer -class W3PerDocWriter(base.PerDocWriterWithColumns): - """ - This class is responsible for writing per-document data to the index for the Whoosh3 codec. - - It provides methods for adding fields, vectors, and other per-document information to the index. - - Usage: - ------ - 1. Create an instance of W3PerDocWriter by passing the codec, storage, and segment parameters to the constructor. - 2. Use the start_doc() method to indicate the start of a new document. - 3. Use the add_field() method to add a field to the document with its corresponding value and length. - 4. Use the add_vector_items() method to add vector items (text, weight, and vbytes) to the document. - 5. Use the finish_doc() method to indicate the end of the current document. - 6. Repeat steps 2-5 for each document. - 7. Call the close() method to finish writing the per-document data to the index. - - Note: - ----- - The close() method must be called after writing all the documents to the index. - - Attributes: - ----------- - - is_closed: A boolean attribute indicating whether the writer has been closed. - - Methods: - -------- - - start_doc(docnum): Indicates the start of a new document. - - add_field(fieldname, fieldobj, value, length): Adds a field to the document with its corresponding value and length. - - add_vector_items(fieldname, fieldobj, items): Adds vector items to the document. - - finish_doc(): Indicates the end of the current document. - - cancel_doc(): Cancels the current document. 
- - close(): Finishes writing the per-document data to the index. - """ +class W3PerDocWriter(base.PerDocWriterWithColumns): def __init__(self, codec, storage, segment): - """ - Initializes a new instance of W3PerDocWriter. - - Parameters: - ----------- - - codec: The codec used for encoding and decoding data. - - storage: The storage object used for storing the index files. - - segment: The segment object representing the current segment of the index. - """ self._codec = codec self._storage = storage self._segment = segment @@ -382,49 +196,15 @@ def __init__(self, codec, storage, segment): self._vpostfile = None def _create_file(self, ext): - """ - Creates a new file with the given extension in the current segment. - - Parameters: - ----------- - - ext: The extension of the file. - - Returns: - -------- - The created file object. - """ return self._segment.create_file(self._storage, ext) def _has_column(self, fieldname): - """ - Checks if a column with the given fieldname has been added. - - Parameters: - ----------- - - fieldname: The name of the field/column. - - Returns: - -------- - True if the column exists, False otherwise. - """ return fieldname in self._colwriters def _create_column(self, fieldname, column): - """ - Creates a new column with the given fieldname. - - Parameters: - ----------- - - fieldname: The name of the field/column. - - column: The column object. - - Raises: - ------- - ValueError: If a column with the same fieldname has already been added. - """ writers = self._colwriters if fieldname in writers: - raise ValueError(f"Already added column {fieldname!r}") + raise Exception(f"Already added column {fieldname!r}") f = self._cols.create_file(fieldname) writers[fieldname] = column.writer(f) @@ -440,9 +220,9 @@ def _prep_vectors(self): def start_doc(self, docnum): if self._indoc: - raise ValueError("Called start_doc when already in a doc") + raise Exception("Called start_doc when already in a doc") if docnum != self._doccount: - raise ValueError( + raise Exception( f"Called start_doc({docnum!r}) was expecting {self._doccount!r}" ) @@ -519,43 +299,6 @@ def close(self): class W3FieldWriter(base.FieldWriter): - """ - Writes field data to the index for the Whoosh3 codec. - - This class is responsible for writing field data, including terms and postings, to the index. - It is used internally by the Whoosh3 codec and should not be instantiated directly. - - Parameters: - - codec (Codec): The codec used for encoding and decoding data. - - storage (Storage): The storage object used for creating files. - - segment (Segment): The segment object representing the current segment. - - Attributes: - - _codec (Codec): The codec used for encoding and decoding data. - - _storage (Storage): The storage object used for creating files. - - _segment (Segment): The segment object representing the current segment. - - _fieldname (str): The name of the current field being written. - - _fieldid (int): The ID of the current field being written. - - _btext (bytes): The binary representation of the current term being written. - - _fieldobj (Field): The field object associated with the current field being written. - - _format (Format): The format object associated with the current field being written. - - _tindex (OrderedHashWriter): The ordered hash writer for the terms index. - - _fieldmap (dict): A dictionary mapping field names to field IDs. - - _postfile (File): The file object for writing postings data. 
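The start_doc / _create_column checks above are ordering guards: documents must be added contiguously and a column may only be created once. A small sketch of the same guards, with invented names:

```
class PerDocSketch:
    def __init__(self):
        self._doccount = 0
        self._indoc = False
        self._colwriters = {}

    def create_column(self, fieldname, writer):
        if fieldname in self._colwriters:
            raise Exception(f"Already added column {fieldname!r}")
        self._colwriters[fieldname] = writer

    def start_doc(self, docnum):
        if self._indoc:
            raise Exception("Called start_doc when already in a doc")
        if docnum != self._doccount:
            raise Exception(f"Called start_doc({docnum!r}) was expecting {self._doccount!r}")
        self._indoc = True

    def finish_doc(self):
        self._doccount += 1
        self._indoc = False


w = PerDocSketch()
w.start_doc(0)
w.finish_doc()
w.start_doc(1)
w.finish_doc()
assert w._doccount == 2
```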
- - _postwriter (PostingsWriter): The postings writer for the current field being written. - - _infield (bool): Indicates whether the writer is currently inside a field. - - is_closed (bool): Indicates whether the writer has been closed. - - Methods: - - _create_file(ext): Creates a file with the given extension. - - start_field(fieldname, fieldobj): Starts writing a new field. - - start_term(btext): Starts writing a new term. - - add(docnum, weight, vbytes, length): Adds a posting to the current term. - - finish_term(): Finishes writing the current term. - - finish_field(): Finishes writing the current field. - - close(): Closes the writer and releases any resources. - """ - def __init__(self, codec, storage, segment): self._codec = codec self._storage = storage @@ -578,29 +321,9 @@ def __init__(self, codec, storage, segment): self.is_closed = False def _create_file(self, ext): - """ - Creates a file with the given extension. - - Parameters: - - ext (str): The file extension. - - Returns: - - File: The created file object. - """ return self._segment.create_file(self._storage, ext) def start_field(self, fieldname, fieldobj): - """ - Starts writing a new field. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj (Field): The field object. - - Raises: - - ValueError: If called before start_field. - - """ fmap = self._fieldmap if fieldname in fmap: self._fieldid = fmap[fieldname] @@ -617,38 +340,15 @@ def start_field(self, fieldname, fieldobj): self._postwriter = self._codec.postings_writer(self._postfile) def start_term(self, btext): - """ - Starts writing a new term. - - Parameters: - - btext (bytes): The binary representation of the term. - - Raises: - - ValueError: If called before start_field. - """ if self._postwriter is None: - raise ValueError("Called start_term before start_field") + raise Exception("Called start_term before start_field") self._btext = btext self._postwriter.start_postings(self._fieldobj.format, W3TermInfo()) def add(self, docnum, weight, vbytes, length): - """ - Adds a posting to the current term. - - Parameters: - - docnum (int): The document number. - - weight (float): The weight of the posting. - - vbytes (int): The number of bytes used to encode the posting value. - - length (int): The length of the posting. - - """ self._postwriter.add_posting(docnum, weight, vbytes, length) def finish_term(self): - """ - Finishes writing the current term. - - """ terminfo = self._postwriter.finish_postings() # Add row to term info table @@ -656,29 +356,23 @@ def finish_term(self): valbytes = terminfo.to_bytes() self._tindex.add(keybytes, valbytes) - def finish_field(self): - """ - Finishes writing the current field. + # FieldWriterWithGraph.add_spell_word - Raises: - - ValueError: If called before start_field. - """ + def finish_field(self): if not self._infield: - raise ValueError("Called finish_field before start_field") + raise Exception("Called finish_field before start_field") self._infield = False self._postwriter = None def close(self): - """ - Closes the writer and releases any resources. 
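Taken together, the W3FieldWriter methods above follow a fixed sequence per field: start_field, then for each term start_term, add one posting per document, and finish_term, which records (term -> term info) in the terms index. A schematic, dictionary-backed sketch of that flow:

```
def write_field(postings_by_term, term_index):
    # postings_by_term: {term_bytes: [(docnum, weight), ...]}
    for term_bytes in sorted(postings_by_term):         # start_term()
        postings = postings_by_term[term_bytes]
        weight = sum(w for _docnum, w in postings)       # add() per posting
        doc_freq = len(postings)
        term_index[term_bytes] = (weight, doc_freq)      # finish_term()


tindex = {}
write_field({b"apple": [(0, 1.0), (3, 2.0)], b"pear": [(1, 1.0)]}, tindex)
assert tindex[b"apple"] == (3.0, 2)
```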
- - """ self._tindex.close() self._postfile.close() self.is_closed = True # Reader objects + + class W3PerDocReader(base.PerDocumentReader): def __init__(self, storage, segment): self._storage = storage @@ -803,7 +497,7 @@ def _vector_extent(self, docnum, fieldname): def has_vector(self, docnum, fieldname): if self.has_column(_vecfield(fieldname)): - offset, _ = self._vector_extent(docnum, fieldname) + offset, length = self._vector_extent(docnum, fieldname) return offset != 0 return False @@ -812,7 +506,7 @@ def vector(self, docnum, fieldname, format_): self._prep_vectors() offset, length = self._vector_extent(docnum, fieldname) if not offset: - raise ValueError(f"Field {fieldname!r} has no vector in docnum {docnum}") + raise Exception(f"Field {fieldname!r} has no vector in docnum {docnum}") m = W3LeafMatcher(self._vpostfile, offset, length, format_, byteids=True) return m @@ -827,50 +521,7 @@ def stored_fields(self, docnum): class W3FieldCursor(base.FieldCursor): - """Cursor for iterating over the terms in a field in a Whoosh 3 index. - - This cursor provides methods for iterating over the terms in a specific field - in a Whoosh 3 index. It allows you to navigate through the terms in the field, - retrieve the text representation of the current term, and access additional - information about the term. - - Attributes: - _tindex (TIndex): The TIndex object representing the index. - _fieldname (str): The name of the field. - _keycoder (callable): The function used to encode the field name and term - into a key. - _keydecoder (callable): The function used to decode a key into the field name - and term. - _fieldobj (Field): The Field object representing the field. - - Methods: - __init__(tindex, fieldname, keycoder, keydecoder, fieldobj): Initializes the - W3FieldCursor object. - first(): Moves the cursor to the first term in the field and returns the text - representation of the term. - find(term): Moves the cursor to the specified term in the field and returns the - text representation of the term. - next(): Moves the cursor to the next term in the field and returns the text - representation of the term. - text(): Returns the text representation of the current term. - term_info(): Returns additional information about the current term. - is_valid(): Returns True if the cursor is currently pointing to a valid term, - False otherwise. - """ - def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj): - """ - Initializes a new instance of the W3FieldCursor class. - - Args: - tindex (TIndex): The TIndex object representing the index. - fieldname (str): The name of the field. - keycoder (callable): The function used to encode the field name and term - into a key. - keydecoder (callable): The function used to decode a key into the field name - and term. - fieldobj (Field): The Field object representing the field. - """ self._tindex = tindex self._fieldname = fieldname self._keycoder = keycoder @@ -887,27 +538,10 @@ def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj): self.next() def first(self): - """ - Moves the cursor to the first term in the field and returns the text - representation of the term. - - Returns: - str: The text representation of the first term in the field. - """ self._pos = self._startpos return self.next() def find(self, term): - """ - Moves the cursor to the specified term in the field and returns the text - representation of the term. - - Args: - term (bytes or str): The term to find in the field. 
- - Returns: - str: The text representation of the found term. - """ if not isinstance(term, bytes): term = self._fieldobj.to_bytes(term) key = self._keycoder(self._fieldname, term) @@ -915,13 +549,6 @@ def find(self, term): return self.next() def next(self): - """ - Moves the cursor to the next term in the field and returns the text - representation of the term. - - Returns: - str: The text representation of the next term in the field. - """ if self._pos is not None: keyrng = self._tindex.key_and_range_at(self._pos) if keyrng is not None: @@ -938,21 +565,9 @@ def next(self): return None def text(self): - """ - Returns the text representation of the current term. - - Returns: - str: The text representation of the current term. - """ return self._text def term_info(self): - """ - Returns additional information about the current term. - - Returns: - W3TermInfo: An object containing additional information about the current term. - """ if self._pos is None: return None @@ -960,59 +575,11 @@ def term_info(self): return W3TermInfo.from_bytes(databytes) def is_valid(self): - """ - Returns True if the cursor is currently pointing to a valid term, False otherwise. - - Returns: - bool: True if the cursor is currently pointing to a valid term, False otherwise. - """ return self._pos is not None class W3TermsReader(base.TermsReader): - """ - A terms reader for the Whoosh3 codec. - - This class is responsible for reading and retrieving terms, term information, and posting lists from the index. - - Parameters: - - codec (Codec): The codec associated with the index. - - dbfile (file-like object): The file-like object representing the terms index. - - length (int): The length of the terms index. - - postfile (file-like object): The file-like object representing the posting lists. - - Attributes: - - _codec (Codec): The codec associated with the index. - - _dbfile (file-like object): The file-like object representing the terms index. - - _tindex (OrderedHashReader): The ordered hash reader for the terms index. - - _fieldmap (dict): A dictionary mapping field names to field numbers. - - _postfile (file-like object): The file-like object representing the posting lists. - - _fieldunmap (list): A list mapping field numbers to field names. - - """ - def __init__(self, codec, dbfile, length, postfile): - """ - Initialize a Whoosh3 object. - - Parameters: - - codec (object): The codec object used for encoding and decoding data. - - dbfile (str): The path to the database file. - - length (int): The length of the database file. - - postfile (str): The path to the postfile. - - This method initializes a Whoosh3 object by setting the codec, database file, - length, postfile, fieldmap, and fieldunmap attributes. The fieldmap is a - dictionary that maps field names to field numbers, and the fieldunmap is a - list that maps field numbers to field names. - - Example usage: - codec = MyCodec() - dbfile = "/path/to/database.db" - length = 1000 - postfile = "/path/to/postfile" - whoosh3 = Whoosh3(codec, dbfile, length, postfile) - """ self._codec = codec self._dbfile = dbfile self._tindex = filetables.OrderedHashReader(dbfile, length) @@ -1024,112 +591,34 @@ def __init__(self, codec, dbfile, length, postfile): self._fieldunmap[num] = fieldname def _keycoder(self, fieldname, tbytes): - """ - Encode the field name and term bytes into a key. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. - - Returns: - - bytes: The encoded key. 
- - """ assert isinstance(tbytes, bytes), f"tbytes={tbytes!r}" fnum = self._fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def _keydecoder(self, keybytes): - """ - Decode the key bytes into the field name and term bytes. - - Parameters: - - keybytes (bytes): The key bytes. - - Returns: - - Tuple[str, bytes]: The field name and term bytes. - - """ fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0] return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:] def _range_for_key(self, fieldname, tbytes): - """ - Get the range of positions in the terms index for the given field name and term bytes. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. - - Returns: - - Tuple[int, int]: The start and end positions in the terms index. - - """ return self._tindex.range_for_key(self._keycoder(fieldname, tbytes)) def __contains__(self, term): - """ - Check if the given term is present in the terms index. - - Parameters: - - term (Tuple[str, bytes]): The field name and term bytes. - - Returns: - - bool: True if the term is present, False otherwise. - - """ return self._keycoder(*term) in self._tindex def indexed_field_names(self): - """ - Get the names of the fields that are indexed. - - Returns: - - KeysView: A view object containing the names of the indexed fields. - - """ return self._fieldmap.keys() def cursor(self, fieldname, fieldobj): - """ - Create a cursor for iterating over the terms in the given field. - - Parameters: - - fieldname (str): The name of the field. - - fieldobj (Field): The field object. - - Returns: - - W3FieldCursor: The cursor object. - - """ tindex = self._tindex coder = self._keycoder decoder = self._keydecoder return W3FieldCursor(tindex, fieldname, coder, decoder, fieldobj) def terms(self): - """ - Get an iterator over all the terms in the index. - - Yields: - - Tuple[str, bytes]: The field name and term bytes. - - """ keydecoder = self._keydecoder return (keydecoder(keybytes) for keybytes in self._tindex.keys()) def terms_from(self, fieldname, prefix): - """ - Get an iterator over the terms in the given field starting from the specified prefix. - - Parameters: - - fieldname (str): The name of the field. - - prefix (bytes): The prefix bytes. - - Yields: - - Tuple[str, bytes]: The field name and term bytes. - - """ prefixbytes = self._keycoder(fieldname, prefix) keydecoder = self._keydecoder return ( @@ -1137,13 +626,6 @@ def terms_from(self, fieldname, prefix): ) def items(self): - """ - Get an iterator over all the (term, term info) pairs in the index. - - Yields: - - Tuple[Tuple[str, bytes], W3TermInfo]: The (field name, term bytes) and term info. - - """ tidecoder = W3TermInfo.from_bytes keydecoder = self._keydecoder return ( @@ -1152,17 +634,6 @@ def items(self): ) def items_from(self, fieldname, prefix): - """ - Get an iterator over the (term, term info) pairs in the given field starting from the specified prefix. - - Parameters: - - fieldname (str): The name of the field. - - prefix (bytes): The prefix bytes. - - Yields: - - Tuple[Tuple[str, bytes], W3TermInfo]: The (field name, term bytes) and term info. - - """ prefixbytes = self._keycoder(fieldname, prefix) tidecoder = W3TermInfo.from_bytes keydecoder = self._keydecoder @@ -1172,20 +643,6 @@ def items_from(self, fieldname, prefix): ) def term_info(self, fieldname, tbytes): - """ - Get the term info for the given field name and term bytes. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. 
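The _keycoder / _keydecoder pair above builds term-index keys from a two-byte field number followed by the raw term bytes, which is what lets terms_from() do per-field prefix scans. A sketch assuming pack_ushort is the usual network-order unsigned short:

```
import struct

def keycoder(fieldmap, fieldname, tbytes):
    fnum = fieldmap.get(fieldname, 65535)   # unknown fields map to the max value
    return struct.pack("!H", fnum) + tbytes

def keydecoder(fieldunmap, keybytes):
    fieldid = struct.unpack("!H", keybytes[:2])[0]
    return fieldunmap[fieldid], keybytes[2:]

fmap = {"title": 0, "body": 1}
funmap = {v: k for k, v in fmap.items()}
key = keycoder(fmap, "body", b"apple")
assert keydecoder(funmap, key) == ("body", b"apple")
```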
- - Returns: - - W3TermInfo: The term info. - - Raises: - - TermNotFound: If the term is not found. - - """ key = self._keycoder(fieldname, tbytes) try: return W3TermInfo.from_bytes(self._tindex[key]) @@ -1193,49 +650,14 @@ def term_info(self, fieldname, tbytes): raise TermNotFound(f"No term {fieldname}:{tbytes!r}") def frequency(self, fieldname, tbytes): - """ - Get the frequency of the given term in the specified field. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. - - Returns: - - int: The term frequency. - - """ datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_weight(self._dbfile, datapos) def doc_frequency(self, fieldname, tbytes): - """ - Get the document frequency of the given term in the specified field. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. - - Returns: - - int: The document frequency. - - """ datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_doc_freq(self._dbfile, datapos) def matcher(self, fieldname, tbytes, format_, scorer=None): - """ - Create a matcher for the given term in the specified field. - - Parameters: - - fieldname (str): The name of the field. - - tbytes (bytes): The term bytes. - - format_ (str): The format of the posting lists. - - scorer (Scorer, optional): The scorer object. - - Returns: - - Matcher: The matcher object. - - """ terminfo = self.term_info(fieldname, tbytes) m = self._codec.postings_reader( self._postfile, terminfo, format_, term=(fieldname, tbytes), scorer=scorer @@ -1243,10 +665,6 @@ def matcher(self, fieldname, tbytes, format_, scorer=None): return m def close(self): - """ - Close the terms reader and associated resources. - - """ self._tindex.close() self._postfile.close() @@ -1258,13 +676,6 @@ class W3PostingsWriter(base.PostingsWriter): """This object writes posting lists to the postings file. It groups postings into blocks and tracks block level statistics to makes it easier to skip through the postings. - - Parameters: - - postfile (file-like object): The file-like object to write the posting lists to. - - blocklimit (int): The maximum number of postings to buffer before writing them to the file. - - byteids (bool, optional): Whether the IDs should be stored as bytes or integers. Defaults to False. - - compression (int, optional): The compression level to use. Defaults to 3. - - inlinelimit (int, optional): The maximum number of postings to inline into the terminfo object. Defaults to 1. """ def __init__( @@ -1281,26 +692,13 @@ def __init__( self._terminfo = None def written(self): - """Check if any blocks have been written to the file. - - Returns: - bool: True if blocks have been written, False otherwise. - """ return self._blockcount > 0 def start_postings(self, format_, terminfo): - """Start a new term. - - Parameters: - - format_ (formats.Format): The format object for the term. - - terminfo (Terminfo): The terminfo object for the term. - - Raises: - ValueError: If called while already in a term. - """ + # Start a new term if self._terminfo: # If self._terminfo is not None, that means we are already in a term - raise ValueError("Called start in a term") + raise Exception("Called start in a term") assert isinstance(format_, formats.Format) self._format = format_ @@ -1314,17 +712,9 @@ def start_postings(self, format_, terminfo): self._startoffset = self._postfile.tell() def add_posting(self, id_, weight, vbytes, length=None): - """Add a posting to the buffered block. 
+ # Add a posting to the buffered block - Parameters: - - id_ (str or int): The ID of the posting. - - weight (int or float): The weight of the posting. - - vbytes (bytes): The encoded payload of the posting. - - length (int, optional): The length of the field. Defaults to None. - - Raises: - AssertionError: If the types of the parameters are incorrect. - """ + # If the number of buffered postings == the block limit, write out the # buffered block and reset before adding this one if len(self._ids) >= self._blocklimit: self._write_block() @@ -1353,19 +743,9 @@ def add_posting(self, id_, weight, vbytes, length=None): self._maxlength = length def finish_postings(self): - """Finish writing the postings for the term. - - If there are fewer than "inlinelimit" postings in this posting list, - the postings are inlined into the terminfo object instead of writing them to the posting file. - - Returns: - Terminfo: The current terminfo object. - - Raises: - AssertionError: If the types of the parameters are incorrect. - """ terminfo = self._terminfo - + # If we have fewer than "inlinelimit" postings in this posting list, + # "inline" the postings into the terminfo instead of writing them to # the posting file if not self.written() and len(self) < self._inlinelimit: terminfo.add_block(self) @@ -1384,7 +764,8 @@ def finish_postings(self): return terminfo def _new_block(self): - """Reset the block buffer.""" + # Reset block buffer + # List of IDs (docnums for regular posting list, terms for vector PL) self._ids = [] if self._byteids else array("I") # List of weights @@ -1397,11 +778,8 @@ def _new_block(self): self._maxweight = 0 def _write_block(self, last=False): - """Write the buffered block to the postings file. + # Write the buffered block to the postings file - Parameters: - - last (bool, optional): Whether this is the last block. Defaults to False. - """ # If this is the first block, write a small header first if not self._blockcount: self._postfile.write(WHOOSH3_HEADER_MAGIC) @@ -1418,8 +796,7 @@ def _write_block(self, last=False): if len(databytes) < 20: comp = 0 # Compress the pickle (if self._compression > 0) - if self._compression > 0: - comp = self._compression + comp = self._compression if comp: databytes = zlib.compress(databytes, comp) @@ -1463,15 +840,18 @@ def _write_block(self, last=False): self._new_block() # Methods to reduce the byte size of the various lists + def _mini_ids(self): - """Minify the IDs.""" + # Minify IDs + ids = self._ids if not self._byteids: ids = delta_encode(ids) return tuple(ids) def _mini_weights(self): - """Minify the weights.""" + # Minify weights + weights = self._weights if all(w == 1.0 for w in weights): @@ -1482,7 +862,8 @@ def _mini_weights(self): return tuple(weights) def _mini_values(self): - """Minify the values.""" + # Minify values + fixedsize = self._format.fixed_value_size() values = self._values @@ -1495,117 +876,32 @@ def _mini_values(self): return vs # Block stats methods - def __len__(self): - """Return the number of unwritten buffered postings. - Returns: - int: The number of unwritten buffered postings. - """ + def __len__(self): + # Returns the number of unwritten buffered postings return len(self._ids) def min_id(self): - """Return the first ID in the buffered block. - - Returns: - str or int: The first ID in the buffered block. - """ + # First ID in the buffered block return self._ids[0] def max_id(self): - """Return the last ID in the buffered block. - - Returns: - str or int: The last ID in the buffered block. 
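Two details of the postings writer above are worth illustrating: blocks are flushed whenever the buffer reaches blocklimit, and before writing, document IDs are delta-encoded while an all-1.0 weight list is dropped entirely (_mini_ids / _mini_weights). A condensed sketch of both, under those assumptions:

```
def delta_encode(ids):
    prev = 0
    for i in ids:
        yield i - prev
        prev = i

def minify_block(ids, weights):
    mini_ids = tuple(delta_encode(ids))
    mini_weights = None if all(w == 1.0 for w in weights) else tuple(weights)
    return mini_ids, mini_weights

def add_postings(postings, blocklimit=4):
    blocks, ids, weights = [], [], []
    for docnum, weight in postings:
        if len(ids) >= blocklimit:           # flush a full buffer
            blocks.append(minify_block(ids, weights))
            ids, weights = [], []
        ids.append(docnum)
        weights.append(weight)
    if ids:
        blocks.append(minify_block(ids, weights))
    return blocks

blocks = add_postings([(2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0), (30, 2.0)], blocklimit=4)
assert blocks[0] == ((2, 3, 4, 1), None)     # deltas, weights elided
assert blocks[1] == ((30,), (2.0,))
```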
- """ + # Last ID in the buffered block return self._ids[-1] def min_length(self): - """Return the shortest field length in the buffered block. - - Returns: - int or None: The shortest field length in the buffered block. - """ + # Shortest field length in the buffered block return self._minlength def max_length(self): - """Return the longest field length in the buffered block. - - Returns: - int: The longest field length in the buffered block. - """ + # Longest field length in the buffered block return self._maxlength def max_weight(self): - """Return the highest weight in the buffered block. - - Returns: - int or float: The highest weight in the buffered block. - """ + # Highest weight in the buffered block return self._maxweight -class W3LeafMatcher(LeafMatcher): - """Reads on-disk postings from the postings file and presents the - :class:`whoosh.matching.Matcher` interface. - - Parameters: - - postfile (file-like object): The file-like object representing the postings file. - - startoffset (int): The starting offset of the postings in the file. - - length (int): The length of the postings. - - format_ (CodecFormat): The format of the postings. - - term (bytes, optional): The term associated with the postings. Defaults to None. - - byteids (bool, optional): Whether the IDs in the postings are stored as bytes. Defaults to None. - - scorer (Scorer, optional): The scorer to use for scoring the postings. Defaults to None. - - Attributes: - - _postfile (file-like object): The file-like object representing the postings file. - - _startoffset (int): The starting offset of the postings in the file. - - _length (int): The length of the postings. - - format (CodecFormat): The format of the postings. - - _term (bytes): The term associated with the postings. - - _byteids (bool): Whether the IDs in the postings are stored as bytes. - - scorer (Scorer): The scorer to use for scoring the postings. - - _fixedsize (int): The fixed size of the values in the postings. - - _baseoffset (int): The base offset of the postings (start of postings, after the header). - - _blocklength (int): The length of the current block of postings. - - _maxid (int): The maximum ID in the current block of postings. - - _maxweight (float): The maximum weight in the current block of postings. - - _compression (bool): Whether the block of postings is compressed. - - _minlength (int): The minimum length of the values in the current block of postings. - - _maxlength (int): The maximum length of the values in the current block of postings. - - _lastblock (bool): Whether the current block of postings is the last block. - - _atend (bool): Whether the matcher has reached the end of the postings. - - _data (tuple): The data tuple of the current block of postings. - - _ids (tuple): The IDs in the current block of postings. - - _weights (array): The weights in the current block of postings. - - _values (tuple): The values in the current block of postings. - - _i (int): The current position in the block of postings. - - Methods: - - _read_header(): Reads the header tag at the start of the postings. - - reset(): Resets the matcher to read the first block of postings. - - _goto(position): Reads the posting block at the given position. - - _next_block(): Moves to the next block of postings. - - _skip_to_block(skipwhile): Skips blocks as long as the skipwhile() function returns True. - - is_active(): Checks if the matcher is active (not at the end of the postings). - - id(): Returns the current ID (docnum for regular postings, term for vector). 
- - weight(): Returns the weight for the current posting. - - value(): Returns the value for the current posting. - - next(): Moves to the next posting. - - skip_to(targetid): Skips to the next ID equal to or greater than the given target ID. - - skip_to_quality(minquality): Skips blocks until finding one that might exceed the given minimum quality. - - block_min_id(): Returns the minimum ID in the current block of postings. - - block_max_id(): Returns the maximum ID in the current block of postings. - - block_min_length(): Returns the minimum length of the values in the current block of postings. - - block_max_length(): Returns the maximum length of the values in the current block of postings. - - block_max_weight(): Returns the maximum weight in the current block of postings. - - _read_data(): Loads the block data tuple from disk. - - _read_ids(): Loads the IDs from the block data. - - _read_weights(): Loads the weights from the block data. - - _read_values(): Loads the values from the block data. - """ - - class W3LeafMatcher(LeafMatcher): """Reads on-disk postings from the postings file and presents the :class:`whoosh.matching.Matcher` interface. @@ -1621,28 +917,6 @@ def __init__( byteids=None, scorer=None, ): - """ - Initialize a Whoosh3 object. - - Args: - postfile (file-like object): The file-like object representing the postings file. - startoffset (int): The starting offset of the postings in the file. - length (int): The length of the postings in bytes. - format_ (CodecFormat): The codec format used for encoding and decoding the postings. - term (bytes, optional): The term associated with the postings. Defaults to None. - byteids (list of int, optional): The byte IDs associated with the postings. Defaults to None. - scorer (Scorer, optional): The scorer used for scoring the postings. Defaults to None. - - Attributes: - _postfile (file-like object): The file-like object representing the postings file. - _startoffset (int): The starting offset of the postings in the file. - _length (int): The length of the postings in bytes. - format (CodecFormat): The codec format used for encoding and decoding the postings. - _term (bytes): The term associated with the postings. - _byteids (list of int): The byte IDs associated with the postings. - scorer (Scorer): The scorer used for scoring the postings. - _fixedsize (int): The fixed size of each posting value. - """ self._postfile = postfile self._startoffset = startoffset self._length = length @@ -1658,45 +932,19 @@ def __init__( self.reset() def _read_header(self): - """ - Reads and verifies the header of the postings file. - - This method seeks to the start of the postings file, reads the header tag, and verifies its correctness. - It also sets the base offset to the current position in the file, which represents the start of the postings - after the header. - - Raises: - ValueError: If the header tag is incorrect. - - Usage: - Call this method to read and verify the header of the postings file before accessing the postings data. - - """ + # Seek to the start of the postings and check the header tag postfile = self._postfile postfile.seek(self._startoffset) magic = postfile.read(4) if magic != WHOOSH3_HEADER_MAGIC: - raise ValueError(f"Block tag error {magic!r}") + raise Exception(f"Block tag error {magic!r}") # Remember the base offset (start of postings, after the header) self._baseoffset = postfile.tell() def reset(self): - """ - Reset the codec's internal state. 
- - This method resets the block stats, including block length, maximum ID, maximum weight, - compression, minimum length, and maximum length. It also resets the flags indicating the - last block and whether the codec is at the end. - - After resetting the internal state, the method consumes the first block by calling the - `_goto` method with the base offset. - - Usage: - codec.reset() - - """ + # Reset block stats self._blocklength = None self._maxid = None self._maxweight = None @@ -1710,33 +958,8 @@ def reset(self): self._goto(self._baseoffset) def _goto(self, position): - """ - Move the pointer to the given position in the posting file and load the block data. - - Args: - position (int): The position in the posting file to move the pointer to. - - Returns: - None - - Raises: - None - - This method is responsible for moving the pointer to the specified position in the posting file - and loading the block data from that position. It performs the following steps: - 1. Resets the block data attributes to None. - 2. Resets the pointer into the block to 0. - 3. Seeks to the start of the block in the posting file. - 4. Reads the length of the block. - 5. If the length is negative, sets the `_lastblock` attribute to True and makes the length positive. - 6. Remembers the offset of the next block. - 7. Reads the pickled block info tuple. - 8. Remembers the offset of the block's data. - 9. Decomposes the info tuple to set the current block info. - - Note: - This method assumes that the posting file is already open and assigned to the `_postfile` attribute. - """ + # Read the posting block at the given position + postfile = self._postfile # Reset block data -- we'll lazy load the data from the new block as @@ -1777,24 +1000,10 @@ def _goto(self, position): self._maxlength = byte_to_length(mxlen) def _next_block(self): - """ - Move to the next block in the postings. - - This method is responsible for advancing the cursor to the next block in the postings. - It handles cases where the cursor is already at the end, reached the end of the postings, - or needs to move to the next block. - - Raises: - ValueError: If there is no next block. - - Usage: - Call this method to move the cursor to the next block in the postings. - - """ if self._atend: # We were already at the end, and yet somebody called _next_block() # again, so something is wrong somewhere - raise ValueError("No next block") + raise Exception("No next block") elif self._lastblock: # Reached the end of the postings self._atend = True @@ -1803,30 +1012,8 @@ def _next_block(self): self._goto(self._nextoffset) def _skip_to_block(self, skipwhile): - """ - Skips blocks in the codec as long as the skipwhile() function returns True. - - Parameters: - - skipwhile (function): A function that takes no arguments and returns a boolean value. - It is called at each block to determine whether to skip to the next block or not. - - Returns: - - skipped (int): The number of blocks skipped. - - Notes: - - This method is used internally by the codec to skip blocks based on a condition. - - The skipwhile() function should return True if the current block should be skipped, - and False if the current block should not be skipped. 
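# Hedged sketch of the "negative length marks the last block" convention that
# the _goto() description above mentions. The exact on-disk layout is assumed
# here purely for illustration: a signed 4-byte big-endian block length whose
# sign doubles as a last-block flag.
import io
import struct

def read_block_header(f):
    (length,) = struct.unpack("!i", f.read(4))
    is_last = length < 0
    return abs(length), is_last

if __name__ == "__main__":
    buf = io.BytesIO(struct.pack("!i", 16) + struct.pack("!i", -8))
    print(read_block_header(buf))   # (16, False) -- more blocks follow
    print(read_block_header(buf))   # (8, True)   -- this is the last block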
- - Example usage: - ``` - def skip_condition(): - # Skip blocks until a certain condition is met - return some_condition() - - skipped_blocks = _skip_to_block(skip_condition) - ``` - """ + # Skip blocks as long as the skipwhile() function returns True + skipped = 0 while self.is_active() and skipwhile(): self._next_block() @@ -1834,26 +1021,10 @@ def skip_condition(): return skipped def is_active(self): - """ - Check if the current position in the file is active. - - Returns: - bool: True if the current position is active, False otherwise. - """ return not self._atend and self._i < self._blocklength def id(self): - """ - Get the current ID. - - This method returns the current ID, which can be either the docnum for regular postings or the term for vectors. - - Returns: - int: The current ID. - - Raises: - ValueError: If the block IDs have not been loaded yet. - """ + # Get the current ID (docnum for regular postings, term for vector) # If we haven't loaded the block IDs yet, load them now if self._ids is None: @@ -1862,19 +1033,8 @@ def id(self): return self._ids[self._i] def weight(self): - """ - Get the weight for the current posting. - - This method retrieves the weight associated with the current posting. - If the block weights have not been loaded yet, it loads them before - returning the weight. - - Returns: - float: The weight of the current posting. + # Get the weight for the current posting - Raises: - Exception: If the block weights cannot be loaded. - """ # If we haven't loaded the block weights yet, load them now if self._weights is None: self._read_weights() @@ -1882,17 +1042,8 @@ def weight(self): return self._weights[self._i] def value(self): - """ - Get the value for the current posting. + # Get the value for the current posting - If the block values have not been loaded yet, this method will load them. - - Returns: - The value for the current posting. - - Raises: - IndexError: If the current posting index is out of range. - """ # If we haven't loaded the block values yet, load them now if self._values is None: self._read_values() @@ -1900,15 +1051,8 @@ def value(self): return self._values[self._i] def next(self): - """ - Move to the next posting. + # Move to the next posting - This method increments the in-block pointer by 1. If the pointer reaches the end of the block, - it moves to the next block and returns True. Otherwise, it returns False. - - Returns: - bool: True if the pointer reached the end of the block and moved to the next block, False otherwise. - """ # Increment the in-block pointer self._i += 1 # If we reached the end of the block, move to the next block @@ -1919,23 +1063,8 @@ def next(self): return False def skip_to(self, targetid): - """ - Skip to the next ID equal to or greater than the given target ID. - - Args: - targetid (int): The target ID to skip to. + # Skip to the next ID equal to or greater than the given target ID - Raises: - ReadTooFar: If the skip operation is attempted when the reader is not active. - - Notes: - - If the reader is already at or past the target ID, no skipping is performed. - - The method skips to the block that would contain the target ID. - - If the target ID is greater than the maximum ID in the current block, the method - skips to the next block that would contain the target ID. - - The method iterates through the IDs in the block until it finds or passes the target ID. 
- - """ if not self.is_active(): raise ReadTooFar @@ -1954,21 +1083,9 @@ def skip_to(self, targetid): self.next() def skip_to_quality(self, minquality): - """ - Skips to the next block with a quality greater than or equal to the given minimum quality. - - Parameters: - - minquality (float): The minimum quality threshold. + # Skip blocks until we find one that might exceed the given minimum + # quality - Returns: - - int: The number of blocks skipped. - - Notes: - - This method is used to skip blocks in a search index until a block with a quality greater than or equal to the given minimum quality is found. - - The block quality is determined by the `block_quality` attribute of the current object. - - If the quality of the current block is already higher than the minimum quality, no blocks are skipped. - - Blocks are skipped until a block with a quality greater than or equal to the minimum quality is found. - """ block_quality = self.block_quality # If the quality of this block is already higher than the minimum, @@ -1981,109 +1098,25 @@ def skip_to_quality(self, minquality): return self._skip_to_block(lambda: block_quality() <= minquality) def block_min_id(self): - """ - Returns the minimum ID of the block. - - This method retrieves the minimum ID of the block. If the IDs have not been - read yet, it reads them from the source. - - Returns: - int: The minimum ID of the block. - - """ if self._ids is None: self._read_ids() return self._ids[0] def block_max_id(self): - """ - Returns the maximum ID of the block. - - This method returns the maximum ID of the block. The ID represents the highest - value assigned to a block. - - Returns: - int: The maximum ID of the block. - - Example: - >>> codec = WhooshCodec() - >>> codec.block_max_id() - 10 - """ return self._maxid def block_min_length(self): - """ - Returns the minimum length of a block. - - This method returns the minimum length of a block used by the codec. - The block length is an important parameter that affects the indexing - and searching process. It determines the size of the data chunks that - are read and written during these operations. - - Returns: - int: The minimum length of a block. - - """ return self._minlength def block_max_length(self): - """ - Returns the maximum length of a block in the codec. - - This method returns the maximum length of a block in the codec. A block is a unit of data used in the codec's - internal operations. The maximum length of a block can affect the performance and memory usage of the codec. - - Returns: - int: The maximum length of a block in the codec. - - Example: - >>> codec = WhooshCodec() - >>> codec.block_max_length() - 4096 - - Note: - The value returned by this method is determined by the codec implementation and may vary between different - codecs. - - """ return self._maxlength def block_max_weight(self): - """ - Returns the maximum weight of a block in the codec. - - This method returns the maximum weight that a block can have in the codec. - The weight of a block is a measure of its importance or relevance. - - Returns: - int: The maximum weight of a block. - - Example: - >>> codec = WhooshCodec() - >>> codec.block_max_weight() - 100 - - Note: - The maximum weight can be used to determine the importance of a block - when performing operations such as scoring or ranking. - """ return self._maxweight def _read_data(self): - """ - Reads and loads the block data tuple from disk. 
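# Simplified sketch of the two-level skip described above: whole blocks are
# skipped while the block's maximum ID is still below the target, then the
# matcher steps through postings inside the first block that can contain it.
# The flat list-of-lists representation is an assumption made for the example.
def skip_to(blocks, targetid):
    """blocks: list of sorted ID lists; returns (block_index, posting_index) or None."""
    for b, ids in enumerate(blocks):
        if ids and ids[-1] >= targetid:          # first block that can contain targetid
            for i, docid in enumerate(ids):
                if docid >= targetid:
                    return b, i
    return None                                  # ran past the end of the postings

if __name__ == "__main__":
    print(skip_to([[1, 4, 9], [12, 15, 20], [33, 40]], 14))   # -> (1, 1)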
+ # Load block data tuple from disk - This method reads the block data tuple from the disk, decompresses it if necessary, - and unpickles the data tuple. The unpickled data tuple is then saved in the `_data` - attribute of the object. - - Returns: - None - - Raises: - None - """ datalen = self._nextoffset - self._dataoffset b = self._postfile.get(self._dataoffset, datalen) @@ -2095,21 +1128,6 @@ def _read_data(self): self._data = loads(b) def _read_ids(self): - """ - Reads and initializes the document IDs from disk. - - This method loads the document IDs from disk if they haven't been loaded yet. - It then de-minifies the IDs if necessary and sets the `_ids` attribute. - - Returns: - None - - Raises: - Any exceptions that occur during the data loading process. - - Usage: - Call this method to load and initialize the document IDs before using them. - """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -2122,21 +1140,6 @@ def _read_ids(self): self._ids = ids def _read_weights(self): - """ - Reads and initializes the weights for the index. - - If the data has not been loaded from disk yet, it loads it first. - The weights are then de-minified and stored in the `_weights` attribute. - - Returns: - None - - Raises: - None - - Usage: - _read_weights() - """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -2152,27 +1155,6 @@ def _read_weights(self): self._weights = weights def _read_values(self): - """ - Reads and de-minifies the values from the data. - - If the data has not been loaded from disk yet, it will be loaded before processing. - - Parameters: - None - - Returns: - None - - Raises: - None - - Usage: - Call this method to read and de-minify the values from the data. - It is recommended to call this method before accessing the values. - - Example: - _read_values() - """ # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() @@ -2192,50 +1174,26 @@ def _read_values(self): # Term info implementation -class W3TermInfo(TermInfo): - """ - Represents term information for the Whoosh3 codec. - This class is responsible for storing and manipulating term information such as - weights, document frequencies, lengths, and IDs. It provides methods to add blocks - of information, set extents, inline postings, and convert the term info to bytes. - - Attributes: - _struct (struct.Struct): The struct format used to pack and unpack the term info. - _offset (int): The offset of the term info in the posting file. - _length (int): The length of the term info in the posting file. - _inlined (tuple): A tuple containing the inlined postings (IDs, weights, values). - - """ +class W3TermInfo(TermInfo): + # B | Flags + # f | Total weight + # I | Total doc freq + # B | Min length (encoded as byte) + # B | Max length (encoded as byte) + # f | Max weight + # I | Minimum (first) ID + # I | Maximum (last) ID _struct = struct.Struct("!BfIBBfII") def __init__(self, *args, **kwargs): - """ - Initializes a new instance of the W3TermInfo class. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - """ TermInfo.__init__(self, *args, **kwargs) self._offset = None self._length = None self._inlined = None def add_block(self, block): - """ - Adds a block of information to the term info. 
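# The fixed-size header described by the field table above can be packed and
# unpacked with the struct module. This is a standalone sketch; the keyword
# names used below are invented for readability and are not Whoosh's API.
import struct

TERMINFO = struct.Struct("!BfIBBfII")

def pack_terminfo(flags, weight, docfreq, minlen_byte, maxlen_byte, maxweight, minid, maxid):
    return TERMINFO.pack(flags, weight, docfreq, minlen_byte, maxlen_byte, maxweight, minid, maxid)

def unpack_terminfo(data):
    return TERMINFO.unpack(data[:TERMINFO.size])

if __name__ == "__main__":
    blob = pack_terminfo(0, 12.5, 3, 1, 255, 4.0, 7, 42)
    print(len(blob), unpack_terminfo(blob))      # 23 bytes -> the eight values back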
- - This method updates the total weight, document frequency, minimum length, - maximum length, maximum weight, minimum ID, and maximum ID based on the - information in the given block. - - Args: - block (Block): The block of information to add. - - """ self._weight += sum(block._weights) self._df += len(block) @@ -2252,72 +1210,22 @@ def add_block(self, block): self._maxid = block.max_id() def set_extent(self, offset, length): - """ - Sets the extent of the term info in the posting file. - - This method sets the offset and length of the term info in the posting file. - - Args: - offset (int): The offset of the term info. - length (int): The length of the term info. - - """ self._offset = offset self._length = length def extent(self): - """ - Returns the extent of the term info in the posting file. - - Returns: - tuple: A tuple containing the offset and length of the term info. - - """ return self._offset, self._length def set_inlined(self, ids, weights, values): - """ - Sets the inlined postings for the term info. - - This method sets the inlined postings, which are represented as tuples of IDs, - weights, and values. - - Args: - ids (tuple): A tuple of IDs. - weights (tuple): A tuple of weights. - values (tuple): A tuple of values. - - """ self._inlined = (tuple(ids), tuple(weights), tuple(values)) def is_inlined(self): - """ - Checks if the term info has inlined postings. - - Returns: - bool: True if the term info has inlined postings, False otherwise. - - """ return self._inlined is not None def inlined_postings(self): - """ - Returns the inlined postings for the term info. - - Returns: - tuple: A tuple containing the inlined postings (IDs, weights, values). - - """ return self._inlined def to_bytes(self): - """ - Converts the term info to bytes. - - Returns: - bytes: The term info encoded as bytes. - - """ isinlined = self.is_inlined() # Encode the lengths as 0-255 values @@ -2350,16 +1258,6 @@ def to_bytes(self): @classmethod def from_bytes(cls, s): - """ - Creates a new W3TermInfo instance from bytes. - - Args: - s (bytes): The bytes representing the term info. - - Returns: - W3TermInfo: A new instance of the W3TermInfo class. - - """ st = cls._struct vals = st.unpack(s[: st.size]) terminfo = cls() @@ -2387,47 +1285,14 @@ def from_bytes(cls, s): @classmethod def read_weight(cls, dbfile, datapos): - """ - Reads the weight from the database file. - - Args: - dbfile (DatabaseFile): The database file. - datapos (int): The position of the weight in the file. - - Returns: - float: The weight. - - """ return dbfile.get_float(datapos + 1) @classmethod def read_doc_freq(cls, dbfile, datapos): - """ - Reads the document frequency from the database file. - - Args: - dbfile (DatabaseFile): The database file. - datapos (int): The position of the document frequency in the file. - - Returns: - int: The document frequency. - - """ return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) @classmethod def read_min_and_max_length(cls, dbfile, datapos): - """ - Reads the minimum and maximum length from the database file. - - Args: - dbfile (DatabaseFile): The database file. - datapos (int): The position of the lengths in the file. - - Returns: - tuple: A tuple containing the minimum and maximum length. 
- - """ lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE ml = byte_to_length(dbfile.get_byte(lenpos)) xl = byte_to_length(dbfile.get_byte(lenpos + 1)) @@ -2435,43 +1300,14 @@ def read_min_and_max_length(cls, dbfile, datapos): @classmethod def read_max_weight(cls, dbfile, datapos): - """ - Reads the maximum weight from the database file. - - Args: - dbfile (DatabaseFile): The database file. - datapos (int): The position of the maximum weight in the file. - - Returns: - float: The maximum weight. - - """ weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 return dbfile.get_float(weightspos) # Segment implementation -class W3Segment(base.Segment): - """ - Represents a segment in the Whoosh index. - - Args: - codec (Codec): The codec used for encoding and decoding the segment. - indexname (str): The name of the index. - doccount (int, optional): The number of documents in the segment. Defaults to 0. - segid (str, optional): The unique identifier for the segment. If not provided, a random ID will be generated. - deleted (set, optional): A set of deleted document numbers. Defaults to None. - - Attributes: - indexname (str): The name of the index. - segid (str): The unique identifier for the segment. - compound (bool): Indicates whether the segment is a compound segment. - _codec (Codec): The codec used for encoding and decoding the segment. - _doccount (int): The number of documents in the segment. - _deleted (set): A set of deleted document numbers. - """ +class W3Segment(base.Segment): def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): self.indexname = indexname self.segid = self._random_id() if segid is None else segid @@ -2482,69 +1318,26 @@ def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): self.compound = False def codec(self, **kwargs): - """ - Returns the codec used for encoding and decoding the segment. - - Returns: - Codec: The codec used for the segment. - - """ return self._codec def set_doc_count(self, dc): - """ - Sets the number of documents in the segment. - - Args: - dc (int): The number of documents. - - """ self._doccount = dc def doc_count_all(self): - """ - Returns the total number of documents in the segment. - - Returns: - int: The total number of documents. - - """ return self._doccount def deleted_count(self): - """ - Returns the number of deleted documents in the segment. - - Returns: - int: The number of deleted documents. - - """ if self._deleted is None: return 0 return len(self._deleted) def deleted_docs(self): - """ - Returns an iterator over the deleted document numbers in the segment. - - Returns: - Iterator[int]: An iterator over the deleted document numbers. - - """ if self._deleted is None: return () else: return iter(self._deleted) def delete_document(self, docnum, delete=True): - """ - Marks a document as deleted in the segment. - - Args: - docnum (int): The document number to delete. - delete (bool, optional): Whether to delete the document. Defaults to True. - - """ if delete: if self._deleted is None: self._deleted = set() @@ -2553,16 +1346,6 @@ def delete_document(self, docnum, delete=True): self._deleted.clear(docnum) def is_deleted(self, docnum): - """ - Checks if a document is marked as deleted in the segment. - - Args: - docnum (int): The document number to check. - - Returns: - bool: True if the document is marked as deleted, False otherwise. 
- - """ if self._deleted is None: return False return docnum in self._deleted diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py index d858b289..1ba30e45 100644 --- a/src/whoosh/fields.py +++ b/src/whoosh/fields.py @@ -166,11 +166,12 @@ def index(self, value, **kwargs): """ if not self.format: - raise ValueError( - f"{self.__class__.__name__} field {self} cannot index without a format" + raise Exception( + "%s field %r cannot index without a format" + % (self.__class__.__name__, self) ) if not isinstance(value, (str, list, tuple)): - raise ValueError(f"{value} is not unicode or sequence") + raise ValueError(f"{value!r} is not unicode or sequence") assert isinstance(self.format, formats.Format) if "mode" not in kwargs: @@ -189,7 +190,7 @@ def tokenize(self, value, **kwargs): """ if not self.analyzer: - raise ValueError(f"{self.__class__} field has no analyzer") + raise Exception(f"{self.__class__} field has no analyzer") return self.analyzer(value, **kwargs) def process_text(self, qstring, mode="", **kwargs): @@ -202,7 +203,7 @@ def process_text(self, qstring, mode="", **kwargs): """ if not self.format: - raise ValueError(f"{self} field has no format") + raise Exception(f"{self} field has no format") return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) # Conversion @@ -366,38 +367,9 @@ def clean(self): # Events def on_add(self, schema, fieldname): - """ - This method is called when a field is added to a schema. - - Parameters: - schema (Schema): The schema object to which the field is being added. - fieldname (str): The name of the field being added. - - Returns: - None - - Notes: - - This method can be overridden in subclasses to perform custom actions when a field is added. - - By default, this method does nothing. - """ pass def on_remove(self, schema, fieldname): - """ - This method is called when a field is removed from the schema. - - Parameters: - schema (Schema): The schema object from which the field is being removed. - fieldname (str): The name of the field being removed. - - Returns: - None - - Notes: - - This method can be overridden in a custom field class to perform any necessary cleanup or - additional actions when a field is removed from the schema. - - By default, this method does nothing. - """ pass @@ -515,7 +487,7 @@ def __init__( document. 
""" - self.analyzer = analyzer or analysis.id_analyzer() + self.analyzer = analyzer or analysis.IDAnalyzer() # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.stored = stored @@ -540,7 +512,7 @@ def __init__(self, stored=False, unique=False, expression=None, field_boost=1.0) """ expression = expression or re.compile(r"[^\r\n\t ,;]+") - self.analyzer = analysis.regex_analyzer(expression=expression) + self.analyzer = analysis.RegexAnalyzer(expression=expression) # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.stored = stored @@ -633,7 +605,7 @@ def __init__( raise TypeError(f"Can't use {numtype!r} as a type, use int or float") # Sanity check if numtype is float and decimal_places: - raise ValueError( + raise Exception( "A float type and decimal_places argument %r are " "incompatible" % decimal_places ) @@ -645,7 +617,7 @@ def __init__( bits = 64 # Floats are converted to 64 bit ints else: if bits not in intsizes: - raise ValueError(f"Invalid bits {bits!r}, use 8, 16, 32, or 64") + raise Exception(f"Invalid bits {bits!r}, use 8, 16, 32, or 64") # Type code for the *sortable* representation self.sortable_typecode = intcodes[intsizes.index(bits)] self._struct = struct.Struct(">" + str(self.sortable_typecode)) @@ -657,7 +629,7 @@ def __init__( self.decimal_places = decimal_places self.shift_step = shift_step self.signed = signed - self.analyzer = analysis.id_analyzer() + self.analyzer = analysis.IDAnalyzer() # Don't store any information other than the doc ID self.format = formats.Existence(field_boost=field_boost) self.min_value, self.max_value = self._min_max() @@ -669,8 +641,8 @@ def __init__( else: default = NaN elif not self.is_valid(default): - raise ValueError( - f"The default {default} is not a valid number for this field" + raise Exception( + f"The default {default!r} is not a valid number for this field" ) self.default = default @@ -881,11 +853,11 @@ def prepare_datetime(self, x): elif isinstance(x, bytes): return x else: - raise ValueError(f"{x} is not a datetime") + raise Exception(f"{x!r} is not a datetime") def to_column_value(self, x): if isinstance(x, bytes): - raise ValueError(f"{x} is not a datetime") + raise Exception(f"{x!r} is not a datetime") if isinstance(x, (list, tuple)): x = x[0] return self.prepare_datetime(x) @@ -925,7 +897,7 @@ def _parse_datestring(self, qstring): at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise ValueError(f"{qstring} is not a parseable date") + raise Exception(f"{qstring!r} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): @@ -1040,14 +1012,6 @@ class STORED(FieldType): stored = True def __init__(self): - """ - Initialize a new instance of the class. - - This method is called when a new object of the class is created. It does not take any arguments. - - Usage: - field = Field() - """ pass @@ -1103,7 +1067,7 @@ def __init__( """ if not analyzer: - analyzer = analysis.keyword_analyzer(lowercase=lowercase, commas=commas) + analyzer = analysis.KeywordAnalyzer(lowercase=lowercase, commas=commas) self.analyzer = analyzer # Store field lengths and weights along with doc ID @@ -1147,7 +1111,7 @@ def __init__( """ :param analyzer: The analysis.Analyzer to use to index the field contents. See the analysis module for more information. If you omit - this argument, the field uses analysis.standard_analyzer. 
+ this argument, the field uses analysis.StandardAnalyzer. :param phrase: Whether the store positional information to allow phrase searching. :param chars: Whether to store character ranges along with positions. @@ -1165,7 +1129,7 @@ def __init__( column type. If you pass a :class:`whoosh.columns.Column` instance instead of True, the field will use the given column type. :param lang: automaticaly configure a - :class:`whoosh.analysis.language_analyzer` for the given language. + :class:`whoosh.analysis.LanguageAnalyzer` for the given language. This is ignored if you also specify an ``analyzer``. :param vector: if this value evaluates to true, store a list of the terms in this field in each document. If the value is an instance @@ -1177,9 +1141,9 @@ def __init__( if analyzer: self.analyzer = analyzer elif lang: - self.analyzer = analysis.language_analyzer(lang) + self.analyzer = analysis.LanguageAnalyzer(lang) else: - self.analyzer = analysis.standard_analyzer() + self.analyzer = analysis.StandardAnalyzer() if chars: formatclass = formats.Characters @@ -1300,9 +1264,9 @@ def __init__( if phrase: formatclass = formats.Positions - self.analyzer = analysis.ngram_analyzer(minsize, maxsize) + self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) self.format = formatclass(field_boost=field_boost) - self.analyzer = analysis.ngram_analyzer(minsize, maxsize) + self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) self.stored = stored self.queryor = queryor self.set_sortable(sortable) @@ -1359,7 +1323,7 @@ def __init__( default is to combine N-grams with an And query. """ - self.analyzer = analysis.ngram_word_analyzer(minsize, maxsize, tokenizer, at=at) + self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at) self.format = formats.Frequency(field_boost=field_boost) self.stored = stored self.queryor = queryor @@ -1697,7 +1661,7 @@ def merge_fielddict(d1, d2): field1 = d1.get(name) field2 = d2.get(name) if field1 and field2 and field1 != field2: - raise ValueError(f"Inconsistent field {name}: {field1} != {field2}") + raise Exception(f"Inconsistent field {name!r}: {field1!r} != {field2!r}") out[name] = field1 or field2 return out diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py index a6657abc..26190152 100644 --- a/src/whoosh/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -52,52 +52,9 @@ def memoryview_(source, offset=None, length=None): class CompoundStorage(FileStorage): - """ - CompoundStorage is a class that represents a compound file storage for Whoosh indexes. - It provides methods to read and write files within the compound file. - - Parameters: - - dbfile (file-like object): The file-like object representing the compound file. - - use_mmap (bool, optional): Whether to use memory-mapped file for faster access. Defaults to True. - - basepos (int, optional): The base position in the file. Defaults to 0. - - Attributes: - - readonly (bool): Whether the compound file is read-only. - - is_closed (bool): Whether the compound file is closed. - - _file (file-like object): The file-like object representing the compound file. - - _diroffset (int): The offset of the directory within the compound file. - - _dirlength (int): The length of the directory within the compound file. - - _dir (dict): The directory mapping file names to their offset and length within the compound file. - - _options (dict): Additional options associated with the compound file. - - _locks (dict): A dictionary of locks for file-level synchronization. 
- - _source (mmap.mmap or None): The memory-mapped object representing the compound file, if mmap is used. - - Methods: - - __init__(self, dbfile, use_mmap=True, basepos=0): Initializes a CompoundStorage object. - - __repr__(self): Returns a string representation of the CompoundStorage object. - - close(self): Closes the compound file. - - range(self, name): Returns the offset and length of a file within the compound file. - - open_file(self, name, *args, **kwargs): Opens a file within the compound file. - - list(self): Returns a list of file names within the compound file. - - file_exists(self, name): Checks if a file exists within the compound file. - - file_length(self, name): Returns the length of a file within the compound file. - - file_modified(self, name): Returns the modification time of a file within the compound file. - - lock(self, name): Returns a lock object for file-level synchronization. - - assemble(dbfile, store, names, **options): Assembles a compound file from multiple files. - - write_dir(dbfile, basepos, directory, options=None): Writes the directory and options to the compound file. - """ - readonly = True def __init__(self, dbfile, use_mmap=True, basepos=0): - """ - Initializes a CompoundStorage object. - - Parameters: - - dbfile (file-like object): The file-like object representing the compound file. - - use_mmap (bool, optional): Whether to use memory-mapped file for faster access. Defaults to True. - - basepos (int, optional): The base position in the file. Defaults to 0. - """ self._file = dbfile self.is_closed = False @@ -139,19 +96,11 @@ def __init__(self, dbfile, use_mmap=True, basepos=0): self._file = None def __repr__(self): - """ - Returns a string representation of the CompoundStorage object. - """ return f"<{self.__class__.__name__} ({self._name})>" def close(self): - """ - Closes the compound file. - """ if self.is_closed: - raise RuntimeError( - "Already closed" - ) # Replaced generic Exception with RuntimeError + raise Exception("Already closed") self.is_closed = True if self._source: @@ -163,16 +112,6 @@ def close(self): self._file.close() def range(self, name): - """ - Returns the offset and length of a file within the compound file. - - Parameters: - - name (str): The name of the file. - - Returns: - - offset (int): The offset of the file within the compound file. - - length (int): The length of the file. - """ try: fileinfo = self._dir[name] except KeyError: @@ -180,17 +119,6 @@ def range(self, name): return fileinfo["offset"], fileinfo["length"] def open_file(self, name, *args, **kwargs): - """ - Opens a file within the compound file. - - Parameters: - - name (str): The name of the file. - - *args: Additional positional arguments. - - **kwargs: Additional keyword arguments. - - Returns: - - f (file-like object): The file-like object representing the opened file. - """ if self.is_closed: raise StorageError("Storage was closed") @@ -206,74 +134,26 @@ def open_file(self, name, *args, **kwargs): return f def list(self): - """ - Returns a list of file names within the compound file. - """ return list(self._dir.keys()) def file_exists(self, name): - """ - Checks if a file exists within the compound file. - - Parameters: - - name (str): The name of the file. - - Returns: - - exists (bool): True if the file exists, False otherwise. - """ return name in self._dir def file_length(self, name): - """ - Returns the length of a file within the compound file. - - Parameters: - - name (str): The name of the file. 
- - Returns: - - length (int): The length of the file. - """ info = self._dir[name] return info["length"] def file_modified(self, name): - """ - Returns the modification time of a file within the compound file. - - Parameters: - - name (str): The name of the file. - - Returns: - - modified (float): The modification time of the file. - """ info = self._dir[name] return info["modified"] def lock(self, name): - """ - Returns a lock object for file-level synchronization. - - Parameters: - - name (str): The name of the file. - - Returns: - - lock (Lock): The lock object. - """ if name not in self._locks: self._locks[name] = Lock() return self._locks[name] @staticmethod def assemble(dbfile, store, names, **options): - """ - Assembles a compound file from multiple files. - - Parameters: - - dbfile (file-like object): The file-like object representing the compound file. - - store (FileStorage): The file storage object containing the files to be assembled. - - names (list): The list of file names to be assembled. - - **options: Additional options to be associated with the compound file. - """ assert names, names directory = {} @@ -284,7 +164,7 @@ def assemble(dbfile, store, names, **options): # Copy the files into the compound file for name in names: if name.endswith(".toc") or name.endswith(".seg"): - raise ValueError(name) + raise Exception(name) for name in names: offset = dbfile.tell() @@ -299,15 +179,6 @@ def assemble(dbfile, store, names, **options): @staticmethod def write_dir(dbfile, basepos, directory, options=None): - """ - Writes the directory and options to the compound file. - - Parameters: - - dbfile (file-like object): The file-like object representing the compound file. - - basepos (int): The base position in the file. - - directory (dict): The directory mapping file names to their offset and length within the compound file. - - options (dict, optional): Additional options to be associated with the compound file. Defaults to None. - """ options = options or {} dirpos = dbfile.tell() # Remember the start of the directory @@ -323,55 +194,7 @@ def write_dir(dbfile, basepos, directory, options=None): class SubFile: - """ - Represents a subset of a parent file. - - This class provides methods to read and manipulate a subset of a parent file. - It keeps track of the subset's position, length, and name. - - Attributes: - _file (file-like object): The parent file. - _offset (int): The offset of the subset within the parent file. - _length (int): The length of the subset. - _end (int): The end position of the subset. - _pos (int): The current position within the subset. - name (str): The name of the subset. - closed (bool): Indicates whether the subset is closed. - - Methods: - close(): Closes the subset. - subset(position, length, name=None): Creates a new subset from the current subset. - read(size=None): Reads data from the subset. - readline(): Reads a line from the subset. - seek(where, whence=0): Moves the current position within the subset. - tell(): Returns the current position within the subset. - """ - def __init__(self, parentfile, offset, length, name=None): - """ - Initialize a CompoundFile object. - - Args: - parentfile (file-like object): The parent file object that represents the compound file. - offset (int): The offset within the parent file where the compound file starts. - length (int): The length of the compound file in bytes. - name (str, optional): The name of the compound file. Defaults to None. 
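# Rough, self-contained sketch of the compound-file idea that assemble() and
# write_dir() implement above: member files are copied back to back, a pickled
# {name: {"offset", "length"}} directory is appended, and its position is
# patched into a fixed-size header at the front. The exact header layout here
# (8-byte offset, 4-byte length) is an assumption for the example.
import io
import pickle
import struct

def assemble(out, members):
    base = out.tell()
    out.write(struct.pack("!q", 0))              # placeholder: directory offset
    out.write(struct.pack("!i", 0))              # placeholder: directory length
    directory = {}
    for name, data in members.items():
        directory[name] = {"offset": out.tell(), "length": len(data)}
        out.write(data)
    dirpos = out.tell()
    pickled = pickle.dumps(directory)
    out.write(pickled)
    out.seek(base)                               # go back and fill in the header
    out.write(struct.pack("!q", dirpos))
    out.write(struct.pack("!i", len(pickled)))

def read_member(buf, name):
    buf.seek(0)
    (dirpos,) = struct.unpack("!q", buf.read(8))
    (dirlen,) = struct.unpack("!i", buf.read(4))
    buf.seek(dirpos)
    info = pickle.loads(buf.read(dirlen))[name]
    buf.seek(info["offset"])
    return buf.read(info["length"])

if __name__ == "__main__":
    buf = io.BytesIO()
    assemble(buf, {"a.pst": b"alpha", "b.trm": b"beta"})
    print(read_member(buf, "b.trm"))             # b'beta'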
- - Attributes: - _file (file-like object): The parent file object that represents the compound file. - _offset (int): The offset within the parent file where the compound file starts. - _length (int): The length of the compound file in bytes. - _end (int): The end position of the compound file within the parent file. - _pos (int): The current position within the compound file. - name (str): The name of the compound file. - closed (bool): Indicates whether the compound file is closed. - - Raises: - None. - - Returns: - None. - """ self._file = parentfile self._offset = offset self._length = length @@ -382,28 +205,9 @@ def __init__(self, parentfile, offset, length, name=None): self.closed = False def close(self): - """ - Closes the subset. - - This method sets the `closed` attribute to True, indicating that the subset is closed. - """ self.closed = True def subset(self, position, length, name=None): - """ - Creates a new subset from the current subset. - - Args: - position (int): The position of the new subset within the current subset. - length (int): The length of the new subset. - name (str, optional): The name of the new subset. Defaults to None. - - Returns: - SubFile: The new subset. - - Raises: - AssertionError: If the position or length is out of bounds. - """ start = self._offset + position end = start + length name = name or self.name @@ -412,19 +216,6 @@ def subset(self, position, length, name=None): return SubFile(self._file, self._offset + position, length, name=name) def read(self, size=None): - """ - Reads data from the subset. - - Args: - size (int, optional): The number of bytes to read. If None, reads until the end of the subset. - Defaults to None. - - Returns: - bytes: The read data. - - Raises: - ValueError: If the size is negative. - """ if size is None: size = self._length - self._pos else: @@ -440,15 +231,6 @@ def read(self, size=None): return emptybytes def readline(self): - """ - Reads a line from the subset. - - Returns: - bytes: The read line. - - Raises: - ValueError: If the line length exceeds the remaining subset length. - """ maxsize = self._length - self._pos self._file.seek(self._offset + self._pos) data = self._file.readline() @@ -458,18 +240,6 @@ def readline(self): return data def seek(self, where, whence=0): - """ - Moves the current position within the subset. - - Args: - where (int): The new position. - whence (int, optional): The reference position for the new position. - 0 for absolute, 1 for relative to the current position, 2 for relative to the end. - Defaults to 0. - - Raises: - ValueError: If the `whence` value is invalid. - """ if whence == 0: # Absolute pos = where elif whence == 1: # Relative @@ -482,58 +252,11 @@ def seek(self, where, whence=0): self._pos = pos def tell(self): - """ - Returns the current position within the subset. - - Returns: - int: The current position. - """ return self._pos class CompoundWriter: - """ - A class for writing compound files in Whoosh. - - CompoundWriter is responsible for creating compound files, which are files that contain multiple smaller files - combined into a single file. This class provides methods to create and manage substreams within the compound file, - and to save the compound file either as a single file or as separate files. - - Args: - tempstorage (object): The temporary storage object used to create the compound file. - buffersize (int, optional): The size of the buffer used for writing data to the compound file. Defaults to - 32 * 1024 bytes. 
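# Minimal sketch (invented class name, not the SubFile above) of reading a
# named sub-range of a parent file: reads are clamped to the sub-range and
# seeks are interpreted relative to the sub-range's start, which is how
# independent views into one compound file can be handed out.
import io

class SliceReader:
    def __init__(self, parent, offset, length):
        self._parent, self._offset, self._length = parent, offset, length
        self._pos = 0

    def read(self, size=None):
        remaining = self._length - self._pos
        size = remaining if size is None else min(size, remaining)
        if size <= 0:
            return b""
        self._parent.seek(self._offset + self._pos)
        data = self._parent.read(size)
        self._pos += len(data)
        return data

    def seek(self, where):
        self._pos = max(0, min(where, self._length))

    def tell(self):
        return self._pos

if __name__ == "__main__":
    parent = io.BytesIO(b"0123456789abcdef")
    sub = SliceReader(parent, offset=4, length=6)    # views b"456789"
    print(sub.read(3), sub.read())                   # b'456' b'789'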
- - Attributes: - _tempstorage (object): The temporary storage object used to create the compound file. - _tempname (str): The name of the temporary file used for storing the compound file data. - _temp (file-like object): The temporary file object used for writing the compound file data. - _buffersize (int): The size of the buffer used for writing data to the compound file. - _streams (dict): A dictionary that maps substream names to their corresponding SubStream objects. - - """ - def __init__(self, tempstorage, buffersize=32 * 1024): - """ - Initialize a CompoundStorage object. - - Args: - tempstorage (object): The temporary storage object used to create the compound file. - buffersize (int, optional): The buffer size in bytes for reading and writing data. Defaults to 32 * 1024. - - Raises: - AssertionError: If the buffersize is not an integer. - - Notes: - - The CompoundStorage object is responsible for managing a compound file, which is a file that contains multiple - smaller files combined into a single file. - - The tempstorage object should implement the `create_file` method to create a temporary file. - - The buffersize determines the size of the buffer used for reading and writing data to the compound file. - - Example: - tempstorage = TempStorage() - compound = CompoundStorage(tempstorage, buffersize=64 * 1024) - """ assert isinstance(buffersize, int) self._tempstorage = tempstorage self._tempname = f"{random_name()}.ctmp" @@ -542,50 +265,11 @@ def __init__(self, tempstorage, buffersize=32 * 1024): self._streams = {} def create_file(self, name): - """ - Creates a new file with the given name in the compound file. - - Parameters: - - name (str): The name of the file to be created. - - Returns: - - StructFile: A StructFile object representing the newly created file. - - Description: - This method creates a new file with the given name in the compound file. - It internally creates a SubStream object with a temporary file and a buffer size. - The SubStream object is then stored in the _streams dictionary with the given name as the key. - Finally, a StructFile object is returned, which wraps the SubStream object. - - Example usage: - compound_file = CompoundFile() - file = compound_file.create_file("example.txt") - file.write("Hello, World!") - file.close() - """ ss = self.SubStream(self._temp, self._buffersize) self._streams[name] = ss return StructFile(ss) def _readback(self): - """ - Reads back the contents of the compound file. - - This method reads back the contents of the compound file, yielding each substream's name and a generator that - yields the data blocks of the substream. The data blocks are read from either the substream or a temporary file, - depending on whether the substream is closed or not. - - Returns: - generator: A generator that yields tuples containing the name of the substream and a generator that yields - the data blocks of the substream. - - Example: - compound_file = CompoundFile() - for name, gen in compound_file._readback(): - print(f"Substream: {name}") - for data_block in gen(): - process_data_block(data_block) - """ temp = self._temp for name, substream in self._streams.items(): substream.close() @@ -602,28 +286,6 @@ def gen(): self._tempstorage.delete_file(self._tempname) def save_as_compound(self, dbfile): - """ - Save the current index as a compound file. - - This method writes the index data to a single file in a compound format. - The compound file contains multiple sub-files, each representing a segment - of the index. 
The directory structure of the compound file is stored at the - beginning of the file. - - Parameters: - dbfile (file-like object): The file-like object to write the compound file to. - - Returns: - None - - Raises: - IOError: If there is an error writing the compound file. - - Usage: - To save the index as a compound file, pass a file-like object to this method. - The file-like object should be opened in binary mode for writing. After calling - this method, the compound file will be written to the provided file-like object. - """ basepos = dbfile.tell() dbfile.write_long(0) # Directory offset dbfile.write_int(0) # Directory length @@ -638,30 +300,6 @@ def save_as_compound(self, dbfile): CompoundStorage.write_dir(dbfile, basepos, directory) def save_as_files(self, storage, name_fn): - """ - Save the compound file as separate files in the given storage. - - Args: - storage (Storage): The storage object where the files will be saved. - name_fn (callable): A function that takes a name and returns the filename. - - Returns: - None - - Raises: - Any exceptions raised by the storage object. - - Notes: - This method saves the compound file as separate files in the given storage. - Each file is created using the provided name_fn function, which takes a name - and returns the filename. The compound file is read back and written to the - separate files block by block. - - Example: - storage = MyStorage() - name_fn = lambda name: name + ".txt" - compound_file.save_as_files(storage, name_fn) - """ for name, blocks in self._readback(): f = storage.create_file(name_fn(name)) for block in blocks(): @@ -669,76 +307,16 @@ def save_as_files(self, storage, name_fn): f.close() class SubStream: - """A class representing a substream for writing data to a file. - - This class is used internally by the `CompoundFileWriter` class to write data to a file in blocks. - It provides methods for writing data to the substream and keeping track of the offsets and lengths of the blocks. - - Attributes: - _dbfile (file): The file object representing the main database file. - _buffersize (int): The maximum size of the buffer before writing to the main file. - _buffer (BytesIO): The buffer used to store the data before writing. - blocks (list): A list of tuples representing the blocks written to the main file. Each tuple contains: - - A BytesIO object if the block is in the buffer, or None if the block is in the main file. - - The offset of the block in the main file. - - The length of the block. - - Methods: - tell(): Returns the current position in the substream. - write(inbytes): Writes the given bytes to the substream. - close(): Closes the substream and writes any remaining data to the main file. - - Usage: - # Create a SubStream object - substream = SubStream(dbfile, buffersize) - - # Write data to the substream - substream.write(inbytes) - - # Get the current position in the substream - position = substream.tell() - - # Close the substream - substream.close() - """ - def __init__(self, dbfile, buffersize): - """ - Initialize a CompoundFile object. - - Args: - dbfile (str): The path to the compound file. - buffersize (int): The size of the buffer used for reading and writing. - - Attributes: - _dbfile (str): The path to the compound file. - _buffersize (int): The size of the buffer used for reading and writing. - _buffer (BytesIO): The buffer used for temporary storage. - blocks (list): The list of blocks in the compound file. 
- - """ self._dbfile = dbfile self._buffersize = buffersize self._buffer = BytesIO() self.blocks = [] def tell(self): - """Returns the current position in the substream. - - Returns: - int: The current position in the substream. - """ return sum(b[2] for b in self.blocks) + self._buffer.tell() def write(self, inbytes): - """Writes the given bytes to the substream. - - If the length of the buffer exceeds the specified buffer size, the buffer is written to the main file - and a new block is created. - - Args: - inbytes (bytes): The bytes to write to the substream. - """ bio = self._buffer buflen = bio.tell() length = buflen + len(inbytes) @@ -753,7 +331,6 @@ def write(self, inbytes): bio.write(inbytes) def close(self): - """Closes the substream and writes any remaining data to the main file.""" bio = self._buffer length = bio.tell() if length: diff --git a/src/whoosh/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py index 33f363bd..9b3e9985 100644 --- a/src/whoosh/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import os import pickle import re from bisect import bisect_right @@ -30,6 +31,7 @@ LockError, OutOfDateError, ) +from whoosh.support.bitvector import BitVector from whoosh.system import _FLOAT_SIZE, _INT_SIZE _INDEX_VERSION = -105 @@ -66,20 +68,6 @@ def has_deletions(self): class FileIndex(SegmentDeletionMixin, Index): def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): - """ - Represents an index stored in a file-based storage. - - Args: - storage (Storage): The storage object used to store the index files. - schema (Schema): The schema object defining the fields and their types in the index. - create (bool, optional): Whether to create a new index. Defaults to False. - indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. - - Raises: - ValueError: If the provided schema is not a Schema object. - IndexError: If create is True but no schema is specified. - EmptyIndexError: If the index does not exist in the storage. - """ self.storage = storage self.indexname = indexname @@ -119,20 +107,9 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self.segment_num_lock = None def __repr__(self): - """ - Returns a string representation of the FileIndex object. - - Returns: - str: The string representation of the FileIndex object. - """ return f"{self.__class__.__name__}({self.storage!r}, {self.indexname!r})" def _acquire_readlocks(self): - """ - Acquires read locks on the segment files. - - This is used to keep the underlying files open so they don't get deleted from underneath us. - """ self._readlocks = [ self.storage.open_file(name, mapped=False) for name in self.segments.filenames() @@ -140,61 +117,35 @@ def _acquire_readlocks(self): ] def _release_readlocks(self): - """ - Releases the read locks on the segment files. - """ (f.close() for f in self._readlocks) self._readlocks = [] def close(self): - """ - Closes the FileIndex object by releasing the read locks on the segment files. - """ self._release_readlocks() def latest_generation(self): - """ - Returns the latest generation number of the index files. - - Returns: - int: The latest generation number of the index files. 
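# Simplified sketch (assumed names and a tiny buffer) of the buffering scheme
# used by CompoundWriter.SubStream above: writes accumulate in memory and are
# spilled to a shared temporary file as (offset, length) blocks once the
# buffer would overflow, so many logical files can share one physical file.
import io

class BufferedSubStream:
    def __init__(self, shared, buffersize=8):
        self._shared, self._buffersize = shared, buffersize
        self._buffer = io.BytesIO()
        self.blocks = []                             # (offset in shared file, length)

    def write(self, data):
        if self._buffer.tell() + len(data) > self._buffersize:
            self._spill()                            # flush the buffer to the shared file
        self._buffer.write(data)

    def _spill(self):
        payload = self._buffer.getvalue()
        if payload:
            self.blocks.append((self._shared.tell(), len(payload)))
            self._shared.write(payload)
            self._buffer = io.BytesIO()

    def close(self):
        self._spill()

if __name__ == "__main__":
    shared = io.BytesIO()
    ss = BufferedSubStream(shared)
    ss.write(b"hello ")
    ss.write(b"world")                               # overflows, spills b"hello "
    ss.close()
    print(ss.blocks, shared.getvalue())              # [(0, 6), (6, 5)] b'hello world'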
- """ pattern = _toc_pattern(self.indexname) - maximum = -1 + max = -1 for filename in self.storage: m = pattern.match(filename) if m: num = int(m.group(1)) - if num > maximum: - maximum = num - return maximum + if num > max: + max = num + return max def refresh(self): - """ - Refreshes the FileIndex object by creating a new instance with the same storage and schema. - - Returns: - FileIndex: The refreshed FileIndex object. - """ if not self.up_to_date(): return self.__class__(self.storage, self.schema, indexname=self.indexname) else: return self def up_to_date(self): - """ - Checks if the FileIndex object is up to date. - - Returns: - bool: True if the FileIndex object is up to date, False otherwise. - """ return self.generation == self.latest_generation() def _write(self): - """ - Writes the content of this index to the .toc file. - """ + # Writes the content of this index to the .toc file. self.schema.clean() # stream = self.storage.create_file(self._toc_filename()) @@ -221,18 +172,7 @@ def _write(self): self.storage.rename_file(tempfilename, self._toc_filename(), safe=True) def _read(self, schema): - """ - Reads the content of this index from the .toc file. - - Args: - schema (Schema): The schema object to use. If None, the pickled schema from the saved index will be loaded. - - Raises: - IndexError: If the index was created on an architecture with different data sizes. - IndexError: If there is a byte order problem. - IndexVersionError: If the format of the index is not supported. - - """ + # Reads the content of this index from the .toc file. stream = self.storage.open_file(self._toc_filename()) if stream.read_varint() != _INT_SIZE or stream.read_varint() != _FLOAT_SIZE: @@ -240,7 +180,7 @@ def _read(self, schema): "Index was created on an architecture with different data sizes" ) - if stream.read_int() != -12345: + if not stream.read_int() == -12345: raise IndexError("Number misread: byte order problem") version = stream.read_int() @@ -268,15 +208,7 @@ def _read(self, schema): stream.close() def _next_segment_name(self): - """ - Returns the name of the next segment in sequence. - - Returns: - str: The name of the next segment in sequence. - - Raises: - LockError: If the segment number lock cannot be acquired. - """ + # Returns the name of the next segment in sequence. if self.segment_num_lock is None: self.segment_num_lock = Lock() @@ -290,51 +222,21 @@ def _next_segment_name(self): raise LockError def _toc_filename(self): - """ - Returns the computed filename of the TOC (Table of Contents) for this index name and generation. - - Returns: - str: The computed filename of the TOC for this index name and generation. - """ + # Returns the computed filename of the TOC for this index name and + # generation. return f"_{self.indexname}_{self.generation}.toc" def last_modified(self): - """ - Returns the last modified timestamp of the TOC file. - - Returns: - float: The last modified timestamp of the TOC file. - """ return self.storage.file_modified(self._toc_filename()) def is_empty(self): - """ - Checks if the index is empty. - - Returns: - bool: True if the index is empty, False otherwise. - """ + """Low-level: Returns the number of segments in this index.""" return len(self.segments) == 0 def segment_count(self): - """ - Returns the number of segments in the index. - - Returns: - int: The number of segments in the index. - """ return len(self.segments) def optimize(self): - """ - Optimizes the index by merging segments if necessary. - - This operation improves search performance. 
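# Standalone sketch of the generation-numbered TOC scheme used by
# _toc_filename() and latest_generation() above: TOC files are named
# _<indexname>_<generation>.toc and the newest index state is simply the
# highest generation number present in the storage directory.
import re

def toc_filename(indexname, generation):
    return f"_{indexname}_{generation}.toc"

def latest_generation(filenames, indexname):
    pattern = re.compile(rf"^_{re.escape(indexname)}_([0-9]+)\.toc$")
    latest = -1
    for name in filenames:
        m = pattern.match(name)
        if m:
            latest = max(latest, int(m.group(1)))
    return latest

if __name__ == "__main__":
    files = ["_MAIN_0.toc", "_MAIN_1.toc", "_MAIN_3.toc", "MAIN_abc123.seg"]
    print(toc_filename("MAIN", 4))                   # _MAIN_4.toc
    print(latest_generation(files, "MAIN"))          # 3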
- - Note: - This method only performs optimization if there are more than 1 segments and no deletions. - - """ if len(self.segments) < 2 and not self.segments.has_deletions(): return @@ -344,17 +246,6 @@ def optimize(self): w.commit(OPTIMIZE) def commit(self, new_segments=None): - """ - Commits changes to the index. - - Args: - new_segments (SegmentSet, optional): The new segments to replace the existing segments in the index. - - Raises: - OutOfDateError: If the index is not up to date. - ValueError: If new_segments is provided but is not a SegmentSet. - - """ self._release_readlocks() if not self.up_to_date(): @@ -375,13 +266,11 @@ def commit(self, new_segments=None): self._acquire_readlocks() def _clean_files(self): - """ - Attempts to remove unused index files. + # Attempts to remove unused index files (called when a new generation + # is created). If existing Index and/or reader objects have the files + # open, they may not be deleted immediately (i.e. on Windows) but will + # probably be deleted eventually by a later call to clean_files. - This method is called when a new generation is created. - If existing Index and/or reader objects have the files open, they may not be deleted immediately (i.e. on Windows) - but will probably be deleted eventually by a later call to clean_files. - """ storage = self.storage current_segment_names = {s.name for s in self.segments} @@ -408,54 +297,18 @@ def _clean_files(self): pass def doc_count_all(self): - """ - Returns the total number of documents in the index, including deleted documents. - - Returns: - int: The total number of documents in the index, including deleted documents. - """ return self.segments.doc_count_all() def doc_count(self): - """ - Returns the number of non-deleted documents in the index. - - Returns: - int: The number of non-deleted documents in the index. - """ return self.segments.doc_count() def field_length(self, fieldnum): - """ - Returns the total length of a field in the index. - - Args: - fieldnum (int): The field number. - - Returns: - int: The total length of the field in the index. - """ return sum(s.field_length(fieldnum) for s in self.segments) def reader(self): - """ - Returns a reader object for the index. - - Returns: - IndexReader: The reader object for the index. - """ return self.segments.reader(self.storage, self.schema) def writer(self, **kwargs): - """ - Returns a writer object for the index. - - Args: - **kwargs: Additional keyword arguments to pass to the writer constructor. - - Returns: - IndexWriter: The writer object for the index. - """ from whoosh.filedb.filewriting import SegmentWriter return SegmentWriter(self, **kwargs) @@ -465,33 +318,8 @@ def writer(self, **kwargs): class SegmentSet: - """ - This class is used by the Index object to keep track of the segments in the index. - - Attributes: - segments (list): A list of segments in the index. - _doc_offsets (list): A list of document offsets for each segment. - - Methods: - __init__(segments=None): Initializes a new instance of the SegmentSet class. - __repr__(): Returns a string representation of the segments in the set. - __len__(): Returns the number of segments in this set. - __iter__(): Returns an iterator over the segments in this set. - __getitem__(n): Returns the segment at the specified index. - append(segment): Adds a segment to this set. - _document_segment(docnum): Returns the index.Segment object containing the given document number. 
- _segment_and_docnum(docnum): Returns an (index.Segment, segment_docnum) pair for the segment containing the given document number. - copy(): Returns a deep copy of this set. - filenames(): Returns a set of filenames associated with the segments in this set. - doc_offsets(): Recomputes the document offset list. - doc_count_all(): Returns the total number of documents, DELETED or UNDELETED, in this set. - doc_count(): Returns the number of undeleted documents in this set. - has_deletions(): Returns True if this index has documents that are marked deleted but haven't been optimized out of the index yet. - delete_document(docnum, delete=True): Deletes a document by number. - deleted_count(): Returns the total number of deleted documents in this index. - is_deleted(docnum): Returns True if a given document number is deleted but not yet optimized out of the index. - reader(storage, schema): Returns a reader object for accessing the segments in this set. - + """This class is never instantiated by the user. It is used by the Index + object to keep track of the segments in the index. """ def __init__(self, segments=None): @@ -507,10 +335,7 @@ def __repr__(self): def __len__(self): """ - Returns the number of segments in this set. - - Returns: - int: The number of segments in this set. + :returns: the number of segments in this set. """ return len(self.segments) @@ -521,73 +346,44 @@ def __getitem__(self, n): return self.segments.__getitem__(n) def append(self, segment): - """ - Adds a segment to this set. + """Adds a segment to this set.""" - Args: - segment (object): The segment to be added. - """ self.segments.append(segment) self._doc_offsets = self.doc_offsets() def _document_segment(self, docnum): + """Returns the index.Segment object containing the given document + number. """ - Returns the index.Segment object containing the given document number. - Args: - docnum (int): The document number. - - Returns: - int: The index of the segment containing the document. - """ offsets = self._doc_offsets if len(offsets) == 1: return 0 return bisect_right(offsets, docnum) - 1 def _segment_and_docnum(self, docnum): + """Returns an (index.Segment, segment_docnum) pair for the segment + containing the given document number. """ - Returns an (index.Segment, segment_docnum) pair for the segment containing the given document number. - Args: - docnum (int): The document number. - - Returns: - tuple: A tuple containing the index.Segment object and the segment_docnum. - """ segmentnum = self._document_segment(docnum) offset = self._doc_offsets[segmentnum] segment = self.segments[segmentnum] return segment, docnum - offset def copy(self): - """ - Returns a deep copy of this set. - - Returns: - SegmentSet: A deep copy of this set. - """ + """:returns: a deep copy of this set.""" return self.__class__([s.copy() for s in self.segments]) def filenames(self): - """ - Returns a set of filenames associated with the segments in this set. - - Returns: - set: A set of filenames. - """ nameset = set() for segment in self.segments: nameset |= segment.filenames() return nameset def doc_offsets(self): - """ - Recomputes the document offset list. This must be called if you change self.segments. - - Returns: - list: A list of document offsets. - """ + # Recomputes the document offset list. This must be called if you + # change self.segments. 
offsets = [] base = 0 for s in self.segments: @@ -597,75 +393,51 @@ def doc_offsets(self): def doc_count_all(self): """ - Returns the total number of documents, DELETED or UNDELETED, in this set. - - Returns: - int: The total number of documents. + :returns: the total number of documents, DELETED or UNDELETED, in this + set. """ return sum(s.doc_count_all() for s in self.segments) def doc_count(self): """ - Returns the number of undeleted documents in this set. - - Returns: - int: The number of undeleted documents. + :returns: the number of undeleted documents in this set. """ return sum(s.doc_count() for s in self.segments) def has_deletions(self): """ - Returns True if this index has documents that are marked deleted but haven't been optimized out of the index yet. - - Returns: - bool: True if there are deleted documents, False otherwise. + :returns: True if this index has documents that are marked deleted but + haven't been optimized out of the index yet. This includes + deletions that haven't been written to disk with Index.commit() + yet. """ return any(s.has_deletions() for s in self.segments) def delete_document(self, docnum, delete=True): - """ - Deletes a document by number. + """Deletes a document by number. - Args: - docnum (int): The document number. - delete (bool, optional): Whether to mark the document as deleted. Defaults to True. + You must call Index.commit() for the deletion to be written to disk. """ + segment, segdocnum = self._segment_and_docnum(docnum) segment.delete_document(segdocnum, delete=delete) def deleted_count(self): """ - Returns the total number of deleted documents in this index. - - Returns: - int: The total number of deleted documents. + :returns: the total number of deleted documents in this index. """ return sum(s.deleted_count() for s in self.segments) def is_deleted(self, docnum): """ - Returns True if a given document number is deleted but not yet optimized out of the index. - - Args: - docnum (int): The document number. - - Returns: - bool: True if the document is deleted, False otherwise. + :returns: True if a given document number is deleted but not yet + optimized out of the index. """ + segment, segdocnum = self._segment_and_docnum(docnum) return segment.is_deleted(segdocnum) def reader(self, storage, schema): - """ - Returns a reader object for accessing the segments in this set. - - Args: - storage (object): The storage object. - schema (object): The schema object. - - Returns: - object: A reader object. - """ from whoosh.filedb.filereading import SegmentReader segments = self.segments @@ -679,32 +451,17 @@ def reader(self, storage, schema): class Segment: - """Represents a segment in the index. - - Segments are used by the Index object to hold information about a segment. - A segment is a real reverse index that stores a subset of the documents in the index. - Multiple segments allow for quick incremental indexing and efficient searching. - - Attributes: - name (str): The name of the segment. - doccount (int): The maximum document number in the segment. - fieldlength_totals (dict): A dictionary mapping field numbers to the total number of terms in that field across all documents in the segment. - fieldlength_maxes (dict): A dictionary mapping field numbers to the maximum length of the field in any of the documents in the segment. - deleted (set): A set of deleted document numbers, or None if no deleted documents exist in this segment. 
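The docnum-to-segment arithmetic above (cumulative per-segment document counts plus ``bisect_right``) can be exercised on its own. A minimal sketch, assuming three segments with made-up document counts::

    from bisect import bisect_right

    doc_counts = [10, 5, 7]          # doc_count_all() of each segment, in order
    offsets, base = [], 0
    for count in doc_counts:         # mirrors SegmentSet.doc_offsets()
        offsets.append(base)
        base += count                # offsets == [0, 10, 15]

    def segment_and_docnum(docnum):
        segmentnum = bisect_right(offsets, docnum) - 1
        return segmentnum, docnum - offsets[segmentnum]

    print(segment_and_docnum(12))    # -> (1, 2): global doc 12 is doc 2 of segment 1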
- - Methods: - __init__(name, doccount, fieldlength_totals, fieldlength_maxes, deleted=None): Initializes a Segment object. - __repr__(): Returns a string representation of the Segment object. - copy(): Creates a copy of the Segment object. - filenames(): Returns a set of filenames associated with the segment. - doc_count_all(): Returns the total number of documents, deleted or undeleted, in this segment. - doc_count(): Returns the number of undeleted documents in this segment. - has_deletions(): Returns True if any documents in this segment are deleted. - deleted_count(): Returns the total number of deleted documents in this segment. - field_length(fieldnum, default=0): Returns the total number of terms in the given field across all documents in this segment. - max_field_length(fieldnum, default=0): Returns the maximum length of the given field in any of the documents in the segment. - delete_document(docnum, delete=True): Deletes or undeletes a document in the segment. - is_deleted(docnum): Returns True if the given document number is deleted. + """Do not instantiate this object directly. It is used by the Index object + to hold information about a segment. A list of objects of this class are + pickled as part of the TOC file. + + The TOC file stores a minimal amount of information -- mostly a list of + Segment objects. Segments are the real reverse indexes. Having multiple + segments allows quick incremental indexing: just create a new segment for + the new documents, and have the index overlay the new segment over previous + ones for purposes of reading/search. "Optimizing" the index combines the + contents of existing segments into one (removing any deleted documents + along the way). """ EXTENSIONS = { @@ -720,14 +477,15 @@ def __init__( self, name, doccount, fieldlength_totals, fieldlength_maxes, deleted=None ): """ - Initializes a Segment object. - - Args: - name (str): The name of the segment (the Index object computes this from its name and the generation). - doccount (int): The maximum document number in the segment. - fieldlength_totals (dict): A dictionary mapping field numbers to the total number of terms in that field across all documents in the segment. - fieldlength_maxes (dict): A dictionary mapping field numbers to the maximum length of the field in any of the documents in the segment. - deleted (set, optional): A set of deleted document numbers, or None if no deleted documents exist in this segment. + :param name: The name of the segment (the Index object computes this + from its name and the generation). + :param doccount: The maximum document number in the segment. + :param term_count: Total count of all terms in all documents. + :param fieldlength_totals: A dictionary mapping field numbers to the + total number of terms in that field across all documents in the + segment. + :param deleted: A set of deleted document numbers, or None if no + deleted documents exist in this segment. """ self.name = name @@ -737,27 +495,15 @@ def __init__( self.deleted = deleted self._filenames = set() - for attr, ext in self.EXTENSIONS.items(): + for attr, ext in self.EXTENSIONS.iteritems(): fname = f"{self.name}.{ext}" setattr(self, attr + "_filename", fname) self._filenames.add(fname) def __repr__(self): - """ - Returns a string representation of the Segment object. - - Returns: - str: A string representation of the Segment object. - """ return f"{self.__class__.__name__}({self.name!r})" def copy(self): - """ - Creates a copy of the Segment object. 
- - Returns: - Segment: A copy of the Segment object. - """ if self.deleted: deleted = set(self.deleted) else: @@ -771,91 +517,59 @@ def copy(self): ) def filenames(self): - """ - Returns a set of filenames associated with the segment. - - Returns: - set: A set of filenames associated with the segment. - """ return self._filenames def doc_count_all(self): """ - Returns the total number of documents, deleted or undeleted, in this segment. - - Returns: - int: The total number of documents in this segment. + :returns: the total number of documents, DELETED OR UNDELETED, in this + segment. """ return self.doccount def doc_count(self): """ - Returns the number of undeleted documents in this segment. - - Returns: - int: The number of undeleted documents in this segment. + :returns: the number of (undeleted) documents in this segment. """ return self.doccount - self.deleted_count() def has_deletions(self): """ - Returns True if any documents in this segment are deleted. - - Returns: - bool: True if any documents in this segment are deleted, False otherwise. + :returns: True if any documents in this segment are deleted. """ return self.deleted_count() > 0 def deleted_count(self): """ - Returns the total number of deleted documents in this segment. - - Returns: - int: The total number of deleted documents in this segment. + :returns: the total number of deleted documents in this segment. """ if self.deleted is None: return 0 return len(self.deleted) def field_length(self, fieldnum, default=0): - """ - Returns the total number of terms in the given field across all documents in this segment. - - Args: - fieldnum (int): The internal number of the field. - default (int, optional): The default value to return if the field number is not found. + """Returns the total number of terms in the given field across all + documents in this segment. - Returns: - int: The total number of terms in the given field across all documents in this segment. + :param fieldnum: the internal number of the field. """ return self.fieldlength_totals.get(fieldnum, default) def max_field_length(self, fieldnum, default=0): - """ - Returns the maximum length of the given field in any of the documents in the segment. - - Args: - fieldnum (int): The internal number of the field. - default (int, optional): The default value to return if the field number is not found. + """Returns the maximum length of the given field in any of the + documents in the segment. - Returns: - int: The maximum length of the given field in any of the documents in the segment. + :param fieldnum: the internal number of the field. """ return self.fieldlength_maxes.get(fieldnum, default) def delete_document(self, docnum, delete=True): - """ - Deletes or undeletes the given document number. - - The document is not actually removed from the index until it is optimized. + """Deletes the given document number. The document is not actually + removed from the index until it is optimized. - Args: - docnum (int): The document number to delete or undelete. - delete (bool, optional): If True, deletes the document. If False, undeletes a deleted document. - - Raises: - KeyError: If the document number is already deleted or not deleted. + :param docnum: The document number to delete. + :param delete: If False, this undeletes a deleted document. 
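Deletion in a ``Segment`` is only bookkeeping: document numbers are collected in a set that readers consult, and nothing is removed from disk until the index is optimized. A standalone sketch of that bookkeeping (a plain set, not the class itself)::

    deleted = set()                      # Segment.deleted starts out empty/None

    def delete_document(docnum, delete=True):
        if delete:
            deleted.add(docnum)          # mark as deleted; the stored data stays put
        else:
            deleted.discard(docnum)      # undelete

    def is_deleted(docnum):
        return docnum in deleted

    delete_document(4)
    print(is_deleted(4), len(deleted))   # -> True 1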
""" + if delete: if self.deleted is None: self.deleted = set() @@ -872,15 +586,8 @@ def delete_document(self, docnum, delete=True): self.deleted.clear(docnum) def is_deleted(self, docnum): - """ - Returns True if the given document number is deleted. - - Args: - docnum (int): The document number. + """:returns: True if the given document number is deleted.""" - Returns: - bool: True if the given document number is deleted, False otherwise. - """ if self.deleted is None: return False return docnum in self.deleted @@ -890,40 +597,16 @@ def is_deleted(self, docnum): def _toc_pattern(indexname): + """Returns a regular expression object that matches TOC filenames. + name is the name of the index. """ - Returns a regular expression object that matches TOC filenames. - - Parameters: - indexname (str): The name of the index. - - Returns: - re.Pattern: A regular expression object that matches TOC filenames. - - Example: - >>> pattern = _toc_pattern("myindex") - >>> pattern.match("_myindex_1.toc") - - >>> pattern.match("_myindex_2.toc") - - >>> pattern.match("_otherindex_1.toc") - None - """ + return re.compile(f"_{indexname}_([0-9]+).toc") def _segment_pattern(indexname): + """Returns a regular expression object that matches segment filenames. + name is the name of the index. """ - Returns a regular expression object that matches segment filenames. - - Args: - indexname (str): The name of the index. - Returns: - re.Pattern: A regular expression object that matches segment filenames. - - Example: - >>> pattern = _segment_pattern("my_index") - >>> pattern.match("_my_index_001.fdt") - - """ return re.compile(f"(_{indexname}_[0-9]+).({Segment.EXTENSIONS.values()})") diff --git a/src/whoosh/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py index cb409505..abbae1b5 100644 --- a/src/whoosh/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -26,19 +26,6 @@ class BlockInfo: - """ - Represents information about a block in a file-based posting list. - - Attributes: - nextoffset (int): The offset of the next block in the file. - postcount (int): The number of postings in the block. - maxweight (int): The maximum weight of the postings in the block. - maxwol (float): The maximum weight of a single posting in the block. - minlength (int): The minimum length of the terms in the block. - maxid (int or str): The maximum term ID in the block. - dataoffset (int): The offset of the block's data in the file. - """ - __slots__ = ( "nextoffset", "postcount", @@ -62,18 +49,6 @@ def __init__( maxid=None, dataoffset=None, ): - """ - Initializes a new instance of the BlockInfo class. - - Args: - nextoffset (int, optional): The offset of the next block in the file. - postcount (int, optional): The number of postings in the block. - maxweight (int, optional): The maximum weight of the postings in the block. - maxwol (float, optional): The maximum weight of a single posting in the block. - minlength (int, optional): The minimum length of the terms in the block. - maxid (int or str, optional): The maximum term ID in the block. - dataoffset (int, optional): The offset of the block's data in the file. - """ self.nextoffset = nextoffset self.postcount = postcount self.maxweight = maxweight @@ -83,12 +58,6 @@ def __init__( self.dataoffset = dataoffset def __repr__(self): - """ - Returns a string representation of the BlockInfo object. - - Returns: - str: A string representation of the BlockInfo object. 
- """ return ( "<%s nextoffset=%r postcount=%r maxweight=%r" " maxwol=%r minlength=%r" @@ -106,12 +75,6 @@ def __repr__(self): ) def to_file(self, file): - """ - Writes the BlockInfo object to a file. - - Args: - file (file-like object): The file to write to. - """ file.write( self._struct.pack( self.nextoffset, @@ -131,26 +94,10 @@ def to_file(self, file): file.write_uint(maxid) def _read_id(self, file): - """ - Reads the maximum term ID from a file. - - Args: - file (file-like object): The file to read from. - """ self.maxid = file.read_uint() @staticmethod def from_file(file, stringids=False): - """ - Creates a new BlockInfo object from a file. - - Args: - file (file-like object): The file to read from. - stringids (bool, optional): Whether the term IDs are stored as strings. - - Returns: - BlockInfo: A new BlockInfo object. - """ ( nextoffset, xi1, @@ -181,36 +128,6 @@ def from_file(file, stringids=False): class FilePostingWriter(PostingWriter): - """ - A class for writing posting lists to a file-based index. - - Args: - schema (Schema): The schema of the index. - postfile (file): The file object to write the posting lists to. - stringids (bool, optional): Whether the document ids are strings. Defaults to False. - blocklimit (int, optional): The maximum number of postings to store in a block. Defaults to 128. - - Raises: - ValueError: If the blocklimit argument is greater than 255 or less than 1. - - Attributes: - schema (Schema): The schema of the index. - postfile (file): The file object to write the posting lists to. - stringids (bool): Whether the document ids are strings. - blocklimit (int): The maximum number of postings to store in a block. - inblock (bool): Indicates if currently inside a block. - fieldnum (int): The field number being written. - format (Codec): The codec for the field being written. - blockcount (int): The number of blocks written. - posttotal (int): The total number of postings written. - startoffset (int): The offset in the file where the current block starts. - blockids (list): The list of document ids in the current block. - blockweights (list): The list of weights in the current block. - blockvalues (list): The list of values in the current block. - blockoffset (int): The offset in the file where the current block is written. - - """ - def __init__(self, schema, postfile, stringids=False, blocklimit=128): self.schema = schema self.postfile = postfile @@ -224,9 +141,6 @@ def __init__(self, schema, postfile, stringids=False, blocklimit=128): self.inblock = False def _reset_block(self): - """ - Resets the current block's data structures. - """ if self.stringids: self.blockids = [] else: @@ -236,21 +150,8 @@ def _reset_block(self): self.blockoffset = self.postfile.tell() def start(self, fieldnum): - """ - Starts a new block for writing postings. - - Args: - fieldnum (int): The field number being written. - - Returns: - int: The offset in the file where the block starts. - - Raises: - ValueError: If called while already inside a block. - - """ if self.inblock: - raise ValueError("Cannot call start() while already in a block") + raise Exception("Called start() in a block") self.fieldnum = fieldnum self.format = self.schema[fieldnum].format @@ -267,14 +168,6 @@ def start(self, fieldnum): return self.startoffset def write(self, id, valuestring): - """ - Writes a posting to the current block. - - Args: - id: The document id. - valuestring: The value associated with the document. 
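``BlockInfo`` serializes its fixed-size fields with a precompiled ``struct`` format and handles ``maxid`` separately, since its encoding depends on whether string ids are in use. A generic sketch of that style of fixed-size header packing, with a hypothetical format rather than the actual on-disk layout::

    import struct

    # Hypothetical block header: next offset, posting count, max weight, data offset.
    header = struct.Struct("<qifq")

    packed = header.pack(4096, 128, 2.5, 512)
    nextoffset, postcount, maxweight, dataoffset = header.unpack(packed)
    print(nextoffset, postcount, maxweight, dataoffset)   # -> 4096 128 2.5 512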
- - """ self.blockids.append(id) self.blockvalues.append(valuestring) self.blockweights.append(self.format.decode_weight(valuestring)) @@ -282,23 +175,13 @@ def write(self, id, valuestring): self._write_block() def finish(self): - """ - Finishes writing the current block. - - Returns: - int: The total number of postings written. - - Raises: - ValueError: If called when not in a block. - - """ if not self.inblock: - raise ValueError("Called finish() when not in a block") + raise Exception("Called finish() when not in a block") if self.blockids: self._write_block() - # Seek back to the start of this list of posting blocks and write the + # Seek back to the start of this list of posting blocks and writer the # number of blocks pf = self.postfile pf.flush() @@ -311,19 +194,11 @@ def finish(self): return self.posttotal def close(self): - """ - Closes the posting writer. - - """ if hasattr(self, "blockids") and self.blockids: self.finish() self.postfile.close() def _write_block(self): - """ - Writes the current block to the file. - - """ posting_size = self.format.posting_size dfl_fn = self.dfl_fn fieldnum = self.fieldnum @@ -392,67 +267,7 @@ def _write_block(self): class FilePostingReader(Matcher): - """ - A class for reading posting data from a file-like object. - - This class is responsible for reading posting data from a file-like object and providing - convenient methods to access the IDs, values, and weights of the postings. - - Args: - postfile (file-like object): The file-like object representing the posting file. - offset (int): The offset in the file where the posting data starts. - format (PostingFormat): The format of the posting data. - scorefns (tuple, optional): A tuple of score functions (score, quality, block_quality). - Defaults to None. - stringids (bool, optional): Indicates whether the IDs are stored as strings. - Defaults to False. - - Attributes: - postfile (file-like object): The file-like object representing the posting file. - startoffset (int): The offset in the file where the posting data starts. - format (PostingFormat): The format of the posting data. - _scorefns (tuple): A tuple of score functions (score, quality, block_quality). - stringids (bool): Indicates whether the IDs are stored as strings. - blockcount (int): The number of blocks in the posting file. - baseoffset (int): The offset in the file where the posting data starts. - _active (bool): Indicates whether the FilePostingReader object is active. - currentblock (int): The index of the current block being read. - ids (list): The IDs of the postings in the current block. - values (list): The values of the postings in the current block. - weights (list): The weights of the postings in the current block. - i (int): The index of the current posting within the current block. - - Methods: - copy(): Creates a copy of the FilePostingReader object. - is_active(): Checks if the FilePostingReader object is active. - id(): Returns the ID of the current posting. - value(): Returns the value of the current posting. - weight(): Returns the weight of the current posting. - all_ids(): Generator that yields all the IDs in the posting file. - next(): Moves to the next posting in the posting file. - skip_to(id): Skips to the posting with the specified ID. - - """ - def __init__(self, postfile, offset, format, scorefns=None, stringids=False): - """ - Initializes a FilePostingReader object. - - Args: - postfile (file-like object): The file-like object representing the posting file. 
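``FilePostingWriter`` above accumulates postings in memory and flushes a block whenever the configured ``blocklimit`` is reached, flushing any remainder in ``finish()``. A simplified, self-contained sketch of that buffering pattern (it does not reproduce the real on-disk encoding)::

    class BlockBuffer:
        def __init__(self, blocklimit=128):
            self.blocklimit = blocklimit
            self.ids, self.weights = [], []
            self.blocks_written = 0

        def write(self, docid, weight):
            self.ids.append(docid)
            self.weights.append(weight)
            if len(self.ids) >= self.blocklimit:   # same idea as the blocklimit check
                self._flush()

        def _flush(self):
            # The real writer encodes ids/weights/values into the postings file here.
            self.blocks_written += 1
            self.ids, self.weights = [], []

        def finish(self):
            if self.ids:                           # flush the final partial block
                self._flush()
            return self.blocks_written

    buf = BlockBuffer(blocklimit=2)
    for docid in (1, 5, 9):
        buf.write(docid, 1.0)
    print(buf.finish())                            # -> 2 blocks written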
- offset (int): The offset in the file where the posting data starts. - format (PostingFormat): The format of the posting data. - scorefns (tuple, optional): A tuple of score functions (score, quality, block_quality). - Defaults to None. - stringids (bool, optional): Indicates whether the IDs are stored as strings. - Defaults to False. - - Raises: - None - - Returns: - None - """ self.postfile = postfile self.startoffset = offset self.format = format @@ -477,18 +292,6 @@ def __init__(self, postfile, offset, format, scorefns=None, stringids=False): self._next_block() def copy(self): - """ - Creates a copy of the FilePostingReader object. - - Args: - None - - Raises: - None - - Returns: - FilePostingReader: A copy of the FilePostingReader object. - """ return self.__class__( self.postfile, self.startoffset, @@ -498,78 +301,18 @@ def copy(self): ) def is_active(self): - """ - Checks if the FilePostingReader object is active. - - Args: - None - - Raises: - None - - Returns: - bool: True if the FilePostingReader object is active, False otherwise. - """ return self._active def id(self): - """ - Returns the ID of the current posting. - - Args: - None - - Raises: - None - - Returns: - int or str: The ID of the current posting. - """ return self.ids[self.i] def value(self): - """ - Returns the value of the current posting. - - Args: - None - - Raises: - None - - Returns: - object: The value of the current posting. - """ return self.values[self.i] def weight(self): - """ - Returns the weight of the current posting. - - Args: - None - - Raises: - None - - Returns: - float: The weight of the current posting. - """ return self.weights[self.i] def all_ids(self): - """ - Generator that yields all the IDs in the posting file. - - Args: - None - - Raises: - None - - Yields: - int or str: The IDs in the posting file. - """ nextoffset = self.baseoffset for _ in range(self.blockcount): blockinfo = self._read_blockinfo(nextoffset) @@ -578,18 +321,6 @@ def all_ids(self): yield from ids def next(self): - """ - Moves to the next posting in the posting file. - - Args: - None - - Raises: - None - - Returns: - bool: True if there is a next posting, False otherwise. - """ if self.i == self.blockinfo.postcount - 1: self._next_block() return True @@ -598,18 +329,6 @@ def next(self): return False def skip_to(self, id): - """ - Skips to the posting with the specified ID. - - Args: - id (int or str): The ID to skip to. - - Raises: - ReadTooFar: If the skip operation goes beyond the end of the posting file. - - Returns: - None - """ if not self.is_active(): raise ReadTooFar @@ -636,36 +355,11 @@ def skip_to(self, id): self.i = i def _read_blockinfo(self, offset): - """ - Reads the block information from the posting file. - - Args: - offset (int): The offset in the posting file where the block information starts. - - Raises: - None - - Returns: - BlockInfo: The block information. - """ pf = self.postfile pf.seek(offset) return BlockInfo.from_file(pf, self.stringids) def _read_ids(self, offset, postcount): - """ - Reads the IDs from the posting file. - - Args: - offset (int): The offset in the posting file where the IDs start. - postcount (int): The number of IDs to read. - - Raises: - None - - Returns: - tuple: A tuple containing the IDs and the offset after reading. - """ pf = self.postfile pf.seek(offset) @@ -678,37 +372,10 @@ def _read_ids(self, offset, postcount): return (ids, pf.tell()) def _read_weights(self, offset, postcount): - """ - Reads the weights from the posting file. 
- - Args: - offset (int): The offset in the posting file where the weights start. - postcount (int): The number of weights to read. - - Raises: - None - - Returns: - tuple: A tuple containing the weights and the offset after reading. - """ weights = self.postfile.get_array(offset, "f", postcount) return (weights, offset + _FLOAT_SIZE * postcount) def _read_values(self, startoffset, endoffset, postcount): - """ - Reads the values from the posting file. - - Args: - startoffset (int): The offset in the posting file where the values start. - endoffset (int): The offset in the posting file where the values end. - postcount (int): The number of values to read. - - Raises: - None - - Returns: - list: A list of values. - """ pf = self.postfile posting_size = self.format.posting_size @@ -745,18 +412,6 @@ def _read_values(self, startoffset, endoffset, postcount): return values def _consume_block(self): - """ - Consumes the current block by reading the IDs, weights, and values. - - Args: - None - - Raises: - None - - Returns: - None - """ postcount = self.blockinfo.postcount self.ids, woffset = self._read_ids(self.blockinfo.dataoffset, postcount) self.weights, voffset = self._read_weights(woffset, postcount) @@ -764,31 +419,12 @@ def _consume_block(self): self.i = 0 def _next_block(self, consume=True): - """ - Moves to the next block in the posting file. - - Args: - consume (bool, optional): Indicates whether to consume the block by reading the IDs, weights, and values. - Defaults to True. - - Raises: - None - - Returns: - None - """ self.currentblock += 1 if self.currentblock == self.blockcount: self._active = False return if self.currentblock == 0: - self.blockinfo = self._read_blockinfo(self.baseoffset) - else: - self.blockinfo = self._read_blockinfo(self.blockinfo.nextoffset) - - if consume: - self._consume_block() pos = self.baseoffset else: pos = self.blockinfo.nextoffset @@ -798,18 +434,6 @@ def _next_block(self, consume=True): self._consume_block() def _skip_to_block(self, targetfn): - """ - Skips to the block that satisfies the target function. - - Args: - targetfn (function): The target function that determines whether to skip to the next block. - - Raises: - None - - Returns: - int: The number of blocks skipped. - """ skipped = 0 while self._active and targetfn(): self._next_block(consume=False) @@ -821,79 +445,19 @@ def _skip_to_block(self, targetfn): return skipped def supports_quality(self): - """ - Checks if the FilePostingReader object supports quality scoring. - - Args: - None - - Raises: - None - - Returns: - bool: True if the FilePostingReader object supports quality scoring, False otherwise. - """ return True def skip_to_quality(self, minquality): - """ - Skips to the block with the minimum quality score. - - Args: - minquality (float): The minimum quality score. - - Raises: - None - - Returns: - int: The number of blocks skipped. - """ bq = self.block_quality if bq() > minquality: return 0 return self._skip_to_block(lambda: bq() <= minquality) def quality(self): - """ - Raises a ValueError indicating that no quality function is given. - - Args: - None - - Raises: - ValueError: No quality function given. - - Returns: - None - """ - raise ValueError("No quality function given") + raise Exception("No quality function given") def block_quality(self): - """ - Raises a ValueError indicating that no block_quality function is given. - - Args: - None - - Raises: - ValueError: No block_quality function given. 
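Whatever the backing storage, a reader following this ``Matcher`` protocol is consumed with the same loop: check ``is_active()``, read ``id()`` and ``weight()``, then call ``next()``. A toy in-memory stand-in, just to show the calling pattern (it is not the real ``FilePostingReader``)::

    class ListMatcher:
        # Minimal stand-in exposing the same read protocol.
        def __init__(self, postings):
            self._postings = postings      # list of (docid, weight) pairs
            self._i = 0

        def is_active(self):
            return self._i < len(self._postings)

        def id(self):
            return self._postings[self._i][0]

        def weight(self):
            return self._postings[self._i][1]

        def next(self):
            self._i += 1

    m = ListMatcher([(0, 1.0), (3, 2.0), (7, 1.0)])
    while m.is_active():
        print(m.id(), m.weight())
        m.next()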
- - Returns: - None - """ - raise ValueError("No block_quality function given") + raise Exception("No block_quality function given") def score(self): - """ - Raises a ValueError indicating that no score function is given. - - Args: - None - - Raises: - ValueError: No score function given. - - Returns: - None - """ - raise ValueError("No score function given") + raise Exception("No score function given") diff --git a/src/whoosh/filedb/filereading.py b/src/whoosh/filedb/filereading.py index b3a320b7..8b3ea6e4 100644 --- a/src/whoosh/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -35,87 +35,7 @@ class SegmentReader(IndexReader): - """ - A class for reading data from a segment in a Whoosh index. - - This class provides methods for accessing various information and data stored in a segment of a Whoosh index. - It is used internally by the Whoosh library and should not be instantiated directly by users. - - Parameters: - - storage (Storage): The storage object representing the index storage. - - segment (Segment): The segment object representing the segment to read from. - - schema (Schema): The schema object representing the index schema. - - Attributes: - - storage (Storage): The storage object representing the index storage. - - segment (Segment): The segment object representing the segment being read from. - - schema (Schema): The schema object representing the index schema. - - termsindex (FileTableReader): The file table reader for the term index. - - postfile (File): The file object for the term postings file. - - vectorindex (StructHashReader): The struct hash reader for the vector index. - - vpostfile (File): The file object for the vector postings file. - - storedfields (FileListReader): The file list reader for the stored fields file. - - fieldlengths (list): A list of field lengths. - - has_deletions (bool): Indicates whether the segment has deletions. - - is_deleted (callable): A callable object that checks if a document is deleted. - - doc_count (int): The number of documents in the segment. - - dc (int): The total number of documents in the segment, including deleted documents. - - is_closed (bool): Indicates whether the segment reader is closed. - - _sync_lock (Lock): A lock object for synchronization. - - Methods: - - _open_vectors(): Opens the vector index and vector postings file. - - _open_postfile(): Opens the term postings file. - - close(): Closes the segment reader. - - doc_count_all(): Returns the total number of documents in the segment. - - stored_fields(docnum): Returns the stored fields for a given document number. - - all_stored_fields(): Returns an iterator over all stored fields in the segment. - - field_length(fieldnum): Returns the length of a field in the segment. - - doc_field_length(docnum, fieldnum, default=0): Returns the length of a field in a document. - - max_field_length(fieldnum): Returns the maximum length of a field in the segment. - - has_vector(docnum, fieldnum): Checks if a document has a vector for a given field. - - __iter__(): Returns an iterator over the terms in the segment. - - iter_from(fieldnum, text): Returns an iterator over the terms starting from a given field and text. - - _term_info(fieldnum, text): Returns the term info for a given field and text. - - doc_frequency(fieldid, text): Returns the document frequency of a term in a field. - - frequency(fieldid, text): Returns the frequency of a term in a field. - - lexicon(fieldid): Returns an iterator over the terms in a field. 
- - expand_prefix(fieldid, prefix): Returns an iterator over the terms with a given prefix in a field. - - postings(fieldid, text, exclude_docs=frozenset()): Returns a posting reader for a term in a field. - - vector(docnum, fieldid): Returns a vector reader for a document and field. - - """ - def __init__(self, storage, segment, schema): - """ - Initialize a Filereading object. - - Args: - storage (Storage): The storage object used to access the index files. - segment (Segment): The segment object representing a segment of the index. - schema (Schema): The schema object representing the index schema. - - Attributes: - storage (Storage): The storage object used to access the index files. - segment (Segment): The segment object representing a segment of the index. - schema (Schema): The schema object representing the index schema. - termsindex (FileTableReader): The file table reader for the term index. - postfile (None or FileTableReader): The file table reader for the term postings file. - vectorindex (None or FileTableReader): The file table reader for the vector index. - vpostfile (None or FileTableReader): The file table reader for the vector postings file. - storedfields (FileListReader): The file list reader for the stored fields file. - fieldlengths (list): The list of field lengths. - has_deletions (bool): Indicates if the segment has deletions. - is_deleted (function): Function to check if a document is deleted. - doc_count (int): The number of documents in the segment. - dc (int): The total number of documents in the segment, including deleted documents. - is_closed (bool): Indicates if the Filereading object is closed. - _sync_lock (Lock): Lock object for synchronization. - - Note: - The Filereading object provides access to various index files and information related to a segment of the index. - It is used internally by the Whoosh library and should not be instantiated directly by the user. - """ self.storage = storage self.segment = segment self.schema = schema @@ -164,27 +84,6 @@ def decode_storedfields(value): self._sync_lock = Lock() def _open_vectors(self): - """ - Opens the vector index and vector postings file. - - This method is responsible for opening the vector index and vector postings file - associated with the current storage and segment. It initializes the `vectorindex` - attribute with a StructHashReader object for reading the vector index, and sets - the `vpostfile` attribute to the opened vector postings file. - - Note: - This method assumes that the `vectorindex_filename` and `vectorposts_filename` - attributes of the segment object have been properly set. - - Args: - None - - Returns: - None - - Raises: - None - """ if self.vectorindex: return @@ -198,21 +97,6 @@ def _open_vectors(self): self.vpostfile = storage.open_file(segment.vectorposts_filename, mapped=False) def _open_postfile(self): - """ - Opens the postfile for reading. - - This method is responsible for opening the postfile associated with the segment - for reading. If the postfile is already open, this method does nothing. - - Returns: - None - - Raises: - None - - Usage: - _open_postfile() - """ if self.postfile: return self.postfile = self.storage.open_file( @@ -220,50 +104,13 @@ def _open_postfile(self): ) def __repr__(self): - """ - Return a string representation of the object. - - This method returns a string that represents the object in a unique and - human-readable format. It is used primarily for debugging and logging - purposes. - - Returns: - str: A string representation of the object. 
- """ return f"{self.__class__.__name__}({self.segment})" @protected def __contains__(self, term): - """ - Check if a term is present in the index. - - Args: - term (tuple): A tuple representing the term to be checked. The tuple should - contain two elements: the first element is the term's numeric - representation, and the second element is the term's string - representation. - - Returns: - bool: True if the term is present in the index, False otherwise. - """ return (self.schema.to_number(term[0]), term[1]) in self.termsindex def close(self): - """ - Closes the file reader and releases any associated resources. - - This method closes the stored fields, terms index, post file, and vector index - if they are open. It also marks the file reader as closed. - - Note: - If the `fieldlengths` attribute is uncommented, it will also be closed. - - Usage: - Call this method when you are finished using the file reader to release - any resources it holds. After calling this method, the file reader should - not be used again. - - """ self.storedfields.close() self.termsindex.close() if self.postfile: @@ -275,267 +122,55 @@ def close(self): self.is_closed = True def doc_count_all(self): - """ - Returns the total number of documents in the index. - - This method retrieves the document count from the index and returns it. - - Returns: - int: The total number of documents in the index. - - Example: - >>> reader = FileReader() - >>> reader.doc_count_all() - 100 - """ return self.dc @protected def stored_fields(self, docnum): - """ - Retrieve the stored fields for a given document number. - - Parameters: - docnum (int): The document number for which to retrieve the stored fields. - - Returns: - dict: A dictionary containing the stored fields for the specified document number. - - Raises: - IndexError: If the specified document number is out of range. - - Example: - >>> reader = FileReading() - >>> reader.stored_fields(0) - {'title': 'Sample Document', 'author': 'John Doe', 'content': 'This is a sample document.'} - """ return self.storedfields[docnum] @protected def all_stored_fields(self): - """ - Generator that yields the stored fields of all non-deleted documents in the segment. - - Yields: - dict: A dictionary containing the stored fields of a document. - - Notes: - - This method iterates over all document numbers in the segment and checks if each document is deleted. - - If a document is not deleted, it yields the stored fields of that document. - - The stored fields are returned as a dictionary. - - Example: - >>> reader = FileReading() - >>> for fields in reader.all_stored_fields(): - ... print(fields) - {'title': 'Document 1', 'content': 'This is the content of document 1'} - {'title': 'Document 2', 'content': 'This is the content of document 2'} - {'title': 'Document 3', 'content': 'This is the content of document 3'} - ... - """ is_deleted = self.segment.is_deleted for docnum in range(self.segment.doc_count_all()): if not is_deleted(docnum): yield self.storedfields[docnum] def field_length(self, fieldnum): - """ - Returns the length of a field in the segment. - - Parameters: - - fieldnum (int): The field number. - - Returns: - - int: The length of the field. - - Raises: - - ValueError: If the field number is invalid. - - This method retrieves the length of a field in the segment. The field number - should be a valid field number. If the field number is invalid, a ValueError - is raised. 
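``all_stored_fields()`` above simply walks every document slot in the segment and skips the ones marked deleted. The same filtering pattern in isolation, with hypothetical in-memory data::

    storedfields = [{"title": "a"}, {"title": "b"}, {"title": "c"}]
    deleted = {1}                             # document number 1 is marked deleted

    def all_stored_fields():
        for docnum in range(len(storedfields)):
            if docnum not in deleted:
                yield storedfields[docnum]

    print(list(all_stored_fields()))          # -> [{'title': 'a'}, {'title': 'c'}]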
- - Example usage: - >>> segment = Segment() - >>> field_length = segment.field_length(0) - >>> print(field_length) - 10 - """ return self.segment.field_length(fieldnum) @protected def doc_field_length(self, docnum, fieldnum, default=0): - """ - Returns the length of a field in a document. - - Parameters: - - docnum (int): The document number. - - fieldnum (int): The field number. - - default (int, optional): The default value to return if the field length is not found. Defaults to 0. - - Returns: - - int: The length of the field in the document. - - Raises: - - IndexError: If the field number is out of range. - - IndexError: If the document number is out of range. - - This method retrieves the length of a field in a document from the internal data structure. - It uses the document number and field number to calculate the position in the fieldlengths array, - and then converts the byte value at that position to the corresponding length using the byte_to_length function. - - Example usage: - ``` - reader = FileReader() - length = reader.doc_field_length(10, 2) - print(length) # Output: 42 - ``` - """ index = self.indices[fieldnum] pos = index * self.dc + docnum return byte_to_length(self.fieldlengths[pos]) def max_field_length(self, fieldnum): - """ - Returns the maximum length of a field in the segment. - - Parameters: - fieldnum (int): The field number. - - Returns: - int: The maximum length of the field. - - Raises: - ValueError: If the field number is invalid. - - This method retrieves the maximum length of a field in the segment. The field number - should be a valid field number within the segment. If the field number is invalid, - a ValueError is raised. - - Example usage: - segment = Segment() - field_length = segment.max_field_length(0) - print(field_length) # Output: 100 - """ return self.segment.max_field_length(fieldnum) @protected def has_vector(self, docnum, fieldnum): - """ - Check if a vector exists for a given document number and field number. - - Parameters: - docnum (int): The document number. - fieldnum (int): The field number. - - Returns: - bool: True if the vector exists, False otherwise. - - Raises: - None - - Notes: - - This method assumes that the vectors have been opened using the _open_vectors() method. - - The vectorindex is a dictionary that stores the document and field numbers as keys, and the vectors as values. - """ self._open_vectors() return (docnum, fieldnum) in self.vectorindex @protected def __iter__(self): - """ - Iterate over the terms index and yield tuples containing file name, term, post count, and total frequency. - - Yields: - tuple: A tuple containing the file name, term, post count, and total frequency. - - Notes: - This method is used to iterate over the terms index in the `filereading` module. The terms index is a list of - tuples, where each tuple contains information about a term in the index. The tuple structure is as follows: - ((file_name, term), (total_frequency, _, post_count)). - - The method iterates over each tuple in the terms index and yields a tuple containing the file name, term, - post count, and total frequency. - - Example: - >>> reader = FileReader() - >>> for file_name, term, post_count, total_freq in reader: - ... print(file_name, term, post_count, total_freq) - """ for (fn, t), (totalfreq, _, postcount) in self.termsindex: yield (fn, t, postcount, totalfreq) @protected def iter_from(self, fieldnum, text): - """ - Iterates over the terms index starting from a specific field number and text. 
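``doc_field_length()`` above indexes one flat array holding a byte-encoded length for every (field, document) pair: the field's index selects a stride of ``doc_count`` slots and the document number selects the slot within it. A sketch of just the indexing, using plain integers in place of the ``byte_to_length()`` decoding::

    doc_count = 4                        # self.dc: documents in the segment
    # One entry per (field, document); field index 0 owns slots 0-3, field index 1 owns 4-7.
    fieldlengths = [3, 0, 2, 1,          # field index 0
                    5, 5, 0, 2]          # field index 1

    def doc_field_length(field_index, docnum):
        pos = field_index * doc_count + docnum
        return fieldlengths[pos]         # the real code decodes this with byte_to_length()

    print(doc_field_length(1, 2))        # -> 0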
- - Args: - fieldnum (int): The field number to start iterating from. - text (str): The text to start iterating from. - - Yields: - tuple: A tuple containing the field number, term, postcount, and total frequency. - - """ tt = self.termsindex for (fn, t), (totalfreq, _, postcount) in tt.items_from((fieldnum, text)): yield (fn, t, postcount, totalfreq) @protected def _term_info(self, fieldnum, text): - """ - Retrieve the term information for a given field and text. - - This method returns the term information (e.g., frequency, positions) for a specific term in a specific field. - It looks up the term in the termsindex dictionary, which is a mapping of (fieldnum, text) tuples to term information. - - Parameters: - - fieldnum (int): The field number of the term. - - text (str): The text of the term. - - Returns: - - TermInfo: An object containing the term information. - - Raises: - - TermNotFound: If the term is not found in the termsindex dictionary. - - Usage: - term_info = _term_info(fieldnum, text) - """ - try: return self.termsindex[(fieldnum, text)] except KeyError: raise TermNotFound(f"{fieldnum}:{text!r}") def doc_frequency(self, fieldid, text): - """ - Returns the document frequency of a given term in a specific field. - - Parameters: - - fieldid (str): The ID of the field. - - text (str): The term to calculate the document frequency for. - - Returns: - - int: The document frequency of the term in the field. - - Raises: - - TermNotFound: If the term is not found in the field. - - This method calculates the document frequency of a given term in a specific field. - It first converts the field ID to a field number using the schema. - Then, it retrieves the term information using the field number and the term. - Finally, it returns the document frequency from the term information. - - Example usage: - ``` - field_id = "content" - term = "python" - frequency = doc_frequency(field_id, term) - print(f"The document frequency of '{term}' in field '{field_id}' is {frequency}.") - ``` - """ try: fieldnum = self.schema.to_number(fieldid) return self._term_info(fieldnum, text)[2] @@ -543,23 +178,6 @@ def doc_frequency(self, fieldid, text): return 0 def frequency(self, fieldid, text): - """ - Returns the frequency of a given term in a specified field. - - Args: - fieldid (str): The ID of the field. - text (str): The term to get the frequency for. - - Returns: - int: The frequency of the term in the field. - - Raises: - TermNotFound: If the term is not found in the field. - - Example: - >>> frequency("title", "python") - 3 - """ try: fieldnum = self.schema.to_number(fieldid) return self._term_info(fieldnum, text)[0] @@ -568,29 +186,10 @@ def frequency(self, fieldid, text): @protected def lexicon(self, fieldid): - """ - Returns an iterator over the terms in the lexicon for the specified field. - - Args: - fieldid (str): The field identifier. - - Yields: - str: The terms in the lexicon for the specified field. + # The base class has a lexicon() implementation that uses iter_from() + # and throws away the value, but overriding to use + # FileTableReader.keys_from() is much, much faster. - Raises: - None. - - Notes: - - This method overrides the base class implementation to use FileTableReader.keys_from() - for faster performance. - - The lexicon is a collection of unique terms in a field. - - The terms are yielded in lexicographic order. 
- - Example: - reader = FileTableReader() - for term in reader.lexicon("content"): - print(term) - """ tt = self.termsindex fieldid = self.schema.to_number(fieldid) for fn, t in tt.keys_from((fieldid, "")): @@ -600,31 +199,10 @@ def lexicon(self, fieldid): @protected def expand_prefix(self, fieldid, prefix): - """ - Expand a prefix in a specific field. - - This method expands a given prefix in a specific field of the index. It uses the `FileTableReader.keys_from()` method for faster performance compared to the base class implementation. - - Parameters: - - fieldid (str): The ID of the field to expand the prefix in. - - prefix (str): The prefix to expand. - - Yields: - - str: The expanded terms that match the given prefix in the specified field. + # The base class has an expand_prefix() implementation that uses + # iter_from() and throws away the value, but overriding to use + # FileTableReader.keys_from() is much, much faster. - Note: - - The `fieldid` parameter should be a valid field ID defined in the schema. - - The `prefix` parameter should be a string representing the prefix to expand. - - Example: - ``` - reader = FileTableReader() - for term in reader.expand_prefix("title", "comp"): - print(term) - ``` - - This will print all the terms in the "title" field that start with the prefix "comp". - """ tt = self.termsindex fieldid = self.schema.to_number(fieldid) for fn, t in tt.keys_from((fieldid, prefix)): @@ -633,43 +211,9 @@ def expand_prefix(self, fieldid, prefix): yield t def postings(self, fieldid, text, exclude_docs=frozenset()): - """ - Returns a postreader object that allows iterating over the postings (document ids) for a given field and text. - - Args: - fieldid (str): The field identifier. - text (str): The text to search for in the field. - exclude_docs (frozenset, optional): A set of document ids to exclude from the postings. Defaults to an empty set. - - Returns: - FilePostingReader: A postreader object that provides access to the postings. - - Raises: - TermNotFound: If the specified term (fieldid:text) is not found in the index. - - Note: - The postreader object returned by this method allows efficient iteration over the postings (document ids) for a given field and text. - It is important to note that the postreader object is not thread-safe and should not be shared across multiple threads. - - Example: - # Create an index and add documents - ix = create_in("indexdir", schema) - writer = ix.writer() - writer.add_document(title="Document 1", content="This is the first document.") - writer.add_document(title="Document 2", content="This is the second document.") - writer.commit() - - # Get the postreader for the "title" field and the term "document" - postreader = ix.reader().postings("title", "document") - - # Iterate over the postings - for docnum in postreader: - print(f"Document ID: {docnum}") - - """ schema = self.schema fieldnum = schema.to_number(fieldid) - format_schema = schema[fieldnum].format + format = schema[fieldnum].format try: offset = self.termsindex[(fieldnum, text)][1] @@ -682,34 +226,21 @@ def postings(self, fieldid, text, exclude_docs=frozenset()): exclude_docs = self.segment.deleted self._open_postfile() - postreader = FilePostingReader(self.postfile, offset, format_schema) + postreader = FilePostingReader(self.postfile, offset, format) # if exclude_docs: # postreader = Exclude(postreader, exclude_docs) return postreader def vector(self, docnum, fieldid): - """ - Returns the vector representation of a document's field. 
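The ``lexicon()``/``expand_prefix()`` overrides above depend on the terms table being sorted by (field number, term): a prefix scan starts at ``(fieldnum, prefix)`` and stops at the first key that no longer matches. A self-contained sketch over a plain sorted list standing in for ``FileTableReader.keys_from()``::

    from bisect import bisect_left

    # Sorted (fieldnum, term) keys, as the terms index stores them.
    keys = [(0, "apple"), (0, "apply"), (0, "banana"), (1, "apple")]

    def expand_prefix(fieldnum, prefix):
        start = bisect_left(keys, (fieldnum, prefix))
        for fn, term in keys[start:]:             # plays the role of keys_from()
            if fn != fieldnum or not term.startswith(prefix):
                return
            yield term

    print(list(expand_prefix(0, "app")))          # -> ['apple', 'apply']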
- - Args: - docnum (int): The document number. - fieldid (str): The field identifier. - - Returns: - FilePostingReader: The reader object for accessing the vector representation of the field. - - Raises: - ValueError: If no vectors are stored for the specified field or if no vector is found for the specified document and field. - """ schema = self.schema fieldnum = schema.to_number(fieldid) vformat = schema[fieldnum].vector if not vformat: - raise ValueError(f"No vectors are stored for field {fieldid!r}") + raise Exception(f"No vectors are stored for field {fieldid!r}") self._open_vectors() offset = self.vectorindex.get((docnum, fieldnum)) if offset is None: - raise ValueError(f"No vector found for document {docnum} field {fieldid!r}") + raise Exception(f"No vector found for document {docnum} field {fieldid!r}") return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py index a8edb967..ba715c9f 100644 --- a/src/whoosh/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -40,25 +40,6 @@ def memoryview_(source, offset=None, length=None): - """ - Create a memoryview object from the given source object. - - Parameters: - - source: The source object to create the memoryview from. - - offset (optional): The starting offset within the source object. If not provided, the memoryview will start from the beginning. - - length (optional): The length of the memoryview. If not provided, the memoryview will extend to the end of the source object. - - Returns: - - mv: The memoryview object created from the source object. - - Usage: - - Create a memoryview from a bytes object: - mv = memoryview_(b'Hello, World!') - - - Create a memoryview from a bytearray object with a specified offset and length: - ba = bytearray(b'Hello, World!') - mv = memoryview_(ba, offset=7, length=5) - """ mv = memoryview(source) if offset or length: return mv[offset : offset + length] @@ -70,48 +51,16 @@ def memoryview_(source, offset=None, length=None): class StorageError(Exception): - """ - Exception raised for errors related to storage operations. - - This exception is raised when there is an error performing operations - related to storage, such as reading or writing files. - - Attributes: - message -- explanation of the error - """ - - def __init__(self, message): - self.message = message - super().__init__(message) + pass class ReadOnlyError(StorageError): - """ - Exception raised when attempting to modify a read-only storage. - - This exception is raised when attempting to modify a storage that has been opened in read-only mode. - It is a subclass of `StorageError` and can be caught separately from other storage-related exceptions. - - Usage: - ------ - When using a storage object, if an attempt is made to modify the storage while it is in read-only mode, - a `ReadOnlyError` will be raised. To handle this exception, you can use a try-except block like this: + pass - try: - # Attempt to modify the storage - storage.modify() - except ReadOnlyError: - # Handle the read-only error - print("The storage is read-only and cannot be modified.") - - """ - def __init__(self, message): - self.message = message - super().__init__(message) +# Base class -# Base class class Storage: """Abstract base class for storage objects. @@ -132,139 +81,56 @@ class Storage: st.create() The :meth:`Storage.create` method makes it slightly easier to swap storage - implementations. The `create()` method handles set-up of the storage - object. 
For example, `FileStorage.create()` creates the directory. A + implementations. The ``create()`` method handles set-up of the storage + object. For example, ``FileStorage.create()`` creates the directory. A database implementation might create tables. This is designed to let you avoid putting implementation-specific setup code in your application. - - Attributes: - readonly (bool): Indicates if the storage object is read-only. - supports_mmap (bool): Indicates if the storage object supports memory-mapped files. - - Methods: - create(): Creates any required implementation-specific resources. - destroy(*args, **kwargs): Removes any implementation-specific resources related to this storage object. - create_index(schema, indexname=_DEF_INDEX_NAME, indexclass=None): Creates a new index in this storage. - open_index(indexname=_DEF_INDEX_NAME, schema=None, indexclass=None): Opens an existing index in this storage. - index_exists(indexname=None): Returns True if a non-empty index exists in this storage. - create_file(name): Creates a file with the given name in this storage. - open_file(name, *args, **kwargs): Opens a file with the given name in this storage. - list(): Returns a list of file names in this storage. - file_exists(name): Returns True if the given file exists in this storage. - file_modified(name): Returns the last-modified time of the given file in this storage. - file_length(name): Returns the size (in bytes) of the given file in this storage. - delete_file(name): Removes the given file from this storage. - rename_file(frm, to, safe=False): Renames a file in this storage. - lock(name): Returns a named lock object. - close(): Closes any resources opened by this storage object. - optimize(): Optimizes the storage object. - temp_storage(name=None): Creates a new storage object for temporary files. - """ readonly = False supports_mmap = False def __iter__(self): - """ - Returns an iterator over the files in the filestore. - - This method returns an iterator that allows iterating over the files - stored in the filestore. It internally calls the `list()` method to - retrieve the list of files. - - Returns: - iterator: An iterator over the files in the filestore. - - Example: - filestore = FileStore() - for file in filestore: - print(file) - """ return iter(self.list()) def __enter__(self): - """ - Creates a new instance of the FileStore object and returns it. - - This method is used in conjunction with the 'with' statement to provide a context manager for the FileStore object. - It ensures that the FileStore is properly created before entering the context and returns the created instance. - - Returns: - FileStore: The created instance of the FileStore object. - - Example: - with FileStore() as fs: - # Perform operations using the FileStore object - """ self.create() return self def __exit__(self, exc_type, exc_val, exc_tb): - """ - Closes the filestore. - - This method is automatically called when exiting a context manager block. - It ensures that the filestore is properly closed, regardless of any exceptions that may have occurred. - - :param exc_type: The type of the exception (if any) that caused the context to be exited. - :param exc_val: The exception instance (if any) that caused the context to be exited. - :param exc_tb: The traceback object (if any) that caused the context to be exited. - """ self.close() def create(self): - """ - Creates any required implementation-specific resources. + """Creates any required implementation-specific resources. 
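Because ``__enter__`` calls ``create()`` and ``__exit__`` calls ``close()``, a storage object can also be used as a context manager (a sketch; the directory name is illustrative)::

    with FileStorage("indexdir") as st:
        print(st.list())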
For example, + a filesystem-based implementation might create a directory, while a + database implementation might create tables. For example:: - This method is used to create the necessary resources for a storage implementation. For example, a filesystem-based implementation might create a directory, while a database implementation might create tables. - - Usage: - ------ - 1. Import the necessary modules: from whoosh.filedb.filestore import FileStorage - - 2. Create a storage object: + # Create a storage object st = FileStorage("indexdir") - - 3. Call the create() method to create the required resources: + # Create any necessary resources st.create() - Returns: - -------- - A Storage instance representing the created resources. + This method returns ``self`` so you can also say:: - Example: - -------- - st = FileStorage("indexdir").create() + st = FileStorage("indexdir").create() - Notes: - ------ - - Storage implementations should be written in such a way that calling create() multiple times on the same storage does not cause any issues. - - The create() method returns the Storage instance itself, allowing method chaining. + Storage implementations should be written so that calling create() a + second time on the same storage - :return: A Storage instance representing the created resources. + :return: a :class:`Storage` instance. """ + return self def destroy(self, *args, **kwargs): - """ - Removes any implementation-specific resources related to this storage + """Removes any implementation-specific resources related to this storage object. For example, a filesystem-based implementation might delete a directory, and a database implementation might drop tables. - :param args: Implementation-specific arguments. - :param kwargs: Implementation-specific keyword arguments. - :return: None - - This method should be called when you want to permanently remove all - resources associated with this storage object. It is implementation-specific, - so the behavior may vary depending on the storage implementation being used. - - Example usage: - >>> store = FileStore() - >>> store.destroy() + The arguments are implementation-specific. """ + pass def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None): @@ -325,12 +191,11 @@ class with this storage object. return indexclass(self, schema=schema, indexname=indexname) def index_exists(self, indexname=None): - """ - Returns True if a non-empty index exists in this storage. + """Returns True if a non-empty index exists in this storage. - :param indexname: (str, optional) The name of the index within the storage object. - You can use this option to store multiple indexes in the same storage. - :return: (bool) True if a non-empty index exists, False otherwise. + :param indexname: the name of the index within the storage object. You + can use this option to store multiple indexes in the same storage. + :rtype: bool """ if indexname is None: @@ -345,77 +210,35 @@ def index_exists(self, indexname=None): return False def create_file(self, name): - """ - Creates a file with the given name in this storage. - - :param name: The name for the new file. - :type name: str - :return: A :class:`whoosh.filedb.structfile.StructFile` instance. - :rtype: whoosh.filedb.structfile.StructFile - :raises NotImplementedError: If the method is not implemented by the subclass. - - This method creates a new file with the specified name in the storage. 
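A minimal sketch of the create/destroy lifecycle described above, using the on-disk implementation (the directory name is illustrative)::

    st = FileStorage("indexdir").create()   # sets up the directory
    # ... create indexes and files here ...
    st.destroy()                            # removes the files and the directory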
It returns - an instance of the `StructFile` class, which provides methods for reading and writing - data to the file. - - Example usage: - >>> storage = FileStorage("/path/to/storage") - >>> file = storage.create_file("example.txt") - >>> file.write("Hello, World!") - >>> file.close() - """ - raise NotImplementedError + """Creates a file with the given name in this storage. - def open_file(self, name, *args, **kwargs): + :param name: the name for the new file. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - Opens a file with the given name in this storage. - :param name: The name of the file to be opened. - :type name: str - :param args: Additional positional arguments to be passed to the file opening mechanism. - :param kwargs: Additional keyword arguments to be passed to the file opening mechanism. - :return: A :class:`whoosh.filedb.structfile.StructFile` instance representing the opened file. - :rtype: whoosh.filedb.structfile.StructFile - :raises NotImplementedError: If the method is not implemented by a subclass. - - This method is used to open a file within the storage. It returns a :class:`whoosh.filedb.structfile.StructFile` - instance that provides file-like operations for reading and writing data. - - Example usage: - - >>> storage = FileStorage('/path/to/storage') - >>> file = storage.open_file('example.txt', mode='r') - >>> content = file.read() - >>> file.close() + raise NotImplementedError - Note that the specific behavior of the `open_file` method may vary depending on the implementation of the storage. - Subclasses of `FileStorage` should override this method to provide the appropriate file opening mechanism. + def open_file(self, name, *args, **kwargs): + """Opens a file with the given name in this storage. + :param name: the name for the new file. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ + raise NotImplementedError def list(self): """Returns a list of file names in this storage. - This method returns a list of file names present in the storage. The storage represents a file system or a similar - file storage mechanism. - - :return: A list of strings representing the file names in the storage. - :rtype: list[str] - - :raises NotImplementedError: If the method is not implemented by a subclass. + :return: a list of strings """ raise NotImplementedError def file_exists(self, name): - """ - Check if the given file exists in this storage. + """Returns True if the given file exists in this storage. - :param name: The name of the file to check. - :type name: str - :return: True if the file exists, False otherwise. + :param name: the name to check. :rtype: bool - :raises NotImplementedError: This method is not implemented in the base class. """ raise NotImplementedError @@ -424,21 +247,8 @@ def file_modified(self, name): """Returns the last-modified time of the given file in this storage (as a "ctime" UNIX timestamp). - :param name: The name of the file to check. - :type name: str - :return: The "ctime" number representing the last-modified time of the file. - :rtype: float - :raises NotImplementedError: This method is not implemented in the base class and should be overridden in subclasses. - - This method returns the last-modified time of the specified file in the storage. - The last-modified time is returned as a "ctime" UNIX timestamp, which represents the number of seconds - since the epoch (January 1, 1970). 
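The file-level methods above are typically used together; a small sketch (the file name is illustrative, and the concrete behaviour depends on the storage implementation)::

    f = st.create_file("hello.bin")          # a StructFile opened for writing
    f.write(b"\x00\x01\x02")
    f.close()

    if st.file_exists("hello.bin"):
        g = st.open_file("hello.bin")        # a StructFile opened for reading
        data = g.read()
        g.close()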
- - Example usage: - >>> storage = FileStorage() - >>> last_modified = storage.file_modified("example.txt") - >>> print(last_modified) - 1629876543.0 + :param name: the name to check. + :return: a "ctime" number. """ raise NotImplementedError @@ -446,157 +256,70 @@ def file_modified(self, name): def file_length(self, name): """Returns the size (in bytes) of the given file in this storage. - :param name: The name of the file to check. - :type name: str - :return: The size of the file in bytes. + :param name: the name to check. :rtype: int - :raises NotImplementedError: If the method is not implemented by a subclass. - - This method returns the size of the file with the given name in the storage. - It is used to determine the size of a file stored in the file storage. - - Example usage: - >>> storage = FileStorage() - >>> file_size = storage.file_length("example.txt") - >>> print(file_size) - 1024 """ raise NotImplementedError def delete_file(self, name): - """ - Removes the given file from this storage. + """Removes the given file from this storage. - :param name: The name of the file to delete. - :type name: str - :raises NotImplementedError: This method is not implemented in the base class. + :param name: the name to delete. """ raise NotImplementedError def rename_file(self, frm, to, safe=False): - """ - Renames a file in this storage. + """Renames a file in this storage. :param frm: The current name of the file. - :type frm: str :param to: The new name for the file. - :type to: str - :param safe: If True, raise an exception if a file with the new name already exists. - :type safe: bool - :raises NotImplementedError: This method is not implemented in the base class. - - This method renames a file in the storage. It takes the current name of the file - (`frm`) and the new name for the file (`to`). By default, if a file with the new - name already exists, it will overwrite the existing file. However, if the `safe` - parameter is set to True, an exception will be raised if a file with the new name - already exists. - - Example usage: - >>> storage = FileStorage() - >>> storage.rename_file("old_file.txt", "new_file.txt") + :param safe: if True, raise an exception if a file with the new name + already exists. """ + raise NotImplementedError def lock(self, name): - """ - Return a named lock object (implementing ``.acquire()`` and ``.release()`` methods). - - Different storage implementations may use different lock types with different guarantees. - For example, the RamStorage object uses Python thread locks, while the FileStorage object - uses filesystem-based locks that are valid across different processes. - - :param name: A name for the lock. This can be any string that uniquely identifies the lock. - :type name: str - :return: A lock-like object that provides the ``acquire()`` and ``release()`` methods. - :rtype: object - - :raises NotImplementedError: This method is meant to be overridden by subclasses. - - Lock objects are used to synchronize access to shared resources, ensuring that only one - thread or process can access the resource at a time. The ``acquire()`` method is used to - acquire the lock, and the ``release()`` method is used to release the lock. - - Example usage: - - >>> store = FileStorage() - >>> lock = store.lock("my_lock") - >>> lock.acquire() - >>> try: - ... # Perform operations on the shared resource - ... pass - ... finally: - ... lock.release() + """Return a named lock object (implementing ``.acquire()`` and + ``.release()`` methods). 
Different storage implementations may use + different lock types with different guarantees. For example, the + RamStorage object uses Python thread locks, while the FileStorage + object uses filesystem-based locks that are valid across different + processes. - Note that the lock object returned by this method may have additional methods or properties - specific to the storage implementation being used. It is recommended to consult the - documentation of the specific storage implementation for more details. + :param name: a name for the lock. + :return: a lock-like object. """ + raise NotImplementedError def close(self): - """Closes any resources opened by this storage object. - - This method is used to release any resources held by the storage object, such as locks or file handles. - It should be called when you are done using the storage object to prevent resource leaks. - - Note: - For some storage implementations, this method may be a no-op and not perform any actions. - However, it is still good practice to call this method to ensure proper cleanup. - - Usage: - storage = FileStorage() - # Perform operations using the storage object - storage.close() - + """Closes any resources opened by this storage object. For some storage + implementations this will be a no-op, but for others it is necessary + to release locks and/or prevent leaks, so it's a good idea to call it + when you're done with a storage object. """ + pass def optimize(self): - """Optimizes the storage object. - - This method is used to optimize the storage object. The specific - implementation of optimization may vary depending on the storage - backend being used. For example, a database implementation might - run a garbage collection procedure on the underlying database. - - This method does not take any arguments and does not return any - values. It performs the optimization operation in-place on the - storage object. - - Usage: - store = FileStore() - store.optimize() - - Note: - The behavior of this method may be different for different - storage backends. It is recommended to consult the documentation - of the specific storage backend for more information on how - optimization is performed. - - Raises: - NotImplementedError: If the storage backend does not support - optimization. + """Optimizes the storage object. The meaning and cost of "optimizing" + will vary by implementation. For example, a database implementation + might run a garbage collection procedure on the underlying database. """ + pass def temp_storage(self, name=None): - """ - Creates a new storage object for temporary files. - - This method creates a new storage object that can be used to store temporary files. The storage object can be accessed using the returned value and can be manipulated using its methods. - - :param name: Optional. A name for the new storage. This parameter may be required or optional depending on the storage implementation. - :type name: str or None - :return: A new storage object for temporary files. - :rtype: Storage - :raises NotImplementedError: This method is not implemented in the current class and should be overridden by subclasses. + """Creates a new storage object for temporary files. You can call + :meth:`Storage.destroy` on the new storage when you're finished with + it. - Example usage: - >>> storage = temp_storage() - >>> # Use the storage object to perform operations on temporary files - >>> storage.destroy() # Clean up the temporary storage when finished + :param name: a name for the new storage. 
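A typical pattern for the named lock described above is acquire/release around the critical section (a sketch; the lock name is illustrative)::

    lock = st.lock("WRITELOCK")
    lock.acquire()
    try:
        pass  # modify files while holding the lock
    finally:
        lock.release()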
This may be optional or + required depending on the storage implementation. + :rtype: :class:`Storage` """ raise NotImplementedError @@ -605,336 +328,63 @@ def temp_storage(self, name=None): class OverlayStorage(Storage): """Overlays two storage objects. Reads are processed from the first if it has the named file, otherwise the second. Writes always go to the second. - - This class provides a way to overlay two storage objects, where the first storage - is used for reading files and the second storage is used for writing files. It is - designed to be used as a storage backend for the Whoosh search engine library. - - Usage: - 1. Create an instance of OverlayStorage by passing two storage objects as arguments. - 2. Use the create_index() method to create an index in the second storage. - 3. Use the open_index() method to open an index in the first storage. - 4. Use the create_file() method to create a file in the second storage. - 5. Use the open_file() method to open a file for reading. If the file exists in the - first storage, it will be read from there, otherwise it will be read from the second - storage. - 6. Use the list() method to get a list of all files in both storages. - 7. Use the file_exists() method to check if a file exists in either storage. - 8. Use the file_modified() method to get the modification time of a file. If the file - exists in the first storage, its modification time will be returned, otherwise the - modification time of the file in the second storage will be returned. - 9. Use the file_length() method to get the length of a file. If the file exists in the - first storage, its length will be returned, otherwise the length of the file in the - second storage will be returned. - 10. Use the delete_file() method to delete a file from the second storage. - 11. Use the lock() method to acquire a lock on a file in the second storage. - 12. Use the close() method to close both storages. - 13. Use the optimize() method to optimize both storages. - 14. Use the temp_storage() method to get a temporary storage object from the second storage. - - Note: The rename_file() method is not implemented and will raise a NotImplementedError if called. """ def __init__(self, a, b): - """ - Initialize a new instance of the Storage class. - - Args: - a: The value for parameter a. - b: The value for parameter b. - """ self.a = a self.b = b def create_index(self, *args, **kwargs): - """ - Create an index in the filestore. - - This method creates an index in the filestore using the provided arguments and keyword arguments. - It delegates the actual index creation to the `create_index` method of the underlying `b` object. - - Parameters: - *args: Variable length argument list. - Positional arguments to be passed to the `create_index` method of the underlying `b` object. - **kwargs: Arbitrary keyword arguments. - Keyword arguments to be passed to the `create_index` method of the underlying `b` object. - - Returns: - None - - Raises: - Any exceptions raised by the `create_index` method of the underlying `b` object. - - Usage: - filestore = FileStore() - filestore.create_index("my_index", schema=my_schema) - """ self.b.create_index(*args, **kwargs) def open_index(self, *args, **kwargs): - """ - Opens an index using the specified arguments and returns the opened index. - - Parameters: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - The opened index. - - Raises: - Any exceptions raised by the underlying implementation. 
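The overlay behaviour described for this class (reads prefer the first storage, writes always go to the second) can be sketched like this, assuming an existing on-disk index overlaid with an in-memory scratch storage::

    base = FileStorage("indexdir")         # existing data, read from first
    scratch = RamStorage()                 # all writes land here
    overlay = OverlayStorage(base, scratch)

    f = overlay.create_file("new.bin")     # created in scratch
    f.write(b"data")
    f.close()
    overlay.open_file("old.bin")           # read from base, assuming it already holds old.bin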
- """ self.a.open_index(*args, **kwargs) def create_file(self, *args, **kwargs): - """ - Create a new file in the filestore. - - This method delegates the creation of the file to the underlying - filestore backend. - - Parameters: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - The created file object. - - Raises: - Any exceptions raised by the underlying filestore backend. - """ return self.b.create_file(*args, **kwargs) def open_file(self, name, *args, **kwargs): - """ - Opens a file with the given name. - - If the file exists in the first file store (self.a), it is opened using the - `open_file` method of the first file store. Otherwise, if the file exists in - the second file store (self.b), it is opened using the `open_file` method of - the second file store. - - Parameters: - name (str): The name of the file to open. - *args: Additional positional arguments to pass to the `open_file` method. - **kwargs: Additional keyword arguments to pass to the `open_file` method. - - Returns: - file-like object: The opened file. - - Raises: - FileNotFoundError: If the file does not exist in either file store. - - Usage: - To open a file, call the `open_file` method with the name of the file as the - first argument. Additional arguments and keyword arguments can be passed to - customize the file opening behavior. - - Example: - file = open_file("example.txt", mode="r") - """ if self.a.file_exists(name): return self.a.open_file(name, *args, **kwargs) else: return self.b.open_file(name, *args, **kwargs) def list(self): - """ - Returns a list of all the files in the filestore. - - This method combines the file lists from two filestores, `a` and `b`, - and removes any duplicates. The resulting list contains all the unique - files from both filestores. - - Returns: - list: A list of file names in the filestore. - - Example: - >>> filestore = FileStore() - >>> filestore.list() - ['file1.txt', 'file2.txt', 'file3.txt'] - """ return list(set(self.a.list()) | set(self.b.list())) def file_exists(self, name): - """ - Check if a file exists in the filestore. - - Parameters: - - name (str): The name of the file to check. - - Returns: - - bool: True if the file exists, False otherwise. - - This method checks if a file exists in the filestore by delegating the check to - both the `a` and `b` filestores. It returns True if the file exists in either of - the filestores, and False otherwise. - """ return self.a.file_exists(name) or self.b.file_exists(name) def file_modified(self, name): - """ - Returns the modified timestamp of a file. - - This method checks if the file exists in the primary file store (self.a). - If the file exists, it retrieves the modified timestamp from the primary file store. - If the file does not exist in the primary file store, it retrieves the modified timestamp from the secondary file store (self.b). - - Parameters: - - name (str): The name of the file. - - Returns: - - int: The modified timestamp of the file. - - """ if self.a.file_exists(name): return self.a.file_modified(name) else: return self.b.file_modified(name) def file_length(self, name): - """ - Returns the length of a file with the given name. - - If the file exists in the primary filestore (self.a), the length of the file is returned. - If the file does not exist in the primary filestore, the length of the file is returned from the secondary filestore (self.b). - - Parameters: - - name (str): The name of the file. - - Returns: - - int: The length of the file. 
- - Example: - >>> store = FileStore() - >>> store.file_length("example.txt") - 1024 - """ if self.a.file_exists(name): return self.a.file_length(name) else: return self.b.file_length(name) def delete_file(self, name): - """ - Deletes a file from the filestore. - - Args: - name (str): The name of the file to delete. - - Returns: - bool: True if the file was successfully deleted, False otherwise. - - Raises: - FileNotFound: If the specified file does not exist in the filestore. - - Example: - >>> filestore = FileStore() - >>> filestore.delete_file("example.txt") - True - """ return self.b.delete_file(name) def rename_file(self, *args, **kwargs): - """ - Renames a file in the file store. - - This method is used to rename a file in the file store. It takes the necessary arguments - to identify the file to be renamed and the new name to assign to it. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Raises: - NotImplementedError: This method is not implemented in the base class and should be - overridden in the derived classes. - - """ raise NotImplementedError def lock(self, name): - """ - Acquires a lock on the specified file. - - Args: - name (str): The name of the file to lock. - - Returns: - bool: True if the lock was successfully acquired, False otherwise. - - Raises: - LockError: If an error occurs while acquiring the lock. - - Notes: - This method delegates the locking operation to the underlying file store. - It is used to prevent concurrent access to the same file by multiple processes. - - Example: - >>> filestore = FileStore() - >>> filestore.lock("example.txt") - True - """ return self.b.lock(name) def close(self): - """ - Closes the filestore by closing the underlying file handles. - - This method should be called when you are finished using the filestore. - It closes the file handles for both the primary and secondary files. - - Note: - After calling this method, any further operations on the filestore - will raise an exception. - - Example: - >>> store = FileStore() - >>> # Perform operations on the filestore - >>> store.close() - - """ self.a.close() self.b.close() def optimize(self): - """ - Optimize the filestore by optimizing both the 'a' and 'b' components. - - This method performs optimization on the filestore by calling the `optimize` method - on both the 'a' and 'b' components. Optimization improves the performance of the - filestore by reorganizing the data and reducing fragmentation. - - Note: - Optimization may take some time to complete, depending on the size of the filestore. - - Usage: - filestore = FileStore() - filestore.optimize() - - """ self.a.optimize() self.b.optimize() def temp_storage(self, name=None): - """ - Returns a temporary storage object. - - This method returns a temporary storage object that can be used to store temporary data. - The `name` parameter is optional and can be used to specify a name for the temporary storage. - - Parameters: - name (str, optional): The name of the temporary storage. Defaults to None. - - Returns: - TempStorage: A temporary storage object. - - Example: - >>> store = filestore.temp_storage(name="my_temp_storage") - >>> store.add("data.txt", "Hello, World!") - >>> store.commit() - """ return self.b.temp_storage(name=name) @@ -945,45 +395,19 @@ class FileStorage(Storage): did not exist. As of version 3, the object does not check if the directory exists at initialization. This change is to support using the :meth:`FileStorage.create` method. 
- - Args: - path (str): A path to a directory. - supports_mmap (bool, optional): If True (the default), use the ``mmap`` module to - open memory mapped files. You can open the storage object with - ``supports_mmap=False`` to force Whoosh to open files normally - instead of with ``mmap``. - readonly (bool, optional): If ``True``, the object will raise an exception if you - attempt to create or rename a file. - debug (bool, optional): If ``True``, enables debug mode. - - Attributes: - folder (str): The path to the directory where the index files are stored. - supports_mmap (bool): If True, the storage object uses memory mapped files. - readonly (bool): If True, the storage object is read-only. - _debug (bool): If True, debug mode is enabled. - locks (dict): A dictionary of file locks. - - Raises: - IOError: If the given path is not a directory. - OSError: If an error occurs while creating or removing the directory. - """ supports_mmap = True def __init__(self, path, supports_mmap=True, readonly=False, debug=False): """ - Initializes a FileStorage object. - - Args: - path (str): A path to a directory. - supports_mmap (bool, optional): If True (the default), use the ``mmap`` module to - open memory mapped files. You can open the storage object with - ``supports_mmap=False`` to force Whoosh to open files normally - instead of with ``mmap``. - readonly (bool, optional): If ``True``, the object will raise an exception if you - attempt to create or rename a file. - debug (bool, optional): If ``True``, enables debug mode. + :param path: a path to a directory. + :param supports_mmap: if True (the default), use the ``mmap`` module to + open memory mapped files. You can open the storage object with + ``supports_mmap=False`` to force Whoosh to open files normally + instead of with ``mmap``. + :param readonly: If ``True``, the object will raise an exception if you + attempt to create or rename a file. """ self.folder = path @@ -1040,17 +464,11 @@ def create(self): return self def destroy(self): + """Removes any files in this storage object and then removes the + storage object's directory. What happens if any of the files or the + directory are in use depends on the underlying platform. """ - Removes any files in this storage object and then removes the storage object's directory. - What happens if any of the files or the directory are in use depends on the underlying platform. - - Raises: - OSError: If an error occurs while removing the directory. - Example: - storage = FileStorage('/path/to/storage') - storage.destroy() - """ # Remove all files self.clean() try: @@ -1064,20 +482,15 @@ def destroy(self): raise e def create_file(self, name, excl=False, mode="wb", **kwargs): + """Creates a file with the given name in this storage. + + :param name: the name for the new file. + :param excl: if True, try to open the file in "exclusive" mode. + :param mode: the mode flags with which to open the file. The default is + ``"wb"``. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - Creates a file with the given name in this storage. - - :param name: The name for the new file. - :type name: str - :param excl: If True, try to open the file in "exclusive" mode. Defaults to False. - :type excl: bool - :param mode: The mode flags with which to open the file. Defaults to "wb". - :type mode: str - :param kwargs: Additional keyword arguments to be passed to the :class:`whoosh.filedb.structfile.StructFile` constructor. 
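As a sketch of the ``readonly`` flag described above, write operations on a read-only FileStorage raise ``ReadOnlyError``::

    st = FileStorage("indexdir", readonly=True)
    try:
        st.create_file("new.bin")
    except ReadOnlyError:
        print("storage is read-only")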
- :return: A :class:`whoosh.filedb.structfile.StructFile` instance representing the created file. - :rtype: whoosh.filedb.structfile.StructFile - :raises ReadOnlyError: If the storage is in read-only mode. - """ + if self.readonly: raise ReadOnlyError @@ -1095,77 +508,21 @@ def create_file(self, name, excl=False, mode="wb", **kwargs): return f def open_file(self, name, **kwargs): + """Opens an existing file in this storage. + + :param name: the name of the file to open. + :param kwargs: additional keyword arguments are passed through to the + :class:`~whoosh.filedb.structfile.StructFile` initializer. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ - Opens an existing file in this storage. - - :param name: The name of the file to open. - :type name: str - :param kwargs: Additional keyword arguments passed to the StructFile initializer. - :type kwargs: dict - :return: An instance of `whoosh.filedb.structfile.StructFile`. - :rtype: whoosh.filedb.structfile.StructFile - :raises FileNotFoundError: If the specified file does not exist. - :raises IOError: If there is an error opening the file. - - This method opens an existing file in the storage and returns an instance of `whoosh.filedb.structfile.StructFile`. - The `StructFile` class provides a file-like interface for reading and writing data to the file. - - Example usage: - >>> storage = FileStorage("/path/to/storage") - >>> file = storage.open_file("example.txt", mode="rb") - >>> data = file.read() - >>> file.close() - - Note that the `name` parameter should be a valid file name within the storage. - Additional keyword arguments are passed through to the `StructFile` initializer, - allowing customization of the file opening behavior (e.g., specifying the file mode). - - It is important to close the file after use to release system resources. - The `StructFile` instance returned by this method provides a `close()` method for this purpose. - """ + f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs) return f def _fpath(self, fname): - """ - Returns the absolute file path for the given filename within the filestore. - - Args: - fname (str): The name of the file. - - Returns: - str: The absolute file path. - - Raises: - None - - Example: - >>> store = FileStore('/path/to/folder') - >>> store._fpath('data.txt') - '/path/to/folder/data.txt' - """ return os.path.abspath(os.path.join(self.folder, fname)) def clean(self, ignore=False): - """ - Remove all files in the filestore. - - Args: - ignore (bool, optional): If True, any OSError raised during file removal will be ignored. - If False (default), an OSError will be raised if any file removal fails. - - Raises: - ReadOnlyError: If the filestore is in read-only mode. - OSError: If an error occurs while removing a file and ignore is set to False. - - Note: - This method is used to clean the filestore by removing all files within it. - It is important to note that this operation cannot be undone. - - Example: - >>> filestore = FileStore('/path/to/folder') - >>> filestore.clean(ignore=True) - """ if self.readonly: raise ReadOnlyError @@ -1179,24 +536,6 @@ def clean(self, ignore=False): raise def list(self): - """ - Returns a list of files in the specified folder. - - This method lists all the files in the folder specified during the initialization - of the FileStore object. - - Returns: - list: A list of file names in the folder. - - Raises: - OSError: If an error occurs while accessing the folder. 
- - Example: - >>> fs = FileStore('/path/to/folder') - >>> files = fs.list() - >>> print(files) - ['file1.txt', 'file2.txt', 'file3.txt'] - """ try: files = os.listdir(self.folder) except OSError: @@ -1205,89 +544,21 @@ def list(self): return files def file_exists(self, name): - """ - Check if a file exists in the filestore. - - Args: - name (str): The name of the file to check. - - Returns: - bool: True if the file exists, False otherwise. - """ return os.path.exists(self._fpath(name)) def file_modified(self, name): - """ - Returns the modification time of the file with the given name. - - Parameters: - - name (str): The name of the file. - - Returns: - - float: The modification time of the file in seconds since the epoch. - - Raises: - - FileNotFoundError: If the file does not exist. - - This method retrieves the modification time of the file specified by the given name. - It uses the os.path.getmtime() function to get the modification time in seconds since the epoch. - If the file does not exist, a FileNotFoundError is raised. - - Example usage: - >>> store = FileStore() - >>> modified_time = store.file_modified("example.txt") - >>> print(modified_time) - 1629876543.0 - """ return os.path.getmtime(self._fpath(name)) def file_length(self, name): - """ - Returns the length of a file in bytes. - - Args: - name (str): The name of the file. - - Returns: - int: The length of the file in bytes. - - Raises: - FileNotFoundError: If the file does not exist. - - """ return os.path.getsize(self._fpath(name)) def delete_file(self, name): - """ - Delete a file from the filestore. - - Args: - name (str): The name of the file to delete. - - Raises: - ReadOnlyError: If the filestore is in read-only mode. - - """ if self.readonly: raise ReadOnlyError os.remove(self._fpath(name)) def rename_file(self, oldname, newname, safe=False): - """ - Renames a file in the filestore. - - Args: - oldname (str): The name of the file to be renamed. - newname (str): The new name for the file. - safe (bool, optional): If True, raises a NameError if the new name already exists. - If False, the existing file with the new name will be overwritten. - - Raises: - ReadOnlyError: If the filestore is in read-only mode. - NameError: If the new name already exists and safe is set to True. - - """ if self.readonly: raise ReadOnlyError @@ -1299,42 +570,9 @@ def rename_file(self, oldname, newname, safe=False): os.rename(self._fpath(oldname), self._fpath(newname)) def lock(self, name): - """ - Acquires a lock for the specified file. - - Args: - name (str): The name of the file to lock. - - Returns: - FileLock: A lock object that can be used to manage the file lock. - - Raises: - OSError: If an error occurs while acquiring the lock. - - Notes: - This method is used to acquire a lock for a specific file in the filestore. - The lock prevents other processes from modifying the file while it is locked. - It is important to release the lock using the `release` method when it is no longer needed. - """ return FileLock(self._fpath(name)) def temp_storage(self, name=None): - """ - Creates a temporary storage file for the filestore. - - Args: - name (str, optional): The name of the temporary storage file. If not provided, a random name will be generated. - - Returns: - FileStorage: The temporary storage file. - - Raises: - OSError: If there is an error creating the temporary storage file. 
- - Example: - >>> filestore = FileStore() - >>> temp_storage = filestore.temp_storage() - """ name = name or f"{random_name()}.tmp" path = os.path.join(self.folder, name) tempstore = FileStorage(path) @@ -1342,185 +580,45 @@ def temp_storage(self, name=None): class RamStorage(Storage): - """Storage object that keeps the index in memory. - - This class provides an implementation of the `Storage` interface that stores the index in memory. - It is suitable for small indexes or for testing purposes. - - Attributes: - files (dict): A dictionary that stores the file content in memory. - locks (dict): A dictionary that stores locks for file access. - folder (str): The folder path associated with the storage. - - Note: - - This implementation does not support memory-mapped files (`supports_mmap` is set to False). - - The `files` dictionary stores the file content as key-value pairs, where the key is the file name and the value is the file content. - - The `locks` dictionary stores locks for file access, where the key is the file name and the value is the lock object. - - The `folder` attribute is not used in this implementation. - - """ + """Storage object that keeps the index in memory.""" supports_mmap = False def __init__(self): - """ - Initialize a FileStore object. - - This class represents a file store that manages a collection of files and their locks. - It provides methods for adding, retrieving, and managing files within the store. - - Attributes: - - files (dict): A dictionary that maps file names to their corresponding file objects. - - locks (dict): A dictionary that maps file names to their corresponding lock objects. - - folder (str): The folder path where the files are stored. - - Usage: - - Create a new FileStore object by calling the constructor. - - Use the `add_file` method to add a file to the store. - - Use the `get_file` method to retrieve a file from the store. - - Use the `lock_file` and `unlock_file` methods to manage file locks. - """ self.files = {} self.locks = {} self.folder = "" def destroy(self): - """ - Deletes all files and locks associated with the file store. - - This method permanently deletes all files and locks associated with the file store. - After calling this method, the file store will be empty and all resources will be released. - - Note: - - Use this method with caution as it irreversibly deletes all files and locks. - - Make sure to close any open indexes before calling this method. - - Raises: - - OSError: If there is an error while deleting the files or locks. - - """ del self.files del self.locks def list(self): - """ - Return a list of all the files stored in the filestore. - - Returns: - list: A list of file names. - """ return list(self.files.keys()) def clean(self): - """ - Removes all files from the filestore. - - This method clears the internal dictionary of files, effectively removing all files from the filestore. - After calling this method, the filestore will be empty. - - Usage: - ram_storage = RamStorage() - ram_storage.clean() - - """ self.files = {} def total_size(self): - """ - Returns the total size of all files in the filestore. - - This method calculates the total size of all files in the filestore by summing the file lengths - of all files returned by the `list()` method. - - Returns: - int: The total size of all files in the filestore. 
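RamStorage keeps everything in memory, which makes it convenient for tests and small indexes; a common pattern (the schema here is illustrative)::

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(id="1", body="hello world")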
- - Example: - >>> filestore = RamStorage() - >>> filestore.total_size() - 1024 - """ return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): - """ - Check if a file with the given name exists in the filestore. - - Parameters: - - name (str): The name of the file to check. - - Returns: - - bool: True if the file exists, False otherwise. - """ return name in self.files def file_length(self, name): - """ - Returns the length of a file in the filestore. - - Args: - name (str): The name of the file. - - Returns: - int: The length of the file in bytes. - - Raises: - NameError: If the file with the given name does not exist in the filestore. - """ if name not in self.files: raise NameError(name) return len(self.files[name]) def file_modified(self, name): - """ - Returns the modification time of the file with the given name. - - Parameters: - - name (str): The name of the file. - - Returns: - - int: The modification time of the file in seconds since the epoch. - - Note: - This method always returns -1, indicating that the modification time is unknown. - """ return -1 def delete_file(self, name): - """ - Delete a file from the filestore. - - Args: - name (str): The name of the file to delete. - - Raises: - NameError: If the specified file does not exist in the filestore. - - Returns: - None - """ if name not in self.files: raise NameError(name) del self.files[name] def rename_file(self, name, newname, safe=False): - """ - Renames a file in the filestore. - - Args: - name (str): The name of the file to be renamed. - newname (str): The new name for the file. - safe (bool, optional): If True, checks if the new name already exists in the filestore before renaming. - Raises an error if the new name already exists. Defaults to False. - - Raises: - NameError: If the file with the given name does not exist in the filestore. - NameError: If the new name already exists in the filestore and safe is True. - - Returns: - None - - """ if name not in self.files: raise NameError(name) if safe and newname in self.files: @@ -1531,31 +629,6 @@ def rename_file(self, name, newname, safe=False): self.files[newname] = content def create_file(self, name, **kwargs): - """ - Create a file in the filestore. - - This method creates a file in the filestore and returns a StructFile object - that can be used to read from and write to the file. - - Parameters: - - name (str): The name of the file to create. - - Returns: - - StructFile: A StructFile object representing the created file. - - Example usage: - >>> filestore = FileStore() - >>> file = filestore.create_file("example.txt") - >>> file.write("Hello, World!") - >>> file.close() - - Note: - - The created file is stored in the `files` dictionary of the FileStore object. - - The file content is stored as a byte string in the `file` attribute of the StructFile object. - - The `onclose_fn` function is called when the StructFile object is closed, and it updates the `files` dictionary with the file content. - - """ - def onclose_fn(sfile): self.files[name] = sfile.file.getvalue() @@ -1563,65 +636,17 @@ def onclose_fn(sfile): return f def open_file(self, name, **kwargs): - """ - Opens a file from the filestore. - - Args: - name (str): The name of the file to open. - - Returns: - BufferFile: The opened file as a BufferFile object. - - Raises: - NameError: If the specified file does not exist in the filestore. 
- """ if name not in self.files: raise NameError(name) buf = memoryview_(self.files[name]) return BufferFile(buf, name=name, **kwargs) def lock(self, name): - """ - Acquires a lock for the given name. - - If a lock for the given name does not exist, a new lock is created and stored in the `locks` dictionary. - Subsequent calls to `lock` with the same name will return the same lock object. - - Parameters: - - name (str): The name of the lock. - - Returns: - - Lock: The lock object associated with the given name. - - Example: - >>> store = RamStorage() - >>> lock1 = store.lock("my_lock") - >>> lock2 = store.lock("my_lock") - >>> lock1 is lock2 - True - """ if name not in self.locks: self.locks[name] = Lock() return self.locks[name] def temp_storage(self, name=None): - """ - Creates a temporary storage for the file. - - Args: - name (str, optional): The name of the temporary file. If not provided, a random name will be generated. - - Returns: - FileStorage: The temporary storage object. - - Raises: - OSError: If there is an error creating the temporary file. - - Example: - >>> store = temp_storage("my_temp_file") - >>> store.write("Hello, World!") - >>> store.close() - """ tdir = tempfile.gettempdir() name = name or f"{random_name()}.tmp" path = os.path.join(tdir, name) @@ -1632,48 +657,6 @@ def temp_storage(self, name=None): def copy_storage(sourcestore, deststore): """Copies the files from the source storage object to the destination storage object using ``shutil.copyfileobj``. - - Parameters: - - sourcestore (object): The source storage object from which files will be copied. - - deststore (object): The destination storage object to which files will be copied. - - Returns: - - None - - Raises: - - None - - Example usage: - ``` - sourcestore = FileStore(...) - deststore = FileStore(...) - copy_storage(sourcestore, deststore) - ``` - - This function iterates over the files in the source storage object and copies each file - to the destination storage object using the `shutil.copyfileobj` function. It is useful - for copying files between different storage objects, such as local file systems or cloud - storage systems. - - Note: Both the source and destination storage objects must implement the following methods: - - `list()`: Returns a list of file names in the storage object. - - `open_file(name)`: Opens the file with the given name in the storage object and returns - a file-like object. - - `create_file(name)`: Creates a new file with the given name in the storage object and - returns a file-like object for writing. - - Example storage object implementation: - ``` - class FileStore: - def list(self): - # implementation - - def open_file(self, name): - # implementation - - def create_file(self, name): - # implementation - ``` """ from shutil import copyfileobj @@ -1686,14 +669,6 @@ def create_file(self, name): def copy_to_ram(storage): """Copies the given FileStorage object into a new RamStorage object. - This function creates a new RamStorage object and copies all the files and directories - from the provided FileStorage object into it. The RamStorage object is an in-memory - storage implementation that allows fast access to the files. - - :param storage: The FileStorage object to be copied. - :type storage: :class:`FileStorage` - - :return: The newly created RamStorage object containing the copied files. 
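For example, the helper above can be used to load an on-disk index into memory for faster read-only searching (the directory name is illustrative)::

    from whoosh.filedb.filestore import FileStorage, copy_to_ram

    ram_storage = copy_to_ram(FileStorage("indexdir"))
    ix = ram_storage.open_index()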
:rtype: :class:`RamStorage` """ diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py index 790653ce..59db4d6d 100644 --- a/src/whoosh/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -43,52 +43,13 @@ class FileFormatError(Exception): - """ - Exception raised when there is an error with the file format. - - This exception is raised when there is an issue with the format of a file being processed. - It can be used to handle specific errors related to file formats in the application. - - Attributes: - message (str): The error message describing the specific file format error. - """ - - def __init__(self, message): - """ - Initialize a new instance of FileFormatError. - - Args: - message (str): The error message describing the specific file format error. - """ - super().__init__(message) + pass # Hash functions def cdb_hash(key): - """ - Implements the CDB hash function. - - This function calculates the hash value of a given key using the CDB hash algorithm. - - Args: - key (str): The key to be hashed. - - Returns: - int: The hash value of the key. - - Notes: - The CDB hash algorithm is a simple and efficient hash function that produces a 32-bit hash value. - It is commonly used in hash-based data structures like CDB (Constant Database) and similar systems. - - Example: - >>> cdb_hash("example") - 123456789 - - References: - - CDB Hash Function: https://cr.yp.to/cdb/cdb.txt - """ h = 5381 for c in key: h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) @@ -96,74 +57,18 @@ def cdb_hash(key): def md5_hash(key): - """ - Implements the MD5 hash function. - - This function takes a key and returns its hash value using the MD5 algorithm. - The hash value is a 32-bit integer. - - Args: - key (bytes or bytearray): The key to be hashed. - - Returns: - int: The hash value of the key. - - Raises: - TypeError: If the key is not of type bytes or bytearray. - - Example: - >>> key = b'my_key' - >>> hash_value = md5_hash(key) - >>> print(hash_value) - 1234567890 - - Note: - This function uses the MD5 algorithm to compute the hash value of the key. - The MD5 algorithm produces a 128-bit hash value, but this function truncates it to a 32-bit integer. - If the Python version is less than 3.9, the `md5` function from the `hashlib` module is used. - Otherwise, the `md5` function is called with the `usedforsecurity=False` argument. - - References: - - Python hashlib module: https://docs.python.org/3/library/hashlib.html - - MD5 algorithm: https://en.wikipedia.org/wiki/MD5 - """ - if not isinstance(key, (bytes, bytearray)): - raise TypeError("Key must be of type bytes or bytearray.") - if sys.version_info < (3, 9): return int(md5(key).hexdigest(), 16) & 0xFFFFFFFF return int(md5(key, usedforsecurity=False).hexdigest(), 16) & 0xFFFFFFFF def crc_hash(key): - """ - Implements the CRC32 hash function. - - This function takes a key as input and returns the hash value of the key using the CRC32 algorithm. - - Args: - key (bytes or bytearray): The key to be hashed. - - Returns: - int: The hash value of the key. - - Example: - >>> key = b"example" - >>> crc_hash(key) - 123456789 - - Note: - The key should be of type bytes or bytearray. If the key is of any other type, a TypeError will be raised. 
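All three hash functions above return 32-bit values; the writer below keys its 256 hash tables off the low eight bits. For example, with ``cdb_hash`` (which, as written here, iterates over a text key)::

    h = cdb_hash("alfa")
    bucket = h & 255    # index of the hash table this key falls into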
- - References: - - CRC32 algorithm: https://en.wikipedia.org/wiki/Cyclic_redundancy_check - - """ return crc32(key) & 0xFFFFFFFF _hash_functions = (md5_hash, crc_hash, cdb_hash) + # Structs # Two uints before the key/value pair giving the length of the key and value @@ -180,110 +85,55 @@ def crc_hash(key): class HashWriter: - """Implements a fast on-disk key-value store. - - This hash writer uses a two-level hashing scheme, where a key is hashed, and the low eight bits of the hash value - are used to index into one of 256 hash tables. It is similar to the CDB algorithm but with some differences. - - The HashWriter object writes all data serially and does not seek backwards to overwrite information at the end. - It supports 64-bit file pointers, allowing for essentially unlimited file length. However, each key and value must - be less than 2 GB in length. - - Usage: - 1. Create an instance of HashWriter by providing a StructFile object to write to, along with optional parameters - like the format tag bytes and the hashing algorithm to use. - 2. Use the `add` method to add key/value pairs to the file. Note that keys do not need to be unique, and multiple - values can be stored under the same key. - 3. Optionally, use the `add_all` method to add a sequence of `(key, value)` pairs. - 4. Call the `close` method to finalize the writing process and return the end position of the file. - - Args: - dbfile (StructFile): A StructFile object to write to. - magic (bytes, optional): The format tag bytes to write at the start of the file. Defaults to b"HSH3". - hashtype (int, optional): An integer indicating which hashing algorithm to use. - Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). Defaults to 0. - - Attributes: - dbfile (StructFile): The StructFile object being written to. - hashtype (int): The hashing algorithm being used. - hashfn (function): The hash function corresponding to the selected algorithm. - extras (dict): A dictionary for subclasses to store extra metadata. - startoffset (int): The starting offset of the file. - - Methods: - tell() -> int: - Returns the current position in the file. - - add(key: bytes, value: bytes) -> None: - Adds a key/value pair to the file. - - add_all(items: Iterable[Tuple[bytes, bytes]]) -> None: - Adds a sequence of `(key, value)` pairs to the file. - - close() -> int: - Finalizes the writing process and returns the end position of the file. + """Implements a fast on-disk key-value store. This hash uses a two-level + hashing scheme, where a key is hashed, the low eight bits of the hash value + are used to index into one of 256 hash tables. This is basically the CDB + algorithm, but unlike CDB this object writes all data serially (it doesn't + seek backwards to overwrite information at the end). + + Also unlike CDB, this format uses 64-bit file pointers, so the file length + is essentially unlimited. However, each key and value must be less than + 2 GB in length. """ def __init__(self, dbfile, magic=b"HSH3", hashtype=0): """ - Initializes a FileTables object. - - :param dbfile: A :class:`~whoosh.filedb.structfile.StructFile` object to write to. - :type dbfile: :class:`~whoosh.filedb.structfile.StructFile` - :param magic: The format tag bytes to write at the start of the file. Default is b"HSH3". - :type magic: bytes, optional - :param hashtype: An integer indicating which hashing algorithm to use. Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). Default is 0. 
- :type hashtype: int, optional + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object + to write to. + :param magic: the format tag bytes to write at the start of the file. + :param hashtype: an integer indicating which hashing algorithm to use. + Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). """ self.dbfile = dbfile self.hashtype = hashtype self.hashfn = _hash_functions[self.hashtype] - self.extras = {} # A place for subclasses to put extra metadata + # A place for subclasses to put extra metadata + self.extras = {} self.startoffset = dbfile.tell() - dbfile.write(magic) # Write format tag - dbfile.write_byte(self.hashtype) # Write hash type - dbfile.write_int(0) # Unused future expansion bits + # Write format tag + dbfile.write(magic) + # Write hash type + dbfile.write_byte(self.hashtype) + # Unused future expansion bits + dbfile.write_int(0) dbfile.write_int(0) - self.buckets = [ - [] for _ in range(256) - ] # 256 lists of hashed keys and positions - self.directory = [] # List to remember the positions of the hash tables + # 256 lists of hashed keys and positions + self.buckets = [[] for _ in range(256)] + # List to remember the positions of the hash tables + self.directory = [] def tell(self): - """ - Returns the current position of the file pointer within the database file. - - :return: The current position of the file pointer. - :rtype: int - """ return self.dbfile.tell() def add(self, key, value): - """Adds a key/value pair to the file. - - This method is used to add a key/value pair to the file. The keys do not need to be unique, - meaning you can store multiple values under the same key. The values are stored in a file - using the specified key. - - Parameters: - - key (bytes): The key associated with the value. It must be of type bytes. - - value (bytes): The value to be stored. It must be of type bytes. - - Returns: - None - - Raises: - AssertionError: If the key or value is not of type bytes. - - Usage: - file_table = FileTable() - file_table.add(b'key1', b'value1') - file_table.add(b'key1', b'value2') - file_table.add(b'key2', b'value3') + """Adds a key/value pair to the file. Note that keys DO NOT need to be + unique. You can store multiple values under the same key and retrieve + them using :meth:`HashReader.all`. """ + assert isinstance(key, bytes) assert isinstance(value, bytes) @@ -299,63 +149,21 @@ def add(self, key, value): self.buckets[h & 255].append((h, pos)) def add_all(self, items): + """Convenience method to add a sequence of ``(key, value)`` pairs. This + is the same as calling :meth:`HashWriter.add` on each pair in the + sequence. """ - Convenience method to add a sequence of ``(key, value)`` pairs to the file table. - - This method allows you to add multiple key-value pairs to the file table at once. - It iterates over the given sequence of ``(key, value)`` pairs and calls the - :meth:`add` method for each pair. - - Parameters: - items (sequence): A sequence of ``(key, value)`` pairs to be added to the file table. - Example: - >>> items = [('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3')] - >>> file_table.add_all(items) - - Note: - - The `items` parameter should be an iterable containing ``(key, value)`` pairs. - - The `key` should be a unique identifier for each value in the file table. - - The `value` can be any object that needs to be associated with the `key`. - """ add = self.add for key, value in items: add(key, value) def _write_hashes(self): - """ - Writes 256 hash tables containing pointers to the key/value pairs. 
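A small sketch of writing a hash file with the methods above (the storage and file names are illustrative)::

    st = FileStorage("indexdir").create()
    f = st.create_file("table.hsh")
    hw = HashWriter(f)
    hw.add(b"alfa", b"1")
    hw.add(b"alfa", b"2")                        # duplicate keys are allowed
    hw.add_all([(b"bravo", b"3"), (b"charlie", b"4")])
    hw.close()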
- - This method is responsible for creating and writing the hash tables to disk. - Each hash table contains pointers to the key/value pairs stored in the database. - - Parameters: - - None - - Returns: - - None - - Usage: - - Call this method to write the hash tables to disk after populating the buckets. - - Algorithm: - - For each bucket in the buckets list: - - Get the start position of the bucket's hash table in the database file. - - Calculate the number of slots in the hash table. - - Append the (start position, number of slots) tuple to the directory list. - - Create an empty hash table with the specified number of slots. - - For each (hash value, key position) tuple in the bucket: - - Calculate the slot index for the entry using bit shifting and wrapping. - - If the slot is already taken, find the next empty slot. - - Insert the entry into the hash table at the calculated slot index. - - Write the hash table for the bucket to the database file. - - Note: - - The hash tables are written in a specific format using the _pointer.pack() method. - - The database file (dbfile) and the null value (representing an empty slot) are used throughout the method. - """ + # Writes 256 hash tables containing pointers to the key/value pairs + dbfile = self.dbfile # Represent and empty slot in the hash table using 0,0 (no key can + # start at position 0 because of the header) null = (0, 0) for entries in self.buckets: @@ -382,72 +190,16 @@ def _write_hashes(self): dbfile.write(_pointer.pack(hashval, position)) def _write_directory(self): - """ - Writes a directory of pointers to the 256 hash tables. - - This method is responsible for writing a directory of pointers to the 256 hash tables - in the database file. Each entry in the directory consists of the position and number - of slots for a hash table. - - Parameters: - None - - Returns: - None - - Raises: - None + # Writes a directory of pointers to the 256 hash tables - Usage: - Call this method to write the directory of pointers to the hash tables in the - database file. - - Example: - _write_directory() - """ dbfile = self.dbfile for position, numslots in self.directory: dbfile.write(_dir_entry.pack(position, numslots)) def _write_extras(self): - """ - Write the extras dictionary to the database file. - - This method serializes and writes the extras dictionary to the database file. - The extras dictionary contains additional metadata or information associated - with the file database. - - Note: - This method should only be called internally by the filetables module. - - Raises: - IOError: If there is an error writing the extras dictionary to the file. - - """ self.dbfile.write_pickle(self.extras) def close(self): - """ - Closes the file database and performs necessary write operations. - - This method is responsible for closing the file database and performing - necessary write operations before closing. It writes hash tables, the - directory of pointers to hash tables, extra information, and the length - of the pickle to the file. - - Returns: - int: The position of the end of the file. - - Usage: - Call this method when you are finished using the file database and - want to close it. It ensures that all necessary write operations are - performed before closing the file. - - Example: - file_db = FileDatabase() - # ... perform operations on the file database ... 
- file_db.close() - """ dbfile = self.dbfile # Write hash tables @@ -469,146 +221,20 @@ def close(self): class HashReader: """Reader for the fast on-disk key-value files created by :class:`HashWriter`. - - This class provides methods to read and retrieve key-value pairs from a - hash file. It is designed to work with files created by the `HashWriter` - class. - - Usage: - ------ - To use the `HashReader` class, you need to provide a file object and - optionally the length of the file data. The file object should be an - instance of `whoosh.filedb.structfile.StructFile`. - - Example: - -------- - # Open a hash file - dbfile = StructFile("data.hash") - reader = HashReader(dbfile) - - # Retrieve a value for a given key - value = reader["key"] - - # Iterate over all key-value pairs - for key, value in reader: - print(key, value) - - # Close the reader - reader.close() - - Parameters: - ----------- - dbfile : whoosh.filedb.structfile.StructFile - A file object to read from. This should be an instance of - `whoosh.filedb.structfile.StructFile`. - length : int, optional - The length of the file data. This is necessary since the hashing - information is written at the end of the file. - magic : bytes, optional - The format tag bytes to look for at the start of the file. If the - file's format tag does not match these bytes, the object raises a - `FileFormatError` exception. - startoffset : int, optional - The starting point of the file data. - - Attributes: - ----------- - dbfile : whoosh.filedb.structfile.StructFile - The file object being read from. - startoffset : int - The starting point of the file data. - is_closed : bool - Indicates whether the reader has been closed. - - Methods: - -------- - open(cls, storage, name) - Convenience method to open a hash file given a - `whoosh.filedb.filestore.Storage` object and a name. This takes care - of opening the file and passing its length to the initializer. - file() - Returns the file object being read from. - close() - Closes the reader. - key_at(pos) - Returns the key bytes at the given position. - key_and_range_at(pos) - Returns a (keybytes, datapos, datalen) tuple for the key at the given - position. - __getitem__(key) - Retrieves the value associated with the given key. - __iter__() - Iterates over all key-value pairs. - __contains__(key) - Checks if the given key exists in the hash file. - keys() - Returns an iterator over all keys. - values() - Returns an iterator over all values. - items() - Returns an iterator over all key-value pairs. - get(key, default=None) - Retrieves the value associated with the given key, or returns the - default value if the key is not found. - all(key) - Returns a generator that yields all values associated with the given - key. - ranges_for_key(key) - Returns a generator that yields (datapos, datalength) tuples - associated with the given key. - range_for_key(key) - Returns the first (datapos, datalength) tuple associated with the - given key. - """ def __init__(self, dbfile, length=None, magic=b"HSH3", startoffset=0): """ - Initializes a FileTables object. - - :param dbfile: A :class:`~whoosh.filedb.structfile.StructFile` object to read from. - :type dbfile: :class:`~whoosh.filedb.structfile.StructFile` - :param length: The length of the file data. This is necessary since the hashing information is written at the end of the file. - :type length: int, optional - :param magic: The format tag bytes to look for at the start of the file. 
If the file's format tag does not match these bytes, the object raises a :class:`~whoosh.filedb.filetables.FileFormatError` exception. - :type magic: bytes, optional - :param startoffset: The starting point of the file data. - :type startoffset: int, optional - - :raises FileFormatError: If the format tag of the file does not match the specified magic bytes. - - The FileTables object represents a file-based hash table. It reads and interprets the data from the provided `dbfile` object. - - The `dbfile` parameter should be an instance of :class:`~whoosh.filedb.structfile.StructFile`, which is a file-like object that supports reading and seeking. - - The `length` parameter is the length of the file data. If not provided, the object will determine the length by seeking to the end of the file and calculating the difference between the current position and the `startoffset`. - - The `magic` parameter is the format tag bytes to look for at the start of the file. If the file's format tag does not match these bytes, a :class:`~whoosh.filedb.filetables.FileFormatError` exception is raised. - - The `startoffset` parameter is the starting point of the file data. If not provided, it defaults to 0. - - After initialization, the FileTables object provides access to the hash tables and other metadata stored in the file. - - Example usage: - - .. code-block:: python - - from whoosh.filedb.structfile import StructFile - from whoosh.filedb.filetables import FileTables - - # Open the file in binary mode - with open("data.db", "rb") as f: - # Create a StructFile object - dbfile = StructFile(f) - # Create a FileTables object - tables = FileTables(dbfile) - - # Access the hash tables - for table in tables.tables: - position, numslots = table - print(f"Table at position {position} with {numslots} slots") - + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object + to read from. + :param length: the length of the file data. This is necessary since the + hashing information is written at the end of the file. + :param magic: the format tag bytes to look for at the start of the + file. If the file's format tag does not match these bytes, the + object raises a :class:`FileFormatError` exception. + :param startoffset: the starting point of the file data. """ + self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False @@ -655,119 +281,37 @@ def open(cls, storage, name): """Convenience method to open a hash file given a :class:`whoosh.filedb.filestore.Storage` object and a name. This takes care of opening the file and passing its length to the initializer. - - :param storage: The storage object representing the file store. - :type storage: whoosh.filedb.filestore.Storage - :param name: The name of the hash file to open. - :type name: str - :return: An instance of the hash file. - :rtype: whoosh.filedb.filetables.HashFile - - :raises FileNotFoundError: If the specified file does not exist. - :raises IOError: If there is an error opening the file. - - Usage: - >>> storage = Storage() - >>> hash_file = HashFile.open(storage, "example.txt") """ + length = storage.file_length(name) dbfile = storage.open_file(name) return cls(dbfile, length) def file(self): - """ - Returns the database file associated with this instance. - - Returns: - str: The path to the database file. - - """ return self.dbfile def _read_extras(self): - """ - Reads the extras from the database file. 
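For orientation, a minimal round trip with the hash-file classes documented above might look like the following sketch. It is illustrative only and not part of the patch: the storage directory and file name are invented, and it assumes the `FileStorage`/`StructFile` behaviour provided by `whoosh.filedb.filestore`.

```python
from whoosh.filedb.filestore import FileStorage
from whoosh.filedb.filetables import HashReader, HashWriter

st = FileStorage("indexdir")              # assumed to be an existing directory
hw = HashWriter(st.create_file("demo.hsh"))
hw.add(b"key", b"value-1")                # keys and values must be bytes
hw.add(b"key", b"value-2")                # duplicate keys are allowed
hw.close()

hr = HashReader.open(st, "demo.hsh")      # convenience constructor shown above
print(list(hr.all(b"key")))               # -> [b'value-1', b'value-2']
hr.close()
```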
- - This method reads the extras stored in the database file and assigns them to the `extras` attribute of the - FileTables object. If an EOFError occurs during the reading process, an empty dictionary is assigned to the - `extras` attribute. - - Returns: - None - - Raises: - None - """ try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): - """ - Closes the file table. - - This method closes the file table by closing the underlying database file. - Once closed, the file table cannot be used for any further operations. - - Raises: - ValueError: If the file table is already closed. - - Usage: - table = FileTable(...) - table.close() - """ if self.is_closed: - raise ValueError(f"Tried to close {self} twice") + raise Exception(f"Tried to close {self!r} twice") self.dbfile.close() self.is_closed = True def key_at(self, pos): - """ - Returns the key bytes at the given position. - - Parameters: - pos (int): The position of the key in the database file. + # Returns the key bytes at the given position - Returns: - bytes: The key bytes at the given position. - - Raises: - IndexError: If the position is out of range. - - Notes: - This method retrieves the key bytes from the database file at the specified position. - The position should be a valid index within the file. - The returned key bytes can be used for further processing or lookups in the database. - - Example: - >>> db = FileTables() - >>> key = db.key_at(10) - """ dbfile = self.dbfile keylen = dbfile.get_uint(pos) return dbfile.get(pos + _lengths.size, keylen) def key_and_range_at(self, pos): - """ - Returns a tuple containing the key, data position, and data length for the key at the given position. - - Parameters: - - pos (int): The position of the key in the database file. - - Returns: - - tuple: A tuple containing the following elements: - - keybytes (bytes): The key as bytes. - - datapos (int): The position of the data in the database file. - - datalen (int): The length of the data. - - Raises: - - None - - Notes: - - This method assumes that the database file is already open and accessible. - - The position should be within the valid range of data in the file. - """ + # Returns a (keybytes, datapos, datalen) tuple for the key at the given + # position dbfile = self.dbfile lenssize = _lengths.size @@ -780,28 +324,8 @@ def key_and_range_at(self, pos): return keybytes, datapos, datalen def _ranges(self, pos=None, eod=None): - """ - Yields a series of (keypos, keylength, datapos, datalength) tuples for the key/value pairs in the file. - - Parameters: - pos (int, optional): The starting position to iterate from. If not provided, it defaults to self.startofdata. - eod (int, optional): The ending position to iterate until. If not provided, it defaults to self.endofdata. - - Yields: - tuple: A tuple containing the key position, key length, data position, and data length. - - Usage: - Use this method to iterate over the key/value pairs in the file. It returns a series of tuples, where each tuple represents a key/value pair in the file. The tuple contains the following information: - - keypos: The position of the key in the file. - - keylen: The length of the key. - - datapos: The position of the data in the file. - - datalen: The length of the data. - - Example: - for keypos, keylen, datapos, datalen in _ranges(): - # Process the key/value pair - ... 
- """ + # Yields a series of (keypos, keylength, datapos, datalength) tuples + # for the key/value pairs in the file dbfile = self.dbfile pos = pos or self.startofdata eod = eod or self.endofdata @@ -816,38 +340,11 @@ def _ranges(self, pos=None, eod=None): pos = datapos + datalen def __getitem__(self, key): - """ - Retrieve the value associated with the given key. - - Args: - key: The key to retrieve the value for. - - Returns: - The value associated with the given key. - - Raises: - KeyError: If the key is not found in the table. - """ for value in self.all(key): return value raise KeyError(key) def __iter__(self): - """ - Iterate over the key-value pairs stored in the file table. - - Yields: - tuple: A tuple containing the key and value of each entry in the file table. - - Raises: - IOError: If there is an error reading the file table. - - Usage: - file_table = FileTable() - for key, value in file_table: - # Process key-value pair - ... - """ dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): key = dbfile.get(keypos, keylen) @@ -855,135 +352,33 @@ def __iter__(self): yield (key, value) def __contains__(self, key): - """ - Check if the given key exists in the file table. - - Parameters: - - key (str): The key to check for existence in the file table. - - Returns: - - bool: True if the key exists in the file table, False otherwise. - - Description: - This method checks if the given key exists in the file table. It iterates over the ranges associated with the key - and returns True if at least one range is found. Otherwise, it returns False. - - Example: - >>> file_table = FileTable() - >>> file_table["key1"] = Range(0, 100) - >>> file_table["key2"] = Range(200, 300) - >>> "key1" in file_table - True - >>> "key3" in file_table - False - """ for _ in self.ranges_for_key(key): return True return False def keys(self): - """ - Retrieve the keys from the file table. - - This method iterates over the file table and yields each key stored in it. - - Yields: - str: The keys stored in the file table. - - """ dbfile = self.dbfile for keypos, keylen, _, _ in self._ranges(): yield dbfile.get(keypos, keylen) def values(self): - """ - Returns an iterator over the values stored in the file table. - - Yields: - bytes: The value stored in the file table. - - Raises: - KeyError: If the file table is empty. - - Notes: - This method iterates over the ranges of data stored in the file table and retrieves - the corresponding values using the `dbfile.get()` method. The values are yielded one - by one, allowing for efficient memory usage when working with large file tables. - - Example: - >>> table = FileTable() - >>> table.add(1, b'value1') - >>> table.add(2, b'value2') - >>> table.add(3, b'value3') - >>> for value in table.values(): - ... print(value) - b'value1' - b'value2' - b'value3' - """ dbfile = self.dbfile for _, _, datapos, datalen in self._ranges(): yield dbfile.get(datapos, datalen) def items(self): - """ - Returns an iterator over the key-value pairs stored in the file table. - - Yields: - tuple: A tuple containing the key and value retrieved from the file table. - - Notes: - This method iterates over the ranges of the file table and retrieves the key-value pairs - using the positions and lengths stored in each range. The key and value are obtained by - calling the `get` method of the `dbfile` object. - - Example: - >>> file_table = FileTable() - >>> for key, value in file_table.items(): - ... 
print(key, value) - """ dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) def get(self, key, default=None): - """ - Retrieve the value associated with the given key. - - This method returns the first value found for the given key in the file table. - If no value is found, it returns the default value provided. - - Parameters: - - key (str): The key to search for in the file table. - - default (Any, optional): The default value to return if no value is found. Defaults to None. - - Returns: - - The value associated with the given key, or the default value if no value is found. - """ for value in self.all(key): return value return default def all(self, key): - """ - Yields a sequence of values associated with the given key. - - Parameters: - - key (str): The key to retrieve values for. - - Returns: - - generator: A generator that yields the values associated with the key. + """Yields a sequence of values associated with the given key.""" - Raises: - - KeyError: If the key is not found in the database. - - Example: - >>> db = FileTables() - >>> db.all("key1") - - >>> list(db.all("key1")) - ['value1', 'value2', 'value3'] - """ dbfile = self.dbfile for datapos, datalen in self.ranges_for_key(key): yield dbfile.get(datapos, datalen) @@ -991,28 +386,6 @@ def all(self, key): def ranges_for_key(self, key): """Yields a sequence of ``(datapos, datalength)`` tuples associated with the given key. - - Args: - key (bytes): The key to search for. Should be of type bytes. - - Yields: - tuple: A tuple containing the data position and data length associated with the key. - - Raises: - TypeError: If the key is not of type bytes. - - Notes: - This method is used to retrieve the data position and data length associated with a given key. - It performs a lookup in the hash table to find the key's slot, and then checks if the key matches - the one stored in the slot. If a match is found, it yields the data position and data length. - - The method assumes that the hash table and data file have been properly initialized. - - Example: - >>> db = FileTables() - >>> key = b'my_key' - >>> for datapos, datalength in db.ranges_for_key(key): - ... print(f"Data position: {datapos}, Data length: {datalength}") """ if not isinstance(key, bytes): @@ -1060,27 +433,6 @@ def ranges_for_key(self, key): slotpos = tablestart def range_for_key(self, key): - """ - Returns the range associated with the given key. - - This method retrieves the range associated with the given key from the file table. - If the key is found, the range is returned. If the key is not found, a KeyError is raised. - - Parameters: - - key (str): The key to search for in the file table. - - Returns: - - range (tuple): The range associated with the given key. - - Raises: - - KeyError: If the key is not found in the file table. - - Example: - >>> table = FileTable() - >>> table.range_for_key('key1') - (0, 100) - """ - for item in self.ranges_for_key(key): return item raise KeyError(key) @@ -1090,44 +442,12 @@ def range_for_key(self, key): class OrderedHashWriter(HashWriter): - """ - Implements an on-disk hash, but requires that keys be added in order. - An OrderedHashReader can then look up "nearest keys" based on the ordering. - - Parameters: - - dbfile (file-like object): The file-like object to write the hash data to. - - Usage: - 1. Create an instance of OrderedHashWriter by providing a file-like object. - 2. 
Use the add() method to add keys and values to the hash in increasing order. - 3. Call the _write_extras() method to write the metadata and index array to the file. - - Example: - ``` - with open("hash.db", "wb") as dbfile: - writer = OrderedHashWriter(dbfile) - writer.add("key1", "value1") - writer.add("key2", "value2") - writer._write_extras() - ``` - - Note: - - Keys must be added in increasing order. If a key is added that is not greater than the previous key, a ValueError will be raised. - - The index array, which contains the positions of all keys, will be stored as metadata in the file. + """Implements an on-disk hash, but requires that keys be added in order. + An :class:`OrderedHashReader` can then look up "nearest keys" based on + the ordering. """ def __init__(self, dbfile): - """ - Initialize a FileTables object. - - Args: - dbfile (str): The path to the database file. - - Attributes: - index (GrowableArray): An array of the positions of all keys. - lastkey (bytes): The last key added. - - """ HashWriter.__init__(self, dbfile) # Keep an array of the positions of all keys self.index = GrowableArray("H") @@ -1135,19 +455,6 @@ def __init__(self, dbfile): self.lastkey = emptybytes def add(self, key, value): - """ - Adds a key-value pair to the hash. - - Parameters: - - key: The key to add. Must be greater than the previous key. - - value: The value associated with the key. - - Raises: - - ValueError: If the key is not greater than the previous key. - - Note: - - The position of the key in the file will be stored in the index array. - """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) @@ -1155,12 +462,6 @@ def add(self, key, value): self.lastkey = key def _write_extras(self): - """ - Writes the metadata and index array to the file. - - Note: - - This method should be called after adding all keys and values to the hash. - """ dbfile = self.dbfile index = self.index @@ -1174,52 +475,12 @@ def _write_extras(self): class OrderedHashReader(HashReader): - """A class for reading an ordered hash file and performing operations on it. - - This class extends the `HashReader` class and provides additional methods - for working with an ordered series of keys in the hash file. - - Methods: - closest_key(key): - Returns the closest key equal to or greater than the given key. If - there is no key in the file equal to or greater than the given key, - returns None. - - ranges_from(key): - Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples - for the ordered series of keys equal or greater than the given key. - - keys_from(key): - Yields an ordered series of keys equal to or greater than the given - key. - - items_from(key): - Yields an ordered series of ``(key, value)`` tuples for keys equal - to or greater than the given key. - - Attributes: - indexbase: - The base position of the index array in the hash file. - - indexlen: - The length of the index array. - - indexsize: - The size of each index element in bytes. - - """ - def closest_key(self, key): + """Returns the closest key equal to or greater than the given key. If + there is no key in the file equal to or greater than the given key, + returns None. """ - Returns the closest key equal to or greater than the given key. If there is no key in the file - equal to or greater than the given key, returns None. - - Parameters: - key (Any): The key to search for. 
- Returns: - Any: The closest key equal to or greater than the given key, or None if no such key exists. - """ pos = self.closest_key_pos(key) if pos is None: return None @@ -1228,29 +489,6 @@ def closest_key(self, key): def ranges_from(self, key): """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples for the ordered series of keys equal or greater than the given key. - - Parameters: - - key (bytes): The key to start the range from. - - Returns: - - Generator: A generator that yields ``(keypos, keylen, datapos, datalen)`` tuples. - - Notes: - - This method returns a generator that iterates over the ordered series of keys in the file table, - starting from the given key and including all keys that are equal or greater. - - Each tuple in the generator represents a range of data associated with a key, where: - - keypos: The position of the key in the file table. - - keylen: The length of the key. - - datapos: The position of the associated data in the file table. - - datalen: The length of the associated data. - - Example: - ``` - file_table = FileTable() - for keypos, keylen, datapos, datalen in file_table.ranges_from(b'my_key'): - # Process the key and associated data - ... - ``` """ pos = self.closest_key_pos(key) @@ -1260,24 +498,8 @@ def ranges_from(self, key): yield from self._ranges(pos=pos) def keys_from(self, key): - """Yields an ordered series of keys equal to or greater than the given key. - - Args: - key: The key to start yielding from. - - Yields: - The keys equal to or greater than the given key. - - Raises: - None. - - Example: - >>> db = FileTables() - >>> for key in db.keys_from('abc'): - ... print(key) - abc - abcd - abcde + """Yields an ordered series of keys equal to or greater than the given + key. """ dbfile = self.dbfile @@ -1287,25 +509,6 @@ def keys_from(self, key): def items_from(self, key): """Yields an ordered series of ``(key, value)`` tuples for keys equal to or greater than the given key. - - Parameters: - - key (bytes): The key to start iterating from. - - Yields: - - tuple: A ``(key, value)`` tuple for each key equal to or greater than the given key. - - Notes: - - This method retrieves the ``(key, value)`` pairs from the file database starting from the given key. - - The keys are ordered in ascending order. - - The values are retrieved from the file database using the key positions and lengths. - - Example: - >>> db = FileTables() - >>> for key, value in db.items_from(b'key1'): - ... print(key, value) - ('key1', 'value1') - ('key2', 'value2') - ('key3', 'value3') """ dbfile = self.dbfile @@ -1313,24 +516,6 @@ def items_from(self, key): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) def _read_extras(self): - """ - Reads the extras from the database file and sets up the necessary variables for reading the index array. - - This method is called internally by the FileTables class. - - Parameters: - - None - - Returns: - - None - - Raises: - - Exception: If the index type is unknown. - - Usage: - - This method should not be called directly. It is called internally by the FileTables class to read the extras - from the database file and set up the necessary variables for reading the index array. - """ dbfile = self.dbfile # Read the extras @@ -1356,33 +541,8 @@ def _read_extras(self): raise Exception(f"Unknown index type {indextype!r}") def closest_key_pos(self, key): - """ - Given a key, return the position of that key OR the next highest key if the given key does not exist. - - Args: - key (bytes): The key to search for. 
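A corresponding sketch for the ordered variant, assuming keys are added in strictly increasing byte order as the writer requires; the directory and file names are again invented for illustration.

```python
from whoosh.filedb.filestore import FileStorage
from whoosh.filedb.filetables import OrderedHashReader, OrderedHashWriter

st = FileStorage("indexdir")
w = OrderedHashWriter(st.create_file("ordered.hsh"))
w.add(b"alfa", b"1")                      # keys must increase...
w.add(b"bravo", b"2")
w.add(b"charlie", b"3")                   # ...or add() raises ValueError
w.close()

r = OrderedHashReader.open(st, "ordered.hsh")
print(r.closest_key(b"b"))                # -> b'bravo'
print(list(r.keys_from(b"b")))            # -> [b'bravo', b'charlie']
r.close()
```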
Should be of type bytes. - - Returns: - int or None: The position of the key in the index array, or None if the key is not found. - - Raises: - TypeError: If the key is not of type bytes. - - Notes: - This method performs a binary search on the positions in the index array to find the closest key. - It assumes that the index array is sorted in ascending order. - - Example: - >>> index = FileTables() - >>> index.closest_key_pos(b'key1') - 0 - >>> index.closest_key_pos(b'key2') - 1 - >>> index.closest_key_pos(b'key3') - 2 - >>> index.closest_key_pos(b'key4') - 2 - """ + # Given a key, return the position of that key OR the next highest key + # if the given key does not exist if not isinstance(key, bytes): raise TypeError(f"Key {key!r} should be bytes") @@ -1413,87 +573,19 @@ def closest_key_pos(self, key): class FieldedOrderedHashWriter(HashWriter): - """ - Implements an on-disk hash, but writes separate position indexes for each field. - - This class is used to write a hash table to disk, where each field has its own position index. - It is designed to work with the `HashReader` class to provide efficient retrieval of values - based on keys. - - Usage: - 1. Create an instance of `FieldedOrderedHashWriter` by passing the `dbfile` parameter, which - represents the file to write the hash table to. - 2. Call the `start_field` method to indicate the start of a new field. Pass the `fieldname` - parameter to specify the name of the field. - 3. Call the `add` method to add a key-value pair to the hash table. The keys must be in increasing - order. If a key is added that is less than or equal to the previous key, a `ValueError` is raised. - 4. Repeat steps 2 and 3 for each field and key-value pair. - 5. Call the `end_field` method to indicate the end of the current field. This will store the - position index for the field in the `fieldmap` dictionary. - 6. After adding all fields and key-value pairs, the hash table can be accessed using the `HashReader` - class. - - Attributes: - - `fieldmap`: A dictionary that maps field names to tuples containing the start position, end position, - length, and typecode of the position index for each field. - - `lastkey`: The last key that was added to the hash table. - - Note: - - This class inherits from the `HashWriter` class, which provides the basic functionality for writing - a hash table to disk. - - Example: - ``` - writer = FieldedOrderedHashWriter(dbfile) - writer.start_field("field1") - writer.add("key1", "value1") - writer.add("key2", "value2") - writer.end_field() - writer.start_field("field2") - writer.add("key3", "value3") - writer.end_field() - # ... - ``` - + """Implements an on-disk hash, but writes separate position indexes for + each field. """ def __init__(self, dbfile): - """ - Initialize a FileTables object. - - Args: - dbfile (str): The path to the database file. - - Attributes: - fieldmap (dict): A dictionary mapping field names to tuples containing - the start position, index position, length, and type code. - lastkey (bytes): The last key added to the FileTables object. - - """ HashWriter.__init__(self, dbfile) # Map field names to (startpos, indexpos, length, typecode) self.fieldmap = self.extras["fieldmap"] = {} + # Keep track of the last key added self.lastkey = emptybytes def start_field(self, fieldname): - """ - Start a new field in the hash table. - - This method is used to initialize a new field in the hash table. It sets the current position in the database file - as the starting position for the field and stores the field name. 
It also initializes an array to keep track of the - positions of all keys associated with this field. - - Args: - fieldname (str): The name of the field. - - Returns: - None - - Example: - To start a new field named "title", you can call this method as follows: - >>> start_field("title") - """ self.fieldstart = self.dbfile.tell() self.fieldname = fieldname # Keep an array of the positions of all keys @@ -1501,33 +593,6 @@ def start_field(self, fieldname): self.lastkey = emptybytes def add(self, key, value): - """ - Add a key-value pair to the hash table. - - Args: - - `key` (int): The key to add. It should be greater than any previously added key. - - `value` (Any): The value associated with the key. - - Raises: - - `ValueError`: If the key is less than or equal to the previous key. - - Returns: - - None - - Notes: - - This method appends the position of the value in the database file to the `poses` list. - - The `HashWriter.add` method is called to actually add the key-value pair to the hash table. - - The `lastkey` attribute is updated with the newly added key. - - Example usage: - ``` - table = FileTable() - table.add(1, "Value 1") - table.add(2, "Value 2") - table.add(3, "Value 3") - ``` - - """ if key <= self.lastkey: raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.poses.append(self.dbfile.tell() - self.fieldstart) @@ -1535,39 +600,6 @@ def add(self, key, value): self.lastkey = key def end_field(self): - """ - End the current field in the hash table. - - This method stores the position index for the field in the `fieldmap` dictionary. - The `fieldmap` dictionary is used to keep track of the start and end positions of each field - in the hash table, as well as the number of positions and the typecode of the positions. - - Usage: - ------ - Call this method after adding all the positions for a field in the hash table. - It will update the `fieldmap` dictionary with the relevant information for the field. - - Example: - -------- - # Create a FileTables object - filetables = FileTables() - - # Add positions for a field - filetables.add_position(1) - filetables.add_position(2) - filetables.add_position(3) - - # End the field and update the fieldmap - filetables.end_field() - - Parameters: - ----------- - None - - Returns: - -------- - None - """ dbfile = self.dbfile fieldname = self.fieldname poses = self.poses @@ -1581,295 +613,66 @@ def end_field(self): class FieldedOrderedHashReader(HashReader): - """ - A subclass of HashReader that provides additional functionality for reading fielded ordered hash data. - - This class extends the HashReader class and adds methods for working with fielded ordered hash data. - It provides methods for iterating over terms, retrieving term data, checking if a term exists, - finding the closest term, and more. - - Usage: - 1. Create an instance of FieldedOrderedHashReader by passing the necessary arguments to the constructor. - 2. Use the various methods provided by this class to interact with the fielded ordered hash data. - - Example: - ``` - reader = FieldedOrderedHashReader(...) - for fieldname, term in reader.iter_terms(): - print(fieldname, term) - ``` - - Args: - *args: Variable length argument list to be passed to the parent class constructor. - **kwargs: Arbitrary keyword arguments to be passed to the parent class constructor. - - Attributes: - fieldmap (dict): A dictionary mapping field names to their corresponding start and end ranges. - fieldlist (list): A sorted list of field names with their start and end ranges. 
- - Methods: - field_start(fieldname): Get the start position of a field. - fielded_ranges(pos=None, eod=None): Generate fielded ranges for the given position range. - iter_terms(): Iterate over the terms in the fielded ordered hash data. - iter_term_items(): Iterate over the term items in the fielded ordered hash data. - contains_term(fieldname, btext): Check if a term exists in the fielded ordered hash data. - range_for_term(fieldname, btext): Get the range (position and length) of a term in the fielded ordered hash data. - term_data(fieldname, btext): Get the data associated with a term in the fielded ordered hash data. - term_get(fieldname, btext, default=None): Get the data associated with a term, or a default value if the term does not exist. - closest_term_pos(fieldname, key): Get the position of the closest term to the given key. - closest_term(fieldname, btext): Get the closest term to the given term in the fielded ordered hash data. - term_ranges_from(fieldname, btext): Generate term ranges starting from the given term in the fielded ordered hash data. - terms_from(fieldname, btext): Iterate over the terms starting from the given term in the fielded ordered hash data. - term_items_from(fieldname, btext): Iterate over the term items starting from the given term in the fielded ordered hash data. - """ - def __init__(self, *args, **kwargs): - """ - Initialize the FileTables object. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Raises: - None. - - Returns: - None. - - Notes: - This method initializes the FileTables object by calling the __init__ method of the HashReader class. - It also sets the fieldmap attribute using the extras dictionary passed as a keyword argument. - The fieldmap is a dictionary that maps field names to their corresponding start position, index position, and other information. - The fieldlist attribute is then created as a sorted list of tuples, where each tuple contains the field name, start position, and index position. - - Usage: - filetables = FileTables(*args, **kwargs) - """ HashReader.__init__(self, *args, **kwargs) self.fieldmap = self.extras["fieldmap"] # Make a sorted list of the field names with their start and end ranges self.fieldlist = [] for fieldname in sorted(self.fieldmap.keys()): - startpos, ixpos, _, __ = self.fieldmap[fieldname] + startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] self.fieldlist.append((fieldname, startpos, ixpos)) def field_start(self, fieldname): - """ - Returns the start position of the specified field in the file. - - Parameters: - fieldname (str): The name of the field. - - Returns: - int: The start position of the field in the file. - - Raises: - KeyError: If the specified fieldname does not exist in the fieldmap. - - Example: - >>> field_start('title') - 10 - - Note: - The start position of a field represents the byte offset in the file where the field's data begins. - This method is used internally by the filetables module to retrieve the start position of a field. - """ return self.fieldmap[fieldname][0] def fielded_ranges(self, pos=None, eod=None): - """ - Generator that yields field information for each key-value pair in the filetable. - - Args: - pos (int, optional): The starting position to iterate from. Defaults to None. - eod (int, optional): The ending position to iterate until. Defaults to None. - - Yields: - tuple: A tuple containing the field name, key position, key length, data position, and data length. 
- - Raises: - IndexError: If the starting position is out of range. - - Notes: - - This method is used to iterate over the field information of each key-value pair in the filetable. - - The field information includes the field name, key position, key length, data position, and data length. - - If the starting position is not specified, the iteration starts from the beginning of the filetable. - - If the ending position is not specified, the iteration continues until the end of the filetable. - - If the starting position is out of range, an IndexError is raised. - """ flist = self.fieldlist fpos = 0 - fieldname, _, end = flist[fpos] + fieldname, start, end = flist[fpos] for keypos, keylen, datapos, datalen in self._ranges(pos, eod): if keypos >= end: fpos += 1 - fieldname, _, end = flist[fpos] + fieldname, start, end = flist[fpos] yield fieldname, keypos, keylen, datapos, datalen def iter_terms(self): - """ - Iterates over the terms in the filetable. - - Yields tuples containing the field name and the term value for each term in the filetable. - - Returns: - Iterator[tuple]: An iterator over the terms in the filetable. - - Notes: - This method retrieves the terms from the filetable using the `get` method of the `dbfile` object. - It iterates over the fielded ranges in the filetable and yields tuples containing the field name - and the term value for each term. - - Example: - >>> for fieldname, term in filetable.iter_terms(): - ... print(fieldname, term) - """ get = self.dbfile.get for fieldname, keypos, keylen, _, _ in self.fielded_ranges(): yield fieldname, get(keypos, keylen) def iter_term_items(self): - """ - Iterates over the term items in the file table. - - Yields tuples containing the field name, key, and data for each term item. - - Parameters: - - None - - Returns: - - Generator: A generator that yields tuples of the form (fieldname, key, data). - - Example usage: - ``` - for fieldname, key, data in iter_term_items(): - # Process the fieldname, key, and data - ... - ``` - """ get = self.dbfile.get for item in self.fielded_ranges(): fieldname, keypos, keylen, datapos, datalen = item yield fieldname, get(keypos, keylen), get(datapos, datalen) def contains_term(self, fieldname, btext): - """ - Checks if the given term exists in the specified field. - - Parameters: - fieldname (str): The name of the field to search in. - btext (bytes): The term to search for, encoded as bytes. - - Returns: - bool: True if the term exists in the field, False otherwise. - - Raises: - KeyError: If the field or term does not exist. - - Example: - >>> table = FileTables() - >>> table.contains_term("title", b"example") - True - """ try: - _ = self.range_for_term(fieldname, btext) + x = self.range_for_term(fieldname, btext) return True except KeyError: return False def range_for_term(self, fieldname, btext): - """ - Returns the range (datapos, datalen) for a given term in a specific field. - - Args: - fieldname (str): The name of the field. - btext (bytes): The term to search for. - - Returns: - tuple: A tuple containing the data position (datapos) and data length (datalen) for the term. - - Raises: - KeyError: If the term is not found in the field. - - """ - start, ixpos, _, __ = self.fieldmap[fieldname] + start, ixpos, ixsize, code = self.fieldmap[fieldname] for datapos, datalen in self.ranges_for_key(btext): if start < datapos < ixpos: return datapos, datalen raise KeyError((fieldname, btext)) def term_data(self, fieldname, btext): - """ - Retrieve the data associated with a term in a specific field. 
- - Args: - fieldname (str): The name of the field. - btext (bytes): The term to retrieve the data for. - - Returns: - bytes: The data associated with the term. - - Raises: - KeyError: If the term or field does not exist. - - Notes: - This method retrieves the data associated with a term in a specific field - from the file database. It uses the `range_for_term` method to determine - the position and length of the data in the database file, and then retrieves - the data using the `get` method of the `dbfile` object. - - Example usage: - ``` - fieldname = "title" - term = b"example" - data = term_data(fieldname, term) - print(data) - ``` - """ datapos, datalen = self.range_for_term(fieldname, btext) return self.dbfile.get(datapos, datalen) def term_get(self, fieldname, btext, default=None): - """ - Retrieve the term data for a given field and term text. - - Args: - fieldname (str): The name of the field. - btext (bytes): The term text in bytes. - default: The value to return if the term data is not found. - - Returns: - The term data for the given field and term text, or the default value if not found. - """ try: return self.term_data(fieldname, btext) except KeyError: return default def closest_term_pos(self, fieldname, key): - """ - Given a key, return the position of that key OR the next highest key if the given key does not exist. - - Args: - fieldname (str): The name of the field. - key (bytes): The key to search for. - - Returns: - int or None: The position of the key in the index array, or None if the key is not found. - - Raises: - TypeError: If the key is not of type bytes. - ValueError: If the index type is unknown. - - Note: - This method assumes that the index array is sorted in ascending order. - - Example: - >>> db = FileTables() - >>> db.closest_term_pos("title", b"apple") - 10 - """ + # Given a key, return the position of that key OR the next highest key + # if the given key does not exist if not isinstance(key, bytes): raise TypeError(f"Key {key!r} should be bytes") @@ -1888,7 +691,7 @@ def closest_term_pos(self, fieldname, key): elif ixtype == "q": get_pos = dbfile.get_long else: - raise ValueError(f"Unknown index type {ixtype}") + raise Exception(f"Unknown index type {ixtype!r}") # Do a binary search of the positions in the index array lo = 0 @@ -1908,82 +711,25 @@ def closest_term_pos(self, fieldname, key): return startpos + get_pos(ixpos + lo * ixsize) def closest_term(self, fieldname, btext): - """ - Returns the closest term to the given text in the specified field. - - Args: - fieldname (str): The name of the field to search in. - btext (bytes): The text to find the closest term for. - - Returns: - str or None: The closest term to the given text in the specified field, - or None if no term is found. - - """ pos = self.closest_term_pos(fieldname, btext) if pos is None: return None return self.key_at(pos) def term_ranges_from(self, fieldname, btext): - """ - Returns a generator that yields term ranges for a given field and binary text. - - Args: - fieldname (str): The name of the field. - btext (bytes): The binary text to search for. - - Yields: - tuple: A tuple representing a term range. Each tuple contains two integers, - representing the start and end positions of the term in the index. - - Returns None if no term is found for the given field and binary text. 
- """ - pos = self.closest_term_pos(fieldname, btext) if pos is None: return - _, ixpos, __, ___ = self.fieldmap[fieldname] + startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] yield from self._ranges(pos, ixpos) def terms_from(self, fieldname, btext): - """ - Retrieves terms from the specified field that match the given binary text. - - Args: - fieldname (str): The name of the field to retrieve terms from. - btext (bytes): The binary text to match against the terms. - - Yields: - bytes: The terms that match the given binary text. - - """ dbfile = self.dbfile for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext): yield dbfile.get(keypos, keylen) def term_items_from(self, fieldname, btext): - """ - Retrieves term items from the file database for a given field and binary text. - - Args: - fieldname (str): The name of the field to retrieve term items from. - btext (bytes): The binary text to match against. - - Yields: - tuple: A tuple containing the key and data associated with each term item. - - Returns: - None - - Raises: - None - - Example: - >>> for key, data in term_items_from("title", b"example"): - ... print(key, data) - """ dbfile = self.dbfile for item in self.term_ranges_from(fieldname, btext): keypos, keylen, datapos, datalen = item diff --git a/src/whoosh/filedb/filewriting.py b/src/whoosh/filedb/filewriting.py index 91ca817c..36060a68 100644 --- a/src/whoosh/filedb/filewriting.py +++ b/src/whoosh/filedb/filewriting.py @@ -42,52 +42,16 @@ def NO_MERGE(ix, writer, segments): - """ - This policy does not merge any existing segments. - - Parameters: - - ix (Index): The index object. - - writer (IndexWriter): The index writer object. - - segments (list): The list of existing segments. - - Returns: - - list: The list of existing segments, unchanged. - - Usage: - - Use this policy when you want to prevent any merging of existing segments in the index. - - This can be useful in scenarios where you want to maintain the original segment structure without any merging. - """ + """This policy does not merge any existing segments.""" _ = ix, writer return segments def MERGE_SMALL(ix, writer, segments): + """This policy merges small segments, where "small" is defined using a + heuristic based on the fibonacci sequence. """ - Merge small segments based on a heuristic using the Fibonacci sequence. - - This policy merges small segments, where "small" is defined using a heuristic based on the Fibonacci sequence. - The segments are sorted based on their document count, and then merged according to the heuristic. - - Parameters: - - ix (Index): The Whoosh index object. - - writer (IndexWriter): The writer object used for merging segments. - - segments (list): A list of segments to be merged. - - Returns: - - newsegments (SegmentSet): The merged segments. - - Usage: - - Call this function to merge small segments in an index. Pass the index object, writer object, and the list of segments to be merged. - - The function will merge the segments based on the Fibonacci sequence heuristic and return the merged segments. - - Example: - ``` - ix = Index("/path/to/index") - writer = ix.writer() - segments = [segment1, segment2, segment3] - newsegments = MERGE_SMALL(ix, writer, segments) - ``` - """ + from whoosh.filedb.filereading import SegmentReader newsegments = SegmentSet() @@ -104,31 +68,8 @@ def MERGE_SMALL(ix, writer, segments): def OPTIMIZE(ix, writer, segments): - """ - Merge all existing segments into a single segment. 
- - This function merges all the segments specified in the `segments` list into a single segment. - It uses the `writer` object to add a reader for each segment, and then returns an empty `SegmentSet`. + """This policy merges all existing segments.""" - Parameters: - - ix (Index): The index object. - - writer (IndexWriter): The index writer object. - - segments (list): A list of segment names to be merged. - - Returns: - - SegmentSet: An empty `SegmentSet` object. - - Example: - >>> ix = Index(...) - >>> writer = IndexWriter(...) - >>> segments = ['segment1', 'segment2', 'segment3'] - >>> OPTIMIZE(ix, writer, segments) - - - Note: - - This function assumes that the `SegmentReader` class is imported from `whoosh.filedb.filereading`. - - The `SegmentSet` object returned by this function is not used or modified further in the code snippet provided. - """ from whoosh.filedb.filereading import SegmentReader for seg in segments: @@ -137,78 +78,6 @@ def OPTIMIZE(ix, writer, segments): class SegmentWriter(SegmentDeletionMixin, IndexWriter): - """A class for writing segments in an index. - - This class is responsible for writing segments in an index. It handles the creation - of temporary segment files, writing term indexes, term postings, vector indexes, - vector postings, stored fields, and field lengths. - - Parameters: - - ix (Index): The index to write the segment to. - - poolclass (class, optional): The class to use for the pool. Defaults to None. - - procs (int, optional): The number of processes to use for the pool. Defaults to 0. - - blocklimit (int, optional): The block limit for the posting writer. Defaults to 128. - - timeout (float, optional): The timeout for acquiring the lock. Defaults to 0.0. - - delay (float, optional): The delay between attempts to acquire the lock. Defaults to 0.1. - - **poolargs: Additional keyword arguments to pass to the pool class. - - Attributes: - - lock (Lock): The lock object used to acquire the lock for writing the segment. - - index (Index): The index to write the segment to. - - segments (list): The list of segments in the index. - - blocklimit (int): The block limit for the posting writer. - - schema (Schema): The schema of the index. - - name (str): The name of the segment. - - _searcher (Searcher): The searcher object for the index. - - docnum (int): The document number. - - fieldlength_totals (defaultdict): The total field lengths. - - termsindex (FileTableWriter): The file table writer for the terms index. - - postwriter (FilePostingWriter): The file posting writer for the term postings. - - vectorindex (StructHashWriter): The struct hash writer for the vector index. - - vpostwriter (FilePostingWriter): The file posting writer for the vector postings. - - storedfields (FileListWriter): The file list writer for the stored fields. - - fieldlengths (File): The file for the field lengths. - - pool (Pool): The pool object for the field lengths. - - Methods: - - searcher(): Returns a searcher object for the index. - - add_reader(reader): Adds a reader object to the segment writer. - - add_document(**fields): Adds a document to the segment writer. - - _add_stored_fields(storeddict): Adds stored fields to the segment writer. - - _add_vector(fieldnum, vlist): Adds a vector to the segment writer. - - _close_all(): Closes all files used by the segment writer. - - commit(mergetype=MERGE_SMALL): Commits the segment writer and releases the lock. - - cancel(): Cancels the segment writer and releases the lock. - - Usage: - 1. 
Create an instance of SegmentWriter by providing the index to write the segment to. - 2. Optionally, you can specify the pool class, the number of processes to use for the pool, - the block limit for the posting writer, the timeout for acquiring the lock, and the delay - between attempts to acquire the lock. - 3. Use the various methods provided by SegmentWriter to add documents, stored fields, and vectors - to the segment writer. - 4. Call the commit() method to commit the segment writer and release the lock. - 5. If needed, you can cancel the segment writer and release the lock by calling the cancel() method. - - Example: - ```python - from whoosh import index - from whoosh.filedb.filewriting import SegmentWriter - - # Open an existing index - ix = index.open_dir("my_index") - - # Create a SegmentWriter - writer = SegmentWriter(ix) - - # Add a document to the segment writer - writer.add_document(title="Example Document", content="This is an example document.") - - # Commit the segment writer - writer.commit() - ``` - """ - def __init__( self, ix, @@ -219,26 +88,6 @@ def __init__( delay=0.1, **poolargs, ): - """ - Initialize a FileWriter object. - - Parameters: - - ix (Index): The index object to write to. - - poolclass (class, optional): The class to use for multiprocessing. If not provided, it defaults to MultiPool if procs > 1, otherwise TempfilePool. - - procs (int, optional): The number of processes to use for multiprocessing. Defaults to 0, which means no multiprocessing. - - blocklimit (int, optional): The maximum number of documents to write in a single block. Defaults to 128. - - timeout (float, optional): The maximum time to wait for acquiring the lock. Defaults to 0.0, which means no timeout. - - delay (float, optional): The delay between attempts to acquire the lock. Defaults to 0.1 seconds. - - **poolargs (dict, optional): Additional keyword arguments to pass to the poolclass constructor. - - Raises: - - LockError: If the lock cannot be acquired within the specified timeout. - - Usage: - - Create an instance of FileWriter by passing an Index object. - - Optionally, specify the poolclass, procs, blocklimit, timeout, delay, and additional poolargs. - - Use the FileWriter object to write documents to the index. - """ self.lock = ix.storage.lock(ix.indexname + "_LOCK") if not try_for(self.lock.acquire, timeout=timeout, delay=delay): raise LockError @@ -302,52 +151,9 @@ def encode_storedfields(fielddict): self.pool = poolclass(self.fieldlengths, procs=procs, **poolargs) def searcher(self): - """ - Returns a searcher object for the index. - - This method creates and returns a searcher object that can be used to search the index. - The searcher object provides methods for executing queries and retrieving search results. - - Returns: - Searcher: A searcher object for the index. - - Example: - >>> index = Index() - >>> writer = index.writer() - >>> # ... add documents to the index ... - >>> searcher = writer.searcher() - >>> results = searcher.search(Query("hello")) - """ return self.index.searcher() def add_reader(self, reader): - """ - Adds documents from the given reader to the index. - - Parameters: - - reader (Reader): The reader object containing the documents to be added. - - This method adds stored documents, vectors, and field lengths from the given reader - to the index. It also handles deletions, if any, and updates the document mapping accordingly. 
- - Note: - - The reader object must implement the following methods: - - `has_deletions()`: Returns True if the reader has deleted documents, False otherwise. - - `doc_count_all()`: Returns the total number of documents in the reader. - - `is_deleted(docnum)`: Returns True if the document with the given docnum is deleted, False otherwise. - - `stored_fields(docnum)`: Returns the stored fields of the document with the given docnum. - - `scorable_fields()`: Returns a list of field numbers that are scorable. - - `doc_field_length(docnum, fieldnum)`: Returns the length of the field with the given fieldnum in the document with the given docnum. - - `has_vector(docnum, fieldnum)`: Returns True if the document with the given docnum has a vector for the field with the given fieldnum, False otherwise. - - `vector(docnum, fieldnum)`: Returns the vector for the field with the given fieldnum in the document with the given docnum. - - `postings(fieldnum, text)`: Returns a Postings object for the given fieldnum and text. - - Returns: - None - - Raises: - None - """ startdoc = self.docnum has_deletions = reader.has_deletions() @@ -397,27 +203,6 @@ def add_reader(self, reader): self.pool.add_posting(fieldnum, text, newdoc, freq, valuestring) def add_document(self, **fields): - """ - Add a document to the index. - - Args: - **fields: Keyword arguments representing the fields of the document. - The field names should match the names defined in the schema. - - Raises: - UnknownFieldError: If a field name provided does not exist in the schema. - - Notes: - - The fields are sorted based on their order in the schema. - - The indexed fields are added to the index. - - The vector fields are processed and added to the index. - - The stored fields are stored in the index. - - Example: - schema = Schema(title=TEXT(stored=True), content=TEXT) - writer = IndexWriter(index, schema) - writer.add_document(title="Document 1", content="This is the content of Document 1") - """ schema = self.schema name2num = schema.name_to_number @@ -465,46 +250,9 @@ def add_document(self, **fields): self.docnum += 1 def _add_stored_fields(self, storeddict): - """ - Adds a stored field dictionary to the list of stored fields. - - Args: - storeddict (dict): A dictionary containing the stored field data. - - Returns: - None - - Notes: - - The stored field dictionary should contain key-value pairs representing the field name and its value. - - The stored fields are used to store additional data associated with a document. - - The stored fields can be retrieved later during search or retrieval operations. - - Example: - storeddict = {"title": "Sample Document", "author": "John Doe"} - _add_stored_fields(storeddict) - """ self.storedfields.append(storeddict) def _add_vector(self, fieldnum, vlist): - """ - Add a vector to the index for a given field. - - Args: - fieldnum (int): The field number. - vlist (list): A list of tuples containing the text and valuestring for each vector. - - Raises: - AssertionError: If the text is not of type unicode. - - Notes: - This method adds a vector to the index for a specific field. It takes a list of tuples, where each tuple contains the text and valuestring for a vector. The text should be of type unicode. - - The method uses the vpostwriter to write the vectors to the index file. It starts by obtaining the vformat from the schema for the given field. It then iterates over the vlist and writes each vector to the vpostwriter. 
Finally, it finishes writing the vectors and adds the vector offset to the vectorindex. - - Example: - vlist = [(u"example text", "valuestring1"), (u"another text", "valuestring2")] - _add_vector(0, vlist) - """ vpostwriter = self.vpostwriter vformat = self.schema[fieldnum].vector @@ -517,28 +265,6 @@ def _add_vector(self, fieldnum, vlist): self.vectorindex.add((self.docnum, fieldnum), offset) def _close_all(self): - """ - Closes all the file resources used by the writer. - - This method is responsible for closing the terms index, post writer, vector index, - vpost writer, stored fields, and field lengths. It ensures that all the resources - are properly closed to prevent any data corruption or resource leaks. - - Usage: - Call this method when you are done writing to the index and want to release - the file resources. It is important to call this method to ensure that all - changes are persisted and the files are closed properly. - - Note: - - If the vector index or vpost writer is not initialized, they will not be closed. - - The field lengths are only closed if they are not already closed. - - Raises: - None - - Returns: - None - """ self.termsindex.close() self.postwriter.close() if self.vectorindex: @@ -550,22 +276,6 @@ def _close_all(self): self.fieldlengths.close() def commit(self, mergetype=MERGE_SMALL): - """ - Commits the changes made by the writer to the index. - - This method finalizes the changes made by the writer and commits them to the index. - It performs the following steps: - 1. Calls the merge policy function to determine if any segments need to be merged into the writer's pool. - 2. Informs the pool to add its accumulated data to the terms index and posting file. - 3. Creates a new segment object for the segment created by this writer and adds it to the list of remaining segments. - 4. Closes all files, writes a new TOC (Table of Contents) with the updated segment list, and releases the lock. - - Parameters: - - mergetype (optional): The merge policy function to be used for determining which segments to merge. Default is MERGE_SMALL. - - Returns: - None - """ # Call the merge policy function. The policy may choose to merge other # segments into this writer's pool new_segments = mergetype(self.index, self, self.segments) @@ -592,21 +302,6 @@ def commit(self, mergetype=MERGE_SMALL): self.lock.release() def cancel(self): - """ - Cancels the current operation and releases any acquired resources. - - This method cancels the current operation by calling the `cancel` method of the underlying - thread pool. It also closes all open file handles and releases the lock held by the current - thread. - - Note: - This method should be called if the current operation needs to be canceled or if any - acquired resources need to be released. - - Example: - >>> writer.cancel() - - """ self.pool.cancel() self._close_all() self.lock.release() diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py index 302b421a..bfd1b80c 100644 --- a/src/whoosh/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -15,49 +15,6 @@ To open an existing index:: ix = DatastoreStorage().open_index() - -This module provides the following classes: - -- `DatastoreFile`: A file-like object that is backed by a BytesIO() object whose contents - is loaded from a BlobProperty in the app engine datastore. - -- `MemcacheLock`: A lock object that uses the Google App Engine memcache service for synchronization. 
- -- `DatastoreStorage`: An implementation of `whoosh.store.Storage` that stores files in - the app engine datastore as blob properties. - -Usage: -1. Creating an index: - storage = DatastoreStorage() - schema = Schema(...) - index = storage.create_index(schema) - -2. Opening an existing index: - storage = DatastoreStorage() - index = storage.open_index() - -3. Listing all files in the storage: - storage = DatastoreStorage() - files = storage.list() - -4. Deleting a file: - storage = DatastoreStorage() - storage.delete_file(filename) - -5. Renaming a file: - storage = DatastoreStorage() - storage.rename_file(old_filename, new_filename) - -6. Creating a new file: - storage = DatastoreStorage() - file = storage.create_file(filename) - -7. Opening an existing file: - storage = DatastoreStorage() - file = storage.open_file(filename) - -Note: This class assumes that the necessary dependencies and configurations -for using the app engine datastore are already set up. """ import time @@ -74,81 +31,17 @@ class DatastoreFile(db.Model): """A file-like object that is backed by a BytesIO() object whose contents is loaded from a BlobProperty in the app engine datastore. - - Attributes: - value (db.BlobProperty): The contents of the file stored as a BlobProperty. - mtime (db.IntegerProperty): The modification time of the file in seconds since the epoch. - - Methods: - __init__: Initializes a new instance of the DatastoreFile class. - loadfile: Loads a DatastoreFile object from the datastore or memcache. - close: Closes the file, updates the value and mtime properties, and stores the changes in the datastore. - tell: Returns the current position in the file. - write: Writes the specified data to the file. - read: Reads the specified number of bytes from the file. - seek: Changes the current position in the file. - readline: Reads a line from the file. - getvalue: Returns the contents of the file as a string. - - Usage: - # Create a new DatastoreFile object - file = DatastoreFile() - - # Load a DatastoreFile object from the datastore or memcache - file = DatastoreFile.loadfile("filename") - - # Read from the file - data = file.read(100) - - # Write to the file - file.write("Hello, World!") - - # Close the file and store the changes in the datastore - file.close() """ value = db.BlobProperty() mtime = db.IntegerProperty(default=0) def __init__(self, *args, **kwargs): - """ - Initialize a GAEStorage object. - - This method initializes the GAEStorage object by calling the parent class's - __init__ method and setting up the necessary attributes. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Attributes: - data (BytesIO): A BytesIO object to store the data. - - Returns: - None - """ super().__init__(*args, **kwargs) self.data = BytesIO() @classmethod def loadfile(cls, name): - """ - Load a file from the datastore or memcache. - - This method retrieves a file from the datastore or memcache based on the given name. - If the file is not found in memcache, it is fetched from the datastore and stored in memcache for future use. - - Parameters: - - cls: The class representing the file entity. - - name: The name of the file to load. - - Returns: - - file: The loaded file object. 
- - Usage: - file = loadfile(FileEntity, "example.txt") - """ - value = memcache.get(name, namespace="DatastoreFile") if value is None: file = cls.get_by_key_name(name) @@ -159,19 +52,6 @@ def loadfile(cls, name): return file def close(self): - """ - Closes the file and updates the value in the datastore. - - This method is responsible for closing the file and updating the value in the datastore - if the value has changed. It also updates the modification time and stores the value in - the memcache for faster access. - - Returns: - None - - Raises: - None - """ oldvalue = self.value self.value = self.getvalue() if oldvalue != self.value: @@ -180,161 +60,29 @@ def close(self): memcache.set(self.key().id_or_name(), self.value, namespace="DatastoreFile") def tell(self): - """ - Returns the current position of the file pointer. - - This method returns the current position of the file pointer within the file. - It is equivalent to calling the `tell()` method on the underlying file object. - - Returns: - int: The current position of the file pointer. - - Example: - >>> file = GAEFile(...) - >>> file.tell() - 42 - """ return self.data.tell() def write(self, data): - """ - Writes the given data to the file. - - Args: - data (bytes): The data to be written to the file. - - Returns: - int: The number of bytes written. - - Raises: - IOError: If an error occurs while writing to the file. - - Example: - >>> file = File() - >>> data = b"Hello, World!" - >>> file.write(data) - 13 - """ return self.data.write(data) def read(self, length): - """ - Read the specified number of bytes from the data. - - Args: - length (int): The number of bytes to read. - - Returns: - bytes: The bytes read from the data. - - Raises: - IOError: If an error occurs while reading the data. - - Example: - To read 10 bytes from the data, you can use the following code: - - >>> data = GAEFileData() - >>> data.read(10) - """ return self.data.read(length) def seek(self, *args): - """ - Seeks to a specified position in the file. - - Args: - *args: Variable-length argument list. The arguments are passed to the underlying `seek` method. - - Returns: - int: The new position in the file. - - Raises: - OSError: If an error occurs while seeking the file. - - Example: - To seek to the beginning of the file, use `seek(0)`. - - """ return self.data.seek(*args) def readline(self): - """ - Read and return the next line from the data file. - - Returns: - str: The next line from the data file. - - Raises: - None - - Notes: - This method reads and returns the next line from the data file associated with the current instance of the `GAEFile` class. - - Example: - >>> file = GAEFile() - >>> line = file.readline() - """ return self.data.readline() def getvalue(self): - """ - Returns the value stored in the data attribute. - - This method retrieves the value stored in the data attribute of the current object. - It returns the value as a string. - - Returns: - str: The value stored in the data attribute. - - Example: - >>> obj = ClassName() - >>> obj.getvalue() - 'some value' - """ return self.data.getvalue() class MemcacheLock: - """ - A lock implementation using Google App Engine's memcache. - - This class provides a simple lock mechanism using memcache to synchronize access to a resource. - It allows acquiring and releasing locks, with an optional blocking behavior. 
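A minimal sketch of that acquire-with-retry behaviour, using a plain dict as a stand-in for the memcache client (the real class relies on `memcache.add`/`memcache.delete` in the `whooshlocks` namespace):

```python
import time

# Stand-in for memcache: add() succeeds only if the key is not already set.
_fake_cache = {}

def cache_add(key, value):
    if key in _fake_cache:
        return False
    _fake_cache[key] = value
    return True

def cache_delete(key):
    _fake_cache.pop(key, None)

class SketchLock:
    """Illustrative lock: whoever wins the add() owns the lock."""

    def __init__(self, name):
        self.name = name

    def acquire(self, blocking=False):
        acquired = cache_add(self.name, "L")
        while blocking and not acquired:
            time.sleep(0.1)                  # retry every 0.1 seconds
            acquired = cache_add(self.name, "L")
        return acquired

    def release(self):
        cache_delete(self.name)

if __name__ == "__main__":
    lock = SketchLock("my_lock")
    print(lock.acquire())   # True
    print(lock.acquire())   # False: already held
    lock.release()
    print(lock.acquire())   # True again
```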
- - Usage: - lock = MemcacheLock("my_lock_name") - lock.acquire() # Acquire the lock - # Critical section - lock.release() # Release the lock - - If blocking is set to True, the acquire method will block until the lock is acquired. - If the lock is already acquired by another process, the acquire method will retry every 0.1 seconds until it succeeds. - - Note: This lock implementation assumes that the memcache service is available and properly configured. - - Args: - name (str): The name of the lock. - - Attributes: - name (str): The name of the lock. - - """ - def __init__(self, name): self.name = name def acquire(self, blocking=False): - """ - Acquire the lock. - - Args: - blocking (bool, optional): If True, the method will block until the lock is acquired. - If False (default), the method will return immediately. - - Returns: - bool: True if the lock is acquired, False otherwise. - - """ val = memcache.add(self.name, "L", 360, namespace="whooshlocks") if blocking and not val: @@ -348,70 +96,15 @@ def acquire(self, blocking=False): return val def release(self): - """ - Release the lock. - - """ memcache.delete(self.name, namespace="whooshlocks") class DatastoreStorage(Storage): """An implementation of :class:`whoosh.store.Storage` that stores files in the app engine datastore as blob properties. - - This class provides methods to create, open, list, clean, and manipulate files - stored in the app engine datastore. It is designed to be used as a storage - backend for the Whoosh search engine library. - - Usage: - 1. Creating an index: - storage = DatastoreStorage() - schema = Schema(...) - index = storage.create_index(schema) - - 2. Opening an existing index: - storage = DatastoreStorage() - index = storage.open_index() - - 3. Listing all files in the storage: - storage = DatastoreStorage() - files = storage.list() - - 4. Deleting a file: - storage = DatastoreStorage() - storage.delete_file(filename) - - 5. Renaming a file: - storage = DatastoreStorage() - storage.rename_file(old_filename, new_filename) - - 6. Creating a new file: - storage = DatastoreStorage() - file = storage.create_file(filename) - - 7. Opening an existing file: - storage = DatastoreStorage() - file = storage.open_file(filename) - - Note: This class assumes that the necessary dependencies and configurations - for using the app engine datastore are already set up. - """ def create_index(self, schema, indexname=_DEF_INDEX_NAME): - """Create a new index with the given schema. - - Args: - schema (Schema): The schema for the index. - indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. - - Returns: - FileIndex: The created index. - - Raises: - ReadOnlyError: If the storage is in read-only mode. - - """ if self.readonly: raise ReadOnlyError @@ -419,103 +112,32 @@ def create_index(self, schema, indexname=_DEF_INDEX_NAME): return FileIndex(self, schema, indexname) def open_index(self, indexname=_DEF_INDEX_NAME, schema=None): - """Open an existing index. - - Args: - indexname (str, optional): The name of the index. Defaults to _DEF_INDEX_NAME. - schema (Schema, optional): The schema for the index. Defaults to None. - - Returns: - FileIndex: The opened index. - - """ return FileIndex(self, schema=schema, indexname=indexname) def list(self): - """List all files in the storage. - - Returns: - list: A list of file names. - - """ query = DatastoreFile.all() return [file.key().id_or_name() for file in query] def clean(self): - """Clean up the storage. 
- - This method does nothing in the case of the app engine datastore storage. - - """ pass def total_size(self): - """Get the total size of the storage. - - Returns: - int: The total size in bytes. - - """ return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): - """Check if a file exists in the storage. - - Args: - name (str): The name of the file. - - Returns: - bool: True if the file exists, False otherwise. - - """ return DatastoreFile.get_by_key_name(name) is not None def file_modified(self, name): - """Get the modification time of a file. - - Args: - name (str): The name of the file. - - Returns: - datetime: The modification time of the file. - - """ return DatastoreFile.get_by_key_name(name).mtime def file_length(self, name): - """Get the length of a file. - - Args: - name (str): The name of the file. - - Returns: - int: The length of the file in bytes. - - """ return len(DatastoreFile.get_by_key_name(name).value) def delete_file(self, name): - """Delete a file from the storage. - - Args: - name (str): The name of the file. - - Returns: - bool: True if the file was successfully deleted, False otherwise. - - """ memcache.delete(name, namespace="DatastoreFile") return DatastoreFile.get_by_key_name(name).delete() def rename_file(self, name, newname, safe=False): - """Rename a file in the storage. - - Args: - name (str): The current name of the file. - newname (str): The new name for the file. - safe (bool, optional): Whether to perform a safe rename. Defaults to False. - - """ file = DatastoreFile.get_by_key_name(name) newfile = DatastoreFile(key_name=newname) newfile.value = file.value @@ -524,16 +146,6 @@ def rename_file(self, name, newname, safe=False): file.delete() def create_file(self, name, **kwargs): - """Create a new file in the storage. - - Args: - name (str): The name of the file. - **kwargs: Additional keyword arguments. - - Returns: - StructFile: The created file. - - """ f = StructFile( DatastoreFile(key_name=name), name=name, @@ -542,40 +154,11 @@ def create_file(self, name, **kwargs): return f def open_file(self, name, *args, **kwargs): - """Open an existing file in the storage. - - Args: - name (str): The name of the file. - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. - - Returns: - StructFile: The opened file. - - """ return StructFile(DatastoreFile.loadfile(name)) def lock(self, name): - """Lock a file in the storage. - - Args: - name (str): The name of the file. - - Returns: - MemcacheLock: The lock object. - - """ return MemcacheLock(name) def temp_storage(self, name=None): - """Create a temporary storage. - - Args: - name (str, optional): The name of the temporary storage. Defaults to None. - - Returns: - DatastoreStorage: The temporary storage. - - """ tempstore = DatastoreStorage() return tempstore.create() diff --git a/src/whoosh/filedb/misc.py b/src/whoosh/filedb/misc.py index d4647e1a..0b66516b 100644 --- a/src/whoosh/filedb/misc.py +++ b/src/whoosh/filedb/misc.py @@ -30,46 +30,11 @@ def encode_termkey(term): - """ - Encodes a term key. - - This function takes a term tuple consisting of a field number and text, and encodes it into a byte string. - The field number is packed as an unsigned short, followed by the UTF-8 encoded text. - - Parameters: - term (tuple): A tuple containing the field number and text. - - Returns: - bytes: The encoded term key as a byte string. 
- - Example: - >>> term = (1, "example") - >>> encode_termkey(term) - b'\x00\x01example' - """ fieldnum, text = term return pack_ushort(fieldnum) + utf8encode(text)[0] def decode_termkey(key): - """ - Decode a term key. - - Args: - key (bytes): The term key to decode. - - Returns: - tuple: A tuple containing the decoded term key. The first element is an - unsigned short integer, and the second element is a Unicode string. - - Raises: - IndexError: If the key is too short to be decoded. - - Example: - >>> key = b'\x00\x01hello' - >>> decode_termkey(key) - (1, 'hello') - """ return (unpack_ushort(key[:_SHORT_SIZE])[0], utf8decode(key[_SHORT_SIZE:])[0]) diff --git a/src/whoosh/filedb/pools.py b/src/whoosh/filedb/pools.py index 014e2a69..fee6f3d0 100644 --- a/src/whoosh/filedb/pools.py +++ b/src/whoosh/filedb/pools.py @@ -66,32 +66,10 @@ def imerge(iterators): - """ - Merge multiple sorted iterators into a single sorted iterator. - - This function takes a list of sorted iterators and merges them into a single - sorted iterator. It uses a heap data structure to efficiently merge the - iterators. - - Parameters: - - iterators (list): A list of sorted iterators to be merged. - - Yields: - - item: The next item in the merged sorted iterator. - - Example: - ``` - iterators = [iter([1, 3, 5]), iter([2, 4, 6]), iter([7, 8, 9])] - merged_iterator = imerge(iterators) - for item in merged_iterator: - print(item) - # Output: 1, 2, 3, 4, 5, 6, 7, 8, 9 - ``` - """ current = [] for g in iterators: try: - current.append((next(g), g)) + current.append((g.next(), g)) except StopIteration: pass heapify(current) @@ -100,7 +78,7 @@ def imerge(iterators): item, gen = heappop(current) yield item try: - heappush(current, (next(gen), gen)) + heappush(current, (gen.next(), gen)) except StopIteration: pass @@ -112,28 +90,6 @@ def imerge(iterators): def bimerge(iter1, iter2): - """ - Merge two sorted iterators into a single sorted iterator. - - This function takes two sorted iterators, `iter1` and `iter2`, and merges them into a single sorted iterator. - The merged iterator will contain all the elements from both `iter1` and `iter2`, in ascending order. - - Parameters: - - iter1 (iterator): The first sorted iterator. - - iter2 (iterator): The second sorted iterator. - - Returns: - - iterator: A merged iterator containing all the elements from `iter1` and `iter2`, in ascending order. - - Example: - ``` - >>> iter1 = iter([1, 3, 5]) - >>> iter2 = iter([2, 4, 6]) - >>> merged_iter = bimerge(iter1, iter2) - >>> list(merged_iter) - [1, 2, 3, 4, 5, 6] - ``` - """ try: p1 = iter1.next() except StopIteration: @@ -168,22 +124,6 @@ def bimerge(iter1, iter2): def dividemerge(iters): - """ - Divides a list of iterators into smaller sublists recursively and merges them using bimerge. - - Parameters: - - iters (list): A list of iterators to be divided and merged. - - Returns: - - merged_iter (iterator): An iterator that merges the divided sublists. - - Example: - >>> iters = [iter([1, 2, 3]), iter([4, 5, 6]), iter([7, 8, 9])] - >>> merged_iter = dividemerge(iters) - >>> list(merged_iter) - [1, 2, 3, 4, 5, 6, 7, 8, 9] - """ - length = len(iters) if length == 0: return [] @@ -195,53 +135,21 @@ def dividemerge(iters): def read_run(filename, count): - """ - Read and yield objects from a binary file. - - Args: - filename (str): The path to the binary file. - count (int): The number of objects to read. - - Yields: - object: The loaded object from the file. - - Raises: - FileNotFoundError: If the specified file does not exist. 
- - Example: - >>> for obj in read_run("data.bin", 3): - ... print(obj) - """ f = open(filename, "rb") - try: - while count: - count -= 1 - yield load(f) - finally: - f.close() + while count: + count -= 1 + yield load(f) + f.close() def write_postings(schema, termtable, postwriter, postiter): - """ - Writes postings to the posting file and adds terms to the term table. - - This method pulls postings out of the posting pool (built up as documents are added) - and writes them to the posting file. Each time it encounters a posting for a new term, - it writes the previous term to the term index. By waiting to write the term entry, - we can easily count the document frequency and sum the terms by looking at the postings. - - Args: - schema (Schema): The schema object that defines the fields and their properties. - termtable (TermTable): The term table object that stores the term entries. - postwriter (PostWriter): The post writer object that writes postings to the posting file. - postiter (iterable): An iterable that provides the postings in (field number, lexical) order. - - Raises: - ValueError: If the postings are out of order. - - Returns: - None - """ + # This method pulls postings out of the posting pool (built up as + # documents are added) and writes them to the posting file. Each time + # it encounters a posting for a new term, it writes the previous term + # to the term index (by waiting to write the term entry, we can easily + # count the document frequency and sum the terms by looking at the + # postings). + current_fieldnum = None # Field number of the current term current_text = None # Text of the current term first = True @@ -273,8 +181,9 @@ def write_postings(schema, termtable, postwriter, postiter): fieldnum == current_fieldnum and text < current_text ): # This should never happen! - raise ValueError( - f"Postings are out of order: {current_fieldnum}:{current_text} .. {fieldnum}:{text}" + raise Exception( + "Postings are out of order: %s:%s .. %s:%s" + % (current_fieldnum, current_text, fieldnum, text) ) # Write a posting for this occurrence of the current term @@ -290,84 +199,21 @@ def write_postings(schema, termtable, postwriter, postiter): class LengthSpool: - """ - A class for managing a spool file that stores length information. - - The LengthSpool class provides methods to create a spool file, add length information - for documents and fields, finish writing to the spool file, and read back the length - information. - - Usage: - spool = LengthSpool(filename) - spool.create() - spool.add(docnum, fieldnum, length) - spool.finish() - for length_info in spool.readback(): - # Process length_info - - Args: - filename (str): The path to the spool file. - - Attributes: - filename (str): The path to the spool file. - file (file object): The file object representing the spool file. - - Methods: - create(): Creates the spool file for writing. - add(docnum, fieldnum, length): Adds length information for a document and field to the spool file. - finish(): Finishes writing to the spool file and closes it. - readback(): Reads back the length information from the spool file. - - """ - def __init__(self, filename): self.filename = filename self.file = None def create(self): - """ - Creates the spool file for writing. - - This method opens the spool file in write binary mode. - - """ self.file = open(self.filename, "wb") def add(self, docnum, fieldnum, length): - """ - Adds length information for a document and field to the spool file. 
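The spool is simply a flat file of fixed-size packed records that is replayed later; a self-contained sketch of the same write-then-readback pattern, using an explicit `struct` layout rather than Whoosh's `pack_length`/`_length_struct` helpers:

```python
import os
import struct
import tempfile

# Illustrative record layout (docnum, fieldnum, length byte); the real
# length spool's struct format may differ.
_REC = struct.Struct("<IHB")

def write_spool(path, records):
    with open(path, "wb") as f:
        for docnum, fieldnum, length in records:
            f.write(_REC.pack(docnum, fieldnum, min(length, 255)))

def read_spool(path):
    with open(path, "rb") as f:
        while True:
            chunk = f.read(_REC.size)
            if not chunk:
                break
            yield _REC.unpack(chunk)

if __name__ == "__main__":
    fd, path = tempfile.mkstemp(suffix=".len")
    os.close(fd)
    write_spool(path, [(0, 1, 12), (0, 2, 7), (1, 1, 300)])
    print(list(read_spool(path)))   # [(0, 1, 12), (0, 2, 7), (1, 1, 255)]
    os.remove(path)
```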
- - This method writes the packed length information to the spool file. - - Args: - docnum (int): The document number. - fieldnum (int): The field number. - length (int): The length of the field. - - """ self.file.write(pack_length(docnum, fieldnum, length_to_byte(length))) def finish(self): - """ - Finishes writing to the spool file and closes it. - - This method closes the spool file after writing is complete. - - """ self.file.close() self.file = None def readback(self): - """ - Reads back the length information from the spool file. - - This method opens the spool file in read binary mode and reads the length information - in chunks of the specified size. It yields each unpacked length information. - - Yields: - tuple: A tuple containing the document number, field number, and length. - - """ f = open(self.filename, "rb") size = _length_struct.size while True: @@ -379,280 +225,127 @@ def readback(self): class PoolBase: - """ - Base class for pool implementations. - - A pool is responsible for managing resources, such as file handles or connections, - that need to be reused across multiple operations. This class provides a basic - implementation for managing the pool directory, field length totals, and field length maxes. - - Attributes: - _dir (str): The directory path where the pool is located. - _fieldlength_totals (defaultdict): A dictionary that stores the total field lengths for each field. - _fieldlength_maxes (dict): A dictionary that stores the maximum field lengths for each field. - - Methods: - __init__(self, dir): Initializes the PoolBase instance with the specified directory. - _filename(self, name): Returns the full path of a file within the pool directory. - cancel(self): Cancels any pending operations or releases any acquired resources. - fieldlength_totals(self): Returns a dictionary containing the total field lengths for each field. - fieldlength_maxes(self): Returns a dictionary containing the maximum field lengths for each field. - """ - def __init__(self, dir): self._dir = dir self._fieldlength_totals = defaultdict(int) self._fieldlength_maxes = {} def _filename(self, name): - """ - Returns the full path of a file within the pool directory. - - Args: - name (str): The name of the file. - - Returns: - str: The full path of the file within the pool directory. - """ return os.path.join(self._dir, name) def cancel(self): - """ - Cancels any pending operations or releases any acquired resources. - """ pass def fieldlength_totals(self): - """ - Returns a dictionary containing the total field lengths for each field. - - Returns: - dict: A dictionary where the keys are field names and the values are the total field lengths. - """ return dict(self._fieldlength_totals) def fieldlength_maxes(self): - """ - Returns a dictionary containing the maximum field lengths for each field. - - Returns: - dict: A dictionary where the keys are field names and the values are the maximum field lengths. - """ return self._fieldlength_maxes class TempfilePool(PoolBase): - """ - A pool for managing temporary files used for indexing in Whoosh. - - This class is responsible for managing temporary files used during the indexing process in Whoosh. - It provides methods for adding content, postings, field lengths, and dumping runs to temporary files. - The temporary files are used to store the intermediate data during the indexing process. - - Parameters: - - lengthfile (str): The path to the length file. - - limitmb (int): The maximum size limit in megabytes for the temporary files. Default is 32MB. 
- - temp_dir (str): The directory where the temporary files will be created. If not provided, a temporary directory will be created. - - basename (str): The base name for the temporary files. Default is an empty string. - - **kw: Additional keyword arguments. - - Attributes: - - lengthfile (str): The path to the length file. - - limit (int): The maximum size limit in bytes for the temporary files. - - size (int): The current size of the temporary files in bytes. - - count (int): The number of postings in the temporary files. - - postings (list): A list of postings to be written to the temporary files. - - runs (list): A list of tuples containing the temporary file names and the number of postings in each run. - - basename (str): The base name for the temporary files. - - lenspool (LengthSpool): The spool for managing field lengths. - """ - - def __init__(self, lengthfile, limitmb=32, temp_dir=None, basename="", **kw): - """ - Initialize the TempfilePool. - - Parameters: - - lengthfile (str): The path to the length file. - - limitmb (int): The maximum size limit in megabytes for the temporary files. Default is 32MB. - - temp_dir (str): The directory where the temporary files will be created. If not provided, a temporary directory will be created. - - basename (str): The base name for the temporary files. Default is an empty string. - - **kw: Additional keyword arguments. - """ - # Implementation details... - - def add_content(self, docnum, fieldnum, field, value): - """ - Add content to the temporary pool. + def __init__(self, lengthfile, limitmb=32, dir=None, basename="", **kw): + if dir is None: + dir = tempfile.mkdtemp("whoosh") + PoolBase.__init__(self, dir) - This method adds the content of a field in a document to the temporary pool. - It processes the field's index and adds the postings to the pool. - If the field is scorable, it also adds the field length. + self.lengthfile = lengthfile + self.limit = limitmb * 1024 * 1024 - Parameters: - - docnum (int): The document number. - - fieldnum (int): The field number. - - field (Field): The field object. - - value (str): The field value. + self.size = 0 + self.count = 0 + self.postings = [] + self.runs = [] - Returns: - - int: The total term count for the field. - """ - # Implementation details... + self.basename = basename - def add_posting(self, fieldnum, text, docnum, freq, datastring): - """ - Add a posting to the temporary pool. - - This method adds a posting to the temporary pool. - It calculates the size of the posting and checks if the size limit has been reached. - If the limit is reached, it dumps the current postings to a temporary file. - - Parameters: - - fieldnum (int): The field number. - - text (str): The text of the posting. - - docnum (int): The document number. - - freq (int): The term frequency. - - datastring (str): The data string associated with the posting. - """ - # Implementation details... + self.lenspool = LengthSpool(self._filename(basename + "length")) + self.lenspool.create() - def add_field_length(self, docnum, fieldnum, length): - """ - Add a field length to the temporary pool. + def add_content(self, docnum, fieldnum, field, value): + add_posting = self.add_posting + termcount = 0 + # TODO: Method for adding progressive field values, ie + # setting start_pos/start_char? + for w, freq, valuestring in field.index(value): + # assert w != "" + add_posting(fieldnum, w, docnum, freq, valuestring) + termcount += freq - This method adds the length of a field in a document to the temporary pool. 
- It updates the field length totals and maximums. + if field.scorable and termcount: + self.add_field_length(docnum, fieldnum, termcount) - Parameters: - - docnum (int): The document number. - - fieldnum (int): The field number. - - length (int): The length of the field. - """ - # Implementation details... + return termcount - def dump_run(self): - """ - Dump the current postings to a temporary file. + def add_posting(self, fieldnum, text, docnum, freq, datastring): + if self.size >= self.limit: + # print ("Flushing...") + self.dump_run() - This method dumps the current postings to a temporary file. - It sorts the postings, writes them to the file, and updates the runs list. - It also resets the size and count of the temporary pool. - """ - # Implementation details... + self.size += len(text) + 2 + 8 + len(datastring) + self.postings.append((fieldnum, text, docnum, freq, datastring)) + self.count += 1 - def run_filenames(self): - """ - Get the filenames of the temporary runs. + def add_field_length(self, docnum, fieldnum, length): + self._fieldlength_totals[fieldnum] += length + if length > self._fieldlength_maxes.get(fieldnum, 0): + self._fieldlength_maxes[fieldnum] = length + self.lenspool.add(docnum, fieldnum, length) - This method returns a list of the filenames of the temporary runs. + def dump_run(self): + if self.size > 0: + tempname = self._filename(self.basename + str(time.time()) + ".run") + runfile = open(tempname, "w+b") + self.postings.sort() + for p in self.postings: + dump(p, runfile) + runfile.close() + + self.runs.append((tempname, self.count)) + self.postings = [] + self.size = 0 + self.count = 0 - Returns: - - list: A list of filenames. - """ - # Implementation details... + def run_filenames(self): + return [filename for filename, _ in self.runs] def cancel(self): - """ - Cancel the indexing process. - - This method cancels the indexing process and cleans up the temporary files. - """ - # Implementation details... + self.cleanup() def cleanup(self): - """ - Clean up the temporary files. - - This method cleans up the temporary files by removing the temporary directory. - """ - # Implementation details... + shutil.rmtree(self._dir) def _finish_lengths(self, schema, doccount): - """ - Finish writing the field lengths. - - This method finishes writing the field lengths to the length file. - - Parameters: - - schema (Schema): The schema object. - - doccount (int): The total number of documents. - """ - # Implementation details... + lengthfile = LengthWriter(self.lengthfile, doccount, schema.scorable_fields()) + lengthfile.add_all(self.lenspool.readback()) + lengthfile.close() def finish(self, schema, doccount, termtable, postingwriter): - """ - Finish the indexing process. - - This method finishes the indexing process by writing the postings to the posting writer. - It also finishes writing the field lengths and cleans up the temporary files. + self.lenspool.finish() + self._finish_lengths(schema, doccount) + + if self.postings and len(self.runs) == 0: + self.postings.sort() + postiter = iter(self.postings) + # total = len(self.postings) + elif not self.postings and not self.runs: + postiter = iter([]) + # total = 0 + else: + postiter = imerge( + [read_run(runname, count) for runname, count in self.runs] + ) + # total = sum(count for runname, count in self.runs) - Parameters: - - schema (Schema): The schema object. - - doccount (int): The total number of documents. - - termtable (TermTable): The term table object. - - postingwriter (PostingWriter): The posting writer object. 
- """ - # Implementation details... + write_postings(schema, termtable, postingwriter, postiter) + self.cleanup() # Multiprocessing -class PoolWritingTask(Process): - """A process that handles writing data to a temporary pool. - This process is responsible for receiving data units from a posting queue and - writing them to a temporary pool. The data units can represent content, postings, - or field lengths. Once all the data units have been processed, the process - finishes by dumping the temporary pool and sending the results to a result queue. - - Parameters: - - dir (str): The directory where the temporary pool will be stored. - - postingqueue (Queue): The queue from which the data units are received. - - resultqueue (Queue): The queue to which the results will be sent. - - limitmb (int): The maximum size limit of the temporary pool in megabytes. - - Attributes: - - dir (str): The directory where the temporary pool will be stored. - - postingqueue (Queue): The queue from which the data units are received. - - resultqueue (Queue): The queue to which the results will be sent. - - limitmb (int): The maximum size limit of the temporary pool in megabytes. - - """ +class PoolWritingTask(Process): def __init__(self, dir, postingqueue, resultqueue, limitmb): - """ - Initialize a PoolProcess object. - - Args: - dir (str): The directory where the pool process will operate. - postingqueue (Queue): The queue used for sending posting data to the pool process. - resultqueue (Queue): The queue used for receiving results from the pool process. - limitmb (int): The maximum memory limit in megabytes for the pool process. - - Returns: - None - - Raises: - None - - Notes: - This method initializes a PoolProcess object with the given parameters. The PoolProcess is a subclass of Process and represents a separate process that can be used for performing tasks in parallel. - - The `dir` parameter specifies the directory where the pool process will operate. This directory should exist and be writable. - - The `postingqueue` parameter is a Queue object used for sending posting data to the pool process. The pool process will consume data from this queue and perform the necessary operations. - - The `resultqueue` parameter is a Queue object used for receiving results from the pool process. The pool process will put the results of its operations into this queue for the calling process to consume. - - The `limitmb` parameter specifies the maximum memory limit in megabytes for the pool process. If the pool process exceeds this limit, it may be terminated or take appropriate action to free up memory. - - Example usage: - ``` - posting_queue = Queue() - result_queue = Queue() - pool_process = PoolProcess('/path/to/directory', posting_queue, result_queue, 100) - pool_process.start() - ``` - """ Process.__init__(self) self.dir = dir self.postingqueue = postingqueue @@ -660,21 +353,11 @@ def __init__(self, dir, postingqueue, resultqueue, limitmb): self.limitmb = limitmb def run(self): - """Starts the process and handles writing data to the temporary pool. - - This method is automatically called when the process starts. It continuously - retrieves data units from the posting queue and writes them to a temporary - pool until it receives a termination signal. Once all the data units have - been processed, the method finishes by dumping the temporary pool and - sending the results to the result queue. 
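Stripped of the Whoosh-specific objects, the task above is the usual queue-consumer shape: pull units until a `None` sentinel arrives, then report a summary on the results queue. A hedged, self-contained sketch:

```python
from multiprocessing import Process, Queue

def worker(jobs, results):
    """Consume (kind, payload) units until a None sentinel is received."""
    handled = 0
    while True:
        unit = jobs.get()
        if unit is None:            # sentinel: no more work is coming
            break
        kind, payload = unit        # a real task would write to its pool here
        handled += 1
    results.put(handled)            # report back to the parent process

if __name__ == "__main__":
    jobs, results = Queue(), Queue()
    proc = Process(target=worker, args=(jobs, results))
    proc.start()
    for i in range(5):
        jobs.put(("posting", i))
    jobs.put(None)                  # tell the worker to finish
    proc.join()
    print(results.get())            # 5
```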
- - """ - pqueue = self.postingqueue rqueue = self.resultqueue subpool = TempfilePool( - None, limitmb=self.limitmb, temp_dir=self.dir, basename=self.name + None, limitmb=self.limitmb, dir=self.dir, basename=self.name ) while True: @@ -703,52 +386,9 @@ def run(self): class MultiPool(PoolBase): - """A multi-process pool for efficient indexing. - - This class represents a multi-process pool that is used for efficient indexing in the Whoosh library. - It inherits from the `PoolBase` class. - - Parameters: - - lengthfile (str): The path to the length file. - - procs (int): The number of processes to use. Default is 2. - - limitmb (int): The maximum memory limit in megabytes. Default is 32. - - **kw: Additional keyword arguments. - - Attributes: - - lengthfile (str): The path to the length file. - - procs (int): The number of processes to use. - - limitmb (int): The maximum memory limit in megabytes. - - postingqueue (Queue): The queue for posting tasks. - - resultsqueue (Queue): The queue for storing results. - - tasks (list): The list of PoolWritingTask instances. - - Methods: - - add_content(*args): Adds content to the posting queue. - - add_posting(*args): Adds a posting to the posting queue. - - add_field_length(*args): Adds a field length to the posting queue. - - cancel(): Cancels the pool and terminates all tasks. - - cleanup(): Cleans up the temporary directory. - - finish(schema, doccount, termtable, postingwriter): Finishes the indexing process. - """ - def __init__(self, lengthfile, procs=2, limitmb=32, **kw): - """ - Initialize a Pool object. - - Parameters: - - lengthfile (str): The path to the length file. - - procs (int, optional): The number of worker processes to use. Defaults to 2. - - limitmb (int, optional): The maximum amount of memory (in megabytes) that each worker process can use. Defaults to 32. - - **kw: Additional keyword arguments. - - Raises: - - None. - - Returns: - - None. - """ - temp_dir = tempfile.mkdtemp(".whoosh") - PoolBase.__init__(self, temp_dir) + dir = tempfile.mkdtemp(".whoosh") + PoolBase.__init__(self, dir) self.lengthfile = lengthfile @@ -767,55 +407,23 @@ def __init__(self, lengthfile, procs=2, limitmb=32, **kw): task.start() def add_content(self, *args): - """Adds content to the posting queue. - - Parameters: - - *args: The content to be added. - """ self.postingqueue.put((0, args)) def add_posting(self, *args): - """Adds a posting to the posting queue. - - Parameters: - - *args: The posting to be added. - """ self.postingqueue.put((1, args)) def add_field_length(self, *args): - """Adds a field length to the posting queue. - - Parameters: - - *args: The field length to be added. - """ self.postingqueue.put((2, args)) def cancel(self): - """Cancels the pool and terminates all tasks.""" for task in self.tasks: task.terminate() self.cleanup() def cleanup(self): - """Cleans up the temporary directory.""" shutil.rmtree(self._dir) def finish(self, schema, doccount, termtable, postingwriter): - """Finishes the indexing process. - - This method is called to finish the indexing process. It performs the following steps: - 1. Joins all the tasks. - 2. Retrieves the results from the results queue. - 3. Writes the lengths to the length file. - 4. Merges the runs. - 5. Cleans up the temporary directory. - - Parameters: - - schema (Schema): The schema object. - - doccount (int): The total number of documents. - - termtable (TermTable): The term table object. - - postingwriter (PostingWriter): The posting writer object. 
- """ _fieldlength_totals = self._fieldlength_totals if not self.tasks: return @@ -840,9 +448,9 @@ def finish(self, schema, doccount, termtable, postingwriter): taskruns, flentotals, flenmaxes, lenspool = rqueue.get() runs.extend(taskruns) lenspools.append(lenspool) - for fieldnum, total in flentotals.items(): + for fieldnum, total in flentotals.iteritems(): _fieldlength_totals[fieldnum] += total - for fieldnum, length in flenmaxes.items(): + for fieldnum, length in flenmaxes.iteritems(): if length > self._fieldlength_maxes.get(fieldnum, 0): self._fieldlength_maxes[fieldnum] = length print("Results:", time.time() - t) @@ -857,8 +465,12 @@ def finish(self, schema, doccount, termtable, postingwriter): t = time.time() iterator = dividemerge([read_run(runname, count) for runname, count in runs]) - # total = sum(count for runname, count in runs) + total = sum(count for runname, count in runs) write_postings(schema, termtable, postingwriter, iterator) print("Merge:", time.time() - t) self.cleanup() + + +if __name__ == "__main__": + pass diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py index 5cbca7d9..dec60071 100644 --- a/src/whoosh/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -70,104 +70,12 @@ class StructFile: - """A "structured file" object that wraps a given file object and provides - additional methods for writing and reading structured data. - - This class provides a convenient way to work with structured data in a file. - It wraps a file object and adds methods for reading and writing various data - types, such as strings, integers, floats, and arrays. - - Usage: - ------ - To use the StructFile class, create an instance by passing a file object to - the constructor: - - >>> with open('data.bin', 'wb') as file: - ... sf = StructFile(file) - - You can then use the various methods provided by StructFile to read and write - data: - - >>> sf.write_string('Hello, World!') - >>> sf.write_int(42) - >>> sf.write_float(3.14) - - To read data from the file, use the corresponding read methods: - - >>> string = sf.read_string() - >>> integer = sf.read_int() - >>> float_num = sf.read_float() - - Methods: - -------- - The StructFile class provides the following methods: - - - read: Read a specified number of bytes from the file. - - write: Write data to the file. - - read_string: Read a string from the file. - - write_string: Write a string to the file. - - read_int: Read an integer from the file. - - write_int: Write an integer to the file. - - read_float: Read a float from the file. - - write_float: Write a float to the file. - - read_array: Read an array from the file. - - write_array: Write an array to the file. - - seek: Move the file pointer to a specified position. - - tell: Get the current position of the file pointer. - - flush: Flush the buffer of the wrapped file. - - close: Close the wrapped file. - - Note: - ----- - The StructFile class is designed to work with binary files. It provides - methods for reading and writing various data types in their binary - representation. Make sure to open the file in binary mode when using - StructFile. - + """Returns a "structured file" object that wraps the given file object and + provides numerous additional methods for writing structured data, such as + "write_varint" and "write_long". """ def __init__(self, fileobj, name=None, onclose=None): - """ - Initialize a StructFile object. - - Args: - fileobj (file-like object): The file-like object to be wrapped by the StructFile. 
- name (str, optional): The name of the file. Defaults to None. - onclose (callable, optional): A callable object to be called when the StructFile is closed. Defaults to None. - - Attributes: - file (file-like object): The wrapped file-like object. - _name (str): The name of the file. - onclose (callable): A callable object to be called when the StructFile is closed. - is_closed (bool): Indicates whether the StructFile is closed or not. - is_real (bool): Indicates whether the wrapped file-like object has a fileno() method. - fileno (method): The fileno() method of the wrapped file-like object. - - Note: - The StructFile is a wrapper around a file-like object that provides additional functionality. - It keeps track of the file's name, whether it is closed, and whether it is a real file object. - The fileno() method is only available if the wrapped file-like object has a fileno() method. - - Usage: - # Create a StructFile object - fileobj = open("example.txt", "r") - struct_file = StructFile(fileobj, "example.txt", onclose=my_callback) - - # Access the wrapped file object - file = struct_file.file - - # Check if the StructFile is closed - is_closed = struct_file.is_closed - - # Check if the wrapped file object is a real file object - is_real = struct_file.is_real - - # Call the onclose callback when the StructFile is closed - struct_file.onclose = my_callback - - # Get the fileno of the wrapped file object - fileno = struct_file.fileno() - """ self.file = fileobj self._name = name self.onclose = onclose @@ -178,302 +86,54 @@ def __init__(self, fileobj, name=None, onclose=None): self.fileno = fileobj.fileno def __repr__(self): - """ - Return a string representation of the StructFile object. - - The returned string includes the class name and the name of the file. - - Returns: - str: A string representation of the StructFile object. - - Example: - >>> file = StructFile("example.txt") - >>> repr(file) - 'StructFile("example.txt")' - """ return f"{self.__class__.__name__}({self._name!r})" def __str__(self): - """ - Returns a string representation of the StructFile object. - - The string representation is the name of the file associated with the StructFile object. - - Returns: - str: The name of the file associated with the StructFile object. - - Example: - >>> file = StructFile("example.txt") - >>> str(file) - 'example.txt' - """ return self._name def __enter__(self): - """ - Enter method for the StructFile context manager. - - This method is automatically called when using the `with` statement to open a StructFile. - It returns the StructFile object itself, allowing it to be used within the `with` block. - - Returns: - StructFile: The StructFile object itself. - - Example: - with StructFile("data.bin", "rb") as file: - # Perform operations on the file - data = file.read(1024) - # ... - """ return self def __exit__(self, exc_type, exc_val, exc_tb): - """ - Closes the file when exiting a context manager. - - Args: - exc_type (type): The type of the exception raised, if any. - exc_val (Exception): The exception raised, if any. - exc_tb (traceback): The traceback object associated with the exception, if any. - - Returns: - None - - Raises: - Any exception raised during the closing process. - - This method is automatically called when exiting a `with` statement. It ensures that the file is properly closed, - regardless of whether an exception occurred or not. It should not be called directly. - """ self.close() def __iter__(self): - """ - Returns an iterator over the lines of the file. 
- - This method allows the `StructFile` object to be used in a `for` loop or - with other iterable constructs. It returns an iterator that yields each - line of the file. - - Returns: - An iterator over the lines of the file. - - Example: - >>> with StructFile('data.txt') as file: - ... for line in file: - ... print(line) - """ return iter(self.file) def raw_file(self): - """ - Returns the raw file object associated with this StructFile. - - This method returns the underlying file object that is used by the StructFile - instance. It can be used to perform low-level file operations directly on the file. - - Returns: - file: The raw file object associated with this StructFile. - - Example: - # Open a StructFile - sf = StructFile("data.bin", "rb") - - # Get the raw file object - f = sf.raw_file() - - # Perform low-level file operations - f.seek(0) - data = f.read(1024) - - Note: - Modifying the raw file object directly may lead to unexpected behavior and - should be done with caution. It is recommended to use the methods provided by - the StructFile class for reading and writing data to the file. - """ return self.file def read(self, *args, **kwargs): - """ - Read data from the file. - - This method reads data from the file and returns it. It delegates the actual reading - operation to the underlying file object. - - Parameters: - *args: Variable length argument list to be passed to the underlying file object's read method. - **kwargs: Arbitrary keyword arguments to be passed to the underlying file object's read method. - - Returns: - The data read from the file. - - Example usage: - file = StructFile("example.txt") - data = file.read(10) # Read 10 bytes from the file - """ - return self.file.read(*args, **kwargs) def readline(self, *args, **kwargs): - """ - Read and return a line from the file. - - This method reads a line from the file and returns it as a string. It delegates the actual reading to the underlying file object. - - Parameters: - *args: Variable length argument list to be passed to the underlying file object's readline method. - **kwargs: Arbitrary keyword arguments to be passed to the underlying file object's readline method. - - Returns: - str: The line read from the file. - - Raises: - Any exceptions raised by the underlying file object's readline method. - - Example: - >>> file = StructFile("example.txt") - >>> line = file.readline() - >>> print(line) - "This is an example line." - - Note: - This method assumes that the file has been opened in text mode. - """ return self.file.readline(*args, **kwargs) def write(self, *args, **kwargs): - """ - Writes the specified data to the file. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - int: The number of bytes written to the file. - - Raises: - OSError: If an error occurs while writing to the file. - - Example: - To write a string to the file: - - >>> file.write("Hello, World!") - - Note: - This method delegates the write operation to the underlying file object. - """ return self.file.write(*args, **kwargs) def tell(self, *args, **kwargs): - """ - Return the current file position. - - This method returns the current file position in bytes. It delegates the call to the underlying file object's `tell()` method. - - :param args: Optional positional arguments to be passed to the `tell()` method of the underlying file object. - :param kwargs: Optional keyword arguments to be passed to the `tell()` method of the underlying file object. 
- :return: The current file position in bytes. - """ return self.file.tell(*args, **kwargs) def seek(self, *args, **kwargs): - """ - Change the file position to the given offset. - - This method is a wrapper around the `seek` method of the underlying file object. - It allows you to change the current position within the file. - - Parameters: - *args: Variable length argument list to be passed to the `seek` method. - **kwargs: Arbitrary keyword arguments to be passed to the `seek` method. - - Returns: - The new absolute position within the file. - - Raises: - OSError: If an error occurs while seeking the file. - - Example: - To seek to the beginning of the file: - ``` - file.seek(0) - ``` - - To seek to a specific offset from the current position: - ``` - file.seek(10, 1) - ``` - - To seek to a specific offset from the end of the file: - ``` - file.seek(-10, 2) - ``` - """ return self.file.seek(*args, **kwargs) def truncate(self, *args, **kwargs): - """ - Truncates the file to the specified size. - - Args: - *args: Variable length argument list. - **kwargs: Arbitrary keyword arguments. - - Returns: - int: The new size of the file after truncation. - - Raises: - OSError: If an error occurs while truncating the file. - - Note: - This method is a wrapper around the `truncate` method of the underlying file object. - - Example: - # Truncate the file to 100 bytes - size = file.truncate(100) - """ return self.file.truncate(*args, **kwargs) def flush(self): - """ - Flushes the buffer of the wrapped file. This is a no-op if the + """Flushes the buffer of the wrapped file. This is a no-op if the wrapped file does not have a flush method. - - This method ensures that any buffered data in the file is written to the underlying storage. - It is recommended to call this method after performing write operations to ensure data integrity. - - Usage: - file = StructFile(...) - # Perform write operations - file.flush() - - Note: - If the wrapped file does not have a flush method, this method does nothing. - """ + if hasattr(self.file, "flush"): self.file.flush() def close(self): - """ - Closes the wrapped file. - - This method closes the file object that is being wrapped by the StructFile. - It is important to close the file after using it to free up system resources - and ensure data integrity. - - Raises: - ValueError: If the file is already closed. + """Closes the wrapped file.""" - Usage: - To close the StructFile object, simply call the close() method: - - file = StructFile(...) - file.close() - """ if self.is_closed: - raise ValueError("This file is already closed") + raise Exception("This file is already closed") if self.onclose: self.onclose(self) if hasattr(self.file, "close"): @@ -481,344 +141,79 @@ def close(self): self.is_closed = True def subset(self, offset, length, name=None): - """ - Returns a subset of the current StructFile object. - - Args: - offset (int): The starting offset of the subset, in bytes. - length (int): The length of the subset, in bytes. - name (str, optional): The name of the subset. If not provided, the name of the current StructFile object is used. - - Returns: - StructFile: A new StructFile object representing the subset. - - Raises: - None. 
- - Example: - # Create a StructFile object - sf = StructFile(file, name="example.txt") - - # Get a subset of the StructFile object - subset = sf.subset(10, 20, name="subset.txt") - """ from whoosh.filedb.compound import SubFile name = name or self._name return StructFile(SubFile(self.file, offset, length), name=name) def write_string(self, s): - """ - Writes a string to the wrapped file. - - This method writes the length of the string first, so you can read the string back - without having to know how long it was. - - :param s: The string to be written. - :type s: str + """Writes a string to the wrapped file. This method writes the length + of the string first, so you can read the string back without having to + know how long it was. """ self.write_varint(len(s)) self.write(s) def write_string2(self, s): - """ - Writes a string to the file. - - Args: - s (str): The string to be written. - - Raises: - TypeError: If the input is not a string. - - Notes: - This method writes the length of the string as an unsigned short (2 bytes) followed by the string itself. - The length of the string is encoded using the `pack_ushort` function. - - Example: - >>> file = StructFile() - >>> file.write_string2("Hello, World!") - """ self.write(pack_ushort(len(s)) + s) def write_string4(self, s): - """ - Writes a string to the file using a custom 4-byte length prefix. - - Args: - s (str): The string to be written. - - Raises: - TypeError: If the input is not a string. - - Notes: - This method writes the length of the string as a 4-byte integer - followed by the string itself. The length prefix allows for efficient - reading of the string later on. - - Example: - >>> file.write_string4("Hello, World!") - """ self.write(pack_int(len(s)) + s) def read_string(self): - """ - Reads a string from the wrapped file. - - This method reads a string from the file by first reading the length of the string - using the `read_varint` method, and then reading the actual string using the `read` method. - - Returns: - str: The string read from the file. - - Raises: - IOError: If there is an error reading from the file. - """ + """Reads a string from the wrapped file.""" return self.read(self.read_varint()) def read_string2(self): - """ - Reads a string from the file. - - This method reads a string from the file by first reading the length of the string as an unsigned short, - and then reading the actual string data from the file. - - Returns: - str: The string read from the file. - - Raises: - IOError: If there is an error reading from the file. - - Usage: - string = read_string2() - """ l = self.read_ushort() return self.read(l) def read_string4(self): - """ - Reads a string from the file. - - This method reads a string from the file by first reading the length of the string - as an integer using the `read_int()` method, and then reading the actual string - using the `read()` method. - - Returns: - str: The string read from the file. - - """ l = self.read_int() return self.read(l) def get_string2(self, pos): - """ - Retrieves a string from the file at the given position. - - Args: - pos (int): The position in the file where the string starts. - - Returns: - tuple: A tuple containing the string and the position of the next byte after the string. - - Raises: - IndexError: If the position is out of range. - - Notes: - This method reads the length of the string from the file, and then reads the string itself. - The length of the string is stored as an unsigned short (2 bytes) at the given position. 
- The string is read from the file starting at `pos + 2` and its length is determined by the value read from the file. - The returned tuple contains the string and the position of the next byte after the string. - - Example: - >>> file = StructFile(...) - >>> string, next_pos = file.get_string2(10) - """ l = self.get_ushort(pos) base = pos + _SHORT_SIZE return self.get(base, l), base + l def get_string4(self, pos): - """ - Retrieves a string from the file at the given position. - - Args: - pos (int): The position in the file where the string starts. - - Returns: - tuple: A tuple containing the string and the position of the next byte after the string. - - Raises: - ValueError: If the position is invalid or the string length is negative. - - Notes: - This method reads the length of the string from the file at the given position, - then reads the string itself from the file. It returns the string and the position - of the next byte after the string. - - The string is read from the file using the `get` method, which reads a specified - number of bytes from the file starting at a given position. - - Example usage: - ``` - string, next_pos = structfile.get_string4(10) - ``` - - """ l = self.get_int(pos) base = pos + _INT_SIZE return self.get(base, l), base + l def skip_string(self): - """ - Skips a string in the file. - - This method reads the length of the string from the file using the `read_varint` method, - and then seeks forward in the file by that length. - - Note: - - This method assumes that the file pointer is positioned at the start of the string. - - The `read_varint` method is responsible for reading the variable-length integer that - represents the length of the string. - - Returns: - None - - Raises: - IOError: If there is an error reading or seeking in the file. - """ l = self.read_varint() self.seek(l, 1) def write_varint(self, i): - """ - Writes a variable-length unsigned integer to the wrapped file. - - Parameters: - i (int): The integer value to be written. - - Returns: - None - - Raises: - TypeError: If the input value is not an integer. - ValueError: If the input value is negative. - - Notes: - This method writes a variable-length unsigned integer to the file. The integer value is encoded using a - variable-length encoding scheme, where smaller values require fewer bytes to represent. The encoded value - is written to the file using the `write` method of the wrapped file object. - - Example: - To write the integer value 42 to the file, you can use the following code: - - >>> file = StructFile(...) - >>> file.write_varint(42) - """ + """Writes a variable-length unsigned integer to the wrapped file.""" self.write(varint(i)) def write_svarint(self, i): - """ - Writes a variable-length signed integer to the wrapped file. - - Parameters: - i (int): The signed integer to be written. - - Returns: - None - - Raises: - IOError: If an error occurs while writing to the file. - - Notes: - This method writes a variable-length signed integer to the file. The integer is encoded using a - variable-length encoding scheme, where the most significant bit of each byte indicates whether - there are more bytes to follow. This allows for efficient storage of integers that can have a - wide range of values. - - The method uses the `signed_varint` function to encode the integer before writing it to the file. 
-
-        Example:
-        To write a signed integer to a file:
-
-        ```
-        file = StructFile("data.bin")
-        file.write_svarint(-42)
-        file.close()
-        ```
-        """
+        """Writes a variable-length signed integer to the wrapped file."""
         self.write(signed_varint(i))

     def read_varint(self):
         """Reads a variable-length encoded unsigned integer from the wrapped
         file.
-
-        This method reads a variable-length encoded unsigned integer from the
-        file object that is wrapped by this StructFile instance. The integer
-        is encoded using a variable-length encoding scheme, where the number
-        of bytes used to represent the integer depends on its value.
-
-        Returns:
-            int: The decoded unsigned integer.
-
-        Raises:
-            IOError: If there is an error reading from the file.
-
-        Example:
-            >>> with open('data.bin', 'rb') as f:
-            ...     sf = StructFile(f)
-            ...     value = sf.read_varint()
-            ...     print(value)
-            42
-
-        Note:
-            This method assumes that the file object is positioned at the
-            start of the encoded integer. After reading the integer, the file
-            object's position will be advanced by the number of bytes read.
-        """
+        """
         return read_varint(self.read)

     def read_svarint(self):
-        """Reads a variable-length encoded signed integer from the wrapped file.
-
-        This method reads a variable-length encoded signed integer from the wrapped file.
-        It uses the `read_varint` function to read the variable-length encoded integer,
-        and then decodes it as a signed integer using the `decode_signed_varint` function.
-
-        Returns:
-            int: The decoded signed integer.
-
-        Raises:
-            IOError: If there is an error reading from the file.
-
-        Example:
-            >>> file = StructFile("data.bin")
-            >>> value = file.read_svarint()
+        """Reads a variable-length encoded signed integer from the wrapped
+        file.
         """
         return decode_signed_varint(read_varint(self.read))

     def write_tagint(self, i):
+        """Writes a sometimes-compressed unsigned integer to the wrapped file.
+        This is similar to the varint methods but uses a less compressed but
+        faster format.
         """
-        Writes a sometimes-compressed unsigned integer to the wrapped file.
-        The write_tagint method is used to write an unsigned integer to the file. It uses a
-        sometimes-compressed format for faster writing. The method supports numbers from 0 to
-        2^32-1.
-
-        Parameters:
-        - i (int): The unsigned integer to be written to the file.
-
-        Notes:
-        - Numbers from 0 to 253 are stored in one byte.
-        - Byte 254 indicates that an unsigned 16-bit integer follows.
-        - Byte 255 indicates that an unsigned 32-bit integer follows.
-
-        Example usage:
-        ```
-        file = StructFile()
-        file.write_tagint(42)
-        ```
-
-        """
+        # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit
+        # int follows." Byte 255 means "An unsigned 32-bit int follows."
         if i <= 253:
             self.write(chr(i))
         elif i <= 65535:
@@ -828,30 +223,10 @@ def write_tagint(self, i):

     def read_tagint(self):
         """Reads a sometimes-compressed unsigned integer from the wrapped file.
-
-        This method reads an unsigned integer from the file. The integer can be
-        stored in two different formats: a compressed format and a faster but
-        less compressed format.
-
-        The compressed format uses a single byte to represent the integer. If
-        the first byte read from the file is 254, the integer is stored in the
-        compressed format and can be retrieved using the `read_ushort()` method.
-        If the first byte is 255, the integer is stored in the compressed format
-        and can be retrieved using the `read_uint()` method. Otherwise, the first
-        byte represents the integer itself.
- - Returns: - int: The unsigned integer read from the file. - - Example: - Suppose we have a file with the following bytes: [253, 42]. Calling - `read_tagint()` on this file will return 253, as the first byte - represents the integer itself. - - Note: - This method assumes that the file is opened in binary mode. - + This is similar to the varint methods but uses a less compressed but + faster format. """ + tb = ord(self.read(1)) if tb == 254: return self.read_ushort() @@ -861,286 +236,50 @@ def read_tagint(self): return tb def write_byte(self, n): - """Writes a single byte to the wrapped file. - - This method writes a single byte to the file object that is wrapped by the StructFile instance. - It is a shortcut for calling `file.write(chr(n))`. - - Parameters: - - n (int): The byte value to be written to the file. Must be an integer between 0 and 255. - - Raises: - - TypeError: If the provided value `n` is not an integer. - - ValueError: If the provided value `n` is not within the valid range of 0 to 255. - - Example: - ``` - with open("data.bin", "wb") as file: - struct_file = StructFile(file) - struct_file.write_byte(65) # Writes the ASCII value for 'A' to the file - ``` - - Note: - This method assumes that the file object is opened in binary mode ('b'). - + """Writes a single byte to the wrapped file, shortcut for + ``file.write(chr(n))``. """ self.write(pack_byte(n)) def read_byte(self): - """ - Reads a single byte from the file and returns its integer value. - - Returns: - int: The integer value of the byte read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - """ return ord(self.read(1)) def write_pickle(self, obj, protocol=-1): - """ - Writes a pickled representation of obj to the wrapped file. - - Parameters: - obj (object): The object to be pickled and written to the file. - protocol (int, optional): The pickling protocol to use. Default is -1. - - Raises: - pickle.PicklingError: If an error occurs during pickling. - - Notes: - This method uses the `pickle.dump()` function to write a pickled representation - of the given object to the file. The pickling protocol determines the format - in which the object is serialized. The default protocol (-1) uses the highest - available protocol supported by the Python interpreter. - - Example: - # Create a StructFile object - file = StructFile("data.bin") - - # Write a list object to the file using pickle - data = [1, 2, 3, 4, 5] - file.write_pickle(data) - - """ + """Writes a pickled representation of obj to the wrapped file.""" dump(obj, self.file, protocol) def read_pickle(self): - """ - Reads a pickled object from the wrapped file. - - Returns: - object: The pickled object read from the file. - - Raises: - EOFError: If the end of the file is reached before a pickled object is found. - pickle.UnpicklingError: If there is an error while unpickling the object. - """ + """Reads a pickled object from the wrapped file.""" return load(self.file) def write_sbyte(self, n): - """ - Writes a signed byte to the file. - - Args: - n (int): The signed byte value to write. - - Raises: - IOError: If an error occurs while writing to the file. - - Notes: - - The signed byte value should be within the range of -128 to 127. - - The file should be opened in binary mode before calling this method. - - Example: - To write a signed byte value of -42 to the file: - - >>> file.write_sbyte(-42) - """ self.write(pack_sbyte(n)) def write_int(self, n): - """ - Writes an integer to the file. 
- - Parameters: - - n (int): The integer to be written. - - Returns: - None - - Raises: - - TypeError: If the input is not an integer. - - Notes: - - This method writes the integer to the file using the pack_int function. - - The pack_int function converts the integer into a binary representation. - - The binary representation is then written to the file. - - If the input is not an integer, a TypeError is raised. - """ self.write(pack_int(n)) def write_uint(self, n): - """ - Writes an unsigned integer to the file. - - Parameters: - n (int): The unsigned integer to write. - - Returns: - None - - Raises: - IOError: If an error occurs while writing to the file. - - Notes: - This method writes the unsigned integer `n` to the file. The integer is encoded using the `pack_uint` function. - - Example: - file.write_uint(42) - """ self.write(pack_uint(n)) def write_uint_le(self, n): - """ - Writes an unsigned integer in little-endian format to the file. - - Parameters: - - n (int): The unsigned integer to write. - - Returns: - None - - Raises: - - TypeError: If the input is not an integer. - - ValueError: If the input is a negative integer. - - Example: - >>> file.write_uint_le(42) - """ self.write(pack_uint_le(n)) def write_ushort(self, n): - """ - Writes an unsigned short integer (2 bytes) to the file. - - Parameters: - - n (int): The unsigned short integer to be written. - - Returns: - None - - Raises: - - IOError: If an error occurs while writing to the file. - - Usage: - file.write_ushort(42) - """ self.write(pack_ushort(n)) def write_ushort_le(self, n): - """ - Writes an unsigned short integer (2 bytes) in little-endian byte order to the file. - - Parameters: - - n (int): The unsigned short integer to be written. - - Returns: - None - - Raises: - - IOError: If an error occurs while writing to the file. - - Usage: - file.write_ushort_le(65535) - """ self.write(pack_ushort_le(n)) def write_long(self, n): - """ - Writes a long integer to the file. - - Parameters: - - n (int): The long integer to be written. - - Returns: - None - - Raises: - - IOError: If an error occurs while writing to the file. - - Notes: - - This method writes the long integer to the file using the pack_long function. - - The pack_long function converts the long integer into a binary representation. - - The binary representation is then written to the file. - - If an error occurs while writing to the file, an IOError is raised. - """ self.write(pack_long(n)) def write_ulong(self, n): - """ - Writes an unsigned long integer to the file. - - Parameters: - n (int): The unsigned long integer to write. - - Returns: - None - - Raises: - IOError: If an error occurs while writing to the file. - - Notes: - This method writes an unsigned long integer to the file using the pack_ulong function. - The pack_ulong function converts the integer into a byte string representation according to the platform's byte order. - The resulting byte string is then written to the file. - - Example: - To write the unsigned long integer 123456789 to the file: - - >>> file.write_ulong(123456789) - """ self.write(pack_ulong(n)) def write_float(self, n): - """ - Writes a floating-point number to the file. - - Args: - n (float): The floating-point number to write. - - Raises: - IOError: If an error occurs while writing to the file. - - Notes: - This method uses the `pack_float` function to convert the floating-point number - into a binary representation before writing it to the file. 
- - Example: - >>> file = StructFile("data.bin", "wb") - >>> file.write_float(3.14) - """ self.write(pack_float(n)) def write_array(self, arry): - """ - Write an array to the file. - - This method writes the given array to the file. If the system is little-endian, - the array is first byte-swapped before writing. If the file is a real file, - the array is written using the `tofile()` method. Otherwise, the array is - converted to bytes and written using the `write()` method. - - Parameters: - - arry (array): The array to be written to the file. - - Returns: - None - - Raises: - None - """ if IS_LITTLE: arry = copy(arry) arry.byteswap() @@ -1150,180 +289,33 @@ def write_array(self, arry): self.write(arry.tobytes()) def read_sbyte(self): - """ - Reads a signed byte from the file. - - Returns: - int: The signed byte value read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - - Notes: - This method reads a single byte from the file and interprets it as a signed value. - The byte is unpacked using the `unpack_sbyte` function, which returns a tuple. - The first element of the tuple is the signed byte value, which is then returned. - - Example: - >>> file = StructFile("data.bin") - >>> byte = file.read_sbyte() - >>> print(byte) - -42 - """ return unpack_sbyte(self.read(1))[0] def read_int(self): - """ - Reads an integer value from the file. - - Returns: - int: The integer value read from the file. - - Raises: - IOError: If there is an error reading from the file. - - """ return unpack_int(self.read(_INT_SIZE))[0] def read_uint(self): - """ - Reads an unsigned integer from the file. - - Returns: - int: The unsigned integer read from the file. - - Raises: - IOError: If there is an error reading from the file. - """ return unpack_uint(self.read(_INT_SIZE))[0] def read_uint_le(self): - """ - Reads an unsigned integer from the file using little-endian byte order. - - Returns: - int: The unsigned integer read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - - Notes: - This method reads an unsigned integer from the file using little-endian byte order. - It assumes that the file is opened in binary mode. - - Example: - >>> file = StructFile("data.bin") - >>> value = file.read_uint_le() - """ return unpack_uint_le(self.read(_INT_SIZE))[0] def read_ushort(self): - """ - Reads an unsigned short (2 bytes) from the file. - - Returns: - int: The unsigned short value read from the file. - - Raises: - IOError: If there is an error reading from the file. - """ return unpack_ushort(self.read(_SHORT_SIZE))[0] def read_ushort_le(self): - """ - Reads an unsigned short (2 bytes) from the file in little-endian byte order. - - Returns: - int: The unsigned short value read from the file. - - Raises: - IOError: If there is an error reading from the file. - - Example: - >>> file = StructFile("data.bin") - >>> value = file.read_ushort_le() - """ return unpack_ushort_le(self.read(_SHORT_SIZE))[0] def read_long(self): - """ - Reads a long integer from the file. - - Returns: - int: The long integer read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - - Notes: - This method reads a long integer from the file using the `read` method of the file object. - The long integer is unpacked from the binary data using the `unpack_long` function. - The `unpack_long` function returns a tuple, and the first element of the tuple is returned as the result. 
- - Example: - >>> file = StructFile("data.bin") - >>> value = file.read_long() - """ return unpack_long(self.read(_LONG_SIZE))[0] def read_ulong(self): - """ - Reads an unsigned long integer from the file. - - Returns: - int: The unsigned long integer read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - - Notes: - This method reads a fixed-size unsigned long integer from the file. The size of the - unsigned long integer is determined by the `_LONG_SIZE` constant. - - Example: - >>> file = StructFile("data.bin") - >>> value = file.read_ulong() - """ return unpack_ulong(self.read(_LONG_SIZE))[0] def read_float(self): - """ - Reads a single floating-point number from the file. - - Returns: - float: The floating-point number read from the file. - - Raises: - IOError: If an error occurs while reading from the file. - """ return unpack_float(self.read(_FLOAT_SIZE))[0] def read_array(self, typecode, length): - """ - Read an array of elements from the file. - - Args: - typecode (str): The typecode of the array elements. - length (int): The number of elements to read. - - Returns: - array: The array of elements read from the file. - - Raises: - IOError: If there is an error reading from the file. - - Notes: - - If the file is in "real" mode, the array is read using the `fromfile` method of the array object. - - If the file is not in "real" mode, the array is read using the `read` method of the file object and then converted to an array using the `frombytes` method of the array object. - - If the system is little-endian, the byte order of the array is swapped using the `byteswap` method of the array object. - - Example: - # Create a StructFile object - file = StructFile("data.bin") - - # Read an array of integers from the file - arr = file.read_array('i', 10) - """ a = array(typecode) if self.is_real: a.fromfile(self.file, length) @@ -1334,240 +326,40 @@ def read_array(self, typecode, length): return a def get(self, position, length): - """ - Reads a specified number of bytes from the file starting at the given position. - - Args: - position (int): The position in the file to start reading from. - length (int): The number of bytes to read from the file. - - Returns: - bytes: The bytes read from the file. - - Raises: - OSError: If an error occurs while reading from the file. - - Example: - >>> file = StructFile("data.bin") - >>> data = file.get(10, 20) - """ self.seek(position) return self.read(length) def get_byte(self, position): - """ - Retrieves a single byte from the file at the specified position. - - Parameters: - position (int): The position in the file from which to retrieve the byte. - - Returns: - int: The byte value at the specified position. - - Raises: - IndexError: If the position is out of range. - - Example: - # Create a StructFile object - file = StructFile("data.bin") - - # Get the byte at position 10 - byte = file.get_byte(10) - """ return unpack_byte(self.get(position, 1))[0] def get_sbyte(self, position): - """ - Retrieves a signed byte (8-bit integer) from the file at the specified position. - - Parameters: - - position (int): The position in the file from which to read the signed byte. - - Returns: - - int: The signed byte value read from the file. - - Raises: - - IndexError: If the position is out of range. 
- - Example: - ``` - file = StructFile("data.bin") - byte = file.get_sbyte(10) - print(byte) # Output: -42 - ``` - """ return unpack_sbyte(self.get(position, 1))[0] def get_int(self, position): - """ - Retrieves an integer value from the file at the specified position. - - Parameters: - position (int): The position in the file from which to retrieve the integer value. - - Returns: - int: The integer value retrieved from the file. - - Raises: - IndexError: If the position is out of range. - - """ return unpack_int(self.get(position, _INT_SIZE))[0] def get_uint(self, position): - """ - Retrieves an unsigned integer from the file at the given position. - - Parameters: - - position (int): The position in the file from which to read the unsigned integer. - - Returns: - - int: The unsigned integer value read from the file. - - Raises: - - IndexError: If the position is out of range. - """ return unpack_uint(self.get(position, _INT_SIZE))[0] def get_ushort(self, position): - """ - Retrieves an unsigned short integer (2 bytes) from the file at the specified position. - - Parameters: - - position (int): The position in the file from which to read the unsigned short integer. - - Returns: - - ushort (int): The unsigned short integer value read from the file. - - Raises: - - IndexError: If the position is out of range. - - Example: - ``` - file = StructFile("data.bin") - ushort_value = file.get_ushort(10) - ``` - """ return unpack_ushort(self.get(position, _SHORT_SIZE))[0] def get_long(self, position): - """ - Retrieves a long integer value from the file at the given position. - - Parameters: - position (int): The position in the file from which to read the long integer. - - Returns: - int: The long integer value read from the file. - - Raises: - ValueError: If the position is out of bounds or if the file is not open. - - Notes: - - This method reads a long integer value from the file at the specified position. - - The file must be open before calling this method. - - The position must be a valid position within the file. - """ return unpack_long(self.get(position, _LONG_SIZE))[0] def get_ulong(self, position): - """ - Retrieves an unsigned long integer from the file at the specified position. - - Parameters: - position (int): The position in the file from which to read the unsigned long integer. - - Returns: - int: The unsigned long integer value read from the file. - - Raises: - IndexError: If the position is out of range. - - Notes: - - The unsigned long integer is read from the file using the `get` method. - - The `unpack_ulong` function is used to convert the byte string to an unsigned long integer. - - Only the first value of the unpacked result is returned. - - Example: - # Create a StructFile object - file = StructFile("data.bin") - - # Read an unsigned long integer from the file at position 100 - value = file.get_ulong(100) - """ return unpack_ulong(self.get(position, _LONG_SIZE))[0] def get_float(self, position): - """ - Retrieves a float value from the file at the specified position. - - Parameters: - position (int): The position in the file where the float value is located. - - Returns: - float: The float value retrieved from the file. - - Raises: - IndexError: If the position is out of range. - - """ return unpack_float(self.get(position, _FLOAT_SIZE))[0] def get_array(self, position, typecode, length): - """ - Reads an array of elements from the file starting at the given position. - - Args: - position (int): The position in the file to start reading from. 
- typecode (str): The typecode of the elements in the array. - length (int): The number of elements to read. - - Returns: - list: A list containing the elements read from the file. - - Raises: - OSError: If there is an error reading the file. - - Example: - To read an array of 10 integers starting from position 100 in the file: - - >>> file = StructFile("data.bin") - >>> array = file.get_array(100, 'i', 10) - """ self.seek(position) return self.read_array(typecode, length) class BufferFile(StructFile): - """ - A class representing a file stored in memory as a buffer. - - This class provides methods to manipulate and retrieve data from the buffer. - - Attributes: - _buf (bytes): The buffer containing the file data. - _name (str): The name of the file. - file (BytesIO): A BytesIO object representing the file. - onclose (callable): A callback function to be called when the file is closed. - is_real (bool): Indicates whether the file is a real file or a buffer. - is_closed (bool): Indicates whether the file is closed. - - Methods: - __init__(self, buf, name=None, onclose=None): Initializes a BufferFile object. - subset(self, position, length, name=None): Creates a new BufferFile object representing a subset of the current file. - get(self, position, length): Retrieves a portion of the file data. - get_array(self, position, typecode, length): Retrieves an array of data from the file. - """ - def __init__(self, buf, name=None, onclose=None): - """ - Initializes a BufferFile object. - - Args: - buf (bytes): The buffer containing the file data. - name (str, optional): The name of the file. Defaults to None. - onclose (callable, optional): A callback function to be called when the file is closed. Defaults to None. - """ self._buf = buf self._name = name self.file = BytesIO(buf) @@ -1577,45 +369,13 @@ def __init__(self, buf, name=None, onclose=None): self.is_closed = False def subset(self, position, length, name=None): - """ - Creates a new BufferFile object that represents a subset of the current file. - - Args: - position (int): The starting position of the subset. - length (int): The length of the subset. - name (str, optional): The name of the new file. Defaults to None. - - Returns: - BufferFile: A new BufferFile object representing the subset of the current file. - """ name = name or self._name return BufferFile(self.get(position, length), name=name) def get(self, position, length): - """ - Retrieves a portion of the file data. - - Args: - position (int): The starting position of the data. - length (int): The length of the data to retrieve. - - Returns: - bytes: The requested portion of the file data. - """ return bytes(self._buf[position : position + length]) def get_array(self, position, typecode, length): - """ - Retrieves an array of data from the file. - - Args: - position (int): The starting position of the array. - typecode (str): The typecode of the array elements. - length (int): The length of the array. - - Returns: - array: An array of data retrieved from the file. - """ a = array(typecode) a.frombytes(self.get(position, length * _SIZEMAP[typecode])) if IS_LITTLE: @@ -1624,39 +384,6 @@ def get_array(self, position, typecode, length): class ChecksumFile(StructFile): - """ - A file-like object that calculates a checksum of the data read or written. - - This class inherits from StructFile and provides additional functionality to calculate a checksum - using the CRC32 algorithm from the zlib module. The checksum is updated as data is read or written. 
- - Note: This class does not support seeking. - - Usage: - - Create an instance of ChecksumFile by passing the file path or file object to the constructor. - - Read or write data using the file-like methods provided by ChecksumFile. - - Call the checksum() method to get the calculated checksum. - - Example: - ``` - with ChecksumFile("data.txt", "rb") as file: - data = file.read(1024) - print(file.checksum()) - ``` - - Attributes: - - _check: The current checksum value. - - _crc32: The CRC32 function from the zlib module. - - Methods: - - __iter__(): Returns an iterator over the lines of the file. - - seek(): Raises a ValueError as seeking is not supported. - - read(): Reads data from the file and updates the checksum. - - write(): Writes data to the file and updates the checksum. - - checksum(): Returns the calculated checksum. - - """ - def __init__(self, *args, **kwargs): StructFile.__init__(self, *args, **kwargs) self._check = 0 @@ -1668,41 +395,16 @@ def __iter__(self): yield line def seek(self, *args): - raise ValueError("Cannot seek on a ChecksumFile") + raise Exception("Cannot seek on a ChecksumFile") def read(self, *args, **kwargs): - """ - Read data from the file and update the checksum. - - Args: - - *args: Variable length argument list to pass to the underlying file's read() method. - - **kwargs: Arbitrary keyword arguments to pass to the underlying file's read() method. - - Returns: - - b: The read data. - - """ b = self.file.read(*args, **kwargs) self._check = self._crc32(b, self._check) return b def write(self, b): - """ - Write data to the file and update the checksum. - - Args: - - b: The data to write. - - """ self._check = self._crc32(b, self._check) self.file.write(b) def checksum(self): - """ - Get the calculated checksum. - - Returns: - - The calculated checksum as an unsigned 32-bit integer. - - """ return self._check & 0xFFFFFFFF diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 29095b7f..27b1a05a 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -39,8 +39,8 @@ * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order - they appear in the document (first) or show higher-scoring fragments first - (score) + they appear in the document (FIRST) or show higher-scoring fragments first + (SCORE) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. @@ -360,7 +360,7 @@ class SentenceFragmenter(Fragmenter): When highlighting with this fragmenter, you should use an analyzer that does NOT remove stop words, for example:: - sa = standard_analyzer(stoplist=None) + sa = StandardAnalyzer(stoplist=None) """ def __init__(self, maxchars=200, sentencechars=".!?", charlimit=DEFAULT_CHARLIMIT): @@ -584,12 +584,12 @@ def fragment_matches(self, text, tokens): currentlen = right - left while j < len(tokens) - 1 and currentlen < maxchars: - next_token = tokens[j + 1] - ec = next_token.endchar + next = tokens[j + 1] + ec = next.endchar if ec - right <= surround and ec - left <= maxchars: j += 1 right = ec - currentlen += ec - next_token.startchar + currentlen += ec - next.startchar else: break @@ -623,22 +623,22 @@ def __call__(self, f): # Fragment sorters -def score(fragment): +def SCORE(fragment): "Sorts higher scored passages first." return 1 -def first(fragment): +def FIRST(fragment): "Sorts passages from earlier in the document first." 
return fragment.startchar -def longer(fragment): +def LONGER(fragment): "Sorts longer passages first." return 0 - len(fragment) -def shorter(fragment): +def SHORTER(fragment): "Sort shorter passages first." return len(fragment) @@ -934,7 +934,7 @@ def highlight( top=3, scorer=None, minscore=1, - order=first, + order=FIRST, mode="query", ): if scorer is None: @@ -965,7 +965,7 @@ def __init__( scorer=None, formatter=None, always_retokenize=False, - order=first, + order=FIRST, ): self.fragmenter = fragmenter or ContextFragmenter() self.scorer = scorer or BasicFragmentScorer() diff --git a/src/whoosh/legacy.py b/src/whoosh/legacy.py index 928ac0e2..13b21e79 100644 --- a/src/whoosh/legacy.py +++ b/src/whoosh/legacy.py @@ -52,8 +52,8 @@ def load_110_toc(stream, gen, schema, version): "wcw2": "whoosh.codec.whoosh2", } objmap = { - "%(wf)s.NUMERIC": "%(wcw2)s.old_numeric", - "%(wf)s.DATETIME": "%(wcw2)s.old_datetime", + "%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", + "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py index 839b234b..ffbb4fc7 100644 --- a/src/whoosh/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -46,8 +46,6 @@ class DateParseError(Exception): "Represents an error in parsing date text." - pass - # Utility functions diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py index 01212aad..dd08eff5 100644 --- a/src/whoosh/query/terms.py +++ b/src/whoosh/query/terms.py @@ -402,7 +402,7 @@ def _find_prefix(self, text): lp = len(prefix) if lp < len(text) and text[lp] in "*?": # we stripped something starting from * or ? - they both MAY mean - # "0 times". As we had stripped starting from first special char, + # "0 times". As we had stripped starting from FIRST special char, # that implies there were only ordinary chars left of it. Thus, # the very last of them is not part of the real prefix: prefix = prefix[:-1] diff --git a/src/whoosh/support/base85.py b/src/whoosh/support/base85.py index 38e8ecb2..adb9e74c 100644 --- a/src/whoosh/support/base85.py +++ b/src/whoosh/support/base85.py @@ -25,41 +25,19 @@ def to_base85(x, islong=False): - """ - Encodes the given integer using base 85. + "Encodes the given integer using base 85." - Parameters: - - x: The integer to be encoded. - - islong: A boolean indicating whether the integer is a long integer or not. Default is False. - - Returns: - - The base 85 encoded string. - - Example: - >>> to_base85(12345) - '3qo' - """ size = 10 if islong else 5 rems = "" - for _ in range(size): + for i in range(size): rems = b85chars[x % 85] + rems x //= 85 return rems def from_base85(text): - """ - Decodes the given base 85 text into an integer. - - Parameters: - text (str): The base 85 encoded text to be decoded. + "Decodes the given base 85 text into an integer." - Returns: - int: The decoded integer value. - - Raises: - KeyError: If the input text contains characters not present in the base 85 encoding table. - """ acc = 0 for c in text: acc = acc * 85 + b85dec[c] @@ -67,28 +45,9 @@ def from_base85(text): # Bytes encoding and decoding functions -def b85encode(text, pad=False): - """ - Encode the given text using Base85 encoding. - Args: - text (str): The text to be encoded. - pad (bool, optional): Whether to pad the encoded output. Defaults to False. - Returns: - str: The Base85 encoded string. 
- - Raises: - None - - Example: - >>> b85encode("Hello World") - '87cURD]j7BEbo80' - - Note: - Base85 encoding is a binary-to-text encoding scheme that represents binary data in an ASCII string format. - It is commonly used in various applications such as data compression and data transmission. - """ +def b85encode(text, pad=False): l = len(text) r = l % 4 if r: @@ -116,39 +75,6 @@ def b85encode(text, pad=False): def b85decode(text): - """ - Decode a base85 encoded string. - - Args: - text (str): The base85 encoded string to decode. - - Returns: - bytes: The decoded binary data. - - Raises: - TypeError: If the input string contains invalid base85 characters. - OverflowError: If the decoded value exceeds the maximum representable value. - - Example: - >>> encoded = "9jqo^BlbD-BleB1DJ+*+F(f,q" - >>> decoded = b85decode(encoded) - >>> print(decoded) - b'Hello, World!' - - This function decodes a base85 encoded string and returns the corresponding binary data. - Base85 encoding is a method of representing binary data as ASCII text using 85 different characters. - The function takes a base85 encoded string as input and returns the decoded binary data. - - The function raises a TypeError if the input string contains invalid base85 characters. - It also raises an OverflowError if the decoded value exceeds the maximum representable value. - - Example usage: - >>> encoded = "9jqo^BlbD-BleB1DJ+*+F(f,q" - >>> decoded = b85decode(encoded) - >>> print(decoded) - b'Hello, World!' - """ - l = len(text) out = [] for i in range(0, len(text), 5): diff --git a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py index 0f78eb81..de7aed6c 100644 --- a/src/whoosh/support/bench.py +++ b/src/whoosh/support/bench.py @@ -59,51 +59,31 @@ def __init__(self, d): class Module: def __init__(self, bench, options, args): - """ - Initializes a Module object. - - Args: - bench (object): The benchmark object. - options (object): The options object. - args (object): The arguments object. - """ self.bench = bench self.options = options self.args = args def __repr__(self): - """ - Returns a string representation of the Module object. - """ return self.__class__.__name__ def indexer(self, **kwargs): """ - Indexes the data using the specified keyword arguments. + This method is responsible for indexing the data using the specified keyword arguments. - Args: - **kwargs: Additional keyword arguments for configuring the indexing process. + Parameters: + - kwargs: Additional keyword arguments for configuring the indexing process. Returns: - None + - None """ pass def index_document(self, d): - """ - Indexes a document. - - Args: - d (object): The document object. - - Raises: - NotImplementedError: If the method is not implemented in the subclass. - """ raise NotImplementedError def finish(self, **kwargs): """ - Finishes the benchmark and performs any necessary cleanup. + Finish the benchmark and perform any necessary cleanup. Args: **kwargs: Additional keyword arguments. @@ -114,15 +94,6 @@ def finish(self, **kwargs): pass def _process_result(self, d): - """ - Processes the result. - - Args: - d (object): The result object. - - Returns: - The processed result. - """ attrname = f"process_result_{self.options.lib}" if hasattr(self.bench.spec, attrname): method = getattr(self.bench.spec, attrname) @@ -134,97 +105,33 @@ def _process_result(self, d): def searcher(self): """ - Returns a searcher object. + This method returns a searcher object. """ pass def query(self): - """ - Executes a query. 
- - Raises: - NotImplementedError: If the method is not implemented in the subclass. - """ raise NotImplementedError def find(self, q): - """ - Finds a query. - - Args: - q (object): The query object. - - Raises: - NotImplementedError: If the method is not implemented in the subclass. - """ raise NotImplementedError def findterms(self, terms): - """ - Finds terms. - - Args: - terms (object): The terms object. - - Raises: - NotImplementedError: If the method is not implemented in the subclass. - """ raise NotImplementedError def results(self, r): - """ - Generates processed results. - - Args: - r (object): The results object. - - Yields: - The processed results. - """ for hit in r: yield self._process_result(hit) class Spec: - """ - The Spec class represents a benchmark specification. - - Attributes: - headline_field (str): The name of the field containing the headline. - main_field (str): The name of the main field. - options (object): The benchmark options. - args (list): The benchmark arguments. - - Methods: - __init__(self, options, args): Initializes a new instance of the Spec class. - documents(self): Abstract method to be implemented by subclasses. - setup(self): Performs the setup for the benchmark. - print_results(self, ls): Prints the benchmark results. - - Usage: - spec = Spec(options, args) - spec.setup() - spec.print_results(ls) - """ - headline_field = "title" main_field = "body" def __init__(self, options, args): - """ - Initializes a new instance of the Spec class. - - Args: - options (object): The benchmark options. - args (list): The benchmark arguments. - """ self.options = options self.args = args def documents(self): - """ - Abstract method to be implemented by subclasses. - """ raise NotImplementedError def setup(self): @@ -234,12 +141,6 @@ def setup(self): pass def print_results(self, ls): - """ - Prints the benchmark results. - - Args: - ls (list): The list of benchmark results. - """ showbody = self.options.showbody snippets = self.options.snippets limit = self.options.limit @@ -255,47 +156,7 @@ def print_results(self, ls): class WhooshModule(Module): - """ - A module for interacting with the Whoosh search engine. - - This module provides methods for indexing documents, searching the index, and retrieving search results. - - Attributes: - writer: An instance of the Whoosh IndexWriter used for adding documents to the index. - srch: An instance of the Whoosh IndexSearcher used for searching the index. - parser: An instance of the Whoosh QueryParser used for parsing search queries. - - Methods: - indexer(create=True): Initializes the Whoosh index and sets up the IndexWriter. - index_document(d): Indexes a document in the Whoosh index. - finish(merge=True, optimize=False): Commits changes to the index. - searcher(): Initializes the IndexSearcher and QueryParser. - query(): Parses the search query string and returns a Query object. - find(q): Executes a search query and returns the search results. - findterms(terms): Executes multiple search queries for each term and returns the search results. - - Usage: - module = WhooshModule() - module.indexer() - module.index_document(document) - module.finish() - module.searcher() - query = module.query() - results = module.find(query) - """ - def indexer(self, create=True): - """ - Creates or opens an index using the specified schema and options. - - Args: - create (bool, optional): If True, creates a new index if it doesn't exist. - If False, opens an existing index. - Defaults to True. 
- - Returns: - IndexWriter: An instance of IndexWriter for the created or opened index. - """ schema = self.bench.spec.whoosh_schema() path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") @@ -323,123 +184,30 @@ def indexer(self, create=True): self._procdoc = self.bench.spec.process_document_whoosh def index_document(self, d): - """ - Indexes a document in the Whoosh index. - - Args: - d (dict): The document to be indexed. The keys represent the field names and the values represent the field values. - - Returns: - None - """ _procdoc = self._procdoc if _procdoc: _procdoc(d) self.writer.add_document(**d) def finish(self, merge=True, optimize=False): - """ - Commits the changes made to the index. - - Args: - merge (bool, optional): Specifies whether to perform a merge operation before committing. - Defaults to True. - optimize (bool, optional): Specifies whether to optimize the index after committing. - Defaults to False. - - Returns: - None - - Raises: - Any exceptions raised by the underlying writer.commit() method. - - Notes: - - This method should be called after making changes to the index to ensure that the changes - are persisted. - - By default, a merge operation is performed before committing. This helps in optimizing - the index by merging smaller segments into larger ones. - - If the `optimize` parameter is set to True, the index will be further optimized after - committing. This can improve search performance but may take longer to complete. - - Usage: - bench = Bench() - # ... perform index modifications ... - bench.finish(merge=True, optimize=False) - """ self.writer.commit(merge=merge, optimize=optimize) def searcher(self): - """ - Creates and returns a searcher object for performing searches on the index. - - Returns: - Searcher: A searcher object that can be used to perform searches on the index. - - Raises: - OSError: If there is an error while opening the index directory. - """ path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") ix = index.open_dir(path) self.srch = ix.searcher(weighting=scoring.PL2()) self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) def query(self): - """ - Parses the query string and returns a parsed query object. - - Args: - None - - Returns: - A parsed query object. - - Raises: - None - - Example: - bench = Bench() - bench.query() # Returns a parsed query object - """ qstring = " ".join(self.args).decode("utf-8") return self.parser.parse(qstring) def find(self, q): - """ - Executes a search query and returns the results. - - Args: - q (str): The search query string. - - Returns: - list: A list of search results. - - """ return self.srch.search( q, limit=int(self.options.limit), optimize=self.options.optimize ) def findterms(self, terms): - """ - Searches for the given terms in the specified field and returns the search results. - - Args: - terms (list): A list of terms to search for. - - Yields: - whoosh.searching.Results: The search results for each term. - - Returns: - None - - Raises: - None - - Example: - bench = Bench() - terms = ["term1", "term2", "term3"] - for result in bench.findterms(terms): - print(result) - """ limit = int(self.options.limit) s = self.srch q = query.Term(self.bench.spec.main_field, None) @@ -449,75 +217,12 @@ def findterms(self, terms): class XappyModule(Module): - """ - A module for indexing and searching documents using Xappy. 
- - This module provides methods for indexing documents, performing searches, - and retrieving search results using the Xappy library. - - Usage: - 1. Create an instance of XappyModule. - 2. Call the `indexer` method to obtain a connection to the Xappy index. - 3. Use the `index_document` method to add documents to the index. - 4. Call the `finish` method to flush any pending changes to the index. - 5. Call the `searcher` method to obtain a connection for searching the index. - 6. Use the `query` method to create a query object for searching. - 7. Call the `find` method to perform a search and retrieve the results. - 8. Use the `results` method to iterate over the search results. - - Note: Before using this module, make sure to install the Xappy library. - - Attributes: - options (object): An object containing configuration options. - bench (object): An object representing the benchmarking tool. - - Methods: - indexer(**kwargs): Returns a connection to the Xappy index. - index_document(conn=None, d=None): Indexes a document in the Xappy index. - finish(conn): Flushes any pending changes to the Xappy index. - searcher(): Returns a connection for searching the Xappy index. - query(conn=None): Creates a query object for searching the Xappy index. - find(conn=None, q=None): Performs a search and retrieves the results. - findterms(conn=None, terms=None): Performs searches for multiple terms. - results(r): Iterates over the search results. - - """ - def indexer(self, **kwargs): - """ - Creates and returns a connection to the Xappy index. - - Args: - **kwargs: Additional keyword arguments to be passed to the Xappy connection. - - Returns: - Xappy connection: A connection to the Xappy index. - - Raises: - None. - - Example usage: - conn = indexer() - # Use the connection to perform operations on the Xappy index - """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") conn = self.bench.spec.xappy_connection(path) return conn def index_document(self, conn=None, d=None): - """ - Indexes a document in the Xappy index. - - Args: - conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. - d (dict): The document to be indexed. - - Returns: - None. - - Raises: - None. - """ if hasattr(self.bench, "process_document_xappy"): self.bench.process_document_xappy(d) doc = xappy.UnprocessedDocument() @@ -529,99 +234,25 @@ def index_document(self, conn=None, d=None): conn.add(doc) def finish(self, conn): - """ - Flushes any pending changes to the Xappy index. - - Args: - conn (Xappy connection): The connection to the Xappy index. - - Returns: - None. - - Raises: - None. - """ conn.flush() def searcher(self): - """ - Returns a connection for searching the Xappy index. - - Args: - None. - - Returns: - Xappy connection: A connection for searching the Xappy index. - - Raises: - None. - """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") return xappy.SearchConnection(path) def query(self, conn=None): - """ - Creates a query object for searching the Xappy index. - - Args: - conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. - - Returns: - Xappy query: A query object for searching the Xappy index. - - Raises: - None. - """ return conn.query_parse(" ".join(self.args)) def find(self, conn=None, q=None): - """ - Performs a search and retrieves the results. - - Args: - conn (Xappy connection, optional): The connection to the Xappy index. 
If not provided, a new connection will be created. - q (Xappy query): The query object for searching the Xappy index. - - Returns: - Xappy results: The search results. - - Raises: - None. - """ return conn.search(q, 0, int(self.options.limit)) def findterms(self, conn=None, terms=None): - """ - Performs searches for multiple terms. - - Args: - conn (Xappy connection, optional): The connection to the Xappy index. If not provided, a new connection will be created. - terms (list): The list of terms to search for. - - Returns: - generator: A generator that yields the search results for each term. - - Raises: - None. - """ limit = int(self.options.limit) for term in terms: q = conn.query_field(self.bench.spec.main_field, term) yield conn.search(q, 0, limit) def results(self, r): - """ - Iterates over the search results. - - Args: - r (Xappy results): The search results. - - Returns: - generator: A generator that yields each search result. - - Raises: - None. - """ hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: @@ -629,41 +260,12 @@ def results(self, r): class XapianModule(Module): - """ - XapianModule is a module that provides indexing and searching capabilities using Xapian. - - Args: - Module (class): The base class for all modules. - - Attributes: - database (xapian.WritableDatabase): The Xapian writable database. - ixer (xapian.TermGenerator): The Xapian term generator. - db (xapian.Database): The Xapian database. - enq (xapian.Enquire): The Xapian enquire object. - qp (xapian.QueryParser): The Xapian query parser. - - """ - def indexer(self, **kwargs): - """ - Initializes the Xapian indexer. - - Args: - **kwargs: Additional keyword arguments. - - """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xapian") self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) self.ixer = xapian.TermGenerator() def index_document(self, d): - """ - Indexes a document in the Xapian database. - - Args: - d (dict): The document to be indexed. - - """ if hasattr(self.bench, "process_document_xapian"): self.bench.process_document_xapian(d) doc = xapian.Document() @@ -674,20 +276,9 @@ def index_document(self, d): self.database.add_document(doc) def finish(self, **kwargs): - """ - Flushes the Xapian database. - - Args: - **kwargs: Additional keyword arguments. - - """ self.database.flush() def searcher(self): - """ - Initializes the Xapian searcher. - - """ path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") self.db = xapian.Database(path) self.enq = xapian.Enquire(self.db) @@ -695,40 +286,13 @@ def searcher(self): self.qp.set_database(self.db) def query(self): - """ - Parses and returns the query. - - Returns: - xapian.Query: The parsed query. - - """ return self.qp.parse_query(" ".join(self.args)) def find(self, q): - """ - Finds and returns the matching documents for the given query. - - Args: - q (xapian.Query): The query to search for. - - Returns: - xapian.MSet: The matching documents. - - """ self.enq.set_query(q) return self.enq.get_mset(0, int(self.options.limit)) def findterms(self, terms): - """ - Finds and returns the matching documents for each term in the given list. - - Args: - terms (list): The list of terms to search for. - - Yields: - xapian.MSet: The matching documents for each term. 
- - """ limit = int(self.options.limit) for term in terms: q = self.qp.parse_query(term) @@ -736,16 +300,6 @@ def findterms(self, terms): yield self.enq.get_mset(0, limit) def results(self, matches): - """ - Processes and yields the results from the given matches. - - Args: - matches (xapian.MSet): The matches to process. - - Yields: - dict: The processed result for each match. - - """ hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for m in matches: @@ -755,176 +309,47 @@ def results(self, matches): class SolrModule(Module): - """ - A module for interacting with Apache Solr. - - This module provides methods for indexing documents, searching for documents, - and retrieving search results from an Apache Solr server. - - Args: - Module (class): The base class for all modules. - - Attributes: - solr_doclist (list): A list to store the documents to be indexed. - conn (pysolr.Solr): A connection object to interact with the Solr server. - solr (pysolr.Solr): A connection object to interact with the Solr server for searching. - - """ - def indexer(self, **kwargs): - """ - Initializes the SolrModule for indexing. - - This method initializes the SolrModule by creating a connection to the Solr server, - deleting all existing documents in the server, and committing the changes. - - Args: - **kwargs: Additional keyword arguments. - - """ - self.solr_doclist = [] self.conn = pysolr.Solr(self.options.url) self.conn.delete("*:*") self.conn.commit() def index_document(self, d): - """ - Adds a document to the list of documents to be indexed. - - This method adds a document to the list of documents to be indexed. - If the number of documents in the list reaches the batch size specified in the options, - the documents are added to the Solr server and the list is cleared. - - Args: - d (dict): The document to be indexed. - - """ - self.solr_doclist.append(d) if len(self.solr_doclist) >= int(self.options.batch): self.conn.add(self.solr_doclist, commit=False) self.solr_doclist = [] def finish(self, **kwargs): - """ - Finalizes the indexing process. - - This method finalizes the indexing process by adding any remaining documents in the list - to the Solr server, optimizing the server, and cleaning up resources. - - Args: - **kwargs: Additional keyword arguments. - - """ - if self.solr_doclist: self.conn.add(self.solr_doclist) del self.solr_doclist self.conn.optimize(block=True) def searcher(self): - """ - Initializes the SolrModule for searching. - - This method initializes the SolrModule by creating a connection to the Solr server - specifically for searching. - - """ - self.solr = pysolr.Solr(self.options.url) def query(self): - """ - Constructs a query string. - - This method constructs a query string by joining the arguments passed to the script. - - Returns: - str: The constructed query string. - - """ - return " ".join(self.args) def find(self, q): - """ - Executes a search query. - - This method executes a search query on the Solr server using the provided query string. - - Args: - q (str): The query string. - - Returns: - pysolr.Results: The search results. - - """ - return self.solr.search(q, limit=int(self.options.limit)) def findterms(self, terms): - """ - Executes search queries for each term. - - This method executes search queries on the Solr server for each term in the provided list. - The search queries are constructed by appending the term to the "body:" field. - - Args: - terms (list): The list of terms to search for. 
- - Yields: - pysolr.Results: The search results for each term. - - """ - limit = int(self.options.limit) for term in terms: yield self.solr.search("body:" + term, limit=limit) class ZcatalogModule(Module): - """ - A module for indexing and searching documents using ZCatalog. - - This module provides functionality for indexing and searching documents using ZCatalog, - which is a powerful indexing and search system for Python applications. - - Usage: - 1. Create an instance of ZcatalogModule. - 2. Call the `indexer` method to set up the indexing environment. - 3. Call the `index_document` method to index a document. - 4. Call the `finish` method to commit the changes and clean up resources. - 5. Call the `searcher` method to set up the searching environment. - 6. Call the `query` method to specify the search query. - 7. Call the `find` method to retrieve search results. - 8. Call the `findterms` method to retrieve search results for each term in a list. - 9. Call the `results` method to process and iterate over search results. - - Note: This module requires the ZODB package to be installed. - - Attributes: - - cat: The ZCatalog instance used for indexing and searching. - - zcatalog_count: The count of indexed documents. - - """ - def indexer(self, **kwargs): - """ - Set up the indexing environment. - - This method creates the necessary directory and storage for indexing, - initializes the ZCatalog instance, and commits the changes. - - Args: - - kwargs: Additional keyword arguments. - - """ - - import transaction - from zcatalog import catalog - from ZODB.DB import DB - from ZODB.FileStorage import FileStorage + import transaction # type: ignore # type: ignore @UnresolvedImport + from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport + from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) directory = os.path.join(self.options.dir, f"{self.options.indexname}_zcatalog") if os.path.exists(directory): @@ -943,57 +368,28 @@ def indexer(self, **kwargs): self.zcatalog_count = 0 def index_document(self, d): - """ - Index a document. - - This method indexes a document by processing it with the `process_document_zcatalog` - method (if available), creating a ZDoc instance, and indexing the document using the - ZCatalog instance. It also commits the changes periodically based on the `zcatalog_count` - attribute. - - Args: - - d: The document to be indexed. - - """ - if hasattr(self.bench, "process_document_zcatalog"): self.bench.process_document_zcatalog(d) doc = ZDoc(d) self.cat.index_doc(doc) self.zcatalog_count += 1 if self.zcatalog_count >= 100: - import transaction + import transaction # type: ignore # type: ignore @UnresolvedImport transaction.commit() self.zcatalog_count = 0 def finish(self, **kwargs): - """ - Finish indexing and clean up resources. - - This method commits the changes made during indexing and cleans up resources. - - Args: - - kwargs: Additional keyword arguments. - - """ - - import transaction + import transaction # type: ignore # type: ignore @UnresolvedImport transaction.commit() del self.zcatalog_count def searcher(self): - """ - Set up the searching environment. - - This method sets up the searching environment by opening the ZODB connection, - retrieving the ZCatalog instance, and assigning it to the `cat` attribute. 
- - """ - - from ZODB.DB import DB - from ZODB.FileStorage import FileStorage + from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) path = os.path.join( self.options.dir, f"{self.options.indexname}_zcatalog", "index" @@ -1005,66 +401,16 @@ def searcher(self): self.cat = conn.root()["cat"] def query(self): - """ - Get the search query. - - This method returns the search query as a string. - - Returns: - - The search query. - - """ - return " ".join(self.args) def find(self, q): - """ - Find search results. - - This method performs a search using the ZCatalog instance and the specified query. - - Args: - - q: The search query. - - Returns: - - The search results. - - """ - return self.cat.searchResults(body=q) def findterms(self, terms): - """ - Find search results for each term. - - This method performs a search for each term in the specified list using the ZCatalog instance. - - Args: - - terms: The list of terms to search for. - - Yields: - - The search results for each term. - - """ - for term in terms: yield self.cat.searchResults(body=term) def results(self, r): - """ - Process and iterate over search results. - - This method processes and iterates over the search results, retrieving the headline and main - fields for each hit. - - Args: - - r: The search results. - - Yields: - - The processed search results. - - """ - hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: @@ -1073,17 +419,7 @@ def results(self, r): class NucularModule(Module): - """ - A module for indexing and searching documents using the Nucular library. - """ - def indexer(self, create=True): - """ - Indexes a document using the Nucular library. - - Args: - create (bool, optional): Whether to create a new index. Defaults to True. - """ import shutil from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport @@ -1099,12 +435,6 @@ def indexer(self, create=True): self.count = 0 def index_document(self, d): - """ - Indexes a document. - - Args: - d (dict): The document to be indexed. - """ try: self.archive.indexDictionary(str(self.count), d) except ValueError: @@ -1112,22 +442,17 @@ def index_document(self, d): raise self.count += 1 if not self.count % int(self.options.batch): + t = now() self.archive.store(lazy=True) self.indexer(create=False) def finish(self, **kwargs): - """ - Finishes the indexing process. - """ self.archive.store(lazy=False) self.archive.aggregateRecent(fast=False, verbose=True) self.archive.moveTransientToBase(verbose=True) self.archive.cleanUp() def searcher(self): - """ - Initializes the searcher for querying the indexed documents. - """ from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport directory = os.path.join( @@ -1136,36 +461,12 @@ def searcher(self): self.archive = Nucular.Nucular(directory) def query(self): - """ - Constructs a query string from the arguments. - - Returns: - str: The constructed query string. - """ return " ".join(self.args) def find(self, q): - """ - Finds documents matching the given query. - - Args: - q (str): The query string. - - Returns: - list: A list of dictionaries representing the matching documents. - """ return self.archive.dictionaries(q) def findterms(self, terms): - """ - Finds documents containing the given terms. - - Args: - terms (list): A list of terms to search for. - - Yields: - list: A list of dictionaries representing the matching documents for each term. 
- """ for term in terms: q = self.archive.Query() q.anyWord(term) @@ -1173,10 +474,6 @@ def findterms(self, terms): class Bench: - """ - The Bench class provides methods for indexing and searching documents using different libraries. - """ - libs = { "whoosh": WhooshModule, "xappy": XappyModule, @@ -1187,23 +484,6 @@ class Bench: } def index(self, lib): - """ - Indexes documents using the specified library. - - Args: - lib: The library to use for indexing. - - Returns: - None - - Raises: - None - - Example: - bench = Bench() - bench.index(MyLibrary()) - """ - print(f"Indexing with {lib}...") options = self.options @@ -1253,18 +533,6 @@ def index(self, lib): print(f"Indexed {count / totaltime:0.3f} docs/s") def search(self, lib): - """ - Perform a search using the given library. - - Args: - lib: The library object to use for searching. - - Returns: - None - - Raises: - None - """ lib.searcher() t = now() @@ -1278,19 +546,6 @@ def search(self, lib): print("Print time:", now() - t) def search_file(self, lib): - """ - Searches for terms in a file using the specified library. - - Args: - lib (str): The name of the library to use for searching. - - Returns: - None - - Raises: - FileNotFoundError: If the termfile specified in the options does not exist. - - """ f = open(self.options.termfile, "rb") terms = [line.strip() for line in f] f.close() @@ -1298,50 +553,12 @@ def search_file(self, lib): print(f"Searching {len(terms)} terms with {lib}") lib.searcher() starttime = now() - for _ in lib.findterms(terms): + for r in lib.findterms(terms): pass searchtime = now() - starttime print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime) def _parser(self, name): - """ - Create an OptionParser object with predefined options for command-line parsing. - - Parameters: - - name (str): The name used as a prefix for the index name. - - Returns: - - OptionParser: The OptionParser object with predefined options. - - The _parser function creates an OptionParser object and adds several options to it. - These options are used for command-line parsing in the bench.py script. - - Options: - - -x, --lib: Name of the library to use to index/search. Default is "whoosh". - - -d, --dir: Directory in which to store index. Default is the current directory. - - -s, --setup: Set up any support files or caches. Default is False. - - -i, --index: Index the documents. Default is False. - - -n, --name: Index name prefix. Default is "{name}_index". - - -U, --url: Solr URL. Default is "http://localhost:8983/solr". - - -m, --mb: Max. memory usage, in MB. Default is "128". - - -c, --chunk: Number of documents to index between progress messages. Default is 1000. - - -B, --batch: Batch size for batch adding documents. Default is 1000. - - -k, --skip: Index every Nth document. Default is 1. - - -e, --commit-every: Commit every NUM documents. Default is None. - - -M, --no-merge: Don't merge segments when doing multiple commits. Default is True. - - -u, --upto: Index up to this document number. Default is 600000. - - -p, --procs: Number of processors to use. Default is 0. - - -l, --limit: Maximum number of search results to retrieve. Default is 10. - - -b, --body: Show the body text in search results. Default is False. - - -g, --gen: Generate a list at most N terms present in all libraries. Default is None. - - -f, --file: Search using the list of terms in this file. Default is None. - - -t, --tempdir: Whoosh temp dir. Default is None. - - -P, --pool: Whoosh pool class. Default is None. 
- - -X, --xms: Experimental Whoosh feature. Default is False. - - -Z, --storebody: Store the body text in index. Default is False. - - -q, --snippets: Show highlighted snippets. Default is False. - - -O, --no-optimize: Turn off searcher optimization. Default is True. - """ p = OptionParser() p.add_option( "-x", @@ -1531,35 +748,13 @@ def _parser(self, name): return p def run(self, specclass): - """ - Runs the benchmarking process. - - Args: - specclass: The benchmark specification class. - - Raises: - ValueError: If the specified library is unknown. - - Notes: - This method parses the command line arguments, initializes the benchmark options and arguments, - creates an instance of the specified library, and executes the benchmark action based on the - command line options. - - Example: - To run the benchmark using a specific specification class: - - ``` - bench = Benchmark() - bench.run(MySpecClass) - ``` - """ parser = self._parser(specclass.name) options, args = parser.parse_args() self.options = options self.args = args if options.lib not in self.libs: - raise ValueError(f"Unknown library: {options.lib!r}") + raise Exception(f"Unknown library: {options.lib!r}") lib = self.libs[options.lib](self, options, args) self.spec = specclass(options, args) diff --git a/src/whoosh/support/bitstream.py b/src/whoosh/support/bitstream.py index c326e4ba..50984639 100644 --- a/src/whoosh/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -1,7 +1,7 @@ """ From a post by Patrick Maupin on the Python mailing list: -https://mail.python.org/pipermail/python-list/2003-November/237481.html +http://mail.python.org/pipermail/python-list/2003-November/237481.html """ from array import array @@ -13,15 +13,6 @@ class BitStreamReader: def __init__(self, source): - """ - Initializes a BitStreamReader object. - - Parameters: - - source: The source data to read from. - - The BitStreamReader reads binary data from the given source and provides methods to seek, tell, and read bits from the data. - """ - self._totalbits = len(source) * _bitsperlong self._position = 0 @@ -33,47 +24,16 @@ def __init__(self, source): self._bitstream = bits def seek(self, offset): - """ - Sets the current position in the bitstream. - - Parameters: - - offset: The new position to set. - - The offset is specified in bits from the beginning of the bitstream. - """ - self._position = offset def tell(self): - """ - Returns the current position in the bitstream. - - Returns: - - The current position in bits from the beginning of the bitstream. - """ - return self._position def read(self, numbits): - """ - Reads the specified number of bits from the bitstream. - - Parameters: - - numbits: The number of bits to read. - - Returns: - - The value of the read bits. - - Raises: - - IndexError: If the specified number of bits exceeds the available bits in the bitstream. - - The read method reads the specified number of bits from the current position in the bitstream and advances the position accordingly. 
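
For reference, the removed read() docstring above describes the usual divmod/shift/mask
technique for pulling an arbitrary bit field out of an array of words. A minimal
standalone sketch of that technique (the read_bits helper and the 32-bit word size
are illustrative assumptions, not part of this module)::

    from array import array

    def read_bits(words, position, numbits, wordsize=32):
        # Find the word holding the first requested bit and the offset within it.
        wordaddress, bitoffset = divmod(position, wordsize)
        # Concatenate every word overlapping the requested span, then shift the
        # field down and mask off any extra high bits.
        nwords = (bitoffset + numbits + wordsize - 1) // wordsize
        chunk = 0
        for i in range(nwords):
            chunk |= words[wordaddress + i] << (i * wordsize)
        return (chunk >> bitoffset) & ((1 << numbits) - 1)

    assert read_bits(array("L", [0b1011]), 1, 3) == 0b101
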
- """ - position = self._position if position < 0 or position + numbits > self._totalbits: - raise IndexError("Invalid bitarray._position/numbits") + raise (IndexError, "Invalid bitarray._position/numbits") longaddress, bitoffset = divmod(position, _bitsperlong) diff --git a/src/whoosh/support/bitvector.py b/src/whoosh/support/bitvector.py index ff3352ef..d7ef507d 100644 --- a/src/whoosh/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -6,7 +6,267 @@ from array import array #: Table of the number of '1' bits in each byte (0-255) -BYTE_COUNTS = array("B", [bin(byte).count("1") for byte in range(256)]) +BYTE_COUNTS = array( + "B", + [ + 0, + 1, + 1, + 2, + 1, + 2, + 2, + 3, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 5, + 6, + 6, + 7, + 6, + 7, + 7, + 8, + ], +) class BitVector: @@ -42,14 +302,6 @@ class BitVector: """ def __init__(self, size, source=None, bits=None): - """ - Initializes a BitVector object. - - Args: - size (int): The size of the BitVector. - source (iterable, optional): An iterable of integers representing bit positions to turn on. Defaults to None. - bits (array, optional): An array of bytes representing the bit values. Defaults to None. - """ self.size = size if bits: @@ -58,123 +310,51 @@ def __init__(self, size, source=None, bits=None): self.bits = array("B", ([0x00] * ((size >> 3) + 1))) if source: - set_var = self.set + set = self.set for num in source: - set_var(num) + set(num) self.bcount = None def __eq__(self, other): - """ - Checks if two BitVector objects are equal. - - Args: - other (BitVector): The other BitVector object to compare. - - Returns: - bool: True if the BitVector objects are equal, False otherwise. - """ if isinstance(other, BitVector): return self.bits == other.bits return False def __repr__(self): - """ - Returns a string representation of the BitVector object. - - Returns: - str: A string representation of the BitVector object. - """ return f"" def __len__(self): - """ - Returns the number of "on" bits in the BitVector. - - Returns: - int: The number of "on" bits in the BitVector. - """ + # This returns the count of "on" bits instead of the size to + # make BitVector exchangeable with a set() object. return self.count() def __contains__(self, index): - """ - Checks if a given index is present in the BitVector. - - Args: - index (int): The index to check. 
- - Returns: - bool: True if the index is present in the BitVector, False otherwise. - """ return self[index] def __iter__(self): - """ - Returns an iterator over the "on" bits in the BitVector. - - Yields: - int: The indices of the "on" bits in the BitVector. - """ get = self.__getitem__ for i in range(0, self.size): if get(i): yield i def __str__(self): - """ - Returns a string representation of the BitVector object. - - Returns: - str: A string representation of the BitVector object. - """ get = self.__getitem__ return "".join("1" if get(i) else "0" for i in range(0, self.size)) def __nonzero__(self): - """ - Checks if the BitVector has any "on" bits. - - Returns: - bool: True if the BitVector has any "on" bits, False otherwise. - """ return self.count() > 0 def __getitem__(self, index): - """ - Returns the value of the bit at the given index. - - Args: - index (int): The index of the bit to retrieve. - - Returns: - bool: True if the bit is "on", False otherwise. - """ return self.bits[index >> 3] & (1 << (index & 7)) != 0 def __setitem__(self, index, value): - """ - Sets the value of the bit at the given index. - - Args: - index (int): The index of the bit to set. - value (bool): The value to set the bit to. - """ if value: self.set(index) else: self.clear(index) def _logic(self, op, bitv): - """ - Performs a bit-wise logic operation between two BitVector objects. - - Args: - op (function): The bit-wise logic operation to perform. - bitv (BitVector): The other BitVector object to perform the operation with. - - Returns: - BitVector: The result of the bit-wise logic operation. - """ if self.size != bitv.size: raise ValueError("Can't combine bitvectors of different sizes") res = BitVector(size=self.size) @@ -183,124 +363,47 @@ def _logic(self, op, bitv): return res def union(self, other): - """ - Performs a union operation between two BitVector objects. - - Args: - other (BitVector): The other BitVector object to perform the union with. - - Returns: - BitVector: The result of the union operation. - """ return self.__or__(other) def intersection(self, other): - """ - Performs an intersection operation between two BitVector objects. - - Args: - other (BitVector): The other BitVector object to perform the intersection with. - - Returns: - BitVector: The result of the intersection operation. - """ return self.__and__(other) def __and__(self, other): - """ - Performs a bit-wise AND operation between two BitVector objects. - - Args: - other (BitVector): The other BitVector object to perform the AND operation with. - - Returns: - BitVector: The result of the bit-wise AND operation. - """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__and__, other) def __or__(self, other): - """ - Performs a bit-wise OR operation between two BitVector objects. - - Args: - other (BitVector): The other BitVector object to perform the OR operation with. - - Returns: - BitVector: The result of the bit-wise OR operation. - """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__or__, other) def __ror__(self, other): - """ - Performs a bit-wise OR operation between a BitVector object and another object. - - Args: - other (BitVector): The other object to perform the OR operation with. - - Returns: - BitVector: The result of the bit-wise OR operation. - """ return self.__or__(other) def __rand__(self, other): - """ - Performs a bit-wise AND operation between a BitVector object and another object. 
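
Even with these docstrings reverted, the surrounding code shows the BitVector
protocol: construction from a source iterable of positions, __len__ as the count
of "on" bits, membership tests, iteration over set positions, and a binary string
form. A minimal usage sketch (the size and positions are illustrative)::

    from whoosh.support.bitvector import BitVector

    bv = BitVector(10, source=[1, 3, 8])   # turn bits 1, 3 and 8 on
    print(len(bv), 3 in bv, list(bv))      # 3 True [1, 3, 8]
    print(str(bv))                         # 0101000010
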
- - Args: - other (BitVector): The other object to perform the AND operation with. - - Returns: - BitVector: The result of the bit-wise AND operation. - """ return self.__and__(other) def __xor__(self, other): - """ - Performs a bit-wise XOR operation between two BitVector objects. - - Args: - other (BitVector): The other BitVector object to perform the XOR operation with. - - Returns: - BitVector: The result of the bit-wise XOR operation. - """ if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__xor__, other) def __invert__(self): - """ - Performs a bit-wise inversion operation on the BitVector. - - Returns: - BitVector: The result of the bit-wise inversion operation. - """ return BitVector( self.size, source=(x for x in range(self.size) if x not in self) ) def count(self): - """ - Returns the number of "on" bits in the BitVector. + """Returns the number of "on" bits in the bit array.""" - Returns: - int: The number of "on" bits in the BitVector. - """ if self.bcount is None: self.bcount = sum(BYTE_COUNTS[b & 0xFF] for b in self.bits) return self.bcount def set(self, index): - """ - Turns the bit at the given position on. + """Turns the bit at the given position on.""" - Args: - index (int): The index of the bit to turn on. - """ if index >= self.size: raise IndexError( f"Position {repr(index)} greater than the size of the vector" @@ -309,33 +412,23 @@ def set(self, index): self.bcount = None def clear(self, index): - """ - Turns the bit at the given position off. + """Turns the bit at the given position off.""" - Args: - index (int): The index of the bit to turn off. - """ self.bits[index >> 3] &= ~(1 << (index & 7)) self.bcount = None def set_from(self, iterable): + """Takes an iterable of integers representing positions, and turns + on the bits at those positions. """ - Turns on the bits at the positions specified by an iterable of integers. - Args: - iterable (iterable): An iterable of integers representing positions. - """ - set_var = self.set + set = self.set for index in iterable: - set_var(index) + set(index) def copy(self): - """ - Returns a copy of the BitVector. + """Returns a copy of this BitArray.""" - Returns: - BitVector: A copy of the BitVector. - """ return BitVector(self.size, bits=self.bits) diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py index 8da99a40..7334d853 100644 --- a/src/whoosh/support/charset.py +++ b/src/whoosh/support/charset.py @@ -1303,11 +1303,11 @@ def charset_table_to_dict(tablestring): character or None if the character is not a valid word character. The Sphinx charset table format is described at - https://www.sphinxsearch.com/docs/current.html#conf-charset-table. + http://www.sphinxsearch.com/docs/current.html#conf-charset-table. 
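
The resulting dict maps character ordinals to replacement characters (or None for
non-word characters), so it can be passed to str.translate() or wrapped in
analysis.CharsetFilter, as the analysis tests later in this patch do. A minimal
sketch using the packaged default_charset table (the sample text is illustrative)::

    from whoosh import analysis
    from whoosh.support.charset import charset_table_to_dict, default_charset

    charmap = charset_table_to_dict(default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    print([t.text for t in ana("Café RÉSUMÉ")])  # accent-folded, lowercased tokens
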
""" - # map_dict = {} - map_dict = defaultdict(lambda: None) + # map = {} + map = defaultdict(lambda: None) for line in tablestring.split("\n"): if not line or line.startswith("#"): continue @@ -1326,7 +1326,7 @@ def charset_table_to_dict(tablestring): for fromord, tooord in zip( range(start1, end1 + 1), range(start2, end2 + 1) ): - map_dict[fromord] = chr(tooord) + map[fromord] = chr(tooord) except ValueError: pass continue @@ -1336,16 +1336,16 @@ def charset_table_to_dict(tablestring): fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) try: - map_dict[fromord] = chr(toord) + map[fromord] = chr(toord) except ValueError: pass continue match = _stray_char.match(item) if match: - ord_charspec = charspec_to_int(match.group(0)) + ord = charspec_to_int(match.group(0)) try: - map_dict[ord_charspec] = chr(ord_charspec) + map[ord] = chr(ord) except ValueError: pass continue @@ -1355,8 +1355,8 @@ def charset_table_to_dict(tablestring): start = charspec_to_int(match.group(1)) end = charspec_to_int(match.group(2)) try: - for ord_charspec in range(start, end + 1): - map_dict[ord_charspec] = chr(ord_charspec) + for ord in range(start, end + 1): + map[ord] = chr(ord) except ValueError: pass continue @@ -1366,13 +1366,13 @@ def charset_table_to_dict(tablestring): fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) assert toord - fromord % 2 == 0 - for ord_charspec in range(fromord, toord + 1, 2): + for ord in range(fromord, toord + 1, 2): try: - map_dict[ord_charspec] = chr(ord_charspec + 1) - map_dict[ord_charspec + 1] = chr(ord_charspec + 1) + map[ord] = chr(ord + 1) + map[ord + 1] = chr(ord + 1) except ValueError: pass continue - raise ValueError(f"Don't know what to do with {item}") - return dict(map_dict) + raise Exception(f"Don't know what to do with {item!r}") + return dict(map) diff --git a/src/whoosh/support/pyparsing.py b/src/whoosh/support/pyparsing.py index 04711766..ea133368 100644 --- a/src/whoosh/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -810,13 +810,9 @@ def set_default_whitespace_chars(chars): set_default_whitespace_chars = staticmethod(set_default_whitespace_chars) def __init__(self, savelist=False): - """Initialize the ParserElement. - - Args: - savelist (bool, optional): Whether to save the results as a list. Defaults to False. - """ self.parse_action = [] self.fail_action = None + # ~ self.name = "" # don't define self.name, let subclasses try/except upcall self.str_repr = None self.results_name = None self.saveas_list = savelist @@ -837,11 +833,8 @@ def __init__(self, savelist=False): self.call_during_try = False def copy(self): - """Make a copy of this ParserElement. - - Returns: - ParserElement: A copy of the original ParserElement. - """ + """Make a copy of this ParserElement. Useful for defining different parse actions + for the same parsing pattern, using copies of the original parse element.""" cpy = copy.copy(self) cpy.parse_action = self.parse_action[:] cpy.ignore_exprs = self.ignore_exprs[:] @@ -850,14 +843,7 @@ def copy(self): return cpy def set_name(self, name): - """Define name for this expression, for use in debugging. - - Args: - name (str): The name of the expression. - - Returns: - ParserElement: The ParserElement object. 
- """ + """Define name for this expression, for use in debugging.""" self.name = name self.errmsg = "Expected " + self.name if hasattr(self, "exception"): @@ -867,13 +853,9 @@ def set_name(self, name): def set_results_name(self, name, list_all_matches=False): """Define name for referencing matching tokens as a nested attribute of the returned parse results. - - Args: - name (str): The name of the results. - list_all_matches (bool, optional): Whether to list all matches. Defaults to False. - - Returns: - ParserElement: A copy of the original ParserElement with the results name set. + NOTE: this returns a *copy* of the original ParserElement object; + this is so that the client can define a basic element, such as an + integer, and reference it in multiple places with different names. """ newself = self.copy() newself.results_name = name @@ -882,13 +864,8 @@ def set_results_name(self, name, list_all_matches=False): def set_break(self, break_flag=True): """Method to invoke the Python pdb debugger when this element is - about to be parsed. - - Args: - break_flag (bool, optional): Whether to enable the debugger. Defaults to True. - - Returns: - ParserElement: The ParserElement object. + about to be parsed. Set break_flag to True to enable, False to + disable. """ if break_flag: _parse_method = self._parse @@ -908,14 +885,7 @@ def breaker(instring, loc, do_actions=True, call_pre_parse=True): def _normalize_parse_action_args(f): """Internal method used to decorate parse actions that take fewer than 3 arguments, - so that all parse actions can be called as f(s,l,t). - - Args: - f (callable): The parse action function. - - Returns: - callable: The normalized parse action function. - """ + so that all parse actions can be called as f(s,l,t).""" STAR_ARGS = 4 try: @@ -924,6 +894,8 @@ def _normalize_parse_action_args(f): restore = f f = f.__init__ + # codeObj = f.code + if f.code.co_flags & STAR_ARGS: return f numargs = f.code.co_argcount @@ -934,6 +906,8 @@ def _normalize_parse_action_args(f): f = restore except AttributeError: try: + # call_im_func_code = f.__code__ + # not a function, must be a callable object, get info from the # im_func binding of its bound __call__ method if f.__code__.co_flags & STAR_ARGS: @@ -943,6 +917,8 @@ def _normalize_parse_action_args(f): if hasattr(f.__call__, "__self__"): numargs -= 0 except AttributeError: + # call_func_code = f.__call__.__code__ + # not a bound method, get info directly from __call__ method if f.__call__.__code__.co_flags & STAR_ARGS: return f @@ -970,7 +946,7 @@ def tmp(_, l, t): def tmp(_, __, t): return f(t) - else: + else: # ~ numargs == 0: def tmp(_, __, ___): return f() @@ -1016,15 +992,7 @@ def set_parse_action(self, *fns, **kwargs): return self def add_parse_action(self, *fns, **kwargs): - """Add parse action to expression's list of parse actions. - - Args: - *fns (callable): The parse action functions. - **kwargs: Additional keyword arguments. - - Returns: - ParserElement: The ParserElement object. - """ + """Add parse action to expression's list of parse actions. See L{I{set_parse_action}}.""" self.parse_action += list(map(self._normalize_parse_action_args, list(fns))) self.call_during_try = self.call_during_try or ( "call_during_try" in kwargs and kwargs["call_during_try"] @@ -1045,15 +1013,6 @@ def set_fail_action(self, fn): return self def _skip_ignorables(self, instring, loc): - """Skip over ignored expressions. - - Args: - instring (str): The input string. - loc (int): The current location in the string. 
- - Returns: - int: The updated location. - """ exprs_found = True while exprs_found: exprs_found = False @@ -1067,15 +1026,6 @@ def _skip_ignorables(self, instring, loc): return loc def pre_parse(self, instring, loc): - """Perform pre-parsing operations. - - Args: - instring (str): The input string. - loc (int): The current location in the string. - - Returns: - int: The updated location. - """ if self.ignore_exprs: loc = self._skip_ignorables(instring, loc) @@ -1088,43 +1038,13 @@ def pre_parse(self, instring, loc): return loc def parse_impl(self, instring, loc, do_actions=True): - """Implementation of the parsing logic. - - Args: - instring (str): The input string. - loc (int): The current location in the string. - do_actions (bool, optional): Whether to perform parse actions. Defaults to True. - - Returns: - tuple: The updated location and the list of matched tokens. - """ return loc, [] def post_parse(self, instring, loc, tokenlist): - """Perform post-parsing operations. - - Args: - instring (str): The input string. - loc (int): The current location in the string. - tokenlist (list): The list of matched tokens. - - Returns: - list: The updated list of tokens. - """ return tokenlist # ~ @profile def _parse_no_cache(self, instring, loc, do_actions=True, call_pre_parse=True): - """Parse the input string without using the cache. - - Args: - instring (str): The input string. - loc (int): The current location in the string. - do_actions (bool, optional): Whether to perform parse actions. Defaults to True. - call_pre_parse (bool, optional): Whether to call the pre_parse method. Defaults to True. - """ - # Implementation details omitted for brevity - pass debugging = self.debug # and do_actions ) if debugging or self.fail_action: @@ -1535,14 +1455,7 @@ def __xor__(self, other): return Or([self, other]) def __rxor__(self, other): - """Implementation of ^ operator when left operand is not a ParserElement - - Args: - other (str or ParserElement): The right operand of the ^ operator. - - Returns: - ParserElement: The result of the ^ operation. - """ + """Implementation of ^ operator when left operand is not a ParserElement""" if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1555,14 +1468,7 @@ def __rxor__(self, other): return other ^ self def __and__(self, other): - """Implementation of & operator - returns Each - - Args: - other (str or ParserElement): The element to combine with. - - Returns: - Each: A new `Each` object containing both `self` and `other`. - """ + """Implementation of & operator - returns Each""" if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1575,14 +1481,7 @@ def __and__(self, other): return Each([self, other]) def __rand__(self, other): - """Implementation of & operator when left operand is not a ParserElement - - Args: - other (str or ParserElement): The left operand of the & operator. - - Returns: - ParserElement: The result of combining the left operand with self using the & operator. - """ + """Implementation of & operator when left operand is not a ParserElement""" if isinstance(other, str): other = Literal(other) if not isinstance(other, ParserElement): @@ -1595,84 +1494,49 @@ def __rand__(self, other): return other & self def __invert__(self): - """Implementation of ~ operator - returns NotAny - - Returns: - NotAny: A new instance of the NotAny class. 
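
These operator overloads are what let grammars be written as plain expressions:
"+" builds an And, "^" an Or (the longest alternative wins), "&" an Each and "~" a
NotAny look-ahead. A small hedged sketch against this bundled module (the grammar
itself is illustrative)::

    from whoosh.support.pyparsing import Literal, Word, alphas

    reserved = Literal("and") ^ Literal("android")   # Or: prefers the longer match
    name = ~reserved + Word(alphas)                  # NotAny: refuse reserved words
    print(name.parse_string("widget"))               # parses; "android" would raise
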
- """ + """Implementation of ~ operator - returns NotAny""" return NotAny(self) def __call__(self, name): """Shortcut for set_results_name, with list_all_matches=default:: - userdata = Word(alphas).set_results_name("name") + Word(nums+"-").set_results_name("socsecno") + userdata = Word(alphas).set_results_name("name") + Word(nums+"-").set_results_name("socsecno") could be written as:: - userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") - - Args: - name (str): The name to assign to the parsed results. - - Returns: - pyparsing.ParseResults: The modified pyparsing object with the specified name assigned to it. + userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") """ return self.set_results_name(name) def suppress(self): """Suppresses the output of this ParserElement; useful to keep punctuation from cluttering up returned output. - - Returns: - Suppress: A new ParserElement that suppresses the output of the original ParserElement. """ return Suppress(self) def leave_whitespace(self): - """ - Disables the skipping of whitespace before matching the characters in the - ParserElement's defined pattern. This is normally only used internally by + """Disables the skipping of whitespace before matching the characters in the + ParserElement's defined pattern. This is normally only used internally by the pyparsing module, but may be needed in some whitespace-sensitive grammars. - - Returns: - ParserElement: The ParserElement object with whitespace skipping disabled. """ self.skip_whitespace = False return self def set_whitespace_chars(self, chars): - """ - Overrides the default whitespace chars. - - Args: - chars (str): The characters to be considered as whitespace. - - Returns: - self: The current instance of the class. - """ + """Overrides the default whitespace chars""" self.skip_whitespace = True self.white_chars = chars self.copy_default_white_chars = False return self def parse_with_tabs(self): - """ - Overrides default behavior to expand s to spaces before parsing the input string. + """Overrides default behavior to expand s to spaces before parsing the input string. Must be called before parse_string when the input grammar contains elements that - match characters. - - Returns: - self: The current instance of the class. - """ + match characters.""" self.keep_tabs = True return self def ignore(self, other): - """ - Define expression to be ignored (e.g., comments) while doing pattern matching. - - Parameters: - other (str or pyparsing.ParserElement): The expression to be ignored. - - Returns: - pyparsing.ParserElement: The current instance of the ParserElement. + """Define expression to be ignored (e.g., comments) while doing pattern + matching; may be called repeatedly, to define multiple comment or other + ignorable patterns. """ if isinstance(other, Suppress): if other not in self.ignore_exprs: @@ -1682,18 +1546,7 @@ def ignore(self, other): return self def set_debug_actions(self, start_action, success_action, exception_action): - """ - Enable display of debugging messages while doing pattern matching. - - Args: - start_action (callable): The action to perform when pattern matching starts. - success_action (callable): The action to perform when pattern matching succeeds. - exception_action (callable): The action to perform when an exception occurs during pattern matching. - - Returns: - self: The current instance of the class. 
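
set_name() and these debug hooks work together: naming an element makes the trace
output readable, and set_debug() below installs the default start/success/failure
actions. A minimal sketch (the grammar is illustrative)::

    from whoosh.support.pyparsing import Word, nums

    integer = Word(nums)
    integer.set_name("integer")
    integer.set_debug()          # installs the default debug actions
    integer.parse_string("42")   # emits trace output for this parse attempt
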
- - """ + """Enable display of debugging messages while doing pattern matching.""" self.debug_actions = ( start_action or _default_start_debug_action, success_action or _default_success_debug_action, @@ -1703,14 +1556,8 @@ def set_debug_actions(self, start_action, success_action, exception_action): return self def set_debug(self, flag=True): - """Enable or disable display of debugging messages while doing pattern matching. - - Args: - flag (bool, optional): Set to True to enable debugging messages, False to disable. Defaults to True. - - Returns: - self: The current instance of the class. - """ + """Enable display of debugging messages while doing pattern matching. + Set flag to True to enable, False to disable.""" if flag: self.set_debug_actions( _default_start_debug_action, @@ -1728,12 +1575,6 @@ def __repr__(self): return str(self) def streamline(self): - """ - Streamlines the object by marking it as streamlined and resetting the string representation. - - Returns: - The streamlined object. - """ self.streamlined = True self.str_repr = None return self diff --git a/src/whoosh/util/__init__.py b/src/whoosh/util/__init__.py index e47d91b7..81593209 100644 --- a/src/whoosh/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -45,32 +45,10 @@ def random_name(size=28): - """ - Generates a random name consisting of alphanumeric characters. - - Parameters: - - size (int): The length of the random name to generate. Default is 28. - - Returns: - - str: The randomly generated name. - """ return "".join(random.choice(IDCHARS) for _ in range(size)) def random_bytes(size=28): - """ - Generate a random byte string of the specified size. - - Parameters: - - size (int): The size of the byte string to generate. Default is 28. - - Returns: - - bytes: A random byte string of the specified size. - - Example: - >>> random_bytes(16) - b'\x8f\x9a\x0b\x1e\x9c\x8d\x8c\x9e\x1f\x9d\x9e\x0e\x1e\x9e\x1e\x9e' - """ return bytes(random.randint(0, 255) for _ in range(size)) @@ -78,38 +56,13 @@ def make_binary_tree(fn, args, **kwargs): """Takes a function/class that takes two positional arguments and a list of arguments and returns a binary tree of results/instances. - Args: - fn (callable): A function or class that takes two positional arguments. - args (list): A list of arguments to be used to construct the binary tree. - - Keyword Args: - **kwargs: Additional keyword arguments to be passed to the class initializer. - - Returns: - object: The binary tree of results/instances. - - Raises: - ValueError: If called with an empty list. - - Examples: - >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) - UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) - - This function takes a function or class `fn` that takes two positional arguments, - and a list of arguments `args`. It constructs a binary tree of results/instances - by recursively splitting the `args` list into two halves and calling `fn` with - the left and right halves as arguments. - - If the `args` list contains only one element, that element is returned as is. - - Any additional keyword arguments given to this function are passed to the class - initializer of `fn`. - - Note: - The `fn` should be a function or class that can be called with two positional - arguments and returns a result/instance. + >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) + UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) + Any keyword arguments given to this function are passed to the class + initializer. 
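
The weighted variant below, make_weighted_tree(), follows the same idea but takes
(weight, argument) pairs and combines the cheapest entries first, Huffman-style. A
hedged sketch of calling it (the combining function and weights are illustrative,
and the exact nesting depends on the weights)::

    from whoosh.util import make_weighted_tree

    tree = make_weighted_tree(lambda a, b: f"({a} {b})",
                              [(1, "a"), (2, "b"), (4, "c")])
    # "a" and "b" are merged first because they are cheapest; the result is then
    # merged with "c", giving something like "((a b) c)".
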
""" + count = len(args) if not count: raise ValueError("Called make_binary_tree with empty list") @@ -125,30 +78,11 @@ def make_binary_tree(fn, args, **kwargs): def make_weighted_tree(fn, ls, **kwargs): - """ - Takes a function/class that takes two positional arguments and a list of + """Takes a function/class that takes two positional arguments and a list of (weight, argument) tuples and returns a huffman-like weighted tree of results/instances. - - Args: - fn (function/class): The function or class that takes two positional arguments. - ls (list): A list of (weight, argument) tuples. - **kwargs: Additional keyword arguments that can be passed to the function/class. - - Returns: - object: The huffman-like weighted tree of results/instances. - - Raises: - ValueError: If the input list is empty. - - Example: - >>> def combine(a, b): - ... return a + b - ... - >>> ls = [(1, 'a'), (2, 'b'), (3, 'c')] - >>> make_weighted_tree(combine, ls) - 'abc' """ + if not ls: raise ValueError("Called make_weighted_tree with empty list") @@ -166,19 +100,8 @@ def make_weighted_tree(fn, ls, **kwargs): def fib(n): - """ - Returns the nth value in the Fibonacci sequence. - - Parameters: - - n (int): The position of the value in the Fibonacci sequence to be returned. - - Returns: - - int: The nth value in the Fibonacci sequence. + """Returns the nth value in the Fibonacci sequence.""" - Notes: - - The Fibonacci sequence starts with 0 and 1, and each subsequent value is the sum of the two preceding values. - - The function uses memoization to improve performance by caching previously calculated values. - """ if n <= 2: return n if n in _fib_cache: @@ -194,23 +117,6 @@ def fib(n): def synchronized(func): """Decorator for storage-access methods, which synchronizes on a threading lock. The parent object must have 'is_closed' and '_sync_lock' attributes. - - Args: - func (callable): The function to be decorated. - - Returns: - callable: The decorated function. - - Example: - >>> class MyClass: - ... def __init__(self): - ... self._sync_lock = threading.Lock() - ... - ... @synchronized - ... def my_method(self): - ... # Access shared storage here - ... pass - """ @wraps(func) @@ -224,23 +130,6 @@ def synchronized_wrapper(self, *args, **kwargs): def unclosed(method): """ Decorator to check if the object is closed. - - This decorator can be used to wrap methods in a class to ensure that the object is not closed before executing the method. - If the object is closed, a ValueError is raised. - - Parameters: - - method: The method to be wrapped. - - Returns: - - The wrapped method. - - Example usage: - ``` - class MyClass: - @unclosed - def my_method(self): - # Method implementation - ``` """ @wraps(method) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 36d23c0e..2f643834 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -432,8 +432,8 @@ def test_url(): sample = "Visit https://github.com/sygil-dev/whoosh-reloaded or urn:isbn:5930502 or http://www.apple.com/." 
anas = [ - analysis.simple_analyzer(analysis.url_pattern), - analysis.standard_analyzer(analysis.url_pattern, stoplist=None), + analysis.SimpleAnalyzer(analysis.url_pattern), + analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None), ] for ana in anas: ts = [t.text for t in ana(sample)] @@ -543,14 +543,14 @@ def test_language_analyzer(): ] for lang, source, target in domain: - ana = analysis.language_analyzer(lang) + ana = analysis.LanguageAnalyzer(lang) words = [t.text for t in ana(source)] assert words == target @pytest.mark.skipif("sys.version_info < (2,6)") def test_la_pickleability(): - ana = analysis.language_analyzer("en") + ana = analysis.LanguageAnalyzer("en") _ = dumps(ana, -1) @@ -558,7 +558,7 @@ def test_charset_pickeability(): from whoosh.support import charset charmap = charset.charset_table_to_dict(charset.default_charset) - ana = analysis.standard_analyzer() | analysis.CharsetFilter(charmap) + ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap) _ = dumps(ana, -1) ana = analysis.CharsetTokenizer(charmap) @@ -638,7 +638,7 @@ def test_stop_lang(): def test_issue358(): t = analysis.RegexTokenizer(r"\w+") with pytest.raises(analysis.CompositionError): - _ = t | analysis.standard_analyzer() + _ = t | analysis.StandardAnalyzer() def test_ngramwords_tokenizer(): diff --git a/tests/test_classify.py b/tests/test_classify.py index c3a51a52..d2a43c8d 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -16,7 +16,7 @@ def create_index(): - analyzer = analysis.standard_analyzer() + analyzer = analysis.StandardAnalyzer() vector_format = formats.Frequency() schema = fields.Schema( path=fields.ID(stored=True), @@ -94,7 +94,7 @@ def _check(schema, **kwargs): schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True)) _check(schema) - ana = analysis.standard_analyzer() + ana = analysis.StandardAnalyzer() schema = fields.Schema( id=fields.ID(stored=True), text=fields.TEXT(analyzer=ana, vector=formats.Frequency()), diff --git a/tests/test_codecs.py b/tests/test_codecs.py index f8d9e5ec..b757bd0b 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -508,9 +508,9 @@ def test_skip(): # # # def test_special_spelled_field(): -# from whoosh.analysis import stemming_analyzer +# from whoosh.analysis import StemmingAnalyzer # -# field = fields.TEXT(analyzer=stemming_analyzer(), spelling=True) +# field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True) # st, codec, seg = _make_codec() # # fw = codec.field_writer(st, seg) @@ -537,7 +537,7 @@ def test_skip(): def test_plaintext_codec(): pytest.importorskip("ast") - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( a=fields.TEXT(vector=True, sortable=True), b=fields.STORED, @@ -612,7 +612,7 @@ def test_plaintext_codec(): def test_memory_codec(): - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( a=fields.TEXT(vector=True), b=fields.STORED, diff --git a/tests/test_fields.py b/tests/test_fields.py index 3b9a39e6..aba2e903 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -648,9 +648,7 @@ def test_pickle_schema(): from whoosh import analysis from whoosh.support.charset import accent_map - freetext_analyzer = analysis.stemming_analyzer() | analysis.CharsetFilter( - accent_map - ) + freetext_analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map) schema = fields.Schema( path=fields.ID(stored=True, unique=True), diff --git a/tests/test_highlighting.py 
b/tests/test_highlighting.py index 89fb873e..eee21cd0 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -14,7 +14,7 @@ def u(s): def test_null_fragment(): terms = frozenset(("bravo", "india")) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() nf = highlight.WholeFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, nf, uc) @@ -89,7 +89,7 @@ def test_sentence_fragment(): + "This sentence is the second. Third sentence here." ) terms = ("sentence",) - sa = analysis.standard_analyzer(stoplist=None) + sa = analysis.StandardAnalyzer(stoplist=None) sf = highlight.SentenceFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(text, terms, sa, sf, uc) @@ -101,7 +101,7 @@ def test_sentence_fragment(): def test_context_fragment(): terms = frozenset(("bravo", "india")) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) @@ -110,7 +110,7 @@ def test_context_fragment(): def test_context_at_start(): terms = frozenset(["alfa"]) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=15) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) @@ -119,7 +119,7 @@ def test_context_at_start(): def test_html_format(): terms = frozenset(("bravo", "india")) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter() htext = highlight.highlight(_doc, terms, sa, cf, hf) @@ -131,7 +131,7 @@ def test_html_format(): def test_html_escape(): terms = frozenset(["bravo"]) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() wf = highlight.WholeFragmenter() hf = highlight.HtmlFormatter() htext = highlight.highlight('alfa delta', terms, sa, wf, hf) @@ -143,7 +143,7 @@ def test_html_escape(): def test_maxclasses(): terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo")) - sa = analysis.standard_analyzer() + sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2) htext = highlight.highlight(_doc, terms, sa, cf, hf) @@ -325,7 +325,7 @@ def test_highlight_ngrams(): def test_issue324(): - sa = analysis.stemming_analyzer() + sa = analysis.StemmingAnalyzer() result = highlight.highlight( "Indexed!\n1", ["index"], diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 1dc6910a..363c8a17 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -608,7 +608,7 @@ def test_indentical_fields(): def test_multivalue(): - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( id=fields.STORED, date=fields.DATETIME, @@ -636,7 +636,7 @@ def test_multivalue(): def test_multi_language(): # Analyzer for English - ana_eng = analysis.stemming_analyzer() + ana_eng = analysis.StemmingAnalyzer() # analyzer for Pig Latin def stem_piglatin(w): @@ -644,9 +644,7 @@ def stem_piglatin(w): w = w[:-2] return w - ana_pig = analysis.stemming_analyzer( - stoplist=["nday", "roay"], stemfn=stem_piglatin - ) + ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"], stemfn=stem_piglatin) # Dictionary mapping languages to analyzers analyzers = {"eng": ana_eng, "pig": ana_pig} diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py 
index d8ee7ed6..d5b398c5 100644 --- a/tests/test_parse_plugins.py +++ b/tests/test_parse_plugins.py @@ -190,7 +190,7 @@ def test_daterange_empty_field(): def test_free_dates(): - a = analysis.standard_analyzer(stoplist=None) + a = analysis.StandardAnalyzer(stoplist=None) schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME) qp = qparser.QueryParser("text", schema) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) @@ -332,7 +332,7 @@ def test_copyfield(): str(qp.parse("hello c:matt")) == "((a:hello OR c:hello) AND (c:matt OR a:matt))" ) - ana = analysis.regex_analyzer(r"\w+") | analysis.DoubleMetaphoneFilter() + ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter() fmt = formats.Frequency() ft = fields.FieldType(fmt, ana, multitoken_query="or") schema = fields.Schema(name=fields.KEYWORD, name_phone=ft) @@ -434,7 +434,7 @@ def rev_text(node): def test_fuzzy_plugin(): - ana = analysis.standard_analyzer("\\S+") + ana = analysis.StandardAnalyzer("\\S+") schema = fields.Schema(f=fields.TEXT(analyzer=ana)) qp = default.QueryParser("f", schema) qp.add_plugin(plugins.FuzzyTermPlugin()) @@ -665,7 +665,7 @@ def test_sequence_andmaybe(): def test_sequence_complex(): - ana = analysis.standard_analyzer(stoplist=None) + ana = analysis.StandardAnalyzer(stoplist=None) schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), diff --git a/tests/test_parsing.py b/tests/test_parsing.py index de0c9028..dd1d22b1 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -725,7 +725,7 @@ def test_numrange_multi(): def test_nonexistant_fieldnames(): # Need an analyzer that won't mangle a URL - a = analysis.simple_analyzer("\\S+") + a = analysis.SimpleAnalyzer("\\S+") schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a)) qp = default.QueryParser("text", schema) @@ -747,7 +747,7 @@ def test_stopped(): def test_analyzing_terms(): - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana)) qp = default.QueryParser("text", schema) q = qp.parse("Indexed!") @@ -996,7 +996,7 @@ def test_star_paren(): def test_dash(): - ana = analysis.standard_analyzer("[^ \t\r\n()*?]+") + ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+") schema = fields.Schema( title=fields.TEXT(analyzer=ana), text=fields.TEXT(analyzer=ana), time=fields.ID ) @@ -1013,7 +1013,7 @@ def test_dash(): assert str(q) == "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)" -def test_bool_true(): +def test_bool_True(): schema = fields.Schema(text=fields.TEXT, bool=fields.BOOLEAN) qp = default.QueryParser("text", schema) q = qp.parse("bool:True") diff --git a/tests/test_postings.py b/tests/test_postings.py index 29478d5f..8d87c97b 100644 --- a/tests/test_postings.py +++ b/tests/test_postings.py @@ -15,7 +15,7 @@ def _roundtrip(content, format_, astype, ana=None): with TempStorage("roundtrip") as st: codec = default_codec() seg = codec.new_segment(st, "") - ana = ana or analysis.standard_analyzer() + ana = ana or analysis.StandardAnalyzer() field = fields.FieldType(format=format_, analyzer=ana) fw = codec.field_writer(st, seg) diff --git a/tests/test_results.py b/tests/test_results.py index 1d0b0f7e..6a586fe1 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -389,7 +389,7 @@ def test_highlight_setters(): def test_snippets(): - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(stored=True, analyzer=ana)) 
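
These test hunks all revert the snake_case factory names back to the CamelCase
analyzer constructors. A minimal sketch of the call the tests now make (the input
string is illustrative)::

    from whoosh import analysis

    ana = analysis.StemmingAnalyzer()
    print([t.text for t in ana("Rendering shaded models")])
    # lowercased, stemmed tokens such as ['render', 'shade', 'model']
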
ix = RamStorage().create_index(schema) w = ix.writer() @@ -437,7 +437,7 @@ def test_snippets(): def test_keyterms(): - ana = analysis.standard_analyzer() + ana = analysis.StandardAnalyzer() vectorformat = formats.Frequency() schema = fields.Schema( path=fields.ID, content=fields.TEXT(analyzer=ana, vector=vectorformat) diff --git a/tests/test_searching.py b/tests/test_searching.py index 9cb8eac4..4caaf95b 100644 --- a/tests/test_searching.py +++ b/tests/test_searching.py @@ -635,7 +635,7 @@ def test_stop_phrase(): def test_phrase_order(): - tfield = fields.TEXT(stored=True, analyzer=analysis.simple_analyzer()) + tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()) schema = fields.Schema(text=tfield) storage = RamStorage() ix = storage.create_index(schema) @@ -1277,7 +1277,7 @@ def test_scorer(): def test_pos_scorer(): - ana = analysis.simple_analyzer() + ana = analysis.SimpleAnalyzer() schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana)) ix = RamStorage().create_index(schema) w = ix.writer() diff --git a/tests/test_spans.py b/tests/test_spans.py index 341000de..01c78731 100644 --- a/tests/test_spans.py +++ b/tests/test_spans.py @@ -16,7 +16,7 @@ def get_index(): return _ix charfield = fields.FieldType( - formats.Characters(), analysis.simple_analyzer(), scorable=True, stored=True + formats.Characters(), analysis.SimpleAnalyzer(), scorable=True, stored=True ) schema = fields.Schema(text=charfield) st = RamStorage() @@ -93,10 +93,10 @@ def test_span_term(): ids = set() while m.is_active(): - matcher_id = m.id() + id = m.id() sps = m.spans() - ids.add(matcher_id) - original = list(s.stored_fields(matcher_id)["text"]) + ids.add(id) + original = list(s.stored_fields(id)["text"]) assert word in original if word != "bravo": @@ -208,7 +208,7 @@ def test_near_unordered(): def test_span_near_tree(): - ana = analysis.simple_analyzer() + ana = analysis.SimpleAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) st = RamStorage() ix = st.create_index(schema) diff --git a/tests/test_spelling.py b/tests/test_spelling.py index 6c67cb28..3f773785 100644 --- a/tests/test_spelling.py +++ b/tests/test_spelling.py @@ -186,7 +186,7 @@ def test_correct_query(): def test_spelling_field(): text = "rendering shading modeling reactions" - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) assert schema["text"].spelling @@ -214,7 +214,7 @@ def test_spelling_field(): def test_correct_spell_field(): - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) with TempIndex(schema) as ix: with ix.writer() as w: @@ -328,7 +328,7 @@ def test_very_long_words(): strings1 = [u(chr(i) * length) for i in range(65, 70)] strings2 = [u(chr(i) * length) for i in range(71, 75)] - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( text=fields.TEXT( analyzer=ana, diff --git a/tests/test_writing.py b/tests/test_writing.py index 05dd5f21..0014c98b 100644 --- a/tests/test_writing.py +++ b/tests/test_writing.py @@ -395,7 +395,7 @@ def test_add_reader_spelling(): # Test whether add_spell_word() items get copied over in a merge # Because b is stemming and spelled, it will use add_spell_word() - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( a=fields.TEXT(analyzer=ana), b=fields.TEXT(analyzer=ana, spelling=True) ) @@ 
-455,7 +455,7 @@ def test_clear(): def test_spellable_list(): # Make sure a spellable field works with a list of pre-analyzed tokens - ana = analysis.stemming_analyzer() + ana = analysis.StemmingAnalyzer() schema = fields.Schema( Location=fields.STORED, Lang=fields.STORED,