diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..f6f4b158
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,20 @@
+[run]
+branch = True
+omit =
+    # Autogenerated missed code handles other VCSes.
+    devito/_version.py
+    examples/*__init__*
+concurrency = multiprocessing
+parallel = True
+
+[report]
+# Regexes for lines to exclude from consideration
+exclude_lines =
+    # Don't complain about missing debug-only code:
+    def __repr__
+
+    # Don't complain if tests don't hit defensive assertion code:
+    raise NotImplementedError
+    raise ValueError
+    raise TypeError
+    raise RuntimeError
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5f5d4eba..4f015c93 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,6 +23,6 @@ jobs:
       - name: Run test
         run: pytest
       - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v3
+        uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 36187487..a36dd250 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ eggs/
 .eggs/
 *.egg
 *.egg-info/
+/test.py
diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py
index 7292305e..ef9912a1 100644
--- a/benchmark/dictionary.py
+++ b/benchmark/dictionary.py
@@ -1,7 +1,7 @@
 import os.path, gzip
 
-from whoosh_reloaded import analysis, fields
-from whoosh_reloaded.support.bench import Bench, Spec
+from whoosh import analysis, fields
+from whoosh.support.bench import Bench, Spec
 
 
 class VulgarTongue(Spec):
@@ -35,7 +35,7 @@ def whoosh_schema(self):
         return schema
 
     def zcatalog_setup(self, cat):
-        from zcatalog import indexes  # @UnresolvedImport
+        from zcatalog import indexes  # type: ignore @UnresolvedImport
 
         cat["head"] = indexes.FieldIndex(field_name="head")
         cat["body"] = indexes.TextIndex(field_name="body")
diff --git a/benchmark/enron.py b/benchmark/enron.py
index 92292c59..a82c2f26 100644
--- a/benchmark/enron.py
+++ b/benchmark/enron.py
@@ -5,14 +5,14 @@
 from zlib import compress, decompress
 
 try:
-    import xappy
+    import xappy  # type: ignore
 except ImportError:
     pass
 
-from whoosh_reloaded import analysis, fields
-from whoosh_reloaded.compat import urlretrieve, next
-from whoosh_reloaded.support.bench import Bench, Spec
-from whoosh_reloaded.util import now
+from whoosh import analysis, fields
+from whoosh.compat import urlretrieve, next
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
 
 
 # Benchmark class
@@ -165,7 +165,7 @@ def xappy_indexer_connection(self, path):
         return conn
 
     def zcatalog_setup(self, cat):
-        from zcatalog import indexes
+        from zcatalog import indexes  # type: ignore
 
         for name in ("date", "frm"):
             cat[name] = indexes.FieldIndex(field_name=name)
diff --git a/benchmark/marc21.py b/benchmark/marc21.py
index 587daf4f..c3c189aa 100644
--- a/benchmark/marc21.py
+++ b/benchmark/marc21.py
@@ -1,9 +1,8 @@
 from __future__ import with_statement, print_function
 import fnmatch, logging, os.path, re
 
-from whoosh_reloaded import analysis, fields, index, qparser, query, scoring
-from whoosh_reloaded.compat import xrange
-from whoosh_reloaded.util import now
+from whoosh import analysis, fields, index, qparser, query, scoring
+from whoosh.util import now
 
 
 log = logging.getLogger(__name__)
@@ -59,7 +59,7 @@ def parse_record(data, tags=None):
     field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN
 
     result = {}
-    for i in xrange(field_count):
+    for i in range(field_count):
         start = dirstart + i * DIRECTORY_ENTRY_LEN
         end = start + DIRECTORY_ENTRY_LEN
         tag = data[start : start + 3]
diff --git a/benchmark/reuters.py b/benchmark/reuters.py
index 69e818ca..0aaa3276 100644
--- a/benchmark/reuters.py
+++ b/benchmark/reuters.py
@@ -1,8 +1,8 @@
 import gzip, os.path
 
-from whoosh_reloaded import analysis, fields, index, qparser, query
-from whoosh_reloaded.support.bench import Bench, Spec
-from whoosh_reloaded.util import now
+from whoosh import analysis, fields, index, qparser, query
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
 
 
 class Reuters(Spec):
@@ -22,7 +22,7 @@ def whoosh_schema(self):
         return schema
 
     def zcatalog_setup(self, cat):
-        from zcatalog import indexes  # @UnresolvedImport
+        from zcatalog import indexes  # type: ignore @UnresolvedImport
 
         cat["id"] = indexes.FieldIndex(field_name="id")
         cat["headline"] = indexes.TextIndex(field_name="headline")
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000..08509703
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,28 @@
+codecov:
+  require_ci_to_pass: yes
+  notify:
+    wait_for_ci: yes
+
+coverage:
+  precision: 2
+  round: down
+  range: 80...90
+
+  status:
+    # Learn more at http://docs.codecov.io/docs/codecov-yaml
+    project:
+      default:
+        enabled: yes
+        target: 1
+        threshold: 0.1
+    patch:
+      default:
+        enabled: off
+
+ignore:
+  - "**/*.ipynb"
+  - docs
+  - docker
+  - binder
+  - .github
+  - .git
diff --git a/docs/Makefile b/docs/Makefile
index 25797aed..09b24957 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -85,17 +85,17 @@ qthelp:
 	@echo
 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/whoosh_reloaded.qhcp"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/whoosh.qhcp"
 	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/whoosh_reloaded.qhc"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/whoosh.qhc"
 
 devhelp:
 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 	@echo
 	@echo "Build finished."
 	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/whoosh_reloaded"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/whoosh_reloaded"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/whoosh"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/whoosh"
 	@echo "# devhelp"
 
 epub:
diff --git a/docs/make.bat b/docs/make.bat
index d30558c7..6b6fea17 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -115,9 +115,9 @@ if "%1" == "qthelp" (
 	echo.
 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 .qhcp project file in %BUILDDIR%/qthelp, like this:
-	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\whoosh_reloaded.qhcp
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\whoosh.qhcp
 	echo.To view the help file:
-	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\whoosh_reloaded.ghc
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\whoosh.qhc
 	goto end
 )
diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst
index 5de16964..27297f61 100644
--- a/docs/source/analysis.rst
+++ b/docs/source/analysis.rst
@@ -20,13 +20,13 @@ tokenizer, and the tokenizer will usually be wrapped in a few filters.
 A tokenizer is a callable that takes a unicode string and yields a series of
 ``analysis.Token`` objects.
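
A minimal sketch of that contract (an editorial aside, not part of the patch
above; the ``SpaceTokenizer`` name is hypothetical, and only the public
``whoosh.analysis.Token`` class is assumed)::

    from whoosh.analysis import Token

    def SpaceTokenizer(value, **kwargs):
        # Yield one Token per whitespace-separated chunk of the input string,
        # reusing a single Token instance the way Whoosh's own tokenizers do.
        t = Token(**kwargs)
        for pos, chunk in enumerate(value.split()):
            t.text = chunk
            t.pos = pos
            yield t
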
 
-For example, the provided :class:`whoosh_reloaded.analysis.RegexTokenizer` class
+For example, the provided :class:`whoosh.analysis.RegexTokenizer` class
 implements a customizable, regular-expression-based tokenizer that extracts
 words and ignores whitespace and punctuation.
 
 ::
 
-    >>> from whoosh_reloaded.analysis import RegexTokenizer
+    >>> from whoosh.analysis import RegexTokenizer
     >>> tokenizer = RegexTokenizer()
     >>> for token in tokenizer(u"Hello there my friend!"):
     ...     print repr(token.text)
@@ -38,7 +38,7 @@ words and ignores whitespace and punctuation.
 A filter is a callable that takes a generator of Tokens (either a tokenizer or
 another filter) and in turn yields a series of Tokens.
 
-For example, the provided :meth:`whoosh_reloaded.analysis.LowercaseFilter` filters tokens
+For example, the provided :meth:`whoosh.analysis.LowercaseFilter` filters tokens
 by converting their text to lowercase. The implementation is very simple::
 
     def LowercaseFilter(tokens):
@@ -52,7 +52,7 @@ by converting their text to lowercase. The implementation is very simple::
 
 You can wrap the filter around a tokenizer to see it in operation::
 
-    >>> from whoosh_reloaded.analysis import LowercaseFilter
+    >>> from whoosh.analysis import LowercaseFilter
     >>> for token in LowercaseFilter(tokenizer(u"These ARE the things I want!")):
     ...     print repr(token.text)
     u'these'
@@ -72,10 +72,10 @@ tokenizers and filters together using the ``|`` character::
 
 The first item must be a tokenizer and the rest must be filters (you can't put
 a filter first or a tokenizer after the first item). Note that this only works if at
-least the tokenizer is a subclass of ``whoosh_reloaded.analysis.Composable``, as all the
+least the tokenizer is a subclass of ``whoosh.analysis.Composable``, as all the
 tokenizers and filters that ship with Whoosh are.
 
-See the :mod:`whoosh_reloaded.analysis` module for information on the available analyzers,
+See the :mod:`whoosh.analysis` module for information on the available analyzers,
 tokenizers, and filters shipped with Whoosh.
@@ -185,7 +185,7 @@ want to apply at indexing or query parsing::
         else:
             ...
 
-The :class:`whoosh_reloaded.analysis.MultiFilter` filter class lets you specify different
+The :class:`whoosh.analysis.MultiFilter` filter class lets you specify different
 filters to use based on the mode setting::
 
     intraword = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True),
@@ -201,7 +201,7 @@ filter out stop words, and includes a default list of common stop words.
 
 ::
 
-    >>> from whoosh_reloaded.analysis import StopFilter
+    >>> from whoosh.analysis import StopFilter
     >>> stopper = StopFilter()
     >>> for token in stopper(LowercaseFilter(tokenizer(u"These ARE the things I want!"))):
     ...     print repr(token.text)
@@ -276,7 +276,7 @@ be removed from the stream or left in.
 
 ::
 
-    >>> from whoosh_reloaded.analysis import StandardAnalyzer
+    >>> from whoosh.analysis import StandardAnalyzer
     >>> analyzer = StandardAnalyzer()
     >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")]
     [(u'test', False)]
diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst
index 94918661..bbb1b978 100644
--- a/docs/source/api/analysis.rst
+++ b/docs/source/api/analysis.rst
@@ -2,7 +2,7 @@
 ``analysis`` module
 ===================
 
-.. automodule:: whoosh_reloaded.analysis
+.. automodule:: whoosh.analysis
 
 Analyzers
 =========
diff --git a/docs/source/api/codec/base.rst b/docs/source/api/codec/base.rst
index e1d55076..28f707c4 100644
--- a/docs/source/api/codec/base.rst
+++ b/docs/source/api/codec/base.rst
@@ -2,7 +2,7 @@
 ``codec.base`` module
 =====================
 
-.. automodule:: whoosh_reloaded.codec.base
+.. automodule:: whoosh.codec.base
 
 Classes
diff --git a/docs/source/api/collectors.rst b/docs/source/api/collectors.rst
index be229e3c..b27b8c1f 100644
--- a/docs/source/api/collectors.rst
+++ b/docs/source/api/collectors.rst
@@ -2,7 +2,7 @@
 ``collectors`` module
 =====================
 
-.. automodule:: whoosh_reloaded.collectors
+.. automodule:: whoosh.collectors
 
 Base classes
diff --git a/docs/source/api/columns.rst b/docs/source/api/columns.rst
index f991ea1a..26fa7916 100644
--- a/docs/source/api/columns.rst
+++ b/docs/source/api/columns.rst
@@ -2,7 +2,7 @@
 ``columns`` module
 =====================
 
-.. automodule:: whoosh_reloaded.columns
+.. automodule:: whoosh.columns
 
 Base classes
diff --git a/docs/source/api/fields.rst b/docs/source/api/fields.rst
index 5c265929..290feb35 100644
--- a/docs/source/api/fields.rst
+++ b/docs/source/api/fields.rst
@@ -2,7 +2,7 @@
 ``fields`` module
 =================
 
-.. automodule:: whoosh_reloaded.fields
+.. automodule:: whoosh.fields
 
 Schema class
 ============
diff --git a/docs/source/api/filedb/filestore.rst b/docs/source/api/filedb/filestore.rst
index 5ffb4835..2dfc2ec4 100644
--- a/docs/source/api/filedb/filestore.rst
+++ b/docs/source/api/filedb/filestore.rst
@@ -2,7 +2,7 @@
 ``filedb.filestore`` module
 ===========================
 
-.. automodule:: whoosh_reloaded.filedb.filestore
+.. automodule:: whoosh.filedb.filestore
 
 Base class
 ==========
diff --git a/docs/source/api/filedb/filetables.rst b/docs/source/api/filedb/filetables.rst
index 54ff3e1b..3fbf70fe 100644
--- a/docs/source/api/filedb/filetables.rst
+++ b/docs/source/api/filedb/filetables.rst
@@ -2,7 +2,7 @@
 ``filedb.filetables`` module
 ============================
 
-.. automodule:: whoosh_reloaded.filedb.filetables
+.. automodule:: whoosh.filedb.filetables
 
 Hash file
diff --git a/docs/source/api/filedb/structfile.rst b/docs/source/api/filedb/structfile.rst
index 4bab3819..7d45c664 100644
--- a/docs/source/api/filedb/structfile.rst
+++ b/docs/source/api/filedb/structfile.rst
@@ -2,7 +2,7 @@
 ``filedb.structfile`` module
 ============================
 
-.. automodule:: whoosh_reloaded.filedb.structfile
+.. automodule:: whoosh.filedb.structfile
 
 Classes
 =======
diff --git a/docs/source/api/formats.rst b/docs/source/api/formats.rst
index 3e0ebfc0..9cd9dd19 100644
--- a/docs/source/api/formats.rst
+++ b/docs/source/api/formats.rst
@@ -2,7 +2,7 @@
 ``formats`` module
 ==================
 
-.. automodule:: whoosh_reloaded.formats
+.. automodule:: whoosh.formats
 
 Base class
 ==========
diff --git a/docs/source/api/highlight.rst b/docs/source/api/highlight.rst
index 05f5aaab..74d2ab9e 100644
--- a/docs/source/api/highlight.rst
+++ b/docs/source/api/highlight.rst
@@ -2,7 +2,7 @@
 ``highlight`` module
 ====================
 
-.. automodule:: whoosh_reloaded.highlight
+.. automodule:: whoosh.highlight
 
 See :doc:`how to highlight terms in search results <../highlight>`.
diff --git a/docs/source/api/idsets.rst b/docs/source/api/idsets.rst
index 4e056e1b..0f55306e 100644
--- a/docs/source/api/idsets.rst
+++ b/docs/source/api/idsets.rst
@@ -2,7 +2,7 @@
 ``support.bitvector`` module
 ============================
 
-.. automodule:: whoosh_reloaded.idsets
+.. automodule:: whoosh.idsets
 
 Base classes
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 627d1382..ee386455 100644
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -2,7 +2,7 @@
 ``index`` module
 ================
 
-.. automodule:: whoosh_reloaded.index
+.. automodule:: whoosh.index
 
 Functions
diff --git a/docs/source/api/lang/morph_en.rst b/docs/source/api/lang/morph_en.rst
index 763b464a..2a3dfe0e 100644
--- a/docs/source/api/lang/morph_en.rst
+++ b/docs/source/api/lang/morph_en.rst
@@ -2,6 +2,6 @@
 ``lang.morph_en`` module
 ========================
 
-.. automodule:: whoosh_reloaded.lang.morph_en
+.. automodule:: whoosh.lang.morph_en
 
 .. autofunction:: variations
diff --git a/docs/source/api/lang/porter.rst b/docs/source/api/lang/porter.rst
index f5ba22fc..4a0220f2 100644
--- a/docs/source/api/lang/porter.rst
+++ b/docs/source/api/lang/porter.rst
@@ -2,6 +2,6 @@
 ``lang.porter`` module
 ======================
 
-.. automodule:: whoosh_reloaded.lang.porter
+.. automodule:: whoosh.lang.porter
 
 .. autofunction:: stem
diff --git a/docs/source/api/lang/wordnet.rst b/docs/source/api/lang/wordnet.rst
index 7b784441..8adcdb0b 100644
--- a/docs/source/api/lang/wordnet.rst
+++ b/docs/source/api/lang/wordnet.rst
@@ -2,7 +2,7 @@
 ``lang.wordnet`` module
 ========================
 
-.. automodule:: whoosh_reloaded.lang.wordnet
+.. automodule:: whoosh.lang.wordnet
 
 Thesaurus
 =========
diff --git a/docs/source/api/matching.rst b/docs/source/api/matching.rst
index f57d3a78..12f24c6b 100644
--- a/docs/source/api/matching.rst
+++ b/docs/source/api/matching.rst
@@ -2,7 +2,7 @@
 ``matching`` module
 ===================
 
-.. automodule:: whoosh_reloaded.matching
+.. automodule:: whoosh.matching
 
 Matchers
 ========
diff --git a/docs/source/api/qparser.rst b/docs/source/api/qparser.rst
index fdba2db1..d3c5ecda 100644
--- a/docs/source/api/qparser.rst
+++ b/docs/source/api/qparser.rst
@@ -2,7 +2,7 @@
 ``qparser`` module
 ==================
 
-.. automodule:: whoosh_reloaded.qparser
+.. automodule:: whoosh.qparser
 
 Parser object
 =============
diff --git a/docs/source/api/query.rst b/docs/source/api/query.rst
index 4d614a6f..ca31ddf8 100644
--- a/docs/source/api/query.rst
+++ b/docs/source/api/query.rst
@@ -2,9 +2,9 @@
 ``query`` module
 ================
 
-.. automodule:: whoosh_reloaded.query
+.. automodule:: whoosh.query
 
-See also :mod:` whoosh_reloaded.qparser` which contains code for parsing user queries
+See also :mod:`whoosh.qparser` which contains code for parsing user queries
 into query objects.
 
 Base classes
diff --git a/docs/source/api/reading.rst b/docs/source/api/reading.rst
index e79f3c19..e0fd2a12 100644
--- a/docs/source/api/reading.rst
+++ b/docs/source/api/reading.rst
@@ -2,7 +2,7 @@
 ``reading`` module
 ==================
 
-.. automodule:: whoosh_reloaded.reading
+.. automodule:: whoosh.reading
 
 Classes
 =======
diff --git a/docs/source/api/scoring.rst b/docs/source/api/scoring.rst
index 0f25e312..73ea1e76 100644
--- a/docs/source/api/scoring.rst
+++ b/docs/source/api/scoring.rst
@@ -2,7 +2,7 @@
 ``scoring`` module
 ==================
 
-.. automodule:: whoosh_reloaded.scoring
+.. automodule:: whoosh.scoring
 
 Base classes
diff --git a/docs/source/api/searching.rst b/docs/source/api/searching.rst
index 8bc094e4..8acfe492 100644
--- a/docs/source/api/searching.rst
+++ b/docs/source/api/searching.rst
@@ -2,7 +2,7 @@
 ``searching`` module
 ====================
 
-.. automodule:: whoosh_reloaded.searching
+.. automodule:: whoosh.searching
 
 Searching classes
diff --git a/docs/source/api/sorting.rst b/docs/source/api/sorting.rst
index 811e58bc..faf78d0f 100644
--- a/docs/source/api/sorting.rst
+++ b/docs/source/api/sorting.rst
@@ -2,7 +2,7 @@
 ``sorting`` module
 ==================
 
-.. automodule:: whoosh_reloaded.sorting
+.. automodule:: whoosh.sorting
 
 Base types
diff --git a/docs/source/api/spelling.rst b/docs/source/api/spelling.rst
index 0c1f830b..79d5961e 100644
--- a/docs/source/api/spelling.rst
+++ b/docs/source/api/spelling.rst
@@ -4,7 +4,7 @@
 
 See :doc:`correcting errors in user queries <../spelling>`.
 
-.. automodule:: whoosh_reloaded.spelling
+.. automodule:: whoosh.spelling
 
 Corrector objects
diff --git a/docs/source/api/support/charset.rst b/docs/source/api/support/charset.rst
index 3b2af0ef..b0a687e9 100644
--- a/docs/source/api/support/charset.rst
+++ b/docs/source/api/support/charset.rst
@@ -2,7 +2,7 @@
 ``support.charset`` module
 ==========================
 
-.. automodule:: whoosh_reloaded.support.charset
+.. automodule:: whoosh.support.charset
 
 .. data:: default_charset
diff --git a/docs/source/api/support/levenshtein.rst b/docs/source/api/support/levenshtein.rst
index 119db615..cb64027e 100644
--- a/docs/source/api/support/levenshtein.rst
+++ b/docs/source/api/support/levenshtein.rst
@@ -2,7 +2,7 @@
 ``support.levenshtein`` module
 ==============================
 
-.. automodule:: whoosh_reloaded.support.levenshtein
+.. automodule:: whoosh.support.levenshtein
 
 .. autofunction:: relative
diff --git a/docs/source/api/util.rst b/docs/source/api/util.rst
index a90fb17a..9359f742 100644
--- a/docs/source/api/util.rst
+++ b/docs/source/api/util.rst
@@ -2,6 +2,6 @@
 ``util`` module
 ===============
 
-.. automodule:: whoosh_reloaded.util
+.. automodule:: whoosh.util
     :members:
diff --git a/docs/source/api/writing.rst b/docs/source/api/writing.rst
index 9a25ffb5..0bebc86f 100644
--- a/docs/source/api/writing.rst
+++ b/docs/source/api/writing.rst
@@ -2,7 +2,7 @@
 ``writing`` module
 ==================
 
-.. automodule:: whoosh_reloaded.writing
+.. automodule:: whoosh.writing
 
 Writer
diff --git a/docs/source/batch.rst b/docs/source/batch.rst
index a9bd453c..5caf256e 100644
--- a/docs/source/batch.rst
+++ b/docs/source/batch.rst
@@ -39,7 +39,7 @@ analyzer, consider using an unbounded cache::
 The ``limitmb`` parameter
 =========================
 
-The ``limitmb`` parameter to :meth:`whoosh_reloaded.index.Index.writer` controls the
+The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the
 *maximum* memory (in megabytes) the writer will use for the indexing pool. The
 higher the number, the faster indexing will be.
 
 The default value of ``128`` is actually somewhat low, considering many people
 have multiple gigabytes of RAM these days. Setting it higher can speed up
 indexing considerably::
 
-    from whoosh_reloaded import index
+    from whoosh import index
 
     ix = index.open_dir("indexdir")
     writer = ix.writer(limitmb=256)
@@ -61,11 +61,11 @@ indexing considerably::
 The ``procs`` parameter
 =======================
 
-The ``procs`` parameter to :meth:`whoosh_reloaded.index.Index.writer` controls the
+The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the
 number of processors the writer will use for indexing (via the
 ``multiprocessing`` module)::
 
-    from whoosh_reloaded import index
+    from whoosh import index
 
     ix = index.open_dir("indexdir")
     writer = ix.writer(procs=4)
@@ -89,7 +89,7 @@ You can get much better indexing speed by also using the ``multisegment=True``
 keyword argument, which instead of merging the results of each sub-writer,
 simply has them each just write out a new segment::
 
-    from whoosh_reloaded import index
+    from whoosh import index
 
     ix = index.open_dir("indexdir")
     writer = ix.writer(procs=4, multisegment=True)
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3a8303e2..77011a03 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,7 +1,7 @@
 import sys, os, os.path
 sys.path.append(os.path.abspath("../../src"))
 
-import whoosh_reloaded
+import whoosh
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -48,9 +48,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = whoosh_reloaded.versionstring(build=False)
+version = whoosh.versionstring(build=False)
 # The full version, including alpha/beta/rc tags.
-release = whoosh_reloaded.versionstring()
+release = whoosh.versionstring()
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/source/dates.rst b/docs/source/dates.rst
index d6d7fb66..ab1aadd6 100644
--- a/docs/source/dates.rst
+++ b/docs/source/dates.rst
@@ -6,11 +6,11 @@ Indexing dates
 ==============
 
 Whoosh lets you index and search dates/times using the
-:class:`whoosh_reloaded.fields.DATETIME` field type. Instead of passing text for the
+:class:`whoosh.fields.DATETIME` field type. Instead of passing text for the
 field in ``add_document()``, you use a Python ``datetime.datetime`` object::
 
     from datetime import datetime, timedelta
-    from whoosh_reloaded import fields, index
+    from whoosh import fields, index
 
     schema = fields.Schema(title=fields.TEXT, content=fields.TEXT,
                            date=fields.DATETIME)
@@ -28,11 +28,11 @@ Parsing date queries
 ====================
 
 Once you have an indexed ``DATETIME`` field, you can search it using a rich
-date parser contained in the :class:`whoosh_reloaded.qparser.dateparse.DateParserPlugin`::
+date parser contained in the :class:`whoosh.qparser.dateparse.DateParserPlugin`::
 
-    from whoosh_reloaded import index
-    from whoosh_reloaded.qparser import QueryParser
-    from whoosh_reloaded.qparser.dateparse import DateParserPlugin
+    from whoosh import index
+    from whoosh.qparser import QueryParser
+    from whoosh.qparser.dateparse import DateParserPlugin
 
     ix = index.open_dir("indexdir")
 
@@ -90,8 +90,8 @@ If you don't use the ``DateParserPlugin``, users can still search DATETIME
 fields using a simple numeric form ``YYYY[MM[DD[hh[mm[ss]]]]]`` that is built
 into the ``DATETIME`` field::
 
-    from whoosh_reloaded import index
-    from whoosh_reloaded.qparser import QueryParser
+    from whoosh import index
+    from whoosh.qparser import QueryParser
 
     ix = index.open_dir("indexdir")
     qp = QueryParser("content", schema=ix.schema)
diff --git a/docs/source/facets.rst b/docs/source/facets.rst
index eb3f99ad..b8c16936 100644
--- a/docs/source/facets.rst
+++ b/docs/source/facets.rst
@@ -45,25 +45,25 @@ When you create a field using ``sortable=True``, you are telling Whoosh to store
 per-document values for that field in a *column*. A column object specifies the
 format to use to store the per-document values on disk.
 
-The :mod:`whoosh_reloaded.columns` module contains several different column object
+The :mod:`whoosh.columns` module contains several different column object
 implementations. Each field type specifies a reasonable default column type (for
-example, the default for text fields is :class:`whoosh_reloaded.columns.VarBytesColumn`,
-the default for numeric fields is :class:`whoosh_reloaded.columns.NumericColumn`).
+example, the default for text fields is :class:`whoosh.columns.VarBytesColumn`,
+the default for numeric fields is :class:`whoosh.columns.NumericColumn`).
 However, if you want maximum efficiency you may want to use a different column
 type for a field.
 
 For example, if all document values in a field are a fixed length, you can use a
-:class:`whoosh_reloaded.columns.FixedBytesColumn`. If you have a field where many
+:class:`whoosh.columns.FixedBytesColumn`. If you have a field where many
 documents share a relatively small number of possible values (an example might
 be a "category" field, or "month" or other enumeration type fields), you might
-want to use :class:`whoosh_reloaded.columns.RefBytesColumn` (which can handle both
+want to use :class:`whoosh.columns.RefBytesColumn` (which can handle both
 variable and fixed-length values). There are column types for storing
 per-document bit values, structs, pickled objects, and compressed byte values.
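
As a hedged illustration of the fixed-length case (an editorial sketch, not
from the patched docs; the ``code`` field and its eight-byte values are
hypothetical), a column object can be passed in place of ``True``, as the
next paragraph explains::

    from whoosh import columns, fields

    # Hypothetical: every "code" value is exactly 8 bytes, so a fixed-width
    # column avoids storing a separate length for each value.
    code_col = columns.FixedBytesColumn(fixedlen=8)
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           code=fields.ID(sortable=code_col))
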
 
 To specify a custom column object for a field, pass it as the ``sortable``
 keyword argument instead of ``True``::
 
-    from whoosh_reloaded import columns, fields
+    from whoosh import columns, fields
 
     category_col = columns.RefBytesColumn()
     schema = fields.Schema(title=fields.TEXT(sortable=True),
@@ -95,16 +95,16 @@ different values for the visible title and the value used for sorting::
     sort_title = "unbearable lightness of being, the"
 
 The best way to do this is to use an additional field just for sorting. You can
-use the :class:`whoosh_reloaded.fields.COLUMN` field type to create a field that is not
+use the :class:`whoosh.fields.COLUMN` field type to create a field that is not
 indexed or stored, it only holds per-document column values::
 
     schema = fields.Schema(title=fields.TEXT(stored=True),
                            sort_title=fields.COLUMN(columns.VarBytesColumn())
                            )
 
-The single argument to the :class:`whoosh_reloaded.fields.COLUMN` initializer is a
-:class:`whoosh_reloaded.columns.ColumnType` object. You can use any of the various
-column types in the :mod:`whoosh_reloaded.columns` module.
+The single argument to the :class:`whoosh.fields.COLUMN` initializer is a
+:class:`whoosh.columns.ColumnType` object. You can use any of the various
+column types in the :mod:`whoosh.columns` module.
 
 As another example, say you are indexing documents that have a custom sorting
 order associated with each document, such as a "priority" number::
@@ -140,9 +140,9 @@ Making existing fields sortable
 If you have an existing index from before the ``sortable`` argument was added
 in Whoosh 3.0, or you didn't think you needed a field to be sortable but now
 you find that you need to sort it, you can add "sortability" to an existing
-index using the :func:`whoosh_reloaded.sorting.add_sortable` utility function::
+index using the :func:`whoosh.sorting.add_sortable` utility function::
 
-    from whoosh_reloaded import columns, fields, index, sorting
+    from whoosh import columns, fields, index, sorting
 
     # Say we have an existing index with this schema
     schema = fields.Schema(title=fields.TEXT,
@@ -165,7 +165,7 @@ You can specify a custom column type when you call ``add_sortable`` using the
     add_sortable(w, "chapter", sorting.FieldFacet("chapter"),
                  column=columns.RefBytesColumn())
 
-See the documentation for :func:`~whoosh_reloaded.sorting.add_sortable` for more
+See the documentation for :func:`~whoosh.sorting.add_sortable` for more
 information.
@@ -177,9 +177,9 @@ values in the field's column as sorting keys for the documents.
 
 Normally search results are sorted by descending relevance score. You can tell
 Whoosh to use a different ordering by passing the ``sortedby`` keyword argument
-to the :meth:`~whoosh_reloaded.searching.Searcher.search` method::
+to the :meth:`~whoosh.searching.Searcher.search` method::
 
-    from whoosh_reloaded import fields, index, qparser
+    from whoosh import fields, index, qparser
 
     schema = fields.Schema(title=fields.TEXT(stored=True),
                            price=fields.NUMERIC(sortable=True))
@@ -255,7 +255,7 @@ Sort by the "category" field, then by the document's score::
 
 Accessing column values
 -----------------------
 
-Per-document column values are available in :class:`~whoosh_reloaded.searching.Hit`
+Per-document column values are available in :class:`~whoosh.searching.Hit`
 objects just like stored field values::
 
     schema = fields.Schema(title=fields.TEXT(stored=True),
@@ -386,16 +386,16 @@ document numbers in the same relative order as in the results.
 You can use the searcher's ``stored_fields()`` method to take a document
 number and return the document's stored fields as a dictionary::
 
     for category_name in categories:
-        print "Top 5 documents in the %s category" % category_name
+        print("Top 5 documents in the %s category" % category_name)
         doclist = categories[category_name]
         for docnum, score in doclist[:5]:
-            print "  ", searcher.stored_fields(docnum)
+            print("  ", searcher.stored_fields(docnum))
         if len(doclist) > 5:
-            print "  (%s more)" % (len(doclist) - 5)
+            print("  (%s more)" % (len(doclist) - 5))
 
 If you want different information about the groups, for example just the count
 of documents in each group, or you don't need the groups to be ordered, you can
-specify a :class:`whoosh_reloaded.sorting.FacetMap` type or instance with the
+specify a :class:`whoosh.sorting.FacetMap` type or instance with the
 ``maptype`` keyword argument when creating the ``FacetType``::
 
     # This is the same as the default
@@ -607,7 +607,7 @@ documents. This is usually slower than using an indexed field, but when using
 ``allow_overlap`` it can actually be faster for large indexes just because it
 avoids the overhead of reading posting lists.
 
-:class:`~whoosh_reloaded.sorting.StoredFieldFacet` supports ``allow_overlap`` by
+:class:`~whoosh.sorting.StoredFieldFacet` supports ``allow_overlap`` by
 splitting the stored value into separate keys. By default it calls the value's
 ``split()`` method (since most stored values are strings), but you can supply
 a custom split function. See the section on ``allow_overlap`` below.
@@ -659,9 +659,9 @@ single-value approach.
 Of course, there are situations where you want documents to be sorted into
 multiple groups based on a field with multiple terms per document. The most
 common example would be a ``tags`` field. The ``allow_overlap`` keyword
-argument to the :class:`~whoosh_reloaded.sorting.FieldFacet`,
-:class:`~whoosh_reloaded.sorting.QueryFacet`, and
-:class:`~whoosh_reloaded.sorting.StoredFieldFacet` allows this multi-value approach.
+argument to the :class:`~whoosh.sorting.FieldFacet`,
+:class:`~whoosh.sorting.QueryFacet`, and
+:class:`~whoosh.sorting.StoredFieldFacet` allows this multi-value approach.
 
 However, there is an important caveat: using ``allow_overlap=True`` is slower
 than the default, potentially *much* slower for very large result sets. This is
@@ -674,7 +674,7 @@ sets, where Whoosh has to open the vector list for every matched document, this
 can still be very slow.
 
 For very large indexes and result sets, if a field is stored, you can get
-faster overlapped faceting using :class:`~whoosh_reloaded.sorting.StoredFieldFacet`
+faster overlapped faceting using :class:`~whoosh.sorting.StoredFieldFacet`
 instead of ``FieldFacet``. While reading stored values is usually slower than
 using the index, in this case avoiding the overhead of opening large numbers of
 posting readers can make it worthwhile.
@@ -716,7 +716,7 @@ the sorting order you want for a given field value, such as an implementation
 of the Unicode Collation Algorithm (UCA), you can customize the sort order
 for the user's language.
 
-The :class:`whoosh_reloaded.sorting.TranslateFacet` lets you apply a function to the
+The :class:`whoosh.sorting.TranslateFacet` lets you apply a function to the
 value of another facet. This lets you "translate" a field value into an
 arbitrary sort key, such as with UCA::
diff --git a/docs/source/fieldcaches.rst b/docs/source/fieldcaches.rst
index 6c443b71..49091dc7 100644
--- a/docs/source/fieldcaches.rst
+++ b/docs/source/fieldcaches.rst
@@ -29,7 +29,7 @@ By default, if caches are written to disk they are saved in the index
 directory. To tell a reader or searcher to save cache files to a different
 location, create a storage object and pass it to the ``storage`` keyword
 argument::
 
-    from whoosh_reloaded.filedb.filestore import FileStorage
+    from whoosh.filedb.filestore import FileStorage
 
     mystorage = FileStorage("path/to/cachedir")
     reader.set_caching_policy(storage=mystorage)
 
@@ -39,7 +39,7 @@ Creating a custom caching policy
 ================================
 
 Expert users who want to implement a custom caching policy (for example, to add
-cache expiration) should subclass :class:`whoosh_reloaded.filedb.fieldcache.FieldCachingPolicy`.
+cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`.
 Then you can pass an instance of your policy object to the ``set_caching_policy``
 method::
diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst
index b1c50554..79c76ae9 100644
--- a/docs/source/highlight.rst
+++ b/docs/source/highlight.rst
@@ -40,8 +40,8 @@ Get search results::
     for hit in results:
         print(hit["title"])
 
-You can use the :meth:`~ whoosh_reloaded.searching.Hit.highlights` method on the
-:class:` whoosh_reloaded.searching.Hit` object to get highlighted snippets from the
+You can use the :meth:`~whoosh.searching.Hit.highlights` method on the
+:class:`whoosh.searching.Hit` object to get highlighted snippets from the
 document containing the search terms.
 
 The first argument is the name of the field to highlight. If the field is
@@ -136,28 +136,28 @@ A fragmenter controls how to extract excerpts from the original text.
 
 The ``highlight`` module has the following pre-made fragmenters:
 
-:class:` whoosh_reloaded.highlight.ContextFragmenter` (the default)
+:class:`whoosh.highlight.ContextFragmenter` (the default)
     This is a "smart" fragmenter that finds matched terms and then pulls
    in surround text to form fragments. This fragmenter only yields
    fragments that contain matched terms.
 
-:class:` whoosh_reloaded.highlight.SentenceFragmenter`
+:class:`whoosh.highlight.SentenceFragmenter`
    Tries to break the text into fragments based on sentence punctuation
    (".", "!", and "?"). This object works by looking in the original
    text for a sentence end as the next character after each token's
    'endchar'. Can be fooled by e.g. source code, decimals, etc.
 
-:class:` whoosh_reloaded.highlight.WholeFragmenter`
+:class:`whoosh.highlight.WholeFragmenter`
    Returns the entire text as one "fragment". This can be useful if you
    are highlighting a short bit of text and don't need to fragment it.
 
 The different fragmenters have different options. For example, the default
-:class:`~ whoosh_reloaded.highlight.ContextFragmenter` lets you set the maximum
+:class:`~whoosh.highlight.ContextFragmenter` lets you set the maximum
 fragment size and the size of the context to add on either side::
 
     my_cf = highlight.ContextFragmenter(maxchars=100, surround=30)
 
-See the :mod:` whoosh_reloaded.highlight` docs for more information.
+See the :mod:`whoosh.highlight` docs for more information.
 
 To use a different fragmenter::
@@ -167,7 +167,7 @@ To use a different fragmenter::
 Scorer
 ------
 
-A scorer is a callable that takes a :class:` whoosh_reloaded.highlight.Fragment` object and
+A scorer is a callable that takes a :class:`whoosh.highlight.Fragment` object and
 returns a sortable value (where higher values represent better fragments).
 The default scorer adds up the number of matched terms in the fragment, and
 adds a "bonus" for the number of __different__ matched terms. The highlighting
@@ -224,14 +224,14 @@ or anything else useful to the calling system).
 
 The ``highlight`` module contains the following pre-made formatters.
 
-:class:` whoosh_reloaded.highlight.HtmlFormatter`
+:class:`whoosh.highlight.HtmlFormatter`
    Outputs a string containing HTML tags (with a class attribute) around the
    matched terms.
 
-:class:` whoosh_reloaded.highlight.UppercaseFormatter`
+:class:`whoosh.highlight.UppercaseFormatter`
    Converts the matched terms to UPPERCASE.
 
-:class:` whoosh_reloaded.highlight.GenshiFormatter`
+:class:`whoosh.highlight.GenshiFormatter`
    Outputs a Genshi event stream, with the matched terms wrapped in a
    configurable element.
@@ -258,19 +258,19 @@ To use a different formatter::
 
 If you need more control over the formatting (or want to output something other
 than strings), you will need to override other methods. See the documentation
-for the :class:` whoosh_reloaded.highlight.Formatter` class.
+for the :class:`whoosh.highlight.Formatter` class.
 
 
 Highlighter object
 ==================
 
 Rather than setting attributes on the results object, you can create a
-reusable :class:` whoosh_reloaded.highlight.Highlighter` object. Keyword arguments let
+reusable :class:`whoosh.highlight.Highlighter` object. Keyword arguments let
 you change the ``fragmenter``, ``scorer``, ``order``, and/or ``formatter``::
 
     hi = highlight.Highlighter(fragmenter=my_cf, scorer=sds)
 
-You can then use the :meth:` whoosh_reloaded.highlight.Highlighter.highlight_hit` method
+You can then use the :meth:`whoosh.highlight.Highlighter.highlight_hit` method
 to get highlights for a ``Hit`` object::
 
     for hit in results:
@@ -305,7 +305,7 @@ Instead of retokenizing, Whoosh can look up the character positions of the
 matched terms in the index. Looking up the character positions is not
 instantaneous, but is usually faster than analyzing large amounts of text.
 
-To use :class:` whoosh_reloaded.highlight.PinpointFragmenter` and avoid re-tokenizing the
+To use :class:`whoosh.highlight.PinpointFragmenter` and avoid re-tokenizing the
 document text, you must do all of the following:
 
 Index the field with character information (this will require re-indexing an
@@ -319,7 +319,7 @@ Record per-document term matches in the results::
 
     # Record per-document term matches
     results = searcher.search(myquery, terms=True)
 
-Set a :class:` whoosh_reloaded.highlight.PinpointFragmenter` as the fragmenter::
+Set a :class:`whoosh.highlight.PinpointFragmenter` as the fragmenter::
 
     results.fragmenter = highlight.PinpointFragmenter()
@@ -368,7 +368,7 @@ Usage
 The following function lets you retokenize and highlight a piece of text using
 an analyzer::
 
-    from whoosh_reloaded.highlight import highlight
+    from whoosh.highlight import highlight
 
     excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3,
                          scorer=BasicFragmentScorer, minscore=1, order=FIRST)
@@ -386,17 +386,17 @@ an analyzer::
     query terms are in.
 
 ``fragmenter``
-    A :class:` whoosh_reloaded.highlight.Fragmenter` object, see below.
+    A :class:`whoosh.highlight.Fragmenter` object, see below.
 
 ``formatter``
-    A :class:` whoosh_reloaded.highlight.Formatter` object, see below.
+    A :class:`whoosh.highlight.Formatter` object, see below.
 
 ``top``
    The number of fragments to include in the output.
 
 ``scorer``
-    A :class:` whoosh_reloaded.highlight.FragmentScorer` object. The only scorer currently
-    included with Whoosh is :class:`~ whoosh_reloaded.highlight.BasicFragmentScorer`, the
+    A :class:`whoosh.highlight.FragmentScorer` object. The only scorer currently
+    included with Whoosh is :class:`~whoosh.highlight.BasicFragmentScorer`, the
    default.
 
 ``minscore``
diff --git a/docs/source/indexing.rst b/docs/source/indexing.rst
index 3a10c638..e8278df2 100644
--- a/docs/source/indexing.rst
+++ b/docs/source/indexing.rst
@@ -8,7 +8,7 @@ Creating an Index object
 To create an index in a directory, use ``index.create_in``::
 
     import os, os.path
-    from whoosh_reloaded import index
+    from whoosh import index
 
     if not os.path.exists("indexdir"):
         os.mkdir("indexdir")
@@ -17,13 +17,13 @@ To create an index in a directory, use ``index.create_in``::
 
 To open an existing index in a directory, use ``index.open_dir``::
 
-    import whoosh_reloaded.index as index
+    import whoosh.index as index
 
     ix = index.open_dir("indexdir")
 
 These are convenience methods for::
 
-    from whoosh_reloaded.filedb.filestore import FileStorage
+    from whoosh.filedb.filestore import FileStorage
 
     storage = FileStorage("indexdir")
 
     # Create an index
@@ -80,10 +80,10 @@ a time can have a writer open.
 
     Because opening a writer locks the index for writing, in a multi-threaded
     or multi-process environment your code needs to be aware that opening a
-    writer may raise an exception (``whoosh_reloaded.store.LockError``) if a writer is
+    writer may raise an exception (``whoosh.store.LockError``) if a writer is
     already open. Whoosh includes a couple of example implementations
-    (:class:`whoosh_reloaded.writing.AsyncWriter` and
-    :class:`whoosh_reloaded.writing.BufferedWriter`) of ways to work around the write
+    (:class:`whoosh.writing.AsyncWriter` and
+    :class:`whoosh.writing.BufferedWriter`) of ways to work around the write
     lock.
 
 .. note::
@@ -199,7 +199,7 @@ index (merge all the segments together). It simply creates a writer and calls
 For more control over segment merging, you can write your own merge policy
 function and use it as an argument to the ``commit()`` method. See the
 implementation of the ``NO_MERGE``, ``MERGE_SMALL``, and ``OPTIMIZE`` functions
-in the ``whoosh_reloaded.writing`` module.
+in the ``whoosh.writing`` module.
 
 
 Deleting documents
@@ -256,7 +256,7 @@ For ``update_document`` to work, you must have marked at least one of the
 fields in the schema as "unique". Whoosh will then use the contents of the
 "unique" field(s) to search for documents to delete::
 
-    from whoosh_reloaded.fields import Schema, ID, TEXT
+    from whoosh.fields import Schema, ID, TEXT
 
     schema = Schema(path = ID(unique=True), content=TEXT)
 
@@ -293,8 +293,8 @@ add/update documents according to user actions).
 
 Indexing everything from scratch is pretty easy. Here's a simple example::
 
    import os.path
-    from whoosh_reloaded import index
-    from whoosh_reloaded.fields import Schema, ID, TEXT
+    from whoosh import index
+    from whoosh.fields import Schema, ID, TEXT
 
    def clean_index(dirname):
        # Always create the index from scratch
@@ -418,7 +418,7 @@ Clearing the index
 
 In some cases you may want to re-index from scratch. To clear the index without
 disrupting any existing readers::
 
-    from whoosh_reloaded import writing
+    from whoosh import writing
 
    with myindex.writer() as mywriter:
        # You can optionally add documents to the writer here
diff --git a/docs/source/intro.rst b/docs/source/intro.rst
index 99f64a24..66e9fa8e 100644
--- a/docs/source/intro.rst
+++ b/docs/source/intro.rst
@@ -5,7 +5,7 @@ Introduction to Whoosh
 About Whoosh
 ------------
 
-Whoosh was created by `Matt Chaput `_. It started as a quick and dirty
+Whoosh was created by `Matt Chaput `_. It started as a quick and dirty
 search server for the online documentation of the `Houdini `_ 3D animation
 software package. Side Effects Software generously allowed Matt to open source
 the code in case it might be useful to anyone else who needs a very flexible or pure-Python
@@ -57,4 +57,4 @@ Getting help with Whoosh
 
 You can view outstanding issues on the `Whoosh Github page `_
-and get help on the `Whoosh mailing list `_.
+and get help on the `Whoosh mailing list `_.
diff --git a/docs/source/keywords.rst b/docs/source/keywords.rst
index a36fdf6a..fe0e91f2 100644
--- a/docs/source/keywords.rst
+++ b/docs/source/keywords.rst
@@ -34,18 +34,18 @@ Usage
    you want to match on is vectored or stored, or that you have access to the
    original text (such as from a database)*.
 
-    Use :meth:`~ whoosh_reloaded.searching.Hit.more_like_this`::
+    Use :meth:`~whoosh.searching.Hit.more_like_this`::
 
        results = mysearcher.search(myquery)
        first_hit = results[0]
        more_results = first_hit.more_like_this("content")
 
 * Extract keywords for the top N documents in a
-  :class:` whoosh_reloaded.searching.Results` object. *This requires that the field is
+  :class:`whoosh.searching.Results` object. *This requires that the field is
   either vectored or stored*.
 
-  Use the :meth:`~ whoosh_reloaded.searching.Results.key_terms` method of the
-  :class:` whoosh_reloaded.searching.Results` object to extract keywords from the top N
+  Use the :meth:`~whoosh.searching.Results.key_terms` method of the
+  :class:`whoosh.searching.Results` object to extract keywords from the top N
   documents of the result set.
 
   For example, to extract *five* key terms from the ``content`` field of the top
@@ -57,13 +57,13 @@ Usage
 * Extract keywords for an arbitrary set of documents. *This requires that the
   field is either vectored or stored*.
 
-  Use the :meth:`~ whoosh_reloaded.searching.Searcher.document_number` or
-  :meth:`~ whoosh_reloaded.searching.Searcher.document_numbers` methods of the
-  :class:` whoosh_reloaded.searching.Searcher` object to get the document numbers for the
+  Use the :meth:`~whoosh.searching.Searcher.document_number` or
+  :meth:`~whoosh.searching.Searcher.document_numbers` methods of the
+  :class:`whoosh.searching.Searcher` object to get the document numbers for the
   document(s) you want to extract keywords from.
 
-  Use the :meth:`~ whoosh_reloaded.searching.Searcher.key_terms` method of a
-  :class:` whoosh_reloaded.searching.Searcher` to extract the keywords, given the list of
+  Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a
+  :class:`whoosh.searching.Searcher` to extract the keywords, given the list of
   document numbers.
 
   For example, let's say you have an index of emails. To extract key terms from
@@ -77,8 +77,8 @@ Usage
 
 * Extract keywords from arbitrary text not in the index.
 
- Use the :meth:`~ whoosh_reloaded.searching.Searcher.key_terms_from_text` method of a - :class:` whoosh_reloaded.searching.Searcher` to extract the keywords, given the text:: + Use the :meth:`~ whoosh.searching.Searcher.key_terms_from_text` method of a + :class:` whoosh.searching.Searcher` to extract the keywords, given the text:: with email_index.searcher() as s: keywords = [keyword for keyword, score @@ -88,7 +88,7 @@ Usage Expansion models ================ -The ``ExpansionModel`` subclasses in the :mod:` whoosh_reloaded.classify` module implement +The ``ExpansionModel`` subclasses in the :mod:` whoosh.classify` module implement different weighting functions for key words. These models are translated into Python from original Java implementations in Terrier. diff --git a/docs/source/nested.rst b/docs/source/nested.rst index 3703b651..da43d282 100644 --- a/docs/source/nested.rst +++ b/docs/source/nested.rst @@ -13,7 +13,7 @@ hierarchy, such as "Chapter - Section - Paragraph" or You can specify parent-child relationships *at indexing time*, by grouping documents in the same hierarchy, and then use the -:class:` whoosh_reloaded.query.NestedParent` and/or :class:`whoosh_reloaded.query.NestedChildren` +:class:` whoosh.query.NestedParent` and/or :class:`whoosh.query.NestedChildren` to find parents based on their children or vice-versa. Alternatively, you can use *query time joins*, essentially like external key @@ -80,7 +80,7 @@ find parents based on children or vice-versa. NestedParent query ------------------ -The :class:` whoosh_reloaded.query.NestedParent` query type lets you specify a query for +The :class:` whoosh.query.NestedParent` query type lets you specify a query for child documents, but have the query return an "ancestor" document from higher in the hierarchy:: @@ -124,7 +124,7 @@ no matter how many children match). This parent lookup is very efficient:: NestedChildren query -------------------- -The opposite of ``NestedParent`` is :class:` whoosh_reloaded.query.NestedChildren`. This +The opposite of ``NestedParent`` is :class:` whoosh.query.NestedChildren`. This query lets you match parents but return their children. This is useful, for example, to search for an album title and return the songs in the album:: diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst index d83317d8..558f4e34 100644 --- a/docs/source/ngrams.rst +++ b/docs/source/ngrams.rst @@ -15,7 +15,7 @@ characters, trigrams are groups of three characters, and so on. Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, and a filter that breaks tokens into N-grams. -:class:` whoosh_reloaded.analysis.NgramTokenizer` tokenizes the entire field into N-grams. +:class:` whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. This is more useful for Chinese/Japanese/Korean languages, where it's useful to index bigrams of characters rather than individual characters. Using this tokenizer with roman languages leads to spaces in the tokens. @@ -27,7 +27,7 @@ tokenizer with roman languages leads to spaces in the tokens. [u'hi', u'hi ', u'hi t',u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] -:class:` whoosh_reloaded.analysis.NgramFilter` breaks individual tokens into N-grams as +:class:` whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as part of an analysis pipeline. This is more useful for languages with word separation. @@ -40,7 +40,7 @@ separation. 
u'ader', u'der', u'ders', u'ers'] Whoosh includes two pre-configured field types for N-grams: -:class:` whoosh_reloaded.fields.NGRAM` and :class:`whoosh_reloaded.fields.NGRAMWORDS`. The only +:class:` whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`. The only difference is that ``NGRAM`` runs all text through the N-gram filter, including whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text using a tokenizer, then runs each word through the N-gram filter. diff --git a/docs/source/parsing.rst b/docs/source/parsing.rst index 3b1c2099..c4acc746 100644 --- a/docs/source/parsing.rst +++ b/docs/source/parsing.rst @@ -6,7 +6,7 @@ Overview ======== The job of a query parser is to convert a *query string* submitted by a user -into *query objects* (objects from the :mod:` whoosh_reloaded.query` module). +into *query objects* (objects from the :mod:` whoosh.query` module). For example, the user query: @@ -19,10 +19,10 @@ might be parsed into query objects like this:: And([Term("content", u"rendering"), Term("content", u"shading")]) Whoosh includes a powerful, modular parser for user queries in the -:mod:` whoosh_reloaded.qparser` module. The default parser implements a query language +:mod:` whoosh.qparser` module. The default parser implements a query language similar to the one that ships with Lucene. However, by changing plugins or using -functions such as :func:` whoosh_reloaded.qparser.MultifieldParser`, -:func:` whoosh_reloaded.qparser.SimpleParser` or :func:` whoosh_reloaded.qparser.DisMaxParser`, you +functions such as :func:` whoosh.qparser.MultifieldParser`, +:func:` whoosh.qparser.SimpleParser` or :func:` whoosh.qparser.DisMaxParser`, you can change how the parser works, get a simpler parser or change the query language syntax. @@ -32,7 +32,7 @@ The new hand-written parser is less brittle and more flexible.) .. note:: Remember that you can directly create query objects programmatically using - the objects in the :mod:` whoosh_reloaded.query` module. If you are not processing + the objects in the :mod:` whoosh.query` module. If you are not processing actual user queries, this is preferable to building a query string just to parse it. @@ -40,12 +40,12 @@ The new hand-written parser is less brittle and more flexible.) Using the default parser ======================== -To create a :class:` whoosh_reloaded.qparser.QueryParser` object, pass it the name of the +To create a :class:` whoosh.qparser.QueryParser` object, pass it the name of the *default field* to search and the schema of the index you'll be searching. :: - from whoosh_reloaded.qparser import QueryParser + from whoosh.qparser import QueryParser parser = QueryParser("content", schema=myindex.schema) @@ -88,7 +88,7 @@ present for a document to match, i.e.:: ...configure the QueryParser using the ``group`` keyword argument like this:: - from whoosh_reloaded import qparser + from whoosh import qparser parser = qparser.QueryParser(fieldname, schema=myindex.schema, group=qparser.OrGroup) @@ -133,11 +133,11 @@ However, you might want to let the user search *multiple* fields by default. For example, you might want "unfielded" terms to search both the ``title`` and ``content`` fields. -In that case, you can use a :class:` whoosh_reloaded.qparser.MultifieldParser`. This is +In that case, you can use a :class:` whoosh.qparser.MultifieldParser`. 
This is just like the normal QueryParser, but instead of a default field name string, it takes a *sequence* of field names:: - from whoosh_reloaded.qparser import MultifieldParser + from whoosh.qparser import MultifieldParser mparser = MultifieldParser(["title", "content"], schema=myschema) @@ -157,7 +157,7 @@ Once you have a parser:: parser = qparser.QueryParser("content", schema=myschema) you can remove features from it using the -:meth:`~ whoosh_reloaded.qparser.QueryParser.remove_plugin_class` method. +:meth:`~ whoosh.qparser.QueryParser.remove_plugin_class` method. For example, to remove the ability of the user to specify fields to search:: @@ -183,7 +183,7 @@ and NOT functions:: You can replace the default ``OperatorsPlugin`` object to replace the default English tokens with your own regular expressions. -The :class:` whoosh_reloaded.qparser.OperatorsPlugin` implements the ability to use AND, +The :class:` whoosh.qparser.OperatorsPlugin` implements the ability to use AND, OR, NOT, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new ``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and ``AndMaybe`` keyword arguments to change the token patterns:: @@ -216,7 +216,7 @@ an open ended range:: field:{apple to] -The :class:` whoosh_reloaded.qparser.GtLtPlugin` lets you specify the same search like +The :class:` whoosh.qparser.GtLtPlugin` lets you specify the same search like this:: field:>apple @@ -233,7 +233,7 @@ Adding fuzzy term queries ------------------------- Fuzzy queries are good for catching misspellings and similar words. -The :class:` whoosh_reloaded.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms, +The :class:` whoosh.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms, that is, terms that don't have to match exactly. The fuzzy term will match any similar term within a certain number of "edits" (character insertions, deletions, and/or transpositions -- this is called the "Damerau-Levenshtein @@ -283,13 +283,13 @@ Allowing complex phrase queries The default parser setup allows phrase (proximity) queries such as:: - " whoosh_reloaded search library" + " whoosh search library" The default phrase query tokenizes the text between the quotes and creates a search for those terms in proximity. If you want to do more complex proximity searches, you can replace the phrase -plugin with the :class:` whoosh_reloaded.qparser.SequencePlugin`, which allows any query +plugin with the :class:` whoosh.qparser.SequencePlugin`, which allows any query between the quotes. For example:: "(john OR jon OR jonathan~) peters*" @@ -331,24 +331,24 @@ QueryParser supports two extra keyword arguments: specify a boolean operator, such as ``AND`` or ``OR``. This lets you change the default operator from ``AND`` to ``OR``. - This will be the :class:` whoosh_reloaded.qparser.AndGroup` or - :class:` whoosh_reloaded.qparser.OrGroup` class (*not* an instantiated object) unless + This will be the :class:` whoosh.qparser.AndGroup` or + :class:` whoosh.qparser.OrGroup` class (*not* an instantiated object) unless you've written your own custom grouping syntax you want to use. ``termclass`` The query class to use to wrap single terms. - This must be a :class:` whoosh_reloaded.query.Query` subclass (*not* an instantiated + This must be a :class:` whoosh.query.Query` subclass (*not* an instantiated object) that accepts a fieldname string and term text unicode string in its - ``__init__`` method. The default is :class:` whoosh_reloaded.query.Term`. + ``__init__`` method. 
The default is :class:` whoosh.query.Term`. This is useful if you want to change the default term class to - :class:` whoosh_reloaded.query.Variations`, or if you've written a custom term class + :class:` whoosh.query.Variations`, or if you've written a custom term class you want the parser to use instead of the ones shipped with Whoosh. :: - >>> from whoosh_reloaded.qparser import QueryParser, OrGroup + >>> from whoosh.qparser import QueryParser, OrGroup >>> orparser = QueryParser("content", schema=myschema, group=OrGroup) @@ -359,9 +359,9 @@ The query parser's functionality is provided by a set of plugins. You can remove plugins to remove functionality, add plugins to add functionality, or replace default plugins with re-configured or rewritten versions. -The :meth:` whoosh_reloaded.qparser.QueryParser.add_plugin`, -:meth:` whoosh_reloaded.qparser.QueryParser.remove_plugin_class`, and -:meth:` whoosh_reloaded.qparser.QueryParser.replace_plugin` methods let you manipulate +The :meth:` whoosh.qparser.QueryParser.add_plugin`, +:meth:` whoosh.qparser.QueryParser.remove_plugin_class`, and +:meth:` whoosh.qparser.QueryParser.replace_plugin` methods let you manipulate the plugins in a ``QueryParser`` object. See :doc:`/api/qparser` for information about the available plugins. @@ -374,9 +374,9 @@ Creating custom operators * Decide whether you want a ``PrefixOperator``, ``PostfixOperator``, or ``InfixOperator``. -* Create a new :class:` whoosh_reloaded.qparser.syntax.GroupNode` subclass to hold +* Create a new :class:` whoosh.qparser.syntax.GroupNode` subclass to hold nodes affected by your operator. This object is responsible for generating - a :class:` whoosh_reloaded.query.Query` object corresponding to the syntax. + a :class:` whoosh.query.Query` object corresponding to the syntax. * Create a regular expression pattern for the operator's query syntax. @@ -389,7 +389,7 @@ Creating custom operators For example, if you were creating a ``BEFORE`` operator:: - from whoosh_reloaded import qparser, query + from whoosh import qparser, query optype = qparser.InfixOperator pattern = " BEFORE " @@ -410,7 +410,7 @@ infix operator, do this:: qparser.InfixOperator, leftassoc=False) -Create an :class:`~ whoosh_reloaded.qparser.plugins.OperatorsPlugin` instance with your +Create an :class:`~ whoosh.qparser.plugins.OperatorsPlugin` instance with your new operator, and replace the default operators plugin in your query parser:: qp = qparser.QueryParser("text", myschema) diff --git a/docs/source/query.rst b/docs/source/query.rst index a40e5915..f56b26b6 100644 --- a/docs/source/query.rst +++ b/docs/source/query.rst @@ -2,7 +2,7 @@ Query objects ============= -The classes in the :mod:` whoosh_reloaded.query` module implement *queries* you can run against the index. +The classes in the :mod:` whoosh.query` module implement *queries* you can run against the index. TBD. diff --git a/docs/source/querylang.rst b/docs/source/querylang.rst index 5cdd36b3..085363da 100644 --- a/docs/source/querylang.rst +++ b/docs/source/querylang.rst @@ -12,7 +12,7 @@ terms and *phrases*. Multiple terms can be combined with operators such as *AND* and *OR*. Whoosh supports indexing text in different *fields*. You must specify the -*default field* when you create the :class:` whoosh_reloaded.qparser.QueryParser` object. +*default field* when you create the :class:` whoosh.qparser.QueryParser` object. This is the field in which any terms the user does not explicitly specify a field for will be searched. @@ -37,9 +37,9 @@ that field. 
Normally when you specify a phrase, the maximum difference in position between each word in the phrase is 1 (that is, the words must be right next to each other in the document). For example, the following matches if a document has -``library`` within 5 words after `` whoosh_reloaded``:: +``library`` within 5 words after `` whoosh``:: - " whoosh_reloaded library"~5 + " whoosh library"~5 Boolean operators diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 1de15bf2..b169fb7c 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -13,8 +13,8 @@ A quick introduction :: - >>> from whoosh_reloaded.index import create_in - >>> from whoosh_reloaded.fields import * + >>> from whoosh.index import create_in + >>> from whoosh.fields import * >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) >>> ix = create_in("indexdir", schema) >>> writer = ix.writer() @@ -23,7 +23,7 @@ A quick introduction >>> writer.add_document(title=u"Second document", path=u"/b", ... content=u"The second one is even more interesting!") >>> writer.commit() - >>> from whoosh_reloaded.qparser import QueryParser + >>> from whoosh.qparser import QueryParser >>> with ix.searcher() as searcher: ... query = QueryParser("content", ix.schema).parse("first") ... results = searcher.search(query) @@ -44,7 +44,7 @@ with the results; this is useful for fields such as the title). This schema has two fields, "title" and "content":: - from whoosh_reloaded.fields import Schema, TEXT + from whoosh.fields import Schema, TEXT schema = Schema(title=TEXT, content=TEXT) @@ -56,44 +56,44 @@ to field types. The list of fields and their types defines what you are indexing and what's searchable. Whoosh comes with some very useful predefined field types, and you can easily create your own. -:class:` whoosh_reloaded.fields.ID` +:class:` whoosh.fields.ID` This type simply indexes (and optionally stores) the entire value of the field as a single unit (that is, it doesn't break it up into individual words). This is useful for fields such as a file path, URL, date, category, etc. -:class:` whoosh_reloaded.fields.STORED` +:class:` whoosh.fields.STORED` This field is stored with the document, but not indexed. This field type is not indexed and not searchable. This is useful for document information you want to display to the user in the search results. -:class:` whoosh_reloaded.fields.KEYWORD` +:class:` whoosh.fields.KEYWORD` This type is designed for space- or comma-separated keywords. This type is indexed and searchable (and optionally stored). To save space, it does not support phrase searching. -:class:` whoosh_reloaded.fields.TEXT` +:class:` whoosh.fields.TEXT` This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. -:class:` whoosh_reloaded.fields.NUMERIC` +:class:` whoosh.fields.NUMERIC` This type is for numbers. You can store integers or floating point numbers. -:class:` whoosh_reloaded.fields.BOOLEAN` +:class:` whoosh.fields.BOOLEAN` This type is for boolean (true/false) values. -:class:` whoosh_reloaded.fields.DATETIME` +:class:` whoosh.fields.DATETIME` This type is for ``datetime`` objects. See :doc:`dates` for more information. -:class:` whoosh_reloaded.fields.NGRAM` and :class:` whoosh_reloaded.fields.NGRAMWORDS` +:class:` whoosh.fields.NGRAM` and :class:` whoosh.fields.NGRAMWORDS` These types break the field text or individual terms into N-grams. See :doc:`ngrams` for more information. 
(As a shortcut, if you don't need to pass any arguments to the field type, you can just give the class name and Whoosh will instantiate the object for you.) :: - from whoosh_reloaded.fields import Schema, STORED, ID, KEYWORD, TEXT + from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT schema = Schema(title=TEXT(stored=True), content=TEXT, path=ID(stored=True), tags=KEYWORD, icon=STORED) @@ -104,7 +104,7 @@ Once you have the schema, you can create an index using the ``create_in`` function:: import os.path - from whoosh_reloaded.index import create_in + from whoosh.index import create_in if not os.path.exists("index"): os.mkdir("index") @@ -118,7 +118,7 @@ in a directory.) After you've created an index, you can open it using the ``open_dir`` convenience function:: - from whoosh_reloaded.index import open_dir + from whoosh.index import open_dir ix = open_dir("index") @@ -194,7 +194,7 @@ For example, this query would match documents that contain both "apple" and # Construct query objects directly - from whoosh_reloaded.query import * + from whoosh.query import * myquery = And([Term("content", u"apple"), Term("content", "bear")]) To parse a query string, you can use the default query parser in the ``qparser`` @@ -204,7 +204,7 @@ argument is a schema to use to understand how to parse the fields:: # Parse a query string - from whoosh_reloaded.qparser import QueryParser + from whoosh.qparser import QueryParser parser = QueryParser("content", ix.schema) myquery = parser.parse(querystring) diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst index c60b9633..19150153 100644 --- a/docs/source/recipes.rst +++ b/docs/source/recipes.rst @@ -18,7 +18,7 @@ Analysis Eliminate words shorter/longer than N ------------------------------------- -Use a :class:`~ whoosh_reloaded.analysis.StopFilter` and the ``minsize`` and ``maxsize`` +Use a :class:`~ whoosh.analysis.StopFilter` and the ``minsize`` and ``maxsize`` keyword arguments. If you just want to filter based on size and not common words, set the ``stoplist`` to ``None``:: @@ -61,7 +61,7 @@ Find every document iTunes-style search-as-you-type ------------------------------- -Use the :class:` whoosh_reloaded.analysis.NgramWordAnalyzer` as the analyzer for the +Use the :class:` whoosh.analysis.NgramWordAnalyzer` as the analyzer for the field you want to search as the user types. You can save space in the index by turning off positions in the field using ``phrase=False``, since phrase searching on N-gram fields usually doesn't make much sense:: @@ -71,7 +71,7 @@ searching on N-gram fields usually doesn't make much sense:: title_field = fields.TEXT(analyzer=analyzer, phrase=False) schema = fields.Schema(title=title_field) -See the documentation for the :class:`~ whoosh_reloaded.analysis.NgramWordAnalyzer` class +See the documentation for the :class:`~ whoosh.analysis.NgramWordAnalyzer` class for information on the available options. 
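For the query side, a minimal sketch of running the user's partial input against that field (the index directory and input string are illustrative, and it assumes the ``title`` field was created with ``stored=True`` so it can be displayed)::

    from whoosh import index, qparser

    ix = index.open_dir("indexdir")
    qp = qparser.QueryParser("title", schema=ix.schema)
    with ix.searcher() as s:
        # Re-run this on each keystroke with the latest partial input
        q = qp.parse(u"alic")
        for hit in s.search(q, limit=5):
            print(hit["title"])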
@@ -103,7 +103,7 @@ The following scoring function uses the position of the first occurrence of a term in each document to calculate the score, so documents with the given term earlier in the document will score higher:: - from whoosh_reloaded import scoring + from whoosh import scoring def pos_score_fn(searcher, fieldname, text, matcher): poses = matcher.value_as("positions") diff --git a/docs/source/releases/0_3.rst index 4eb7077b..e30660c2 100644 --- a/docs/source/releases/0_3.rst +++ b/docs/source/releases/0_3.rst @@ -6,9 +6,9 @@ Whoosh 0.3 release notes * Changed default post limit (run size) from 4 MB to 32 MB. -* Finished migrating backend-specific code into `` whoosh_reloaded.filedb`` package. +* Finished migrating backend-specific code into the ``whoosh.filedb`` package. -* Moved formats from whoosh_reloaded.fields module into new whoosh_reloaded.formats module. +* Moved formats from the whoosh.fields module into the new whoosh.formats module. * DocReader and TermReader classes combined into new IndexReader interface. You can get an IndexReader implementation by calling Index.reader(). @@ -23,8 +23,8 @@ Whoosh 0.3 release notes * Added experimental DATETIME field type that lets you pass a ``datetime.datetime`` object as a field value to ``add_document``:: - from whoosh_reloaded.fields import Schema, ID, DATETIME - from whoosh_reloaded.filedb.filestore import RamStorage + from whoosh.fields import Schema, ID, DATETIME + from whoosh.filedb.filestore import RamStorage from datetime import datetime schema = Schema(id=ID, date=DATETIME) @@ -48,14 +48,14 @@ Whoosh 0.3 release notes has moved from the last to the first argument), e.g. ``v = ixreader.vector_as("frequency", 102, "content")``. -* Added whoosh_reloaded.support.charset for translating Sphinx charset table files. +* Added whoosh.support.charset for translating Sphinx charset table files. -* Added whoosh_reloaded.analysis.CharsetTokenizer and CharsetFilter to enable case and +* Added whoosh.analysis.CharsetTokenizer and CharsetFilter to enable case and accent folding. -* Added experimental `` whoosh_reloaded.ramdb`` in-memory backend. +* Added experimental ``whoosh.ramdb`` in-memory backend. -* Added experimental `` whoosh_reloaded.query.FuzzyTerm`` query type. +* Added experimental ``whoosh.query.FuzzyTerm`` query type. -* Added `` whoosh_reloaded.lang.wordnet`` module containing ``Thesaurus`` object for using +* Added ``whoosh.lang.wordnet`` module containing a ``Thesaurus`` object for using the WordNet synonym database. diff --git a/docs/source/releases/1_0.rst index 3522ce8f..7312123b 100644 --- a/docs/source/releases/1_0.rst +++ b/docs/source/releases/1_0.rst @@ -71,7 +71,7 @@ Whoosh 1.7.7 ============ Setting a TEXT field to store term vectors is now much easier. Instead of -having to pass an instantiated whoosh_reloaded.formats.Format object to the vector= +having to pass an instantiated whoosh.formats.Format object to the vector= keyword argument, you can pass True to automatically use the same format and analyzer as the inverted index. Alternatively, you can pass a Format subclass and Whoosh will instantiate it for you.
For example, to store term vectors using the same settings as the inverted index (Positions format and StandardAnalyzer):: - from whoosh_reloaded.fields import Schema, TEXT + from whoosh.fields import Schema, TEXT schema = Schema(content=TEXT(vector=True)) To store term vectors that use the same analyzer as the inverted index (StandardAnalyzer by default) but only store term frequency:: - from whoosh_reloaded.formats import Frequency + from whoosh.formats import Frequency schema = Schema(content=TEXT(vector=Frequency)) @@ -94,13 +94,13 @@ Note that currently the only place term vectors are used in Whoosh is keyword extraction/more like this, but they can be useful for expert users with custom code. -Added :meth:` whoosh_reloaded.searching.Searcher.more_like` and -:meth:` whoosh_reloaded.searching.Hit.more_like_this` methods, as shortcuts for doing +Added :meth:` whoosh.searching.Searcher.more_like` and +:meth:` whoosh.searching.Hit.more_like_this` methods, as shortcuts for doing keyword extraction yourself. Return a Results object. "python setup.py test" works again, as long as you have nose installed. -The :meth:` whoosh_reloaded.searching.Searcher.sort_query_using` method lets you sort documents matching a given query using an arbitrary function. Note that like "complex" searching with the Sorter object, this can be slow on large multi-segment indexes. +The :meth:` whoosh.searching.Searcher.sort_query_using` method lets you sort documents matching a given query using an arbitrary function. Note that like "complex" searching with the Sorter object, this can be slow on large multi-segment indexes. Whoosh 1.7 @@ -110,9 +110,9 @@ You can once again perform complex sorting of search results (that is, a sort with some fields ascending and some fields descending). You can still use the ``sortedby`` keyword argument to -:meth:` whoosh_reloaded.searching.Searcher.search` to do a simple sort (where all fields +:meth:` whoosh.searching.Searcher.search` to do a simple sort (where all fields are sorted in the same direction), or you can use the new -:class:`~ whoosh_reloaded.sorting.Sorter` class to do a simple or complex sort:: +:class:`~ whoosh.sorting.Sorter` class to do a simple or complex sort:: searcher = myindex.searcher() sorter = searcher.sorter() @@ -123,19 +123,19 @@ are sorted in the same direction), or you can use the new # Get the Results results = sorter.sort_query(myquery) -See the documentation for the :class:`~ whoosh_reloaded.sorting.Sorter` class for more +See the documentation for the :class:`~ whoosh.sorting.Sorter` class for more information. Bear in mind that complex sorts will be much slower on large indexes because they can't use the per-segment field caches. You can now get highlighted snippets for a hit automatically using -:meth:` whoosh_reloaded.searching.Hit.highlights`:: +:meth:` whoosh.searching.Hit.highlights`:: results = searcher.search(myquery, limit=20) for hit in results: print hit["title"] print hit.highlights("content") -See :meth:` whoosh_reloaded.searching.Hit.highlights` for more information. +See :meth:` whoosh.searching.Hit.highlights` for more information. Added the ability to filter search results so that only hits in a Results set, a set of docnums, or matching a query are returned. The filter is @@ -148,12 +148,12 @@ cached on the searcher. results = searcher.search(userquery, filter=query.Term("chapter", "basics")) You can now specify a time limit for a search. 
If the search does not finish -in the given time, a :class:` whoosh_reloaded.searching.TimeLimit` exception is raised, +in the given time, a :class:`whoosh.searching.TimeLimit` exception is raised, but you can still retrieve the partial results from the collector. See the ``timelimit`` and ``greedy`` arguments in the -:class:` whoosh_reloaded.searching.Collector` documentation. +:class:`whoosh.searching.Collector` documentation. -Added back the ability to set :class:` whoosh_reloaded.analysis.StemFilter` to use an +Added back the ability to set :class:`whoosh.analysis.StemFilter` to use an unlimited cache. This is useful for one-shot batch indexing (see :doc:`../batch`). @@ -180,8 +180,8 @@ correctly. The change alters the semantics of certain parsing "corner cases" Whoosh 1.6 ========== -The `` whoosh_reloaded.writing.BatchWriter`` class is now called -:class:` whoosh_reloaded.writing.BufferedWriter`. It is similar to the old ``BatchWriter`` +The ``whoosh.writing.BatchWriter`` class is now called +:class:`whoosh.writing.BufferedWriter`. It is similar to the old ``BatchWriter`` class but allows you to search and update the buffered documents as well as the documents that have been flushed to disk:: @@ -198,26 +198,26 @@ documents that have been flushed to disk:: (BatchWriter is still available as an alias for backwards compatibility.) -The :class:` whoosh_reloaded.qparser.QueryParser` initialization method now requires a +The :class:`whoosh.qparser.QueryParser` initialization method now requires a schema as the second argument. Previously the default was to create a ``QueryParser`` without a schema, which was confusing:: qp = qparser.QueryParser("content", myindex.schema) -The :meth:` whoosh_reloaded.searching.Searcher.search` method now takes a ``scored`` +The :meth:`whoosh.searching.Searcher.search` method now takes a ``scored`` keyword. If you search with ``scored=False``, the results will be in "natural" order (the order the documents were added to the index). This is useful when you don't need scored results but want the convenience of the Results object. -Added the :class:` whoosh_reloaded.qparser.GtLtPlugin` parser plugin to allow greater +Added the :class:`whoosh.qparser.GtLtPlugin` parser plugin to allow greater-than/less-than as an alternative syntax for ranges:: count:>100 tag:<=zebra date:>='29 march 2001' Added the ability to define schemas declaratively, similar to Django models:: - from whoosh_reloaded import index - from whoosh_reloaded.fields import SchemaClass, ID, KEYWORD, STORED, TEXT + from whoosh import index + from whoosh.fields import SchemaClass, ID, KEYWORD, STORED, TEXT class MySchema(SchemaClass): uuid = ID(stored=True, unique=True) @@ -227,14 +227,14 @@ Added the ability to define schemas declaratively, similar to Django models:: index.create_in("indexdir", MySchema) -Whoosh 1.6.2: Added :class:` whoosh_reloaded.searching.TermTrackingCollector` which tracks +Whoosh 1.6.2: Added :class:`whoosh.searching.TermTrackingCollector`, which tracks which part of the query matched which documents in the final results. -Replaced the unbounded cache in :class:` whoosh_reloaded.analysis.StemFilter` with a +Replaced the unbounded cache in :class:`whoosh.analysis.StemFilter` with a bounded LRU (least recently used) cache. This will make stemming analysis slightly slower but prevent it from eating up too much memory over time.
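As a sketch of the cache trade-off described above (assuming the ``cachesize`` keyword accepts ``-1`` for an unlimited cache, as in :doc:`../batch`)::

    from whoosh.analysis import StemmingAnalyzer

    ana = StemmingAnalyzer()                    # bounded LRU cache (the default)
    batch_ana = StemmingAnalyzer(cachesize=-1)  # unlimited cache for one-shot batch indexing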
-Added a simple :class:` whoosh_reloaded.analysis.PyStemmerFilter` that works when the +Added a simple :class:`whoosh.analysis.PyStemmerFilter` that works when the py-stemmer library is installed:: ana = RegexTokenizer() | PyStemmerFilter("spanish") @@ -243,8 +243,8 @@ The estimation of memory usage for the ``limitmb`` keyword argument to ``FileIndex.writer()`` is more accurate, which should help keep memory usage by the sorting pool closer to the limit. -The `` whoosh_reloaded.ramdb`` package was removed and replaced with a single -`` whoosh_reloaded.ramindex`` module. +The ``whoosh.ramdb`` package was removed and replaced with a single +``whoosh.ramindex`` module. Miscellaneous bug fixes. @@ -272,13 +272,13 @@ deals with a "term" that when analyzed generates multiple tokens. The default value is `"first"`, which throws away all but the first token (the previous behavior). Other possible values are `"and"`, `"or"`, or `"phrase"`. -Added :class:` whoosh_reloaded.analysis.DoubleMetaphoneFilter`, -:class:` whoosh_reloaded.analysis.SubstitutionFilter`, and -:class:` whoosh_reloaded.analysis.ShingleFilter`. +Added :class:`whoosh.analysis.DoubleMetaphoneFilter`, +:class:`whoosh.analysis.SubstitutionFilter`, and +:class:`whoosh.analysis.ShingleFilter`. -Added :class:` whoosh_reloaded.qparser.CopyFieldPlugin`. +Added :class:`whoosh.qparser.CopyFieldPlugin`. -Added :class:` whoosh_reloaded.query.Otherwise`. +Added :class:`whoosh.query.Otherwise`. Generalized parsing of operators (such as OR, AND, NOT, etc.) in the query parser to make it easier to add new operators. I intend to add a better API @@ -289,7 +289,7 @@ representations of numbers. Fixed a bug in the porter2 stemmer when stemming the string `"y"`. -Added methods to :class:` whoosh_reloaded.searching.Hit` to make it more like a `dict`. +Added methods to :class:`whoosh.searching.Hit` to make it more like a `dict`. Short posting lists (by default, single postings) are inline in the term file instead of written to the posting file for faster retrieval and a small saving @@ -327,48 +327,48 @@ Faster indexing and ability to use multiple processors (via ``multiprocessing`` module) to speed up indexing. Flexible Schema: you can now add and remove fields in an index with the -:meth:` whoosh_reloaded.writing.IndexWriter.add_field` and -:meth:` whoosh_reloaded.writing.IndexWriter.remove_field` methods. +:meth:`whoosh.writing.IndexWriter.add_field` and +:meth:`whoosh.writing.IndexWriter.remove_field` methods. New hand-written query parser based on plug-ins. Less brittle, more robust, more flexible, and easier to fix/improve than the old pyparsing-based parser. On-disk formats now use 64-bit disk pointers allowing files larger than 4 GB. -New :class:` whoosh_reloaded.searching.Facets` class efficiently sorts results into +New :class:`whoosh.searching.Facets` class efficiently sorts results into facets based on any criteria that can be expressed as queries, for example tags or price ranges. -New :class:` whoosh_reloaded.writing.BatchWriter` class automatically batches up +New :class:`whoosh.writing.BatchWriter` class automatically batches up individual ``add_document`` and/or ``delete_document`` calls until a certain number of calls or a certain amount of time passes, then commits them all at once. -New :class:` whoosh_reloaded.analysis.BiWordFilter` lets you create bi-word indexed +New :class:`whoosh.analysis.BiWordFilter` lets you create bi-word indexed fields, a possible alternative to phrase searching.
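A minimal sketch of what such a bi-word analyzer chain might look like (the exact token output in the comment is illustrative)::

    from whoosh.analysis import RegexTokenizer, LowercaseFilter, BiWordFilter

    ana = RegexTokenizer() | LowercaseFilter() | BiWordFilter()
    print([t.text for t in ana(u"the quick brown fox")])
    # e.g. ['the-quick', 'quick-brown', 'brown-fox']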
Fixed bug where files could be deleted before a reader could open them in threaded situations. -New :class:` whoosh_reloaded.analysis.NgramFilter` filter, -:class:` whoosh_reloaded.analysis.NgramWordAnalyzer` analyzer, and -:class:` whoosh_reloaded.fields.NGRAMWORDS` field type allow producing n-grams from +New :class:` whoosh.analysis.NgramFilter` filter, +:class:` whoosh.analysis.NgramWordAnalyzer` analyzer, and +:class:` whoosh.fields.NGRAMWORDS` field type allow producing n-grams from tokenized text. -Errors in query parsing now raise a specific `` whoosh_reloaded.qparse.QueryParserError`` +Errors in query parsing now raise a specific `` whoosh.qparse.QueryParserError`` exception instead of a generic exception. Previously, the query string ``*`` was optimized to a -:class:` whoosh_reloaded.query.Every` query which matched every document. Now the +:class:` whoosh.query.Every` query which matched every document. Now the ``Every`` query only matches documents that actually have an indexed term from the given field, to better match the intuitive sense of what a query string like ``tag:*`` should do. -New :meth:` whoosh_reloaded.searching.Searcher.key_terms_from_text` method lets you +New :meth:` whoosh.searching.Searcher.key_terms_from_text` method lets you extract key words from arbitrary text instead of documents in the index. -Previously the :meth:` whoosh_reloaded.searching.Searcher.key_terms` and -:meth:` whoosh_reloaded.searching.Results.key_terms` methods required that the given +Previously the :meth:` whoosh.searching.Searcher.key_terms` and +:meth:` whoosh.searching.Results.key_terms` methods required that the given field store term vectors. They now also work if the given field is stored instead. They will analyze the stored string into a term vector on-the-fly. The field must still be indexed. @@ -378,7 +378,7 @@ User API changes ================ The default for the ``limit`` keyword argument to -:meth:` whoosh_reloaded.searching.Searcher.search` is now ``10``. To return all results +:meth:` whoosh.searching.Searcher.search` is now ``10``. To return all results in a single ``Results`` object, use ``limit=None``. The ``Index`` object no longer represents a snapshot of the index at the time @@ -390,9 +390,9 @@ created. Because the ``Index`` object no longer represents the index at a specific version, several methods such as ``up_to_date`` and ``refresh`` were removed from its interface. The Searcher object now has -:meth:`~ whoosh_reloaded.searching.Searcher.last_modified`, -:meth:`~ whoosh_reloaded.searching.Searcher.up_to_date`, and -:meth:`~ whoosh_reloaded.searching.Searcher.refresh` methods similar to those that used to +:meth:`~ whoosh.searching.Searcher.last_modified`, +:meth:`~ whoosh.searching.Searcher.up_to_date`, and +:meth:`~ whoosh.searching.Searcher.refresh` methods similar to those that used to be on ``Index``. 
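A short sketch of how those ``Searcher`` methods might be used when re-using a searcher across requests (the surrounding setup is assumed)::

    s = myindex.searcher()
    # ... later, before serving another search request ...
    if not s.up_to_date():
        s = s.refresh()  # swap in a view of the latest index version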
The document deletion and field add/remove methods on the ``Index`` object now @@ -417,8 +417,8 @@ The ``postlimit`` argument to ``Index.writer()`` has been changed to writer = myindex.writer(postlimitmb=128) -Instead of having to import `` whoosh_reloaded.filedb.filewriting.NO_MERGE`` or -`` whoosh_reloaded.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you +Instead of having to import ``whoosh.filedb.filewriting.NO_MERGE`` or +``whoosh.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you can now simply do the following:: # Do not merge segments @@ -429,7 +429,7 @@ can now simply do the following:: # Merge all segments writer.commit(optimize=True) -The `` whoosh_reloaded.postings`` module is gone. The `` whoosh_reloaded.matching`` module contains +The ``whoosh.postings`` module is gone. The ``whoosh.matching`` module contains classes for posting list readers. Whoosh no longer maps field names to numbers for internal use or writing to @@ -439,7 +439,7 @@ instead. Custom Weighting implementations that use the ``final()`` method must now set the ``use_final`` attribute to ``True``:: - from whoosh_reloaded.scoring import BM25F + from whoosh.scoring import BM25F class MyWeighting(BM25F): use_final = True @@ -450,7 +450,7 @@ set the ``use_final`` attribute to ``True``:: This disables the new optimizations, forcing Whoosh to score every matching document. -:class:` whoosh_reloaded.writing.AsyncWriter` now takes an :class:` whoosh_reloaded.index.Index` +:class:`whoosh.writing.AsyncWriter` now takes an :class:`whoosh.index.Index` object as its first argument, not a callable. Also, the keyword arguments to pass to the index's ``writer()`` method should now be passed as a dictionary using the ``writerargs`` keyword argument. @@ -464,11 +464,11 @@ argument representing the default to return if the given document and field do not have a length (i.e. the field is not scored or the field was not provided for the given document). -The :class:` whoosh_reloaded.analysis.StopFilter` now has a ``maxsize`` argument as well +The :class:`whoosh.analysis.StopFilter` now has a ``maxsize`` argument as well as a ``minsize`` argument to its initializer. Analyzers that use the ``StopFilter`` now also have the ``maxsize`` argument in their initializers. -The interface of :class:` whoosh_reloaded.writing.AsyncWriter` has changed. +The interface of :class:`whoosh.writing.AsyncWriter` has changed. Misc diff --git a/docs/source/releases/2_0.rst index a52ebf29..053966fe 100644 --- a/docs/source/releases/2_0.rst +++ b/docs/source/releases/2_0.rst @@ -46,13 +46,13 @@ Whoosh 2.5 * Whoosh now includes pure-Python implementations of the Snowball stemmers and stop word lists for various languages adapted from NLTK. These are available - through the :class:` whoosh_reloaded.analysis.LanguageAnalyzer` analyzer or through the + through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the ``lang=`` keyword argument to the - :class:`~ whoosh_reloaded.fields.TEXT` field. + :class:`~whoosh.fields.TEXT` field. * You can now use the - :meth:` whoosh_reloaded.filedb.filestore.Storage.create()` and - :meth:` whoosh_reloaded.filedb.filestore.Storage.destory()` + :meth:`whoosh.filedb.filestore.Storage.create()` and + :meth:`whoosh.filedb.filestore.Storage.destroy()` methods as a consistent API to set up and tear down different types of storage.
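A minimal sketch of that setup/teardown API using the file-based storage (``myschema`` is an assumed, already-defined schema)::

    from whoosh.filedb.filestore import FileStorage

    st = FileStorage("indexdir").create()  # set up the storage directory
    ix = st.create_index(myschema)
    # ... index and search as usual ...
    st.destroy()                           # tear the storage down again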
@@ -120,13 +120,13 @@ Whoosh 2.3.1 Whoosh 2.3 ========== -* Added a :class:` whoosh_reloaded.query.Regex` term query type, similar to - :class:` whoosh_reloaded.query.Wildcard`. The parser does not allow regex term queries - by default. You need to add the :class:` whoosh_reloaded.qparser.RegexPlugin` plugin. +* Added a :class:`whoosh.query.Regex` term query type, similar to + :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries + by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin. After you add the plugin, you can use ``r"expression"`` query syntax for regular expression term queries. For example, ``r"foo.*bar"``. -* Added the :class:` whoosh_reloaded.qparser.PseudoFieldPlugin` parser plugin. This +* Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This plugin lets you create "pseudo-fields" that run a transform function on whatever query syntax the user applies the field to. This is fairly advanced functionality right now; I'm trying to think of ways to make its power easier @@ -137,10 +137,10 @@ Whoosh 2.3 it much easier to display the "top N" results in each category, for example. * The ``groupids`` keyword argument to ``Searcher.search`` has been removed. -  Instead you can now pass a :class:` whoosh_reloaded.sorting.FacetMap` object to the +  Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the ``Searcher.search`` method's ``maptype`` argument to control how faceted documents are grouped, and/or set the ``maptype`` argument on individual - :class:` whoosh_reloaded.sorting.FacetType`` objects to set custom grouping per facet. + :class:`whoosh.sorting.FacetType` objects to set custom grouping per facet. See :doc:`../facets` for more information. * Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no @@ -169,17 +169,17 @@ Whoosh 2.2 * Fixes several bugs, including a bad bug in BM25F scoring. -* Added ``allow_overlap`` option to :class:` whoosh_reloaded.sorting.StoredFieldFacet`. +* Added ``allow_overlap`` option to :class:`whoosh.sorting.StoredFieldFacet`. -* In :meth:`~ whoosh_reloaded.writing.IndexWriter.add_document`, You can now pass +* In :meth:`~whoosh.writing.IndexWriter.add_document`, you can now pass query-like strings for BOOLEAN and DATETIME fields (e.g. ``boolfield="true"`` and ``dtfield="20101131-16:01"``) as an alternative to actual ``bool`` or ``datetime`` objects. The implementation of this is incomplete: it only works in the default ``filedb`` backend, and if the field is stored, the stored value will be the string, not the parsed object. -* Added :class:` whoosh_reloaded.analysis.CompoundWordFilter` and - :class:` whoosh_reloaded.analysis.TeeFilter`. +* Added :class:`whoosh.analysis.CompoundWordFilter` and + :class:`whoosh.analysis.TeeFilter`. Whoosh 2.1 @@ -204,7 +204,7 @@ Improvements For example, to sort by first name and then score:: - from whoosh_reloaded import sorting + from whoosh import sorting mf = sorting.MultiFacet([sorting.FieldFacet("firstname"), sorting.ScoreFacet()]) @@ -231,15 +231,15 @@ Improvements "spelling index".
You can get suggestions for individual words using - :meth:` whoosh_reloaded.searching.Searcher.suggest`:: + :meth:` whoosh.searching.Searcher.suggest`:: suglist = searcher.suggest("content", "werd", limit=3) Whoosh now includes convenience methods to spell-check and correct user queries, with optional highlighting of corrections using the - `` whoosh_reloaded.highlight`` module:: + `` whoosh.highlight`` module:: - from whoosh_reloaded import highlight, qparser + from whoosh import highlight, qparser # User query string qstring = request.get("q") @@ -268,18 +268,18 @@ Improvements See :doc:`/spelling` for more information. -* :class:` whoosh_reloaded.query.FuzzyTerm` now uses the new word graph feature as well +* :class:` whoosh.query.FuzzyTerm` now uses the new word graph feature as well and so is much faster. * You can now set a boost factor for individual documents as you index them, to increase the score of terms in those documents in searches. See the - documentation for the :meth:`~ whoosh_reloaded.writing.IndexWriter.add_document` for + documentation for the :meth:`~ whoosh.writing.IndexWriter.add_document` for more information. * Added built-in recording of which terms matched in which documents. Use the - ``terms=True`` argument to :meth:` whoosh_reloaded.searching.Searcher.search` and use - :meth:` whoosh_reloaded.searching.Hit.matched_terms` and - :meth:` whoosh_reloaded.searching.Hit.contains_term` to check matched terms. + ``terms=True`` argument to :meth:` whoosh.searching.Searcher.search` and use + :meth:` whoosh.searching.Hit.matched_terms` and + :meth:` whoosh.searching.Hit.contains_term` to check matched terms. * Whoosh now supports whole-term quality optimizations, so for example if the system knows that a UnionMatcher cannot possibly contribute to the "top N" @@ -325,9 +325,9 @@ Compatibility release when it will be replaced by a more robust and useful feature. * Reader iteration methods (``__iter__``, ``iter_from``, ``iter_field``, etc.) - now yield :class:` whoosh_reloaded.reading.TermInfo` objects. + now yield :class:` whoosh.reading.TermInfo` objects. -* The arguments to :class:` whoosh_reloaded.query.FuzzyTerm` changed. +* The arguments to :class:` whoosh.query.FuzzyTerm` changed. diff --git a/docs/source/schema.rst b/docs/source/schema.rst index eb6629a0..043facb5 100644 --- a/docs/source/schema.rst +++ b/docs/source/schema.rst @@ -27,11 +27,11 @@ Built-in field types Whoosh provides some useful predefined field types: -:class:` whoosh_reloaded.fields.TEXT` +:class:` whoosh.fields.TEXT` This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. - ``TEXT`` fields use :class:`~ whoosh_reloaded.analysis.StandardAnalyzer` by default. To specify a different + ``TEXT`` fields use :class:`~ whoosh.analysis.StandardAnalyzer` by default. To specify a different analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. @@ -47,7 +47,7 @@ Whoosh provides some useful predefined field types: circumstances it can be useful (see :doc:`highlight`). Use ``TEXT(stored=True)`` to specify that the text should be stored in the index. -:class:` whoosh_reloaded.fields.KEYWORD` +:class:` whoosh.fields.KEYWORD` This field type is designed for space- or comma-separated keywords. This type is indexed and searchable (and optionally stored). To save space, it does not support phrase searching. 
@@ -61,7 +61,7 @@ Whoosh provides some useful predefined field types: If your users will use the keyword field for searching, use ``scorable=True``. -:class:` whoosh_reloaded.fields.ID` +:class:`whoosh.fields.ID` The ``ID`` field type simply indexes (and optionally stores) the entire value of the field as a single unit (that is, it doesn't break it up into individual terms). This type of field does not store frequency information, so it's @@ -76,23 +76,23 @@ Whoosh provides some useful predefined field types: search results. For example, you would want to store the value of a url field so you could provide links to the original in your search results. -:class:` whoosh_reloaded.fields.STORED` +:class:`whoosh.fields.STORED` This field is stored with the document, but not indexed and not searchable. This is useful for document information you want to display to the user in the search results, but don't need to be able to search for. -:class:` whoosh_reloaded.fields.NUMERIC` +:class:`whoosh.fields.NUMERIC` This field stores int, long, or floating point numbers in a compact, sortable format. -:class:` whoosh_reloaded.fields.DATETIME` +:class:`whoosh.fields.DATETIME` This field stores datetime objects in a compact, sortable format. -:class:` whoosh_reloaded.fields.BOOLEAN` +:class:`whoosh.fields.BOOLEAN` This simple field indexes boolean values and allows users to search for ``yes``, ``no``, ``true``, ``false``, ``1``, ``0``, ``t`` or ``f``. -:class:` whoosh_reloaded.fields.NGRAM` +:class:`whoosh.fields.NGRAM` TBD. Expert users can create their own field types. @@ -103,8 +103,8 @@ Creating a Schema To create a schema:: - from whoosh_reloaded.fields import Schema, TEXT, KEYWORD, ID, STORED - from whoosh_reloaded.analysis import StemmingAnalyzer + from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED + from whoosh.analysis import StemmingAnalyzer schema = Schema(from_addr=ID(stored=True), to_addr=ID(stored=True), @@ -119,7 +119,7 @@ of ``fieldname=TEXT()``). Whoosh will instantiate the class for you. Alternatively you can create a schema declaratively using the ``SchemaClass`` base class:: - from whoosh_reloaded.fields import SchemaClass, TEXT, KEYWORD, ID, STORED + from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED class MySchema(SchemaClass): path = ID(stored=True) @@ -127,9 +127,9 @@ base class:: content = TEXT tags = KEYWORD -You can pass a declarative class to :func:`~ whoosh_reloaded.index.create_in` or -:meth:`~ whoosh_reloaded.store.Storage.create_index()` instead of a -:class:`~ whoosh_reloaded.fields.Schema` instance. +You can pass a declarative class to :func:`~whoosh.index.create_in` or +:meth:`~whoosh.store.Storage.create_index()` instead of a +:class:`~whoosh.fields.Schema` instance. Modifying the schema after indexing @@ -264,7 +264,7 @@ stored bool If True, the value of this field is stored unique bool If True, the value of this field may be used to replace documents with the same value when the user calls - :meth:`~ whoosh_reloaded.writing.IndexWriter.document_update` + :meth:`~whoosh.writing.IndexWriter.update_document` on an ``IndexWriter``. ============ =============== ====================================================== diff --git a/docs/source/searching.rst index 633ded55..603244a4 100644 --- a/docs/source/searching.rst +++ b/docs/source/searching.rst @@ -8,7 +8,7 @@ documents.
The ``Searcher`` object ======================= -To get a :class:` whoosh_reloaded.searching.Searcher` object, call ``searcher()`` on your +To get a :class:` whoosh.searching.Searcher` object, call ``searcher()`` on your ``Index`` object:: searcher = myindex.searcher() @@ -36,14 +36,14 @@ has lots of useful methods for getting information about the index, such as :: >>> list(searcher.lexicon("content")) - [u"document", u"index", u" whoosh_reloaded"] + [u"document", u"index", u" whoosh"] However, the most important method on the ``Searcher`` object is -:meth:`~ whoosh_reloaded.searching.Searcher.search`, which takes a -:class:` whoosh_reloaded.query.Query` object and returns a -:class:`~ whoosh_reloaded.searching.Results` object:: +:meth:`~ whoosh.searching.Searcher.search`, which takes a +:class:` whoosh.query.Query` object and returns a +:class:`~ whoosh.searching.Results` object:: - from whoosh_reloaded.qparser import QueryParser + from whoosh.qparser import QueryParser qp = QueryParser("content", schema=myindex.schema) q = qp.parse(u"hello world") @@ -75,7 +75,7 @@ to set a different page length:: Results object ============== -The :class:`~ whoosh_reloaded.searching.Results` object acts like a list of the matched +The :class:`~ whoosh.searching.Results` object acts like a list of the matched documents. You can use it to access the stored fields of each hit document, to display to the user. @@ -107,9 +107,9 @@ Calling ``len(Results)`` runs a fast (unscored) version of the query again to figure out the total number of matching documents. This is usually very fast but for large indexes it can cause a noticeable delay. If you want to avoid this delay on very large indexes, you can use the -:meth:`~ whoosh_reloaded.searching.Results.has_exact_length`, -:meth:`~ whoosh_reloaded.searching.Results.estimated_length`, and -:meth:`~ whoosh_reloaded.searching.Results.estimated_min_length` methods to estimate the +:meth:`~ whoosh.searching.Results.has_exact_length`, +:meth:`~ whoosh.searching.Results.estimated_length`, and +:meth:`~ whoosh.searching.Results.estimated_min_length` methods to estimate the number of matching documents without calling ``len()``:: found = results.scored_length() @@ -129,18 +129,18 @@ Scoring ------- Normally the list of result documents is sorted by *score*. The -:mod:` whoosh_reloaded.scoring` module contains implementations of various scoring -algorithms. The default is :class:`~ whoosh_reloaded.scoring.BM25F`. +:mod:` whoosh.scoring` module contains implementations of various scoring +algorithms. The default is :class:`~ whoosh.scoring.BM25F`. You can set the scoring object to use when you create the searcher using the ``weighting`` keyword argument:: - from whoosh_reloaded import scoring + from whoosh import scoring with myindex.searcher(weighting=scoring.TF_IDF()) as s: ... -A weighting model is a :class:`~ whoosh_reloaded.scoring.WeightingModel` subclass with a +A weighting model is a :class:`~ whoosh.scoring.WeightingModel` subclass with a ``scorer()`` method that produces a "scorer" instance. This instance has a method that takes the current matcher and returns a floating point score. @@ -161,7 +161,7 @@ Filtering results You can use the ``filter`` keyword argument to ``search()`` to specify a set of documents to permit in the results. 
The argument can be a -:class:` whoosh_reloaded.query.Query` object, a :class:` whoosh_reloaded.searching.Results` object, +:class:` whoosh.query.Query` object, a :class:` whoosh.searching.Results` object, or a set-like object containing document numbers. The searcher caches filters so if for example you use the same query filter with a searcher multiple times, the additional searches will be faster because the searcher will cache the @@ -210,7 +210,7 @@ search record which terms in the query matched which documents:: results = s.seach(myquery, terms=True) You can then get information about which terms matched from the -:class:` whoosh_reloaded.searching.Results` and :class:` whoosh_reloaded.searching.Hit` objects:: +:class:` whoosh.searching.Results` and :class:` whoosh.searching.Hit` objects:: # Was this results object created with terms=True? if results.has_matched_terms(): @@ -254,7 +254,7 @@ See :doc:`/facets` for information on facets. print(results.collapsed_counts) Collapsing works with both scored and sorted results. You can use any of the -facet types available in the :mod:` whoosh_reloaded.sorting` module. +facet types available in the :mod:` whoosh.sorting` module. By default, Whoosh uses the results order (score or sort key) to determine the documents to collapse. For example, in scored results, the best scoring @@ -264,7 +264,7 @@ to control which documents to keep when collapsing. For example, in a product search you could display results sorted by decreasing price, and eliminate all but the highest rated item of each product type:: - from whoosh_reloaded import sorting + from whoosh import sorting with myindex.searcher() as s: price_facet = sorting.FieldFacet("price", reverse=True) @@ -284,8 +284,8 @@ already-collected documents. Since this collector must sometimes go back and remove already-collected documents, if you use it in combination with -:class:`~ whoosh_reloaded.collectors.TermsCollector` and/or -:class:`~ whoosh_reloaded.collectors.FacetCollector`, those collectors may contain +:class:`~ whoosh.collectors.TermsCollector` and/or +:class:`~ whoosh.collectors.FacetCollector`, those collectors may contain information about documents that were filtered out of the final results by collapsing. @@ -295,7 +295,7 @@ Time limited searches To limit the amount of time a search can take:: - from whoosh_reloaded.collectors import TimeLimitCollector, TimeLimit + from whoosh.collectors import TimeLimitCollector, TimeLimit with myindex.searcher() as s: # Get a collector object @@ -316,8 +316,8 @@ To limit the amount of time a search can take:: Convenience methods =================== -The :meth:`~ whoosh_reloaded.searching.Searcher.document` and -:meth:`~ whoosh_reloaded.searching.Searcher.documents` methods on the ``Searcher`` object let +The :meth:`~ whoosh.searching.Searcher.document` and +:meth:`~ whoosh.searching.Searcher.documents` methods on the ``Searcher`` object let you retrieve the stored fields of documents matching terms you pass in keyword arguments. @@ -343,7 +343,7 @@ Combining Results objects ========================= It is sometimes useful to use the results of another query to influence the -order of a :class:` whoosh_reloaded.searching.Results` object. +order of a :class:` whoosh.searching.Results` object. For example, you might have a "best bet" field. This field contains hand-picked keywords for documents. 
When the user searches for those keywords, you want diff --git a/docs/source/spelling.rst index 422c6276..36fbb777 100644 --- a/docs/source/spelling.rst +++ b/docs/source/spelling.rst @@ -14,7 +14,7 @@ mis-typed word:: for mistyped_word in mistyped_words: print corrector.suggest(mistyped_word, limit=3) -See the :meth:` whoosh_reloaded.spelling.Corrector.suggest` method documentation +See the :meth:`whoosh.spelling.Corrector.suggest` method documentation for information on the arguments. Currently the suggestion engine is more like a "typo corrector" than a @@ -40,7 +40,7 @@ unmodified versions of the terms for spelling suggestions:: ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=TEXT(analyzer=ana, spelling=True)) -You can then use the :meth:` whoosh_reloaded.searching.Searcher.corrector` method +You can then use the :meth:`whoosh.searching.Searcher.corrector` method to get a corrector for a field:: corrector = searcher.corrector("content") @@ -61,9 +61,9 @@ populate the spelling dictionary. (In the following examples, ``word_list`` can be a list of unicode strings, or a file object with one word on each line.) -To create a :class:` whoosh_reloaded.spelling.Corrector` object from a sorted word list:: +To create a :class:`whoosh.spelling.Corrector` object from a sorted word list:: - from whoosh_reloaded.spelling import ListCorrector + from whoosh.spelling import ListCorrector # word_list must be a sorted list of unicode strings corrector = ListCorrector(word_list) @@ -74,7 +74,7 @@ Merging two or more correctors You can combine suggestions from two sources (for example, the contents of an index field and a word list) using a -:class:` whoosh_reloaded.spelling.MultiCorrector`:: +:class:`whoosh.spelling.MultiCorrector`:: c1 = searcher.corrector("content") c2 = spelling.ListCorrector(word_list) @@ -85,9 +85,9 @@ Correcting user queries ======================= You can spell-check a user query using the -:meth:` whoosh_reloaded.searching.Searcher.correct_query` method:: +:meth:`whoosh.searching.Searcher.correct_query` method:: - from whoosh_reloaded import qparser + from whoosh import qparser # Parse the user query string qp = qparser.QueryParser("content", myindex.schema) @@ -103,7 +103,7 @@ The ``correct_query`` method returns an object with the following attributes: ``query`` - A corrected :class:` whoosh_reloaded.query.Query` tree. You can test + A corrected :class:`whoosh.query.Query` tree. You can test whether this is equal (``==``) to the original parsed query to check if the corrector actually changed anything. @@ -115,16 +115,16 @@ attributes: terms. You can use this to reformat the user query (see below). -You can use a :class:` whoosh_reloaded.highlight.Formatter` object to format the +You can use a :class:`whoosh.highlight.Formatter` object to format the corrected query string. For example, use the -:class:`~ whoosh_reloaded.highlight.HtmlFormatter` to format the corrected string +:class:`~whoosh.highlight.HtmlFormatter` to format the corrected string as HTML:: - from whoosh_reloaded import highlight + from whoosh import highlight hf = highlight.HtmlFormatter() corrected = s.correct_query(q, qstring, formatter=hf) See the documentation for -:meth:` whoosh_reloaded.searching.Searcher.correct_query` for information on the +:meth:`whoosh.searching.Searcher.correct_query` for information on the defaults and arguments.
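Putting the pieces above together, a minimal "did you mean" flow might look like this (variable names as in the earlier snippets)::

    corrected = s.correct_query(q, qstring)
    if corrected.query != q:
        print("Did you mean %s?" % corrected.string)
        results = s.search(corrected.query)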
diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst index 8572b5d4..e88c66b7 100644 --- a/docs/source/stemming.rst +++ b/docs/source/stemming.rst @@ -15,7 +15,7 @@ the original text but not in the user's query, or vice versa. For example, we want the user to be able to search for ``cafe`` and find documents containing ``café``. -The default analyzer for the :class:` whoosh_reloaded.fields.TEXT` field does not do +The default analyzer for the :class:` whoosh.fields.TEXT` field does not do stemming or accent folding. @@ -29,7 +29,7 @@ and Lovins. :: - >>> from whoosh_reloaded.lang.porter import stem + >>> from whoosh.lang.porter import stem >>> stem("rendering") 'render' @@ -39,7 +39,7 @@ to words in user queries. So in theory all variations of a root word ("render", index, saving space. And all possible variations users might use in a query are reduced to the root, so stemming enhances "recall". -The :class:` whoosh_reloaded.analysis.StemFilter` lets you add a stemming filter to an +The :class:` whoosh.analysis.StemFilter` lets you add a stemming filter to an analyzer chain. :: @@ -50,11 +50,11 @@ analyzer chain. >>> [token.text for token in stemmer(stream)] [u"fundament", u"willow"] -The :func:` whoosh_reloaded.analysis.StemmingAnalyzer` is a pre-packaged analyzer that +The :func:` whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: - from whoosh_reloaded import fields - from whoosh_reloaded.analysis import StemmingAnalyzer + from whoosh import fields + from whoosh.analysis import StemmingAnalyzer stem_ana = StemmingAnalyzer() schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), @@ -86,7 +86,7 @@ variations of the word. :: - >>> from whoosh_reloaded.lang.morph_en import variations + >>> from whoosh.lang.morph_en import variations >>> variations("rendered") set(['rendered', 'rendernesses', 'render', 'renderless', 'rendering', 'renderness', 'renderes', 'renderer', 'renderements', 'rendereless', @@ -98,8 +98,8 @@ Many of the generated variations for a given word will not be valid words, but it's fairly fast for Whoosh to check which variations are actually in the index and only search for those. -The :class:` whoosh_reloaded.query.Variations` query object lets you search for variations -of a word. Whereas the normal :class:` whoosh_reloaded.query.Term` object only searches +The :class:` whoosh.query.Variations` query object lets you search for variations +of a word. Whereas the normal :class:` whoosh.query.Term` object only searches for the given term, the ``Variations`` query acts like an ``Or`` query for the variations of the given word in the index. For example, the query:: @@ -110,11 +110,11 @@ variations of the given word in the index. 
For example, the query:: query.Or([query.Term("content", "render"), query.Term("content", "rendered"), query.Term("content", "renders"), query.Term("content", "rendering")]) -To have the query parser use :class:` whoosh_reloaded.query.Variations` instead of -:class:` whoosh_reloaded.query.Term` for individual terms, use the ``termclass`` +To have the query parser use :class:` whoosh.query.Variations` instead of +:class:` whoosh.query.Term` for individual terms, use the ``termclass`` keyword argument to the parser initialization method:: - from whoosh_reloaded import qparser, query + from whoosh import qparser, query qp = qparser.QueryParser("content", termclass=query.Variations) @@ -142,7 +142,7 @@ language models, often involving analysis of the surrounding context and part-of-speech tagging. Whoosh does not include any lemmatization functions, but if you have separate -lemmatizing code you could write a custom :class:` whoosh_reloaded.analysis.Filter` +lemmatizing code you could write a custom :class:` whoosh.analysis.Filter` to integrate it into a Whoosh analyzer. @@ -165,14 +165,14 @@ http://www.alistapart.com/articles/accent-folding-for-auto-complete/ Whoosh includes several mechanisms for adding character folding to an analyzer. -The :class:` whoosh_reloaded.analysis.CharsetFilter` applies a character map to token +The :class:` whoosh.analysis.CharsetFilter` applies a character map to token text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to ``u'cafe', u'resume', ...``. This is usually the method you'll want to use unless you need to use a charset to tokenize terms:: - from whoosh_reloaded.analysis import CharsetFilter, StemmingAnalyzer - from whoosh_reloaded import fields - from whoosh_reloaded.support.charset import accent_map + from whoosh.analysis import CharsetFilter, StemmingAnalyzer + from whoosh import fields + from whoosh.support.charset import accent_map # For example, to add an accent-folding filter to a stemming analyzer: my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) @@ -180,22 +180,22 @@ unless you need to use a charset to tokenize terms:: # To use this analyzer in your schema: my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) -The :class:` whoosh_reloaded.analysis.CharsetTokenizer` uses a Sphinx charset table to +The :class:` whoosh.analysis.CharsetTokenizer` uses a Sphinx charset table to both separate terms and perform character folding. This tokenizer is slower -than the :class:` whoosh_reloaded.analysis.RegexTokenizer` because it loops over each +than the :class:` whoosh.analysis.RegexTokenizer` because it loops over each character in Python. If the language(s) you're indexing can be tokenized using regular expressions, it will be much faster to use ``RegexTokenizer`` and ``CharsetFilter`` in combination instead of using ``CharsetTokenizer``. 
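For example, a sketch of that faster combination, using the same ``accent_map`` as above::

    from whoosh.analysis import RegexTokenizer, LowercaseFilter, CharsetFilter
    from whoosh.support.charset import accent_map

    # Tokenize with a regular expression, then fold accents in a filter
    ana = RegexTokenizer() | LowercaseFilter() | CharsetFilter(accent_map)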
-The :mod:` whoosh_reloaded.support.charset` module contains an accent folding map useful +The :mod:`whoosh.support.charset` module contains an accent folding map useful for most Western languages, as well as a much more extensive Sphinx charset table and a function to convert Sphinx charset tables into the character maps required by ``CharsetTokenizer`` and ``CharsetFilter``:: # To create a filter using an enormous character map for most languages # generated from a Sphinx charset table - from whoosh_reloaded.analysis import CharsetFilter - from whoosh_reloaded.support.charset import default_charset, charset_table_to_dict + from whoosh.analysis import CharsetFilter + from whoosh.support.charset import default_charset, charset_table_to_dict charmap = charset_table_to_dict(default_charset) my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) diff --git a/docs/source/tech/backend.rst index 467367c9..bb3bf2dc 100644 --- a/docs/source/tech/backend.rst +++ b/docs/source/tech/backend.rst @@ -5,171 +5,171 @@ How to implement a new backend Index ===== -* Subclass :class:` whoosh_reloaded.index.Index`. +* Subclass :class:`whoosh.index.Index`. * Indexes must implement the following methods. - * :meth:` whoosh_reloaded.index.Index.is_empty` + * :meth:`whoosh.index.Index.is_empty` - * :meth:` whoosh_reloaded.index.Index.doc_count` + * :meth:`whoosh.index.Index.doc_count` - * :meth:` whoosh_reloaded.index.Index.reader` + * :meth:`whoosh.index.Index.reader` - * :meth:` whoosh_reloaded.index.Index.writer` + * :meth:`whoosh.index.Index.writer` * Indexes that require/support locking must implement the following methods. - * :meth:` whoosh_reloaded.index.Index.lock` + * :meth:`whoosh.index.Index.lock` - * :meth:` whoosh_reloaded.index.Index.unlock` + * :meth:`whoosh.index.Index.unlock` * Indexes that support deletion must implement the following methods. - * :meth:` whoosh_reloaded.index.Index.delete_document` + * :meth:`whoosh.index.Index.delete_document` - * :meth:` whoosh_reloaded.index.Index.doc_count_all` -- if the backend has delayed + * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed deletion. * Indexes that require/support versioning/transactions *may* implement the following methods. - * :meth:` whoosh_reloaded.index.Index.latest_generation` + * :meth:`whoosh.index.Index.latest_generation` - * :meth:` whoosh_reloaded.index.Index.up_to_date` + * :meth:`whoosh.index.Index.up_to_date` - * :meth:` whoosh_reloaded.index.Index.last_modified` + * :meth:`whoosh.index.Index.last_modified` * Index *may* implement the following methods (the base class's versions are no-ops). - * :meth:` whoosh_reloaded.index.Index.optimize` + * :meth:`whoosh.index.Index.optimize` - * :meth:` whoosh_reloaded.index.Index.close` + * :meth:`whoosh.index.Index.close` IndexWriter =========== -* Subclass :class:` whoosh_reloaded.writing.IndexWriter`. +* Subclass :class:`whoosh.writing.IndexWriter`. * IndexWriters must implement the following methods. - * :meth:` whoosh_reloaded.writing.IndexWriter.add_document` + * :meth:`whoosh.writing.IndexWriter.add_document` - * :meth:` whoosh_reloaded.writing.IndexWriter.add_reader` + * :meth:`whoosh.writing.IndexWriter.add_reader` * Backends that support deletion must implement the following methods. - * :meth:` whoosh_reloaded.writing.IndexWriter.delete_document` + * :meth:`whoosh.writing.IndexWriter.delete_document` * IndexWriters that work as transactions must implement the following methods.
- * :meth:` whoosh_reloaded.reading.IndexWriter.commit` -- Save the additions/deletions done with + * :meth:` whoosh.reading.IndexWriter.commit` -- Save the additions/deletions done with this IndexWriter to the main index, and release any resources used by the IndexWriter. - * :meth:` whoosh_reloaded.reading.IndexWriter.cancel` -- Throw away any additions/deletions done + * :meth:` whoosh.reading.IndexWriter.cancel` -- Throw away any additions/deletions done with this IndexWriter, and release any resources used by the IndexWriter. IndexReader =========== -* Subclass :class:` whoosh_reloaded.reading.IndexReader`. +* Subclass :class:` whoosh.reading.IndexReader`. * IndexReaders must implement the following methods. - * :meth:` whoosh_reloaded.reading.IndexReader.__contains__` + * :meth:` whoosh.reading.IndexReader.__contains__` - * :meth:` whoosh_reloaded.reading.IndexReader.__iter__` + * :meth:` whoosh.reading.IndexReader.__iter__` - * :meth:` whoosh_reloaded.reading.IndexReader.iter_from` + * :meth:` whoosh.reading.IndexReader.iter_from` - * :meth:` whoosh_reloaded.reading.IndexReader.stored_fields` + * :meth:` whoosh.reading.IndexReader.stored_fields` - * :meth:` whoosh_reloaded.reading.IndexReader.doc_count_all` + * :meth:` whoosh.reading.IndexReader.doc_count_all` - * :meth:` whoosh_reloaded.reading.IndexReader.doc_count` + * :meth:` whoosh.reading.IndexReader.doc_count` - * :meth:` whoosh_reloaded.reading.IndexReader.doc_field_length` + * :meth:` whoosh.reading.IndexReader.doc_field_length` - * :meth:` whoosh_reloaded.reading.IndexReader.field_length` + * :meth:` whoosh.reading.IndexReader.field_length` - * :meth:` whoosh_reloaded.reading.IndexReader.max_field_length` + * :meth:` whoosh.reading.IndexReader.max_field_length` - * :meth:` whoosh_reloaded.reading.IndexReader.postings` + * :meth:` whoosh.reading.IndexReader.postings` - * :meth:` whoosh_reloaded.reading.IndexReader.has_vector` + * :meth:` whoosh.reading.IndexReader.has_vector` - * :meth:` whoosh_reloaded.reading.IndexReader.vector` + * :meth:` whoosh.reading.IndexReader.vector` - * :meth:` whoosh_reloaded.reading.IndexReader.doc_frequency` + * :meth:` whoosh.reading.IndexReader.doc_frequency` - * :meth:` whoosh_reloaded.reading.IndexReader.frequency` + * :meth:` whoosh.reading.IndexReader.frequency` * Backends that support deleting documents should implement the following methods. - * :meth:` whoosh_reloaded.reading.IndexReader.has_deletions` - * :meth:` whoosh_reloaded.reading.IndexReader.is_deleted` + * :meth:` whoosh.reading.IndexReader.has_deletions` + * :meth:` whoosh.reading.IndexReader.is_deleted` * Backends that support versioning should implement the following methods. - * :meth:` whoosh_reloaded.reading.IndexReader.generation` + * :meth:` whoosh.reading.IndexReader.generation` * If the IndexReader object does not keep the schema in the ``self.schema`` attribute, it needs to override the following methods. - * :meth:` whoosh_reloaded.reading.IndexReader.field` + * :meth:` whoosh.reading.IndexReader.field` - * :meth:` whoosh_reloaded.reading.IndexReader.field_names` + * :meth:` whoosh.reading.IndexReader.field_names` - * :meth:` whoosh_reloaded.reading.IndexReader.scorable_names` + * :meth:` whoosh.reading.IndexReader.scorable_names` - * :meth:` whoosh_reloaded.reading.IndexReader.vector_names` + * :meth:` whoosh.reading.IndexReader.vector_names` * IndexReaders *may* implement the following methods. 
-  * :meth:` whoosh_reloaded.reading.DocReader.close` -- closes any open resources associated with the
+  * :meth:`whoosh.reading.IndexReader.close` -- closes any open resources associated with the
    reader.


 Matcher
 =======

-The :meth:` whoosh_reloaded.reading.IndexReader.postings` method returns a
-:class:` whoosh_reloaded.matching.Matcher` object. You will probably need to implement
+The :meth:`whoosh.reading.IndexReader.postings` method returns a
+:class:`whoosh.matching.Matcher` object. You will probably need to implement
 a custom Matcher class for reading from your posting lists.

-* Subclass :class:` whoosh_reloaded.matching.Matcher`.
+* Subclass :class:`whoosh.matching.Matcher`.

 * Implement the following methods at minimum.

-  * :meth:` whoosh_reloaded.matching.Matcher.is_active`
+  * :meth:`whoosh.matching.Matcher.is_active`

-  * :meth:` whoosh_reloaded.matching.Matcher.copy`
+  * :meth:`whoosh.matching.Matcher.copy`

-  * :meth:` whoosh_reloaded.matching.Matcher.id`
+  * :meth:`whoosh.matching.Matcher.id`

-  * :meth:` whoosh_reloaded.matching.Matcher.next`
+  * :meth:`whoosh.matching.Matcher.next`

-  * :meth:` whoosh_reloaded.matching.Matcher.value`
+  * :meth:`whoosh.matching.Matcher.value`

-  * :meth:` whoosh_reloaded.matching.Matcher.value_as`
+  * :meth:`whoosh.matching.Matcher.value_as`

-  * :meth:` whoosh_reloaded.matching.Matcher.score`
+  * :meth:`whoosh.matching.Matcher.score`

 * Depending on the implementation, you *may* implement the following methods
  more efficiently.

-  * :meth:` whoosh_reloaded.matching.Matcher.skip_to`
+  * :meth:`whoosh.matching.Matcher.skip_to`

-  * :meth:` whoosh_reloaded.matching.Matcher.weight`
+  * :meth:`whoosh.matching.Matcher.weight`

 * If the implementation supports quality, you should implement the following
  methods.

-  * :meth:` whoosh_reloaded.matching.Matcher.supports_quality`
+  * :meth:`whoosh.matching.Matcher.supports_quality`

-  * :meth:` whoosh_reloaded.matching.Matcher.quality`
+  * :meth:`whoosh.matching.Matcher.quality`

-  * :meth:` whoosh_reloaded.matching.Matcher.block_quality`
+  * :meth:`whoosh.matching.Matcher.block_quality`

-  * :meth:` whoosh_reloaded.matching.Matcher.skip_to_quality`
+  * :meth:`whoosh.matching.Matcher.skip_to_quality`
diff --git a/docs/source/tech/filedb.rst b/docs/source/tech/filedb.rst
index 53549067..0fe22be7 100644
--- a/docs/source/tech/filedb.rst
+++ b/docs/source/tech/filedb.rst
@@ -10,7 +10,7 @@ Files created
 .toc
     The "master" file containing information about the index and its segments.

-The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, whoosh_reloaded creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments whoosh_reloaded will merge them into larger segments or a single segment.
+The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, Whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments, Whoosh will merge them into larger segments or a single segment.

 .dci
     Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents.
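As a concrete companion to the ``Matcher`` checklist in ``backend.rst`` above, the following is a minimal sketch of a matcher that reads from an in-memory posting list. ``ListPostingMatcher`` and its constructor arguments are illustrative names only, not part of the library; a real backend would pull postings from its own storage format and would usually also override ``skip_to`` and ``weight`` for speed::

    from whoosh.matching import Matcher

    class ListPostingMatcher(Matcher):
        # Illustrative only: "postings" is a list of (docnum, valuebytes)
        # pairs sorted by document number.
        def __init__(self, postings, i=0):
            self._postings = postings
            self._i = i

        def is_active(self):
            # Active while there are postings left to read
            return self._i < len(self._postings)

        def copy(self):
            return ListPostingMatcher(self._postings, self._i)

        def id(self):
            # Document number of the current posting
            return self._postings[self._i][0]

        def next(self):
            # Advance to the next posting
            self._i += 1

        def value(self):
            # Raw (encoded) value of the current posting
            return self._postings[self._i][1]

        def value_as(self, astype):
            # A real matcher would decode the stored value bytes here
            return self.value()

        def score(self):
            # A real matcher would delegate to its scorer
            return 1.0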
diff --git a/docs/source/threads.rst b/docs/source/threads.rst
index 712600b0..0b45a643 100644
--- a/docs/source/threads.rst
+++ b/docs/source/threads.rst
@@ -24,12 +24,12 @@ Locking

 Only one thread/process can write to an index at a time. When you open a writer,
 it locks the index. If you try to open a writer on the same index in another
-thread/process, it will raise `` whoosh_reloaded.store.LockError``.
+thread/process, it will raise ``whoosh.store.LockError``.

 In a multi-threaded or multi-process environment your code needs to be aware
 that opening a writer may raise this exception if a writer is already open.
 Whoosh includes a couple of example implementations
-(:class:` whoosh_reloaded.writing.AsyncWriter` and :class:` whoosh_reloaded.writing.BufferedWriter`)
+(:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`)
 of ways to work around the write lock.

 While the writer is open and during the commit, **the index is still available
@@ -56,9 +56,9 @@ always sees the index as it existed when the reader was opened.

 If you are re-using a Searcher across multiple search requests, you can check
 whether the Searcher is a view of the latest version of the index using
-:meth:` whoosh_reloaded.searching.Searcher.up_to_date`. If the searcher is not up to date,
+:meth:`whoosh.searching.Searcher.up_to_date`. If the searcher is not up to date,
 you can get an up-to-date copy of the searcher using
-:meth:` whoosh_reloaded.searching.Searcher.refresh`::
+:meth:`whoosh.searching.Searcher.refresh`::

     # If 'searcher' is not up-to-date, replace it
     searcher = searcher.refresh()
diff --git a/files/whoosh_small.svg b/files/whoosh_small.svg
index 7b942e0a..0d967b97 100644
--- a/files/whoosh_small.svg
+++ b/files/whoosh_small.svg
@@ -15,10 +15,10 @@
     sodipodi:version="0.32"
     inkscape:version="0.46"
     sodipodi:docbase="e:\dev_clean\src\houdini\support\icons\misc"
-    sodipodi:docname=" whoosh_reloaded_small.svg"
+    sodipodi:docname="whoosh_small.svg"
     version="1.0"
     inkscape:output_extension="org.inkscape.output.svg.inkscape"
-    inkscape:export-filename="C:\Documents and Settings\matt\Desktop\ whoosh_reloaded_small.png"
+    inkscape:export-filename="C:\Documents and Settings\matt\Desktop\whoosh_small.png"
     inkscape:export-xdpi="90"
     inkscape:export-ydpi="90">
diff --git a/src/whoosh_reloaded/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py
     >>> [token.text for token in ana("Por el mar corren las liebres")]
     ['mar', 'corr', 'liebr']

-    The list of available languages is in `whoosh_reloaded.lang.languages`.
-    You can use :func:`whoosh_reloaded.lang.has_stemmer` and
-    :func:`whoosh_reloaded.lang.has_stopwords` to check if a given language has a
+    The list of available languages is in `whoosh.lang.languages`.
+    You can use :func:`whoosh.lang.has_stemmer` and
+    :func:`whoosh.lang.has_stopwords` to check if a given language has a
     stemming function and/or stop word list available.

     :param expression: The regular expression pattern to use to extract tokens.
@@ -296,7 +296,7 @@ def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=500
     use.
""" - from whoosh_reloaded.lang import NoStemmer, NoStopWords + from whoosh.lang import NoStemmer, NoStopWords # Make the start of the chain chain = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter() diff --git a/src/whoosh_reloaded/analysis/filters.py b/src/whoosh/analysis/filters.py similarity index 95% rename from src/whoosh_reloaded/analysis/filters.py rename to src/whoosh/analysis/filters.py index b542943d..110e6ba3 100644 --- a/src/whoosh_reloaded/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -29,9 +29,9 @@ from itertools import chain -from whoosh_reloaded.compat import next -from whoosh_reloaded.analysis.acore import Composable -from whoosh_reloaded.util.text import rcompile +from whoosh.compat import next +from whoosh.analysis.acore import Composable +from whoosh.util.text import rcompile # Default list of stop words (words so common it's usually wasteful to index @@ -134,14 +134,14 @@ class LoggingFilter(Filter): def __init__(self, logger=None): """ - :param target: the logger to use. If omitted, the "whoosh_reloaded.analysis" + :param target: the logger to use. If omitted, the "whoosh.analysis" logger is used. """ if logger is None: import logging - logger = logging.getLogger("whoosh_reloaded.analysis") + logger = logging.getLogger("whoosh.analysis") self.logger = logger def __call__(self, tokens): @@ -291,8 +291,8 @@ class StopFilter(Filter): >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] ["lapiz", "mesa"] - The list of available languages is in `whoosh_reloaded.lang.languages`. - You can use :func:`whoosh_reloaded.lang.has_stopwords` to check if a given language + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stopwords` to check if a given language has a stop word list available. """ @@ -317,7 +317,7 @@ def __init__( if stoplist: stops.update(stoplist) if lang: - from whoosh_reloaded.lang import stopwords_for_language + from whoosh.lang import stopwords_for_language stops.update(stopwords_for_language(lang)) @@ -371,9 +371,9 @@ class CharsetFilter(Filter): supplied character mapping object. This is useful for case and accent folding. - The ``whoosh_reloaded.support.charset`` module has a useful map for accent folding. + The ``whoosh.support.charset`` module has a useful map for accent folding. - >>> from whoosh_reloaded.support.charset import accent_map + >>> from whoosh.support.charset import accent_map >>> retokenizer = RegexTokenizer() >>> chfilter = CharsetFilter(accent_map) >>> [t.text for t in chfilter(retokenizer(u'café'))] @@ -381,10 +381,10 @@ class CharsetFilter(Filter): Another way to get a character mapping object is to convert a Sphinx charset table file using - :func:`whoosh_reloaded.support.charset.charset_table_to_dict`. + :func:`whoosh.support.charset.charset_table_to_dict`. 
- >>> from whoosh_reloaded.support.charset import charset_table_to_dict - >>> from whoosh_reloaded.support.charset import default_charset + >>> from whoosh.support.charset import charset_table_to_dict + >>> from whoosh.support.charset import default_charset >>> retokenizer = RegexTokenizer() >>> charmap = charset_table_to_dict(default_charset) >>> chfilter = CharsetFilter(charmap) diff --git a/src/whoosh_reloaded/analysis/intraword.py b/src/whoosh/analysis/intraword.py similarity index 98% rename from src/whoosh_reloaded/analysis/intraword.py rename to src/whoosh/analysis/intraword.py index f2602aa0..85355f11 100644 --- a/src/whoosh_reloaded/analysis/intraword.py +++ b/src/whoosh/analysis/intraword.py @@ -28,9 +28,9 @@ import re from collections import deque -from whoosh_reloaded.compat import u, text_type -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.analysis.filters import Filter +from whoosh.compat import u, text_type +from whoosh.compat import range +from whoosh.analysis.filters import Filter class CompoundWordFilter(Filter): @@ -71,7 +71,7 @@ def subwords(self, s, memo): if s in memo: return memo[s] - for i in xrange(1, len(s)): + for i in range(1, len(s)): prefix = s[:i] if prefix in self.wordset: suffix = s[i:] @@ -307,7 +307,7 @@ def __init__( additional token with the same position as the last subword. """ - from whoosh_reloaded.support.unicode import digits, lowercase, uppercase + from whoosh.support.unicode import digits, lowercase, uppercase self.delims = re.escape(delims) diff --git a/src/whoosh_reloaded/analysis/morph.py b/src/whoosh/analysis/morph.py similarity index 93% rename from src/whoosh_reloaded/analysis/morph.py rename to src/whoosh/analysis/morph.py index 23b22fc7..50d5a631 100644 --- a/src/whoosh_reloaded/analysis/morph.py +++ b/src/whoosh/analysis/morph.py @@ -25,11 +25,11 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.analysis.filters import Filter -from whoosh_reloaded.compat import integer_types -from whoosh_reloaded.lang.dmetaphone import double_metaphone -from whoosh_reloaded.lang.porter import stem -from whoosh_reloaded.util.cache import lfu_cache, unbound_cache +from whoosh.analysis.filters import Filter +from whoosh.compat import integer_types +from whoosh.lang.dmetaphone import double_metaphone +from whoosh.lang.porter import stem +from whoosh.util.cache import lfu_cache, unbound_cache class StemFilter(Filter): @@ -52,8 +52,8 @@ class StemFilter(Filter): >>> stemfilter = StemFilter(lang="ru") - The list of available languages is in `whoosh_reloaded.lang.languages`. - You can use :func:`whoosh_reloaded.lang.has_stemmer` to check if a given language has + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stemmer` to check if a given language has a stemming function available. By default, this class wraps an LRU cache around the stemming function. The @@ -74,7 +74,7 @@ def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): """ :param stemfn: the function to use for stemming. :param lang: if not None, overrides the stemfn with a language stemmer - from the ``whoosh_reloaded.lang.snowball`` package. + from the ``whoosh.lang.snowball`` package. :param ignore: a set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. 
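The ``StemFilter`` documentation above describes two ways of choosing a stemmer: the default Porter function or a Snowball stemmer selected by language code. A rough usage sketch (the chains here are assembled by hand for illustration; ``whoosh.analysis.StemmingAnalyzer`` bundles a similar chain for you)::

    from whoosh import analysis

    # Default: the Porter algorithm with an LRU cache in front of it
    ana_en = (analysis.RegexTokenizer()
              | analysis.LowercaseFilter()
              | analysis.StemFilter())

    # A Snowball language stemmer from whoosh.lang.snowball, selected by code
    ana_ru = (analysis.RegexTokenizer()
              | analysis.LowercaseFilter()
              | analysis.StemFilter(lang="ru"))

    tokens = [t.text for t in ana_ru("по полю бегут зайцы")]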
@@ -114,7 +114,7 @@ def __setstate__(self, state): def clear(self): if self.lang: - from whoosh_reloaded.lang import stemmer_for_language + from whoosh.lang import stemmer_for_language stemfn = stemmer_for_language(self.lang) else: @@ -180,7 +180,7 @@ def algorithms(self): library. """ - import Stemmer # @UnresolvedImport + import Stemmer # type: ignore @UnresolvedImport return Stemmer.algorithms() @@ -188,7 +188,7 @@ def cache_info(self): return None def _get_stemmer_fn(self): - import Stemmer # @UnresolvedImport + import Stemmer # type: ignore @UnresolvedImport stemmer = Stemmer.Stemmer(self.lang) stemmer.maxCacheSize = self.cachesize diff --git a/src/whoosh_reloaded/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py similarity index 91% rename from src/whoosh_reloaded/analysis/ngrams.py rename to src/whoosh/analysis/ngrams.py index 2750d8a0..8ada2b58 100644 --- a/src/whoosh_reloaded/analysis/ngrams.py +++ b/src/whoosh/analysis/ngrams.py @@ -25,11 +25,11 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.compat import text_type -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.analysis.acore import Token -from whoosh_reloaded.analysis.filters import Filter, LowercaseFilter -from whoosh_reloaded.analysis.tokenizers import Tokenizer, RegexTokenizer +from whoosh.compat import text_type +from whoosh.compat import range +from whoosh.analysis.acore import Token +from whoosh.analysis.filters import Filter, LowercaseFilter +from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer # Tokenizer @@ -89,7 +89,7 @@ def __call__( if mode == "query": size = min(self.max, inlen) - for start in xrange(0, inlen - size + 1): + for start in range(0, inlen - size + 1): end = start + size if end > inlen: continue @@ -105,8 +105,8 @@ def __call__( yield t pos += 1 else: - for start in xrange(0, inlen - self.min + 1): - for size in xrange(self.min, self.max + 1): + for start in range(0, inlen - self.min + 1): + for size in range(self.min, self.max + 1): end = start + size if end > inlen: continue @@ -193,7 +193,7 @@ def __call__(self, tokens): t.startchar = t.endchar - size yield t else: - for start in xrange(0, len(text) - size + 1): + for start in range(0, len(text) - size + 1): t.text = text[start : start + size] if chars: t.startchar = startchar + start @@ -202,7 +202,7 @@ def __call__(self, tokens): else: if at == -1: limit = min(self.max, len(text)) - for size in xrange(self.min, limit + 1): + for size in range(self.min, limit + 1): t.text = text[:size] if chars: t.endchar = startchar + size @@ -212,14 +212,14 @@ def __call__(self, tokens): if chars: original_startchar = t.startchar start = max(0, len(text) - self.max) - for i in xrange(start, len(text) - self.min + 1): + for i in range(start, len(text) - self.min + 1): t.text = text[i:] if chars: t.startchar = original_startchar + i yield t else: - for start in xrange(0, len(text) - self.min + 1): - for size in xrange(self.min, self.max + 1): + for start in range(0, len(text) - self.min + 1): + for size in range(self.min, self.max + 1): end = start + size if end > len(text): continue diff --git a/src/whoosh_reloaded/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py similarity index 96% rename from src/whoosh_reloaded/analysis/tokenizers.py rename to src/whoosh/analysis/tokenizers.py index 7d0761f0..b0340fcb 100644 --- a/src/whoosh_reloaded/analysis/tokenizers.py +++ b/src/whoosh/analysis/tokenizers.py @@ -25,9 +25,9 
@@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.compat import u, text_type -from whoosh_reloaded.analysis.acore import Composable, Token -from whoosh_reloaded.util.text import rcompile +from whoosh.compat import u, text_type +from whoosh.analysis.acore import Composable, Token +from whoosh.util.text import rcompile default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*") @@ -210,10 +210,10 @@ class CharsetTokenizer(Tokenizer): slower than :class:`RegexTokenizer`. One way to get a character mapping object is to convert a Sphinx charset - table file using :func:`whoosh_reloaded.support.charset.charset_table_to_dict`. + table file using :func:`whoosh.support.charset.charset_table_to_dict`. - >>> from whoosh_reloaded.support.charset import charset_table_to_dict - >>> from whoosh_reloaded.support.charset import default_charset + >>> from whoosh.support.charset import charset_table_to_dict + >>> from whoosh.support.charset import default_charset >>> charmap = charset_table_to_dict(default_charset) >>> chtokenizer = CharsetTokenizer(charmap) >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')] @@ -339,7 +339,7 @@ def CommaSeparatedTokenizer(): ["hi there", "what's", "up"] """ - from whoosh_reloaded.analysis.filters import StripFilter + from whoosh.analysis.filters import StripFilter return RegexTokenizer(r"[^,]+") | StripFilter() diff --git a/src/whoosh_reloaded/automata/__init__.py b/src/whoosh/automata/__init__.py similarity index 100% rename from src/whoosh_reloaded/automata/__init__.py rename to src/whoosh/automata/__init__.py diff --git a/src/whoosh_reloaded/automata/fsa.py b/src/whoosh/automata/fsa.py similarity index 99% rename from src/whoosh_reloaded/automata/fsa.py rename to src/whoosh/automata/fsa.py index 08efabf7..56fa1623 100644 --- a/src/whoosh_reloaded/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -5,7 +5,7 @@ import sys from bisect import bisect_left -from whoosh_reloaded.compat import iteritems, next, text_type, unichr, xrange +from whoosh.compat import iteritems, next, text_type, unichr, range unull = unichr(0) @@ -340,7 +340,7 @@ def minimize(self): parts = [final_states, reachable - final_states] while changed: changed = False - for i in xrange(len(parts)): + for i in range(len(parts)): part = parts[i] changed_part = False for label in labels: diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py new file mode 100644 index 00000000..fed1ce7e --- /dev/null +++ b/src/whoosh/automata/fst.py @@ -0,0 +1,1566 @@ +# Copyright 2009 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module implements an FST/FSA writer and reader. An FST (Finite State +Transducer) stores a directed acyclic graph with values associated with the +leaves. Common elements of the values are pushed inside the tree. An FST that +does not store values is a regular FSA. + +The format of the leaf values is pluggable using subclasses of the Values +class. + +Whoosh uses these structures to store a directed acyclic word graph (DAWG) for +use in (at least) spell checking. +""" + + +import sys, copy +from array import array +from hashlib import sha1 # type: ignore @UnresolvedImport + +from whoosh.compat import b, u, BytesIO +from whoosh.compat import range, iteritems, iterkeys, izip, array_tobytes +from whoosh.compat import bytes_type, text_type +from whoosh.filedb.structfile import StructFile +from whoosh.system import _INT_SIZE +from whoosh.system import pack_byte, pack_int, pack_uint, pack_long +from whoosh.system import emptybytes +from whoosh.util.text import utf8encode, utf8decode +from whoosh.util.varints import varint + + +class FileVersionError(Exception): + pass + + +class InactiveCursor(Exception): + pass + + +ARC_LAST = 1 +ARC_ACCEPT = 2 +ARC_STOP = 4 +ARC_HAS_VAL = 8 +ARC_HAS_ACCEPT_VAL = 16 +MULTIBYTE_LABEL = 32 + + +# FST Value types + + +class Values(object): + """Base for classes the describe how to encode and decode FST values.""" + + @staticmethod + def is_valid(v): + """Returns True if v is a valid object that can be stored by this + class. + """ + + raise NotImplementedError + + @staticmethod + def common(v1, v2): + """Returns the "common" part of the two values, for whatever "common" + means for this class. For example, a string implementation would return + the common shared prefix, for an int implementation it would return + the minimum of the two numbers. + + If there is no common part, this method should return None. + """ + + raise NotImplementedError + + @staticmethod + def add(prefix, v): + """Adds the given prefix (the result of a call to common()) to the + given value. + """ + + raise NotImplementedError + + @staticmethod + def subtract(v, prefix): + """Subtracts the "common" part (the prefix) from the given value.""" + + raise NotImplementedError + + @staticmethod + def write(dbfile, v): + """Writes value v to a file.""" + + raise NotImplementedError + + @staticmethod + def read(dbfile): + """Reads a value from the given file.""" + + raise NotImplementedError + + @classmethod + def skip(cls, dbfile): + """Skips over a value in the given file.""" + + cls.read(dbfile) + + @staticmethod + def to_bytes(v): + """Returns a str (Python 2.x) or bytes (Python 3) representation of + the given value. This is used for calculating node digests, so it + should be unique but fast to calculate, and does not have to be + parseable. 
+ """ + + raise NotImplementedError + + @staticmethod + def merge(v1, v2): + raise NotImplementedError + + +class IntValues(Values): + """Stores integer values in an FST.""" + + @staticmethod + def is_valid(v): + return isinstance(v, int) and v >= 0 + + @staticmethod + def common(v1, v2): + if v1 is None or v2 is None: + return None + if v1 == v2: + return v1 + return min(v1, v2) + + @staticmethod + def add(base, v): + if base is None: + return v + if v is None: + return base + return base + v + + @staticmethod + def subtract(v, base): + if v is None: + return None + if base is None: + return v + return v - base + + @staticmethod + def write(dbfile, v): + dbfile.write_uint(v) + + @staticmethod + def read(dbfile): + return dbfile.read_uint() + + @staticmethod + def skip(dbfile): + dbfile.seek(_INT_SIZE, 1) + + @staticmethod + def to_bytes(v): + return pack_int(v) + + +class SequenceValues(Values): + """Abstract base class for value types that store sequences.""" + + @staticmethod + def is_valid(self, v): + return isinstance(self, (list, tuple)) + + @staticmethod + def common(v1, v2): + if v1 is None or v2 is None: + return None + + i = 0 + while i < len(v1) and i < len(v2): + if v1[i] != v2[i]: + break + i += 1 + + if i == 0: + return None + if i == len(v1): + return v1 + if i == len(v2): + return v2 + return v1[:i] + + @staticmethod + def add(prefix, v): + if prefix is None: + return v + if v is None: + return prefix + return prefix + v + + @staticmethod + def subtract(v, prefix): + if prefix is None: + return v + if v is None: + return None + if len(v) == len(prefix): + return None + if len(v) < len(prefix) or len(prefix) == 0: + raise ValueError((v, prefix)) + return v[len(prefix) :] + + @staticmethod + def write(dbfile, v): + dbfile.write_pickle(v) + + @staticmethod + def read(dbfile): + return dbfile.read_pickle() + + +class BytesValues(SequenceValues): + """Stores bytes objects (str in Python 2.x) in an FST.""" + + @staticmethod + def is_valid(v): + return isinstance(v, bytes_type) + + @staticmethod + def write(dbfile, v): + dbfile.write_int(len(v)) + dbfile.write(v) + + @staticmethod + def read(dbfile): + length = dbfile.read_int() + return dbfile.read(length) + + @staticmethod + def skip(dbfile): + length = dbfile.read_int() + dbfile.seek(length, 1) + + @staticmethod + def to_bytes(v): + return v + + +class ArrayValues(SequenceValues): + """Stores array.array objects in an FST.""" + + def __init__(self, typecode): + self.typecode = typecode + self.itemsize = array(self.typecode).itemsize + + def is_valid(self, v): + return isinstance(v, array) and v.typecode == self.typecode + + @staticmethod + def write(dbfile, v): + dbfile.write(b(v.typecode)) + dbfile.write_int(len(v)) + dbfile.write_array(v) + + def read(self, dbfile): + typecode = u(dbfile.read(1)) + length = dbfile.read_int() + return dbfile.read_array(self.typecode, length) + + def skip(self, dbfile): + length = dbfile.read_int() + dbfile.seek(length * self.itemsize, 1) + + @staticmethod + def to_bytes(v): + return array_tobytes(v) + + +class IntListValues(SequenceValues): + """Stores lists of positive, increasing integers (that is, lists of + integers where each number is >= 0 and each number is greater than or equal + to the number that precedes it) in an FST. 
+ """ + + @staticmethod + def is_valid(v): + if isinstance(v, (list, tuple)): + if len(v) < 2: + return True + for i in range(1, len(v)): + if not isinstance(v[i], int) or v[i] < v[i - 1]: + return False + return True + return False + + @staticmethod + def write(dbfile, v): + base = 0 + dbfile.write_varint(len(v)) + for x in v: + delta = x - base + assert delta >= 0 + dbfile.write_varint(delta) + base = x + + @staticmethod + def read(dbfile): + length = dbfile.read_varint() + result = [] + if length > 0: + base = 0 + for _ in range(length): + base += dbfile.read_varint() + result.append(base) + return result + + @staticmethod + def to_bytes(v): + return b(repr(v)) + + +# Node-like interface wrappers + + +class Node(object): + """A slow but easier-to-use wrapper for FSA/DAWGs. Translates the low-level + arc-based interface of GraphReader into Node objects with methods to follow + edges. + """ + + def __init__(self, owner, address, accept=False): + self.owner = owner + self.address = address + self._edges = None + self.accept = accept + + def __iter__(self): + if not self._edges: + self._load() + return iterkeys(self._edges) + + def __contains__(self, key): + if self._edges is None: + self._load() + return key in self._edges + + def _load(self): + owner = self.owner + if self.address is None: + d = {} + else: + d = dict( + (arc.label, Node(owner, arc.target, arc.accept)) + for arc in self.owner.iter_arcs(self.address) + ) + self._edges = d + + def keys(self): + if self._edges is None: + self._load() + return self._edges.keys() + + def all_edges(self): + if self._edges is None: + self._load() + return self._edges + + def edge(self, key): + if self._edges is None: + self._load() + return self._edges[key] + + def flatten(self, sofar=emptybytes): + if self.accept: + yield sofar + for key in sorted(self): + node = self.edge(key) + for result in node.flatten(sofar + key): + yield result + + def flatten_strings(self): + return (utf8decode(k)[0] for k in self.flatten()) + + +class ComboNode(Node): + """Base class for nodes that blend the nodes of two different graphs. + + Concrete subclasses need to implement the ``edge()`` method and possibly + override the ``accept`` property. + """ + + def __init__(self, a, b): + self.a = a + self.b = b + + def __repr__(self): + return "<%s %r %r>" % (self.__class__.__name__, self.a, self.b) + + def __contains__(self, key): + return key in self.a or key in self.b + + def __iter__(self): + return iter(set(self.a) | set(self.b)) + + @property + def accept(self): + return self.a.accept or self.b.accept + + +class UnionNode(ComboNode): + """Makes two graphs appear to be the union of the two graphs.""" + + def edge(self, key): + a = self.a + b = self.b + if key in a and key in b: + return UnionNode(a.edge(key), b.edge(key)) + elif key in a: + return a.edge(key) + else: + return b.edge(key) + + +class IntersectionNode(ComboNode): + """Makes two graphs appear to be the intersection of the two graphs.""" + + def edge(self, key): + a = self.a + b = self.b + if key in a and key in b: + return IntersectionNode(a.edge(key), b.edge(key)) + + +# Cursor + + +class BaseCursor(object): + """Base class for a cursor-type object for navigating an FST/word graph, + represented by a :class:`GraphReader` object. + + >>> cur = GraphReader(dawgfile).cursor() + >>> for key in cur.follow(): + ... print(repr(key)) + + The cursor "rests" on arcs in the FSA/FST graph, rather than nodes. 
+ """ + + def is_active(self): + """Returns True if this cursor is still active, that is it has not + read past the last arc in the graph. + """ + + raise NotImplementedError + + def label(self): + """Returns the label bytes of the current arc.""" + + raise NotImplementedError + + def prefix(self): + """Returns a sequence of the label bytes for the path from the root + to the current arc. + """ + + raise NotImplementedError + + def prefix_bytes(self): + """Returns the label bytes for the path from the root to the current + arc as a single joined bytes object. + """ + + return emptybytes.join(self.prefix()) + + def prefix_string(self): + """Returns the labels of the path from the root to the current arc as + a decoded unicode string. + """ + + return utf8decode(self.prefix_bytes())[0] + + def peek_key(self): + """Returns a sequence of label bytes representing the next closest + key in the graph. + """ + + for label in self.prefix(): + yield label + c = self.copy() + while not c.stopped(): + c.follow() + yield c.label() + + def peek_key_bytes(self): + """Returns the next closest key in the graph as a single bytes object.""" + + return emptybytes.join(self.peek_key()) + + def peek_key_string(self): + """Returns the next closest key in the graph as a decoded unicode + string. + """ + + return utf8decode(self.peek_key_bytes())[0] + + def stopped(self): + """Returns True if the current arc leads to a stop state.""" + + raise NotImplementedError + + def value(self): + """Returns the value at the current arc, if reading an FST.""" + + raise NotImplementedError + + def accept(self): + """Returns True if the current arc leads to an accept state (the end + of a valid key). + """ + + raise NotImplementedError + + def at_last_arc(self): + """Returns True if the current arc is the last outgoing arc from the + previous node. + """ + + raise NotImplementedError + + def next_arc(self): + """Moves to the next outgoing arc from the previous node.""" + + raise NotImplementedError + + def follow(self): + """Follows the current arc.""" + + raise NotImplementedError + + def switch_to(self, label): + """Switch to the sibling arc with the given label bytes.""" + + _label = self.label + _at_last_arc = self.at_last_arc + _next_arc = self.next_arc + + while True: + thislabel = _label() + if thislabel == label: + return True + if thislabel > label or _at_last_arc(): + return False + _next_arc() + + def skip_to(self, key): + """Moves the cursor to the path represented by the given key bytes.""" + + _accept = self.accept + _prefix = self.prefix + _next_arc = self.next_arc + + keylist = list(key) + while True: + if _accept(): + thiskey = list(_prefix()) + if keylist == thiskey: + return True + elif keylist > thiskey: + return False + _next_arc() + + def flatten(self): + """Yields the keys in the graph, starting at the current position.""" + + _is_active = self.is_active + _accept = self.accept + _stopped = self.stopped + _follow = self.follow + _next_arc = self.next_arc + _prefix_bytes = self.prefix_bytes + + if not _is_active(): + raise InactiveCursor + while _is_active(): + if _accept(): + yield _prefix_bytes() + if not _stopped(): + _follow() + continue + _next_arc() + + def flatten_v(self): + """Yields (key, value) tuples in an FST, starting at the current + position. 
+ """ + + for key in self.flatten(): + yield key, self.value() + + def flatten_strings(self): + return (utf8decode(k)[0] for k in self.flatten()) + + def find_path(self, path): + """Follows the labels in the given path, starting at the current + position. + """ + + path = to_labels(path) + _switch_to = self.switch_to + _follow = self.follow + _stopped = self.stopped + + first = True + for i, label in enumerate(path): + if not first: + _follow() + if not _switch_to(label): + return False + if _stopped(): + if i < len(path) - 1: + return False + first = False + return True + + +class Cursor(BaseCursor): + def __init__(self, graph, root=None, stack=None): + self.graph = graph + self.vtype = graph.vtype + self.root = root if root is not None else graph.default_root() + if stack: + self.stack = stack + else: + self.reset() + + def _current_attr(self, name): + stack = self.stack + if not stack: + raise InactiveCursor + return getattr(stack[-1], name) + + def is_active(self): + return bool(self.stack) + + def stopped(self): + return self._current_attr("target") is None + + def accept(self): + return self._current_attr("accept") + + def at_last_arc(self): + return self._current_attr("lastarc") + + def label(self): + return self._current_attr("label") + + def reset(self): + self.stack = [] + self.sums = [None] + self._push(self.graph.arc_at(self.root)) + + def copy(self): + return self.__class__(self.graph, self.root, copy.deepcopy(self.stack)) + + def prefix(self): + stack = self.stack + if not stack: + raise InactiveCursor + return (arc.label for arc in stack) + + # Override: more efficient implementation using graph methods directly + def peek_key(self): + if not self.stack: + raise InactiveCursor + + for label in self.prefix(): + yield label + arc = copy.copy(self.stack[-1]) + graph = self.graph + while not arc.accept and arc.target is not None: + graph.arc_at(arc.target, arc) + yield arc.label + + def value(self): + stack = self.stack + if not stack: + raise InactiveCursor + vtype = self.vtype + if not vtype: + raise Exception("No value type") + + v = self.sums[-1] + current = stack[-1] + if current.value: + v = vtype.add(v, current.value) + if current.accept and current.acceptval is not None: + v = vtype.add(v, current.acceptval) + return v + + def next_arc(self): + stack = self.stack + if not stack: + raise InactiveCursor + + while stack and stack[-1].lastarc: + self.pop() + if stack: + current = stack[-1] + self.graph.arc_at(current.endpos, current) + return current + + def follow(self): + address = self._current_attr("target") + if address is None: + raise Exception("Can't follow a stop arc") + self._push(self.graph.arc_at(address)) + return self + + # Override: more efficient implementation manipulating the stack + def skip_to(self, key): + key = to_labels(key) + stack = self.stack + if not stack: + raise InactiveCursor + + _follow = self.follow + _next_arc = self.next_arc + + i = self._pop_to_prefix(key) + while stack and i < len(key): + curlabel = stack[-1].label + keylabel = key[i] + if curlabel == keylabel: + _follow() + i += 1 + elif curlabel > keylabel: + return + else: + _next_arc() + + # Override: more efficient implementation using find_arc + def switch_to(self, label): + stack = self.stack + if not stack: + raise InactiveCursor + + current = stack[-1] + if label == current.label: + return True + else: + arc = self.graph.find_arc(current.endpos, label, current) + return arc + + def _push(self, arc): + if self.vtype and self.stack: + sums = self.sums + 
sums.append(self.vtype.add(sums[-1], self.stack[-1].value)) + self.stack.append(arc) + + def pop(self): + self.stack.pop() + if self.vtype: + self.sums.pop() + + def _pop_to_prefix(self, key): + stack = self.stack + if not stack: + raise InactiveCursor + + i = 0 + maxpre = min(len(stack), len(key)) + while i < maxpre and key[i] == stack[i].label: + i += 1 + if stack[i].label > key[i]: + self.current = None + return + while len(stack) > i + 1: + self.pop() + self.next_arc() + return i + + +class UncompiledNode(object): + # Represents an "in-memory" node used by the GraphWriter before it is + # written to disk. + + compiled = False + + def __init__(self, owner): + self.owner = owner + self._digest = None + self.clear() + + def clear(self): + self.arcs = [] + self.value = None + self.accept = False + self.inputcount = 0 + + def __repr__(self): + return "<%r>" % ([(a.label, a.value) for a in self.arcs],) + + def digest(self): + if self._digest is None: + d = sha1() + vtype = self.owner.vtype + for arc in self.arcs: + d.update(arc.label) + if arc.target: + d.update(pack_long(arc.target)) + else: + d.update(b("z")) + if arc.value: + d.update(vtype.to_bytes(arc.value)) + if arc.accept: + d.update(b("T")) + self._digest = d.digest() + return self._digest + + def edges(self): + return self.arcs + + def last_value(self, label): + assert self.arcs[-1].label == label + return self.arcs[-1].value + + def add_arc(self, label, target): + self.arcs.append(Arc(label, target)) + + def replace_last(self, label, target, accept, acceptval=None): + arc = self.arcs[-1] + assert arc.label == label, "%r != %r" % (arc.label, label) + arc.target = target + arc.accept = accept + arc.acceptval = acceptval + + def delete_last(self, label, target): + arc = self.arcs.pop() + assert arc.label == label + assert arc.target == target + + def set_last_value(self, label, value): + arc = self.arcs[-1] + assert arc.label == label, "%r->%r" % (arc.label, label) + arc.value = value + + def prepend_value(self, prefix): + add = self.owner.vtype.add + for arc in self.arcs: + arc.value = add(prefix, arc.value) + if self.accept: + self.value = add(prefix, self.value) + + +class Arc(object): + """ + Represents a directed arc between two nodes in an FSA/FST graph. + + The ``lastarc`` attribute is True if this is the last outgoing arc from the + previous node. + """ + + __slots__ = ("label", "target", "accept", "value", "lastarc", "acceptval", "endpos") + + def __init__( + self, + label=None, + target=None, + value=None, + accept=False, + acceptval=None, + lastarc=None, + endpos=None, + ): + """ + :param label: The label bytes for this arc. For a word graph, this will + be a character. + :param target: The address of the node at the endpoint of this arc. + :param value: The inner FST value at the endpoint of this arc. + :param accept: Whether the endpoint of this arc is an accept state + (e.g. the end of a valid word). + :param acceptval: If the endpoint of this arc is an accept state, the + final FST value for that accepted state. + """ + + self.label = label + self.target = target + self.value = value + self.accept = accept + self.acceptval = acceptval + self.lastarc = lastarc + self.endpos = endpos + + def __repr__(self): + return "<%r-%s %s%s>" % ( + self.label, + self.target, + "." 
if self.accept else "", + (" %r" % self.value) if self.value else "", + ) + + def __eq__(self, other): + if ( + isinstance(other, self.__class__) + and self.accept == other.accept + and self.lastarc == other.lastarc + and self.target == other.target + and self.value == other.value + and self.label == other.label + ): + return True + return False + + def copy(self): + # This is faster than using the copy module + return Arc( + label=self.label, + target=self.target, + value=self.value, + accept=self.accept, + acceptval=self.acceptval, + lastarc=self.lastarc, + endpos=self.endpos, + ) + + +# Graph writer + + +class GraphWriter(object): + """Writes an FSA/FST graph to disk. + + Call ``insert(key)`` to insert keys into the graph. You must + insert keys in sorted order. Call ``close()`` to finish the graph and close + the file. + + >>> gw = GraphWriter(my_file) + >>> gw.insert("alfa") + >>> gw.insert("bravo") + >>> gw.insert("charlie") + >>> gw.close() + + The graph writer can write separate graphs for multiple fields. Use + ``start_field(name)`` and ``finish_field()`` to separate fields. + + >>> gw = GraphWriter(my_file) + >>> gw.start_field("content") + >>> gw.insert("alfalfa") + >>> gw.insert("apple") + >>> gw.finish_field() + >>> gw.start_field("title") + >>> gw.insert("artichoke") + >>> gw.finish_field() + >>> gw.close() + """ + + version = 1 + + def __init__(self, dbfile, vtype=None, merge=None): + """ + :param dbfile: the file to write to. + :param vtype: a :class:`Values` class to use for storing values. This + is only necessary if you will be storing values for the keys. + :param merge: a function that takes two values and returns a single + value. This is called if you insert two identical keys with values. + """ + + self.dbfile = dbfile + self.vtype = vtype + self.merge = merge + self.fieldroots = {} + self.arc_count = 0 + self.node_count = 0 + self.fixed_count = 0 + + dbfile.write(b("GRPH")) + dbfile.write_int(self.version) + dbfile.write_uint(0) + + self._infield = False + + def start_field(self, fieldname): + """Starts a new graph for the given field.""" + + if not fieldname: + raise ValueError("Field name cannot be equivalent to False") + if self._infield: + self.finish_field() + self.fieldname = fieldname + self.seen = {} + self.nodes = [UncompiledNode(self)] + self.lastkey = "" + self._inserted = False + self._infield = True + + def finish_field(self): + """Finishes the graph for the current field.""" + + if not self._infield: + raise Exception("Called finish_field before start_field") + self._infield = False + if self._inserted: + self.fieldroots[self.fieldname] = self._finish() + self.fieldname = None + + def close(self): + """Finishes the current graph and closes the underlying file.""" + + if self.fieldname is not None: + self.finish_field() + dbfile = self.dbfile + here = dbfile.tell() + dbfile.write_pickle(self.fieldroots) + dbfile.flush() + dbfile.seek(4 + _INT_SIZE) # Seek past magic and version number + dbfile.write_uint(here) + dbfile.close() + + def insert(self, key, value=None): + """Inserts the given key into the graph. + + :param key: a sequence of bytes objects, a bytes object, or a string. + :param value: an optional value to encode in the graph along with the + key. If the writer was not instantiated with a value type, passing + a value here will raise an error. 
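+
+        For example, storing an integer payload with each key (a sketch;
+        this assumes the writer was created with ``vtype=IntValues``)::
+
+            >>> gw = GraphWriter(my_file, vtype=IntValues)
+            >>> gw.start_field("content")
+            >>> gw.insert(b"apple", 42)
+            >>> gw.insert(b"bear", 7)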
+ """ + + if not self._infield: + raise Exception("Inserted %r before starting a field" % key) + self._inserted = True + key = to_labels(key) # Python 3 sucks + + vtype = self.vtype + lastkey = self.lastkey + nodes = self.nodes + if len(key) < 1: + raise KeyError("Can't store a null key %r" % (key,)) + if lastkey and lastkey > key: + raise KeyError("Keys out of order %r..%r" % (lastkey, key)) + + # Find the common prefix shared by this key and the previous one + prefixlen = 0 + for i in range(min(len(lastkey), len(key))): + if lastkey[i] != key[i]: + break + prefixlen += 1 + # Compile the nodes after the prefix, since they're not shared + self._freeze_tail(prefixlen + 1) + + # Create new nodes for the parts of this key after the shared prefix + for char in key[prefixlen:]: + node = UncompiledNode(self) + # Create an arc to this node on the previous node + nodes[-1].add_arc(char, node) + nodes.append(node) + # Mark the last node as an accept state + lastnode = nodes[-1] + lastnode.accept = True + + if vtype: + if value is not None and not vtype.is_valid(value): + raise ValueError("%r is not valid for %s" % (value, vtype)) + + # Push value commonalities through the tree + common = None + for i in range(1, prefixlen + 1): + node = nodes[i] + parent = nodes[i - 1] + lastvalue = parent.last_value(key[i - 1]) + if lastvalue is not None: + common = vtype.common(value, lastvalue) + suffix = vtype.subtract(lastvalue, common) + parent.set_last_value(key[i - 1], common) + node.prepend_value(suffix) + else: + common = suffix = None + value = vtype.subtract(value, common) + + if key == lastkey: + # If this key is a duplicate, merge its value with the value of + # the previous (same) key + lastnode.value = self.merge(lastnode.value, value) + else: + nodes[prefixlen].set_last_value(key[prefixlen], value) + elif value: + raise Exception("Value %r but no value type" % value) + + self.lastkey = key + + def _freeze_tail(self, prefixlen): + nodes = self.nodes + lastkey = self.lastkey + downto = max(1, prefixlen) + + while len(nodes) > downto: + node = nodes.pop() + parent = nodes[-1] + inlabel = lastkey[len(nodes) - 1] + + self._compile_targets(node) + accept = node.accept or len(node.arcs) == 0 + address = self._compile_node(node) + parent.replace_last(inlabel, address, accept, node.value) + + def _finish(self): + nodes = self.nodes + root = nodes[0] + # Minimize nodes in the last word's suffix + self._freeze_tail(0) + # Compile remaining targets + self._compile_targets(root) + return self._compile_node(root) + + def _compile_targets(self, node): + for arc in node.arcs: + if isinstance(arc.target, UncompiledNode): + n = arc.target + if len(n.arcs) == 0: + arc.accept = n.accept = True + arc.target = self._compile_node(n) + + def _compile_node(self, uncnode): + seen = self.seen + + if len(uncnode.arcs) == 0: + # Leaf node + address = self._write_node(uncnode) + else: + d = uncnode.digest() + address = seen.get(d) + if address is None: + address = self._write_node(uncnode) + seen[d] = address + return address + + def _write_node(self, uncnode): + vtype = self.vtype + dbfile = self.dbfile + arcs = uncnode.arcs + numarcs = len(arcs) + + if not numarcs: + if uncnode.accept: + return None + else: + # What does it mean for an arc to stop but not be accepted? 
+ raise Exception + self.node_count += 1 + + buf = StructFile(BytesIO()) + nodestart = dbfile.tell() + # self.count += 1 + # self.arccount += numarcs + + fixedsize = -1 + arcstart = buf.tell() + for i, arc in enumerate(arcs): + self.arc_count += 1 + target = arc.target + label = arc.label + + flags = 0 + if len(label) > 1: + flags += MULTIBYTE_LABEL + if i == numarcs - 1: + flags += ARC_LAST + if arc.accept: + flags += ARC_ACCEPT + if target is None: + flags += ARC_STOP + if arc.value is not None: + flags += ARC_HAS_VAL + if arc.acceptval is not None: + flags += ARC_HAS_ACCEPT_VAL + + buf.write(pack_byte(flags)) + if len(label) > 1: + buf.write(varint(len(label))) + buf.write(label) + if target is not None: + buf.write(pack_uint(target)) + if arc.value is not None: + vtype.write(buf, arc.value) + if arc.acceptval is not None: + vtype.write(buf, arc.acceptval) + + here = buf.tell() + thissize = here - arcstart + arcstart = here + if fixedsize == -1: + fixedsize = thissize + elif fixedsize > 0 and thissize != fixedsize: + fixedsize = 0 + + if fixedsize > 0: + # Write a fake arc containing the fixed size and number of arcs + dbfile.write_byte(255) # FIXED_SIZE + dbfile.write_int(fixedsize) + dbfile.write_int(numarcs) + self.fixed_count += 1 + dbfile.write(buf.file.getvalue()) + + return nodestart + + +# Graph reader + + +class BaseGraphReader(object): + def cursor(self, rootname=None): + return Cursor(self, self.root(rootname)) + + def has_root(self, rootname): + raise NotImplementedError + + def root(self, rootname=None): + raise NotImplementedError + + # Low level methods + + def arc_at(self, address, arc): + raise NotImplementedError + + def iter_arcs(self, address, arc=None): + raise NotImplementedError + + def find_arc(self, address, label, arc=None): + arc = arc or Arc() + for arc in self.iter_arcs(address, arc): + thislabel = arc.label + if thislabel == label: + return arc + elif thislabel > label: + return None + + # Convenience methods + + def list_arcs(self, address): + return list(arc.copy() for arc in self.iter_arcs(address)) + + def arc_dict(self, address): + return dict((arc.label, arc.copy()) for arc in self.iter_arcs(address)) + + def find_path(self, path, arc=None, address=None): + path = to_labels(path) + + if arc: + address = address if address is not None else arc.target + else: + arc = Arc() + + if address is None: + address = self._root + + find_arc = self.find_arc + for label in path: + if address is None: + return None + if not find_arc(address, label, arc): + return None + address = arc.target + return arc + + +class GraphReader(BaseGraphReader): + def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): + self.dbfile = dbfile + self.vtype = vtype + self.filebase = filebase + + dbfile.seek(filebase) + magic = dbfile.read(4) + if magic != b("GRPH"): + raise FileVersionError + self.version = dbfile.read_int() + dbfile.seek(dbfile.read_uint()) + self.roots = dbfile.read_pickle() + + self._root = None + if rootname is None and len(self.roots) == 1: + # If there's only one root, just use it. Have to wrap a list around + # the keys() method here because of Python 3. 
+ rootname = list(self.roots.keys())[0] + if rootname is not None: + self._root = self.root(rootname) + + def close(self): + self.dbfile.close() + + # Overrides + + def has_root(self, rootname): + return rootname in self.roots + + def root(self, rootname=None): + if rootname is None: + return self._root + else: + return self.roots[rootname] + + def default_root(self): + return self._root + + def arc_at(self, address, arc=None): + arc = arc or Arc() + self.dbfile.seek(address) + return self._read_arc(arc) + + def iter_arcs(self, address, arc=None): + arc = arc or Arc() + _read_arc = self._read_arc + + self.dbfile.seek(address) + while True: + _read_arc(arc) + yield arc + if arc.lastarc: + break + + def find_arc(self, address, label, arc=None): + # Overrides the default scanning implementation + + arc = arc or Arc() + dbfile = self.dbfile + dbfile.seek(address) + + # If records are fixed size, we can do a binary search + finfo = self._read_fixed_info() + if finfo: + size, count = finfo + address = dbfile.tell() + if count > 2: + return self._binary_search(address, size, count, label, arc) + + # If records aren't fixed size, fall back to the parent's linear + # search method + return BaseGraphReader.find_arc(self, address, label, arc) + + # Implementations + + def _read_arc(self, toarc=None): + toarc = toarc or Arc() + dbfile = self.dbfile + flags = dbfile.read_byte() + if flags == 255: + # This is a fake arc containing fixed size information; skip it + # and read the next arc + dbfile.seek(_INT_SIZE * 2, 1) + flags = dbfile.read_byte() + toarc.label = self._read_label(flags) + return self._read_arc_data(flags, toarc) + + def _read_label(self, flags): + dbfile = self.dbfile + if flags & MULTIBYTE_LABEL: + length = dbfile.read_varint() + else: + length = 1 + label = dbfile.read(length) + return label + + def _read_fixed_info(self): + dbfile = self.dbfile + + flags = dbfile.read_byte() + if flags == 255: + size = dbfile.read_int() + count = dbfile.read_int() + return (size, count) + else: + return None + + def _read_arc_data(self, flags, arc): + dbfile = self.dbfile + accept = arc.accept = bool(flags & ARC_ACCEPT) + arc.lastarc = flags & ARC_LAST + if flags & ARC_STOP: + arc.target = None + else: + arc.target = dbfile.read_uint() + if flags & ARC_HAS_VAL: + arc.value = self.vtype.read(dbfile) + else: + arc.value = None + if accept and flags & ARC_HAS_ACCEPT_VAL: + arc.acceptval = self.vtype.read(dbfile) + arc.endpos = dbfile.tell() + return arc + + def _binary_search(self, address, size, count, label, arc): + dbfile = self.dbfile + _read_label = self._read_label + + lo = 0 + hi = count + while lo < hi: + mid = (lo + hi) // 2 + midaddr = address + mid * size + dbfile.seek(midaddr) + flags = dbfile.read_byte() + midlabel = self._read_label(flags) + if midlabel == label: + arc.label = midlabel + return self._read_arc_data(flags, arc) + elif midlabel < label: + lo = mid + 1 + else: + hi = mid + if lo == count: + return None + + +def to_labels(key): + """Takes a string and returns a list of bytestrings, suitable for use as + a key or path in an FSA/FST graph. 
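+
+    For example (on Python 3)::
+
+        >>> to_labels("hi")
+        (b'h', b'i')
+        >>> to_labels(b"hi")
+        (b'h', b'i')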
+ """ + + # Convert to tuples of bytestrings (must be tuples so they can be hashed) + keytype = type(key) + + # I hate the Python 3 bytes object so friggin much + if keytype is tuple or keytype is list: + if not all(isinstance(e, bytes_type) for e in key): + raise TypeError("%r contains a non-bytestring" % key) + if keytype is list: + key = tuple(key) + elif isinstance(key, bytes_type): + key = tuple(key[i : i + 1] for i in range(len(key))) + elif isinstance(key, text_type): + key = tuple(utf8encode(key[i : i + 1])[0] for i in range(len(key))) + else: + raise TypeError("Don't know how to convert %r" % key) + return key + + +# Within edit distance function + + +def within(graph, text, k=1, prefix=0, address=None): + """Yields a series of keys in the given graph within ``k`` edit distance of + ``text``. If ``prefix`` is greater than 0, all keys must match the first + ``prefix`` characters of ``text``. + """ + + text = to_labels(text) + if address is None: + address = graph._root + + sofar = emptybytes + accept = False + if prefix: + prefixchars = text[:prefix] + arc = graph.find_path(prefixchars, address=address) + if arc is None: + return + sofar = emptybytes.join(prefixchars) + address = arc.target + accept = arc.accept + + stack = [(address, k, prefix, sofar, accept)] + seen = set() + while stack: + state = stack.pop() + # Have we already tried this state? + if state in seen: + continue + seen.add(state) + + address, k, i, sofar, accept = state + # If we're at the end of the text (or deleting enough chars would get + # us to the end and still within K), and we're in the accept state, + # yield the current result + if (len(text) - i <= k) and accept: + yield utf8decode(sofar)[0] + + # If we're in the stop state, give up + if address is None: + continue + + # Exact match + if i < len(text): + arc = graph.find_arc(address, text[i]) + if arc: + stack.append((arc.target, k, i + 1, sofar + text[i], arc.accept)) + # If K is already 0, can't do any more edits + if k < 1: + continue + k -= 1 + + arcs = graph.arc_dict(address) + # Insertions + stack.extend( + (arc.target, k, i, sofar + char, arc.accept) + for char, arc in iteritems(arcs) + ) + + # Deletion, replacement, and transpo only work before the end + if i >= len(text): + continue + char = text[i] + + # Deletion + stack.append((address, k, i + 1, sofar, False)) + # Replacement + for char2, arc in iteritems(arcs): + if char2 != char: + stack.append((arc.target, k, i + 1, sofar + char2, arc.accept)) + # Transposition + if i < len(text) - 1: + char2 = text[i + 1] + if char != char2 and char2 in arcs: + # Find arc from next char to this char + target = arcs[char2].target + if target: + arc = graph.find_arc(target, char) + if arc: + stack.append( + (arc.target, k, i + 2, sofar + char2 + char, arc.accept) + ) + + +# Utility functions + + +def dump_graph(graph, address=None, tab=0, out=None): + if address is None: + address = graph._root + if out is None: + out = sys.stdout + + here = "%06d" % address + for i, arc in enumerate(graph.list_arcs(address)): + if i == 0: + out.write(here) + else: + out.write(" " * 6) + out.write(" " * tab) + out.write("%r %r %s %r\n" % (arc.label, arc.target, arc.accept, arc.value)) + if arc.target is not None: + dump_graph(graph, arc.target, tab + 1, out=out) diff --git a/src/whoosh_reloaded/automata/glob.py b/src/whoosh/automata/glob.py similarity index 98% rename from src/whoosh_reloaded/automata/glob.py rename to src/whoosh/automata/glob.py index 54597c0b..32573afa 100644 --- a/src/whoosh_reloaded/automata/glob.py 
+++ b/src/whoosh/automata/glob.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.automata.fsa import ANY, EPSILON, NFA +from whoosh.automata.fsa import ANY, EPSILON, NFA # Constants for glob diff --git a/src/whoosh_reloaded/automata/lev.py b/src/whoosh/automata/lev.py similarity index 76% rename from src/whoosh_reloaded/automata/lev.py rename to src/whoosh/automata/lev.py index 5d78faba..8d71fae4 100644 --- a/src/whoosh_reloaded/automata/lev.py +++ b/src/whoosh/automata/lev.py @@ -1,19 +1,19 @@ from __future__ import print_function -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.automata.fsa import ANY, EPSILON, NFA +from whoosh.compat import range +from whoosh.automata.fsa import ANY, EPSILON, NFA def levenshtein_automaton(term, k, prefix=0): nfa = NFA((0, 0)) if prefix: - for i in xrange(prefix): + for i in range(prefix): c = term[i] nfa.add_transition((i, 0), c, (i + 1, 0)) - for i in xrange(prefix, len(term)): + for i in range(prefix, len(term)): c = term[i] - for e in xrange(k + 1): + for e in range(k + 1): # Correct character nfa.add_transition((i, e), c, (i + 1, e)) if e < k: @@ -23,7 +23,7 @@ def levenshtein_automaton(term, k, prefix=0): nfa.add_transition((i, e), EPSILON, (i + 1, e + 1)) # Substitution nfa.add_transition((i, e), ANY, (i + 1, e + 1)) - for e in xrange(k + 1): + for e in range(k + 1): if e < k: nfa.add_transition((len(term), e), ANY, (len(term), e + 1)) nfa.add_final_state((len(term), e)) diff --git a/src/whoosh_reloaded/automata/nfa.py b/src/whoosh/automata/nfa.py similarity index 96% rename from src/whoosh_reloaded/automata/nfa.py rename to src/whoosh/automata/nfa.py index 77fafeec..5853a1e4 100644 --- a/src/whoosh_reloaded/automata/nfa.py +++ b/src/whoosh/automata/nfa.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.automata.fst import Arc +from whoosh.automata.fst import Arc class Instruction(object): @@ -292,10 +292,10 @@ def regex_limit(graph, mode, program, address): # if __name__ == "__main__": -# from whoosh_reloaded import index, query -# from whoosh_reloaded.filedb.filestore import RamStorage -# from whoosh_reloaded.automata import fst -# from whoosh_reloaded.util.testing import timing +# from whoosh import index, query +# from whoosh.filedb.filestore import RamStorage +# from whoosh.automata import fst +# from whoosh.util.testing import timing # # st = RamStorage() # gw = fst.GraphWriter(st.create_file("test")) @@ -334,8 +334,8 @@ def regex_limit(graph, mode, program, address): # # print len(x), x # # with timing(): -# print "lo=", regex_limit(gr, LO, program, gr.root("path")) -# print "hi=", regex_limit(gr, HI, program, gr.root("path")) +# print ("lo=", regex_limit(gr, LO, program, gr.root("path"))) +# print ("hi=", regex_limit(gr, HI, program, gr.root("path"))) # # # diff --git a/src/whoosh_reloaded/automata/reg.py b/src/whoosh/automata/reg.py similarity index 98% rename from src/whoosh_reloaded/automata/reg.py rename to src/whoosh/automata/reg.py index 311ba97c..f70f68b4 100644 --- a/src/whoosh_reloaded/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from whoosh_reloaded.automata.fsa import ANY, EPSILON, NFA +from whoosh.automata.fsa import ANY, EPSILON, NFA # Operator precedence diff --git a/src/whoosh_reloaded/classify.py b/src/whoosh/classify.py similarity index 96% rename from src/whoosh_reloaded/classify.py rename to src/whoosh/classify.py index a450821e..37898c77 100644 --- a/src/whoosh_reloaded/classify.py +++ b/src/whoosh/classify.py @@ -34,7 +34,7 @@ from collections import defaultdict from math import log -from whoosh_reloaded.compat import xrange, iteritems +from whoosh.compat import range, iteritems # Expansion models @@ -106,7 +106,7 @@ class Expander(object): def __init__(self, ixreader, fieldname, model=Bo1Model): """ - :param reader: A :class: whoosh_reloaded.reading.IndexReader object. + :param reader: A :class: whoosh.reading.IndexReader object. :param fieldname: The name of the field in which to search. :param model: (classify.ExpansionModel) The model to use for expanding the query terms. If you omit this parameter, the expander uses @@ -211,7 +211,7 @@ def expanded_terms(self, number, normalize=True): def shingles(input, size=2): d = defaultdict(int) - for shingle in (input[i : i + size] for i in xrange(len(input) - (size - 1))): + for shingle in (input[i : i + size] for i in range(len(input) - (size - 1))): d[shingle] += 1 return iteritems(d) @@ -225,7 +225,7 @@ def simhash(features, hashbits=32): vs = [0] * hashbits for feature, weight in features: h = hashfn(feature) - for i in xrange(hashbits): + for i in range(hashbits): if h & (1 << i): vs[i] += weight else: @@ -307,14 +307,14 @@ def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None): error = 0 # clear old counts and temp centroids - for i in xrange(k): + for i in range(k): counts[i] = 0 c1[i] = 0 - for h in xrange(n): + for h in range(n): # identify the closest cluster min_distance = DOUBLE_MAX - for i in xrange(k): + for i in range(k): distance = (data[h] - c[i]) ** 2 if distance < min_distance: labels[h] = i @@ -326,7 +326,7 @@ def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None): # update standard error error += min_distance - for i in xrange(k): # update all centroids + for i in range(k): # update all centroids c[i] = c1[i] / counts[i] if counts[i] else c1[i] niter += 1 diff --git a/src/whoosh_reloaded/codec/__init__.py b/src/whoosh/codec/__init__.py similarity index 96% rename from src/whoosh_reloaded/codec/__init__.py rename to src/whoosh/codec/__init__.py index 5ec42868..70445636 100644 --- a/src/whoosh_reloaded/codec/__init__.py +++ b/src/whoosh/codec/__init__.py @@ -27,6 +27,6 @@ def default_codec(*args, **kwargs): - from whoosh_reloaded.codec.whoosh_reloaded3 import W3Codec + from whoosh.codec.whoosh3 import W3Codec return W3Codec(*args, **kwargs) diff --git a/src/whoosh_reloaded/codec/base.py b/src/whoosh/codec/base.py similarity index 98% rename from src/whoosh_reloaded/codec/base.py rename to src/whoosh/codec/base.py index 58bbd852..d2cfeebe 100644 --- a/src/whoosh_reloaded/codec/base.py +++ b/src/whoosh/codec/base.py @@ -31,12 +31,12 @@ from bisect import bisect_right -from whoosh_reloaded import columns -from whoosh_reloaded.automata import lev -from whoosh_reloaded.compat import abstractmethod, izip, unichr, xrange -from whoosh_reloaded.filedb.compound import CompoundStorage -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.util import random_name +from whoosh import columns +from whoosh.automata import lev +from whoosh.compat import abstractmethod, izip, unichr, range +from 
whoosh.filedb.compound import CompoundStorage +from whoosh.system import emptybytes +from whoosh.util import random_name # Exceptions @@ -420,7 +420,7 @@ def all_doc_ids(self): is_deleted = self.is_deleted return ( - docnum for docnum in xrange(self.doc_count_all()) if not is_deleted(docnum) + docnum for docnum in range(self.doc_count_all()) if not is_deleted(docnum) ) def iter_docs(self): diff --git a/src/whoosh_reloaded/codec/memory.py b/src/whoosh/codec/memory.py similarity index 96% rename from src/whoosh_reloaded/codec/memory.py rename to src/whoosh/codec/memory.py index 31a0f6bc..312befc0 100644 --- a/src/whoosh_reloaded/codec/memory.py +++ b/src/whoosh/codec/memory.py @@ -29,11 +29,11 @@ from bisect import bisect_left from threading import Lock -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.codec import base -from whoosh_reloaded.matching import ListMatcher -from whoosh_reloaded.reading import SegmentReader, TermInfo, TermNotFound -from whoosh_reloaded.writing import SegmentWriter +from whoosh.compat import range +from whoosh.codec import base +from whoosh.matching import ListMatcher +from whoosh.reading import SegmentReader, TermInfo, TermNotFound +from whoosh.writing import SegmentWriter class MemWriter(SegmentWriter): @@ -43,7 +43,7 @@ def commit(self): class MemoryCodec(base.Codec): def __init__(self): - from whoosh_reloaded.filedb.filestore import RamStorage + from whoosh.filedb.filestore import RamStorage self.storage = RamStorage() self.segment = MemSegment(self, "blah") @@ -273,7 +273,7 @@ def terms_from(self, fieldname, prefix): if not terms: return start = bisect_left(terms, prefix) - for i in xrange(start, len(terms)): + for i in range(start, len(terms)): yield (fieldname, terms[i]) def term_info(self, fieldname, text): @@ -332,7 +332,7 @@ def is_deleted(self, docnum): def deleted_docs(self): stored = self._stored - for docnum in xrange(self.doc_count_all()): + for docnum in range(self.doc_count_all()): if docnum not in stored: yield docnum diff --git a/src/whoosh_reloaded/codec/plaintext.py b/src/whoosh/codec/plaintext.py similarity index 97% rename from src/whoosh_reloaded/codec/plaintext.py rename to src/whoosh/codec/plaintext.py index 99baf425..6c547368 100644 --- a/src/whoosh_reloaded/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -27,11 +27,11 @@ from ast import literal_eval -from whoosh_reloaded.compat import b, bytes_type, text_type, integer_types, PY3 -from whoosh_reloaded.compat import iteritems, dumps, loads, xrange -from whoosh_reloaded.codec import base -from whoosh_reloaded.matching import ListMatcher -from whoosh_reloaded.reading import TermInfo, TermNotFound +from whoosh.compat import b, bytes_type, text_type, integer_types, PY3 +from whoosh.compat import iteritems, dumps, loads, range +from whoosh.codec import base +from whoosh.matching import ListMatcher +from whoosh.reading import TermInfo, TermNotFound if not PY3: @@ -106,7 +106,7 @@ def _parse_line(self, line): parts = line.split("\t") command = parts[0] args = {} - for i in xrange(1, len(parts)): + for i in range(1, len(parts)): n, v = parts[i].split("=") args[n] = literal_eval(v) return (indent, command, args) diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py new file mode 100644 index 00000000..ffcf3f20 --- /dev/null +++ b/src/whoosh/codec/whoosh2.py @@ -0,0 +1,2224 @@ +# Copyright 2011 Matt Chaput. All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+import struct, sys
+from array import array
+from binascii import crc32
+from collections import defaultdict
+from decimal import Decimal
+from hashlib import md5  # type: ignore @UnresolvedImport
+from struct import Struct
+
+try:
+    import zlib
+except ImportError:
+    zlib = None
+
+from whoosh.compat import b, PY3
+from whoosh.compat import loads, dumps
+from whoosh.compat import range, iteritems
+from whoosh.compat import bytes_type, text_type, string_type, integer_types
+from whoosh.compat import array_frombytes, array_tobytes
+from whoosh.codec import base
+from whoosh.filedb.filestore import Storage
+from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher
+from whoosh.reading import NoGraphError, TermInfo, TermNotFound
+from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE, IS_LITTLE
+from whoosh.system import emptybytes
+from whoosh.system import pack_byte
+from whoosh.system import pack_ushort, unpack_ushort, pack_long, unpack_long
+
+from whoosh.automata.fst import GraphWriter, GraphReader
+from whoosh.util.numeric import byte_to_length, length_to_byte
+from whoosh.util.numeric import to_sortable, from_sortable, NaN
+from whoosh.util.numlists import GrowableArray
+from whoosh.util.text import utf8encode, utf8decode
+from whoosh.util.times import datetime_to_long, long_to_datetime
+
+
+# Old hash file implementations
+
+_4GB = 4 * 1024 * 1024 * 1024
+
+
+def cdb_hash(key):
+    h = 5381
+    for c in key:
+        # Iterating bytes yields ints on Python 3 but 1-char strings on
+        # Python 2; normalize before mixing into the hash
+        if not isinstance(c, int):
+            c = ord(c)
+        h = (h + (h << 5)) & 0xffffffff ^ c
+    return h
+
+
+def md5_hash(key):
+    return int(md5(key).hexdigest(), 16) & 0xffffffff
+
+
+def crc_hash(key):
+    return crc32(key) & 0xffffffff
+
+
+hash_functions = (hash, cdb_hash, md5_hash, crc_hash)
+
+_header_entry_struct = struct.Struct("!qI")  # Position, number of slots
+header_entry_size = _header_entry_struct.size
+pack_header_entry = _header_entry_struct.pack
+unpack_header_entry = _header_entry_struct.unpack
+
+_lengths_struct = struct.Struct("!II")  # Length of key, length of data
+lengths_size = _lengths_struct.size
+pack_lengths = _lengths_struct.pack
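+# (Each record in the file body is written by HashWriter.add() below as
+# <key length><data length><key bytes><data bytes>; the two uint lengths
+# are packed with pack_lengths and read back with unpack_lengths.)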
+unpack_lengths = _lengths_struct.unpack + +_pointer_struct = struct.Struct("!Iq") # Hash value, position +pointer_size = _pointer_struct.size +pack_pointer = _pointer_struct.pack +unpack_pointer = _pointer_struct.unpack + + +# Table classes + +class HashWriter(object): + def __init__(self, dbfile, hashtype=2): + self.dbfile = dbfile + self.hashtype = hashtype + self.extras = {} + + self.startoffset = dbfile.tell() + dbfile.write(b("HASH")) # Magic tag + dbfile.write_byte(self.hashtype) # Identify hashing function used + dbfile.write(b("\x00\x00\x00")) # Unused bytes + dbfile.write_long(0) # Pointer to end of hashes + + self.header_size = 16 + 256 * header_entry_size + self.hash_func = hash_functions[self.hashtype] + + # Seek past the first "header_size" bytes of the file... we'll come + # back here to write the header later + dbfile.seek(self.header_size) + # Store the directory of hashed values + self.hashes = defaultdict(list) + + def add(self, key, value): + assert isinstance(key, bytes_type) + assert isinstance(value, bytes_type) + + dbfile = self.dbfile + pos = dbfile.tell() + dbfile.write(pack_lengths(len(key), len(value))) + dbfile.write(key) + dbfile.write(value) + + h = self.hash_func(key) + self.hashes[h & 255].append((h, pos)) + + def add_all(self, items): + add = self.add + for key, value in items: + add(key, value) + + def _write_hashes(self): + dbfile = self.dbfile + hashes = self.hashes + directory = self.directory = [] + + pos = dbfile.tell() + for i in range(0, 256): + entries = hashes[i] + numslots = 2 * len(entries) + directory.append((pos, numslots)) + + null = (0, 0) + hashtable = [null] * numslots + for hashval, position in entries: + n = (hashval >> 8) % numslots + while hashtable[n] != null: + n = (n + 1) % numslots + hashtable[n] = (hashval, position) + + write = dbfile.write + for hashval, position in hashtable: + write(pack_pointer(hashval, position)) + pos += pointer_size + + dbfile.flush() + self.extrasoffset = dbfile.tell() + + def _write_extras(self): + self.dbfile.write_pickle(self.extras) + # Seek back and write the pointer to the extras + self.dbfile.flush() + self.dbfile.seek(self.startoffset + 8) + self.dbfile.write_long(self.extrasoffset) + + def _write_directory(self): + dbfile = self.dbfile + directory = self.directory + + # Seek back to the header + dbfile.seek(self.startoffset + 8) + # Write the pointer to the end of the hashes + dbfile.write_long(self.extrasoffset) + # Write the pointers to the hash tables + for position, numslots in directory: + dbfile.write(pack_header_entry(position, numslots)) + + dbfile.flush() + assert dbfile.tell() == self.header_size + + def close(self): + self._write_hashes() + self._write_extras() + self._write_directory() + self.dbfile.close() + + +class HashReader(object): + def __init__(self, dbfile, startoffset=0): + self.dbfile = dbfile + self.startoffset = startoffset + self.is_closed = False + + dbfile.seek(startoffset) + # Check magic tag + magic = dbfile.read(4) + if magic != b("HASH"): + raise Exception("Unknown file header %r" % magic) + + self.hashtype = dbfile.read_byte() # Hash function type + self.hash_func = hash_functions[self.hashtype] + + dbfile.read(3) # Unused + self.extrasoffset = dbfile.read_long() # Pointer to end of hashes + + self.header_size = 16 + 256 * header_entry_size + assert self.extrasoffset >= self.header_size + + # Read pointers to hash tables + self.buckets = [] + for _ in range(256): + he = unpack_header_entry(dbfile.read(header_entry_size)) + self.buckets.append(he) + 
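        # The position of the first hash table doubles as the end of the
+        # key/value record area; _ranges() below relies on this. +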
self._start_of_hashes = self.buckets[0][0] + + dbfile.seek(self.extrasoffset) + self._read_extras() + + def _read_extras(self): + try: + self.extras = self.dbfile.read_pickle() + except EOFError: + self.extras = {} + + def close(self): + if self.is_closed: + raise Exception("Tried to close %r twice" % self) + self.dbfile.close() + self.is_closed = True + + def read(self, position, length): + self.dbfile.seek(position) + return self.dbfile.read(length) + + def _ranges(self, pos=None): + if pos is None: + pos = self.header_size + eod = self._start_of_hashes + read = self.read + while pos < eod: + keylen, datalen = unpack_lengths(read(pos, lengths_size)) + keypos = pos + lengths_size + datapos = pos + lengths_size + keylen + pos = datapos + datalen + yield (keypos, keylen, datapos, datalen) + + def __iter__(self): + return iter(self.items()) + + def items(self): + read = self.read + for keypos, keylen, datapos, datalen in self._ranges(): + key = read(keypos, keylen) + value = read(datapos, datalen) + yield (key, value) + + def keys(self): + read = self.read + for keypos, keylen, _, _ in self._ranges(): + yield read(keypos, keylen) + + def values(self): + read = self.read + for _, _, datapos, datalen in self._ranges(): + yield read(datapos, datalen) + + def __getitem__(self, key): + for data in self.all(key): + return data + raise KeyError(key) + + def get(self, key, default=None): + for data in self.all(key): + return data + return default + + def all(self, key): + read = self.read + for datapos, datalen in self.ranges_for_key(key): + yield read(datapos, datalen) + + def __contains__(self, key): + for _ in self.ranges_for_key(key): + return True + return False + + def _hashtable_info(self, keyhash): + # Return (directory_position, number_of_hash_entries) + return self.buckets[keyhash & 255] + + def _key_position(self, key): + keyhash = self.hash_func(key) + hpos, hslots = self._hashtable_info(keyhash) + if not hslots: + raise KeyError(key) + slotpos = hpos + (((keyhash >> 8) % hslots) * header_entry_size) + + return self.dbfile.get_long(slotpos + _INT_SIZE) + + def _key_at(self, pos): + keylen = self.dbfile.get_uint(pos) + return self.read(pos + lengths_size, keylen) + + def ranges_for_key(self, key): + read = self.read + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + keyhash = self.hash_func(key) + hpos, hslots = self._hashtable_info(keyhash) + if not hslots: + return + + slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size) + for _ in range(hslots): + slothash, pos = unpack_pointer(read(slotpos, pointer_size)) + if not pos: + return + + slotpos += pointer_size + # If we reach the end of the hashtable, wrap around + if slotpos == hpos + (hslots * pointer_size): + slotpos = hpos + + if slothash == keyhash: + keylen, datalen = unpack_lengths(read(pos, lengths_size)) + if keylen == len(key): + if key == read(pos + lengths_size, keylen): + yield (pos + lengths_size + keylen, datalen) + + def range_for_key(self, key): + for item in self.ranges_for_key(key): + return item + raise KeyError(key) + + +class OrderedHashWriter(HashWriter): + def __init__(self, dbfile): + HashWriter.__init__(self, dbfile) + self.index = GrowableArray("H") + self.lastkey = emptybytes + + def add(self, key, value): + if key <= self.lastkey: + raise ValueError("Keys must increase: %r..%r" + % (self.lastkey, key)) + self.index.append(self.dbfile.tell()) + HashWriter.add(self, key, value) + self.lastkey = key + + def _write_extras(self): + dbfile = self.dbfile + + # Save 
information about the index in the extras + ndxarray = self.index + self.extras["indexbase"] = dbfile.tell() + self.extras["indextype"] = ndxarray.typecode + self.extras["indexlen"] = len(ndxarray) + # Write key index + ndxarray.to_file(dbfile) + + # Call the super method to write the extras + self.extrasoffset = dbfile.tell() + HashWriter._write_extras(self) + + +class OrderedHashReader(HashReader): + def __init__(self, dbfile): + HashReader.__init__(self, dbfile) + self.indexbase = self.extras["indexbase"] + self.indexlen = self.extras["indexlen"] + + self.indextype = indextype = self.extras["indextype"] + self._ixsize = struct.calcsize(indextype) + if indextype == "B": + self._ixpos = dbfile.get_byte + elif indextype == "H": + self._ixpos = dbfile.get_ushort + elif indextype == "i": + self._ixpos = dbfile.get_int + elif indextype == "I": + self._ixpos = dbfile.get_uint + elif indextype == "q": + self._ixpos = dbfile.get_long + else: + raise Exception("Unknown index type %r" % indextype) + + def _closest_key(self, key): + key_at = self._key_at + indexbase = self.indexbase + ixpos, ixsize = self._ixpos, self._ixsize + + lo = 0 + hi = self.indexlen + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + while lo < hi: + mid = (lo + hi) // 2 + midkey = key_at(ixpos(indexbase + mid * ixsize)) + if midkey < key: + lo = mid + 1 + else: + hi = mid + #i = max(0, mid - 1) + if lo == self.indexlen: + return None + return ixpos(indexbase + lo * ixsize) + + def closest_key(self, key): + pos = self._closest_key(key) + if pos is None: + return None + return self._key_at(pos) + + def _ranges_from(self, key): + #read = self.read + pos = self._closest_key(key) + if pos is None: + return + + for x in self._ranges(pos=pos): + yield x + + def items_from(self, key): + read = self.read + for keypos, keylen, datapos, datalen in self._ranges_from(key): + yield (read(keypos, keylen), read(datapos, datalen)) + + def keys_from(self, key): + read = self.read + for keypos, keylen, _, _ in self._ranges_from(key): + yield read(keypos, keylen) + + +# Standard codec top-level object + +class W2Codec(base.Codec): + TERMS_EXT = ".trm" # Term index + POSTS_EXT = ".pst" # Term postings + DAWG_EXT = FST_EXT = ".dag" # Spelling graph file + LENGTHS_EXT = ".fln" # Field lengths file + VECTOR_EXT = ".vec" # Vector index + VPOSTS_EXT = ".vps" # Vector postings + STORED_EXT = ".sto" # Stored fields file + + def __init__(self, blocklimit=128, compression=3, loadlengths=False, + inlinelimit=1): + self.blocklimit = blocklimit + self.compression = compression + self.loadlengths = loadlengths + self.inlinelimit = inlinelimit + + # Per-document value writer + def per_document_writer(self, storage, segment): + return W2PerDocWriter(storage, segment, blocklimit=self.blocklimit, + compression=self.compression) + + # Inverted index writer + def field_writer(self, storage, segment): + return W2FieldWriter(storage, segment, blocklimit=self.blocklimit, + compression=self.compression, + inlinelimit=self.inlinelimit) + + # Readers + + def terms_reader(self, storage, segment): + tifile = segment.open_file(storage, self.TERMS_EXT) + postfile = segment.open_file(storage, self.POSTS_EXT) + return W2TermsReader(tifile, postfile) + + def per_document_reader(self, storage, segment): + return W2PerDocReader(storage, segment) + + def graph_reader(self, storage, segment): + try: + dawgfile = segment.open_file(storage, self.DAWG_EXT) + except: + raise NoGraphError + return GraphReader(dawgfile) + + # Segments and 
generations + + def new_segment(self, storage, indexname): + return W2Segment(indexname) + + +# Per-document value writer + +class W2PerDocWriter(base.PerDocumentWriter): + def __init__(self, storage, segment, blocklimit=128, compression=3): + if not isinstance(blocklimit, int): + raise ValueError + self.storage = storage + self.segment = segment + self.blocklimit = blocklimit + self.compression = compression + self.doccount = 0 + self.is_closed = False + + sffile = segment.create_file(storage, W2Codec.STORED_EXT) + self.stored = StoredFieldWriter(sffile) + self.storedfields = None + + self.lengths = InMemoryLengths() + + # We'll wait to create the vector files until someone actually tries + # to add a vector + self.vindex = self.vpostfile = None + + def _make_vector_files(self): + vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT) + self.vindex = VectorWriter(vifile) + self.vpostfile = self.segment.create_file(self.storage, + W2Codec.VPOSTS_EXT) + + def start_doc(self, docnum): + self.docnum = docnum + self.storedfields = {} + self.doccount = max(self.doccount, docnum + 1) + + def add_field(self, fieldname, fieldobj, value, length): + if length: + self.lengths.add(self.docnum, fieldname, length) + if value is not None: + self.storedfields[fieldname] = value + + def _new_block(self, vformat): + postingsize = vformat.posting_size + return W2Block(postingsize, stringids=True) + + def add_vector_items(self, fieldname, fieldobj, items): + if self.vindex is None: + self._make_vector_files() + + # items = (text, weight, value_bytes) ... + postfile = self.vpostfile + blocklimit = self.blocklimit + block = self._new_block(fieldobj.vector) + + startoffset = postfile.tell() + postfile.write(block.magic) # Magic number + blockcount = 0 + postfile.write_uint(0) # Placeholder for block count + + countdown = blocklimit + for text, weight, valuestring in items: + block.add(text, weight, valuestring) + countdown -= 1 + if countdown == 0: + block.to_file(postfile, compression=self.compression) + block = self._new_block(fieldobj.vector) + blockcount += 1 + countdown = blocklimit + # If there are leftover items in the current block, write them out + if block: + block.to_file(postfile, compression=self.compression) + blockcount += 1 + + # Seek back to the start of this list of posting blocks and write the + # number of blocks + postfile.flush() + here = postfile.tell() + postfile.seek(startoffset + 4) + postfile.write_uint(blockcount) + postfile.seek(here) + + # Add to the index + self.vindex.add((self.docnum, fieldname), startoffset) + + def finish_doc(self): + self.stored.add(self.storedfields) + self.storedfields = None + + def close(self): + if self.storedfields is not None: + self.stored.add(self.storedfields) + self.stored.close() + flfile = self.segment.create_file(self.storage, W2Codec.LENGTHS_EXT) + self.lengths.to_file(flfile, self.doccount) + if self.vindex: + self.vindex.close() + self.vpostfile.close() + self.is_closed = True + + +# Inverted index writer + +class W2FieldWriter(base.FieldWriter): + def __init__(self, storage, segment, blocklimit=128, compression=3, + inlinelimit=1): + assert isinstance(storage, Storage) + assert isinstance(segment, base.Segment) + assert isinstance(blocklimit, int) + assert isinstance(compression, int) + assert isinstance(inlinelimit, int) + + self.storage = storage + self.segment = segment + self.fieldname = None + self.text = None + self.field = None + self.format = None + self.spelling = False + + tifile = segment.create_file(storage, 
W2Codec.TERMS_EXT) + self.termsindex = TermIndexWriter(tifile) + self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT) + + # We'll wait to create the DAWG builder until someone actually adds + # a spelled field + self.dawg = None + + self.blocklimit = blocklimit + self.compression = compression + self.inlinelimit = inlinelimit + self.block = None + self.terminfo = None + self._infield = False + self.is_closed = False + + def _make_dawg_files(self): + dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT) + self.dawg = GraphWriter(dawgfile) + + def _new_block(self): + return W2Block(self.format.posting_size) + + def _reset_block(self): + self.block = self._new_block() + + def _write_block(self): + self.terminfo.add_block(self.block) + self.block.to_file(self.postfile, compression=self.compression) + self._reset_block() + self.blockcount += 1 + + def _start_blocklist(self): + postfile = self.postfile + self._reset_block() + + # Magic number + self.startoffset = postfile.tell() + postfile.write(W2Block.magic) + # Placeholder for block count + self.blockcount = 0 + postfile.write_uint(0) + + def start_field(self, fieldname, fieldobj): + self.fieldname = fieldname + self.field = fieldobj + self.format = fieldobj.format + self.spelling = fieldobj.spelling and not fieldobj.separate_spelling() + self._dawgfield = False + if self.spelling or fieldobj.separate_spelling(): + if self.dawg is None: + self._make_dawg_files() + self.dawg.start_field(fieldname) + self._dawgfield = True + self._infield = True + + def start_term(self, text): + if self.block is not None: + raise Exception("Called start_term in a block") + self.text = text + self.terminfo = FileTermInfo() + if self.spelling: + self.dawg.insert(text.decode("utf-8")) # TODO: how to decode bytes? 
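+        # _start_blocklist() writes this term's posting-list header: the
+        # block magic plus a zero placeholder that finish_term() later
+        # overwrites with the real block count.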
+ self._start_blocklist() + + def add(self, docnum, weight, valuestring, length): + self.block.add(docnum, weight, valuestring, length) + if len(self.block) > self.blocklimit: + self._write_block() + + def add_spell_word(self, fieldname, text): + if self.dawg is None: + self._make_dawg_files() + self.dawg.insert(text) + + def finish_term(self): + block = self.block + if block is None: + raise Exception("Called finish_term when not in a block") + + terminfo = self.terminfo + if self.blockcount < 1 and block and len(block) < self.inlinelimit: + # Inline the single block + terminfo.add_block(block) + vals = None if not block.values else tuple(block.values) + postings = (tuple(block.ids), tuple(block.weights), vals) + else: + if block: + # Write the current unfinished block to disk + self._write_block() + + # Seek back to the start of this list of posting blocks and write + # the number of blocks + postfile = self.postfile + postfile.flush() + here = postfile.tell() + postfile.seek(self.startoffset + 4) + postfile.write_uint(self.blockcount) + postfile.seek(here) + + self.block = None + postings = self.startoffset + + self.block = None + terminfo.postings = postings + self.termsindex.add((self.fieldname, self.text), terminfo) + + def finish_field(self): + if not self._infield: + raise Exception("Called finish_field before start_field") + self._infield = False + + if self._dawgfield: + self.dawg.finish_field() + self._dawgfield = False + + def close(self): + self.termsindex.close() + self.postfile.close() + if self.dawg is not None: + self.dawg.close() + self.is_closed = True + + +# Matcher + +class W2LeafMatcher(LeafMatcher): + def __init__(self, postfile, startoffset, fmt, scorer=None, term=None, + stringids=False): + self.postfile = postfile + self.startoffset = startoffset + self.format = fmt + self.scorer = scorer + self._term = term + self.stringids = stringids + + postfile.seek(startoffset) + magic = postfile.read(4) + assert magic == W2Block.magic + self.blockclass = W2Block + + self.blockcount = postfile.read_uint() + self.baseoffset = postfile.tell() + + self._active = True + self.currentblock = -1 + self._next_block() + + def id(self): + return self.block.ids[self.i] + + def is_active(self): + return self._active + + def weight(self): + weights = self.block.weights + if not weights: + weights = self.block.read_weights() + return weights[self.i] + + def value(self): + values = self.block.values + if values is None: + values = self.block.read_values() + return values[self.i] + + def all_ids(self): + nextoffset = self.baseoffset + for _ in range(self.blockcount): + block = self._read_block(nextoffset) + nextoffset = block.nextoffset + ids = block.read_ids() + for id in ids: + yield id + + def next(self): + if self.i == self.block.count - 1: + self._next_block() + return True + else: + self.i += 1 + return False + + def skip_to(self, id): + if not self.is_active(): + raise ReadTooFar + + i = self.i + # If we're already in the block with the target ID, do nothing + if id <= self.block.ids[i]: + return + + # Skip to the block that would contain the target ID + if id > self.block.maxid: + self._skip_to_block(lambda: id > self.block.maxid) + if not self.is_active(): + return + + # Iterate through the IDs in the block until we find or pass the + # target + ids = self.block.ids + i = self.i + while ids[i] < id: + i += 1 + if i == len(ids): + self._active = False + return + self.i = i + + def skip_to_quality(self, minquality): + bq = self.block_quality + if bq() > minquality: + return 0 + 
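        # Otherwise, skip ahead a block at a time until we find one whose
+        # quality beats the minimum, returning the number of blocks skipped. +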
return self._skip_to_block(lambda: bq() <= minquality) + + def block_min_length(self): + return self.block.min_length() + + def block_max_length(self): + return self.block.max_length() + + def block_max_weight(self): + return self.block.max_weight() + + def block_max_wol(self): + return self.block.max_wol() + + def _read_block(self, offset): + pf = self.postfile + pf.seek(offset) + return self.blockclass.from_file(pf, self.format.posting_size, + stringids=self.stringids) + + def _consume_block(self): + self.block.read_ids() + self.block.read_weights() + self.i = 0 + + def _next_block(self, consume=True): + if not (self.currentblock < self.blockcount): + raise Exception("No next block") + + self.currentblock += 1 + if self.currentblock == self.blockcount: + self._active = False + return + + if self.currentblock == 0: + pos = self.baseoffset + else: + pos = self.block.nextoffset + + self.block = self._read_block(pos) + if consume: + self._consume_block() + + def _skip_to_block(self, targetfn): + skipped = 0 + while self._active and targetfn(): + self._next_block(consume=False) + skipped += 1 + + if self._active: + self._consume_block() + + return skipped + + +# Tables + +# Writers + +class TermIndexWriter(HashWriter): + def __init__(self, dbfile): + HashWriter.__init__(self, dbfile) + self.index = [] + self.fieldcounter = 0 + self.fieldmap = {} + + def keycoder(self, term): + # Encode term + fieldmap = self.fieldmap + fieldname, text = term + + if fieldname in fieldmap: + fieldnum = fieldmap[fieldname] + else: + fieldnum = self.fieldcounter + fieldmap[fieldname] = fieldnum + self.fieldcounter += 1 + + key = pack_ushort(fieldnum) + text + return key + + def valuecoder(self, terminfo): + return terminfo.to_string() + + def add(self, key, value): + pos = self.dbfile.tell() + self.index.append(pos) + HashWriter.add(self, self.keycoder(key), self.valuecoder(value)) + + def _write_extras(self): + dbfile = self.dbfile + dbfile.write_uint(len(self.index)) + for n in self.index: + dbfile.write_long(n) + dbfile.write_pickle(self.fieldmap) + + +class VectorWriter(TermIndexWriter): + def keycoder(self, key): + fieldmap = self.fieldmap + docnum, fieldname = key + + if fieldname in fieldmap: + fieldnum = fieldmap[fieldname] + else: + fieldnum = self.fieldcounter + fieldmap[fieldname] = fieldnum + self.fieldcounter += 1 + + return _vectorkey_struct.pack(docnum, fieldnum) + + def valuecoder(self, offset): + return pack_long(offset) + + +# Readers + +class PostingIndexBase(HashReader): + def __init__(self, dbfile, postfile): + HashReader.__init__(self, dbfile) + self.postfile = postfile + + def _read_extras(self): + dbfile = self.dbfile + + self.length = dbfile.read_uint() + self.indexbase = dbfile.tell() + + dbfile.seek(self.indexbase + self.length * _LONG_SIZE) + self.fieldmap = dbfile.read_pickle() + self.names = [None] * len(self.fieldmap) + for name, num in iteritems(self.fieldmap): + self.names[num] = name + + def _closest_key(self, key): + dbfile = self.dbfile + key_at = self._key_at + indexbase = self.indexbase + lo = 0 + hi = self.length + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + while lo < hi: + mid = (lo + hi) // 2 + midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE)) + if midkey < key: + lo = mid + 1 + else: + hi = mid + #i = max(0, mid - 1) + if lo == self.length: + return None + return dbfile.get_long(indexbase + lo * _LONG_SIZE) + + def closest_key(self, key): + pos = self._closest_key(key) + if pos is None: + return None + return 
self._key_at(pos) + + def _ranges_from(self, key): + #read = self.read + pos = self._closest_key(key) + if pos is None: + return + + for x in self._ranges(pos=pos): + yield x + + def __getitem__(self, key): + k = self.keycoder(key) + return self.valuedecoder(HashReader.__getitem__(self, k)) + + def __contains__(self, key): + try: + codedkey = self.keycoder(key) + except KeyError: + return False + return HashReader.__contains__(self, codedkey) + + def range_for_key(self, key): + return HashReader.range_for_key(self, self.keycoder(key)) + + def get(self, key, default=None): + k = self.keycoder(key) + return self.valuedecoder(HashReader.get(self, k, default)) + + def keys(self): + kd = self.keydecoder + for k in HashReader.keys(self): + yield kd(k) + + def items(self): + kd = self.keydecoder + vd = self.valuedecoder + for key, value in HashReader.items(self): + yield (kd(key), vd(value)) + + def terms_from(self, fieldname, prefix): + return self.keys_from((fieldname, prefix)) + + def keys_from(self, key): + key = self.keycoder(key) + kd = self.keydecoder + read = self.read + for keypos, keylen, _, _ in self._ranges_from(key): + yield kd(read(keypos, keylen)) + + def items_from(self, fieldname, prefix): + read = self.read + key = self.keycoder((fieldname, prefix)) + kd = self.keydecoder + vd = self.valuedecoder + for keypos, keylen, datapos, datalen in self._ranges_from(key): + yield (kd(read(keypos, keylen)), vd(read(datapos, datalen))) + + def values(self): + vd = self.valuedecoder + for v in HashReader.values(self): + yield vd(v) + + def close(self): + HashReader.close(self) + self.postfile.close() + + +class W2TermsReader(PostingIndexBase): + # Implements whoosh.codec.base.TermsReader + + def terms(self): + return self.keys() + + def term_info(self, fieldname, text): + return self[fieldname, text] + + def matcher(self, fieldname, text, format_, scorer=None): + # Note this does not filter out deleted documents; a higher level is + # expected to wrap this matcher to eliminate deleted docs + pf = self.postfile + + term = (fieldname, text) + try: + terminfo = self[term] + except KeyError: + raise TermNotFound("No term %s:%r" % (fieldname, text)) + + p = terminfo.postings + if isinstance(p, integer_types): + # terminfo.postings is an offset into the posting file + pr = W2LeafMatcher(pf, p, format_, scorer=scorer, term=term) + else: + # terminfo.postings is an inlined tuple of (ids, weights, values) + docids, weights, values = p + pr = ListMatcher(docids, weights, values, format_, scorer=scorer, + term=term) + return pr + + def keycoder(self, key): + fieldname, tbytes = key + fnum = self.fieldmap.get(fieldname, 65535) + return pack_ushort(fnum) + tbytes + + def keydecoder(self, v): + assert isinstance(v, bytes_type) + return (self.names[unpack_ushort(v[:2])[0]], v[2:]) + + def valuedecoder(self, v): + assert isinstance(v, bytes_type) + return FileTermInfo.from_string(v) + + def frequency(self, fieldname, btext): + assert isinstance(btext, bytes_type) + datapos = self.range_for_key((fieldname, btext))[0] + return FileTermInfo.read_weight(self.dbfile, datapos) + + def doc_frequency(self, fieldname, btext): + assert isinstance(btext, bytes_type) + datapos = self.range_for_key((fieldname, btext))[0] + return FileTermInfo.read_doc_freq(self.dbfile, datapos) + + +# docnum, fieldnum +_vectorkey_struct = Struct("!IH") + + +class W2VectorReader(PostingIndexBase): + # Implements whoosh.codec.base.VectorReader + + def matcher(self, docnum, fieldname, format_): + pf = self.postfile + offset = self[(docnum, 
fieldname)] + pr = W2LeafMatcher(pf, offset, format_, stringids=True) + return pr + + def keycoder(self, key): + return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]]) + + def keydecoder(self, v): + docnum, fieldnum = _vectorkey_struct.unpack(v) + return (docnum, self.names[fieldnum]) + + def valuedecoder(self, v): + return unpack_long(v)[0] + + +class W2PerDocReader(base.PerDocumentReader): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + self._doccount = segment.doc_count_all() + + flfile = segment.open_file(storage, W2Codec.LENGTHS_EXT) + self._lengths = InMemoryLengths.from_file(flfile, self._doccount) + + sffile = segment.open_file(storage, W2Codec.STORED_EXT) + self._stored = StoredFieldReader(sffile) + + self._vectors = None # Lazy load + + def supports_columns(self): + return False + + def close(self): + self._lengths.close() + if self._vectors: + self._vectors.close() + self._stored.close() + + def doc_count(self): + return self._segment.doc_count() + + def doc_count_all(self): + return self._doccount + + def has_deletions(self): + return self._segment.has_deletions() + + def is_deleted(self, docnum): + return self._segment.is_deleted(docnum) + + def deleted_docs(self): + return self._segment.deleted_docs() + + # Lengths + + def doc_field_length(self, docnum, fieldname, default=0): + return self._lengths.doc_field_length(docnum, fieldname, default) + + def field_length(self, fieldname): + return self._lengths.field_length(fieldname) + + def min_field_length(self, fieldname): + return self._lengths.min_field_length(fieldname) + + def max_field_length(self, fieldname): + return self._lengths.max_field_length(fieldname) + + # Vectors + + def _prep_vectors(self): + vifile = self._segment.open_file(self._storage, W2Codec.VECTOR_EXT) + vpostfile = self._segment.open_file(self._storage, W2Codec.VPOSTS_EXT) + self._vectors = W2VectorReader(vifile, vpostfile) + + def has_vector(self, docnum, fieldname): + if self._vectors is None: + try: + self._prep_vectors() + except (NameError, IOError): + return False + return (docnum, fieldname) in self._vectors + + def vector(self, docnum, fieldname, format_): + if self._vectors is None: + self._prep_vectors() + return self._vectors.matcher(docnum, fieldname, format_) + + # Stored + + def stored_fields(self, docnum): + return self._stored[docnum] + + +# Single-byte field lengths implementations + +class ByteLengthsBase(object): + magic = b("~LN1") + + def __init__(self): + self.starts = {} + self.totals = {} + self.minlens = {} + self.maxlens = {} + + def _read_header(self, dbfile, doccount): + first = dbfile.read(4) # Magic + assert first == self.magic + version = dbfile.read_int() # Version number + assert version == 1 + + self._count = dbfile.read_uint() # Number of documents saved + + fieldcount = dbfile.read_ushort() # Number of fields + # Read per-field info + for i in range(fieldcount): + fieldname = dbfile.read_string().decode('utf-8') + self.totals[fieldname] = dbfile.read_long() + self.minlens[fieldname] = byte_to_length(dbfile.read_byte()) + self.maxlens[fieldname] = byte_to_length(dbfile.read_byte()) + self.starts[fieldname] = i * doccount + + # Add header length to per-field offsets + eoh = dbfile.tell() # End of header + for fieldname in self.starts: + self.starts[fieldname] += eoh + + def doc_count_all(self): + return self._count + + def field_length(self, fieldname): + return self.totals.get(fieldname, 0) + + def min_field_length(self, fieldname): + return self.minlens.get(fieldname, 
0) + + def max_field_length(self, fieldname): + return self.maxlens.get(fieldname, 0) + + +class InMemoryLengths(ByteLengthsBase): + def __init__(self): + ByteLengthsBase.__init__(self) + self.totals = defaultdict(int) + self.lengths = {} + self._count = 0 + + def close(self): + pass + + # IO + + def to_file(self, dbfile, doccount): + self._pad_arrays(doccount) + fieldnames = list(self.lengths.keys()) + + dbfile.write(self.magic) + dbfile.write_int(1) # Format version number + dbfile.write_uint(doccount) # Number of documents + dbfile.write_ushort(len(self.lengths)) # Number of fields + + # Write per-field info + for fieldname in fieldnames: + dbfile.write_string(fieldname.encode('utf-8')) # Fieldname + dbfile.write_long(self.field_length(fieldname)) + dbfile.write_byte(length_to_byte(self.min_field_length(fieldname))) + dbfile.write_byte(length_to_byte(self.max_field_length(fieldname))) + + # Write byte arrays + for fieldname in fieldnames: + dbfile.write_array(self.lengths[fieldname]) + dbfile.close() + + @classmethod + def from_file(cls, dbfile, doccount=None): + obj = cls() + obj._read_header(dbfile, doccount) + for fieldname, start in iteritems(obj.starts): + obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count) + dbfile.close() + return obj + + # Get + + def doc_field_length(self, docnum, fieldname, default=0): + try: + arry = self.lengths[fieldname] + except KeyError: + return default + if docnum >= len(arry): + return default + return byte_to_length(arry[docnum]) + + # Min/max cache setup -- not meant to be called while adding + + def _minmax(self, fieldname, op, cache): + if fieldname in cache: + return cache[fieldname] + else: + ls = self.lengths[fieldname] + if ls: + result = byte_to_length(op(ls)) + else: + result = 0 + cache[fieldname] = result + return result + + def min_field_length(self, fieldname): + return self._minmax(fieldname, min, self.minlens) + + def max_field_length(self, fieldname): + return self._minmax(fieldname, max, self.maxlens) + + # Add + + def _create_field(self, fieldname, docnum): + dc = max(self._count, docnum + 1) + self.lengths[fieldname] = array("B", (0 for _ in range(dc))) + self._count = dc + + def _pad_arrays(self, doccount): + # Pad out arrays to full length + for fieldname in self.lengths.keys(): + arry = self.lengths[fieldname] + if len(arry) < doccount: + for _ in range(doccount - len(arry)): + arry.append(0) + self._count = doccount + + def add(self, docnum, fieldname, length): + lengths = self.lengths + if length: + if fieldname not in lengths: + self._create_field(fieldname, docnum) + + arry = self.lengths[fieldname] + count = docnum + 1 + if len(arry) < count: + for _ in range(count - len(arry)): + arry.append(0) + if count > self._count: + self._count = count + byte = length_to_byte(length) + arry[docnum] = byte + self.totals[fieldname] += length + + def add_other(self, other): + lengths = self.lengths + totals = self.totals + doccount = self._count + for fname in other.lengths: + if fname not in lengths: + lengths[fname] = array("B") + self._pad_arrays(doccount) + + for fname in other.lengths: + lengths[fname].extend(other.lengths[fname]) + self._count = doccount + other._count + self._pad_arrays(self._count) + + for fname in other.totals: + totals[fname] += other.totals[fname] + + +class OnDiskLengths(ByteLengthsBase): + def __init__(self, dbfile, doccount=None): + ByteLengthsBase.__init__(self) + self.dbfile = dbfile + self._read_header(dbfile, doccount) + + def doc_field_length(self, docnum, fieldname, default=0): + try: + 
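            # starts[fieldname] holds the absolute file offset of this
+            # field's per-document length bytes (see _read_header). +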
start = self.starts[fieldname] + except KeyError: + return default + return byte_to_length(self.dbfile.get_byte(start + docnum)) + + def close(self): + self.dbfile.close() + + +# Stored fields + +_stored_pointer_struct = Struct("!qI") # offset, length +stored_pointer_size = _stored_pointer_struct.size +pack_stored_pointer = _stored_pointer_struct.pack +unpack_stored_pointer = _stored_pointer_struct.unpack + + +class StoredFieldWriter(object): + def __init__(self, dbfile): + self.dbfile = dbfile + self.length = 0 + self.directory = [] + + self.dbfile.write_long(0) + self.dbfile.write_uint(0) + + self.names = [] + self.name_map = {} + + def add(self, vdict): + f = self.dbfile + names = self.names + name_map = self.name_map + + vlist = [None] * len(names) + for k, v in iteritems(vdict): + if k in name_map: + vlist[name_map[k]] = v + else: + name_map[k] = len(names) + names.append(k) + vlist.append(v) + + vstring = dumps(tuple(vlist), -1)[2:-1] + self.length += 1 + self.directory.append(pack_stored_pointer(f.tell(), len(vstring))) + f.write(vstring) + + def add_reader(self, sfreader): + add = self.add + for vdict in sfreader: + add(vdict) + + def close(self): + f = self.dbfile + dirpos = f.tell() + f.write_pickle(self.names) + for pair in self.directory: + f.write(pair) + f.flush() + f.seek(0) + f.write_long(dirpos) + f.write_uint(self.length) + f.close() + + +class StoredFieldReader(object): + def __init__(self, dbfile): + self.dbfile = dbfile + + dbfile.seek(0) + dirpos = dbfile.read_long() + self.length = dbfile.read_uint() + self.basepos = dbfile.tell() + + dbfile.seek(dirpos) + + nameobj = dbfile.read_pickle() + if isinstance(nameobj, dict): + # Previous versions stored the list of names as a map of names to + # positions... it seemed to make sense at the time... + self.names = [None] * len(nameobj) + for name, pos in iteritems(nameobj): + self.names[pos] = name + else: + self.names = nameobj + self.directory_offset = dbfile.tell() + + def close(self): + self.dbfile.close() + + def __iter__(self): + dbfile = self.dbfile + names = self.names + lengths = array("I") + + dbfile.seek(self.directory_offset) + for i in range(self.length): + dbfile.seek(_LONG_SIZE, 1) + lengths.append(dbfile.read_uint()) + + dbfile.seek(self.basepos) + for length in lengths: + vlist = loads(dbfile.read(length) + b(".")) + vdict = dict((names[i], vlist[i]) for i in range(len(vlist)) + if vlist[i] is not None) + yield vdict + + def __getitem__(self, num): + if num > self.length - 1: + raise IndexError("Tried to get document %s, file has %s" + % (num, self.length)) + + dbfile = self.dbfile + start = self.directory_offset + num * stored_pointer_size + dbfile.seek(start) + ptr = dbfile.read(stored_pointer_size) + if len(ptr) != stored_pointer_size: + raise Exception("Error reading %r @%s %s < %s" + % (dbfile, start, len(ptr), stored_pointer_size)) + position, length = unpack_stored_pointer(ptr) + dbfile.seek(position) + vlist = loads(dbfile.read(length) + b(".")) + + names = self.names + # Recreate a dictionary by putting the field names and values back + # together by position. We can't just use dict(zip(...)) because we + # want to filter out the None values. + vdict = dict((names[i], vlist[i]) for i in range(len(vlist)) + if vlist[i] is not None) + return vdict + + +# Segment object + +class W2Segment(base.Segment): + def __init__(self, indexname, doccount=0, segid=None, deleted=None): + """ + :param name: The name of the segment (the Index object computes this + from its name and the generation). 
+        :param doccount: The number of documents in the segment (including
+            deleted documents).
+        :param deleted: A set of deleted document numbers, or None if no
+            deleted documents exist in this segment.
+        """
+
+        assert isinstance(indexname, string_type)
+        self.indexname = indexname
+        assert isinstance(doccount, integer_types)
+        self.doccount = doccount
+        self.segid = self._random_id() if segid is None else segid
+        self.deleted = deleted
+        self.compound = False
+
+    def codec(self, **kwargs):
+        return W2Codec(**kwargs)
+
+    def set_doc_count(self, dc):
+        self.doccount = dc
+
+    def doc_count_all(self):
+        return self.doccount
+
+    def doc_count(self):
+        return self.doccount - self.deleted_count()
+
+    def has_deletions(self):
+        return self.deleted is not None and bool(self.deleted)
+
+    def deleted_count(self):
+        if self.deleted is None:
+            return 0
+        return len(self.deleted)
+
+    def delete_document(self, docnum, delete=True):
+        if delete:
+            if self.deleted is None:
+                self.deleted = set()
+            self.deleted.add(docnum)
+        elif self.deleted is not None and docnum in self.deleted:
+            # set.clear() takes no arguments; discard() removes one member
+            self.deleted.discard(docnum)
+
+    def is_deleted(self, docnum):
+        if self.deleted is None:
+            return False
+        return docnum in self.deleted
+
+    def deleted_docs(self):
+        if self.deleted is None:
+            return ()
+        else:
+            return iter(self.deleted)
+
+
+# Posting blocks
+
+class W2Block(object):
+    magic = b("Blk3")
+
+    infokeys = ("count", "maxid", "maxweight", "minlength", "maxlength",
+                "idcode", "compression", "idslen", "weightslen")
+
+    def __init__(self, postingsize, stringids=False):
+        self.postingsize = postingsize
+        self.stringids = stringids
+        self.ids = [] if stringids else array("I")
+        self.weights = array("f")
+        self.values = None
+
+        self.minlength = None
+        self.maxlength = 0
+        self.maxweight = 0
+
+    def __len__(self):
+        return len(self.ids)
+
+    def __nonzero__(self):
+        return bool(self.ids)
+
+    def min_id(self):
+        if self.ids:
+            return self.ids[0]
+        else:
+            raise IndexError
+
+    def max_id(self):
+        if self.ids:
+            return self.ids[-1]
+        else:
+            raise IndexError
+
+    def min_length(self):
+        return self.minlength
+
+    def max_length(self):
+        return self.maxlength
+
+    def max_weight(self):
+        return self.maxweight
+
+    def add(self, id_, weight, valuestring, length=None):
+        self.ids.append(id_)
+        self.weights.append(weight)
+        if weight > self.maxweight:
+            self.maxweight = weight
+        if valuestring:
+            if self.values is None:
+                self.values = []
+            self.values.append(valuestring)
+        if length:
+            if self.minlength is None or length < self.minlength:
+                self.minlength = length
+            if length > self.maxlength:
+                self.maxlength = length
+
+    def to_file(self, postfile, compression=3):
+        ids = self.ids
+        idcode, idstring = minimize_ids(ids, self.stringids, compression)
+        wtstring = minimize_weights(self.weights, compression)
+        vstring = minimize_values(self.postingsize, self.values, compression)
+
+        info = (len(ids), ids[-1], self.maxweight,
+                length_to_byte(self.minlength), length_to_byte(self.maxlength),
+                idcode, compression, len(idstring), len(wtstring))
+        infostring = dumps(info, -1)
+
+        # Offset to next block
+        postfile.write_uint(len(infostring) + len(idstring) + len(wtstring)
+                            + len(vstring))
+        # Block contents
+        postfile.write(infostring)
+        postfile.write(idstring)
+        postfile.write(wtstring)
+        postfile.write(vstring)
+
+    @classmethod
+    def from_file(cls, postfile, postingsize, stringids=False):
+        block = cls(postingsize, stringids=stringids)
+        block.postfile = postfile
+
+        delta = postfile.read_uint()
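+        # The uint just read is the byte size of the block's pickled info
+        # plus its encoded ids/weights/values (written by to_file above), so
+        # tell() plus this delta gives the file offset of the next block. +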
block.nextoffset = postfile.tell() + delta + info = postfile.read_pickle() + block.dataoffset = postfile.tell() + + for key, value in zip(cls.infokeys, info): + if key in ("minlength", "maxlength"): + value = byte_to_length(value) + setattr(block, key, value) + + return block + + def read_ids(self): + offset = self.dataoffset + self.postfile.seek(offset) + idstring = self.postfile.read(self.idslen) + ids = deminimize_ids(self.idcode, self.count, idstring, + self.compression) + self.ids = ids + return ids + + def read_weights(self): + if self.weightslen == 0: + weights = [1.0] * self.count + else: + offset = self.dataoffset + self.idslen + self.postfile.seek(offset) + wtstring = self.postfile.read(self.weightslen) + weights = deminimize_weights(self.count, wtstring, + self.compression) + self.weights = weights + return weights + + def read_values(self): + postingsize = self.postingsize + if postingsize == 0: + values = [None] * self.count + else: + offset = self.dataoffset + self.idslen + self.weightslen + self.postfile.seek(offset) + vstring = self.postfile.read(self.nextoffset - offset) + values = deminimize_values(postingsize, self.count, vstring, + self.compression) + self.values = values + return values + + +# File TermInfo + +NO_ID = 0xffffffff + + +class FileTermInfo(TermInfo): + # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID + struct = Struct("!fIBBffII") + + def __init__(self, *args, **kwargs): + self.postings = None + if "postings" in kwargs: + self.postings = kwargs["postings"] + del kwargs["postings"] + TermInfo.__init__(self, *args, **kwargs) + + # filedb specific methods + + def add_block(self, block): + self._weight += sum(block.weights) + self._df += len(block) + + ml = block.min_length() + if self._minlength is None: + self._minlength = ml + else: + self._minlength = min(self._minlength, ml) + + self._maxlength = max(self._maxlength, block.max_length()) + self._maxweight = max(self._maxweight, block.max_weight()) + if self._minid is None: + self._minid = block.ids[0] + self._maxid = block.ids[-1] + + def to_string(self): + # Encode the lengths as 0-255 values + ml = 0 if self._minlength is None else length_to_byte(self._minlength) + xl = length_to_byte(self._maxlength) + # Convert None values to the out-of-band NO_ID constant so they can be + # stored as unsigned ints + mid = NO_ID if self._minid is None else self._minid + xid = NO_ID if self._maxid is None else self._maxid + + # Pack the term info into bytes + st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight, + 0, mid, xid) + + if isinstance(self.postings, tuple): + # Postings are inlined - dump them using the pickle protocol + isinlined = 1 + st += dumps(self.postings, -1)[2:-1] + else: + # Append postings pointer as long to end of term info bytes + isinlined = 0 + # It's possible for a term info to not have a pointer to postings + # on disk, in which case postings will be None. Convert a None + # value to -1 so it can be stored as a long. 
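+            # (from_string() reverses this: when the leading flag byte is 0,
+            # the trailing bytes are unpacked as this long postings offset.)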
+ p = -1 if self.postings is None else self.postings + st += pack_long(p) + + # Prepend byte indicating whether the postings are inlined to the term + # info bytes + return pack_byte(isinlined) + st + + @classmethod + def from_string(cls, s): + assert isinstance(s, bytes_type) + + if isinstance(s, string_type): + hbyte = ord(s[0]) # Python 2.x - str + else: + hbyte = s[0] # Python 3 - bytes + + if hbyte < 2: + st = cls.struct + # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID + w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1]) + mid = None if mid == NO_ID else mid + xid = None if xid == NO_ID else xid + # Postings + pstr = s[st.size + 1:] + if hbyte == 0: + p = unpack_long(pstr)[0] + else: + p = loads(pstr + b(".")) + else: + # Old format was encoded as a variable length pickled tuple + v = loads(s + b(".")) + if len(v) == 1: + w = df = 1 + p = v[0] + elif len(v) == 2: + w = df = v[1] + p = v[0] + else: + w, p, df = v + # Fake values for stats which weren't stored before + ml = 1 + xl = 255 + xw = 999999999 + mid = -1 + xid = -1 + + ml = byte_to_length(ml) + xl = byte_to_length(xl) + obj = cls(w, df, ml, xl, xw, mid, xid) + obj.postings = p + return obj + + @classmethod + def read_weight(cls, dbfile, datapos): + return dbfile.get_float(datapos + 1) + + @classmethod + def read_doc_freq(cls, dbfile, datapos): + return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) + + @classmethod + def read_min_and_max_length(cls, dbfile, datapos): + lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + ml = byte_to_length(dbfile.get_byte(lenpos)) + xl = byte_to_length(dbfile.get_byte(lenpos + 1)) + return ml, xl + + @classmethod + def read_max_weight(cls, dbfile, datapos): + weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 + return dbfile.get_float(weightspos) + + +# Utility functions + +def minimize_ids(arry, stringids, compression=0): + amax = arry[-1] + + if stringids: + typecode = '' + string = dumps(arry) + else: + typecode = arry.typecode + if amax <= 255: + typecode = "B" + elif amax <= 65535: + typecode = "H" + + if typecode != arry.typecode: + arry = array(typecode, iter(arry)) + if not IS_LITTLE: + arry.byteswap() + string = array_tobytes(arry) + if compression: + string = zlib.compress(string, compression) + return (typecode, string) + + +def deminimize_ids(typecode, count, string, compression=0): + if compression: + string = zlib.decompress(string) + if typecode == '': + return loads(string) + else: + arry = array(typecode) + array_frombytes(arry, string) + if not IS_LITTLE: + arry.byteswap() + return arry + + +def minimize_weights(weights, compression=0): + if all(w == 1.0 for w in weights): + string = b("") + else: + if not IS_LITTLE: + weights.byteswap() + string = array_tobytes(weights) + if string and compression: + string = zlib.compress(string, compression) + return string + + +def deminimize_weights(count, string, compression=0): + if not string: + return array("f", (1.0 for _ in range(count))) + if compression: + string = zlib.decompress(string) + arry = array("f") + array_frombytes(arry, string) + if not IS_LITTLE: + arry.byteswap() + return arry + + +def minimize_values(postingsize, values, compression=0): + if postingsize < 0: + string = dumps(values, -1)[2:] + elif postingsize == 0: + string = b('') + else: + string = b('').join(values) + if string and compression: + string = zlib.compress(string, compression) + return string + + +def deminimize_values(postingsize, count, string, compression=0): + if compression: + string = zlib.decompress(string) + + if 
postingsize < 0: + return loads(string) + elif postingsize == 0: + return [None] * count + else: + return [string[i:i + postingsize] for i + in range(0, len(string), postingsize)] + + +# Legacy field types + +from whoosh.compat import long_type +from whoosh.fields import NUMERIC + + +class OLD_NUMERIC(NUMERIC): + NUMERIC_DEFAULTS = {"b": 2 ** 7 - 1, "B": 2 ** 8 - 1, "h": 2 ** 15 - 1, + "H": 2 ** 16 - 1, "i": 2 ** 31 - 1, "I": 2 ** 32 - 1, + "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1, "f": NaN, + "d": NaN, + } + + def __init__(self, type=int, stored=False, unique=False, field_boost=1.0, + decimal_places=0, shift_step=4, signed=True): + from whoosh import analysis, formats + + self.type = type + if self.type is long_type: + # This will catch the Python 3 int type + self._to_text = self._long_to_text + self._from_text = self._text_to_long + self.sortable_typecode = "q" if signed else "Q" + elif self.type is int: + self._to_text = self._int_to_text + self._from_text = self._text_to_int + self.sortable_typecode = "i" if signed else "I" + elif self.type is float: + self._to_text = self._float_to_text + self._from_text = self._text_to_float + self.sortable_typecode = "f" + elif self.type is Decimal: + raise TypeError("To store Decimal instances, set type to int or " + "float and use the decimal_places argument") + else: + raise TypeError("%s field type can't store %r" % (self.__class__, + self.type)) + + self.stored = stored + self.unique = unique + self.decimal_places = decimal_places + self.shift_step = shift_step + self.signed = signed + + self.analyzer = analysis.IDAnalyzer() + self.format = formats.Existence(field_boost=field_boost) + + def __setstate__(self, d): + self.__dict__.update(d) + self.numtype = d["type"] + self.bits = 32 if (d["type"] is int and not PY3) else 64 + + def prepare_number(self, x): + if x is None or x == emptybytes: + return x + if self.decimal_places: + x = Decimal(x) + x *= 10 ** self.decimal_places + x = self.type(x) + return x + + def unprepare_number(self, x): + dc = self.decimal_places + if dc: + s = str(x) + x = Decimal(s[:-dc] + "." 
+ s[-dc:]) + return x + + def to_bytes(self, x, shift=0): + if isinstance(x, bytes_type): + return x + return utf8encode(self.to_text(x, shift))[0] + + def from_bytes(self, bs): + return self.from_text(utf8decode(bs)[0]) + + def sortable_to_bytes(self, x, shift=0): + if shift: + x >>= shift + return pack_byte(shift) + self._to_text() + + def to_text(self, x, shift=0): + x = self.prepare_number(x) + x = self._to_text(x, shift=shift, signed=self.signed) + return x + + def from_text(self, t): + x = self._from_text(t, signed=self.signed) + return self.unprepare_number(x) + + def process_text(self, text, **kwargs): + return (self.to_text(text),) + + def self_parsing(self): + return True + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + + if qstring == "*": + return query.Every(fieldname, boost=boost) + + try: + text = self.to_text(qstring) + except Exception: + e = sys.exc_info()[1] + return query.error_query(e) + + return query.Term(fieldname, text, boost=boost) + + def parse_range(self, fieldname, start, end, startexcl, endexcl, + boost=1.0): + from whoosh import query + from whoosh.qparser.common import QueryParserError + + try: + if start is not None: + start = self.from_text(self.to_text(start)) + if end is not None: + end = self.from_text(self.to_text(end)) + except Exception: + e = sys.exc_info()[1] + raise QueryParserError(e) + + return query.NumericRange(fieldname, start, end, startexcl, endexcl, + boost=boost) + + def sortable_terms(self, ixreader, fieldname): + for btext in ixreader.lexicon(fieldname): + if btext[0:1] != "\x00": + # Only yield the full-precision values + break + yield btext + + +class OLD_DATETIME(OLD_NUMERIC): + def __init__(self, stored=False, unique=False): + OLD_NUMERIC.__init__(self, type=long_type, stored=stored, + unique=unique, shift_step=8) + + def to_text(self, x, shift=0): + from datetime import datetime + from whoosh.util.times import floor + + try: + if isinstance(x, text_type): + # For indexing, support same strings as for query parsing + x = self._parse_datestring(x) + x = floor(x) # this makes most sense (unspecified = lowest) + if isinstance(x, datetime): + x = datetime_to_long(x) + elif not isinstance(x, integer_types): + raise TypeError() + except Exception: + raise ValueError("DATETIME.to_text can't convert from %r" % (x,)) + + x = OLD_NUMERIC.to_text(self, x, shift=shift) + return x + + def from_text(self, x): + x = OLD_NUMERIC.from_text(self, x) + return long_to_datetime(x) + + def _parse_datestring(self, qstring): + # This method parses a very simple datetime representation of the form + # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] + from whoosh.util.times import adatetime, fix, is_void + + qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") + year = month = day = hour = minute = second = microsecond = None + if len(qstring) >= 4: + year = int(qstring[:4]) + if len(qstring) >= 6: + month = int(qstring[4:6]) + if len(qstring) >= 8: + day = int(qstring[6:8]) + if len(qstring) >= 10: + hour = int(qstring[8:10]) + if len(qstring) >= 12: + minute = int(qstring[10:12]) + if len(qstring) >= 14: + second = int(qstring[12:14]) + if len(qstring) == 20: + microsecond = int(qstring[14:]) + + at = fix(adatetime(year, month, day, hour, minute, second, + microsecond)) + if is_void(at): + raise Exception("%r is not a parseable date" % qstring) + return at + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + from whoosh.util.times import is_ambiguous + + try: + at = 
self._parse_datestring(qstring) + except Exception: + e = sys.exc_info()[1] + return query.error_query(e) + + if is_ambiguous(at): + startnum = datetime_to_long(at.floor()) + endnum = datetime_to_long(at.ceil()) + return query.NumericRange(fieldname, startnum, endnum) + else: + return query.Term(fieldname, self.to_text(at), boost=boost) + + def parse_range(self, fieldname, start, end, startexcl, endexcl, + boost=1.0): + from whoosh import query + + if start is None and end is None: + return query.Every(fieldname, boost=boost) + + if start is not None: + startdt = self._parse_datestring(start).floor() + start = datetime_to_long(startdt) + + if end is not None: + enddt = self._parse_datestring(end).ceil() + end = datetime_to_long(enddt) + + return query.NumericRange(fieldname, start, end, boost=boost) + + +# Functions for converting numbers to and from text + +def int_to_text(x, shift=0, signed=True): + x = to_sortable(int, 32, signed, x) + return sortable_int_to_text(x, shift) + + +def text_to_int(text, signed=True): + x = text_to_sortable_int(text) + x = from_sortable(int, 32, signed, x) + return x + + +def long_to_text(x, shift=0, signed=True): + x = to_sortable(long_type, 64, signed, x) + return sortable_long_to_text(x, shift) + + +def text_to_long(text, signed=True): + x = text_to_sortable_long(text) + x = from_sortable(long_type, 64, signed, x) + return x + + +def float_to_text(x, shift=0, signed=True): + x = to_sortable(float, 32, signed, x) + return sortable_long_to_text(x, shift) + + +def text_to_float(text, signed=True): + x = text_to_sortable_long(text) + x = from_sortable(float, 32, signed, x) + return x + + +# Functions for converting sortable representations to and from text. + +from whoosh.support.base85 import to_base85, from_base85 + + +def sortable_int_to_text(x, shift=0): + if shift: + x >>= shift + #text = chr(shift) + u"%08x" % x + text = chr(shift) + to_base85(x, False) + return text + + +def sortable_long_to_text(x, shift=0): + if shift: + x >>= shift + #text = chr(shift) + u"%016x" % x + #assert len(text) == 17 + text = chr(shift) + to_base85(x, True) + return text + + +def text_to_sortable_int(text): + #assert len(text) == 9 + #return int(text[1:], 16) + return from_base85(text[1:]) + + +def text_to_sortable_long(text): + #assert len(text) == 17 + #return long(text[1:], 16) + return from_base85(text[1:]) diff --git a/src/whoosh_reloaded/codec/whoosh_reloaded3.py b/src/whoosh/codec/whoosh3.py similarity index 97% rename from src/whoosh_reloaded/codec/whoosh_reloaded3.py rename to src/whoosh/codec/whoosh3.py index e92f882e..ca846af3 100644 --- a/src/whoosh_reloaded/codec/whoosh_reloaded3.py +++ b/src/whoosh/codec/whoosh3.py @@ -33,19 +33,19 @@ from array import array from collections import defaultdict -from whoosh_reloaded import columns, formats -from whoosh_reloaded.compat import b, bytes_type, string_type, integer_types -from whoosh_reloaded.compat import dumps, loads, iteritems, xrange -from whoosh_reloaded.codec import base -from whoosh_reloaded.filedb import compound, filetables -from whoosh_reloaded.matching import ListMatcher, ReadTooFar, LeafMatcher -from whoosh_reloaded.reading import TermInfo, TermNotFound -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE -from whoosh_reloaded.system import pack_ushort, unpack_ushort -from whoosh_reloaded.system import pack_int, unpack_int, pack_long, unpack_long -from whoosh_reloaded.util.numlists import delta_encode, delta_decode -from 
whoosh_reloaded.util.numeric import length_to_byte, byte_to_length +from whoosh import columns, formats +from whoosh.compat import b, bytes_type, string_type, integer_types +from whoosh.compat import dumps, loads, iteritems, range +from whoosh.codec import base +from whoosh.filedb import compound, filetables +from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.reading import TermInfo, TermNotFound +from whoosh.system import emptybytes +from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE +from whoosh.system import pack_ushort, unpack_ushort +from whoosh.system import pack_int, unpack_int, pack_long, unpack_long +from whoosh.util.numlists import delta_encode, delta_decode +from whoosh.util.numeric import length_to_byte, byte_to_length try: import zlib @@ -896,7 +896,7 @@ def max_weight(self): class W3LeafMatcher(LeafMatcher): """Reads on-disk postings from the postings file and presents the - :class:`whoosh_reloaded.matching.Matcher` interface. + :class:`whoosh.matching.Matcher` interface. """ def __init__( @@ -1140,9 +1140,9 @@ def _read_weights(self): # De-minify the weights postcount = self._blocklength if weights is None: - self._weights = array("f", (1.0 for _ in xrange(postcount))) + self._weights = array("f", (1.0 for _ in range(postcount))) elif isinstance(weights, float): - self._weights = array("f", (weights for _ in xrange(postcount))) + self._weights = array("f", (weights for _ in range(postcount))) else: self._weights = weights @@ -1161,7 +1161,7 @@ def _read_values(self): else: assert isinstance(vs, bytes_type) self._values = tuple( - vs[i : i + fixedsize] for i in xrange(0, len(vs), fixedsize) + vs[i : i + fixedsize] for i in range(0, len(vs), fixedsize) ) diff --git a/src/whoosh_reloaded/collectors.py b/src/whoosh/collectors.py similarity index 95% rename from src/whoosh_reloaded/collectors.py rename to src/whoosh/collectors.py index 0815c019..50b71c1e 100644 --- a/src/whoosh_reloaded/collectors.py +++ b/src/whoosh/collectors.py @@ -27,27 +27,27 @@ """ This module contains "collector" objects. Collectors provide a way to gather -"raw" results from a :class:` whoosh_reloaded.matching.Matcher` object, implement +"raw" results from a :class:` whoosh.matching.Matcher` object, implement sorting, filtering, collation, etc., and produce a -:class:` whoosh_reloaded.searching.Results` object. +:class:` whoosh.searching.Results` object. The basic collectors are: TopCollector Returns the top N matching results sorted by score, using block-quality optimizations to skip blocks of documents that can't contribute to the top - N. The :meth:` whoosh_reloaded.searching.Searcher.search` method uses this type of + N. The :meth:` whoosh.searching.Searcher.search` method uses this type of collector by default or when you specify a ``limit``. UnlimitedCollector Returns all matching results sorted by score. The - :meth:` whoosh_reloaded.searching.Searcher.search` method uses this type of collector + :meth:` whoosh.searching.Searcher.search` method uses this type of collector when you specify ``limit=None`` or you specify a limit equal to or greater than the number of documents in the searcher. SortingCollector - Returns all matching results sorted by a :class:` whoosh_reloaded.sorting.Facet` - object. The :meth:` whoosh_reloaded.searching.Searcher.search` method uses this type + Returns all matching results sorted by a :class:` whoosh.sorting.Facet` + object. 
The :meth:` whoosh.searching.Searcher.search` method uses this type of collector when you use the ``sortedby`` parameter. Here's an example of a simple collector that instead of remembering the matched @@ -68,7 +68,7 @@ def collect(self, sub_docnum): print(c.count) There are also several wrapping collectors that extend or modify the -functionality of other collectors. The meth:` whoosh_reloaded.searching.Searcher.search` +functionality of other collectors. The :meth:` whoosh.searching.Searcher.search` method uses many of these when you specify various parameters. NOTE: collectors are not designed to be reentrant or thread-safe. It is @@ -82,10 +82,10 @@ def collect(self, sub_docnum): from collections import defaultdict from heapq import heapify, heappush, heapreplace -from whoosh_reloaded import sorting -from whoosh_reloaded.compat import abstractmethod, iteritems, itervalues, xrange -from whoosh_reloaded.searching import Results, TimeLimit -from whoosh_reloaded.util import now +from whoosh import sorting +from whoosh.compat import abstractmethod, iteritems, itervalues, range +from whoosh.searching import Results, TimeLimit +from whoosh.util import now # Functions @@ -122,10 +122,10 @@ def prepare(self, top_searcher, q, context): to use faster methods that don't necessarily keep the matcher updated, such as ``matcher.all_ids()``. - :param top_searcher: the top-level :class:` whoosh_reloaded.searching.Searcher` + :param top_searcher: the top-level :class:` whoosh.searching.Searcher` object. - :param q: the :class:` whoosh_reloaded.query.Query` object being searched for. - :param context: a :class:` whoosh_reloaded.searching.SearchContext` object + :param q: the :class:` whoosh.query.Query` object being searched for. + :param context: a :class:` whoosh.searching.SearchContext` object containing information about the search. """ @@ -163,7 +163,7 @@ def set_subsearcher(self, subsearcher, offset): :meth:`Collector.collect` to get the top-level document number for use in results. self.matcher - A :class:` whoosh_reloaded.matching.Matcher` object representing the matches + A :class:` whoosh.matching.Matcher` object representing the matches for the query in the current sub-searcher. """ @@ -260,7 +260,7 @@ def remove(self, global_docnum): """ items = self.items - for i in xrange(len(items)): + for i in range(len(items)): if items[i][1] == global_docnum: items.pop(i) return @@ -307,7 +307,7 @@ @abstractmethod def results(self): - """Returns a :class:`~ whoosh_reloaded.searching.Results` object containing the + """Returns a :class:`~ whoosh.searching.Results` object containing the results of the search. Subclasses must implement this method """ @@ -489,7 +489,7 @@ def remove(self, global_docnum): # Remove the document if it's on the list (it may not be since # TopCollector forgets documents that don't make the top N list) - for i in xrange(len(items)): + for i in range(len(items)): if items[i][1] == negated: items.pop(i) # Restore the heap invariant @@ -536,7 +536,7 @@ def results(self): class SortingCollector(Collector): """A collector that returns results sorted by a given - :class:` whoosh_reloaded.sorting.Facet` object. See :doc:`/facets` for more + :class:` whoosh.sorting.Facet` object. See :doc:`/facets` for more information. """ @@ -764,12 +764,12 @@ def results(self): class FacetCollector(WrappingCollector): """A collector that creates groups of documents based on - :class:` whoosh_reloaded.sorting.Facet` objects. See :doc:`/facets` for more + :class:` whoosh.sorting.Facet` objects. See :doc:`/facets` for more information. This collector is used if you specify a ``groupedby`` parameter in the - :meth:` whoosh_reloaded.searching.Searcher.search` method. You can use the - :meth:` whoosh_reloaded.searching.Results.groups` method to access the facet groups. + :meth:` whoosh.searching.Searcher.search` method. You can use the + :meth:` whoosh.searching.Results.groups` method to access the facet groups. If you have a reference to the collector, you can also use ``FacetCollector.facetmaps`` to access the groups directly:: @@ -783,7 +783,7 @@ class FacetCollector(WrappingCollector): def __init__(self, child, groupedby, maptype=None): """ :param groupedby: see :doc:`/facets`. - :param maptype: a :class:` whoosh_reloaded.sorting.FacetMap` type to use for any + :param maptype: a :class:` whoosh.sorting.FacetMap` type to use for any facets that don't specify their own. """ @@ -876,11 +876,11 @@ class CollapseCollector(WrappingCollector): def __init__(self, child, keyfacet, limit=1, order=None): """ :param child: the collector to wrap. - :param keyfacet: a :class:` whoosh_reloaded.sorting.Facet` to use for collapsing. + :param keyfacet: a :class:` whoosh.sorting.Facet` to use for collapsing. All but the top N documents that share a key will be eliminated from the results. :param limit: the maximum number of documents to keep for each key. - :param order: an optional :class:` whoosh_reloaded.sorting.Facet` to use + :param order: an optional :class:` whoosh.sorting.Facet` to use to determine the "top" document(s) to keep when collapsing. The default (``order=None``) uses the results order (e.g. the highest score in a scored search). @@ -1116,7 +1116,7 @@ class TermsCollector(WrappingCollector): in each matched document. This collector is used if you specify ``terms=True`` in the - :meth:` whoosh_reloaded.searching.Searcher.search` method. + :meth:` whoosh.searching.Searcher.search` method. If you have a reference to the collector, you can also use ``TermsCollector.termslist`` to access the term lists directly:: diff --git a/src/whoosh_reloaded/columns.py b/src/whoosh/columns.py similarity index 96% rename from src/whoosh_reloaded/columns.py rename to src/whoosh/columns.py index 51b677b8..a711ef72 100644 --- a/src/whoosh_reloaded/columns.py +++ b/src/whoosh/columns.py @@ -56,15 +56,15 @@ except ImportError: zlib = None -from whoosh_reloaded.compat import b, bytes_type, BytesIO -from whoosh_reloaded.compat import array_tobytes, xrange -from whoosh_reloaded.compat import dumps, loads -from whoosh_reloaded.filedb.structfile import StructFile -from whoosh_reloaded.idsets import BitSet, OnDiskBitSet -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.util.numeric import typecode_max, typecode_min -from whoosh_reloaded.util.numlists import GrowableArray -from whoosh_reloaded.util.varints import varint, read_varint +from whoosh.compat import b, bytes_type, BytesIO +from whoosh.compat import array_tobytes, range +from whoosh.compat import dumps, loads +from whoosh.filedb.structfile import StructFile +from whoosh.idsets import BitSet, OnDiskBitSet +from whoosh.system import emptybytes +from whoosh.util.numeric import typecode_max, typecode_min +from whoosh.util.numlists import GrowableArray +from whoosh.util.varints import varint, read_varint # Base classes @@ -84,7 +84,7 @@ def writer(self, dbfile): """Returns a :class:`ColumnWriter` object you can use to create a column of this type on disk. 
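        For illustration, a rough sketch of the writer protocol
        (``VarBytesColumn`` is one concrete column type; the ``finish``
        call is an assumption based on the surrounding writer code, not
        something this hunk shows)::

            col = VarBytesColumn()
            w = col.writer(dbfile)  # a ColumnWriter
            w.add(0, b"alfa")       # docnum, encoded value
            w.add(1, b"bravo")
            w.finish(2)             # total document count
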
- :param dbfile: the :class:`~ whoosh_reloaded.filedb.structfile.StructFile` to + :param dbfile: the :class:`~ whoosh.filedb.structfile.StructFile` to write to. """ @@ -94,7 +94,7 @@ def reader(self, dbfile, basepos, length, doccount): """Returns a :class:`ColumnReader` object you can use to read a column of this type from disk. - :param dbfile: the :class:`~ whoosh_reloaded.filedb.structfile.StructFile` to + :param dbfile: the :class:`~ whoosh.filedb.structfile.StructFile` to read from. :param basepos: the offset within the file at which the column starts. :param length: the length in bytes of the column occupies in the file. @@ -125,7 +125,7 @@ def fill(self, docnum): write = self._dbfile.write default = self._defaultbytes if docnum > self._count: - for _ in xrange(docnum - self._count): + for _ in range(docnum - self._count): write(default) def add(self, docnum, value): @@ -152,7 +152,7 @@ def sort_key(self, docnum): return self[docnum] def __iter__(self): - for i in xrange(self._doccount): + for i in range(self._doccount): yield self[i] def load(self): @@ -210,8 +210,8 @@ def __repr__(self): def fill(self, docnum): base = self._offset_base if docnum > self._count: - self._lengths.extend(0 for _ in xrange(docnum - self._count)) - self._offsets.extend(base for _ in xrange(docnum - self._count)) + self._lengths.extend(0 for _ in range(docnum - self._count)) + self._offsets.extend(base for _ in range(docnum - self._count)) def add(self, docnum, v): self.fill(docnum) @@ -380,7 +380,7 @@ def __getitem__(self, docnum): def __iter__(self): count = self._count default = self._default - for i in xrange(self._doccount): + for i in range(self._doccount): if i < count: yield self[i] else: @@ -459,10 +459,10 @@ def __repr__(self): def fill(self, docnum): if docnum > self._count: if self._refs is not None: - self._refs.extend(0 for _ in xrange(docnum - self._count)) + self._refs.extend(0 for _ in range(docnum - self._count)) else: dbfile = self._dbfile - for _ in xrange(docnum - self._count): + for _ in range(docnum - self._count): dbfile.write_ushort(0) def add(self, docnum, v): @@ -546,7 +546,7 @@ def _read_uniques(self): ucount = dbfile.read_varint() length = fixedlen uniques = [] - for _ in xrange(ucount): + for _ in range(ucount): if not fixedlen: length = dbfile.read_varint() uniques.append(dbfile.read(length)) @@ -564,7 +564,7 @@ def __iter__(self): unpack = self._unpack itemsize = self._itemsize - for i in xrange(self._doccount): + for i in range(self._doccount): pos = basepos + i * itemsize ref = unpack(get(pos, itemsize))[0] yield uniques[ref] @@ -743,12 +743,12 @@ def __iter__(self): i = 0 for num in self._bitset: if num > i: - for _ in xrange(num - i): + for _ in range(num - i): yield False yield True i = num + 1 if self._doccount > i: - for _ in xrange(self._doccount - i): + for _ in range(self._doccount - i): yield False def load(self): @@ -940,17 +940,17 @@ def __iter__(self): startdoc = block[0] enddoc = block[1] if startdoc > (last + 1): - for _ in xrange(startdoc - last): + for _ in range(startdoc - last): yield emptybytes values = self._get_block(i) - for docnum in xrange(startdoc, enddoc + 1): + for docnum in range(startdoc, enddoc + 1): if docnum in values: yield values[docnum] else: yield emptybytes last = enddoc if enddoc < self._doccount - 1: - for _ in xrange(self._doccount - enddoc): + for _ in range(self._doccount - enddoc): yield emptybytes @@ -1022,7 +1022,7 @@ def __getitem__(self, docnum): return self._default def __iter__(self): - return (self._default for _ in 
xrange(self._doccount)) + return (self._default for _ in range(self._doccount)) def load(self): return self @@ -1237,7 +1237,7 @@ def sort_key(self, docnum): return self[docnum][0] def __iter__(self): - for docnum in xrange(len(self)): + for docnum in range(len(self)): yield self[docnum] @@ -1262,7 +1262,7 @@ def __getitem__(self, docnum): bio = BytesIO(data) count = read_varint(bio.read) out = [] - for _ in xrange(count): + for _ in range(count): vlen = read_varint(bio.read) v = bio.read(vlen) out.append(v) @@ -1305,7 +1305,7 @@ def __getitem__(self, docnum): v = self._child[docnum] if not v: return [] - ls = [v[i : i + fixedlen] for i in xrange(0, len(v), fixedlen)] + ls = [v[i : i + fixedlen] for i in range(0, len(v), fixedlen)] return ls @@ -1346,7 +1346,7 @@ def __getitem__(self, docnum): # # def fill(self, docnum): # if docnum > self._count: -# self._lengths.extend(0 for _ in xrange(docnum - self._count)) +# self._lengths.extend(0 for _ in range(docnum - self._count)) # # def add(self, docnum, ls): # uniques = self._uniques diff --git a/src/whoosh_reloaded/compat.py b/src/whoosh/compat.py similarity index 97% rename from src/whoosh_reloaded/compat.py rename to src/whoosh/compat.py index 4a67aa44..c2b602eb 100644 --- a/src/whoosh_reloaded/compat.py +++ b/src/whoosh/compat.py @@ -1,5 +1,5 @@ -import array, sys - +import array +import sys # Run time aliasing of Python2/3 differences @@ -35,13 +35,14 @@ def b(s): long_type = long next = lambda o: o.next() # import cPickle as pickle - from cPickle import dumps, loads, dump, load + from cPickle import dump, dumps, load, loads string_type = basestring text_type = unicode bytes_type = str unichr = unichr from urllib import urlretrieve + import Queue as queue def byte(num): @@ -56,7 +57,7 @@ class _WhooshBase(base): return _WhooshBase - xrange = xrange + # range = range zip_ = zip def memoryview_(source, offset=None, length=None): @@ -85,15 +86,15 @@ def b(s): long_type = int next = next import pickle - from pickle import dumps, loads, dump, load + from pickle import dump, dumps, load, loads StringIO = io.StringIO string_type = str text_type = str bytes_type = bytes unichr = chr - from urllib.request import urlretrieve import queue + from urllib.request import urlretrieve def byte(num): return bytes((num,)) @@ -112,7 +113,7 @@ def with_metaclass(meta, base=object): ) return ns["_WhooshBase"] - xrange = range + range = range zip_ = lambda *args: list(zip(*args)) def memoryview_(source, offset=None, length=None): diff --git a/src/whoosh_reloaded/externalsort.py b/src/whoosh/externalsort.py similarity index 98% rename from src/whoosh_reloaded/externalsort.py rename to src/whoosh/externalsort.py index 990f2e3a..2dc299a7 100644 --- a/src/whoosh_reloaded/externalsort.py +++ b/src/whoosh/externalsort.py @@ -34,7 +34,7 @@ import os, tempfile from heapq import heapify, heappop, heapreplace -from whoosh_reloaded.compat import dump, load +from whoosh.compat import dump, load ## Python 3.2 had a bug that make marshal.load unusable @@ -42,8 +42,8 @@ # and platform.python_implementation() == "CPython" # and platform.python_version() == "3.2.0"): # # Use pickle instead of marshal on Python 3.2 -# from whoosh_reloaded.compat import dump as dump_pickle -# from whoosh_reloaded.compat import load +# from whoosh.compat import dump as dump_pickle +# from whoosh.compat import load # # def dump(obj, f): # dump_pickle(obj, f, -1) diff --git a/src/whoosh_reloaded/fields.py b/src/whoosh/fields.py similarity index 96% rename from src/whoosh_reloaded/fields.py rename to 
src/whoosh/fields.py index 28f08edf..2ffa9ea7 100644 --- a/src/whoosh_reloaded/fields.py +++ b/src/whoosh/fields.py @@ -33,16 +33,16 @@ from array import array from decimal import Decimal -from whoosh_reloaded import analysis, columns, formats -from whoosh_reloaded.compat import with_metaclass -from whoosh_reloaded.compat import itervalues, xrange -from whoosh_reloaded.compat import bytes_type, string_type, text_type -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.system import pack_byte -from whoosh_reloaded.util.numeric import to_sortable, from_sortable -from whoosh_reloaded.util.numeric import typecode_max, NaN -from whoosh_reloaded.util.text import utf8encode, utf8decode -from whoosh_reloaded.util.times import datetime_to_long, long_to_datetime +from whoosh import analysis, columns, formats +from whoosh.compat import with_metaclass +from whoosh.compat import itervalues +from whoosh.compat import bytes_type, string_type, text_type +from whoosh.system import emptybytes +from whoosh.system import pack_byte +from whoosh.util.numeric import to_sortable, from_sortable +from whoosh.util.numeric import typecode_max, NaN +from whoosh.util.text import utf8encode, utf8decode +from whoosh.util.times import datetime_to_long, long_to_datetime # Exceptions @@ -696,7 +696,7 @@ def index(self, num, **kwargs): # word, freq, weight, valuestring if self.shift_step: - for shift in xrange(0, self.bits, self.shift_step): + for shift in range(0, self.bits, self.shift_step): yield (self.to_bytes(num, shift), 1, 1.0, emptybytes) else: yield (self.to_bytes(num), 1, 1.0, emptybytes) @@ -775,8 +775,8 @@ def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): - from whoosh_reloaded import query - from whoosh_reloaded.qparser.common import QueryParserError + from whoosh import query + from whoosh.qparser.common import QueryParserError if qstring == "*": return query.Every(fieldname, boost=boost) @@ -788,8 +788,8 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, token, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): - from whoosh_reloaded import query - from whoosh_reloaded.qparser.common import QueryParserError + from whoosh import query + from whoosh.qparser.common import QueryParserError if start is not None: if not self.is_valid(start): @@ -843,7 +843,7 @@ def __init__(self, stored=False, unique=False, sortable=False): ) def prepare_datetime(self, x): - from whoosh_reloaded.util.times import floor + from whoosh.util.times import floor if isinstance(x, text_type): # For indexing, support same strings as for query parsing -- @@ -879,7 +879,7 @@ def from_bytes(self, bs): def _parse_datestring(self, qstring): # This method parses a very simple datetime representation of the form # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] - from whoosh_reloaded.util.times import adatetime, fix, is_void + from whoosh.util.times import adatetime, fix, is_void qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") year = month = day = hour = minute = second = microsecond = None @@ -904,8 +904,8 @@ def _parse_datestring(self, qstring): return at def parse_query(self, fieldname, qstring, boost=1.0): - from whoosh_reloaded import query - from whoosh_reloaded.util.times import is_ambiguous + from whoosh import query + from whoosh.util.times import is_ambiguous try: at = self._parse_datestring(qstring) @@ -921,7 +921,7 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Term(fieldname, 
at, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): - from whoosh_reloaded import query + from whoosh import query if start is None and end is None: return query.Every(fieldname, boost=boost) @@ -998,7 +998,7 @@ def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): - from whoosh_reloaded import query + from whoosh import query if qstring == "*": return query.Every(fieldname, boost=boost) @@ -1129,14 +1129,14 @@ def __init__( ``spelling_prefix`` keyword argument) to allow spelling suggestions to use the unchanged word forms as spelling suggestions. :param sortable: If True, make this field sortable using the default - column type. If you pass a :class:`whoosh_reloaded.columns.Column` instance + column type. If you pass a :class:`whoosh.columns.Column` instance instead of True, the field will use the given column type. :param lang: automatically configure a - :class:`whoosh_reloaded.analysis.LanguageAnalyzer` for the given language. + :class:`whoosh.analysis.LanguageAnalyzer` for the given language. This is ignored if you also specify an ``analyzer``. :param vector: if this value evaluates to true, store a list of the terms in this field in each document. If the value is an instance - of :class:`whoosh_reloaded.formats.Format`, the index will use the object to + of :class:`whoosh.formats.Format`, the index will use the object to store the term vector. Any other true value (e.g. ``vector=True``) will use the field's index format to store the term vector as well. """ @@ -1278,7 +1278,7 @@ def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): - from whoosh_reloaded import query + from whoosh import query terms = [] for g in self.process_text(qstring, mode="query"): @@ -1317,7 +1317,7 @@ def __init__( document. Since this field type generally contains a lot of text, you should avoid storing it with the document unless you need to, for example to allow fast excerpts in the search results. - :param tokenizer: an instance of :class:`whoosh_reloaded.analysis.Tokenizer` + :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer` used to break the text into words. :param at: if 'start', only takes N-grams from the start of the word. If 'end', only takes N-grams from the end. Otherwise the default @@ -1340,7 +1340,6 @@ class ReverseField(FieldWrapper): def __init__(self, subfield, prefix="rev_"): FieldWrapper.__init__(self, subfield, prefix) self.analyzer = subfield.analyzer | analysis.ReverseTextFilter() - self.format = BasicFormat(lengths=False, weights=False) self.scorable = False self.set_sortable(False) @@ -1638,7 +1637,7 @@ class Child2(Parent): ... 
>>> s = MySchema() >>> type(s) - <class 'whoosh_reloaded.fields.MySchema'> + <class 'whoosh.fields.MySchema'> """ @@ -1679,6 +1678,6 @@ def merge_schema(s1, s2): def merge_schemas(schemas): schema = schemas[0] - for i in xrange(1, len(schemas)): + for i in range(1, len(schemas)): schema = merge_schema(schema, schemas[i]) return schema diff --git a/src/whoosh_reloaded/filedb/__init__.py b/src/whoosh/filedb/__init__.py similarity index 100% rename from src/whoosh_reloaded/filedb/__init__.py rename to src/whoosh/filedb/__init__.py diff --git a/src/whoosh_reloaded/filedb/compound.py b/src/whoosh/filedb/compound.py similarity index 97% rename from src/whoosh_reloaded/filedb/compound.py rename to src/whoosh/filedb/compound.py index 7db2b5c8..1e257774 100644 --- a/src/whoosh_reloaded/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -36,11 +36,11 @@ except ImportError: mmap = None -from whoosh_reloaded.compat import BytesIO, memoryview_ -from whoosh_reloaded.filedb.structfile import BufferFile, StructFile -from whoosh_reloaded.filedb.filestore import FileStorage, StorageError -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.util import random_name +from whoosh.compat import BytesIO, memoryview_ +from whoosh.filedb.structfile import BufferFile, StructFile +from whoosh.filedb.filestore import FileStorage, StorageError +from whoosh.system import emptybytes +from whoosh.util import random_name class CompoundStorage(FileStorage): diff --git a/src/whoosh_reloaded/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py similarity index 97% rename from src/whoosh_reloaded/filedb/fileindex.py rename to src/whoosh/filedb/fileindex.py index cfe780c4..7148ee7e 100644 --- a/src/whoosh_reloaded/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -21,18 +21,18 @@ from time import time import pickle -from whoosh_reloaded import __version__ -from whoosh_reloaded.fields import Schema -from whoosh_reloaded.index import ( +from whoosh import __version__ +from whoosh.fields import Schema +from whoosh.index import ( _DEF_INDEX_NAME, EmptyIndexError, Index, IndexVersionError, OutOfDateError, ) -from whoosh_reloaded.index import LockError -from whoosh_reloaded.support.bitvector import BitVector -from whoosh_reloaded.system import _FLOAT_SIZE, _INT_SIZE +from whoosh.index import LockError +from whoosh.support.bitvector import BitVector +from whoosh.system import _FLOAT_SIZE, _INT_SIZE _INDEX_VERSION = -105 @@ -240,7 +240,7 @@ def optimize(self): if len(self.segments) < 2 and not self.segments.has_deletions(): return - from whoosh_reloaded.filedb.filewriting import OPTIMIZE + from whoosh.filedb.filewriting import OPTIMIZE w = self.writer() w.commit(OPTIMIZE) @@ -309,7 +309,7 @@ def reader(self): return self.segments.reader(self.storage, self.schema) def writer(self, **kwargs): - from whoosh_reloaded.filedb.filewriting import SegmentWriter + from whoosh.filedb.filewriting import SegmentWriter return SegmentWriter(self, **kwargs) @@ -438,13 +438,13 @@ def is_deleted(self, docnum): return segment.is_deleted(segdocnum) def reader(self, storage, schema): - from whoosh_reloaded.filedb.filereading import SegmentReader + from whoosh.filedb.filereading import SegmentReader segments = self.segments if len(segments) == 1: return SegmentReader(storage, segments[0], schema) else: - from whoosh_reloaded.reading import MultiReader + from whoosh.reading import MultiReader readers = [SegmentReader(storage, segment, schema) for segment in segments] return MultiReader(readers, schema) diff --git a/src/whoosh_reloaded/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py 
similarity index 97% rename from src/whoosh_reloaded/filedb/filepostings.py rename to src/whoosh/filedb/filepostings.py index 366fb060..3daf19bf 100644 --- a/src/whoosh_reloaded/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -17,12 +17,12 @@ import types from array import array from struct import Struct -from whoosh_reloaded.support import unicode +from whoosh.support import unicode -from whoosh_reloaded.writing import PostingWriter -from whoosh_reloaded.matching import Matcher, ReadTooFar -from whoosh_reloaded.system import _INT_SIZE, _FLOAT_SIZE -from whoosh_reloaded.util import utf8encode, utf8decode, length_to_byte, byte_to_length +from whoosh.writing import PostingWriter +from whoosh.matching import Matcher, ReadTooFar +from whoosh.system import _INT_SIZE, _FLOAT_SIZE +from whoosh.util import utf8encode, utf8decode, length_to_byte, byte_to_length class BlockInfo(object): diff --git a/src/whoosh_reloaded/filedb/filereading.py b/src/whoosh/filedb/filereading.py similarity index 95% rename from src/whoosh_reloaded/filedb/filereading.py rename to src/whoosh/filedb/filereading.py index be815780..fe105d44 100644 --- a/src/whoosh_reloaded/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -17,19 +17,19 @@ from threading import Lock from marshal import loads -from whoosh_reloaded.fields import FieldConfigurationError -from whoosh_reloaded.filedb.filepostings import FilePostingReader -from whoosh_reloaded.filedb.filetables import ( +from whoosh.fields import FieldConfigurationError +from whoosh.filedb.filepostings import FilePostingReader +from whoosh.filedb.filetables import ( FileTableReader, FileListReader, StructHashReader, LengthReader, ) -from whoosh_reloaded.filedb import misc +from whoosh.filedb import misc -# from whoosh_reloaded.postings import Exclude -from whoosh_reloaded.reading import IndexReader, TermNotFound -from whoosh_reloaded.util import protected, byte_to_length +# from whoosh.postings import Exclude +from whoosh.reading import IndexReader, TermNotFound +from whoosh.util import protected, byte_to_length # Reader class diff --git a/src/whoosh_reloaded/filedb/filestore.py b/src/whoosh/filedb/filestore.py similarity index 91% rename from src/whoosh_reloaded/filedb/filestore.py rename to src/whoosh/filedb/filestore.py index ca993e1a..f21eca2a 100644 --- a/src/whoosh_reloaded/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -29,11 +29,11 @@ import errno, os, sys, tempfile from threading import Lock -from whoosh_reloaded.compat import BytesIO, memoryview_ -from whoosh_reloaded.filedb.structfile import BufferFile, StructFile -from whoosh_reloaded.index import _DEF_INDEX_NAME, EmptyIndexError -from whoosh_reloaded.util import random_name -from whoosh_reloaded.util.filelock import FileLock +from whoosh.compat import BytesIO, memoryview_ +from whoosh.filedb.structfile import BufferFile, StructFile +from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError +from whoosh.util import random_name +from whoosh.util.filelock import FileLock # Exceptions @@ -55,7 +55,7 @@ class Storage(object): A storage object is a virtual flat filesystem, allowing the creation and retrieval of file-like objects - (:class:`~whoosh_reloaded.filedb.structfile.StructFile` objects). The default + (:class:`~whoosh.filedb.structfile.StructFile` objects). The default implementation (:class:`FileStorage`) uses actual files in a directory. All access to files in Whoosh goes through this object. 
This allows more @@ -94,7 +94,7 @@ def create(self): a filesystem-based implementation might create a directory, while a database implementation might create tables. For example:: - from whoosh_reloaded.filedb.filestore import FileStorage + from whoosh.filedb.filestore import FileStorage # Create a storage object st = FileStorage("indexdir") # Create any necessary resources @@ -125,58 +125,58 @@ def destroy(self, *args, **kwargs): def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None): """Creates a new index in this storage. - >>> from whoosh_reloaded import fields - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh import fields + >>> from whoosh.filedb.filestore import FileStorage >>> schema = fields.Schema(content=fields.TEXT) >>> # Create the storage directory >>> st = FileStorage.create("indexdir") >>> # Create an index in the storage >>> ix = st.create_index(schema) - :param schema: the :class:`whoosh_reloaded.fields.Schema` object to use for the + :param schema: the :class:`whoosh.fields.Schema` object to use for the new index. :param indexname: the name of the index within the storage object. You can use this option to store multiple indexes in the same storage. :param indexclass: an optional custom ``Index`` sub-class to use to create the index files. The default is - :class:`whoosh_reloaded.index.FileIndex`. This method will call the + :class:`whoosh.index.FileIndex`. This method will call the ``create`` class method on the given class to create the index. - :return: a :class:`whoosh_reloaded.index.Index` instance. + :return: a :class:`whoosh.index.Index` instance. """ if self.readonly: raise ReadOnlyError if indexclass is None: - import whoosh_reloaded.index + import whoosh.index - indexclass = whoosh_reloaded.index.FileIndex + indexclass = whoosh.index.FileIndex return indexclass.create(self, schema, indexname) def open_index(self, indexname=_DEF_INDEX_NAME, schema=None, indexclass=None): """Opens an existing index (created using :meth:`create_index`) in this storage. - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh.filedb.filestore import FileStorage >>> st = FileStorage("indexdir") >>> # Open an index in the storage >>> ix = st.open_index() :param indexname: the name of the index within the storage object. You can use this option to store multiple indexes in the same storage. - :param schema: if you pass in a :class:`whoosh_reloaded.fields.Schema` object + :param schema: if you pass in a :class:`whoosh.fields.Schema` object using this argument, it will override the schema that was stored with the index. :param indexclass: an optional custom ``Index`` sub-class to use to open the index files. The default is - :class:`whoosh_reloaded.index.FileIndex`. This method will instantiate the + :class:`whoosh.index.FileIndex`. This method will instantiate the class with this storage object. - :return: a :class:`whoosh_reloaded.index.Index` instance. + :return: a :class:`whoosh.index.Index` instance. """ if indexclass is None: - import whoosh_reloaded.index + import whoosh.index - indexclass = whoosh_reloaded.index.FileIndex + indexclass = whoosh.index.FileIndex return indexclass(self, schema=schema, indexname=indexname) def index_exists(self, indexname=None): @@ -202,7 +202,7 @@ def create_file(self, name): """Creates a file with the given name in this storage. :param name: the name for the new file. - :return: a :class:`whoosh_reloaded.filedb.structfile.StructFile` instance. 
+ :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ raise NotImplementedError @@ -211,7 +211,7 @@ def open_file(self, name, *args, **kwargs): """Opens a file with the given name in this storage. :param name: the name for the new file. - :return: a :class:`whoosh_reloaded.filedb.structfile.StructFile` instance. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ raise NotImplementedError @@ -412,7 +412,7 @@ def create(self): """Creates this storage object's directory path using ``os.makedirs`` if it doesn't already exist. - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh.filedb.filestore import FileStorage >>> st = FileStorage("indexdir") >>> st.create() @@ -477,7 +477,7 @@ def create_file(self, name, excl=False, mode="wb", **kwargs): :param excl: if True, try to open the file in "exclusive" mode. :param mode: the mode flags with which to open the file. The default is ``"wb"``. - :return: a :class:`whoosh_reloaded.filedb.structfile.StructFile` instance. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ if self.readonly: @@ -501,8 +501,8 @@ def open_file(self, name, **kwargs): :param name: the name of the file to open. :param kwargs: additional keyword arguments are passed through to the - :class:`~whoosh_reloaded.filedb.structfile.StructFile` initializer. - :return: a :class:`whoosh_reloaded.filedb.structfile.StructFile` instance. + :class:`~whoosh.filedb.structfile.StructFile` initializer. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs) diff --git a/src/whoosh_reloaded/filedb/filetables.py b/src/whoosh/filedb/filetables.py similarity index 97% rename from src/whoosh_reloaded/filedb/filetables.py rename to src/whoosh/filedb/filetables.py index 86cc4afb..0e877468 100644 --- a/src/whoosh_reloaded/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -32,12 +32,12 @@ import os, struct, sys from binascii import crc32 -from hashlib import md5 # @UnresolvedImport +from hashlib import md5 # type: ignore @UnresolvedImport -from whoosh_reloaded.compat import b, bytes_type -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.util.numlists import GrowableArray -from whoosh_reloaded.system import _INT_SIZE, emptybytes +from whoosh.compat import b, bytes_type +from whoosh.compat import range +from whoosh.util.numlists import GrowableArray +from whoosh.system import _INT_SIZE, emptybytes # Exceptions @@ -99,7 +99,7 @@ class HashWriter(object): def __init__(self, dbfile, magic=b("HSH3"), hashtype=0): """ - :param dbfile: a :class:`~whoosh_reloaded.filedb.structfile.StructFile` object + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to write to. :param magic: the format tag bytes to write at the start of the file. :param hashtype: an integer indicating which hashing algorithm to use. @@ -122,7 +122,7 @@ def __init__(self, dbfile, magic=b("HSH3"), hashtype=0): dbfile.write_int(0) # 256 lists of hashed keys and positions - self.buckets = [[] for _ in xrange(256)] + self.buckets = [[] for _ in range(256)] # List to remember the positions of the hash tables self.directory = [] @@ -226,7 +226,7 @@ class HashReader(object): def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): """ - :param dbfile: a :class:`~whoosh_reloaded.filedb.structfile.StructFile` object + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to read from. 
:param length: the length of the file data. This is necessary since the hashing information is written at the end of the file. @@ -271,7 +271,7 @@ def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): self.tables = [] entrysize = _dir_entry.size unpackentry = _dir_entry.unpack - for _ in xrange(256): + for _ in range(256): # position, numslots self.tables.append(unpackentry(dbfile.read(entrysize))) # The position of the first hash table is the end of the key/value pairs @@ -280,7 +280,7 @@ def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): @classmethod def open(cls, storage, name): """Convenience method to open a hash file given a - :class:`whoosh_reloaded.filedb.filestore.Storage` object and a name. This takes + :class:`whoosh.filedb.filestore.Storage` object and a name. This takes care of opening the file and passing its length to the initializer. """ @@ -410,7 +410,7 @@ def ranges_for_key(self, key): # Calculate where the key's slot should be slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize) # Read slots looking for our key's hash value - for _ in xrange(numslots): + for _ in range(numslots): slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize)) # If this slot is empty, we're done if not itempos: diff --git a/src/whoosh_reloaded/filedb/filewriting.py b/src/whoosh/filedb/filewriting.py similarity index 93% rename from src/whoosh_reloaded/filedb/filewriting.py rename to src/whoosh/filedb/filewriting.py index d6df399d..e204044a 100644 --- a/src/whoosh_reloaded/filedb/filewriting.py +++ b/src/whoosh/filedb/filewriting.py @@ -18,21 +18,21 @@ from marshal import dumps from build.lib.whoosh.support import unicode -from whoosh_reloaded.fields import UnknownFieldError -from whoosh_reloaded.filedb.fileindex import SegmentDeletionMixin, Segment, SegmentSet -from whoosh_reloaded.filedb.filepostings import FilePostingWriter -from whoosh_reloaded.filedb.filetables import ( +from whoosh.fields import UnknownFieldError +from whoosh.filedb.fileindex import SegmentDeletionMixin, Segment, SegmentSet +from whoosh.filedb.filepostings import FilePostingWriter +from whoosh.filedb.filetables import ( FileListWriter, FileTableWriter, StructHashWriter, LengthWriter, ) -from whoosh_reloaded.filedb import misc -from whoosh_reloaded.filedb.pools import TempfilePool, MultiPool -from whoosh_reloaded.index import LockError -from whoosh_reloaded.util.filelock import try_for -from whoosh_reloaded.util import fib -from whoosh_reloaded.writing import IndexWriter +from whoosh.filedb import misc +from whoosh.filedb.pools import TempfilePool, MultiPool +from whoosh.index import LockError +from whoosh.util.filelock import try_for +from whoosh.util import fib +from whoosh.writing import IndexWriter # Merge policies @@ -52,7 +52,7 @@ def MERGE_SMALL(ix, writer, segments): heuristic based on the fibonacci sequence. 
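    As a usage sketch, mirroring how ``FileIndex.optimize`` passes the
    ``OPTIMIZE`` policy to ``commit`` elsewhere in this diff (the open
    index ``ix`` is assumed)::

        from whoosh.filedb.filewriting import MERGE_SMALL

        w = ix.writer()
        w.commit(MERGE_SMALL)  # merge small segments, keep large ones as-is
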
""" - from whoosh_reloaded.filedb.filereading import SegmentReader + from whoosh.filedb.filereading import SegmentReader newsegments = SegmentSet() sorted_segment_list = sorted((s.doc_count_all(), s) for s in segments) @@ -70,7 +70,7 @@ def MERGE_SMALL(ix, writer, segments): def OPTIMIZE(ix, writer, segments): """This policy merges all existing segments.""" - from whoosh_reloaded.filedb.filereading import SegmentReader + from whoosh.filedb.filereading import SegmentReader for seg in segments: writer.add_reader(SegmentReader(ix.storage, seg, ix.schema)) diff --git a/src/whoosh_reloaded/filedb/gae.py b/src/whoosh/filedb/gae.py similarity index 87% rename from src/whoosh_reloaded/filedb/gae.py rename to src/whoosh/filedb/gae.py index 2ff263fe..2e13d846 100644 --- a/src/whoosh_reloaded/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -8,7 +8,7 @@ To create a new index:: - from whoosh_reloaded.filedb.gae import DatastoreStorage + from whoosh.filedb.gae import DatastoreStorage ix = DatastoreStorage().create_index(schema) @@ -19,13 +19,13 @@ import time -from google.appengine.api import memcache # @UnresolvedImport -from google.appengine.ext import db # @UnresolvedImport +from google.appengine.api import memcache # type: ignore @UnresolvedImport +from google.appengine.ext import db # type: ignore @UnresolvedImport -from whoosh_reloaded.compat import BytesIO -from whoosh_reloaded.index import TOC, FileIndex, _DEF_INDEX_NAME -from whoosh_reloaded.filedb.filestore import ReadOnlyError, Storage -from whoosh_reloaded.filedb.structfile import StructFile +from whoosh.compat import BytesIO +from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME +from whoosh.filedb.filestore import ReadOnlyError, Storage +from whoosh.filedb.structfile import StructFile class DatastoreFile(db.Model): @@ -83,7 +83,7 @@ def __init__(self, name): self.name = name def acquire(self, blocking=False): - val = memcache.add(self.name, "L", 360, namespace="whoosh_reloadedlocks") + val = memcache.add(self.name, "L", 360, namespace="whooshlocks") if blocking and not val: # Simulate blocking by retrying the acquire over and over @@ -91,16 +91,16 @@ def acquire(self, blocking=False): while not val: time.sleep(0.1) - val = memcache.add(self.name, "", 360, namespace="whoosh_reloadedlocks") + val = memcache.add(self.name, "", 360, namespace="whooshlocks") return val def release(self): - memcache.delete(self.name, namespace="whoosh_reloadedlocks") + memcache.delete(self.name, namespace="whooshlocks") class DatastoreStorage(Storage): - """An implementation of :class:`whoosh_reloaded.store.Storage` that stores files in + """An implementation of :class:`whoosh.store.Storage` that stores files in the app engine datastore as blob properties. 
""" diff --git a/src/whoosh_reloaded/filedb/misc.py b/src/whoosh/filedb/misc.py similarity index 100% rename from src/whoosh_reloaded/filedb/misc.py rename to src/whoosh/filedb/misc.py diff --git a/src/whoosh_reloaded/filedb/pools.py b/src/whoosh/filedb/pools.py similarity index 78% rename from src/whoosh_reloaded/filedb/pools.py rename to src/whoosh/filedb/pools.py index df2670f0..3ca83768 100644 --- a/src/whoosh_reloaded/filedb/pools.py +++ b/src/whoosh/filedb/pools.py @@ -1,44 +1,42 @@ - -#=============================================================================== +# =============================================================================== # Copyright 2010 Matt Chaput -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import itertools, os, shutil, tempfile, time +import os +import shutil +import tempfile +import time from collections import defaultdict -from heapq import heapify, heappush, heappop -from marshal import load, dump, loads, dumps -from Queue import Empty +from heapq import heapify, heappop, heappush +from marshal import dump, load from multiprocessing import Process, Queue from struct import Struct from whoosh.filedb.filetables import LengthWriter -from whoosh.filedb.structfile import StructFile -from whoosh.system import (_INT_SIZE, _SHORT_SIZE, - pack_ushort, unpack_ushort) -from whoosh.util import utf8encode, utf8decode, length_to_byte +from whoosh.util import length_to_byte _2int_struct = Struct("!II") pack2ints = _2int_struct.pack unpack2ints = _2int_struct.unpack -_length_struct = Struct("!IHB") # Docnum, fieldnum, lengthbyte +_length_struct = Struct("!IHB") # Docnum, fieldnum, lengthbyte pack_length = _length_struct.pack unpack_length = _length_struct.unpack -#def encode_posting(fieldNum, text, doc, freq, datastring): +# def encode_posting(fieldNum, text, doc, freq, datastring): # """Encodes a posting as a string, for sorting. # """ # @@ -48,7 +46,7 @@ # pack2ints(doc, freq), # datastring)) # -#def decode_posting(posting): +# def decode_posting(posting): # """Decodes an encoded posting string into a # (field_number, text, document_number, datastring) tuple. 
# """ @@ -75,7 +73,7 @@ def imerge(iterators): except StopIteration: pass heapify(current) - + while len(current) > 1: item, gen = heappop(current) yield item @@ -83,13 +81,14 @@ def imerge(iterators): heappush(current, (gen.next(), gen)) except StopIteration: pass - + if current: item, gen = current[0] yield item for item in gen: yield item + def bimerge(iter1, iter2): try: p1 = iter1.next() @@ -97,14 +96,14 @@ def bimerge(iter1, iter2): for p2 in iter2: yield p2 return - + try: p2 = iter2.next() except StopIteration: for p1 in iter1: yield p1 return - + while True: if p1 < p2: yield p1 @@ -123,16 +122,17 @@ def bimerge(iter1, iter2): yield p1 return + def dividemerge(iters): length = len(iters) if length == 0: return [] if length == 1: return iters[0] - + mid = length >> 1 return bimerge(dividemerge(iters[:mid]), dividemerge(iters[mid:])) - + def read_run(filename, count): f = open(filename, "rb") @@ -150,8 +150,8 @@ def write_postings(schema, termtable, postwriter, postiter): # count the document frequency and sum the terms by looking at the # postings). - current_fieldnum = None # Field number of the current term - current_text = None # Text of the current term + current_fieldnum = None # Field number of the current term + current_text = None # Text of the current term first = True current_freq = 0 offset = None @@ -167,8 +167,9 @@ def write_postings(schema, termtable, postwriter, postiter): # This is a new term, so finish the postings and add the # term to the term table postcount = postwriter.finish() - termtable.add((current_fieldnum, current_text), - (current_freq, offset, postcount)) + termtable.add( + (current_fieldnum, current_text), (current_freq, offset, postcount) + ) # Reset the post writer and the term variables current_fieldnum = fieldnum @@ -176,11 +177,14 @@ def write_postings(schema, termtable, postwriter, postiter): current_freq = 0 offset = postwriter.start(fieldnum) - elif (fieldnum < current_fieldnum - or (fieldnum == current_fieldnum and text < current_text)): + elif fieldnum < current_fieldnum or ( + fieldnum == current_fieldnum and text < current_text + ): # This should never happen! - raise Exception("Postings are out of order: %s:%s .. %s:%s" % - (current_fieldnum, current_text, fieldnum, text)) + raise Exception( + "Postings are out of order: %s:%s .. 
%s:%s" + % (current_fieldnum, current_text, fieldnum, text) + ) # Write a posting for this occurrence of the current term current_freq += freq @@ -189,31 +193,33 @@ def write_postings(schema, termtable, postwriter, postiter): # If there are still "uncommitted" postings at the end, finish them off if not first: postcount = postwriter.finish() - termtable.add((current_fieldnum, current_text), - (current_freq, offset, postcount)) + termtable.add( + (current_fieldnum, current_text), (current_freq, offset, postcount) + ) class LengthSpool(object): def __init__(self, filename): self.filename = filename self.file = None - + def create(self): self.file = open(self.filename, "wb") - + def add(self, docnum, fieldnum, length): self.file.write(pack_length(docnum, fieldnum, length_to_byte(length))) - + def finish(self): self.file.close() self.file = None - + def readback(self): f = open(self.filename, "rb") size = _length_struct.size while True: data = f.read(size) - if not data: break + if not data: + break yield unpack_length(data) f.close() @@ -223,69 +229,69 @@ def __init__(self, dir): self._dir = dir self._fieldlength_totals = defaultdict(int) self._fieldlength_maxes = {} - + def _filename(self, name): return os.path.join(self._dir, name) - + def cancel(self): pass - + def fieldlength_totals(self): return dict(self._fieldlength_totals) - + def fieldlength_maxes(self): return self._fieldlength_maxes - + class TempfilePool(PoolBase): - def __init__(self, lengthfile, limitmb=32, dir=None, basename='', **kw): + def __init__(self, lengthfile, limitmb=32, dir=None, basename="", **kw): if dir is None: dir = tempfile.mkdtemp("whoosh") PoolBase.__init__(self, dir) - + self.lengthfile = lengthfile self.limit = limitmb * 1024 * 1024 - + self.size = 0 self.count = 0 self.postings = [] self.runs = [] - + self.basename = basename - + self.lenspool = LengthSpool(self._filename(basename + "length")) self.lenspool.create() - + def add_content(self, docnum, fieldnum, field, value): add_posting = self.add_posting termcount = 0 # TODO: Method for adding progressive field values, ie # setting start_pos/start_char? for w, freq, valuestring in field.index(value): - #assert w != "" + # assert w != "" add_posting(fieldnum, w, docnum, freq, valuestring) termcount += freq - + if field.scorable and termcount: self.add_field_length(docnum, fieldnum, termcount) - + return termcount - + def add_posting(self, fieldnum, text, docnum, freq, datastring): if self.size >= self.limit: - #print "Flushing..." 
+ # print ("Flushing...") self.dump_run() self.size += len(text) + 2 + 8 + len(datastring) self.postings.append((fieldnum, text, docnum, freq, datastring)) self.count += 1 - + def add_field_length(self, docnum, fieldnum, length): self._fieldlength_totals[fieldnum] += length if length > self._fieldlength_maxes.get(fieldnum, 0): self._fieldlength_maxes[fieldnum] = length self.lenspool.add(docnum, fieldnum, length) - + def dump_run(self): if self.size > 0: tempname = self._filename(self.basename + str(time.time()) + ".run") @@ -299,44 +305,45 @@ def dump_run(self): self.postings = [] self.size = 0 self.count = 0 - + def run_filenames(self): return [filename for filename, _ in self.runs] - + def cancel(self): self.cleanup() - + def cleanup(self): shutil.rmtree(self._dir) - + def _finish_lengths(self, schema, doccount): - lengthfile = LengthWriter(self.lengthfile, doccount, - schema.scorable_fields()) + lengthfile = LengthWriter(self.lengthfile, doccount, schema.scorable_fields()) lengthfile.add_all(self.lenspool.readback()) lengthfile.close() - + def finish(self, schema, doccount, termtable, postingwriter): self.lenspool.finish() self._finish_lengths(schema, doccount) - + if self.postings and len(self.runs) == 0: self.postings.sort() postiter = iter(self.postings) - #total = len(self.postings) + # total = len(self.postings) elif not self.postings and not self.runs: postiter = iter([]) - #total = 0 + # total = 0 else: - postiter = imerge([read_run(runname, count) - for runname, count in self.runs]) - #total = sum(count for runname, count in self.runs) - + postiter = imerge( + [read_run(runname, count) for runname, count in self.runs] + ) + # total = sum(count for runname, count in self.runs) + write_postings(schema, termtable, postingwriter, postiter) self.cleanup() - + # Multiprocessing + class PoolWritingTask(Process): def __init__(self, dir, postingqueue, resultqueue, limitmb): Process.__init__(self) @@ -344,19 +351,20 @@ def __init__(self, dir, postingqueue, resultqueue, limitmb): self.postingqueue = postingqueue self.resultqueue = resultqueue self.limitmb = limitmb - + def run(self): pqueue = self.postingqueue rqueue = self.resultqueue - - subpool = TempfilePool(None, limitmb=self.limitmb, dir=self.dir, - basename=self.name) - + + subpool = TempfilePool( + None, limitmb=self.limitmb, dir=self.dir, basename=self.name + ) + while True: unit = pqueue.get() if unit is None: break - + code, args = unit if code == 0: subpool.add_content(*args) @@ -364,66 +372,75 @@ def run(self): subpool.add_posting(*args) elif code == 2: subpool.add_field_length(*args) - + subpool.lenspool.finish() subpool.dump_run() - rqueue.put((subpool.runs, subpool.fieldlength_totals(), - subpool.fieldlength_maxes(), subpool.lenspool)) + rqueue.put( + ( + subpool.runs, + subpool.fieldlength_totals(), + subpool.fieldlength_maxes(), + subpool.lenspool, + ) + ) class MultiPool(PoolBase): def __init__(self, lengthfile, procs=2, limitmb=32, **kw): dir = tempfile.mkdtemp(".whoosh") PoolBase.__init__(self, dir) - + self.lengthfile = lengthfile - + self.procs = procs self.limitmb = limitmb - + self.postingqueue = Queue() self.resultsqueue = Queue() - self.tasks = [PoolWritingTask(self._dir, self.postingqueue, - self.resultsqueue, self.limitmb) - for _ in xrange(procs)] + self.tasks = [ + PoolWritingTask( + self._dir, self.postingqueue, self.resultsqueue, self.limitmb + ) + for _ in range(procs) + ] for task in self.tasks: task.start() - + def add_content(self, *args): self.postingqueue.put((0, args)) - + def add_posting(self, 
*args): self.postingqueue.put((1, args)) - + def add_field_length(self, *args): self.postingqueue.put((2, args)) - + def cancel(self): for task in self.tasks: task.terminate() self.cleanup() - + def cleanup(self): shutil.rmtree(self._dir) - + def finish(self, schema, doccount, termtable, postingwriter): _fieldlength_totals = self._fieldlength_totals if not self.tasks: return - + pqueue = self.postingqueue rqueue = self.resultsqueue - - for _ in xrange(self.procs): + + for _ in range(self.procs): pqueue.put(None) - - print "Joining..." + + print("Joining...") t = time.time() for task in self.tasks: task.join() - print "Join:", time.time() - t - - print "Getting results..." + print("Join:", time.time() - t) + + print("Getting results...") t = time.time() runs = [] lenspools = [] @@ -436,30 +453,24 @@ def finish(self, schema, doccount, termtable, postingwriter): for fieldnum, length in flenmaxes.iteritems(): if length > self._fieldlength_maxes.get(fieldnum, 0): self._fieldlength_maxes[fieldnum] = length - print "Results:", time.time() - t - - print "Writing lengths..." + print("Results:", time.time() - t) + + print("Writing lengths...") t = time.time() lengthfile = LengthWriter(self.lengthfile, doccount, schema.scorable_fields()) for lenspool in lenspools: lengthfile.add_all(lenspool.readback()) lengthfile.close() - print "Lengths:", time.time() - t - + print("Lengths:", time.time() - t) + t = time.time() - iterator = dividemerge([read_run(runname, count) - for runname, count in runs]) + iterator = dividemerge([read_run(runname, count) for runname, count in runs]) total = sum(count for runname, count in runs) write_postings(schema, termtable, postingwriter, iterator) - print "Merge:", time.time() - t - + print("Merge:", time.time() - t) + self.cleanup() - if __name__ == "__main__": pass - - - - diff --git a/src/whoosh_reloaded/filedb/structfile.py b/src/whoosh/filedb/structfile.py similarity index 91% rename from src/whoosh_reloaded/filedb/structfile.py rename to src/whoosh/filedb/structfile.py index f31d571b..ffe4649f 100644 --- a/src/whoosh_reloaded/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -29,21 +29,21 @@ from copy import copy from struct import calcsize -from whoosh_reloaded.compat import BytesIO, bytes_type -from whoosh_reloaded.compat import dump as dump_pickle -from whoosh_reloaded.compat import load as load_pickle -from whoosh_reloaded.compat import array_frombytes, array_tobytes -from whoosh_reloaded.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE -from whoosh_reloaded.system import IS_LITTLE -from whoosh_reloaded.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte -from whoosh_reloaded.system import pack_ushort, unpack_ushort -from whoosh_reloaded.system import pack_ushort_le, unpack_ushort_le -from whoosh_reloaded.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh_reloaded.system import pack_uint_le, unpack_uint_le -from whoosh_reloaded.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh_reloaded.system import pack_float, unpack_float -from whoosh_reloaded.util.varints import varint, read_varint -from whoosh_reloaded.util.varints import signed_varint, decode_signed_varint +from whoosh.compat import BytesIO, bytes_type +from whoosh.compat import dump as dump_pickle +from whoosh.compat import load as load_pickle +from whoosh.compat import array_frombytes, array_tobytes +from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE +from whoosh.system import IS_LITTLE +from 
whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte +from whoosh.system import pack_ushort, unpack_ushort +from whoosh.system import pack_ushort_le, unpack_ushort_le +from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint +from whoosh.system import pack_uint_le, unpack_uint_le +from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong +from whoosh.system import pack_float, unpack_float +from whoosh.util.varints import varint, read_varint +from whoosh.util.varints import signed_varint, decode_signed_varint _SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") @@ -127,7 +127,7 @@ def close(self): self.is_closed = True def subset(self, offset, length, name=None): - from whoosh_reloaded.filedb.compound import SubFile + from whoosh.filedb.compound import SubFile name = name or self._name return StructFile(SubFile(self.file, offset, length), name=name) diff --git a/src/whoosh_reloaded/formats.py b/src/whoosh/formats.py similarity index 98% rename from src/whoosh_reloaded/formats.py rename to src/whoosh/formats.py index 6068edc2..43d4aadc 100644 --- a/src/whoosh_reloaded/formats.py +++ b/src/whoosh/formats.py @@ -33,11 +33,11 @@ from collections import defaultdict -from whoosh_reloaded.analysis import unstopped, entoken -from whoosh_reloaded.compat import iteritems, dumps, loads, b -from whoosh_reloaded.system import emptybytes -from whoosh_reloaded.system import _INT_SIZE, _FLOAT_SIZE -from whoosh_reloaded.system import pack_uint, unpack_uint, pack_float, unpack_float +from whoosh.analysis import unstopped, entoken +from whoosh.compat import iteritems, dumps, loads, b +from whoosh.system import emptybytes +from whoosh.system import _INT_SIZE, _FLOAT_SIZE +from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float # Format base class diff --git a/src/whoosh_reloaded/highlight.py b/src/whoosh/highlight.py similarity index 99% rename from src/whoosh_reloaded/highlight.py rename to src/whoosh/highlight.py index 59934c01..3b526413 100644 --- a/src/whoosh_reloaded/highlight.py +++ b/src/whoosh/highlight.py @@ -54,8 +54,8 @@ from heapq import nlargest from itertools import groupby -from whoosh_reloaded.analysis import Token -from whoosh_reloaded.compat import htmlescape +from whoosh.analysis import Token +from whoosh.compat import htmlescape # The default value for the maximum chars to examine when fragmenting DEFAULT_CHARLIMIT = 2**15 @@ -864,8 +864,8 @@ def __init__(self, qname="strong", between="..."): self.qname = qname self.between = between - from genshi.core import START, END, TEXT # @UnresolvedImport - from genshi.core import Attrs, Stream # @UnresolvedImport + from genshi.core import START, END, TEXT # type: ignore @UnresolvedImport + from genshi.core import Attrs, Stream # type: ignore @UnresolvedImport self.START, self.END, self.TEXT = START, END, TEXT self.Attrs, self.Stream = Attrs, Stream diff --git a/src/whoosh_reloaded/idsets.py b/src/whoosh/idsets.py similarity index 97% rename from src/whoosh_reloaded/idsets.py rename to src/whoosh/idsets.py index 8b0f0371..d17aff10 100644 --- a/src/whoosh_reloaded/idsets.py +++ b/src/whoosh/idsets.py @@ -6,8 +6,8 @@ from array import array from bisect import bisect_left, bisect_right -from whoosh_reloaded.compat import izip, izip_longest, next, xrange -from whoosh_reloaded.util.numeric import bytes_for_bits +from whoosh.compat import izip, izip_longest, next, range +from whoosh.util.numeric import bytes_for_bits # Number of '1' bits in each byte (0-255) @@ -339,7 
+339,7 @@ def invert_update(self, size): ``[0 - size)`` except numbers that are in this set. """ - for i in xrange(size): + for i in range(size): if i in self: self.discard(i) else: @@ -412,7 +412,7 @@ def __len__(self): def __iter__(self): base = 0 for byte in self._iter_bytes(): - for i in xrange(8): + for i in range(8): if byte & (1 << i): yield base + i base += 8 @@ -504,7 +504,7 @@ class OnDiskBitSet(BaseBitSet): def __init__(self, dbfile, basepos, bytecount): """ - :param dbfile: a :class:`~whoosh_reloaded.filedb.structfile.StructFile` object + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to read from. :param basepos: the base position of the bytes in the given file. :param bytecount: the number of bytes to use for the bit array. @@ -531,7 +531,7 @@ def _get_byte(self, n): def _iter_bytes(self): dbfile = self._dbfile dbfile.seek(self._basepos) - for _ in xrange(self._bytecount): + for _ in range(self._bytecount): yield dbfile.read_byte() @@ -553,7 +553,7 @@ def __init__(self, source=None, size=0): if not size and isinstance(source, (list, tuple, set, frozenset)): size = max(source) bytecount = bytes_for_bits(size) - self.bits = array("B", (0 for _ in xrange(bytecount))) + self.bits = array("B", (0 for _ in range(bytecount))) if source: add = self.add @@ -628,7 +628,7 @@ def copy(self): return b def clear(self): - for i in xrange(len(self.bits)): + for i in range(len(self.bits)): self.bits[i] = 0 def add(self, i): @@ -668,7 +668,7 @@ def difference_update(self, other): def invert_update(self, size): bits = self.bits - for i in xrange(len(bits)): + for i in range(len(bits)): bits[i] = ~bits[i] & 0xFF self._zero_extra_bits(size) @@ -824,7 +824,7 @@ def __iter__(self): except StopIteration: nx = -1 - for i in xrange(self.limit): + for i in range(self.limit): if i == nx: try: nx = next(ids) @@ -849,7 +849,7 @@ def last(self): if idset.last() < maxid - 1: return maxid - for i in xrange(maxid, -1, -1): + for i in range(maxid, -1, -1): if i not in idset: return i @@ -894,7 +894,7 @@ def _find(self, n): floor = n << 16 if bucket >= len(self.idsets): self.idsets.extend( - [SortedIntSet() for _ in xrange(len(self.idsets), bucket + 1)] + [SortedIntSet() for _ in range(len(self.idsets), bucket + 1)] ) idset = self.idsets[bucket] return bucket, floor, idset diff --git a/src/whoosh_reloaded/index.py b/src/whoosh/index.py similarity index 94% rename from src/whoosh_reloaded/index.py rename to src/whoosh/index.py index 3a85ec12..b2acf559 100644 --- a/src/whoosh_reloaded/index.py +++ b/src/whoosh/index.py @@ -36,11 +36,11 @@ import sys from time import time, sleep -from whoosh_reloaded import __version__ -from whoosh_reloaded.compat import pickle, string_type -from whoosh_reloaded.fields import ensure_schema -from whoosh_reloaded.legacy import toc_loaders -from whoosh_reloaded.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE +from whoosh import __version__ +from whoosh.compat import pickle, string_type +from whoosh.fields import ensure_schema +from whoosh.legacy import toc_loaders +from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE _DEF_INDEX_NAME = "MAIN" _CURRENT_TOC_VERSION = -111 @@ -89,7 +89,7 @@ def create_in(dirname, schema, indexname=None): :param dirname: the path string of the directory in which to create the index. - :param schema: a :class:`whoosh_reloaded.fields.Schema` object describing the + :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields. 
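Stepping back to the idsets.py hunks above: they swap `xrange` for `range` around Whoosh's packed bit sets, which store eight document numbers per byte. A self-contained sketch of the bit-test loop used by `__iter__` there (the helper name is illustrative):

```python
# Each byte covers 8 ids, so test bit i with (1 << i) and offset by the
# byte's base position, exactly as in the __iter__ method above.
def iter_bits(bytes_seq):
    base = 0
    for byte in bytes_seq:
        for i in range(8):
            if byte & (1 << i):
                yield base + i
        base += 8

print(list(iter_bits([0b00000101, 0b10000000])))  # -> [0, 2, 15]
```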
:param indexname: the name of the index to create; you only need to specify this if you are creating multiple indexes within the same storage @@ -97,7 +97,7 @@ def create_in(dirname, schema, indexname=None): :returns: :class:`Index` """ - from whoosh_reloaded.filedb.filestore import FileStorage + from whoosh.filedb.filestore import FileStorage if not indexname: indexname = _DEF_INDEX_NAME @@ -118,7 +118,7 @@ def open_dir(dirname, indexname=None, readonly=False, schema=None): this if you have multiple indexes within the same storage object. """ - from whoosh_reloaded.filedb.filestore import FileStorage + from whoosh.filedb.filestore import FileStorage if indexname is None: indexname = _DEF_INDEX_NAME @@ -165,7 +165,7 @@ def version_in(dirname, indexname=None): version). This is simply a version number for the TOC file and probably should not have been exposed in a public interface. The best way to check if the current version of Whoosh can open an index is to actually try to - open it and see if it raises a ``whoosh_reloaded.index.IndexVersionError`` exception. + open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. Note that the release and format version are available as attributes on the Index object in Index.release and Index.version. @@ -176,7 +176,7 @@ def version_in(dirname, indexname=None): :returns: ((major_ver, minor_ver, build_ver), format_ver) """ - from whoosh_reloaded.filedb.filestore import FileStorage + from whoosh.filedb.filestore import FileStorage storage = FileStorage(dirname) return version(storage, indexname=indexname) @@ -192,7 +192,7 @@ def version(storage, indexname=None): version). This is simply a version number for the TOC file and probably should not have been exposed in a public interface. The best way to check if the current version of Whoosh can open an index is to actually try to - open it and see if it raises a ``whoosh_reloaded.index.IndexVersionError`` exception. + open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. Note that the release and format version are available as attributes on the Index object in Index.release and Index.version. @@ -231,7 +231,7 @@ def add_field(self, fieldname, fieldspec): """Adds a field to the index's schema. :param fieldname: the name of the field to add. - :param fieldspec: an instantiated :class:`whoosh_reloaded.fields.FieldType` + :param fieldspec: an instantiated :class:`whoosh.fields.FieldType` object. """ @@ -313,10 +313,10 @@ def searcher(self, **kwargs): """Returns a Searcher object for this index. Keyword arguments are passed to the Searcher object's constructor. - :rtype: :class:`whoosh_reloaded.searching.Searcher` + :rtype: :class:`whoosh.searching.Searcher` """ - from whoosh_reloaded.searching import Searcher + from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self, **kwargs) @@ -345,7 +345,7 @@ def reader(self, reuse=None): resources from this existing reader to create the new reader. Note that any resources in the "recycled" reader that are not used by the new reader will be CLOSED, so you CANNOT use it afterward. - :rtype: :class:`whoosh_reloaded.reading.IndexReader` + :rtype: :class:`whoosh.reading.IndexReader` """ raise NotImplementedError @@ -353,7 +353,7 @@ def reader(self, reuse=None): def writer(self, **kwargs): """Returns an IndexWriter object for this index. 
- :rtype: :class:`whoosh_reloaded.writing.IndexWriter` + :rtype: :class:`whoosh.writing.IndexWriter` """ raise NotImplementedError @@ -405,7 +405,7 @@ def clean_files(storage, indexname, gen, segments): class FileIndex(Index): def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME): - from whoosh_reloaded.filedb.filestore import Storage + from whoosh.filedb.filestore import Storage if not isinstance(storage, Storage): raise ValueError("%r is not a Storage object" % storage) @@ -458,11 +458,11 @@ def optimize(self, **kwargs): def writer(self, procs=1, **kwargs): if procs > 1: - from whoosh_reloaded.multiproc import MpWriter + from whoosh.multiproc import MpWriter return MpWriter(self, procs=procs, **kwargs) else: - from whoosh_reloaded.writing import SegmentWriter + from whoosh.writing import SegmentWriter return SegmentWriter(self, **kwargs) @@ -498,7 +498,7 @@ def version(self): def _reader(cls, storage, schema, segments, generation, reuse=None): # Returns a reader for the given segments, possibly reusing already # opened readers - from whoosh_reloaded.reading import SegmentReader, MultiReader, EmptyReader + from whoosh.reading import SegmentReader, MultiReader, EmptyReader if reuse: # Merge segments with reuse segments diff --git a/src/whoosh_reloaded/lang/__init__.py b/src/whoosh/lang/__init__.py similarity index 100% rename from src/whoosh_reloaded/lang/__init__.py rename to src/whoosh/lang/__init__.py diff --git a/src/whoosh_reloaded/lang/dmetaphone.py b/src/whoosh/lang/dmetaphone.py similarity index 99% rename from src/whoosh_reloaded/lang/dmetaphone.py rename to src/whoosh/lang/dmetaphone.py index 69ab3faa..b7bb23ce 100644 --- a/src/whoosh_reloaded/lang/dmetaphone.py +++ b/src/whoosh/lang/dmetaphone.py @@ -8,7 +8,7 @@ import re -from whoosh_reloaded.compat import u +from whoosh.compat import u vowels = frozenset("AEIOUY") slavo_germ_exp = re.compile("W|K|CZ|WITZ") diff --git a/src/whoosh_reloaded/lang/isri.py b/src/whoosh/lang/isri.py similarity index 100% rename from src/whoosh_reloaded/lang/isri.py rename to src/whoosh/lang/isri.py diff --git a/src/whoosh_reloaded/lang/lovins.py b/src/whoosh/lang/lovins.py similarity index 100% rename from src/whoosh_reloaded/lang/lovins.py rename to src/whoosh/lang/lovins.py diff --git a/src/whoosh_reloaded/lang/morph_en.py b/src/whoosh/lang/morph_en.py similarity index 99% rename from src/whoosh_reloaded/lang/morph_en.py rename to src/whoosh/lang/morph_en.py index 0aa75745..359baa24 100644 --- a/src/whoosh_reloaded/lang/morph_en.py +++ b/src/whoosh/lang/morph_en.py @@ -8,7 +8,7 @@ class of Sun's `Minion search engine `_. import re -from whoosh_reloaded.compat import xrange, iteritems +from whoosh.compat import range, iteritems # Rule exceptions @@ -1099,7 +1099,7 @@ class of Sun's `Minion search engine `_. 
_partition_size = 20 _partitions = [] -for p in xrange(0, len(rules) // _partition_size + 1): +for p in range(0, len(rules) // _partition_size + 1): start = p * _partition_size end = (p + 1) * _partition_size pattern = "|".join( diff --git a/src/whoosh_reloaded/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py similarity index 100% rename from src/whoosh_reloaded/lang/paicehusk.py rename to src/whoosh/lang/paicehusk.py diff --git a/src/whoosh_reloaded/lang/phonetic.py b/src/whoosh/lang/phonetic.py similarity index 98% rename from src/whoosh_reloaded/lang/phonetic.py rename to src/whoosh/lang/phonetic.py index 04817518..4a760ec7 100644 --- a/src/whoosh_reloaded/lang/phonetic.py +++ b/src/whoosh/lang/phonetic.py @@ -6,7 +6,7 @@ import re -from whoosh_reloaded.compat import iteritems +from whoosh.compat import iteritems # This soundex implementation is adapted from the recipe here: # http://code.activestate.com/recipes/52213/ diff --git a/src/whoosh_reloaded/lang/porter.py b/src/whoosh/lang/porter.py similarity index 100% rename from src/whoosh_reloaded/lang/porter.py rename to src/whoosh/lang/porter.py diff --git a/src/whoosh_reloaded/lang/porter2.py b/src/whoosh/lang/porter2.py similarity index 100% rename from src/whoosh_reloaded/lang/porter2.py rename to src/whoosh/lang/porter2.py diff --git a/src/whoosh_reloaded/lang/snowball/LICENSE.txt b/src/whoosh/lang/snowball/LICENSE.txt similarity index 100% rename from src/whoosh_reloaded/lang/snowball/LICENSE.txt rename to src/whoosh/lang/snowball/LICENSE.txt diff --git a/src/whoosh_reloaded/lang/snowball/__init__.py b/src/whoosh/lang/snowball/__init__.py similarity index 100% rename from src/whoosh_reloaded/lang/snowball/__init__.py rename to src/whoosh/lang/snowball/__init__.py diff --git a/src/whoosh_reloaded/lang/snowball/bases.py b/src/whoosh/lang/snowball/bases.py similarity index 100% rename from src/whoosh_reloaded/lang/snowball/bases.py rename to src/whoosh/lang/snowball/bases.py diff --git a/src/whoosh_reloaded/lang/snowball/danish.py b/src/whoosh/lang/snowball/danish.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/danish.py rename to src/whoosh/lang/snowball/danish.py index fb983b8b..8c4f4878 100644 --- a/src/whoosh_reloaded/lang/snowball/danish.py +++ b/src/whoosh/lang/snowball/danish.py @@ -1,6 +1,6 @@ from .bases import _ScandinavianStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class DanishStemmer(_ScandinavianStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/dutch.py b/src/whoosh/lang/snowball/dutch.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/dutch.py rename to src/whoosh/lang/snowball/dutch.py index 5bae2538..0d683649 100644 --- a/src/whoosh_reloaded/lang/snowball/dutch.py +++ b/src/whoosh/lang/snowball/dutch.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class DutchStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/english.py b/src/whoosh/lang/snowball/english.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/english.py rename to src/whoosh/lang/snowball/english.py index fc2d3fbe..a2567dab 100644 --- a/src/whoosh_reloaded/lang/snowball/english.py +++ b/src/whoosh/lang/snowball/english.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class EnglishStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/finnish.py 
b/src/whoosh/lang/snowball/finnish.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/finnish.py rename to src/whoosh/lang/snowball/finnish.py index 7799e25c..63f5a752 100644 --- a/src/whoosh_reloaded/lang/snowball/finnish.py +++ b/src/whoosh/lang/snowball/finnish.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class FinnishStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/french.py b/src/whoosh/lang/snowball/french.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/french.py rename to src/whoosh/lang/snowball/french.py index fbf466c9..f204adf3 100644 --- a/src/whoosh_reloaded/lang/snowball/french.py +++ b/src/whoosh/lang/snowball/french.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class FrenchStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/german.py b/src/whoosh/lang/snowball/german.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/german.py rename to src/whoosh/lang/snowball/german.py index 5eabca7a..1c5f94f3 100644 --- a/src/whoosh_reloaded/lang/snowball/german.py +++ b/src/whoosh/lang/snowball/german.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class GermanStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/hungarian.py b/src/whoosh/lang/snowball/hungarian.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/hungarian.py rename to src/whoosh/lang/snowball/hungarian.py index 6588b882..05597c5d 100644 --- a/src/whoosh_reloaded/lang/snowball/hungarian.py +++ b/src/whoosh/lang/snowball/hungarian.py @@ -1,4 +1,4 @@ -from whoosh_reloaded.compat import u +from whoosh.compat import u class HungarianStemmer(object): diff --git a/src/whoosh_reloaded/lang/snowball/italian.py b/src/whoosh/lang/snowball/italian.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/italian.py rename to src/whoosh/lang/snowball/italian.py index 729d0dd6..daadac9a 100644 --- a/src/whoosh_reloaded/lang/snowball/italian.py +++ b/src/whoosh/lang/snowball/italian.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class ItalianStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/norwegian.py b/src/whoosh/lang/snowball/norwegian.py similarity index 98% rename from src/whoosh_reloaded/lang/snowball/norwegian.py rename to src/whoosh/lang/snowball/norwegian.py index 3d868a46..4bc0f7b0 100644 --- a/src/whoosh_reloaded/lang/snowball/norwegian.py +++ b/src/whoosh/lang/snowball/norwegian.py @@ -1,6 +1,6 @@ from .bases import _ScandinavianStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class NorwegianStemmer(_ScandinavianStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/portugese.py b/src/whoosh/lang/snowball/portugese.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/portugese.py rename to src/whoosh/lang/snowball/portugese.py index 48534e2e..54dcb5aa 100644 --- a/src/whoosh_reloaded/lang/snowball/portugese.py +++ b/src/whoosh/lang/snowball/portugese.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class PortugueseStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/romanian.py 
b/src/whoosh/lang/snowball/romanian.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/romanian.py rename to src/whoosh/lang/snowball/romanian.py index 7bce4449..89a96de6 100644 --- a/src/whoosh_reloaded/lang/snowball/romanian.py +++ b/src/whoosh/lang/snowball/romanian.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class RomanianStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/russian.py b/src/whoosh/lang/snowball/russian.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/russian.py rename to src/whoosh/lang/snowball/russian.py index 69ba313e..dc4a825e 100644 --- a/src/whoosh_reloaded/lang/snowball/russian.py +++ b/src/whoosh/lang/snowball/russian.py @@ -1,4 +1,4 @@ -from whoosh_reloaded.compat import u +from whoosh.compat import u class RussianStemmer(object): diff --git a/src/whoosh_reloaded/lang/snowball/spanish.py b/src/whoosh/lang/snowball/spanish.py similarity index 99% rename from src/whoosh_reloaded/lang/snowball/spanish.py rename to src/whoosh/lang/snowball/spanish.py index 7286275d..ccb21871 100644 --- a/src/whoosh_reloaded/lang/snowball/spanish.py +++ b/src/whoosh/lang/snowball/spanish.py @@ -1,6 +1,6 @@ from .bases import _StandardStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class SpanishStemmer(_StandardStemmer): diff --git a/src/whoosh_reloaded/lang/snowball/swedish.py b/src/whoosh/lang/snowball/swedish.py similarity index 98% rename from src/whoosh_reloaded/lang/snowball/swedish.py rename to src/whoosh/lang/snowball/swedish.py index 20524b82..9303e3f7 100644 --- a/src/whoosh_reloaded/lang/snowball/swedish.py +++ b/src/whoosh/lang/snowball/swedish.py @@ -1,6 +1,6 @@ from .bases import _ScandinavianStemmer -from whoosh_reloaded.compat import u +from whoosh.compat import u class SwedishStemmer(_ScandinavianStemmer): diff --git a/src/whoosh_reloaded/lang/stopwords.py b/src/whoosh/lang/stopwords.py similarity index 100% rename from src/whoosh_reloaded/lang/stopwords.py rename to src/whoosh/lang/stopwords.py diff --git a/src/whoosh_reloaded/lang/wordnet.py b/src/whoosh/lang/wordnet.py similarity index 94% rename from src/whoosh_reloaded/lang/wordnet.py rename to src/whoosh/lang/wordnet.py index 538f254a..843da196 100644 --- a/src/whoosh_reloaded/lang/wordnet.py +++ b/src/whoosh/lang/wordnet.py @@ -34,8 +34,8 @@ from collections import defaultdict -from whoosh_reloaded.compat import iterkeys, text_type -from whoosh_reloaded.fields import Schema, ID, STORED +from whoosh.compat import iterkeys, text_type +from whoosh.fields import Schema, ID, STORED def parse_file(f): @@ -113,7 +113,7 @@ class Thesaurus(object): To save the in-memory Thesaurus to a Whoosh index... - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t.to_storage(fs) @@ -192,13 +192,13 @@ def from_storage(cls, storage, indexname="THES"): """Creates a Thesaurus object from the given storage object, which should contain an index created by Thesaurus.to_storage(). - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t = Thesaurus.from_storage(fs) >>> t.synonyms("hail") ['acclaim', 'come', 'herald'] - :param storage: A :class:`whoosh_reloaded.store.Storage` object from + :param storage: A :class:`whoosh.store.Storage` object from which to load the index. 
:param indexname: A name for the index. This allows you to store multiple indexes in the same storage object. @@ -213,12 +213,12 @@ def to_storage(self, storage, indexname="THES"): """Creates an index in the given storage object from the synonyms loaded from a WordNet file. - >>> from whoosh_reloaded.filedb.filestore import FileStorage + >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t = Thesaurus.from_filename("wn_s.pl") >>> t.to_storage(fs) - :param storage: A :class:`whoosh_reloaded.store.Storage` object in + :param storage: A :class:`whoosh.store.Storage` object in which to save the index. :param indexname: A name for the index. This allows you to store multiple indexes in the same storage object. diff --git a/src/whoosh_reloaded/legacy.py b/src/whoosh/legacy.py similarity index 93% rename from src/whoosh_reloaded/legacy.py rename to src/whoosh/legacy.py index eeeb9618..13b21e79 100644 --- a/src/whoosh_reloaded/legacy.py +++ b/src/whoosh/legacy.py @@ -30,7 +30,7 @@ index formats. """ -from whoosh_reloaded.util.loading import RenamingUnpickler +from whoosh.util.loading import RenamingUnpickler def load_110_toc(stream, gen, schema, version): @@ -47,9 +47,9 @@ def load_110_toc(stream, gen, schema, version): # Remap the old classes and functions to their moved versions as we # unpickle the schema scuts = { - "wf": "whoosh_reloaded.fields", - "wsn": "whoosh_reloaded.support.numeric", - "wcw2": "whoosh_reloaded.codec.whoosh_reloaded2", + "wf": "whoosh.fields", + "wsn": "whoosh.support.numeric", + "wcw2": "whoosh.codec.whoosh2", } objmap = { "%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", diff --git a/src/whoosh_reloaded/matching/__init__.py b/src/whoosh/matching/__init__.py similarity index 89% rename from src/whoosh_reloaded/matching/__init__.py rename to src/whoosh/matching/__init__.py index cc349030..3f826b98 100644 --- a/src/whoosh_reloaded/matching/__init__.py +++ b/src/whoosh/matching/__init__.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.matching.mcore import * -from whoosh_reloaded.matching.binary import * -from whoosh_reloaded.matching.wrappers import * -from whoosh_reloaded.matching.combo import * +from whoosh.matching.mcore import * +from whoosh.matching.binary import * +from whoosh.matching.wrappers import * +from whoosh.matching.combo import * diff --git a/src/whoosh_reloaded/matching/binary.py b/src/whoosh/matching/binary.py similarity index 99% rename from src/whoosh_reloaded/matching/binary.py rename to src/whoosh/matching/binary.py index 9b9345dc..7ff1183d 100644 --- a/src/whoosh_reloaded/matching/binary.py +++ b/src/whoosh/matching/binary.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput.
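The Thesaurus doctests in the wordnet.py hunks above already show the round trip; pulled together into one hedged sketch (the "wn_s.pl" prolog file and "index" directory are illustrative, and the directory must already exist):

```python
# Consolidated from the doctests above: parse a WordNet prolog file,
# save it as a Whoosh index, then reload it without reparsing.
from whoosh.filedb.filestore import FileStorage
from whoosh.lang.wordnet import Thesaurus

t = Thesaurus.from_filename("wn_s.pl")  # load synonyms into memory
fs = FileStorage("index")               # directory-backed storage
t.to_storage(fs)                        # writes an index named "THES"

t2 = Thesaurus.from_storage(fs)         # reload later, much faster
print(t2.synonyms("hail"))              # e.g. ['acclaim', 'come', 'herald']
```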
-from whoosh_reloaded.matching import mcore +from whoosh.matching import mcore class BiMatcher(mcore.Matcher): diff --git a/src/whoosh_reloaded/matching/combo.py b/src/whoosh/matching/combo.py similarity index 95% rename from src/whoosh_reloaded/matching/combo.py rename to src/whoosh/matching/combo.py index 3efea58a..e642feec 100644 --- a/src/whoosh_reloaded/matching/combo.py +++ b/src/whoosh/matching/combo.py @@ -28,8 +28,8 @@ from __future__ import division from array import array -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.matching import mcore +from whoosh.compat import range +from whoosh.matching import mcore class CombinationMatcher(mcore.Matcher): @@ -62,7 +62,7 @@ class PreloadedUnionMatcher(CombinationMatcher): for speed. This is faster than the implementation using a binary tree of - :class:`~whoosh_reloaded.matching.binary.UnionMatcher` objects (possibly just + :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just because of less overhead), but it doesn't allow getting information about the "current" document other than the score, because there isn't really a current document, just an array of scores. @@ -87,7 +87,7 @@ def __init__(self, submatchers, doccount, boost=1.0, scored=True): docnum = m.id() place = docnum - offset if len(a) <= place: - a.extend(0 for _ in xrange(place - len(a) + 1)) + a.extend(0 for _ in range(place - len(a) + 1)) a[place] += score m.next() self._a = a @@ -165,7 +165,7 @@ class ArrayUnionMatcher(CombinationMatcher): matcher, accumulating the scores in an array. This is faster than the implementation using a binary tree of - :class:`~whoosh_reloaded.matching.binary.UnionMatcher` objects (possibly just + :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just because of less overhead), but it doesn't allow getting information about the "current" document other than the score, because there isn't really a current document, just an array of scores. @@ -180,7 +180,7 @@ def __init__(self, submatchers, doccount, boost=1.0, scored=True, partsize=2048) partsize = doccount self._partsize = partsize - self._a = array("d", (0 for _ in xrange(self._partsize))) + self._a = array("d", (0 for _ in range(self._partsize))) self._docnum = self._min_id() self._read_part() @@ -208,7 +208,7 @@ def _read_part(self): a = self._a # Clear the array - for i in xrange(self._partsize): + for i in range(self._partsize): a[i] = 0 # Add the scores from the submatchers into the array diff --git a/src/whoosh_reloaded/matching/mcore.py b/src/whoosh/matching/mcore.py similarity index 94% rename from src/whoosh_reloaded/matching/mcore.py rename to src/whoosh/matching/mcore.py index 9879565d..a901cc2f 100644 --- a/src/whoosh_reloaded/matching/mcore.py +++ b/src/whoosh/matching/mcore.py @@ -29,18 +29,18 @@ This module contains "matcher" classes. Matchers deal with posting lists. The most basic matcher, which reads the list of postings for a term, will be provided by the backend implementation (for example, -:class:`whoosh_reloaded.filedb.filepostings.FilePostingReader`). The classes in this +:class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this module provide additional functionality, such as combining the results of two matchers, or modifying the results of a matcher. You do not need to deal with the classes in this module unless you need to write your own Matcher implementation to provide some new functionality. These classes are not instantiated by the user. 
They are usually created by a -:class:`~whoosh_reloaded.query.Query` object's :meth:`~whoosh_reloaded.query.Query.matcher()` +:class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()` method, which returns the appropriate matcher to implement the query (for -example, the :class:`~whoosh_reloaded.query.Or` query's -:meth:`~whoosh_reloaded.query.Or.matcher()` method returns a -:py:class:`~whoosh_reloaded.matching.UnionMatcher` object). +example, the :class:`~whoosh.query.Or` query's +:meth:`~whoosh.query.Or.matcher()` method returns a +:py:class:`~whoosh.matching.UnionMatcher` object). Certain backends support "quality" optimizations. These backends have the ability to skip ahead if it knows the current block of postings can't @@ -51,16 +51,16 @@ from itertools import repeat -from whoosh_reloaded.compat import izip -from whoosh_reloaded.compat import abstractmethod +from whoosh.compat import izip +from whoosh.compat import abstractmethod # Exceptions class ReadTooFar(Exception): - """Raised when :meth:`~whoosh_reloaded.matching.Matcher.next()` or - :meth:`~whoosh_reloaded.matching.Matcher.skip_to()` are called on an inactive + """Raised when :meth:`~whoosh.matching.Matcher.next()` or + :meth:`~whoosh.matching.Matcher.skip_to()` are called on an inactive matcher. """ @@ -268,12 +268,12 @@ def value_as(self, astype): raise NotImplementedError("value_as not implemented in %s" % self.__class__) def spans(self): - """Returns a list of :class:`~whoosh_reloaded.query.spans.Span` objects for the + """Returns a list of :class:`~whoosh.query.spans.Span` objects for the matches in this document. Raises an exception if the field being searched does not store positions. """ - from whoosh_reloaded.query.spans import Span + from whoosh.query.spans import Span if self.supports("characters"): return [ @@ -422,9 +422,9 @@ def __init__( If this argument is not supplied, a list of 1.0 values is used. :param values: a list of encoded values corresponding to the list of IDs. - :param format: a :class:`whoosh_reloaded.formats.Format` object representing the + :param format: a :class:`whoosh.formats.Format` object representing the format of the field. - :param scorer: a :class:`whoosh_reloaded.scoring.BaseScorer` object for scoring + :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring the postings. :param term: a ``("fieldname", "text")`` tuple, or None if this is not a term matcher. 
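The `ListMatcher` constructor documented above takes plain Python lists, which makes it a convenient way to illustrate the matcher protocol the mcore.py module docs describe: loop while `is_active()`, read `id()`, advance with `next()`. A hedged sketch with illustrative ids and weights:

```python
# Driving a matcher by hand, per the mcore.py module docs above.
from whoosh.matching import ListMatcher

m = ListMatcher([1, 5, 7], weights=[1.0, 2.0, 0.5])
while m.is_active():
    print(m.id(), m.weight())  # current posting id and its weight
    m.next()                   # ReadTooFar is raised if called when inactive
```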
@@ -604,7 +604,7 @@ def value_as(self, astype): return decoder(self.value()) def spans(self): - from whoosh_reloaded.query.spans import Span + from whoosh.query.spans import Span if self.supports("characters"): return [ diff --git a/src/whoosh_reloaded/matching/wrappers.py b/src/whoosh/matching/wrappers.py similarity index 99% rename from src/whoosh_reloaded/matching/wrappers.py rename to src/whoosh/matching/wrappers.py index 88e7fd52..0532bde4 100644 --- a/src/whoosh_reloaded/matching/wrappers.py +++ b/src/whoosh/matching/wrappers.py @@ -27,7 +27,7 @@ from __future__ import division -from whoosh_reloaded.matching import mcore +from whoosh.matching import mcore class WrappingMatcher(mcore.Matcher): @@ -435,7 +435,7 @@ class RequireMatcher(WrappingMatcher): """ def __init__(self, a, b): - from whoosh_reloaded.matching.binary import IntersectionMatcher + from whoosh.matching.binary import IntersectionMatcher self.a = a self.b = b diff --git a/src/whoosh_reloaded/multiproc.py b/src/whoosh/multiproc.py similarity index 98% rename from src/whoosh_reloaded/multiproc.py rename to src/whoosh/multiproc.py index 4ba1d634..bf792680 100644 --- a/src/whoosh_reloaded/multiproc.py +++ b/src/whoosh/multiproc.py @@ -28,11 +28,11 @@ from __future__ import with_statement from multiprocessing import Process, Queue, cpu_count -from whoosh_reloaded.compat import queue, xrange, pickle -from whoosh_reloaded.codec import base -from whoosh_reloaded.writing import SegmentWriter -from whoosh_reloaded.externalsort import imerge -from whoosh_reloaded.util import random_name +from whoosh.compat import queue, range, pickle +from whoosh.codec import base +from whoosh.writing import SegmentWriter +from whoosh.externalsort import imerge +from whoosh.util import random_name def finish_subsegment(writer, k=64): @@ -141,7 +141,7 @@ def _process_file(self, filename, doc_count): load = pickle.load with tempstorage.open_file(filename).raw_file() as f: - for _ in xrange(doc_count): + for _ in range(doc_count): # Load the next pickled tuple from the file code, args = load(f) assert code == 0 @@ -358,7 +358,7 @@ def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs): self.batchsize = batchsize self.subargs = subargs if subargs else kwargs self.tasks = [ - SegmentWriter(ix, _lk=False, **self.subargs) for _ in xrange(self.procs) + SegmentWriter(ix, _lk=False, **self.subargs) for _ in range(self.procs) ] self.pointer = 0 self._added_sub = False diff --git a/src/whoosh_reloaded/qparser/__init__.py b/src/whoosh/qparser/__init__.py similarity index 91% rename from src/whoosh_reloaded/qparser/__init__.py rename to src/whoosh/qparser/__init__.py index e9eddb8b..a61f9052 100644 --- a/src/whoosh_reloaded/qparser/__init__.py +++ b/src/whoosh/qparser/__init__.py @@ -25,6 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
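The multiproc.py hunks above only touch imports and `range`, but they sit behind the `writer(procs=...)` dispatch shown earlier in index.py, where `procs > 1` selects `whoosh.multiproc.MpWriter`. A hedged sketch of that path (schema, field names, and the "ixdir" directory are illustrative):

```python
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

schema = Schema(path=ID(stored=True), body=TEXT)
os.makedirs("ixdir", exist_ok=True)
ix = index.create_in("ixdir", schema)

# procs > 1 makes FileIndex.writer() return an MpWriter (see the
# writer() method earlier in this diff); batchsize is how many documents
# are handed to a subwriter at a time (default 100 above).
w = ix.writer(procs=4, batchsize=100)
w.add_document(path="/a", body="alfa bravo")
w.commit()
```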
-from whoosh_reloaded.qparser.default import * -from whoosh_reloaded.qparser.plugins import * -from whoosh_reloaded.qparser.syntax import * +from whoosh.qparser.default import * +from whoosh.qparser.plugins import * +from whoosh.qparser.syntax import * diff --git a/src/whoosh_reloaded/qparser/common.py b/src/whoosh/qparser/common.py similarity index 100% rename from src/whoosh_reloaded/qparser/common.py rename to src/whoosh/qparser/common.py diff --git a/src/whoosh_reloaded/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py similarity index 97% rename from src/whoosh_reloaded/qparser/dateparse.py rename to src/whoosh/qparser/dateparse.py index 654ffe91..b1ff47cf 100644 --- a/src/whoosh_reloaded/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -29,14 +29,14 @@ import sys from datetime import datetime, timedelta -from whoosh_reloaded.compat import string_type, iteritems -from whoosh_reloaded.qparser import plugins, syntax -from whoosh_reloaded.qparser.taggers import Tagger -from whoosh_reloaded.support.relativedelta import relativedelta -from whoosh_reloaded.util.text import rcompile -from whoosh_reloaded.util.times import adatetime, timespan -from whoosh_reloaded.util.times import fill_in, is_void, relative_days -from whoosh_reloaded.util.times import TimeError +from whoosh.compat import string_type, iteritems +from whoosh.qparser import plugins, syntax +from whoosh.qparser.taggers import Tagger +from whoosh.support.relativedelta import relativedelta +from whoosh.util.text import rcompile +from whoosh.util.times import adatetime, timespan +from whoosh.util.times import fill_in, is_void, relative_days +from whoosh.util.times import TimeError class DateParseError(Exception): @@ -803,9 +803,9 @@ def __init__( against which to measure relative dates. If you do not supply this argument, the plugin uses ``datetime.utcnow()``. :param dateparser: an instance of - :class:`whoosh_reloaded.qparser.dateparse.DateParser`. If you do not supply + :class:`whoosh.qparser.dateparse.DateParser`. If you do not supply this argument, the plugin automatically uses - :class:`whoosh_reloaded.qparser.dateparse.English`. + :class:`whoosh.qparser.dateparse.English`. :param callback: a callback function for parsing errors. This allows you to provide feedback to the user about problems parsing dates. 
:param remove: if True, unparseable dates are removed from the token @@ -889,7 +889,7 @@ def do_dates(self, parser, group): if not schema: return group - from whoosh_reloaded.fields import DATETIME + from whoosh.fields import DATETIME datefields = frozenset( fieldname @@ -926,7 +926,7 @@ def r(self): return repr(self.dt) def query(self, parser): - from whoosh_reloaded import query + from whoosh import query fieldname = self.fieldname or parser.fieldname field = parser.schema[fieldname] @@ -954,7 +954,7 @@ def r(self): return "%r-%r" % (self.start, self.end) def query(self, parser): - from whoosh_reloaded import query + from whoosh import query fieldname = self.fieldname or parser.fieldname return query.DateRange(fieldname, self.start, self.end, boost=self.boost) @@ -966,7 +966,7 @@ def __init__(self, plugin, expr): self.expr = rcompile(expr, re.IGNORECASE) def match(self, parser, text, pos): - from whoosh_reloaded.fields import DATETIME + from whoosh.fields import DATETIME match = self.expr.match(text, pos) if match: diff --git a/src/whoosh_reloaded/qparser/default.py b/src/whoosh/qparser/default.py similarity index 94% rename from src/whoosh_reloaded/qparser/default.py rename to src/whoosh/qparser/default.py index 56cda9b9..0369c319 100644 --- a/src/whoosh_reloaded/qparser/default.py +++ b/src/whoosh/qparser/default.py @@ -27,10 +27,10 @@ import sys -from whoosh_reloaded import query -from whoosh_reloaded.compat import text_type -from whoosh_reloaded.qparser import syntax -from whoosh_reloaded.qparser.common import print_debug, QueryParserError +from whoosh import query +from whoosh.compat import text_type +from whoosh.qparser import syntax +from whoosh.qparser.common import print_debug, QueryParserError # Query parser object @@ -45,7 +45,7 @@ class QueryParser(object): the default list of plug-ins, and/or use ``add_plugin()`` and/or ``remove_plugin_class()`` to change the plug-ins included in the parser. - >>> from whoosh_reloaded import qparser + >>> from whoosh import qparser >>> parser = qparser.QueryParser("content", schema) >>> parser.remove_plugin_class(qparser.WildcardPlugin) >>> parser.add_plugin(qparser.PrefixPlugin()) @@ -65,7 +65,7 @@ def __init__( """ :param fieldname: the default field -- the parser uses this as the field for any terms without an explicit field. - :param schema: a :class:`whoosh_reloaded.fields.Schema` object to use when + :param schema: a :class:`whoosh.fields.Schema` object to use when parsing. The appropriate fields in the schema will be used to tokenize terms/phrases before they are turned into query objects. You can specify None for the schema to create a parser that does @@ -75,9 +75,9 @@ def __init__( the default list of plugins. Classes in the list will be automatically instantiated. :param termclass: the query class to use for individual search terms. - The default is :class:`whoosh_reloaded.query.Term`. + The default is :class:`whoosh.query.Term`. :param phraseclass: the query class to use for phrases. The default - is :class:`whoosh_reloaded.query.Phrase`. + is :class:`whoosh.query.Phrase`. :param group: the default grouping. ``AndGroup`` makes terms required by default. ``OrGroup`` makes terms optional by default. 
""" @@ -97,7 +97,7 @@ def __init__( def default_set(self): """Returns the default list of plugins to use.""" - from whoosh_reloaded.qparser import plugins + from whoosh.qparser import plugins return [ # plugins.WhitespacePlugin(), @@ -128,7 +128,7 @@ def add_plugin(self, pin): self.plugins.append(pin) def _add_ws_plugin(self): - from whoosh_reloaded.qparser.plugins import WhitespacePlugin + from whoosh.qparser.plugins import WhitespacePlugin self.add_plugin(WhitespacePlugin()) @@ -356,14 +356,14 @@ def process(self, text, pos=0, debug=False): return nodes def parse(self, text, normalize=True, debug=False): - """Parses the input string and returns a :class:`whoosh_reloaded.query.Query` + """Parses the input string and returns a :class:`whoosh.query.Query` object/tree. :param text: the unicode string to parse. :param normalize: whether to call normalize() on the query object/tree before returning it. This should be left on unless you're trying to debug the parser output. - :rtype: :class:`whoosh_reloaded.query.Query` + :rtype: :class:`whoosh.query.Query` """ if not isinstance(text, text_type): @@ -406,7 +406,7 @@ def MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs): :param fieldboosts: an optional dictionary mapping field names to boosts. """ - from whoosh_reloaded.qparser.plugins import MultifieldPlugin + from whoosh.qparser.plugins import MultifieldPlugin p = QueryParser(None, schema, **kwargs) mfp = MultifieldPlugin(fieldnames, fieldboosts=fieldboosts) @@ -419,7 +419,7 @@ def SimpleParser(fieldname, schema, **kwargs): syntax. """ - from whoosh_reloaded.qparser import plugins, syntax + from whoosh.qparser import plugins, syntax pins = [plugins.WhitespacePlugin, plugins.PlusMinusPlugin, plugins.PhrasePlugin] orgroup = syntax.OrGroup @@ -434,7 +434,7 @@ def DisMaxParser(fieldboosts, schema, tiebreak=0.0, **kwargs): :param fieldboosts: a dictionary mapping field names to boosts. 
""" - from whoosh_reloaded.qparser import plugins, syntax + from whoosh.qparser import plugins, syntax mfp = plugins.MultifieldPlugin( list(fieldboosts.keys()), fieldboosts=fieldboosts, group=syntax.DisMaxGroup diff --git a/src/whoosh_reloaded/qparser/plugins.py b/src/whoosh/qparser/plugins.py similarity index 98% rename from src/whoosh_reloaded/qparser/plugins.py rename to src/whoosh/qparser/plugins.py index 926b61b4..a2d2f6bb 100644 --- a/src/whoosh_reloaded/qparser/plugins.py +++ b/src/whoosh/qparser/plugins.py @@ -27,13 +27,13 @@ import copy -from whoosh_reloaded import query -from whoosh_reloaded.compat import u -from whoosh_reloaded.compat import iteritems, xrange -from whoosh_reloaded.qparser import syntax -from whoosh_reloaded.qparser.common import attach -from whoosh_reloaded.qparser.taggers import RegexTagger, FnTagger -from whoosh_reloaded.util.text import rcompile +from whoosh import query +from whoosh.compat import u +from whoosh.compat import iteritems, range +from whoosh.qparser import syntax +from whoosh.qparser.common import attach +from whoosh.qparser.taggers import RegexTagger, FnTagger +from whoosh.util.text import rcompile class Plugin(object): @@ -173,7 +173,7 @@ def do_wildcards(self, parser, group): self.do_wildcards(parser, node) i += 1 - for i in xrange(len(group)): + for i in range(len(group)): node = group[i] if isinstance(node, self.WildcardNode): text = node.text @@ -455,7 +455,7 @@ class FuzzyTermPlugin(TaggingPlugin): >>> qp.add_plugin(qparser.FuzzyTermPlugin()) >>> q = qp.parse("Stephen~2 Colbert") - For example, the following query creates a :class:`whoosh_reloaded.query.FuzzyTerm` + For example, the following query creates a :class:`whoosh.query.FuzzyTerm` query with a maximum edit distance of 1:: bob~ @@ -1322,13 +1322,13 @@ class PseudoFieldPlugin(Plugin): Unfortunately writing the transform function(s) requires knowledge of the parser's abstract syntax tree classes. A transform function takes a - :class:`whoosh_reloaded.qparser.SyntaxNode` and returns a - :class:`~whoosh_reloaded.qparser.SyntaxNode` (or None if the node should be removed + :class:`whoosh.qparser.SyntaxNode` and returns a + :class:`~whoosh.qparser.SyntaxNode` (or None if the node should be removed instead of transformed). Some things you can do in the transform function:: - from whoosh_reloaded import qparser + from whoosh import qparser def my_xform_fn(node): # Is this a text node? @@ -1354,7 +1354,7 @@ def my_xform_fn(node): transforms the text in the pseudo-field "regex" into a regular expression query in the "content" field:: - from whoosh_reloaded import qparser + from whoosh import qparser def regex_maker(node): if node.has_text: @@ -1402,8 +1402,8 @@ def __init__(self, xform_map): """ :param xform_map: a dictionary mapping psuedo-field names to transform functions. The function should take a - :class:`whoosh_reloaded.qparser.SyntaxNode` as an argument, and return a - :class:`~whoosh_reloaded.qparser.SyntaxNode`. If the function returns None, + :class:`whoosh.qparser.SyntaxNode` as an argument, and return a + :class:`~whoosh.qparser.SyntaxNode`. If the function returns None, the node will be removed from the query. 
""" diff --git a/src/whoosh_reloaded/qparser/syntax.py b/src/whoosh/qparser/syntax.py similarity index 98% rename from src/whoosh_reloaded/qparser/syntax.py rename to src/whoosh/qparser/syntax.py index 6ae0fdaa..a9297350 100644 --- a/src/whoosh_reloaded/qparser/syntax.py +++ b/src/whoosh/qparser/syntax.py @@ -27,14 +27,14 @@ import sys, weakref -from whoosh_reloaded import query -from whoosh_reloaded.qparser.common import get_single_text, QueryParserError, attach +from whoosh import query +from whoosh.qparser.common import get_single_text, QueryParserError, attach class SyntaxNode(object): """Base class for nodes that make up the abstract syntax tree (AST) of a parsed user query string. The AST is an intermediate step, generated - from the query string, then converted into a :class:`whoosh_reloaded.query.Query` + from the query string, then converted into a :class:`whoosh.query.Query` tree by calling the ``query()`` method on the nodes. Instances have the following required attributes: @@ -84,7 +84,7 @@ def fn_wrapper(n): return fn_wrapper(self) def query(self, parser): - """Returns a :class:`whoosh_reloaded.query.Query` instance corresponding to this + """Returns a :class:`whoosh.query.Query` instance corresponding to this syntax tree node. """ diff --git a/src/whoosh_reloaded/qparser/taggers.py b/src/whoosh/qparser/taggers.py similarity index 91% rename from src/whoosh_reloaded/qparser/taggers.py rename to src/whoosh/qparser/taggers.py index b820334c..6c492d3d 100644 --- a/src/whoosh_reloaded/qparser/taggers.py +++ b/src/whoosh/qparser/taggers.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.util.text import rcompile +from whoosh.util.text import rcompile # Tagger objects @@ -33,14 +33,14 @@ class Tagger(object): """Base class for taggers, objects which match syntax in the query string - and translate it into a :class:`whoosh_reloaded.qparser.syntax.SyntaxNode` object. + and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object. """ def match(self, parser, text, pos): """This method should see if this tagger matches the query string at the given position. If it matches, it should return - :param parser: the :class:`whoosh_reloaded.qparser.default.QueryParser` object. + :param parser: the :class:`whoosh.qparser.default.QueryParser` object. :param text: the text being parsed. :param pos: the position in the text at which the tagger should try to match. @@ -69,7 +69,7 @@ def create(self, parser, match): """When the regular expression matches, this method is called to translate the regex match object into a syntax node. - :param parser: the :class:`whoosh_reloaded.qparser.default.QueryParser` object. + :param parser: the :class:`whoosh.qparser.default.QueryParser` object. :param match: the regex match object. """ diff --git a/src/whoosh_reloaded/query/__init__.py b/src/whoosh/query/__init__.py similarity index 79% rename from src/whoosh_reloaded/query/__init__.py rename to src/whoosh/query/__init__.py index c463c412..97e34a40 100644 --- a/src/whoosh_reloaded/query/__init__.py +++ b/src/whoosh/query/__init__.py @@ -25,12 +25,12 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from whoosh_reloaded.query.qcore import * -from whoosh_reloaded.query.terms import * -from whoosh_reloaded.query.compound import * -from whoosh_reloaded.query.positional import * -from whoosh_reloaded.query.ranges import * -from whoosh_reloaded.query.wrappers import * -from whoosh_reloaded.query.nested import * -from whoosh_reloaded.query.qcolumns import * -from whoosh_reloaded.query.spans import * +from whoosh.query.qcore import * +from whoosh.query.terms import * +from whoosh.query.compound import * +from whoosh.query.positional import * +from whoosh.query.ranges import * +from whoosh.query.wrappers import * +from whoosh.query.nested import * +from whoosh.query.qcolumns import * +from whoosh.query.spans import * diff --git a/src/whoosh_reloaded/query/compound.py b/src/whoosh/query/compound.py similarity index 98% rename from src/whoosh_reloaded/query/compound.py rename to src/whoosh/query/compound.py index 6e5d79ea..a7c02798 100644 --- a/src/whoosh_reloaded/query/compound.py +++ b/src/whoosh/query/compound.py @@ -27,10 +27,10 @@ from __future__ import division -from whoosh_reloaded import matching -from whoosh_reloaded.compat import text_type, u -from whoosh_reloaded.query import qcore -from whoosh_reloaded.util import make_binary_tree, make_weighted_tree +from whoosh import matching +from whoosh.compat import text_type, u +from whoosh.query import qcore +from whoosh.util import make_binary_tree, make_weighted_tree class CompoundQuery(qcore.Query): @@ -103,7 +103,7 @@ def estimate_size(self, ixreader): return min(est, ixreader.doc_count()) def estimate_min_size(self, ixreader): - from whoosh_reloaded.query import Not + from whoosh.query import Not subs = self.subqueries qs = [ @@ -118,7 +118,7 @@ def estimate_min_size(self, ixreader): return 0 def normalize(self): - from whoosh_reloaded.query import Every, TermRange, NumericRange + from whoosh.query import Every, TermRange, NumericRange # Normalize subqueries and merge nested instances of this class subqueries = [] diff --git a/src/whoosh_reloaded/query/nested.py b/src/whoosh/query/nested.py similarity index 97% rename from src/whoosh_reloaded/query/nested.py rename to src/whoosh/query/nested.py index 71ede157..c0a9bb29 100644 --- a/src/whoosh_reloaded/query/nested.py +++ b/src/whoosh/query/nested.py @@ -25,18 +25,18 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
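A quick sketch of the ``normalize()`` behavior referenced in the compound-query hunk above: nested instances of the same compound class are merged (the field and terms here are assumptions)::

    from whoosh.query import And, Term

    q = And([Term("text", "a"), And([Term("text", "b"), Term("text", "c")])])
    # normalize() flattens the nested And into
    # And([Term("text", "a"), Term("text", "b"), Term("text", "c")])
    q = q.normalize()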
-from whoosh_reloaded import matching -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.query import qcore -from whoosh_reloaded.query.wrappers import WrappingQuery +from whoosh import matching +from whoosh.compat import range +from whoosh.query import qcore +from whoosh.query.wrappers import WrappingQuery class NestedParent(WrappingQuery): """A query that allows you to search for "nested" documents, where you can index (possibly multiple levels of) "parent" and "child" documents using - the :meth:`~whoosh_reloaded.writing.IndexWriter.group` and/or - :meth:`~whoosh_reloaded.writing.IndexWriter.start_group` methods of a - :class:`whoosh_reloaded.writing.IndexWriter` to indicate that hierarchically related + the :meth:`~whoosh.writing.IndexWriter.group` and/or + :meth:`~whoosh.writing.IndexWriter.start_group` methods of a + :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related documents should be kept together:: schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True)) @@ -128,7 +128,7 @@ def deletion_docs(self, searcher): docnum = m.id() parentdoc = bits.before(docnum + 1) nextparent = bits.after(docnum) or maxdoc - for i in xrange(parentdoc, nextparent): + for i in range(parentdoc, nextparent): yield i m.skip_to(nextparent) diff --git a/src/whoosh_reloaded/query/positional.py b/src/whoosh/query/positional.py similarity index 95% rename from src/whoosh_reloaded/query/positional.py rename to src/whoosh/query/positional.py index 61cbffbf..66596fb9 100644 --- a/src/whoosh_reloaded/query/positional.py +++ b/src/whoosh/query/positional.py @@ -28,10 +28,10 @@ from __future__ import division import copy -from whoosh_reloaded import matching -from whoosh_reloaded.analysis import Token -from whoosh_reloaded.compat import u -from whoosh_reloaded.query import qcore, terms, compound +from whoosh import matching +from whoosh.analysis import Token +from whoosh.compat import u +from whoosh.query import qcore, terms, compound class Sequence(compound.CompoundQuery): @@ -47,7 +47,7 @@ class Sequence(compound.CompoundQuery): def __init__(self, subqueries, slop=1, ordered=True, boost=1.0): """ - :param subqueries: a list of :class:`whoosh_reloaded.query.Query` objects to + :param subqueries: a list of :class:`whoosh.query.Query` objects to match in sequence. :param slop: the maximum difference in position allowed between the subqueries. 
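To make the ``NestedParent`` docstring above concrete, a sketch of grouping a parent with its children at index time and then searching by child content (the directory name and field values are assumptions)::

    from whoosh import fields, index, query

    schema = fields.Schema(type=fields.ID(stored=True), text=fields.TEXT(stored=True))
    ix = index.create_in("indexdir", schema)  # "indexdir" assumed to exist
    with ix.writer() as w:
        with w.group():  # keep the parent and its children together
            w.add_document(type=u"parent", text=u"whoosh library")
            w.add_document(type=u"child", text=u"query parser")

    parents = query.Term("type", "parent")
    # Matches parent documents whose children contain "parser"
    q = query.NestedParent(parents, query.Term("text", "parser"))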
@@ -104,7 +104,7 @@ def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def _matcher(self, subs, searcher, context): - from whoosh_reloaded.query.spans import SpanNear + from whoosh.query.spans import SpanNear # Tell the sub-queries this matcher will need the current match to get # spans @@ -127,7 +127,7 @@ class Ordered(Sequence): JOINT = " BEFORE " def _matcher(self, subs, searcher, context): - from whoosh_reloaded.query.spans import SpanBefore + from whoosh.query.spans import SpanBefore return self._tree_matcher(subs, SpanBefore._Matcher, searcher, context, None) @@ -244,7 +244,7 @@ def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def matcher(self, searcher, context=None): - from whoosh_reloaded.query import Term, SpanNear2 + from whoosh.query import Term, SpanNear2 fieldname = self.fieldname if fieldname not in searcher.schema: diff --git a/src/whoosh_reloaded/query/qcolumns.py b/src/whoosh/query/qcolumns.py similarity index 97% rename from src/whoosh_reloaded/query/qcolumns.py rename to src/whoosh/query/qcolumns.py index 71f85de3..6aeab5cd 100644 --- a/src/whoosh_reloaded/query/qcolumns.py +++ b/src/whoosh/query/qcolumns.py @@ -25,8 +25,8 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh_reloaded.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar -from whoosh_reloaded.query import Query +from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar +from whoosh.query import Query class ColumnQuery(Query): diff --git a/src/whoosh_reloaded/query/qcore.py b/src/whoosh/query/qcore.py similarity index 96% rename from src/whoosh_reloaded/query/qcore.py rename to src/whoosh/query/qcore.py index 54932b42..61305036 100644 --- a/src/whoosh_reloaded/query/qcore.py +++ b/src/whoosh/query/qcore.py @@ -29,10 +29,10 @@ import copy from array import array -from whoosh_reloaded import matching -from whoosh_reloaded.compat import u -from whoosh_reloaded.reading import TermNotFound -from whoosh_reloaded.compat import methodcaller +from whoosh import matching +from whoosh.compat import u +from whoosh.reading import TermNotFound +from whoosh.compat import methodcaller # Exceptions @@ -65,7 +65,7 @@ def token_lists(q, phrases=True): """ if q.is_leaf(): - from whoosh_reloaded.query import Phrase + from whoosh.query import Phrase if phrases or not isinstance(q, Phrase): return list(q.tokens()) @@ -182,7 +182,7 @@ def __or__(self, query): query. """ - from whoosh_reloaded.query import Or + from whoosh.query import Or return Or([self, query]).normalize() @@ -191,7 +191,7 @@ def __and__(self, query): query. """ - from whoosh_reloaded.query import And + from whoosh.query import And return And([self, query]).normalize() @@ -200,7 +200,7 @@ def __sub__(self, query): query as a "NOT" query. """ - from whoosh_reloaded.query import And, Not + from whoosh.query import And, Not return And([self, Not(query)]).normalize() @@ -343,7 +343,7 @@ def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None): """Returns a set of all byteterms in this query tree that exist in the given ixreader. - :param ixreader: A :class:`whoosh_reloaded.reading.IndexReader` object. + :param ixreader: A :class:`whoosh.reading.IndexReader` object. :param phrases: Whether to add words found in Phrase queries. :param expand: If True, queries that match multiple terms will return all matching expansions. 
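The operator overloads renamed above compose queries directly; a small sketch::

    from whoosh.query import Term

    apple = Term("text", "apple")
    bear = Term("text", "bear")
    cute = Term("text", "cute")

    # __and__ and __sub__ build And/Not trees and call normalize()
    q = (apple & bear) - cute  # equivalent to And([apple, bear, Not(cute)])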
@@ -383,8 +383,8 @@ def phrases(self): Recursively get all individual terms and phrases that are part of this Query """ - from whoosh_reloaded.query.positional import Phrase - from whoosh_reloaded.query.terms import Term + from whoosh.query.positional import Phrase + from whoosh.query.terms import Term terms = [] phrases = [] @@ -532,10 +532,10 @@ def estimate_min_size(self, ixreader): return self.estimate_size(ixreader) def matcher(self, searcher, context=None): - """Returns a :class:`~whoosh_reloaded.matching.Matcher` object you can use to + """Returns a :class:`~whoosh.matching.Matcher` object you can use to retrieve documents and scores matching this query. - :rtype: :class:`whoosh_reloaded.matching.Matcher` + :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError @@ -547,7 +547,7 @@ def docs(self, searcher): ... list(my_query.docs(searcher)) [10, 34, 78, 103] - :param searcher: A :class:`whoosh_reloaded.searching.Searcher` object. + :param searcher: A :class:`whoosh.searching.Searcher` object. """ try: @@ -558,7 +558,7 @@ def docs(self, searcher): def deletion_docs(self, searcher): """Returns an iterator of docnums matching this query for the purpose - of deletion. The :meth:`~whoosh_reloaded.writing.IndexWriter.delete_by_query` + of deletion. The :meth:`~whoosh.writing.IndexWriter.delete_by_query` method will use this method when deciding what documents to delete, allowing special queries (e.g. nested queries) to override what documents are deleted. The default implementation just forwards to diff --git a/src/whoosh_reloaded/query/ranges.py b/src/whoosh/query/ranges.py similarity index 97% rename from src/whoosh_reloaded/query/ranges.py rename to src/whoosh/query/ranges.py index 2bb4e7c0..75ff8a82 100644 --- a/src/whoosh_reloaded/query/ranges.py +++ b/src/whoosh/query/ranges.py @@ -27,9 +27,9 @@ from __future__ import division -from whoosh_reloaded.compat import b, u -from whoosh_reloaded.query import qcore, terms, compound, wrappers -from whoosh_reloaded.util.times import datetime_to_long +from whoosh.compat import b, u +from whoosh.query import qcore, terms, compound, wrappers +from whoosh.util.times import datetime_to_long class RangeMixin(object): @@ -194,7 +194,7 @@ def __init__( def normalize(self): if self.start in ("", None) and self.end in (u("\uffff"), None): - from whoosh_reloaded.query import Every + from whoosh.query import Every return Every(self.fieldname, boost=self.boost) elif self.start == self.end: @@ -314,8 +314,8 @@ def docs(self, searcher): return q.docs(searcher) def _compile_query(self, ixreader): - from whoosh_reloaded.fields import NUMERIC - from whoosh_reloaded.util.numeric import tiered_ranges + from whoosh.fields import NUMERIC + from whoosh.util.numeric import tiered_ranges field = ixreader.schema[self.fieldname] if not isinstance(field, NUMERIC): diff --git a/src/whoosh_reloaded/query/spans.py b/src/whoosh/query/spans.py similarity index 95% rename from src/whoosh_reloaded/query/spans.py rename to src/whoosh/query/spans.py index ddc25c41..13a8321a 100644 --- a/src/whoosh_reloaded/query/spans.py +++ b/src/whoosh/query/spans.py @@ -29,23 +29,23 @@ This module contains Query objects that deal with "spans". Span queries allow for positional constraints on matching documents. For -example, the :class:`whoosh_reloaded.spans.SpanNear` query matches documents where one +example, the :class:`whoosh.spans.SpanNear` query matches documents where one term occurs near another. 
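For the range queries whose helpers are renamed above, a short sketch (the field names are assumptions)::

    from whoosh.query import NumericRange, TermRange

    prices = NumericRange("price", 100, 500)      # 100 <= price <= 500
    names = TermRange("name", u"apple", u"bear")  # lexicographic term range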
Because you can nest span queries, and wrap them around almost any non-span query, you can create very complex constraints. -For example, to find documents containing "whoosh_reloaded" at most 5 positions before +For example, to find documents containing "whoosh" at most 5 positions before "library" in the "text" field:: - from whoosh_reloaded import query, spans - t1 = query.Term("text", "whoosh_reloaded") + from whoosh import query, spans + t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear(t1, t2, slop=5) """ -from whoosh_reloaded.matching import mcore, wrappers, binary -from whoosh_reloaded.query import Query, And, AndMaybe, Or, Term -from whoosh_reloaded.util import make_binary_tree +from whoosh.matching import mcore, wrappers, binary +from whoosh.query import Query, And, AndMaybe, Or, Term +from whoosh.util import make_binary_tree # Span class @@ -361,20 +361,20 @@ class SpanNear(SpanQuery): queries that occur right next to each other (slop=1) and in order (ordered=True). - For example, to find documents where "whoosh_reloaded" occurs next to "library" + For example, to find documents where "whoosh" occurs next to "library" in the "text" field:: - from whoosh_reloaded import query, spans - t1 = query.Term("text", "whoosh_reloaded") + from whoosh import query, spans + t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear(t1, t2) - To find documents where "whoosh_reloaded" occurs at most 5 positions before + To find documents where "whoosh" occurs at most 5 positions before "library":: q = spans.SpanNear(t1, t2, slop=5) - To find documents where "whoosh_reloaded" occurs at most 5 positions before or after + To find documents where "whoosh" occurs at most 5 positions before or after "library":: q = spans.SpanNear(t1, t2, slop=5, ordered=False) @@ -382,7 +382,7 @@ class SpanNear(SpanQuery): You can use the ``phrase()`` class method to create a tree of SpanNear queries to match a list of terms:: - q = spans.SpanNear.phrase("text", ["whoosh_reloaded", "search", "library"], + q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"], slop=2) """ @@ -538,20 +538,20 @@ class SpanNear2(SpanQuery): requiring you to build a binary tree of query objects. This query should also be slightly faster due to less overhead.) 
- For example, to find documents where "whoosh_reloaded" occurs next to "library" + For example, to find documents where "whoosh" occurs next to "library" in the "text" field:: - from whoosh_reloaded import query, spans - t1 = query.Term("text", "whoosh_reloaded") + from whoosh import query, spans + t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear2([t1, t2]) - To find documents where "whoosh_reloaded" occurs at most 5 positions before + To find documents where "whoosh" occurs at most 5 positions before "library":: q = spans.SpanNear2([t1, t2], slop=5) - To find documents where "whoosh_reloaded" occurs at most 5 positions before or after + To find documents where "whoosh" occurs at most 5 positions before or after "library":: q = spans.SpanNear2(t1, t2, slop=5, ordered=False) @@ -775,7 +775,7 @@ class SpanNot(SpanBiQuery): For example, to match documents that contain "bear" at most 2 places after "apple" in the "text" field but don't have "cute" between them:: - from whoosh_reloaded import query, spans + from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") near = spans.SpanNear(t1, t2, slop=2) @@ -824,7 +824,7 @@ class SpanContains(SpanBiQuery): For example, to match documents where "apple" occurs at most 10 places before "bear" in the "text" field and "cute" is between them:: - from whoosh_reloaded import query, spans + from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") near = spans.SpanNear(t1, t2, slop=10) @@ -872,7 +872,7 @@ class SpanBefore(SpanBiQuery): For example, to match documents where "apple" occurs anywhere before "bear":: - from whoosh_reloaded import query, spans + from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") q = spans.SpanBefore(t1, t2) diff --git a/src/whoosh_reloaded/query/terms.py b/src/whoosh/query/terms.py similarity index 96% rename from src/whoosh_reloaded/query/terms.py rename to src/whoosh/query/terms.py index eb4d962a..6922a2b7 100644 --- a/src/whoosh_reloaded/query/terms.py +++ b/src/whoosh/query/terms.py @@ -30,11 +30,11 @@ import fnmatch import re -from whoosh_reloaded import matching -from whoosh_reloaded.analysis import Token -from whoosh_reloaded.compat import bytes_type, text_type, u -from whoosh_reloaded.lang.morph_en import variations -from whoosh_reloaded.query import qcore +from whoosh import matching +from whoosh.analysis import Token +from whoosh.compat import bytes_type, text_type, u +from whoosh.lang.morph_en import variations +from whoosh.query import qcore class Term(qcore.Query): @@ -197,7 +197,7 @@ def simplify(self, ixreader): if len(existing) == 1: return existing[0] elif existing: - from whoosh_reloaded.query import Or + from whoosh.query import Or return Or(existing) else: @@ -216,7 +216,7 @@ def estimate_min_size(self, ixreader): ) def matcher(self, searcher, context=None): - from whoosh_reloaded.query import Or + from whoosh.query import Or fieldname = self.field() constantscore = self.constantscore @@ -236,7 +236,7 @@ def matcher(self, searcher, context=None): if context: context = context.set(weighting=None) else: - from whoosh_reloaded.searching import SearchContext + from whoosh.searching import SearchContext context = SearchContext(weighting=None) # Or the terms together @@ -327,7 +327,7 @@ def _btexts(self, ixreader): def matcher(self, searcher, context=None): if self.text == "": - from whoosh_reloaded.query import Every + from whoosh.query import Every eq 
= Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) @@ -357,7 +357,7 @@ def normalize(self): # a simple Term text = self.text if text == "*": - from whoosh_reloaded.query import Every + from whoosh.query import Every return Every(self.fieldname, boost=self.boost) if "*" not in text and "?" not in text: @@ -372,7 +372,7 @@ def normalize(self): def matcher(self, searcher, context=None): if self.text == "*": - from whoosh_reloaded.query import Every + from whoosh.query import Every eq = Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) @@ -419,7 +419,7 @@ def _find_prefix(self, text): def matcher(self, searcher, context=None): if self.text == ".*": - from whoosh_reloaded.query import Every + from whoosh.query import Every eq = Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) diff --git a/src/whoosh_reloaded/query/wrappers.py b/src/whoosh/query/wrappers.py similarity index 95% rename from src/whoosh_reloaded/query/wrappers.py rename to src/whoosh/query/wrappers.py index 8f4ebb61..5f07a80d 100644 --- a/src/whoosh_reloaded/query/wrappers.py +++ b/src/whoosh/query/wrappers.py @@ -28,9 +28,9 @@ from __future__ import division from array import array -from whoosh_reloaded import matching -from whoosh_reloaded.compat import text_type, u -from whoosh_reloaded.query import qcore +from whoosh import matching +from whoosh.compat import text_type, u +from whoosh.query import qcore class WrappingQuery(qcore.Query): @@ -173,7 +173,7 @@ def _rewrap(self, child): return self.__class__(child, self.score) def matcher(self, searcher, context=None): - from whoosh_reloaded.searching import SearchContext + from whoosh.searching import SearchContext context = context or SearchContext() m = self.child.matcher(searcher, context) @@ -185,7 +185,7 @@ def matcher(self, searcher, context=None): class WeightingQuery(WrappingQuery): - """Wraps a query and uses a specific :class:`whoosh_reloaded.sorting.WeightingModel` + """Wraps a query and uses a specific :class:`whoosh.scoring.WeightingModel` to score documents that match the wrapped query. """ diff --git a/src/whoosh_reloaded/reading.py b/src/whoosh/reading.py similarity index 96% rename from src/whoosh_reloaded/reading.py rename to src/whoosh/reading.py index aeda01af..f9f0a13e 100644 --- a/src/whoosh_reloaded/reading.py +++ b/src/whoosh/reading.py @@ -34,13 +34,13 @@ from cached_property import cached_property -from whoosh_reloaded import columns -from whoosh_reloaded.compat import abstractmethod -from whoosh_reloaded.compat import xrange, zip_, next, iteritems -from whoosh_reloaded.filedb.filestore import OverlayStorage -from whoosh_reloaded.matching import MultiMatcher -from whoosh_reloaded.support.levenshtein import distance -from whoosh_reloaded.system import emptybytes +from whoosh import columns +from whoosh.compat import abstractmethod +from whoosh.compat import zip_, next, iteritems +from whoosh.filedb.filestore import OverlayStorage +from whoosh.matching import MultiMatcher +from whoosh.support.levenshtein import distance +from whoosh.system import emptybytes # Exceptions @@ -162,7 +162,7 @@ def __contains__(self, term): raise NotImplementedError def codec(self): - """Returns the :class:`whoosh_reloaded.codec.base.Codec` object used to read + """Returns the :class:`whoosh.codec.base.Codec` object used to read this reader's segment. If this reader is not atomic (``reader.is_atomic() == False``), returns None.
""" @@ -170,7 +170,7 @@ def codec(self): return None def segment(self): - """Returns the :class:`whoosh_reloaded.index.Segment` object used by this reader. + """Returns the :class:`whoosh.index.Segment` object used by this reader. If this reader is not atomic (``reader.is_atomic() == True``), returns None. """ @@ -178,12 +178,12 @@ def segment(self): return None def segments(self): - """Returns a list of :class:`whoosh_reloaded.index.Segment` objects used by this reader.""" + """Returns a list of :class:`whoosh.index.Segment` objects used by this reader.""" return None def storage(self): - """Returns the :class:`whoosh_reloaded.filedb.filestore.Storage` object used by + """Returns the :class:`whoosh.filedb.filestore.Storage` object used by this reader to read its files. If the reader is not atomic, (``reader.is_atomic() == True``), returns None. """ @@ -322,7 +322,7 @@ def all_doc_ids(self): is_deleted = self.is_deleted return ( - docnum for docnum in xrange(self.doc_count_all()) if not is_deleted(docnum) + docnum for docnum in range(self.doc_count_all()) if not is_deleted(docnum) ) def iter_docs(self): @@ -353,7 +353,7 @@ def all_stored_fields(self): """Yields the stored fields for all non-deleted documents.""" is_deleted = self.is_deleted - for docnum in xrange(self.doc_count_all()): + for docnum in range(self.doc_count_all()): if not is_deleted(docnum): yield self.stored_fields(docnum) @@ -435,7 +435,7 @@ def iter_postings(self): @abstractmethod def postings(self, fieldname, text): - """Returns a :class:`~whoosh_reloaded.matching.Matcher` for the postings of the + """Returns a :class:`~whoosh.matching.Matcher` for the postings of the given term. >>> pr = reader.postings("content", "render") @@ -445,7 +445,7 @@ def postings(self, fieldname, text): :param fieldname: the field name or field number of the term. :param text: the text of the term. - :rtype: :class:`whoosh_reloaded.matching.Matcher` + :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError @@ -459,7 +459,7 @@ def has_vector(self, docnum, fieldname): @abstractmethod def vector(self, docnum, fieldname, format_=None): - """Returns a :class:`~whoosh_reloaded.matching.Matcher` object for the + """Returns a :class:`~whoosh.matching.Matcher` object for the given term vector. >>> docnum = searcher.document_number(path=u'/a/b/c') @@ -471,7 +471,7 @@ def vector(self, docnum, fieldname, format_=None): the term vector. :param fieldname: the field name or field number of the field for which you want the term vector. - :rtype: :class:`whoosh_reloaded.matching.Matcher` + :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError @@ -506,11 +506,11 @@ def vector_as(self, astype, docnum, fieldname): vec.next() def corrector(self, fieldname): - """Returns a :class:`whoosh_reloaded.spelling.Corrector` object that suggests + """Returns a :class:`whoosh.spelling.Corrector` object that suggests corrections based on the terms in the given field. """ - from whoosh_reloaded.spelling import ReaderCorrector + from whoosh.spelling import ReaderCorrector fieldobj = self.schema[fieldname] return ReaderCorrector(self, fieldname, fieldobj) @@ -592,7 +592,7 @@ def column_reader(self, fieldname, column=None, reverse=False, translate=False): reversible, this will raise a ``NotImplementedError``. :param translate: if True, wrap the reader to call the field's ``from_bytes()`` method on the returned values. - :return: a :class:`whoosh_reloaded.columns.ColumnReader` object. + :return: a :class:`whoosh.columns.ColumnReader` object. 
""" raise NotImplementedError @@ -829,7 +829,7 @@ def deleted_docs_set(self): return frozenset(self._perdoc.deleted_docs()) def postings(self, fieldname, text, scorer=None): - from whoosh_reloaded.matching.wrappers import FilterMatcher + from whoosh.matching.wrappers import FilterMatcher if self.is_closed: raise ReaderClosed @@ -922,7 +922,7 @@ def segments(self): return None def cursor(self, fieldname): - from whoosh_reloaded.codec.base import EmptyCursor + from whoosh.codec.base import EmptyCursor return EmptyCursor() diff --git a/src/whoosh_reloaded/scoring.py b/src/whoosh/scoring.py similarity index 98% rename from src/whoosh_reloaded/scoring.py rename to src/whoosh/scoring.py index 8729de3e..40de76e5 100644 --- a/src/whoosh_reloaded/scoring.py +++ b/src/whoosh/scoring.py @@ -32,7 +32,7 @@ from __future__ import division from math import log, pi -from whoosh_reloaded.compat import iteritems +from whoosh.compat import iteritems # Base classes @@ -41,7 +41,7 @@ class WeightingModel(object): """Abstract base class for scoring models. A WeightingModel object provides a method, ``scorer``, which returns an instance of - :class:`whoosh_reloaded.scoring.Scorer`. + :class:`whoosh.scoring.Scorer`. Basically, WeightingModel objects store the configuration information for the model (for example, the values of B and K1 in the BM25F model), and @@ -60,7 +60,7 @@ def idf(self, searcher, fieldname, text): return log(dc / (n + 1)) + 1 def scorer(self, searcher, fieldname, text, qf=1): - """Returns an instance of :class:`whoosh_reloaded.scoring.Scorer` configured + """Returns an instance of :class:`whoosh.scoring.Scorer` configured for the given searcher, fieldname, and term text. """ @@ -75,7 +75,7 @@ def final(self, searcher, docnum, score): WeightingModel sub-classes that use ``final()`` should have the attribute ``use_final`` set to ``True``. - :param searcher: :class:`whoosh_reloaded.searching.Searcher` for the index. + :param searcher: :class:`whoosh.searching.Searcher` for the index. :param docnum: the doc number of the document being scored. :param score: the document's accumulated term score. @@ -280,7 +280,7 @@ class BM25F(WeightingModel): def __init__(self, B=0.75, K1=1.2, **kwargs): """ - >>> from whoosh_reloaded import scoring + >>> from whoosh import scoring >>> # Set a custom B value for the "content" field >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5) diff --git a/src/whoosh_reloaded/searching.py b/src/whoosh/searching.py similarity index 94% rename from src/whoosh_reloaded/searching.py rename to src/whoosh/searching.py index 1197aaad..af5e3b8e 100644 --- a/src/whoosh_reloaded/searching.py +++ b/src/whoosh/searching.py @@ -34,10 +34,10 @@ import weakref from math import ceil -from whoosh_reloaded import classify, highlight, query, scoring -from whoosh_reloaded.compat import iteritems, itervalues, iterkeys, xrange -from whoosh_reloaded.idsets import DocIdSet, BitSet -from whoosh_reloaded.reading import TermNotFound +from whoosh import classify, highlight, query, scoring +from whoosh.compat import iteritems, itervalues, iterkeys, range +from whoosh.idsets import DocIdSet, BitSet +from whoosh.reading import TermNotFound class NoTermsException(Exception): @@ -99,7 +99,7 @@ def set(self, **kwargs): class Searcher(object): - """Wraps an :class:`~whoosh_reloaded.reading.IndexReader` object and provides + """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods for searching the index. 
""" @@ -112,9 +112,9 @@ def __init__( parent=None, ): """ - :param reader: An :class:`~whoosh_reloaded.reading.IndexReader` object for + :param reader: An :class:`~whoosh.reading.IndexReader` object for the index to search. - :param weighting: A :class:`whoosh_reloaded.scoring.Weighting` object to use to + :param weighting: A :class:`whoosh.scoring.Weighting` object to use to score found documents. :param closereader: Whether the underlying reader will be closed when the searcher is closed. @@ -280,7 +280,7 @@ def avg_field_length(self, fieldname, default=None): return self.field_length(fieldname) / (self._doccount or 1) def reader(self): - """Returns the underlying :class:`~whoosh_reloaded.reading.IndexReader`.""" + """Returns the underlying :class:`~whoosh.reading.IndexReader`.""" return self.ixreader def context(self, **kwargs): @@ -299,8 +299,8 @@ def boolean_context(self): return self.context(needs_current=False, weighting=None) def postings(self, fieldname, text, weighting=None, qf=1): - """Returns a :class:`whoosh_reloaded.matching.Matcher` for the postings of the - given term. Unlike the :func:`whoosh_reloaded.reading.IndexReader.postings` + """Returns a :class:`whoosh.matching.Matcher` for the postings of the + given term. Unlike the :func:`whoosh.reading.IndexReader.postings` method, this method automatically sets the scoring functions on the matcher from the searcher's weighting object. """ @@ -311,7 +311,7 @@ def postings(self, fieldname, text, weighting=None, qf=1): if self.is_atomic(): return self.ixreader.postings(fieldname, text, scorer=globalscorer) else: - from whoosh_reloaded.matching import MultiMatcher + from whoosh.matching import MultiMatcher matchers = [] docoffsets = [] @@ -487,7 +487,7 @@ def suggest(self, fieldname, text, limit=5, maxdist=2, prefix=0): This is a convenience method. If you are planning to get suggestions for multiple words in the same field, it is more efficient to get a - :class:`~whoosh_reloaded.spelling.Corrector` object and use it directly:: + :class:`~whoosh.spelling.Corrector` object and use it directly:: corrector = searcher.corrector("fieldname") for word in words: @@ -592,7 +592,7 @@ def more_like( :param numterms: the number of "key terms" to extract from the hit and search for. Using more terms is slower but gives potentially more and more accurate results. - :param model: (expert) a :class:`whoosh_reloaded.classify.ExpansionModel` to use + :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use to compute "key terms". :param normalize: whether to normalize term weights. :param filter: a query, Results object, or set of docnums. The results @@ -655,7 +655,7 @@ def search_page(self, query, pagenum, pagelen=10, **kwargs): This method will raise a ``ValueError`` if you ask for a page number higher than the number of pages in the resulting query. - :param query: the :class:`whoosh_reloaded.query.Query` object to match. + :param query: the :class:`whoosh.query.Query` object to match. :param pagenum: the page number to retrieve, starting at ``1`` for the first page. :param pagelen: the number of results per page. 
@@ -669,7 +669,7 @@ def search_page(self, query, pagenum, pagelen=10, **kwargs): return ResultsPage(results, pagenum, pagelen) def find(self, defaultfield, querystring, **kwargs): - from whoosh_reloaded.qparser import QueryParser + from whoosh.qparser import QueryParser qp = QueryParser(defaultfield, schema=self.ixreader.schema) q = qp.parse(querystring) @@ -677,7 +677,7 @@ def docs_for_query(self, q, for_deletion=False): """Returns an iterator of document numbers for documents matching the - given :class:`whoosh_reloaded.query.Query` object. + given :class:`whoosh.query.Query` object. """ # If we're getting the document numbers so we can delete them, use the @@ -713,7 +713,7 @@ def collector( scored=True, ): """Low-level method: returns a configured - :class:`whoosh_reloaded.collectors.Collector` object based on the given + :class:`whoosh.collectors.Collector` object based on the given arguments. You can use this object with :meth:`Searcher.search_with_collector` to search. @@ -721,7 +721,7 @@ description of the parameters. This method may be useful to get a basic collector object and then wrap - it with another collector from ``whoosh_reloaded.collectors`` or with a custom + it with another collector from ``whoosh.collectors`` or with a custom collector of your own:: # Equivalent of @@ -733,14 +733,14 @@ def collector( # Wrap it with a TimeLimitCollector with a time limit of # 10.5 seconds - from whoosh_reloaded.collectors import TimeLimitedCollector + from whoosh.collectors import TimeLimitCollector c = TimeLimitCollector(c, 10.5) # Search using the custom collector results = mysearcher.search_with_collector(myquery, c) """ - from whoosh_reloaded import collectors + from whoosh import collectors if limit is not None and limit < 1: raise ValueError("limit must be >= 1") @@ -772,7 +772,7 @@ def collector( return c def search(self, q, **kwargs): - """Runs a :class:`whoosh_reloaded.query.Query` object on this searcher and + """Runs a :class:`whoosh.query.Query` object on this searcher and returns a :class:`Results` object. See :doc:`/searching` for more information. @@ -782,7 +782,7 @@ def search(self, q, **kwargs): ``groupedby``. See :ref:`collapsing` for more information on using ``collapse``, ``collapse_limit``, and ``collapse_order``. - :param query: a :class:`whoosh_reloaded.query.Query` object to use to match + :param query: a :class:`whoosh.query.Query` object to use to match documents. :param limit: the maximum number of documents to score. If you're only interested in the top N documents, you can set limit=N to limit the @@ -806,7 +806,7 @@ def search(self, q, **kwargs): :param maptype: by default, the results of faceting with ``groupedby`` is a dictionary mapping group names to ordered lists of document numbers in the group. You can pass a - :class:`whoosh_reloaded.sorting.FacetMap` subclass to this keyword argument + :class:`whoosh.sorting.FacetMap` subclass to this keyword argument to specify a different (usually faster) method for grouping. For example, ``maptype=sorting.Count`` would store only the count of documents in each group, instead of the full list of document IDs.
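To illustrate the ``maptype`` parameter described above, a sketch using a counting facet map (``ix``, ``myquery``, and the "tag" field are assumptions)::

    from whoosh import sorting

    facet = sorting.FieldFacet("tag", maptype=sorting.Count)
    with ix.searcher() as s:
        results = s.search(myquery, groupedby=facet)
        print(results.groups("tag"))  # e.g. {"news": 12, "blog": 3}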
@@ -830,8 +830,8 @@ def search(self, q, **kwargs): return c.results() def search_with_collector(self, q, collector, context=None): - """Low-level method: runs a :class:`whoosh_reloaded.query.Query` object on this - searcher using the given :class:`whoosh_reloaded.collectors.Collector` object + """Low-level method: runs a :class:`whoosh.query.Query` object on this + searcher using the given :class:`whoosh.collectors.Collector` object to collect the results:: myquery = query.Term("content", "cabbage") @@ -847,9 +847,9 @@ need to access the collector to get a results object or other information the collector might hold after the search. - :param q: a :class:`whoosh_reloaded.query.Query` object to use to match + :param q: a :class:`whoosh.query.Query` object to use to match documents. - :param collector: a :class:`whoosh_reloaded.collectors.Collector` object to feed + :param collector: a :class:`whoosh.collectors.Collector` object to feed the results into. """ @@ -865,7 +865,7 @@ def correct_query( ): """ Returns a corrected version of the given user query using a default - :class:`whoosh_reloaded.spelling.ReaderCorrector`. + :class:`whoosh.spelling.ReaderCorrector`. The default: @@ -873,18 +873,18 @@ * Takes suggestions from the words in the index. To make certain fields use custom correctors, use the ``correctors`` argument to pass a - dictionary mapping field names to :class:`whoosh_reloaded.spelling.Corrector` + dictionary mapping field names to :class:`whoosh.spelling.Corrector` objects. Expert users who want more sophisticated correction behavior can create - a custom :class:`whoosh_reloaded.spelling.QueryCorrector` and use that instead + a custom :class:`whoosh.spelling.QueryCorrector` and use that instead of this method. - Returns a :class:`whoosh_reloaded.spelling.Correction` object with a ``query`` - attribute containing the corrected :class:`whoosh_reloaded.query.Query` object + Returns a :class:`whoosh.spelling.Correction` object with a ``query`` + attribute containing the corrected :class:`whoosh.query.Query` object and a ``string`` attribute containing the corrected query string. - >>> from whoosh_reloaded import qparser, highlight + >>> from whoosh import qparser, highlight >>> qtext = 'mary "litle lamb"' >>> q = qparser.QueryParser("text", myindex.schema) >>> mysearcher = myindex.searcher() @@ -897,23 +897,23 @@ You can use the ``Correction`` object's ``format_string`` method to format the corrected query string using a - :class:`whoosh_reloaded.highlight.Formatter` object. For example, you can format + :class:`whoosh.highlight.Formatter` object. For example, you can format the corrected string as HTML, emphasizing the changed words. >>> hf = highlight.HtmlFormatter(classname="change") >>> correction.format_string(hf) 'mary "<strong class="change">little</strong> lamb"' - :param q: the :class:`whoosh_reloaded.query.Query` object to correct. + :param q: the :class:`whoosh.query.Query` object to correct. :param qstring: the original user query from which the query object was created. You can pass None instead of a string, in which case the second item in the returned tuple will also be None. :param correctors: an optional dictionary mapping fieldnames to - :class:`whoosh_reloaded.spelling.Corrector` objects. By default, this method + :class:`whoosh.spelling.Corrector` objects. By default, this method uses the contents of the index to spell check the terms in the query.
You can use this argument to "override" some fields with a different corrector, for example a - :class:`whoosh_reloaded.spelling.GraphCorrector`. + :class:`whoosh.spelling.GraphCorrector`. :param terms: a sequence of ``("fieldname", "text")`` tuples to correct in the query. By default, this method corrects terms that don't appear in the index. You can use this argument to override that @@ -930,7 +930,7 @@ def correct_query( :param aliases: an optional dictionary mapping field names in the query to different field names to use as the source of spelling suggestions. The mappings in ``correctors`` are applied after this. - :rtype: :class:`whoosh_reloaded.spelling.Correction` + :rtype: :class:`whoosh.spelling.Correction` """ reader = self.reader() @@ -969,7 +969,7 @@ def correct_query( terms.append((token.fieldname, token.text)) # Make q query corrector - from whoosh_reloaded import spelling + from whoosh import spelling sqc = spelling.SimpleQueryCorrector( correctors, terms, aliases, maxdist=maxdist, prefix=prefix @@ -1047,7 +1047,7 @@ def __getitem__(self, n): start, stop, step = n.indices(len(self.top_n)) return [ Hit(self, self.top_n[i][1], i, self.top_n[i][0]) - for i in xrange(start, stop, step) + for i in range(start, stop, step) ] else: if n >= len(self.top_n): @@ -1059,7 +1059,7 @@ def __getitem__(self, n): def __iter__(self): """Yields a :class:`Hit` object for each result in ranked order.""" - for i in xrange(len(self.top_n)): + for i in range(len(self.top_n)): yield Hit(self, self.top_n[i][1], i, self.top_n[i][0]) def __contains__(self, docnum): @@ -1132,7 +1132,7 @@ def groups(self, name=None): If you specified a different ``maptype`` for the facet when you searched, the values in the dictionary depend on the - :class:`whoosh_reloaded.sorting.FacetMap`. + :class:`whoosh.sorting.FacetMap`. >>> myfacet = sorting.FieldFacet("tag", maptype=sorting.Count) >>> results = mysearcher.search(myquery, groupedby=myfacet) @@ -1482,12 +1482,12 @@ def highlights(self, fieldname, text=None, top=3, minscore=1, strict_phrase=Fals To change the fragmenter, formatter, order, or scorer used in highlighting, you can set attributes on the results object:: - from whoosh_reloaded import highlight + from whoosh import highlight results = searcher.search(myquery, terms=True) results.fragmenter = highlight.SentenceFragmenter() - ...or use a custom :class:`whoosh_reloaded.highlight.Highlighter` object:: + ...or use a custom :class:`whoosh.highlight.Highlighter` object:: hl = highlight.Highlighter(fragmenter=sf) results.highlighter = hl @@ -1544,7 +1544,7 @@ def more_like_this( :param numterms: the number of "key terms" to extract from the hit and search for. Using more terms is slower but gives potentially more and more accurate results. - :param model: (expert) a :class:`whoosh_reloaded.classify.ExpansionModel` to use + :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use to compute "key terms". :param normalize: whether to normalize term weights. """ @@ -1627,8 +1627,8 @@ def update(self, dict=None, **kwargs): class ResultsPage(object): """Represents a single page out of a longer list of results, as returned - by :func:`whoosh_reloaded.searching.Searcher.search_page`.
Supports a subset of the + interface of the :class:`~whoosh.searching.Results` object, namely getting stored fields with __getitem__ (square brackets), iterating, and the ``score()`` and ``docnum()`` methods. @@ -1672,7 +1672,7 @@ class ResultsPage(object): def __init__(self, results, pagenum, pagelen=10): """ - :param results: a :class:`~whoosh_reloaded.searching.Results` object. + :param results: a :class:`~whoosh.searching.Results` object. :param pagenum: which page of the results to use, numbered from ``1``. :param pagelen: the number of hits per page. """ diff --git a/src/whoosh_reloaded/sorting.py b/src/whoosh/sorting.py similarity index 97% rename from src/whoosh_reloaded/sorting.py rename to src/whoosh/sorting.py index ef915eaa..b3cfc9a6 100644 --- a/src/whoosh_reloaded/sorting.py +++ b/src/whoosh/sorting.py @@ -28,8 +28,8 @@ from array import array from collections import defaultdict -from whoosh_reloaded.compat import string_type -from whoosh_reloaded.compat import iteritems, izip, xrange +from whoosh.compat import string_type +from whoosh.compat import iteritems, izip, range # Faceting objects @@ -71,7 +71,7 @@ class Categorizer(object): Categorizers are created by FacetType objects through the :meth:`FacetType.categorizer` method. The - :class:`whoosh_reloaded.searching.Searcher` object passed to the ``categorizer`` + :class:`whoosh.searching.Searcher` object passed to the ``categorizer`` method may be a composite searcher (that is, wrapping a multi-reader), but categorizers are always run **per-segment**, with segment-relative document numbers. @@ -110,7 +110,7 @@ def set_searcher(self, segment_searcher, docoffset): def key_for(self, matcher, segment_docnum): """Returns a key for the current match. - :param matcher: a :class:`whoosh_reloaded.matching.Matcher` object. If + :param matcher: a :class:`whoosh.matching.Matcher` object. If ``self.needs_current`` is ``False``, DO NOT use this object, since it may be inconsistent. Use the given ``segment_docnum`` instead. @@ -132,7 +132,7 @@ def keys_for(self, matcher, segment_docnum): This method will be called instead of ``key_for`` if ``self.allow_overlap`` is ``True``. - :param matcher: a :class:`whoosh_reloaded.matching.Matcher` object. If + :param matcher: a :class:`whoosh.matching.Matcher` object. If ``self.needs_current`` is ``False``, DO NOT use this object, since it may be inconsistent. Use the given ``segment_docnum`` instead. @@ -305,7 +305,7 @@ def set_searcher(self, segment_searcher, docoffset): field = segment_searcher.schema[fieldname] from_bytes = field.from_bytes - self._lists = [[] for _ in xrange(dc)] + self._lists = [[] for _ in range(dc)] for btext in field.sortable_terms(reader, fieldname): text = from_bytes(btext) postings = reader.postings(fieldname, btext) @@ -409,7 +409,7 @@ class QueryFacet(FacetType): def __init__(self, querydict, other=None, allow_overlap=False, maptype=None): """ :param querydict: a dictionary mapping keys to - :class:`whoosh_reloaded.query.Query` objects. + :class:`whoosh.query.Query` objects. :param other: the key to use for documents that don't match any of the queries. """ @@ -492,7 +492,7 @@ def default_name(self): return self.fieldname def _rangetype(self): - from whoosh_reloaded import query + from whoosh import query return query.NumericRange @@ -537,13 +537,13 @@ class DateRangeFacet(RangeFacet): """Sorts/facets based on date ranges. 
This is the same as RangeFacet except you are expected to use ``daterange`` objects as the start and end of the range, and ``timedelta`` or ``relativedelta`` objects as the gap(s), - and it generates :class:`~whoosh_reloaded.query.DateRange` queries instead of - :class:`~whoosh_reloaded.query.TermRange` queries. + and it generates :class:`~whoosh.query.DateRange` queries instead of + :class:`~whoosh.query.TermRange` queries. For example, to facet a "birthday" range into 5 year buckets:: from datetime import datetime - from whoosh_reloaded.support.relativedelta import relativedelta + from whoosh.support.relativedelta import relativedelta startdate = datetime(1920, 1, 1) enddate = datetime.now() @@ -556,7 +556,7 @@ class DateRangeFacet(RangeFacet): """ def _rangetype(self): - from whoosh_reloaded import query + from whoosh import query return query.DateRange @@ -691,7 +691,7 @@ def key_for(self, matcher, segment_docnum): class StoredFieldFacet(FacetType): """Lets you sort/group using the value in an unindexed, stored field (e.g. - :class:`whoosh_reloaded.fields.STORED`). This is usually slower than using an indexed + :class:`whoosh.fields.STORED`). This is usually slower than using an indexed field. For fields where the stored value is a space-separated list of keywords, @@ -921,7 +921,7 @@ def add_query(self, name, querydict, **kwargs): :param name: a name for the facet. :param querydict: a dictionary mapping keys to - :class:`whoosh_reloaded.query.Query` objects. + :class:`whoosh.query.Query` objects. """ self.facets[name] = QueryFacet(querydict, **kwargs) @@ -1085,17 +1085,17 @@ def add_sortable(writer, fieldname, facet, column=None): """Adds a per-document value column to an existing field which was created without the ``sortable`` keyword argument. - >>> from whoosh_reloaded import index, sorting + >>> from whoosh import index, sorting >>> ix = index.open_dir("indexdir") >>> with ix.writer() as w: ... facet = sorting.FieldFacet("price") ... sorting.add_sortable(w, "price", facet) ... - :param writer: a :class:`whoosh_reloaded.writing.IndexWriter` object. + :param writer: a :class:`whoosh.writing.IndexWriter` object. :param fieldname: the name of the field to add the per-document sortable values to. If this field doesn't exist in the writer's schema, the - function will add a :class:`whoosh_reloaded.fields.COLUMN` field to the schema, + function will add a :class:`whoosh.fields.COLUMN` field to the schema, and you must specify the column object using the ``column`` keyword argument.
:param facet: a :class:`FacetType` object to use to generate the @@ -1116,7 +1116,7 @@ def add_sortable(writer, fieldname, facet, column=None): if column: if fieldname not in schema: - from whoosh_reloaded.fields import COLUMN + from whoosh.fields import COLUMN field = COLUMN(column) schema.add(fieldname, field) diff --git a/src/whoosh_reloaded/spelling.py b/src/whoosh/spelling.py similarity index 93% rename from src/whoosh_reloaded/spelling.py rename to src/whoosh/spelling.py index a6fd0928..cbc1eca2 100644 --- a/src/whoosh_reloaded/spelling.py +++ b/src/whoosh/spelling.py @@ -32,8 +32,8 @@ from bisect import bisect_left from heapq import heappush, heapreplace -from whoosh_reloaded import highlight -from whoosh_reloaded.compat import iteritems, xrange +from whoosh import highlight +from whoosh.compat import iteritems, range # Corrector objects @@ -125,11 +125,11 @@ def __init__(self, wordlist): self.wordlist = wordlist def _suggestions(self, text, maxdist, prefix): - from whoosh_reloaded.automata.lev import levenshtein_automaton - from whoosh_reloaded.automata.fsa import find_all_matches + from whoosh.automata.lev import levenshtein_automaton + from whoosh.automata.fsa import find_all_matches seen = set() - for mxd in xrange(1, maxdist + 1): + for mxd in range(1, maxdist + 1): dfa = levenshtein_automaton(text, mxd, prefix).to_dfa() sk = self.Skipper(self.wordlist) for sug in find_all_matches(dfa, sk): @@ -183,22 +183,22 @@ class Correction(object): following attributes: ``query`` - The corrected :class:`whoosh_reloaded.query.Query` object. + The corrected :class:`whoosh.query.Query` object. ``string`` The corrected user query string. ``original_query`` - The original :class:`whoosh_reloaded.query.Query` object that was corrected. + The original :class:`whoosh.query.Query` object that was corrected. ``original_string`` The original user query string. ``tokens`` A list of token objects representing the corrected words. You can also use the :meth:`Correction.format_string` method to reformat the - corrected query string using a :class:`whoosh_reloaded.highlight.Formatter` class. + corrected query string using a :class:`whoosh.highlight.Formatter` class. For example, to display the corrected query string as HTML with the changed words emphasized:: - from whoosh_reloaded import highlight + from whoosh import highlight correction = mysearcher.correct_query(q, qstring) @@ -223,9 +223,9 @@ def __repr__(self): def format_string(self, formatter): """ Highlights the corrected words in the original query string using the - given :class:`~whoosh_reloaded.highlight.Formatter`. + given :class:`~whoosh.highlight.Formatter`. - :param formatter: A :class:`whoosh_reloaded.highlight.Formatter` instance. + :param formatter: A :class:`whoosh.highlight.Formatter` instance. :return: the output of the formatter (usually a string). """ @@ -255,7 +255,7 @@ def correct_query(self, q, qstring): Returns a :class:`Correction` object representing the corrected form of the given query. - :param q: the original :class:`whoosh_reloaded.query.Query` tree to be + :param q: the original :class:`whoosh.query.Query` tree to be corrected. :param qstring: the original user query. 
This may be None if the original query string is not available, in which case the diff --git a/src/whoosh_reloaded/support/__init__.py b/src/whoosh/support/__init__.py similarity index 100% rename from src/whoosh_reloaded/support/__init__.py rename to src/whoosh/support/__init__.py diff --git a/src/whoosh_reloaded/support/base85.py b/src/whoosh/support/base85.py similarity index 94% rename from src/whoosh_reloaded/support/base85.py rename to src/whoosh/support/base85.py index 9b82d8e6..66e7915c 100644 --- a/src/whoosh_reloaded/support/base85.py +++ b/src/whoosh/support/base85.py @@ -1,6 +1,6 @@ """ This module contains generic base85 encoding and decoding functions. The -whoosh_reloaded.util.numeric module contains faster variants for encoding and +whoosh.util.numeric module contains faster variants for encoding and decoding integers. Modified from: @@ -9,7 +9,7 @@ import struct -from whoosh_reloaded.compat import xrange +from whoosh.compat import range # Instead of using the character set from the ascii85 algorithm, I put the @@ -32,7 +32,7 @@ def to_base85(x, islong=False): size = 10 if islong else 5 rems = "" - for i in xrange(size): + for i in range(size): rems = b85chars[x % 85] + rems x //= 85 return rems diff --git a/src/whoosh_reloaded/support/bench.py b/src/whoosh/support/bench.py similarity index 93% rename from src/whoosh_reloaded/support/bench.py rename to src/whoosh/support/bench.py index 8c8673ec..3ec9d6f0 100644 --- a/src/whoosh_reloaded/support/bench.py +++ b/src/whoosh/support/bench.py @@ -30,24 +30,24 @@ from optparse import OptionParser from shutil import rmtree -from whoosh_reloaded import index, qparser, query, scoring -from whoosh_reloaded.util import now, find_object +from whoosh import index, qparser, query, scoring +from whoosh.util import now, find_object try: - import xappy + import xappy # type: ignore except ImportError: pass try: - import xapian + import xapian # type: ignore except ImportError: pass try: - import pysolr + import pysolr # type: ignore except ImportError: pass try: - from persistent import Persistent + from persistent import Persistent # type: ignore class ZDoc(Persistent): def __init__(self, d): @@ -134,9 +134,7 @@ def print_results(self, ls): class WhooshModule(Module): def indexer(self, create=True): schema = self.bench.spec.whoosh_schema() - path = os.path.join( - self.options.dir, "%s_whoosh_reloaded" % self.options.indexname - ) + path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) if not os.path.exists(path): os.mkdir(path) @@ -158,8 +156,8 @@ def indexer(self, create=True): multisegment=self.options.xms, ) self._procdoc = None - if hasattr(self.bench.spec, "process_document_whoosh_reloaded"): - self._procdoc = self.bench.spec.process_document_whoosh_reloaded + if hasattr(self.bench.spec, "process_document_whoosh"): + self._procdoc = self.bench.spec.process_document_whoosh def index_document(self, d): _procdoc = self._procdoc @@ -171,9 +169,7 @@ def finish(self, merge=True, optimize=False): self.writer.commit(merge=merge, optimize=optimize) def searcher(self): - path = os.path.join( - self.options.dir, "%s_whoosh_reloaded" % self.options.indexname - ) + path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) ix = index.open_dir(path) self.srch = ix.searcher(weighting=scoring.PL2()) self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) @@ -324,10 +320,10 @@ def findterms(self, terms): class ZcatalogModule(Module): def indexer(self, **kwargs): - from ZODB.FileStorage
import FileStorage # @UnresolvedImport - from ZODB.DB import DB # @UnresolvedImport - from zcatalog import catalog # @UnresolvedImport - import transaction # @UnresolvedImport + from ZODB.FileStorage import FileStorage # type: ignore @UnresolvedImport + from ZODB.DB import DB # type: ignore @UnresolvedImport + from zcatalog import catalog # type: ignore @UnresolvedImport + import transaction # type: ignore @UnresolvedImport dir = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname) if os.path.exists(dir): @@ -352,20 +348,20 @@ def index_document(self, d): self.cat.index_doc(doc) self.zcatalog_count += 1 if self.zcatalog_count >= 100: - import transaction # @UnresolvedImport + import transaction # type: ignore @UnresolvedImport transaction.commit() self.zcatalog_count = 0 def finish(self, **kwargs): - import transaction # @UnresolvedImport + import transaction # type: ignore @UnresolvedImport transaction.commit() del self.zcatalog_count def searcher(self): - from ZODB.FileStorage import FileStorage # @UnresolvedImport - from ZODB.DB import DB # @UnresolvedImport + from ZODB.FileStorage import FileStorage # type: ignore @UnresolvedImport + from ZODB.DB import DB # type: ignore @UnresolvedImport path = os.path.join( self.options.dir, "%s_zcatalog" % self.options.indexname, "index" ) @@ -397,7 +393,7 @@ def results(self, r): class NucularModule(Module): def indexer(self, create=True): import shutil - from nucular import Nucular + from nucular import Nucular # type: ignore @UnresolvedImport dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) if create: @@ -428,7 +424,7 @@ def finish(self, **kwargs): self.archive.cleanUp() def searcher(self): - from nucular import Nucular + from nucular import Nucular # type: ignore @UnresolvedImport dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) self.archive = Nucular.Nucular(dir) @@ -448,7 +444,7 @@ def findterms(self, terms): class Bench(object): libs = { - "whoosh_reloaded": WhooshModule, + "whoosh": WhooshModule, "xappy": XappyModule, "xapian": XapianModule, "solr": SolrModule, @@ -538,7 +534,7 @@ def _parser(self, name): "--lib", dest="lib", help="Name of the library to use to index/search.", - default="whoosh_reloaded", + default="whoosh", ) p.add_option( "-d", diff --git a/src/whoosh_reloaded/support/bitstream.py b/src/whoosh/support/bitstream.py similarity index 93% rename from src/whoosh_reloaded/support/bitstream.py rename to src/whoosh/support/bitstream.py index 594ee46c..682afbb8 100644 --- a/src/whoosh_reloaded/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -11,6 +11,7 @@ _bitsperlong = _LONG_SIZE * 8 + class BitStreamReader(object): def __init__(self, source): self._totalbits = len(source) * _bitsperlong @@ -33,14 +34,14 @@ def read(self, numbits): position = self._position if position < 0 or position + numbits > self._totalbits: - raise IndexError, "Invalid bitarray._position/numbits" + raise IndexError("Invalid bitarray._position/numbits") longaddress, bitoffset = divmod(position, _bitsperlong) # We may read bits in the final word after ones we care # about, so create a mask to remove them later.
- finalmask = (1L << numbits) - 1 + finalmask = (1 << numbits) - 1 # We may read bits in the first word before the ones we # care about, so bump the total bits to read by this @@ -50,7 +51,7 @@ def read(self, numbits): # Read and concatenate every long containing a bit we need - outval, outshift = 0L, 0 + outval, outshift = 0, 0 while numbits > 0: outval += self._bitstream[longaddress] << outshift longaddress += 1 @@ -67,5 +68,3 @@ def read(self, numbits): # off the high-order bits we don't want. return (outval >> bitoffset) & finalmask - - diff --git a/src/whoosh_reloaded/support/bitvector.py b/src/whoosh/support/bitvector.py similarity index 97% rename from src/whoosh_reloaded/support/bitvector.py rename to src/whoosh/support/bitvector.py index a045c7a5..7790735b 100644 --- a/src/whoosh_reloaded/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -90,14 +90,14 @@ def __contains__(self, index): def __iter__(self): get = self.__getitem__ - for i in xrange(0, self.size): + for i in range(0, self.size): if get(i): yield i def __str__(self): get = self.__getitem__ return "".join("1" if get(i) else "0" - for i in xrange(0, self.size)) + for i in range(0, self.size)) def __nonzero__(self): return self.count() > 0 @@ -147,7 +147,7 @@ def __xor__(self, other): return self._logic(operator.__xor__, other) def __invert__(self): - return BitVector(self.size, source=(x for x in xrange(self.size) if x not in self)) + return BitVector(self.size, source=(x for x in range(self.size) if x not in self)) def count(self): """Returns the number of "on" bits in the bit array.""" diff --git a/src/whoosh_reloaded/support/charset.py b/src/whoosh/support/charset.py similarity index 99% rename from src/whoosh_reloaded/support/charset.py rename to src/whoosh/support/charset.py index 1e015cbf..2aef38a7 100644 --- a/src/whoosh_reloaded/support/charset.py +++ b/src/whoosh/support/charset.py @@ -2,13 +2,13 @@ """This module contains tools for working with Sphinx charset table files. These files are useful for doing case and accent folding. -See :class:`whoosh_reloaded.analysis.CharsetTokenizer` and :class:`whoosh_reloaded.analysis.CharsetFilter`. +See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. """ from collections import defaultdict import re -from whoosh_reloaded.compat import izip, u, iteritems, unichr, xrange +from whoosh.compat import izip, u, iteritems, unichr, range # This is a straightforward accent-folding charset taken from Carlos Bueno's # article "Accent Folding for Auto-Complete", for use with CharsetFilter. 
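Before the table-parsing hunks below, it helps to see how these charset tables are actually consumed. A minimal accent-folding sketch, assuming only the public names this patch touches (charset_table_to_dict, default_charset, CharsetFilter) and mirroring the pattern exercised in tests/test_analysis.py later in this diff:

    # Build a fold map from the bundled Sphinx-style table and apply it
    # as an analysis filter.
    from whoosh import analysis
    from whoosh.support import charset

    # charset_table_to_dict turns the table string into an
    # {ord(source_char): folded_text} mapping usable by CharsetFilter.
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)

    print([t.text for t in ana("café")])  # accents fold away: ['cafe']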
@@ -1328,7 +1328,7 @@ def charset_table_to_dict(tablestring): assert (end1 - start1) == (end2 - start2) try: for fromord, tooord in izip( - xrange(start1, end1 + 1), xrange(start2, end2 + 1) + range(start1, end1 + 1), range(start2, end2 + 1) ): map[fromord] = unichr(tooord) except ValueError: @@ -1359,7 +1359,7 @@ def charset_table_to_dict(tablestring): start = charspec_to_int(match.group(1)) end = charspec_to_int(match.group(2)) try: - for ord in xrange(start, end + 1): + for ord in range(start, end + 1): map[ord] = unichr(ord) except ValueError: pass @@ -1370,7 +1370,7 @@ def charset_table_to_dict(tablestring): fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) assert toord - fromord % 2 == 0 - for ord in xrange(fromord, toord + 1, 2): + for ord in range(fromord, toord + 1, 2): try: map[ord] = unichr(ord + 1) map[ord + 1] = unichr(ord + 1) diff --git a/src/whoosh_reloaded/support/levenshtein.py b/src/whoosh/support/levenshtein.py similarity index 92% rename from src/whoosh_reloaded/support/levenshtein.py rename to src/whoosh/support/levenshtein.py index 5b221288..2ee222fb 100644 --- a/src/whoosh_reloaded/support/levenshtein.py +++ b/src/whoosh/support/levenshtein.py @@ -2,7 +2,7 @@ Contains functions implementing edit distance algorithms. """ -from whoosh_reloaded.compat import xrange +from whoosh.compat import range def levenshtein(seq1, seq2, limit=None): @@ -10,12 +10,12 @@ def levenshtein(seq1, seq2, limit=None): oneago = None thisrow = list(range(1, len(seq2) + 1)) + [0] - for x in xrange(len(seq1)): + for x in range(len(seq1)): # Python lists wrap around for negative indices, so put the # leftmost column at the *end* of the list. This matches with # the zero-indexed strings and saves extra calculation. oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] - for y in xrange(len(seq2)): + for y in range(len(seq2)): delcost = oneago[y] + 1 addcost = thisrow[y - 1] + 1 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) @@ -32,12 +32,12 @@ def damerau_levenshtein(seq1, seq2, limit=None): oneago = None thisrow = list(range(1, len(seq2) + 1)) + [0] - for x in xrange(len(seq1)): + for x in range(len(seq1)): # Python lists wrap around for negative indices, so put the # leftmost column at the *end* of the list. This matches with # the zero-indexed strings and saves extra calculation. twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] - for y in xrange(len(seq2)): + for y in range(len(seq2)): delcost = oneago[y] + 1 addcost = thisrow[y - 1] + 1 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) diff --git a/src/whoosh_reloaded/support/pyparsing.py b/src/whoosh/support/pyparsing.py similarity index 99% rename from src/whoosh_reloaded/support/pyparsing.py rename to src/whoosh/support/pyparsing.py index d80d1d74..6d25fd81 100644 --- a/src/whoosh_reloaded/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -69,6 +69,8 @@ class names, and the use of '+', '|' and '^' operators. 
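The pyparsing.py hunks that follow are a mechanical Python 3 syntax migration rather than a behavioral change. The recurring rewrites, condensed into one illustrative sketch (standalone code, not lines from this repository):

    # Python 2 forms removed by this patch  ->  Python 3 forms added:
    #   except ParseBaseException, err:     ->  except ParseBaseException as err:
    #   print "found ignoreExpr...", loc    ->  print("found ignoreExpr...", loc)
    #   finalmask = (1L << numbits) - 1     ->  finalmask = (1 << numbits) - 1
    try:
        raise ValueError("boom")
    except ValueError as err:  # "as" binding replaces the Python 2 comma form
        print("caught:", err)  # print() call replaces the print statement
    mask = (1 << 8) - 1        # plain int replaces the 1L long literal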
import warnings import re import sre_constants + +from whoosh.support import unicode #~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) __all__ = [ @@ -927,7 +929,7 @@ def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ): loc,tokens = self.parseImpl( instring, preloc, doActions ) except IndexError: raise ParseException( instring, len(instring), self.errmsg, self ) - except ParseBaseException, err: + except ParseBaseException as err: #~ print ("Exception raised:", err) if self.debugActions[2]: self.debugActions[2]( instring, tokensStart, self, err ) @@ -961,8 +963,8 @@ def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ): self.resultsName, asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), modal=self.modalResults ) - except ParseBaseException, err: - #~ print "Exception raised in user parse action:", err + except ParseBaseException as err: + # print ("Exception raised in user parse action:", err) if (self.debugActions[2] ): self.debugActions[2]( instring, tokensStart, self, err ) raise @@ -1002,7 +1004,7 @@ def _parseCache( self, instring, loc, doActions=True, callPreParse=True ): value = self._parseNoCache( instring, loc, doActions, callPreParse ) ParserElement._exprArgCache[ lookup ] = (value[0],value[1].copy()) return value - except ParseBaseException, pe: + except ParseBaseException as pe: ParserElement._exprArgCache[ lookup ] = pe raise @@ -1071,7 +1073,7 @@ def parseString( self, instring, parseAll=False ): if parseAll: loc = self.preParse( instring, loc ) StringEnd()._parse( instring, loc ) - except ParseBaseException, exc: + except ParseBaseException as exc: # catch and re-raise exception from here, clears out pyparsing internal stack trace raise exc else: @@ -1109,7 +1111,7 @@ def scanString( self, instring, maxMatches=_MAX_INT ): matches += 1 yield tokens, preloc, nextLoc loc = nextLoc - except ParseBaseException, pe: + except ParseBaseException as pe: raise pe def transformString( self, instring ): @@ -1137,7 +1139,7 @@ def transformString( self, instring ): lastE = e out.append(instring[lastE:]) return "".join(map(_ustr,out)) - except ParseBaseException, pe: + except ParseBaseException as pe: raise pe def searchString( self, instring, maxMatches=_MAX_INT ): @@ -1147,7 +1149,7 @@ def searchString( self, instring, maxMatches=_MAX_INT ): """ try: return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ]) - except ParseBaseException, pe: + except ParseBaseException as pe: raise pe def __add__(self, other ): @@ -1403,7 +1405,7 @@ def parseFile( self, file_or_filename, parseAll=False ): f.close() try: return self.parseString(file_contents, parseAll) - except ParseBaseException, exc: + except ParseBaseException as exc: # catch and re-raise exception from here, clears out pyparsing internal stack trace raise exc @@ -2344,9 +2346,9 @@ def parseImpl( self, instring, loc, doActions=True ): loc, exprtokens = e._parse( instring, loc, doActions ) except ParseSyntaxException: raise - except ParseBaseException, pe: + except ParseBaseException as pe: raise ParseSyntaxException(pe) - except IndexError, ie: + except IndexError as ie: raise ParseSyntaxException( ParseException(instring, len(instring), self.errmsg, self) ) else: loc, exprtokens = e._parse( instring, loc, doActions ) @@ -2396,7 +2398,7 @@ def parseImpl( self, instring, loc, doActions=True ): for e in self.exprs: try: loc2 = e.tryParse( instring, loc ) - except ParseException, err: + except 
ParseException as err: if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc @@ -2460,7 +2462,7 @@ def parseImpl( self, instring, loc, doActions=True ): try: ret = e._parse( instring, loc, doActions ) return ret - except ParseException, err: + except ParseException as err: if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc @@ -2861,7 +2863,7 @@ def parseImpl( self, instring, loc, doActions=True ): while 1: try: loc = self.ignoreExpr.tryParse(instring,loc) - print "found ignoreExpr, advance to", loc + print ("found ignoreExpr, advance to", loc) except ParseBaseException: break expr._parse( instring, loc, doActions=False, callPreParse=False ) @@ -3087,7 +3089,7 @@ def z(*paArgs): sys.stderr.write( ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc,line(l,s),l,t) ) try: ret = f(*paArgs) - except Exception, exc: + except Exception as exc: sys.stderr.write( "<") print (err.line) print (" "*(err.column-1) + "^") diff --git a/src/whoosh_reloaded/support/relativedelta.py b/src/whoosh/support/relativedelta.py similarity index 100% rename from src/whoosh_reloaded/support/relativedelta.py rename to src/whoosh/support/relativedelta.py diff --git a/src/whoosh_reloaded/support/unicode.py b/src/whoosh/support/unicode.py similarity index 99% rename from src/whoosh_reloaded/support/unicode.py rename to src/whoosh/support/unicode.py index dfb9da2e..351c7130 100644 --- a/src/whoosh_reloaded/support/unicode.py +++ b/src/whoosh/support/unicode.py @@ -1,7 +1,7 @@ import re from bisect import bisect_right -from whoosh_reloaded.compat import text_type, u +from whoosh.compat import text_type, u # http://unicode.org/Public/UNIDATA/Blocks.txt diff --git a/src/whoosh_reloaded/system.py b/src/whoosh/system.py similarity index 100% rename from src/whoosh_reloaded/system.py rename to src/whoosh/system.py diff --git a/src/whoosh_reloaded/util/__init__.py b/src/whoosh/util/__init__.py similarity index 96% rename from src/whoosh_reloaded/util/__init__.py rename to src/whoosh/util/__init__.py index 71242964..cc91d3d9 100644 --- a/src/whoosh_reloaded/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -30,7 +30,7 @@ from bisect import insort from functools import wraps -from whoosh_reloaded.compat import xrange +from whoosh.compat import range # These must be valid separate characters in CASE-INSENSTIVE filenames @@ -46,11 +46,11 @@ def random_name(size=28): - return "".join(random.choice(IDCHARS) for _ in xrange(size)) + return "".join(random.choice(IDCHARS) for _ in range(size)) def random_bytes(size=28): - gen = (random.randint(0, 255) for _ in xrange(size)) + gen = (random.randint(0, 255) for _ in range(size)) if sys.version_info[0] >= 3: return bytes(gen) else: diff --git a/src/whoosh_reloaded/util/cache.py b/src/whoosh/util/cache.py similarity index 98% rename from src/whoosh_reloaded/util/cache.py rename to src/whoosh/util/cache.py index 02a19902..00cb3f27 100644 --- a/src/whoosh_reloaded/util/cache.py +++ b/src/whoosh/util/cache.py @@ -30,7 +30,7 @@ from heapq import nsmallest from operator import itemgetter -from whoosh_reloaded.compat import iteritems +from whoosh.compat import iteritems try: diff --git a/src/whoosh_reloaded/util/filelock.py b/src/whoosh/util/filelock.py similarity index 95% rename from src/whoosh_reloaded/util/filelock.py rename to src/whoosh/util/filelock.py index 382c40dc..5534d123 100644 --- a/src/whoosh_reloaded/util/filelock.py +++ b/src/whoosh/util/filelock.py @@ -92,7 +92,7 @@ class FcntlLock(LockBase): """ def acquire(self, blocking=False): - import fcntl # 
@UnresolvedImport + import fcntl # type: ignore @UnresolvedImport flags = os.O_CREAT | os.O_WRONLY self.fd = os.open(self.filename, flags) @@ -117,7 +117,7 @@ def release(self): if self.fd is None: raise Exception("Lock was not acquired") - import fcntl # @UnresolvedImport + import fcntl # type: ignore @UnresolvedImport fcntl.flock(self.fd, fcntl.LOCK_UN) os.close(self.fd) self.fd = None @@ -128,7 +128,7 @@ class MsvcrtLock(LockBase): """ def acquire(self, blocking=False): - import msvcrt # @UnresolvedImport + import msvcrt # type: ignore @UnresolvedImport flags = os.O_CREAT | os.O_WRONLY mode = msvcrt.LK_NBLCK @@ -148,7 +148,7 @@ def acquire(self, blocking=False): return False def release(self): - import msvcrt # @UnresolvedImport + import msvcrt # type: ignore @UnresolvedImport if self.fd is None: raise Exception("Lock was not acquired") diff --git a/src/whoosh_reloaded/util/loading.py b/src/whoosh/util/loading.py similarity index 96% rename from src/whoosh_reloaded/util/loading.py rename to src/whoosh/util/loading.py index c53c00f3..8d549d3a 100644 --- a/src/whoosh_reloaded/util/loading.py +++ b/src/whoosh/util/loading.py @@ -54,8 +54,8 @@ def find_class(self, modulename, objname): def find_object(name, blacklist=None, whitelist=None): """Imports and returns an object given a fully qualified name. - >>> find_object("whoosh_reloaded.analysis.StopFilter") - <class 'whoosh_reloaded.analysis.filters.StopFilter'> + >>> find_object("whoosh.analysis.StopFilter") + <class 'whoosh.analysis.filters.StopFilter'> """ if blacklist: diff --git a/src/whoosh_reloaded/util/numeric.py b/src/whoosh/util/numeric.py similarity index 96% rename from src/whoosh_reloaded/util/numeric.py rename to src/whoosh/util/numeric.py index dcd4f795..5b4670c8 100644 --- a/src/whoosh_reloaded/util/numeric.py +++ b/src/whoosh/util/numeric.py @@ -30,11 +30,11 @@ from bisect import bisect_left from struct import pack, unpack -from whoosh_reloaded.compat import b, long_type -from whoosh_reloaded.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort -from whoosh_reloaded.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh_reloaded.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh_reloaded.system import pack_float, unpack_float, pack_double, unpack_double +from whoosh.compat import b, long_type +from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort +from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint +from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong +from whoosh.system import pack_float, unpack_float, pack_double, unpack_double NaN = struct.unpack("> _bitsize num = _num[key] if _num[key] < n else n bits = 0 - for j in xrange(num): + for j in range(num): v = value >> bits yield v & (0xFFFFFFFF >> (32 - _bits[key][j])) bits += _bits[key][j] @@ -557,7 +557,7 @@ def key_to_sizes(self, key): byte.
""" - return [(key >> (i * 2) & 3) + 1 for i in xrange(4)] + return [(key >> (i * 2) & 3) + 1 for i in range(4)] def write_nums(self, f, numbers): buf = emptybytes @@ -597,7 +597,7 @@ def read_nums(self, f, n): count = 0 key = None - for _ in xrange(n): + for _ in range(n): if count == 0: key = f.read_byte() code = key >> (count * 2) & 3 diff --git a/src/whoosh_reloaded/util/testing.py b/src/whoosh/util/testing.py similarity index 96% rename from src/whoosh_reloaded/util/testing.py rename to src/whoosh/util/testing.py index cccade28..80715fd8 100644 --- a/src/whoosh_reloaded/util/testing.py +++ b/src/whoosh/util/testing.py @@ -31,8 +31,8 @@ import tempfile from contextlib import contextmanager -from whoosh_reloaded.filedb.filestore import FileStorage -from whoosh_reloaded.util import now, random_name +from whoosh.filedb.filestore import FileStorage +from whoosh.util import now, random_name class TempDir(object): @@ -40,7 +40,7 @@ def __init__( self, basename="", parentdir=None, - ext=".whoosh_reloaded", + ext=".whoosh", suppress=frozenset(), keepdir=False, ): diff --git a/src/whoosh_reloaded/util/text.py b/src/whoosh/util/text.py similarity index 98% rename from src/whoosh_reloaded/util/text.py rename to src/whoosh/util/text.py index 472b40aa..f4913b7d 100644 --- a/src/whoosh_reloaded/util/text.py +++ b/src/whoosh/util/text.py @@ -27,7 +27,7 @@ import codecs, re -from whoosh_reloaded.compat import string_type, u, byte +from whoosh.compat import string_type, u, byte # Note: these functions return a tuple of (text, length), so when you call diff --git a/src/whoosh_reloaded/util/times.py b/src/whoosh/util/times.py similarity index 99% rename from src/whoosh_reloaded/util/times.py rename to src/whoosh/util/times.py index a27f2e6f..c3b7859b 100644 --- a/src/whoosh_reloaded/util/times.py +++ b/src/whoosh/util/times.py @@ -29,7 +29,7 @@ import copy from datetime import date, datetime, timedelta -from whoosh_reloaded.compat import iteritems +from whoosh.compat import iteritems class TimeError(Exception): diff --git a/src/whoosh_reloaded/util/varints.py b/src/whoosh/util/varints.py similarity index 96% rename from src/whoosh_reloaded/util/varints.py rename to src/whoosh/util/varints.py index 560e029c..d0acdc87 100644 --- a/src/whoosh_reloaded/util/varints.py +++ b/src/whoosh/util/varints.py @@ -27,7 +27,7 @@ from array import array -from whoosh_reloaded.compat import array_tobytes, xrange +from whoosh.compat import array_tobytes, range # Varint cache @@ -48,7 +48,7 @@ def _varint(i): _varint_cache_size = 512 _varint_cache = [] -for i in xrange(0, _varint_cache_size): +for i in range(0, _varint_cache_size): _varint_cache.append(_varint(i)) _varint_cache = tuple(_varint_cache) diff --git a/src/whoosh_reloaded/util/versions.py b/src/whoosh/util/versions.py similarity index 99% rename from src/whoosh_reloaded/util/versions.py rename to src/whoosh/util/versions.py index dd77099a..004a3558 100644 --- a/src/whoosh_reloaded/util/versions.py +++ b/src/whoosh/util/versions.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from whoosh_reloaded.util.text import rcompile +from whoosh.util.text import rcompile class BaseVersion(object): diff --git a/src/whoosh_reloaded/writing.py b/src/whoosh/writing.py similarity index 96% rename from src/whoosh_reloaded/writing.py rename to src/whoosh/writing.py index b78042fa..57b7d271 100644 --- a/src/whoosh_reloaded/writing.py +++ b/src/whoosh/writing.py @@ -30,14 +30,14 @@ from bisect import bisect_right from contextlib import contextmanager -from whoosh_reloaded import columns -from whoosh_reloaded.compat import abstractmethod, bytes_type -from whoosh_reloaded.externalsort import SortingPool -from whoosh_reloaded.fields import UnknownFieldError -from whoosh_reloaded.index import LockError -from whoosh_reloaded.util import fib, random_name -from whoosh_reloaded.util.filelock import try_for -from whoosh_reloaded.util.text import utf8encode +from whoosh import columns +from whoosh.compat import abstractmethod, bytes_type +from whoosh.externalsort import SortingPool +from whoosh.fields import UnknownFieldError +from whoosh.index import LockError +from whoosh.util import fib, random_name +from whoosh.util.filelock import try_for +from whoosh.util.text import utf8encode # Exceptions @@ -75,7 +75,7 @@ def MERGE_SMALL(writer, segments): heuristic based on the fibonacci sequence. """ - from whoosh_reloaded.reading import SegmentReader + from whoosh.reading import SegmentReader unchanged_segments = [] segments_to_merge = [] @@ -111,7 +111,7 @@ def MERGE_SMALL(writer, segments): def OPTIMIZE(writer, segments): """This policy merges all existing segments.""" - from whoosh_reloaded.reading import SegmentReader + from whoosh.reading import SegmentReader for seg in segments: reader = SegmentReader(writer.storage, writer.schema, seg) @@ -132,7 +132,7 @@ def CLEAR(writer, segments): class PostingPool(SortingPool): - # Subclass whoosh_reloaded.externalsort.SortingPool to use knowledge of + # Subclass whoosh.externalsort.SortingPool to use knowledge of # postings to set run size in bytes instead of items namechars = "abcdefghijklmnopqrstuvwxyz0123456789" @@ -196,7 +196,7 @@ class IndexWriter(object): """High-level object for writing to an index. To get a writer for a particular index, call - :meth:`~whoosh_reloaded.index.Index.writer` on the Index object. + :meth:`~whoosh.index.Index.writer` on the Index object. >>> writer = myindex.writer() @@ -279,7 +279,7 @@ def add_field(self, fieldname, fieldtype, **kwargs): """Adds a field to the index's schema. :param fieldname: the name of the field to add. - :param fieldtype: an instantiated :class:`whoosh_reloaded.fields.FieldType` + :param fieldtype: an instantiated :class:`whoosh.fields.FieldType` object. """ @@ -301,7 +301,7 @@ def reader(self, **kwargs): raise NotImplementedError def searcher(self, **kwargs): - from whoosh_reloaded.searching import Searcher + from whoosh.searching import Searcher return Searcher(self.reader(), **kwargs) @@ -313,7 +313,7 @@ def delete_by_term(self, fieldname, text, searcher=None): :returns: the number of documents deleted. 
""" - from whoosh_reloaded.query import Term + from whoosh.query import Term q = Term(fieldname, text) return self.delete_by_query(q, searcher=searcher) @@ -358,8 +358,8 @@ def add_document(self, **fields): fields take ``datetime.datetime`` objects:: from datetime import datetime, timedelta - from whoosh_reloaded import index - from whoosh_reloaded.fields import * + from whoosh import index + from whoosh.fields import * schema = Schema(date=DATETIME, size=NUMERIC(float), content=TEXT) myindex = index.create_in("indexdir", schema) @@ -536,7 +536,7 @@ def __init__( raise LockError if codec is None: - from whoosh_reloaded.codec import default_codec + from whoosh.codec import default_codec codec = default_codec() self.codec = codec @@ -571,7 +571,7 @@ def __init__( def __repr__(self): # Author: Ronald Evers - # Origin bitbucket issue: https://bitbucket.org/mchaput/whoosh_reloaded/issues/483 + # Origin bitbucket issue: https://bitbucket.org/mchaput/whoosh/issues/483 # newsegment might not be set due to LockError # so use getattr to be safe return "<%s %r>" % (self.__class__.__name__, getattr(self, "newsegment", None)) @@ -658,7 +658,7 @@ def is_deleted(self, docnum): return segment.is_deleted(segdocnum) def reader(self, reuse=None): - from whoosh_reloaded.index import FileIndex + from whoosh.index import FileIndex self._check_state() return FileIndex._reader( @@ -919,7 +919,7 @@ def _finalize_segment(self): return self.get_segment() def _commit_toc(self, segments): - from whoosh_reloaded.index import TOC, clean_files + from whoosh.index import TOC, clean_files # Write a new TOC with the new segment list (and delete old files) toc = TOC(self.schema, segments, self.generation) @@ -1017,13 +1017,13 @@ class AsyncWriter(threading.Thread, IndexWriter): Do this: - >>> from whoosh_reloaded.writing import AsyncWriter + >>> from whoosh.writing import AsyncWriter >>> writer = AsyncWriter(myindex) """ def __init__(self, index, delay=0.25, writerargs=None): """ - :param index: the :class:`whoosh_reloaded.index.Index` to write to. + :param index: the :class:`whoosh.index.Index` to write to. :param delay: the delay (in seconds) between attempts to instantiate the actual writer. :param writerargs: an optional dictionary specifying keyword arguments @@ -1045,7 +1045,7 @@ def reader(self): return self.index.reader() def searcher(self, **kwargs): - from whoosh_reloaded.searching import Searcher + from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self.index, **kwargs) @@ -1108,14 +1108,14 @@ def add_spelling(ix, fieldnames, commit=True): >>> ix = index.open_dir("testindex") >>> add_spelling(ix, ["content", "tags"]) - :param ix: a :class:`whoosh_reloaded.filedb.fileindex.FileIndex` object. + :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object. :param fieldnames: a list of field names to create word graphs for. :param force: if True, overwrites existing word graph files. This is only useful for debugging. """ - from whoosh_reloaded.automata import fst - from whoosh_reloaded.reading import SegmentReader + from whoosh.automata import fst + from whoosh.reading import SegmentReader writer = ix.writer() storage = writer.storage @@ -1162,7 +1162,7 @@ class BufferedWriter(IndexWriter): To use this class, create it from your index and *keep it open*, sharing it between threads. 
- >>> from whoosh_reloaded.writing import BufferedWriter + >>> from whoosh.writing import BufferedWriter >>> writer = BufferedWriter(myindex, period=120, limit=20) >>> # Then you can use the writer to add and update documents >>> writer.add_document(...) @@ -1204,7 +1204,7 @@ class BufferedWriter(IndexWriter): def __init__(self, index, period=60, limit=10, writerargs=None, commitargs=None): """ - :param index: the :class:`whoosh_reloaded.index.Index` to write to. + :param index: the :class:`whoosh.index.Index` to write to. :param period: the maximum amount of time (in seconds) between commits. Set this to ``0`` or ``None`` to not use a timer. Do not set this any lower than a few seconds. @@ -1235,7 +1235,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.close() def _make_ram_index(self): - from whoosh_reloaded.codec.memory import MemoryCodec + from whoosh.codec.memory import MemoryCodec self.codec = MemoryCodec() @@ -1247,7 +1247,7 @@ def schema(self): return self.writer.schema def reader(self, **kwargs): - from whoosh_reloaded.reading import MultiReader + from whoosh.reading import MultiReader reader = self.writer.reader() with self.lock: @@ -1263,7 +1263,7 @@ def reader(self, **kwargs): return reader def searcher(self, **kwargs): - from whoosh_reloaded.searching import Searcher + from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self.index, **kwargs) diff --git a/stress/test_bigfacet.py b/stress/test_bigfacet.py index 3820db3d..a806f621 100644 --- a/stress/test_bigfacet.py +++ b/stress/test_bigfacet.py @@ -4,8 +4,8 @@ import random import string -from whoosh_reloaded import fields, formats, index, query, sorting -from whoosh_reloaded.util import now +from whoosh import fields, formats, index, query, sorting +from whoosh.util import now tagcount = 100 doccount = 500000 diff --git a/stress/test_bigindex.py b/stress/test_bigindex.py index c32154bd..3501b28a 100644 --- a/stress/test_bigindex.py +++ b/stress/test_bigindex.py @@ -2,10 +2,10 @@ import random -from whoosh_reloaded import fields -from whoosh_reloaded.compat import xrange, text_type, u -from whoosh_reloaded.util.testing import TempIndex -from whoosh_reloaded.util import now +from whoosh import fields +from whoosh.compat import range, text_type, u +from whoosh.util.testing import TempIndex +from whoosh.util import now def test_20000_single(): @@ -27,7 +27,7 @@ def test_20000_single(): ] t = now() - for i in xrange(20000): + for i in range(20000): w = ix.writer() w.add_document(id=text_type(i), text=u(" ").join(random.sample(domain, 5))) w.commit() @@ -39,7 +39,7 @@ def test_20000_single(): def test_20000_buffered(): - from whoosh_reloaded.writing import BufferedWriter + from whoosh.writing import BufferedWriter sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) with TempIndex(sc, "20000buffered") as ix: @@ -60,7 +60,7 @@ def test_20000_buffered(): t = now() w = BufferedWriter(ix, limit=100, period=None) - for i in xrange(20000): + for i in range(20000): w.add_document(id=text_type(i), text=u(" ").join(random.sample(domain, 5))) w.close() print("Write buffered:", now() - t) @@ -90,7 +90,7 @@ def test_20000_batch(): t = now() w = ix.writer() - for i in xrange(20000): + for i in range(20000): w.add_document(id=text_type(i), text=u(" ").join(random.sample(domain, 5))) if not i % 100: w.commit() diff --git a/stress/test_bigsort.py b/stress/test_bigsort.py index 1c71d640..a017e0e1 100644 --- a/stress/test_bigsort.py +++ b/stress/test_bigsort.py @@ -1,9 +1,9 @@ import os.path, random, 
shutil from datetime import datetime -from whoosh_reloaded import fields, index, query -from whoosh_reloaded.compat import text_type, xrange -from whoosh_reloaded.util import now +from whoosh import fields, index, query +from whoosh.compat import text_type, range +from whoosh.util import now def test_bigsort(): @@ -21,7 +21,7 @@ def test_bigsort(): print("Writing...") t = now() w = ix.writer(limitmb=512) - for i in xrange(times): + for i in range(times): dt = datetime.fromtimestamp(random.randint(15839593, 1294102139)) w.add_document(id=text_type(i), date=dt) w.commit() diff --git a/stress/test_bigtable.py b/stress/test_bigtable.py index 16e4dfc5..33ef1ef2 100644 --- a/stress/test_bigtable.py +++ b/stress/test_bigtable.py @@ -2,21 +2,21 @@ from random import randint, shuffle -from nose.tools import assert_equal # @UnresolvedImport +from nose.tools import assert_equal # type: ignore @UnresolvedImport -from whoosh_reloaded.compat import xrange, iteritems -from whoosh_reloaded.filedb.filetables import HashWriter, HashReader -from whoosh_reloaded.util.testing import TempStorage +from whoosh.compat import range, iteritems +from whoosh.filedb.filetables import HashWriter, HashReader +from whoosh.util.testing import TempStorage def test_bigtable(): with TempStorage("bigtable") as st: def randstring(min, max): - return "".join(chr(randint(1, 255)) for _ in xrange(randint(min, max))) + return "".join(chr(randint(1, 255)) for _ in range(randint(min, max))) count = 100000 - samp = dict((randstring(1, 50), randstring(1, 50)) for _ in xrange(count)) + samp = dict((randstring(1, 50), randstring(1, 50)) for _ in range(count)) fhw = HashWriter(st.create_file("big.hsh")) fhw.add_all(iteritems(samp)) diff --git a/stress/test_hugeindex.py b/stress/test_hugeindex.py index c9cdc10a..ed37e740 100644 --- a/stress/test_hugeindex.py +++ b/stress/test_hugeindex.py @@ -1,12 +1,12 @@ from __future__ import with_statement import struct -from nose.tools import assert_equal # @UnresolvedImport +from nose.tools import assert_equal # type: ignore @UnresolvedImport -from whoosh_reloaded import formats -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.filedb.filepostings import FilePostingReader, FilePostingWriter -from whoosh_reloaded.util.testing import TempStorage +from whoosh import formats +from whoosh.compat import range +from whoosh.filedb.filepostings import FilePostingReader, FilePostingWriter +from whoosh.util.testing import TempStorage def test_huge_postfile(): @@ -21,7 +21,7 @@ def test_huge_postfile(): fpw = FilePostingWriter(pf) format = formats.Frequency(None) offset = fpw.start(format) - for i in xrange(10): + for i in range(10): fpw.write(i, float(i), struct.pack("!I", i), 10) posttotal = fpw.finish() assert_equal(posttotal, 10) diff --git a/stress/test_threading.py b/stress/test_threading.py index 5683d11b..d0c3a13d 100644 --- a/stress/test_threading.py +++ b/stress/test_threading.py @@ -1,9 +1,9 @@ from __future__ import with_statement import random, threading, time -from whoosh_reloaded import fields, query -from whoosh_reloaded.compat import xrange, u, text_type -from whoosh_reloaded.util.testing import TempStorage +from whoosh import fields, query +from whoosh.compat import range, u, text_type +from whoosh.util.testing import TempStorage def test_readwrite(): @@ -43,10 +43,10 @@ def run(self): ix = st.create_index(dir, schema) num = 0 - for i in xrange(50): + for i in range(50): print(i) w = ix.writer() - for _ in xrange(random.randint(1, 100)): + for _ in range(random.randint(1, 100)): 
content = u(" ").join( random.sample(domain, random.randint(5, 20)) ) @@ -59,7 +59,7 @@ def run(self): class SearcherThread(threading.Thread): def run(self): print(self.name + " starting") - for _ in xrange(10): + for _ in range(10): ix = st.open_index() s = ix.searcher() q = query.Term("content", random.choice(domain)) @@ -72,7 +72,7 @@ def run(self): wt = WriterThread() wt.start() time.sleep(0.5) - for _ in xrange(20): + for _ in range(20): SearcherThread().start() time.sleep(0.5) wt.join() diff --git a/stress/test_update.py b/stress/test_update.py index 6dab1bd8..06f935c3 100644 --- a/stress/test_update.py +++ b/stress/test_update.py @@ -3,15 +3,15 @@ from nose.tools import assert_equal -from whoosh_reloaded import fields, query -from whoosh_reloaded.compat import xrange, text_type -from whoosh_reloaded.util.testing import TempIndex +from whoosh import fields, query +from whoosh.compat import range, text_type +from whoosh.util.testing import TempIndex def test_many_updates(): schema = fields.Schema(key=fields.ID(unique=True, stored=True)) with TempIndex(schema, "manyupdates") as ix: - for _ in xrange(10000): + for _ in range(10000): num = random.randint(0, 5000) w = ix.writer() w.update_document(key=text_type(num)) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 82e00338..4508b4c7 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -4,10 +4,10 @@ import pytest -from whoosh_reloaded import analysis, fields, qparser -from whoosh_reloaded.compat import b, u, unichr -from whoosh_reloaded.compat import dumps -from whoosh_reloaded.filedb.filestore import RamStorage +from whoosh import analysis, fields, qparser +from whoosh.compat import b, u, unichr +from whoosh.compat import dumps +from whoosh.filedb.filestore import RamStorage def test_regextokenizer(): @@ -256,7 +256,7 @@ def test_shingles(): def test_unicode_blocks(): - from whoosh_reloaded.support.unicode import blocks, blockname, blocknum + from whoosh.support.unicode import blocks, blockname, blocknum assert blockname(u("a")) == "Basic Latin" assert blockname(unichr(0x0B80)) == "Tamil" @@ -269,7 +269,7 @@ def test_unicode_blocks(): def test_double_metaphone(): - from whoosh_reloaded.lang.dmetaphone import double_metaphone + from whoosh.lang.dmetaphone import double_metaphone names = { "maurice": ("MRS", None), @@ -360,7 +360,7 @@ def test_delimited_attribute(): def test_porter2(): - from whoosh_reloaded.lang.porter2 import stem + from whoosh.lang.porter2 import stem plurals = [ "caresses", @@ -476,7 +476,7 @@ def test_name_field(): def test_start_pos(): - from whoosh_reloaded import formats + from whoosh import formats ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() kw = {"positions": True} @@ -562,7 +562,7 @@ def test_la_pickleability(): def test_charset_pickeability(): - from whoosh_reloaded.support import charset + from whoosh.support import charset charmap = charset.charset_table_to_dict(charset.default_charset) ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap) diff --git a/tests/test_automata.py b/tests/test_automata.py index f83c4b3e..dd122b48 100644 --- a/tests/test_automata.py +++ b/tests/test_automata.py @@ -2,10 +2,10 @@ import os.path from bisect import bisect_left -from whoosh_reloaded.compat import permutations -from whoosh_reloaded.compat import xrange -from whoosh_reloaded.automata import fsa, glob, lev -from whoosh_reloaded.support.levenshtein import levenshtein +from whoosh.compat import permutations +from whoosh.compat import range +from 
whoosh.automata import fsa, glob, lev +from whoosh.support.levenshtein import levenshtein def test_nfa(): @@ -357,7 +357,7 @@ def test_strings_dfa(): domain = "abcd" words = set() - for i in xrange(1, len(domain) + 1): + for i in range(1, len(domain) + 1): words.update("".join(p) for p in permutations(domain[:i])) words = sorted(words) dfa = fsa.strings_dfa(words) diff --git a/tests/test_bits.py b/tests/test_bits.py index 7b3272d1..db08ce76 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -1,5 +1,5 @@ -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.idsets import BitSet, OnDiskBitSet, SortedIntSet +from whoosh.filedb.filestore import RamStorage +from whoosh.idsets import BitSet, OnDiskBitSet, SortedIntSet def test_bit_basics(c=BitSet): diff --git a/tests/test_classify.py b/tests/test_classify.py index f4b2d336..c36c0e4a 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -1,10 +1,9 @@ from __future__ import with_statement -from whoosh_reloaded import analysis, classify, fields, formats, query, reading -from whoosh_reloaded.compat import u, text_type -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempIndex - +from whoosh import analysis, classify, fields, formats, query, reading +from whoosh.compat import text_type, u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex domain = [ u( diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 4503b56a..49eba32a 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -4,12 +4,12 @@ import pytest -from whoosh_reloaded import analysis, fields, formats, query -from whoosh_reloaded.compat import u, b, text_type -from whoosh_reloaded.compat import array_tobytes, xrange -from whoosh_reloaded.codec import default_codec -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempStorage +from whoosh import analysis, fields, formats, query +from whoosh.compat import u, b, text_type +from whoosh.compat import array_tobytes, range +from whoosh.codec import default_codec +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempStorage def _make_codec(**kwargs): @@ -58,15 +58,15 @@ def test_termkey(): def test_random_termkeys(): def random_fieldname(): - return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20)) + return "".join(chr(random.randint(65, 90)) for _ in range(1, 20)) def random_btext(): - a = array("H", (random.randint(0, 0xD7FF) for _ in xrange(1, 20))) + a = array("H", (random.randint(0, 0xD7FF) for _ in range(1, 20))) return array_tobytes(a).decode("utf-16") domain = sorted( set( - [(random_fieldname(), random_btext().encode("utf-8")) for _ in xrange(1000)] + [(random_fieldname(), random_btext().encode("utf-8")) for _ in range(1000)] ) ) @@ -493,7 +493,7 @@ def test_skip(): # # # def test_special_spelled_field(): -# from whoosh_reloaded.analysis import StemmingAnalyzer +# from whoosh.analysis import StemmingAnalyzer # # field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True) # st, codec, seg = _make_codec() @@ -520,8 +520,8 @@ def test_skip(): def test_plaintext_codec(): pytest.importorskip("ast") - from whoosh_reloaded.codec.plaintext import PlainTextCodec - from whoosh_reloaded.codec.whoosh_reloaded3 import W3Codec + from whoosh.codec.plaintext import PlainTextCodec + from whoosh.codec.whoosh3 import W3Codec ana = analysis.StemmingAnalyzer() schema = fields.Schema( @@ -600,8 +600,8 @@ def 
test_plaintext_codec(): def test_memory_codec(): - from whoosh_reloaded.codec import memory - from whoosh_reloaded.searching import Searcher + from whoosh.codec import memory + from whoosh.searching import Searcher ana = analysis.StemmingAnalyzer() schema = fields.Schema( @@ -655,7 +655,7 @@ def test_memory_codec(): def test_memory_multiwrite(): - from whoosh_reloaded.codec import memory + from whoosh.codec import memory domain = [ "alfa bravo charlie delta", diff --git a/tests/test_collector.py b/tests/test_collector.py index 73dac4ef..f4609e21 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -2,10 +2,10 @@ import pytest -from whoosh_reloaded import collectors, fields, query, searching -from whoosh_reloaded.compat import u, xrange -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempIndex +from whoosh import collectors, fields, query, searching +from whoosh.compat import u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex def test_add(): @@ -42,12 +42,13 @@ def test_timelimit(): schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() - for _ in xrange(50): + for _ in range(50): w.add_document(text=u("alfa")) w.commit() import time - from whoosh_reloaded import collectors, matching + + from whoosh import collectors, matching class SlowMatcher(matching.WrappingMatcher): def next(self): @@ -86,7 +87,8 @@ def matcher(self, searcher, context=None): @pytest.mark.skipif("not hasattr(__import__('signal'), 'SIGALRM')") def test_timelimit_alarm(): import time - from whoosh_reloaded import matching + + from whoosh import matching class SlowMatcher(matching.Matcher): def __init__(self): @@ -126,7 +128,7 @@ def matcher(self, searcher, context=None): def test_reverse_collapse(): - from whoosh_reloaded import sorting + from whoosh import sorting schema = fields.Schema( title=fields.TEXT(stored=True), diff --git a/tests/test_columns.py b/tests/test_columns.py index 87986b2c..c4ff967a 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -1,13 +1,13 @@ from __future__ import with_statement import inspect, random, sys -from whoosh_reloaded import columns, fields, query -from whoosh_reloaded.codec.whoosh_reloaded3 import W3Codec -from whoosh_reloaded.compat import b, u, BytesIO, bytes_type, text_type -from whoosh_reloaded.compat import izip, xrange, dumps, loads -from whoosh_reloaded.filedb import compound -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempIndex, TempStorage +from whoosh import columns, fields, query +from whoosh.codec.whoosh3 import W3Codec +from whoosh.compat import b, u, BytesIO, bytes_type, text_type +from whoosh.compat import izip, range, dumps, loads +from whoosh.filedb import compound +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex, TempStorage def test_pickleability(): @@ -71,11 +71,11 @@ def test_random_multistream(): letters = "abcdefghijklmnopqrstuvwxyz" def randstring(n): - s = "".join(random.choice(letters) for _ in xrange(n)) + s = "".join(random.choice(letters) for _ in range(n)) return s.encode("latin1") domain = {} - for _ in xrange(100): + for _ in range(100): name = randstring(random.randint(5, 10)) value = randstring(2500) domain[name] = value @@ -129,7 +129,7 @@ def _rt(c, values, default): f = st.create_file("test2") f.write(b("hello")) w = c.writer(f) - for docnum, v in izip(xrange(10, doccount, 7), values): + for 
docnum, v in izip(range(10, doccount, 7), values): target[docnum] = v w.add(docnum, v) w.finish(doccount) @@ -189,8 +189,8 @@ def test_roundtrip(): _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0) c = columns.BitColumn(compress_at=10) - _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False) - _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False) + _rt(c, [bool(random.randint(0, 1)) for _ in range(70)], False) + _rt(c, [bool(random.randint(0, 1)) for _ in range(90)], False) c = columns.PickleColumn(columns.VarBytesColumn()) _rt(c, [None, True, False, 100, -7, "hello"], None) @@ -284,7 +284,7 @@ def rw(size): f = st.create_file("test") cw = col.writer(f) - for i in xrange(size): + for i in range(size): cw.add(i, hex(i).encode("latin1")) cw.finish(size) length = f.tell() @@ -292,7 +292,7 @@ def rw(size): f = st.open_file("test") cr = col.reader(f, 0, length, size) - for i in xrange(size): + for i in range(size): v = cr[i] # Column ignores additional unique values after 65535 if i <= 65535 - 1: @@ -324,7 +324,7 @@ def test_varbytes_offsets(): schema = fields.Schema(name=fields.ID(sortable=col)) with TempIndex(schema) as ix: with ix.writer() as w: - for i in xrange(5000): + for i in range(5000): w.add_document(name=values[i % vlen]) with ix.reader() as r: @@ -339,7 +339,7 @@ def test_varbytes_offsets(): schema = fields.Schema(name=fields.ID(sortable=col)) with TempIndex(schema) as ix: with ix.writer() as w: - for i in xrange(5000): + for i in range(5000): w.add_document(name=values[i % vlen]) with ix.reader() as r: diff --git a/tests/test_compound.py b/tests/test_compound.py index 2dfe50bb..515b966b 100644 --- a/tests/test_compound.py +++ b/tests/test_compound.py @@ -1,9 +1,9 @@ from __future__ import with_statement -from whoosh_reloaded.compat import b -from whoosh_reloaded.filedb.compound import CompoundStorage -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempStorage +from whoosh.compat import b +from whoosh.filedb.compound import CompoundStorage +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempStorage def _test_simple_compound(st): diff --git a/tests/test_dateparse.py b/tests/test_dateparse.py index 9742f9f5..7180e3b3 100644 --- a/tests/test_dateparse.py +++ b/tests/test_dateparse.py @@ -1,4 +1,4 @@ -from whoosh_reloaded.qparser.dateparse import * +from whoosh.qparser.dateparse import * basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) diff --git a/tests/test_fields.py b/tests/test_fields.py index 39c3ec5d..4c796c9e 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -3,11 +3,11 @@ import pytest -from whoosh_reloaded import fields, qparser, query -from whoosh_reloaded.compat import u, b, xrange -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util import times -from whoosh_reloaded.util.testing import TempIndex +from whoosh import fields, qparser, query +from whoosh.compat import u, b, range +from whoosh.filedb.filestore import RamStorage +from whoosh.util import times +from whoosh.util.testing import TempIndex def test_schema_eq(): @@ -252,7 +252,7 @@ def test_numeric_ranges(): ix = RamStorage().create_index(schema) w = ix.writer() - for i in xrange(400): + for i in range(400): w.add_document(id=i, num=i) w.commit() @@ -299,7 +299,7 @@ def test_decimal_ranges(): w = ix.writer() count = Decimal("0.0") inc = Decimal("0.2") - for _ in xrange(500): + for _ in range(500): w.add_document(id=str(count), num=count) count += inc w.commit() @@ 
-344,7 +344,7 @@ def test_nontext_document(): dt = datetime.now() w = ix.writer() - for i in xrange(50): + for i in range(50): w.add_document(id=i, num=i, date=dt + timedelta(days=i), even=not (i % 2)) w.commit() @@ -369,7 +369,7 @@ def test_nontext_update(): dt = datetime.now() w = ix.writer() - for i in xrange(10): + for i in range(10): w.add_document(id=i, num=i, date=dt + timedelta(days=i)) w.commit() @@ -390,8 +390,8 @@ def test_datetime(): ix = st.create_index(schema) w = ix.writer() - for month in xrange(1, 12): - for day in xrange(1, 28): + for month in range(1, 12): + for day in range(1, 28): w.add_document( id=u("%s-%s") % (month, day), date=datetime(2010, month, day, 14, 0, 0) ) @@ -531,7 +531,7 @@ def test_boolean_find_deleted(): ix = RamStorage().create_index(schema) count = 0 # Create multiple segments just in case - for _ in xrange(5): + for _ in range(5): w = ix.writer() for c in domain: w.add_document(i=count, b=(c == "1")) @@ -544,7 +544,7 @@ def test_boolean_find_deleted(): with ix.searcher() as s: # Double check that documents with b=True are all deleted reader = s.reader() - for docnum in xrange(s.doc_count_all()): + for docnum in range(s.doc_count_all()): b = s.stored_fields(docnum)["b"] assert b == reader.is_deleted(docnum) @@ -622,7 +622,7 @@ def test_missing_field(): def test_token_boost(): - from whoosh_reloaded.analysis import RegexTokenizer, DoubleMetaphoneFilter + from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter ana = RegexTokenizer() | DoubleMetaphoneFilter() field = fields.TEXT(analyzer=ana, phrase=False) @@ -645,9 +645,9 @@ def test_pickle_idlist(): def test_pickle_schema(): - from whoosh_reloaded import analysis - from whoosh_reloaded.support.charset import accent_map - from whoosh_reloaded.compat import dumps + from whoosh import analysis + from whoosh.support.charset import accent_map + from whoosh.compat import dumps freetext_analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map) diff --git a/tests/test_flexible.py b/tests/test_flexible.py index a1556115..18ac0267 100644 --- a/tests/test_flexible.py +++ b/tests/test_flexible.py @@ -1,8 +1,8 @@ from __future__ import with_statement -from whoosh_reloaded import fields -from whoosh_reloaded.compat import u, b -from whoosh_reloaded.util.testing import TempIndex +from whoosh import fields +from whoosh.compat import u, b +from whoosh.util.testing import TempIndex def test_addfield(): diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 1b9780d9..ca85897f 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -6,10 +6,10 @@ # from jieba.analyse import ChineseAnalyzer -from whoosh_reloaded import analysis, highlight, fields, qparser, query -from whoosh_reloaded.compat import u -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.util.testing import TempIndex +from whoosh import analysis, highlight, fields, qparser, query +from whoosh.compat import u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex _doc = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " + "kilo lima") diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 4457b0ae..5820e7bd 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -5,12 +5,12 @@ import pytest -from whoosh_reloaded import analysis, fields, index, qparser, query, __version__ -from whoosh_reloaded.compat import b, u, xrange, text_type, permutations -from whoosh_reloaded.filedb.filestore import 
RamStorage -from whoosh_reloaded.writing import IndexingError -from whoosh_reloaded.util.numeric import length_to_byte, byte_to_length -from whoosh_reloaded.util.testing import TempIndex, TempStorage +from whoosh import analysis, fields, index, qparser, query, __version__ +from whoosh.compat import b, u, range, text_type, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.writing import IndexingError +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempIndex, TempStorage def test_creation(): @@ -98,7 +98,7 @@ def test_simple_indexing(): docs = defaultdict(list) with TempIndex(schema, "simple") as ix: with ix.writer() as w: - for i in xrange(100): + for i in range(100): smp = random.sample(domain, 5) for word in smp: docs[word].append(i) @@ -150,9 +150,9 @@ def test_lengths(): w.commit() with ix.reader() as dr: - ls1 = [dr.doc_field_length(i, "f1") for i in xrange(0, len(lengths))] + ls1 = [dr.doc_field_length(i, "f1") for i in range(0, len(lengths))] assert ls1 == [0] * len(lengths) - ls2 = [dr.doc_field_length(i, "f2") for i in xrange(0, len(lengths))] + ls2 = [dr.doc_field_length(i, "f2") for i in range(0, len(lengths))] assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths] @@ -163,7 +163,7 @@ def test_many_lengths(): w = ix.writer() for i, word in enumerate(domain): length = (i + 1) ** 6 - w.add_document(text=" ".join(word for _ in xrange(length))) + w.add_document(text=" ".join(word for _ in range(length))) w.commit() s = ix.searcher() @@ -481,7 +481,7 @@ def test_noscorables1(): schema = fields.Schema(id=fields.ID, tags=fields.KEYWORD) with TempIndex(schema, "noscorables1") as ix: w = ix.writer() - for _ in xrange(times): + for _ in range(times): w.add_document( id=choice(values), tags=u(" ").join(sample(values, randint(2, 7))) ) @@ -560,7 +560,7 @@ def test_deleteall(): # This is just a test, don't use this method to delete all docs IRL! doccount = ix.doc_count_all() w = ix.writer() - for docnum in xrange(doccount): + for docnum in range(doccount): w.delete_document(docnum) w.commit() diff --git a/tests/test_matching.py b/tests/test_matching.py index 221853e4..06358f1e 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,12 +1,12 @@ from __future__ import with_statement from random import randint, choice, sample -from whoosh_reloaded import fields, matching, qparser, query -from whoosh_reloaded.compat import b, u, xrange, permutations -from whoosh_reloaded.filedb.filestore import RamStorage -from whoosh_reloaded.query import And, Term -from whoosh_reloaded.util import make_binary_tree -from whoosh_reloaded.scoring import WeightScorer +from whoosh import fields, matching, qparser, query +from whoosh.compat import b, u, range, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.query import And, Term +from whoosh.util import make_binary_tree +from whoosh.scoring import WeightScorer def _keys(searcher, docnums): @@ -40,7 +40,7 @@ def test_listmatcher(): assert ls == [9, 10] lm = matching.ListMatcher(ids) - for _ in xrange(3): + for _ in range(3): lm.next() lm = lm.copy() ls = [] @@ -307,12 +307,12 @@ def test_random_intersections(): # Create docsperseg * segments documents containing random words from # the domain list. 
Add the documents to the index, but also keep them # in the "documents" list for the sanity check - for i in xrange(segments): + for i in range(segments): w = ix.writer() - for j in xrange(docsperseg): + for j in range(docsperseg): docnum = i * docsperseg + j # Create a string of random words - doc = u(" ").join(choice(domain) for _ in xrange(randint(*fieldlimits))) + doc = u(" ").join(choice(domain) for _ in range(randint(*fieldlimits))) # Add the string to the index w.add_document(key=docnum, value=doc) # Add a (docnum, string) tuple to the documents list @@ -324,10 +324,10 @@ def test_random_intersections(): testlimits = (2, 5) with ix.searcher() as s: - for i in xrange(s.doc_count_all()): + for i in range(s.doc_count_all()): assert s.stored_fields(i).get("key") is not None - for _ in xrange(testcount): + for _ in range(testcount): # Create a random list of words and manually do an intersection of # items in "documents" that contain the words ("target"). words = sample(domain, randint(*testlimits)) @@ -388,10 +388,10 @@ def test_random_union(): vals = list(range(100)) - for _ in xrange(testcount): + for _ in range(testcount): target = set() matchers = [] - for _ in xrange(randint(*clauselimits)): + for _ in range(randint(*clauselimits)): nums = sample(vals, randint(*rangelimits)) target = target.union(nums) matchers.append(matching.ListMatcher(sorted(nums))) @@ -442,7 +442,7 @@ def test_random_andnot(): rng = list(range(rangesize)) - for _ in xrange(testcount): + for _ in range(testcount): negs = sorted(sample(rng, randint(0, rangesize - 1))) negset = frozenset(negs) matched = [n for n in rng if n not in negset] @@ -510,7 +510,7 @@ def test_exclusion(): with ix.writer() as w: # Make 39 documents with dates != dt1 and then make a last document # with feed == dt1. 
- for i in xrange(40): + for i in range(40): w.add_document(id=u(str(i)), date=(dt2 if i >= 1 else dt1)) with ix.searcher() as s: diff --git a/tests/test_misc.py b/tests/test_misc.py index d964fdd7..fe1bf151 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,14 +1,14 @@ from __future__ import with_statement import os, threading, time -from whoosh_reloaded.compat import u -from whoosh_reloaded.util.filelock import try_for -from whoosh_reloaded.util.numeric import length_to_byte, byte_to_length -from whoosh_reloaded.util.testing import TempStorage +from whoosh.compat import u +from whoosh.util.filelock import try_for +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempStorage def test_now(): - from whoosh_reloaded.util import now + from whoosh.util import now t1 = now() t2 = now() @@ -17,8 +17,8 @@ def test_now(): def test_storage_creation(): import tempfile, uuid - from whoosh_reloaded import fields - from whoosh_reloaded.filedb.filestore import FileStorage + from whoosh import fields + from whoosh.filedb.filestore import FileStorage schema = fields.Schema(text=fields.TEXT) uid = uuid.uuid4() @@ -39,7 +39,7 @@ def test_storage_creation(): def test_ramstorage(): - from whoosh_reloaded.filedb.filestore import RamStorage + from whoosh.filedb.filestore import RamStorage st = RamStorage() lock = st.lock("test") @@ -100,7 +100,7 @@ def test_length_byte(): def test_version_object(): - from whoosh_reloaded.util.versions import SimpleVersion as sv + from whoosh.util.versions import SimpleVersion as sv assert sv.parse("1") == sv(1) assert sv.parse("1.2") == sv(1, 2) diff --git a/tests/test_mpwriter.py b/tests/test_mpwriter.py index 9c9f1f50..d63506de 100644 --- a/tests/test_mpwriter.py +++ b/tests/test_mpwriter.py @@ -4,10 +4,10 @@ import pytest -from whoosh_reloaded import fields, query -from whoosh_reloaded.compat import u, izip, xrange, permutations, text_type -from whoosh_reloaded.util.numeric import length_to_byte, byte_to_length -from whoosh_reloaded.util.testing import TempIndex +from whoosh import fields, query +from whoosh.compat import u, izip, range, permutations, text_type +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempIndex def check_multi(): @@ -81,7 +81,7 @@ def _do_basic(writerclass): # Check there are lengths total = sum( r.doc_field_length(docnum, "text", 0) - for docnum in xrange(r.doc_count_all()) + for docnum in range(r.doc_count_all()) ) assert total > 0 @@ -108,21 +108,21 @@ def _do_basic(writerclass): def test_basic_serial(): check_multi() - from whoosh_reloaded.multiproc import SerialMpWriter + from whoosh.multiproc import SerialMpWriter _do_basic(SerialMpWriter) def test_basic_multi(): check_multi() - from whoosh_reloaded.multiproc import MpWriter + from whoosh.multiproc import MpWriter _do_basic(MpWriter) def test_no_add(): check_multi() - from whoosh_reloaded.multiproc import MpWriter + from whoosh.multiproc import MpWriter schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, vector=True)) with TempIndex(schema) as ix: @@ -207,21 +207,21 @@ def _do_merge(writerclass): def test_merge_serial(): check_multi() - from whoosh_reloaded.multiproc import SerialMpWriter + from whoosh.multiproc import SerialMpWriter _do_merge(SerialMpWriter) def test_merge_multi(): check_multi() - from whoosh_reloaded.multiproc import MpWriter + from whoosh.multiproc import MpWriter _do_merge(MpWriter) def test_no_score_no_store(): check_multi() - from 
diff --git a/tests/test_mpwriter.py b/tests/test_mpwriter.py
index 9c9f1f50..d63506de 100644
--- a/tests/test_mpwriter.py
+++ b/tests/test_mpwriter.py
@@ -4,10 +4,10 @@
 
 import pytest
 
-from whoosh_reloaded import fields, query
-from whoosh_reloaded.compat import u, izip, xrange, permutations, text_type
-from whoosh_reloaded.util.numeric import length_to_byte, byte_to_length
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import fields, query
+from whoosh.compat import u, izip, range, permutations, text_type
+from whoosh.util.numeric import length_to_byte, byte_to_length
+from whoosh.util.testing import TempIndex
 
 
 def check_multi():
@@ -81,7 +81,7 @@ def _do_basic(writerclass):
     # Check there are lengths
     total = sum(
         r.doc_field_length(docnum, "text", 0)
-        for docnum in xrange(r.doc_count_all())
+        for docnum in range(r.doc_count_all())
     )
     assert total > 0
 
@@ -108,21 +108,21 @@ def _do_basic(writerclass):
 
 def test_basic_serial():
     check_multi()
-    from whoosh_reloaded.multiproc import SerialMpWriter
+    from whoosh.multiproc import SerialMpWriter
 
     _do_basic(SerialMpWriter)
 
 
 def test_basic_multi():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     _do_basic(MpWriter)
 
 
 def test_no_add():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, vector=True))
     with TempIndex(schema) as ix:
@@ -207,21 +207,21 @@ def _do_merge(writerclass):
 
 def test_merge_serial():
     check_multi()
-    from whoosh_reloaded.multiproc import SerialMpWriter
+    from whoosh.multiproc import SerialMpWriter
 
     _do_merge(SerialMpWriter)
 
 
 def test_merge_multi():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     _do_merge(MpWriter)
 
 
 def test_no_score_no_store():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     schema = fields.Schema(a=fields.ID, b=fields.KEYWORD)
     domain = {}
@@ -244,7 +244,7 @@ def test_no_score_no_store():
 
 def test_multisegment():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     schema = fields.Schema(a=fields.TEXT(stored=True, spelling=True, vector=True))
     words = u("alfa bravo charlie delta echo").split()
@@ -270,14 +270,14 @@ def test_batchsize_eq_doccount():
     schema = fields.Schema(a=fields.KEYWORD(stored=True))
     with TempIndex(schema) as ix:
         with ix.writer(procs=4, batchsize=10) as w:
-            for i in xrange(10):
+            for i in range(10):
                 w.add_document(a=u(str(i)))
 
 
 def test_finish_segment():
     check_multi()
-    from whoosh_reloaded.multiproc import MpWriter
+    from whoosh.multiproc import MpWriter
 
     schema = fields.Schema(a=fields.KEYWORD(stored=True))
     with TempIndex(schema) as ix:
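The multiprocessing writer tested here is normally reached through ix.writer(procs=..., batchsize=...), as in test_batchsize_eq_doccount above. A minimal sketch; the index name "mpexample" and the field are invented for illustration:

    from whoosh import fields
    from whoosh.util.testing import TempIndex

    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema, "mpexample") as ix:
        # procs > 1 selects MpWriter under the hood; batchsize sets how
        # many documents are handed to each worker process at a time
        with ix.writer(procs=4, batchsize=10) as w:
            for i in range(10):
                w.add_document(a=str(i))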
diff --git a/tests/test_nested.py b/tests/test_nested.py
index 9ba1984a..4da5274d 100644
--- a/tests/test_nested.py
+++ b/tests/test_nested.py
@@ -1,9 +1,9 @@
 from __future__ import with_statement
 
-from whoosh_reloaded import fields, query, sorting
-from whoosh_reloaded.compat import u
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import fields, query, sorting
+from whoosh.compat import u
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def test_nested_parent():
@@ -285,7 +285,7 @@ def test_no_parents():
 
 
 def test_parent_score_fn():
-    from whoosh_reloaded.scoring import Frequency
+    from whoosh.scoring import Frequency
 
     schema = fields.Schema(
         name=fields.ID(unique=True, stored=True), keys=fields.TEXT, type=fields.ID
diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py
index 72b612e1..706e8a11 100644
--- a/tests/test_parse_plugins.py
+++ b/tests/test_parse_plugins.py
@@ -2,15 +2,15 @@
 import inspect
 from datetime import datetime
 
-from whoosh_reloaded import analysis, fields, formats, qparser, query
-from whoosh_reloaded.compat import u, text_type, xrange
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.qparser import dateparse, default, plugins, syntax
-from whoosh_reloaded.util.times import adatetime
+from whoosh import analysis, fields, formats, qparser, query
+from whoosh.compat import u, text_type, range
+from whoosh.filedb.filestore import RamStorage
+from whoosh.qparser import dateparse, default, plugins, syntax
+from whoosh.util.times import adatetime
 
 
 def _plugin_classes(ignore):
-    # Get all the subclasses of Plugin in whoosh_reloaded.qparser.plugins
+    # Get all the subclasses of Plugin in whoosh.qparser.plugins
     return [
         c
         for _, c in inspect.getmembers(plugins, inspect.isclass)
@@ -43,7 +43,7 @@ def test_combos():
     count = 0
     for i, first in enumerate(pis):
-        for j in xrange(len(pis)):
+        for j in range(len(pis)):
             if i == j:
                 continue
             plist = [p for p in pis[:j] if p is not first] + [first]
@@ -479,7 +479,7 @@ def test_fuzzy_plugin():
 
 
 def test_fuzzy_prefix():
-    from whoosh_reloaded import scoring
+    from whoosh import scoring
 
     schema = fields.Schema(
         title=fields.TEXT(stored=True), content=fields.TEXT(spelling=True)
@@ -506,7 +506,7 @@
         # Match -> fire is within 2 edits (transpose + delete) of first
         w.add_document(title=u("Fifth"), content=u("The fire is beautiful"))
 
-    from whoosh_reloaded.qparser import QueryParser, FuzzyTermPlugin
+    from whoosh.qparser import QueryParser, FuzzyTermPlugin
 
     parser = QueryParser("content", ix.schema)
     parser.add_plugin(FuzzyTermPlugin())
@@ -600,7 +600,7 @@ def check(qstring, target):
 
 
 def test_function_first():
-    from whoosh_reloaded.query.spans import SpanFirst
+    from whoosh.query.spans import SpanFirst
 
     def make_first(qs):
         return SpanFirst(qs[0])
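test_fuzzy_prefix wires FuzzyTermPlugin into a parser. Once the plugin is added, a trailing ~ marks a term as fuzzy and an optional digit sets the maximum edit distance. A sketch reusing the ix built in that test; the query string is invented:

    from whoosh.qparser import QueryParser, FuzzyTermPlugin

    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())

    # "first~2" also matches terms within two edits, e.g. "fire"
    q = parser.parse("first~2")
    with ix.searcher() as s:
        results = s.search(q)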
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
index 735d5697..b1f9f799 100644
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -1,6 +1,6 @@
-from whoosh_reloaded import analysis, fields, query
-from whoosh_reloaded.compat import u, text_type
-from whoosh_reloaded.qparser import default, plugins
+from whoosh import analysis, fields, query
+from whoosh.compat import u, text_type
+from whoosh.qparser import default, plugins
 
 
 def test_whitespace():
@@ -1115,7 +1115,7 @@ def test_quoted_prefix():
 
 
 def test_multitoken_with_factory():
-    from whoosh_reloaded.qparser.syntax import OrGroup
+    from whoosh.qparser.syntax import OrGroup
 
     schema = fields.Schema(title=fields.TEXT)
diff --git a/tests/test_postings.py b/tests/test_postings.py
index edec726c..f586e1d8 100644
--- a/tests/test_postings.py
+++ b/tests/test_postings.py
@@ -1,12 +1,12 @@
 from __future__ import with_statement
 
-from whoosh_reloaded import analysis, fields
-from whoosh_reloaded.compat import u
-from whoosh_reloaded.codec import default_codec
-from whoosh_reloaded.formats import Existence, Frequency
-from whoosh_reloaded.formats import Positions, PositionBoosts
-from whoosh_reloaded.formats import Characters, CharacterBoosts
-from whoosh_reloaded.util.testing import TempStorage
+from whoosh import analysis, fields
+from whoosh.compat import u
+from whoosh.codec import default_codec
+from whoosh.formats import Existence, Frequency
+from whoosh.formats import Positions, PositionBoosts
+from whoosh.formats import Characters, CharacterBoosts
+from whoosh.util.testing import TempStorage
 
 
 def _roundtrip(content, format_, astype, ana=None):
diff --git a/tests/test_quality.py b/tests/test_quality.py
index 389c31f0..e051bd2f 100644
--- a/tests/test_quality.py
+++ b/tests/test_quality.py
@@ -1,10 +1,10 @@
 from __future__ import with_statement
 import random
 
-from whoosh_reloaded import fields, matching, scoring
-from whoosh_reloaded.compat import u, xrange
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.numeric import length_to_byte, byte_to_length
+from whoosh import fields, matching, scoring
+from whoosh.compat import u, range
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.numeric import length_to_byte, byte_to_length
 
 
 def _discreet(length):
@@ -15,7 +15,7 @@ def test_max_field_length():
     st = RamStorage()
     schema = fields.Schema(t=fields.TEXT)
     ix = st.create_index(schema)
-    for i in xrange(1, 200, 7):
+    for i in range(1, 200, 7):
         w = ix.writer()
         w.add_document(t=u(" ").join(["word"] * i))
         w.commit()
@@ -30,7 +30,7 @@ def test_minmax_field_length():
     ix = st.create_index(schema)
     least = 999999
     most = 0
-    for _ in xrange(1, 200, 7):
+    for _ in range(1, 200, 7):
         w = ix.writer()
         count = random.randint(1, 100)
         least = min(count, least)
diff --git a/tests/test_queries.py b/tests/test_queries.py
index 8c57d7a3..3e5f34e7 100644
--- a/tests/test_queries.py
+++ b/tests/test_queries.py
@@ -3,34 +3,34 @@
 
 import pytest
 
-from whoosh_reloaded import fields, qparser, query
-from whoosh_reloaded.compat import b, u
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.qparser import QueryParser
-from whoosh_reloaded.query import And
-from whoosh_reloaded.query import AndMaybe
-from whoosh_reloaded.query import ConstantScoreQuery
-from whoosh_reloaded.query import DateRange
-from whoosh_reloaded.query import DisjunctionMax
-from whoosh_reloaded.query import Every
-from whoosh_reloaded.query import FuzzyTerm
-from whoosh_reloaded.query import Not
-from whoosh_reloaded.query import NullQuery
-from whoosh_reloaded.query import NumericRange
-from whoosh_reloaded.query import Or
-from whoosh_reloaded.query import Phrase
-from whoosh_reloaded.query import Prefix
-from whoosh_reloaded.query import Require
-from whoosh_reloaded.query import Term
-from whoosh_reloaded.query import TermRange
-from whoosh_reloaded.query import Variations
-from whoosh_reloaded.query import Wildcard
-from whoosh_reloaded.query.spans import SpanContains
-from whoosh_reloaded.query.spans import SpanFirst
-from whoosh_reloaded.query.spans import SpanNear
-from whoosh_reloaded.query.spans import SpanNot
-from whoosh_reloaded.query.spans import SpanOr
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import fields, qparser, query
+from whoosh.compat import b, u
+from whoosh.filedb.filestore import RamStorage
+from whoosh.qparser import QueryParser
+from whoosh.query import And
+from whoosh.query import AndMaybe
+from whoosh.query import ConstantScoreQuery
+from whoosh.query import DateRange
+from whoosh.query import DisjunctionMax
+from whoosh.query import Every
+from whoosh.query import FuzzyTerm
+from whoosh.query import Not
+from whoosh.query import NullQuery
+from whoosh.query import NumericRange
+from whoosh.query import Or
+from whoosh.query import Phrase
+from whoosh.query import Prefix
+from whoosh.query import Require
+from whoosh.query import Term
+from whoosh.query import TermRange
+from whoosh.query import Variations
+from whoosh.query import Wildcard
+from whoosh.query.spans import SpanContains
+from whoosh.query.spans import SpanFirst
+from whoosh.query.spans import SpanNear
+from whoosh.query.spans import SpanNot
+from whoosh.query.spans import SpanOr
+from whoosh.util.testing import TempIndex
 
 
 def test_all_terms():
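The mechanical change running through all of these files is that whoosh.compat now exports a plain range alias where it used to export xrange. As a rough illustration only (this is a sketch of the usual py2/py3 shim pattern, not the actual whoosh.compat source):

    import sys

    if sys.version_info[0] >= 3:
        range = range        # the builtin is already lazy on Python 3
        text_type = str
    else:                    # legacy Python 2
        range = xrange       # noqa: F821
        text_type = unicode  # noqa: F821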
diff --git a/tests/test_reading.py b/tests/test_reading.py
index 4f79684c..0177f4b5 100644
--- a/tests/test_reading.py
+++ b/tests/test_reading.py
@@ -3,12 +3,12 @@
 import random, threading, time
 
 import pytest
 
-from whoosh_reloaded import fields, formats, reading
+from whoosh import fields, formats, reading
 
-from whoosh_reloaded.compat import b, u, xrange
-from whoosh_reloaded.reading import SegmentReader
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh.compat import b, u, range
+from whoosh.reading import SegmentReader
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def _create_index():
@@ -343,7 +343,7 @@ def __init__(self, ix):
         self.ix = ix
 
     def run(self):
-        for _ in xrange(50):
+        for _ in range(50):
             r = self.ix.reader()
             r.close()
@@ -357,7 +357,7 @@ def __init__(self, ix):
         self.ix = ix
 
     def run(self):
-        for _ in xrange(10):
+        for _ in range(10):
             w = self.ix.writer()
             w.add_document(text=random.sample(self.domain, 4))
             w.commit()
@@ -384,7 +384,7 @@ def test_nonexclusive_read():
             w.commit(merge=False)
 
     def fn():
-        for _ in xrange(5):
+        for _ in range(5):
             r = ix.reader()
             assert list(r.field_terms("text")) == [
                 "document",
@@ -397,7 +397,7 @@ def fn():
             ]
             r.close()
 
-    ths = [threading.Thread(target=fn) for _ in xrange(5)]
+    ths = [threading.Thread(target=fn) for _ in range(5)]
     for th in ths:
         th.start()
     for th in ths:
@@ -408,7 +408,7 @@ def test_doc_count():
     schema = fields.Schema(id=fields.NUMERIC)
     ix = RamStorage().create_index(schema)
     with ix.writer() as w:
-        for i in xrange(10):
+        for i in range(10):
             w.add_document(id=i)
 
     r = ix.reader()
@@ -427,7 +427,7 @@ def test_doc_count():
     assert r.doc_count_all() == 10
 
     w = ix.writer()
-    for i in xrange(10, 15):
+    for i in range(10, 15):
         w.add_document(id=i)
     w.commit(merge=False)
 
@@ -452,7 +452,7 @@ def test_doc_count():
 
 def test_reader_subclasses():
-    from whoosh_reloaded.util.testing import check_abstract_methods
+    from whoosh.util.testing import check_abstract_methods
 
     check_abstract_methods(reading.IndexReader, SegmentReader)
     check_abstract_methods(reading.IndexReader, reading.MultiReader)
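test_doc_count relies on the difference between doc_count() and doc_count_all(): deleting by term only marks documents as deleted until segments are merged, so the two counts diverge. A small sketch of that behaviour (schema and values arbitrary):

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(id=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in range(10):
            w.add_document(id=i)

    with ix.writer() as w:
        w.delete_by_term("id", 5)   # marks the document deleted, no rewrite

    r = ix.reader()
    assert r.doc_count() == 9       # live documents only
    assert r.doc_count_all() == 10  # includes the deleted one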
diff --git a/tests/test_results.py b/tests/test_results.py
index 4daa05af..b98f39c1 100644
--- a/tests/test_results.py
+++ b/tests/test_results.py
@@ -2,11 +2,11 @@
 
 import pytest
 
-from whoosh_reloaded import analysis, fields, formats, highlight, qparser, query
-from whoosh_reloaded.codec.whoosh_reloaded3 import W3Codec
-from whoosh_reloaded.compat import u, xrange, text_type, permutations
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempStorage, TempIndex
+from whoosh import analysis, fields, formats, highlight, qparser, query
+from whoosh.codec.whoosh3 import W3Codec
+from whoosh.compat import u, range, text_type, permutations
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempStorage, TempIndex
 
 
 def test_score_retrieval():
@@ -135,7 +135,7 @@ def check(r, target):
 
 
 def test_sorted_extend():
-    from whoosh_reloaded import sorting
+    from whoosh import sorting
 
     schema = fields.Schema(
         title=fields.TEXT(stored=True),
@@ -232,7 +232,7 @@ def test_extend_filtered():
 
 
 def test_pages():
-    from whoosh_reloaded.scoring import Frequency
+    from whoosh.scoring import Frequency
 
     schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT)
     ix = RamStorage().create_index(schema)
@@ -260,7 +260,7 @@ def test_pages():
 
 
 def test_pages_with_filter():
-    from whoosh_reloaded.scoring import Frequency
+    from whoosh.scoring import Frequency
 
     schema = fields.Schema(id=fields.ID(stored=True), type=fields.TEXT(), c=fields.TEXT)
     ix = RamStorage().create_index(schema)
@@ -297,14 +297,14 @@ def test_extra_slice():
 
 
 def test_page_counts():
-    from whoosh_reloaded.scoring import Frequency
+    from whoosh.scoring import Frequency
 
     schema = fields.Schema(id=fields.ID(stored=True))
     st = RamStorage()
     ix = st.create_index(schema)
 
     w = ix.writer()
-    for i in xrange(10):
+    for i in range(10):
         w.add_document(id=text_type(i))
     w.commit()
@@ -494,7 +494,7 @@ def test_lengths2():
     schema = fields.Schema(text=fields.TEXT(stored=True))
     ix = RamStorage().create_index(schema)
     count = 0
-    for _ in xrange(3):
+    for _ in range(3):
         w = ix.writer()
         for ls in permutations(u("alfa bravo charlie").split()):
             if "bravo" in ls and "charlie" in ls:
@@ -523,7 +523,7 @@ def test_stability():
     with ix.searcher() as s:
         q = query.Term("text", u("bravo"))
         last = []
-        for i in xrange(s.doc_frequency("text", u("bravo"))):
+        for i in range(s.doc_frequency("text", u("bravo"))):
             # Only un-optimized results are stable
             r = s.search(q, limit=i + 1, optimize=False)
             docnums = [hit.docnum for hit in r]
@@ -585,7 +585,7 @@ def test_hit_column():
 
 
 def test_closed_searcher():
-    from whoosh_reloaded.reading import ReaderClosed
+    from whoosh.reading import ReaderClosed
 
     schema = fields.Schema(key=fields.KEYWORD(stored=True, sortable=True))
@@ -683,7 +683,7 @@ def test_filter_by_result():
     with TempIndex(schema, "filter") as ix:
         words = u("foo bar baz qux barney").split()
         with ix.writer() as w:
-            for x in xrange(100):
+            for x in range(100):
                 t = u("even" if x % 2 == 0 else "odd")
                 c = words[x % len(words)]
                 w.add_document(title=t, content=c)
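The paging tests above go through Searcher.search_page, which wraps a normal search in a ResultsPage. A short sketch, reusing an ix like the one built in test_pages; the query and page size are invented:

    from whoosh import qparser

    with ix.searcher() as s:
        q = qparser.QueryParser("c", ix.schema).parse("alfa")
        page = s.search_page(q, 2, pagelen=5)  # second page, 5 hits per page
        print(page.pagenum, "of", page.pagecount, "- total hits:", page.total)
        for hit in page:
            print(hit["id"])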
diff --git a/tests/test_searching.py b/tests/test_searching.py
index b976facb..b140aa56 100644
--- a/tests/test_searching.py
+++ b/tests/test_searching.py
@@ -6,12 +6,12 @@
 
 import pytest
 
-from whoosh_reloaded import analysis, fields, index, qparser, query, scoring
-from whoosh_reloaded.codec.whoosh_reloaded3 import W3Codec
-from whoosh_reloaded.compat import b, u, text_type
-from whoosh_reloaded.compat import xrange, permutations, izip_longest
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import analysis, fields, index, qparser, query, scoring
+from whoosh.codec.whoosh3 import W3Codec
+from whoosh.compat import b, u, text_type
+from whoosh.compat import range, permutations, izip_longest
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def make_index():
@@ -112,7 +112,7 @@ def test_ors():
     with ix.searcher() as s:
         qs = [query.Term("text", word) for word in domain]
-        for i in xrange(1, len(domain)):
+        for i in range(1, len(domain)):
             q = query.Or(qs[:i])
 
             r1 = [(hit.docnum, hit.score) for hit in s.search(q, limit=None)]
@@ -372,7 +372,7 @@ def test_open_numeric_ranges():
 
 def test_open_date_ranges():
     basedate = datetime(2011, 1, 24, 6, 25, 0, 0)
-    domain = [basedate + timedelta(days=n) for n in xrange(-20, 20)]
+    domain = [basedate + timedelta(days=n) for n in range(-20, 20)]
 
     schema = fields.Schema(date=fields.DATETIME(stored=True))
     ix = RamStorage().create_index(schema)
@@ -397,7 +397,7 @@ def test_open_date_ranges():
         assert r == target
 
         # With date parser
-        from whoosh_reloaded.qparser.dateparse import DateParserPlugin
+        from whoosh.qparser.dateparse import DateParserPlugin
 
         qp.add_plugin(DateParserPlugin(basedate))
@@ -747,7 +747,7 @@ def test_short_prefix():
 
 
 def test_weighting():
-    from whoosh_reloaded.scoring import Weighting, BaseScorer
+    from whoosh.scoring import Weighting, BaseScorer
 
     schema = fields.Schema(id=fields.ID(stored=True), n_comments=fields.STORED)
     st = RamStorage()
@@ -861,7 +861,7 @@ def test_missing_wildcard():
 
 
 def test_finalweighting():
-    from whoosh_reloaded.scoring import Frequency
+    from whoosh.scoring import Frequency
 
     schema = fields.Schema(
         id=fields.ID(stored=True), summary=fields.TEXT, n_comments=fields.STORED
@@ -1311,7 +1311,7 @@ def pos_score_fn(searcher, fieldname, text, matcher):
 #     schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
 #     ix = RamStorage().create_index(schema)
 #     with ix.writer() as w:
-#         for i in xrange(200):
+#         for i in range(200):
 #             text = u("a%s" % i)
 #             w.add_document(id=i, text=text)
 #
@@ -1322,11 +1322,11 @@ def pos_score_fn(searcher, fieldname, text, matcher):
 #         m = q.matcher(s)
 #         assert m.supports("positions")
 #         items = list(m.items_as("positions"))
-#         assert [(i, [0]) for i in xrange(200)] == items
+#         assert [(i, [0]) for i in range(200)] == items
 
 
 def test_collapse():
-    from whoosh_reloaded import collectors
+    from whoosh import collectors
 
     # id, text, size, tag
     domain = [
@@ -1379,7 +1379,7 @@ def test_collapse():
 
 
 def test_collapse_nocolumn():
-    from whoosh_reloaded import collectors
+    from whoosh import collectors
 
     # id, text, size, tag
     domain = [
@@ -1501,7 +1501,7 @@ def check(r):
 
 
 def test_collapse_order():
-    from whoosh_reloaded import sorting
+    from whoosh import sorting
 
     schema = fields.Schema(
         id=fields.STORED,
@@ -1540,7 +1540,7 @@ def check(kwargs, target):
 
 
 def test_collapse_order_nocolumn():
-    from whoosh_reloaded import sorting
+    from whoosh import sorting
 
     schema = fields.Schema(
         id=fields.STORED,
@@ -1579,7 +1579,7 @@ def check(kwargs, target):
 
 
 def test_coord():
-    from whoosh_reloaded.matching import CoordMatcher
+    from whoosh.matching import CoordMatcher
 
     schema = fields.Schema(id=fields.STORED, hits=fields.STORED, tags=fields.KEYWORD)
     ix = RamStorage().create_index(schema)
@@ -1646,7 +1646,7 @@ def test_groupedby_with_terms():
 
 
 def test_buffered_refresh():
-    from whoosh_reloaded import writing
+    from whoosh import writing
 
     schema = fields.Schema(foo=fields.ID())
     ix = RamStorage().create_index(schema)
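test_open_date_ranges adds DateParserPlugin, which lets the query language accept free-form date expressions resolved against a base date. A sketch, assuming an ix whose schema has a DATETIME field named date:

    from datetime import datetime
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    qp = QueryParser("date", ix.schema)
    qp.add_plugin(DateParserPlugin(datetime(2011, 1, 24)))

    # The plugin parses the quoted expression into a DateRange query
    q = qp.parse("date:'last tuesday'")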
diff --git a/tests/test_sorting.py b/tests/test_sorting.py
index 706a6894..69881e9b 100644
--- a/tests/test_sorting.py
+++ b/tests/test_sorting.py
@@ -2,11 +2,11 @@
 from datetime import datetime, timedelta
 import random
 
-from whoosh_reloaded import fields, query, sorting, columns
-from whoosh_reloaded.compat import u
-from whoosh_reloaded.compat import permutations, xrange
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import fields, query, sorting, columns
+from whoosh.compat import u
+from whoosh.compat import permutations, range
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 try:
@@ -60,7 +60,7 @@ def make_single_index(ix):
 
 
 def make_multi_index(ix):
-    for i in xrange(0, len(docs), 3):
+    for i in range(0, len(docs), 3):
         w = ix.writer()
         for doc in docs[i : i + 3]:
             w.add_document(ev=u("a"), **doc)
@@ -418,7 +418,7 @@ def test_daterange_facet():
 
 
 def test_relative_daterange():
-    from whoosh_reloaded.support.relativedelta import relativedelta
+    from whoosh.support.relativedelta import relativedelta
 
     dt = datetime
@@ -572,7 +572,7 @@ def test_sort_filter():
     groups = u("alfa bravo charlie").split()
     keys = u("abcdefghijklmnopqrstuvwxyz")
     source = []
-    for i in xrange(100):
+    for i in range(100):
         key = keys[i % len(keys)]
         group = groups[i % len(groups)]
         source.append({"key": key, "group": group})
@@ -991,7 +991,7 @@ def test_add_sortable():
 
 
 def test_missing_column():
-    from whoosh_reloaded import collectors
+    from whoosh import collectors
 
     schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
     ix = RamStorage().create_index(schema)
@@ -1045,7 +1045,7 @@ def test_compound_sort():
     assert all(len(ls) == 10 for ls in (alist, blist, clist))
 
     with ix.writer() as w:
-        for i in xrange(10):
+        for i in range(10):
             w.add_document(a=alist[i], b=blist[i], c=clist[i])
 
     with ix.searcher() as s:
@@ -1076,7 +1076,7 @@ def test_compound_sort():
 
 
 def test_column_scoring():
-    from whoosh_reloaded import scoring
+    from whoosh import scoring
 
     # "sortable=True" on the "id" field tells it to build a column store
     # of field values. If you didn't ever need to actually search on this field,
diff --git a/tests/test_spans.py b/tests/test_spans.py
index 1828a572..4587afe1 100644
--- a/tests/test_spans.py
+++ b/tests/test_spans.py
@@ -1,11 +1,11 @@
 from __future__ import with_statement
 
-from whoosh_reloaded import analysis, fields, formats
-from whoosh_reloaded.compat import u, xrange, permutations
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.query import spans
-from whoosh_reloaded.query import And, Or, Term, Phrase
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import analysis, fields, formats
+from whoosh.compat import u, range, permutations
+from whoosh.filedb.filestore import RamStorage
+from whoosh.query import spans
+from whoosh.query import And, Or, Term, Phrase
+from whoosh.util.testing import TempIndex
 
 
 domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo")
@@ -39,7 +39,7 @@ def test_multimatcher():
 
     domain = ("alfa", "bravo", "charlie", "delta")
 
-    for _ in xrange(3):
+    for _ in range(3):
         w = ix.writer()
         for ls in permutations(domain):
             w.add_document(content=u(" ").join(ls))
@@ -62,7 +62,7 @@ def test_excludematcher():
 
     domain = ("alfa", "bravo", "charlie", "delta")
 
-    for _ in xrange(3):
+    for _ in range(3):
         w = ix.writer()
         for ls in permutations(domain):
             w.add_document(content=u(" ").join(ls))
diff --git a/tests/test_spelling.py b/tests/test_spelling.py
index 5ca5a9cd..93878126 100644
--- a/tests/test_spelling.py
+++ b/tests/test_spelling.py
@@ -1,11 +1,11 @@
 from __future__ import with_statement
 import gzip
 
-from whoosh_reloaded import analysis, fields, highlight, query, spelling
-from whoosh_reloaded.compat import u
-from whoosh_reloaded.qparser import QueryParser
-from whoosh_reloaded.support.levenshtein import levenshtein
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import analysis, fields, highlight, query, spelling
+from whoosh.compat import u
+from whoosh.qparser import QueryParser
+from whoosh.support.levenshtein import levenshtein
+from whoosh.util.testing import TempIndex
 
 
 _wordlist = sorted(
@@ -297,7 +297,7 @@ def test_prefix_address():
 
 
 def test_correct_correct():
-    from whoosh_reloaded import qparser
+    from whoosh import qparser
 
     schema = fields.Schema(a=fields.TEXT())
     with TempIndex(schema) as ix:
@@ -357,7 +357,7 @@ def test_very_long_words():
 #     assert not r.has_word_graph("text1")
 #     assert not r.has_word_graph("text2")
 #
-#     from whoosh_reloaded.writing import add_spelling
+#     from whoosh.writing import add_spelling
 #     add_spelling(ix, ["text1", "text2"])
 #
 #     with ix.reader() as r:
diff --git a/tests/test_stem.py b/tests/test_stem.py
index 7ec95f2d..cbfa274b 100644
--- a/tests/test_stem.py
+++ b/tests/test_stem.py
@@ -1,7 +1,7 @@
-from whoosh_reloaded.lang.snowball.english import EnglishStemmer
-from whoosh_reloaded.lang.snowball.french import FrenchStemmer
-from whoosh_reloaded.lang.snowball.finnish import FinnishStemmer
-from whoosh_reloaded.lang.snowball.spanish import SpanishStemmer
+from whoosh.lang.snowball.english import EnglishStemmer
+from whoosh.lang.snowball.french import FrenchStemmer
+from whoosh.lang.snowball.finnish import FinnishStemmer
+from whoosh.lang.snowball.spanish import SpanishStemmer
 
 
 def test_english():
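test_spelling drives the suggestion machinery, whose public face is Searcher.corrector() over an indexed field. A sketch with invented content:

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text="render rendering renders")

    with ix.searcher() as s:
        corrector = s.corrector("text")
        print(corrector.suggest("rendar", limit=2))  # e.g. ["render", "renders"]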
diff --git a/tests/test_tables.py b/tests/test_tables.py
index ee50399b..cb0dc7d5 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -3,11 +3,11 @@
 from __future__ import with_statement
 import random
 
-from whoosh_reloaded.compat import b, xrange, iteritems
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.filedb.filetables import HashReader, HashWriter
-from whoosh_reloaded.filedb.filetables import OrderedHashWriter, OrderedHashReader
-from whoosh_reloaded.util.testing import TempStorage
+from whoosh.compat import b, range, iteritems
+from whoosh.filedb.filestore import RamStorage
+from whoosh.filedb.filetables import HashReader, HashWriter
+from whoosh.filedb.filetables import OrderedHashWriter, OrderedHashReader
+from whoosh.util.testing import TempStorage
 
 
 def test_hash_single():
@@ -99,7 +99,7 @@ def randstring():
         return b(s)
 
     with TempStorage("randomhash") as st:
-        samp = dict((randstring(), randstring()) for _ in xrange(times))
+        samp = dict((randstring(), randstring()) for _ in range(times))
 
         hw = HashWriter(st.create_file("test.hsh"))
         for k, v in iteritems(samp):
@@ -118,7 +118,7 @@ def test_random_access():
     times = 1000
     with TempStorage("orderedhash") as st:
         hw = HashWriter(st.create_file("test.hsh"))
-        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
+        hw.add_all((b("%08x" % x), b(str(x))) for x in range(times))
         hw.close()
 
         keys = list(range(times))
@@ -199,7 +199,7 @@ def test_extras():
 
 
 def test_checksum_file():
-    from whoosh_reloaded.filedb.structfile import ChecksumFile
+    from whoosh.filedb.structfile import ChecksumFile
     from zlib import crc32
 
     def wr(f):
diff --git a/tests/test_vectors.py b/tests/test_vectors.py
index 2f1e73c2..cdd13148 100644
--- a/tests/test_vectors.py
+++ b/tests/test_vectors.py
@@ -1,10 +1,10 @@
 # encoding: utf-8
 from __future__ import with_statement
 
-from whoosh_reloaded import fields, formats
-from whoosh_reloaded.compat import u
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import fields, formats
+from whoosh.compat import u
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def test_single_term():
@@ -69,7 +69,7 @@ def test_vector_merge():
 
 
 def test_vector_unicode():
-    from whoosh_reloaded import analysis
+    from whoosh import analysis
 
     cf = fields.TEXT(analyzer=analysis.RegexTokenizer(), vector=True)
     schema = fields.Schema(id=fields.NUMERIC, text=cf)
diff --git a/tests/test_weightings.py b/tests/test_weightings.py
index 030f28da..8a51440f 100644
--- a/tests/test_weightings.py
+++ b/tests/test_weightings.py
@@ -3,13 +3,13 @@
 from random import choice, randint
 import sys
 
-from whoosh_reloaded import fields, query, scoring
-from whoosh_reloaded.compat import u, xrange, permutations
-from whoosh_reloaded.filedb.filestore import RamStorage
+from whoosh import fields, query, scoring
+from whoosh.compat import u, range, permutations
+from whoosh.filedb.filestore import RamStorage
 
 
 def _weighting_classes(ignore):
-    # Get all the subclasses of Weighting in whoosh_reloaded.scoring
+    # Get all the subclasses of Weighting in whoosh.scoring
     return [
         c
         for _, c in inspect.getmembers(scoring, inspect.isclass)
@@ -23,9 +23,9 @@ def test_all():
     storage = RamStorage()
     ix = storage.create_index(schema)
     w = ix.writer()
-    for _ in xrange(100):
+    for _ in range(100):
         w.add_document(
-            text=u(" ").join(choice(domain) for _ in xrange(randint(10, 20)))
+            text=u(" ").join(choice(domain) for _ in range(randint(10, 20)))
         )
     w.commit()
@@ -59,7 +59,7 @@ def test_all():
 
 
 def test_compatibility():
-    from whoosh_reloaded.scoring import Weighting
+    from whoosh.scoring import Weighting
 
     # This is the old way of doing a custom weighting model, check that
     # it's still supported...
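test_compatibility checks that the old Weighting subclass style still works; the modern route is scoring.FunctionWeighting, which wraps a plain function of (searcher, fieldname, text, matcher). A sketch, reusing an ix and q built as in the tests above; the scoring logic itself is invented:

    from whoosh import scoring

    def my_score(searcher, fieldname, text, matcher):
        # Score each hit by its raw term weight, doubled
        return matcher.weight() * 2.0

    with ix.searcher(weighting=scoring.FunctionWeighting(my_score)) as s:
        results = s.search(q)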
diff --git a/tests/test_writing.py b/tests/test_writing.py
index 4bc56edd..de032b6e 100644
--- a/tests/test_writing.py
+++ b/tests/test_writing.py
@@ -3,10 +3,10 @@
 
 import pytest
 
-from whoosh_reloaded import analysis, fields, query, writing
-from whoosh_reloaded.compat import b, u, xrange, text_type
-from whoosh_reloaded.filedb.filestore import RamStorage
-from whoosh_reloaded.util.testing import TempIndex
+from whoosh import analysis, fields, query, writing
+from whoosh.compat import b, u, range, text_type
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.testing import TempIndex
 
 
 def test_no_stored():
@@ -25,7 +25,7 @@ def test_no_stored():
     )
 
     w = ix.writer()
-    for i in xrange(20):
+    for i in range(20):
         w.add_document(id=text_type(i), text=" ".join(random.sample(domain, 5)))
     w.commit()
@@ -52,7 +52,7 @@ def test_asyncwriter():
     # Simulate doing 20 (near-)simultaneous commits. If we weren't using
     # AsyncWriter, at least some of these would fail because the first
     # writer wouldn't be finished yet.
-    for i in xrange(20):
+    for i in range(20):
         w = writing.AsyncWriter(ix)
         writers.append(w)
         w.add_document(id=text_type(i), text=" ".join(random.sample(domain, 5)))
@@ -87,7 +87,7 @@ def test_asyncwriter_no_stored():
     # Simulate doing 20 (near-)simultaneous commits. If we weren't using
     # AsyncWriter, at least some of these would fail because the first
     # writer wouldn't be finished yet.
-    for i in xrange(20):
+    for i in range(20):
        w = writing.AsyncWriter(ix)
        writers.append(w)
        w.add_document(id=text_type(i), text=" ".join(random.sample(domain, 5)))
@@ -106,7 +106,7 @@ def test_asyncwriter_no_stored():
 
 def test_updates():
     schema = fields.Schema(id=fields.ID(unique=True, stored=True))
     ix = RamStorage().create_index(schema)
-    for _ in xrange(10):
+    for _ in range(10):
         with ix.writer() as w:
             w.update_document(id="a")
     assert ix.doc_count() == 1
@@ -121,7 +121,7 @@ def test_buffered():
     w = writing.BufferedWriter(
         ix, period=None, limit=10, commitargs={"merge": False}
     )
-    for i in xrange(20):
+    for i in range(20):
         w.add_document(id=text_type(i), text=" ".join(random.sample(domain, 5)))
         time.sleep(0.1)
     w.close()
@@ -160,7 +160,7 @@ def test_buffered_update():
     )
     with TempIndex(schema, "bufferedupdate") as ix:
         w = writing.BufferedWriter(ix, period=None, limit=5)
-        for i in xrange(10):
+        for i in range(10):
             for char in "abc":
                 fs = dict(id=char, payload=text_type(i) + char)
                 w.update_document(**fs)
@@ -186,11 +186,11 @@ def test_buffered_threads():
 
     class SimWriter(threading.Thread):
         def run(self):
-            for _ in xrange(5):
+            for _ in range(5):
                 w.update_document(name=random.choice(domain))
                 time.sleep(random.uniform(0.01, 0.1))
 
-    threads = [SimWriter() for _ in xrange(5)]
+    threads = [SimWriter() for _ in range(5)]
     for thread in threads:
         thread.start()
     for thread in threads:
@@ -279,7 +279,7 @@ def test_cancel_delete():
 
 
 def test_delete_nonexistant():
-    from whoosh_reloaded.writing import IndexingError
+    from whoosh.writing import IndexingError
 
     schema = fields.Schema(id=fields.ID(stored=True))
     # Single segment